{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 18867, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.300400180213606e-05, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 1.4285714285714284e-08, "logits/chosen": -23153680.0, "logits/rejected": 93036448.0, "logps/chosen": -173.05230712890625, "logps/rejected": -387.9343566894531, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00010600800360427212, "grad_norm": 92.5, "kl": 0.0, "learning_rate": 2.857142857142857e-08, "logits/chosen": 12804130.0, "logits/rejected": -2586176.5, "logps/chosen": -624.2457885742188, "logps/rejected": -212.56422424316406, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0001590120054064082, "grad_norm": 82.5, "kl": 0.31366539001464844, "learning_rate": 4.285714285714286e-08, "logits/chosen": -53189408.0, "logits/rejected": -32432979.2, "logps/chosen": -324.1885172526042, "logps/rejected": -238.525634765625, "loss": 0.4943, "rewards/chosen": 0.06975097457567851, "rewards/margins": 0.0591783123711745, "rewards/rejected": 0.010572662204504013, "step": 3 }, { "epoch": 0.00021201600720854424, "grad_norm": 81.0, "kl": 0.13257598876953125, "learning_rate": 5.714285714285714e-08, "logits/chosen": -80376672.0, "logits/rejected": -23287130.666666668, "logps/chosen": -341.042236328125, "logps/rejected": -340.5222981770833, "loss": 0.4925, "rewards/chosen": 0.07617263495922089, "rewards/margins": 0.09083480636278789, "rewards/rejected": -0.014662171403566996, "step": 4 }, { "epoch": 0.0002650200090106803, "grad_norm": 104.0, "kl": 0.24523162841796875, "learning_rate": 7.142857142857142e-08, "logits/chosen": -13247576.0, "logits/rejected": -14956584.0, "logps/chosen": -882.19453125, "logps/rejected": -176.9351603190104, "loss": 0.4954, "rewards/chosen": 0.01639324426651001, "rewards/margins": 0.038679150243600205, "rewards/rejected": -0.0222859059770902, "step": 5 }, { "epoch": 0.0003180240108128164, "grad_norm": 72.0, "kl": 0.14484024047851562, "learning_rate": 8.571428571428572e-08, "logits/chosen": -25827136.0, "logits/rejected": -41412248.0, "logps/chosen": -341.84326171875, "logps/rejected": -373.196044921875, "loss": 0.4957, "rewards/chosen": 0.0018229000270366669, "rewards/margins": 0.03498997911810875, "rewards/rejected": -0.03316707909107208, "step": 6 }, { "epoch": 0.0003710280126149524, "grad_norm": 72.5, "kl": 0.11614227294921875, "learning_rate": 1e-07, "logits/chosen": -105527846.4, "logits/rejected": -35155554.666666664, "logps/chosen": -456.91865234375, "logps/rejected": -262.7819417317708, "loss": 0.4969, "rewards/chosen": 0.048670655488967894, "rewards/margins": 0.0033517986536025987, "rewards/rejected": 0.045318856835365295, "step": 7 }, { "epoch": 0.0004240320144170885, "grad_norm": 83.5, "kl": 0.3477134704589844, "learning_rate": 1.1428571428571427e-07, "logits/chosen": -22520578.0, "logits/rejected": -32016866.666666668, "logps/chosen": -124.05150604248047, "logps/rejected": -444.3665364583333, "loss": 0.4906, "rewards/chosen": 0.02013549953699112, "rewards/margins": 0.022689946927130222, "rewards/rejected": -0.002554447390139103, "step": 8 }, { "epoch": 0.00047703601621922455, "grad_norm": 57.5, "kl": 0.4288616180419922, "learning_rate": 1.2857142857142855e-07, "logits/chosen": -22002714.666666668, "logits/rejected": -33248966.4, "logps/chosen": -288.2696126302083, "logps/rejected": -180.441650390625, "loss": 0.4986, "rewards/chosen": -0.041149904330571495, "rewards/margins": -0.017206271489461265, "rewards/rejected": -0.02394363284111023, "step": 9 }, { "epoch": 0.0005300400180213606, "grad_norm": 59.75, "kl": 0.12288188934326172, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -42334915.2, "logits/rejected": -8552325.333333334, "logps/chosen": -319.22080078125, "logps/rejected": -122.14192708333333, "loss": 0.5103, "rewards/chosen": -0.06041549444198609, "rewards/margins": -0.05814536881322662, "rewards/rejected": -0.0022701256287594638, "step": 10 }, { "epoch": 0.0005830440198234967, "grad_norm": 54.25, "kl": 0.32825183868408203, "learning_rate": 1.5714285714285714e-07, "logits/chosen": -6206063.0, "logits/rejected": -17531509.333333332, "logps/chosen": -243.45751953125, "logps/rejected": -225.87211100260416, "loss": 0.4991, "rewards/chosen": -0.04349708557128906, "rewards/margins": -0.05187416076660156, "rewards/rejected": 0.0083770751953125, "step": 11 }, { "epoch": 0.0006360480216256328, "grad_norm": 104.5, "kl": 0.31346893310546875, "learning_rate": 1.7142857142857143e-07, "logits/chosen": -19826617.6, "logits/rejected": 35928493.333333336, "logps/chosen": -803.744873046875, "logps/rejected": -362.5703938802083, "loss": 0.5177, "rewards/chosen": -0.07200226187705994, "rewards/margins": -0.08431457976500194, "rewards/rejected": 0.012312317887941996, "step": 12 }, { "epoch": 0.0006890520234277688, "grad_norm": 55.25, "kl": 0.24349498748779297, "learning_rate": 1.8571428571428572e-07, "logits/chosen": -19539284.0, "logits/rejected": -29280886.0, "logps/chosen": -189.2702840169271, "logps/rejected": -347.82415771484375, "loss": 0.4958, "rewards/chosen": 0.010057957842946053, "rewards/margins": 0.06316159851849079, "rewards/rejected": -0.05310364067554474, "step": 13 }, { "epoch": 0.0007420560252299048, "grad_norm": 68.5, "kl": 0.22354793548583984, "learning_rate": 2e-07, "logits/chosen": -46621916.8, "logits/rejected": -58266010.666666664, "logps/chosen": -388.8134033203125, "logps/rejected": -355.6566569010417, "loss": 0.5085, "rewards/chosen": -0.06039687991142273, "rewards/margins": -0.03560948620239894, "rewards/rejected": -0.024787393709023792, "step": 14 }, { "epoch": 0.0007950600270320409, "grad_norm": 66.0, "kl": 0.15793228149414062, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -38025808.0, "logits/rejected": -46223092.0, "logps/chosen": -443.16107177734375, "logps/rejected": -266.92803955078125, "loss": 0.4975, "rewards/chosen": 0.0205825325101614, "rewards/margins": 0.0005198940634727478, "rewards/rejected": 0.020062638446688652, "step": 15 }, { "epoch": 0.000848064028834177, "grad_norm": 84.5, "kl": 0.1948833465576172, "learning_rate": 2.2857142857142855e-07, "logits/chosen": -64641264.0, "logits/rejected": -27592246.4, "logps/chosen": -407.514404296875, "logps/rejected": -432.396875, "loss": 0.4897, "rewards/chosen": 0.057665000359217324, "rewards/margins": 0.0892429123322169, "rewards/rejected": -0.03157791197299957, "step": 16 }, { "epoch": 0.000901068030636313, "grad_norm": 65.0, "kl": 0.10207748413085938, "learning_rate": 2.4285714285714287e-07, "logits/chosen": -35461728.0, "logits/rejected": -30689746.0, "logps/chosen": -417.682861328125, "logps/rejected": -270.60308837890625, "loss": 0.501, "rewards/chosen": -0.010592651553452015, "rewards/margins": -0.008222102653235197, "rewards/rejected": -0.002370548900216818, "step": 17 }, { "epoch": 0.0009540720324384491, "grad_norm": 54.75, "kl": 0.10400962829589844, "learning_rate": 2.571428571428571e-07, "logits/chosen": -9088591.0, "logits/rejected": -14712485.0, "logps/chosen": -259.16180419921875, "logps/rejected": -197.80682373046875, "loss": 0.4988, "rewards/chosen": 0.01218481082469225, "rewards/margins": 0.009956836700439453, "rewards/rejected": 0.002227974124252796, "step": 18 }, { "epoch": 0.0010070760342405852, "grad_norm": 78.0, "kl": 0.19533920288085938, "learning_rate": 2.714285714285714e-07, "logits/chosen": -37294108.0, "logits/rejected": -31063546.666666668, "logps/chosen": -385.905029296875, "logps/rejected": -302.3912353515625, "loss": 0.4821, "rewards/chosen": 0.05620270222425461, "rewards/margins": 0.10683543235063553, "rewards/rejected": -0.05063273012638092, "step": 19 }, { "epoch": 0.0010600800360427212, "grad_norm": 58.5, "kl": 0.31415271759033203, "learning_rate": 2.857142857142857e-07, "logits/chosen": -13620128.0, "logits/rejected": -9810684.0, "logps/chosen": -114.29273478190105, "logps/rejected": -247.46450805664062, "loss": 0.5057, "rewards/chosen": 0.020137024422486622, "rewards/margins": -0.005908964822689693, "rewards/rejected": 0.026045989245176315, "step": 20 }, { "epoch": 0.0011130840378448573, "grad_norm": 51.5, "kl": 0.38954639434814453, "learning_rate": 3e-07, "logits/chosen": 1009623.0833333334, "logits/rejected": -38483072.0, "logps/chosen": -104.95391845703125, "logps/rejected": -375.4510192871094, "loss": 0.503, "rewards/chosen": 0.007259814689556758, "rewards/margins": 0.04601942996184031, "rewards/rejected": -0.038759615272283554, "step": 21 }, { "epoch": 0.0011660880396469934, "grad_norm": 66.0, "kl": 0.08700180053710938, "learning_rate": 3.142857142857143e-07, "logits/chosen": 6304931.5, "logits/rejected": -45726392.0, "logps/chosen": -180.18011474609375, "logps/rejected": -175.6695098876953, "loss": 0.4904, "rewards/chosen": 0.015181160531938076, "rewards/margins": 0.05983991827815771, "rewards/rejected": -0.044658757746219635, "step": 22 }, { "epoch": 0.0012190920414491295, "grad_norm": 72.5, "kl": 0.38702964782714844, "learning_rate": 3.2857142857142857e-07, "logits/chosen": -19433158.0, "logits/rejected": -31411052.0, "logps/chosen": -180.67538452148438, "logps/rejected": -503.0622863769531, "loss": 0.4922, "rewards/chosen": 0.06993880122900009, "rewards/margins": 0.065038344822824, "rewards/rejected": 0.00490045640617609, "step": 23 }, { "epoch": 0.0012720960432512655, "grad_norm": 68.0, "kl": 0.14882850646972656, "learning_rate": 3.4285714285714286e-07, "logits/chosen": -32397928.0, "logits/rejected": -18990438.0, "logps/chosen": -286.8509216308594, "logps/rejected": -270.26739501953125, "loss": 0.4929, "rewards/chosen": 0.04164619743824005, "rewards/margins": 0.051537325605750084, "rewards/rejected": -0.009891128167510033, "step": 24 }, { "epoch": 0.0013251000450534016, "grad_norm": 68.5, "kl": 0.2847557067871094, "learning_rate": 3.5714285714285716e-07, "logits/chosen": 4712618.4, "logits/rejected": -26235637.333333332, "logps/chosen": -259.2828857421875, "logps/rejected": -394.4635823567708, "loss": 0.5064, "rewards/chosen": 0.008366622775793076, "rewards/margins": 0.002104415992895763, "rewards/rejected": 0.006262206782897313, "step": 25 }, { "epoch": 0.0013781040468555377, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 3.7142857142857145e-07, "logits/chosen": -30220180.0, "logits/rejected": -49641696.0, "logps/chosen": -219.58714294433594, "logps/rejected": -167.37844848632812, "loss": 0.5098, "rewards/chosen": -0.0625062957406044, "rewards/margins": -0.07874870300292969, "rewards/rejected": 0.016242407262325287, "step": 26 }, { "epoch": 0.0014311080486576738, "grad_norm": 72.5, "kl": 0.09575271606445312, "learning_rate": 3.857142857142857e-07, "logits/chosen": -27924068.0, "logits/rejected": -52171240.0, "logps/chosen": -327.057861328125, "logps/rejected": -342.90643310546875, "loss": 0.5013, "rewards/chosen": 0.014655877836048603, "rewards/margins": -0.010636518709361553, "rewards/rejected": 0.025292396545410156, "step": 27 }, { "epoch": 0.0014841120504598096, "grad_norm": 55.0, "kl": 0.053142547607421875, "learning_rate": 4e-07, "logits/chosen": -14762248.0, "logits/rejected": -16188009.0, "logps/chosen": -235.63187662760416, "logps/rejected": -158.04754638671875, "loss": 0.4968, "rewards/chosen": 0.006290849298238754, "rewards/margins": 0.03801692649722099, "rewards/rejected": -0.03172607719898224, "step": 28 }, { "epoch": 0.0015371160522619457, "grad_norm": 68.0, "kl": 0.020214080810546875, "learning_rate": 4.142857142857143e-07, "logits/chosen": -80273040.0, "logits/rejected": -36506884.0, "logps/chosen": -272.6000671386719, "logps/rejected": -336.6654052734375, "loss": 0.5004, "rewards/chosen": -0.03183918073773384, "rewards/margins": -0.0029665008187294006, "rewards/rejected": -0.02887267991900444, "step": 29 }, { "epoch": 0.0015901200540640818, "grad_norm": 46.75, "kl": 0.14395523071289062, "learning_rate": 4.285714285714285e-07, "logits/chosen": -18612348.8, "logits/rejected": -13211309.333333334, "logps/chosen": -131.0058837890625, "logps/rejected": -91.35422770182292, "loss": 0.4997, "rewards/chosen": 0.008388404548168183, "rewards/margins": -0.012940330306688945, "rewards/rejected": 0.021328734854857128, "step": 30 }, { "epoch": 0.0016431240558662178, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 4.428571428571428e-07, "logits/chosen": -26194912.0, "logits/rejected": -17792188.0, "logps/chosen": -227.8856689453125, "logps/rejected": -225.39188639322916, "loss": 0.5037, "rewards/chosen": -0.009761656820774078, "rewards/margins": -0.033066660662492114, "rewards/rejected": 0.023305003841718037, "step": 31 }, { "epoch": 0.001696128057668354, "grad_norm": 71.0, "kl": 0.2712841033935547, "learning_rate": 4.571428571428571e-07, "logits/chosen": -58688138.666666664, "logits/rejected": -3808996.5, "logps/chosen": -373.723388671875, "logps/rejected": -258.92999267578125, "loss": 0.4991, "rewards/chosen": 0.011483194927374521, "rewards/margins": 0.029936412970225014, "rewards/rejected": -0.018453218042850494, "step": 32 }, { "epoch": 0.00174913205947049, "grad_norm": 74.5, "kl": 0.2847557067871094, "learning_rate": 4.714285714285714e-07, "logits/chosen": -92055296.0, "logits/rejected": -34435644.8, "logps/chosen": -406.8272705078125, "logps/rejected": -348.07177734375, "loss": 0.4876, "rewards/chosen": 0.039562990268071495, "rewards/margins": 0.07928818066914876, "rewards/rejected": -0.03972519040107727, "step": 33 }, { "epoch": 0.001802136061272626, "grad_norm": 66.5, "kl": 0.20503807067871094, "learning_rate": 4.857142857142857e-07, "logits/chosen": -27098032.0, "logits/rejected": 5315276.8, "logps/chosen": -307.3462727864583, "logps/rejected": -274.5616455078125, "loss": 0.494, "rewards/chosen": -0.01767171174287796, "rewards/margins": 0.031286419928073884, "rewards/rejected": -0.048958131670951845, "step": 34 }, { "epoch": 0.0018551400630747621, "grad_norm": 85.5, "kl": 0.24365234375, "learning_rate": 5e-07, "logits/chosen": -57961768.0, "logits/rejected": -33442428.0, "logps/chosen": -323.3244934082031, "logps/rejected": -651.877685546875, "loss": 0.5034, "rewards/chosen": 0.026348687708377838, "rewards/margins": -0.027067754417657852, "rewards/rejected": 0.05341644212603569, "step": 35 }, { "epoch": 0.0019081440648768982, "grad_norm": 64.5, "kl": 0.16027450561523438, "learning_rate": 5e-07, "logits/chosen": -57626720.0, "logits/rejected": -1911708.5, "logps/chosen": -381.2510986328125, "logps/rejected": -124.76205444335938, "loss": 0.4968, "rewards/chosen": 0.023357647160689037, "rewards/margins": 0.03145434117565553, "rewards/rejected": -0.008096694014966488, "step": 36 }, { "epoch": 0.001961148066679034, "grad_norm": 65.0, "kl": 0.2976512908935547, "learning_rate": 5e-07, "logits/chosen": -24453618.0, "logits/rejected": 9248990.0, "logps/chosen": -385.1399230957031, "logps/rejected": -200.42864990234375, "loss": 0.4921, "rewards/chosen": 0.07230643928050995, "rewards/margins": 0.07506446563638747, "rewards/rejected": -0.0027580263558775187, "step": 37 }, { "epoch": 0.0020141520684811703, "grad_norm": 71.5, "kl": 0.19361495971679688, "learning_rate": 5e-07, "logits/chosen": -53704435.2, "logits/rejected": -2754275.0, "logps/chosen": -246.153076171875, "logps/rejected": -167.8027547200521, "loss": 0.4997, "rewards/chosen": 0.0447439581155777, "rewards/margins": 0.0024539659420649215, "rewards/rejected": 0.04228999217351278, "step": 38 }, { "epoch": 0.002067156070283306, "grad_norm": 70.5, "kl": 0.14324378967285156, "learning_rate": 5e-07, "logits/chosen": -72914880.0, "logits/rejected": -28894132.0, "logps/chosen": -245.29827880859375, "logps/rejected": -319.3249816894531, "loss": 0.4871, "rewards/chosen": 0.019582748413085938, "rewards/margins": 0.07467479258775711, "rewards/rejected": -0.05509204417467117, "step": 39 }, { "epoch": 0.0021201600720854425, "grad_norm": 69.5, "kl": 0.08287429809570312, "learning_rate": 5e-07, "logits/chosen": -44505664.0, "logits/rejected": -33106050.0, "logps/chosen": -342.0576477050781, "logps/rejected": -289.5639343261719, "loss": 0.4994, "rewards/chosen": -0.008943939581513405, "rewards/margins": 0.0062713623046875, "rewards/rejected": -0.015215301886200905, "step": 40 }, { "epoch": 0.0021731640738875783, "grad_norm": 79.0, "kl": 0.29288387298583984, "learning_rate": 5e-07, "logits/chosen": -10583904.0, "logits/rejected": -47725564.0, "logps/chosen": -218.23321533203125, "logps/rejected": -380.3946533203125, "loss": 0.4888, "rewards/chosen": 0.045654078324635826, "rewards/margins": 0.20537241299947104, "rewards/rejected": -0.1597183346748352, "step": 41 }, { "epoch": 0.0022261680756897146, "grad_norm": 86.5, "kl": 0.09705543518066406, "learning_rate": 5e-07, "logits/chosen": -18481409.6, "logits/rejected": -116493632.0, "logps/chosen": -447.50126953125, "logps/rejected": -499.6569010416667, "loss": 0.4899, "rewards/chosen": 0.050823974609375, "rewards/margins": 0.08606669108072917, "rewards/rejected": -0.035242716471354164, "step": 42 }, { "epoch": 0.0022791720774918505, "grad_norm": 71.5, "kl": 0.15149307250976562, "learning_rate": 5e-07, "logits/chosen": -76482656.0, "logits/rejected": 12089254.0, "logps/chosen": -442.94256591796875, "logps/rejected": -149.84835815429688, "loss": 0.4964, "rewards/chosen": -0.01620636135339737, "rewards/margins": 0.02873210981488228, "rewards/rejected": -0.04493847116827965, "step": 43 }, { "epoch": 0.0023321760792939868, "grad_norm": 65.5, "kl": 0.058032989501953125, "learning_rate": 5e-07, "logits/chosen": -5000350.0, "logits/rejected": -64577688.0, "logps/chosen": -169.64781188964844, "logps/rejected": -312.16192626953125, "loss": 0.4998, "rewards/chosen": -0.007651900872588158, "rewards/margins": 0.0012653358280658722, "rewards/rejected": -0.00891723670065403, "step": 44 }, { "epoch": 0.0023851800810961226, "grad_norm": 79.5, "kl": 0.14760208129882812, "learning_rate": 5e-07, "logits/chosen": -40200534.4, "logits/rejected": -60215498.666666664, "logps/chosen": -401.284814453125, "logps/rejected": -260.4440511067708, "loss": 0.5033, "rewards/chosen": -0.003991852700710297, "rewards/margins": -0.02279602165023486, "rewards/rejected": 0.018804168949524563, "step": 45 }, { "epoch": 0.002438184082898259, "grad_norm": 99.0, "kl": 0.01163482666015625, "learning_rate": 5e-07, "logits/chosen": -50343413.333333336, "logits/rejected": -38371849.6, "logps/chosen": -725.2467447916666, "logps/rejected": -448.978857421875, "loss": 0.4832, "rewards/chosen": 0.060154726107915245, "rewards/margins": 0.13149993618329367, "rewards/rejected": -0.07134521007537842, "step": 46 }, { "epoch": 0.002491188084700395, "grad_norm": 68.0, "kl": 0.0799112319946289, "learning_rate": 5e-07, "logits/chosen": 2260557.0, "logits/rejected": -231881856.0, "logps/chosen": -212.8890380859375, "logps/rejected": -432.72637939453125, "loss": 0.5008, "rewards/chosen": -0.006450558081269264, "rewards/margins": 0.009674549102783203, "rewards/rejected": -0.016125107184052467, "step": 47 }, { "epoch": 0.002544192086502531, "grad_norm": 92.0, "kl": 0.06399917602539062, "learning_rate": 5e-07, "logits/chosen": -23762600.0, "logits/rejected": -20555196.0, "logps/chosen": -629.602294921875, "logps/rejected": -327.4344889322917, "loss": 0.4927, "rewards/chosen": 0.016534043475985527, "rewards/margins": 0.041173938040932015, "rewards/rejected": -0.02463989456494649, "step": 48 }, { "epoch": 0.002597196088304667, "grad_norm": 83.5, "kl": 0.06423377990722656, "learning_rate": 5e-07, "logits/chosen": -29821356.0, "logits/rejected": -37812845.333333336, "logps/chosen": -1028.42724609375, "logps/rejected": -149.60789998372397, "loss": 0.491, "rewards/chosen": 0.06598205864429474, "rewards/margins": 0.0921984389424324, "rewards/rejected": -0.026216380298137665, "step": 49 }, { "epoch": 0.0026502000901068032, "grad_norm": 65.5, "kl": 0.04038810729980469, "learning_rate": 5e-07, "logits/chosen": -30240962.0, "logits/rejected": -69544928.0, "logps/chosen": -284.60064697265625, "logps/rejected": -305.121337890625, "loss": 0.4872, "rewards/chosen": -0.00047779083251953125, "rewards/margins": 0.10602951049804688, "rewards/rejected": -0.1065073013305664, "step": 50 }, { "epoch": 0.002703204091908939, "grad_norm": 96.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14135869.333333334, "logits/rejected": -26987308.8, "logps/chosen": -594.6626790364584, "logps/rejected": -327.8826171875, "loss": 0.4845, "rewards/chosen": 0.05222549537817637, "rewards/margins": 0.12027343610922495, "rewards/rejected": -0.06804794073104858, "step": 51 }, { "epoch": 0.0027562080937110754, "grad_norm": 79.0, "kl": 0.21288299560546875, "learning_rate": 5e-07, "logits/chosen": -76523504.0, "logits/rejected": -16128450.0, "logps/chosen": -382.5576171875, "logps/rejected": -296.1431884765625, "loss": 0.5035, "rewards/chosen": -0.06654596328735352, "rewards/margins": -0.027774903923273087, "rewards/rejected": -0.03877105936408043, "step": 52 }, { "epoch": 0.0028092120955132112, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78431776.0, "logits/rejected": -44975836.0, "logps/chosen": -414.9641520182292, "logps/rejected": -499.05389404296875, "loss": 0.4968, "rewards/chosen": 0.013149134814739227, "rewards/margins": 0.024651210755109787, "rewards/rejected": -0.01150207594037056, "step": 53 }, { "epoch": 0.0028622160973153475, "grad_norm": 81.0, "kl": 0.34370994567871094, "learning_rate": 5e-07, "logits/chosen": 3043612.8, "logits/rejected": 58365514.666666664, "logps/chosen": -265.016162109375, "logps/rejected": -574.8139241536459, "loss": 0.4957, "rewards/chosen": 0.018900451064109803, "rewards/margins": 0.03298330008983612, "rewards/rejected": -0.014082849025726318, "step": 54 }, { "epoch": 0.0029152200991174834, "grad_norm": 62.25, "kl": 0.08402252197265625, "learning_rate": 5e-07, "logits/chosen": -33150084.0, "logits/rejected": -14017796.0, "logps/chosen": -207.2146759033203, "logps/rejected": -147.95626831054688, "loss": 0.4983, "rewards/chosen": -0.028161242604255676, "rewards/margins": 0.020720429718494415, "rewards/rejected": -0.04888167232275009, "step": 55 }, { "epoch": 0.0029682241009196192, "grad_norm": 67.0, "kl": 0.12447738647460938, "learning_rate": 5e-07, "logits/chosen": -15285014.0, "logits/rejected": -26111221.333333332, "logps/chosen": -277.28228759765625, "logps/rejected": -250.806396484375, "loss": 0.4851, "rewards/chosen": -0.038770295679569244, "rewards/margins": 0.04911493510007858, "rewards/rejected": -0.08788523077964783, "step": 56 }, { "epoch": 0.0030212281027217555, "grad_norm": 67.5, "kl": 0.07439422607421875, "learning_rate": 5e-07, "logits/chosen": -38853008.0, "logits/rejected": -27482773.333333332, "logps/chosen": -257.6451171875, "logps/rejected": -415.64404296875, "loss": 0.4952, "rewards/chosen": 0.002657737582921982, "rewards/margins": 0.049587673197189965, "rewards/rejected": -0.04692993561426798, "step": 57 }, { "epoch": 0.0030742321045238914, "grad_norm": 70.5, "kl": 0.10683441162109375, "learning_rate": 5e-07, "logits/chosen": -46861418.666666664, "logits/rejected": -16969862.4, "logps/chosen": -613.0481770833334, "logps/rejected": -262.3401123046875, "loss": 0.4885, "rewards/chosen": 0.02186381071805954, "rewards/margins": 0.08258360475301743, "rewards/rejected": -0.060719794034957884, "step": 58 }, { "epoch": 0.0031272361063260277, "grad_norm": 67.0, "kl": 0.014322280883789062, "learning_rate": 5e-07, "logits/chosen": -27035462.4, "logits/rejected": 4927134.0, "logps/chosen": -403.0702392578125, "logps/rejected": -247.7732950846354, "loss": 0.5064, "rewards/chosen": -0.017776335775852203, "rewards/margins": -0.053100623687108356, "rewards/rejected": 0.03532428791125616, "step": 59 }, { "epoch": 0.0031802401081281635, "grad_norm": 89.0, "kl": 0.06764411926269531, "learning_rate": 5e-07, "logits/chosen": -9146633.333333334, "logits/rejected": 36809180.8, "logps/chosen": -145.88731892903647, "logps/rejected": -354.190625, "loss": 0.4866, "rewards/chosen": 0.02480774124463399, "rewards/margins": 0.09592682321866353, "rewards/rejected": -0.07111908197402954, "step": 60 }, { "epoch": 0.0032332441099303, "grad_norm": 60.75, "kl": 0.3339405059814453, "learning_rate": 5e-07, "logits/chosen": -44961753.6, "logits/rejected": -13453321.333333334, "logps/chosen": -283.9776611328125, "logps/rejected": -199.3962198893229, "loss": 0.5025, "rewards/chosen": -0.009340135753154755, "rewards/margins": -0.020223976174990336, "rewards/rejected": 0.010883840421835581, "step": 61 }, { "epoch": 0.0032862481117324357, "grad_norm": 74.5, "kl": 0.03864288330078125, "learning_rate": 5e-07, "logits/chosen": -33597464.0, "logits/rejected": -68445144.0, "logps/chosen": -455.113525390625, "logps/rejected": -434.8825988769531, "loss": 0.4954, "rewards/chosen": -0.023442842066287994, "rewards/margins": 0.036729808896780014, "rewards/rejected": -0.06017265096306801, "step": 62 }, { "epoch": 0.003339252113534572, "grad_norm": 70.0, "kl": 0.09145545959472656, "learning_rate": 5e-07, "logits/chosen": -16372724.0, "logits/rejected": -10040138.666666666, "logps/chosen": -443.1876525878906, "logps/rejected": -200.19295247395834, "loss": 0.4874, "rewards/chosen": -0.0007102955132722855, "rewards/margins": 0.06691100510458152, "rewards/rejected": -0.0676213006178538, "step": 63 }, { "epoch": 0.003392256115336708, "grad_norm": 69.5, "kl": 0.042865753173828125, "learning_rate": 5e-07, "logits/chosen": -76611338.66666667, "logits/rejected": -18022640.0, "logps/chosen": -572.006591796875, "logps/rejected": -299.391650390625, "loss": 0.4823, "rewards/chosen": 0.043026735385258995, "rewards/margins": 0.13092529972394307, "rewards/rejected": -0.08789856433868408, "step": 64 }, { "epoch": 0.003445260117138844, "grad_norm": 89.0, "kl": 0.1067962646484375, "learning_rate": 5e-07, "logits/chosen": -81419072.0, "logits/rejected": -40608994.666666664, "logps/chosen": -395.173095703125, "logps/rejected": -423.3981119791667, "loss": 0.4849, "rewards/chosen": 0.006782532669603825, "rewards/margins": 0.07711842749267817, "rewards/rejected": -0.07033589482307434, "step": 65 }, { "epoch": 0.00349826411894098, "grad_norm": 66.0, "kl": 0.12134933471679688, "learning_rate": 5e-07, "logits/chosen": -7214090.5, "logits/rejected": -39261724.0, "logps/chosen": -195.70758056640625, "logps/rejected": -289.0002746582031, "loss": 0.4926, "rewards/chosen": -0.01778116263449192, "rewards/margins": 0.05955610238015652, "rewards/rejected": -0.07733726501464844, "step": 66 }, { "epoch": 0.0035512681207431162, "grad_norm": 94.0, "kl": 0.5790348052978516, "learning_rate": 5e-07, "logits/chosen": -17022746.0, "logits/rejected": -11578318.0, "logps/chosen": -544.0214233398438, "logps/rejected": -409.7138366699219, "loss": 0.4933, "rewards/chosen": -0.00538411270827055, "rewards/margins": 0.08789286483079195, "rewards/rejected": -0.0932769775390625, "step": 67 }, { "epoch": 0.003604272122545252, "grad_norm": 64.5, "kl": 0.09177780151367188, "learning_rate": 5e-07, "logits/chosen": -22584496.0, "logits/rejected": -34757136.0, "logps/chosen": -287.81011962890625, "logps/rejected": -265.76715087890625, "loss": 0.4917, "rewards/chosen": 0.02940807305276394, "rewards/margins": 0.08471756242215633, "rewards/rejected": -0.055309489369392395, "step": 68 }, { "epoch": 0.0036572761243473884, "grad_norm": 62.25, "kl": 0.2628040313720703, "learning_rate": 5e-07, "logits/chosen": -31389452.8, "logits/rejected": -7622466.666666667, "logps/chosen": -222.5404052734375, "logps/rejected": -210.4384969075521, "loss": 0.499, "rewards/chosen": -0.015338438749313354, "rewards/margins": 0.0515784740447998, "rewards/rejected": -0.06691691279411316, "step": 69 }, { "epoch": 0.0037102801261495242, "grad_norm": 67.5, "kl": 0.06337738037109375, "learning_rate": 5e-07, "logits/chosen": -1738284.6666666667, "logits/rejected": -34926336.0, "logps/chosen": -332.4259847005208, "logps/rejected": -307.5785888671875, "loss": 0.4967, "rewards/chosen": -0.051861574252446495, "rewards/margins": -0.009697418411572775, "rewards/rejected": -0.04216415584087372, "step": 70 }, { "epoch": 0.0037632841279516605, "grad_norm": 76.0, "kl": 0.07609367370605469, "learning_rate": 5e-07, "logits/chosen": 26644904.0, "logits/rejected": -34749946.666666664, "logps/chosen": -61.015350341796875, "logps/rejected": -391.1946614583333, "loss": 0.4667, "rewards/chosen": 0.002614688826724887, "rewards/margins": 0.1744513630401343, "rewards/rejected": -0.17183667421340942, "step": 71 }, { "epoch": 0.0038162881297537964, "grad_norm": 49.0, "kl": 0.09589099884033203, "learning_rate": 5e-07, "logits/chosen": -12548444.0, "logits/rejected": -14210796.0, "logps/chosen": -128.60287475585938, "logps/rejected": -197.02220153808594, "loss": 0.4989, "rewards/chosen": -0.03728232532739639, "rewards/margins": 0.0091949962079525, "rewards/rejected": -0.04647732153534889, "step": 72 }, { "epoch": 0.0038692921315559327, "grad_norm": 69.0, "kl": 0.10500049591064453, "learning_rate": 5e-07, "logits/chosen": -16983537.333333332, "logits/rejected": -20208892.0, "logps/chosen": -272.6612955729167, "logps/rejected": -203.85867309570312, "loss": 0.4977, "rewards/chosen": 0.003214708218971888, "rewards/margins": 0.04238631079594294, "rewards/rejected": -0.039171602576971054, "step": 73 }, { "epoch": 0.003922296133358068, "grad_norm": 62.0, "kl": 0.1876049041748047, "learning_rate": 5e-07, "logits/chosen": -48661136.0, "logits/rejected": -26413864.0, "logps/chosen": -408.6339518229167, "logps/rejected": -188.5192138671875, "loss": 0.489, "rewards/chosen": 0.016809082279602688, "rewards/margins": 0.1008027675251166, "rewards/rejected": -0.08399368524551391, "step": 74 }, { "epoch": 0.003975300135160204, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15917653.333333334, "logits/rejected": -8592204.0, "logps/chosen": -391.3837890625, "logps/rejected": -148.85750732421874, "loss": 0.4937, "rewards/chosen": -0.018285623441139858, "rewards/margins": 0.03304265414675077, "rewards/rejected": -0.051328277587890624, "step": 75 }, { "epoch": 0.004028304136962341, "grad_norm": 63.25, "kl": 0.032886505126953125, "learning_rate": 5e-07, "logits/chosen": -27558812.8, "logits/rejected": 1069237.6666666667, "logps/chosen": -232.610009765625, "logps/rejected": -151.89313761393228, "loss": 0.497, "rewards/chosen": -0.019927369058132173, "rewards/margins": 0.04583610941966374, "rewards/rejected": -0.06576347847779591, "step": 76 }, { "epoch": 0.004081308138764477, "grad_norm": 56.5, "kl": 0.00913858413696289, "learning_rate": 5e-07, "logits/chosen": -28041996.8, "logits/rejected": -4990304.0, "logps/chosen": -288.0624267578125, "logps/rejected": -112.41458129882812, "loss": 0.4973, "rewards/chosen": -0.01812068819999695, "rewards/margins": 0.04202844003836313, "rewards/rejected": -0.060149128238360085, "step": 77 }, { "epoch": 0.004134312140566612, "grad_norm": 84.5, "kl": 0.4369182586669922, "learning_rate": 5e-07, "logits/chosen": -84340840.0, "logits/rejected": -28450940.0, "logps/chosen": -619.0479736328125, "logps/rejected": -273.26287841796875, "loss": 0.4811, "rewards/chosen": 0.04420318454504013, "rewards/margins": 0.1551271453499794, "rewards/rejected": -0.11092396080493927, "step": 78 }, { "epoch": 0.004187316142368749, "grad_norm": 61.5, "kl": 0.20664596557617188, "learning_rate": 5e-07, "logits/chosen": -1538171.25, "logits/rejected": -34422024.0, "logps/chosen": -220.51498413085938, "logps/rejected": -195.278564453125, "loss": 0.4774, "rewards/chosen": 0.03494663164019585, "rewards/margins": 0.18949336931109428, "rewards/rejected": -0.15454673767089844, "step": 79 }, { "epoch": 0.004240320144170885, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24451082.666666668, "logits/rejected": -52434304.0, "logps/chosen": -232.74808756510416, "logps/rejected": -729.46044921875, "loss": 0.4857, "rewards/chosen": -0.0027346297477682433, "rewards/margins": 0.2365659192825357, "rewards/rejected": -0.23930054903030396, "step": 80 }, { "epoch": 0.004293324145973021, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2664282.3333333335, "logits/rejected": -15967360.0, "logps/chosen": -68.89410909016927, "logps/rejected": -224.0144775390625, "loss": 0.4878, "rewards/chosen": -0.013121732821067175, "rewards/margins": 0.07321257914106051, "rewards/rejected": -0.08633431196212768, "step": 81 }, { "epoch": 0.004346328147775157, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 424520.6, "logits/rejected": 862964.5, "logps/chosen": -54.68846435546875, "logps/rejected": -72.40868631998698, "loss": 0.5004, "rewards/chosen": -0.029513511061668395, "rewards/margins": 0.014967259764671326, "rewards/rejected": -0.04448077082633972, "step": 82 }, { "epoch": 0.004399332149577293, "grad_norm": 65.5, "kl": 0.0599212646484375, "learning_rate": 5e-07, "logits/chosen": -34933544.0, "logits/rejected": -26548404.8, "logps/chosen": -384.3467610677083, "logps/rejected": -245.224462890625, "loss": 0.4805, "rewards/chosen": 0.048344930013020836, "rewards/margins": 0.14431302150090536, "rewards/rejected": -0.09596809148788452, "step": 83 }, { "epoch": 0.004452336151379429, "grad_norm": 62.5, "kl": 0.016907691955566406, "learning_rate": 5e-07, "logits/chosen": 550268.6666666666, "logits/rejected": -7568664.0, "logps/chosen": -178.5626220703125, "logps/rejected": -301.55966796875, "loss": 0.4983, "rewards/chosen": -0.017752646158138912, "rewards/margins": 0.0038705838223298365, "rewards/rejected": -0.02162322998046875, "step": 84 }, { "epoch": 0.0045053401531815656, "grad_norm": 83.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49784400.0, "logits/rejected": -22895397.333333332, "logps/chosen": -322.4627685546875, "logps/rejected": -667.5448404947916, "loss": 0.4794, "rewards/chosen": 0.045350074768066406, "rewards/margins": 0.19004002213478088, "rewards/rejected": -0.14468994736671448, "step": 85 }, { "epoch": 0.004558344154983701, "grad_norm": 93.0, "kl": 0.08426094055175781, "learning_rate": 5e-07, "logits/chosen": -84924762.66666667, "logits/rejected": 21002190.4, "logps/chosen": -305.5118815104167, "logps/rejected": -189.370556640625, "loss": 0.4797, "rewards/chosen": 0.0196990966796875, "rewards/margins": 0.13843898773193358, "rewards/rejected": -0.1187398910522461, "step": 86 }, { "epoch": 0.004611348156785837, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7344269.0, "logits/rejected": 1114291.5, "logps/chosen": -56.20735549926758, "logps/rejected": -209.558837890625, "loss": 0.4953, "rewards/chosen": -0.0021580695174634457, "rewards/margins": 0.03804483497515321, "rewards/rejected": -0.04020290449261665, "step": 87 }, { "epoch": 0.0046643521585879736, "grad_norm": 70.5, "kl": 0.10091876983642578, "learning_rate": 5e-07, "logits/chosen": 5959873.6, "logits/rejected": -10448730.666666666, "logps/chosen": -251.4006103515625, "logps/rejected": -298.6585693359375, "loss": 0.4853, "rewards/chosen": -0.007213822007179261, "rewards/margins": 0.18911541402339935, "rewards/rejected": -0.1963292360305786, "step": 88 }, { "epoch": 0.00471735616039011, "grad_norm": 62.75, "kl": 0.09423160552978516, "learning_rate": 5e-07, "logits/chosen": 24128778.666666668, "logits/rejected": -41274214.4, "logps/chosen": -282.0858561197917, "logps/rejected": -336.6745849609375, "loss": 0.4866, "rewards/chosen": 0.00631052628159523, "rewards/margins": 0.10345988646149636, "rewards/rejected": -0.09714936017990113, "step": 89 }, { "epoch": 0.004770360162192245, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4155695.0, "logits/rejected": -40437172.0, "logps/chosen": -353.5682373046875, "logps/rejected": -443.9981384277344, "loss": 0.4867, "rewards/chosen": -0.058509159833192825, "rewards/margins": 0.10698653385043144, "rewards/rejected": -0.16549569368362427, "step": 90 }, { "epoch": 0.0048233641639943816, "grad_norm": 76.0, "kl": 0.04730987548828125, "learning_rate": 5e-07, "logits/chosen": -46270265.6, "logits/rejected": -27335002.666666668, "logps/chosen": -174.78824462890626, "logps/rejected": -478.0479736328125, "loss": 0.5008, "rewards/chosen": -0.05167725086212158, "rewards/margins": 0.039697464307149245, "rewards/rejected": -0.09137471516927083, "step": 91 }, { "epoch": 0.004876368165796518, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44569832.0, "logits/rejected": -19635036.0, "logps/chosen": -266.0802001953125, "logps/rejected": -215.75689697265625, "loss": 0.4994, "rewards/chosen": -0.074030302464962, "rewards/margins": 0.005034886300563812, "rewards/rejected": -0.07906518876552582, "step": 92 }, { "epoch": 0.004929372167598653, "grad_norm": 68.0, "kl": 0.16397380828857422, "learning_rate": 5e-07, "logits/chosen": -7653026.666666667, "logits/rejected": -18126846.0, "logps/chosen": -302.39670817057294, "logps/rejected": -103.3211441040039, "loss": 0.502, "rewards/chosen": -0.002793374160925547, "rewards/margins": 0.03851893668373426, "rewards/rejected": -0.041312310844659805, "step": 93 }, { "epoch": 0.00498237616940079, "grad_norm": 96.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9578408.0, "logits/rejected": 8604.0, "logps/chosen": -394.77716064453125, "logps/rejected": -307.8691711425781, "loss": 0.4892, "rewards/chosen": -0.06056060642004013, "rewards/margins": 0.08737098425626755, "rewards/rejected": -0.14793159067630768, "step": 94 }, { "epoch": 0.005035380171202926, "grad_norm": 145.0, "kl": 0.00254058837890625, "learning_rate": 5e-07, "logits/chosen": -54535161.6, "logits/rejected": 62752.5625, "logps/chosen": -721.9654296875, "logps/rejected": -258.7533365885417, "loss": 0.4978, "rewards/chosen": 0.008234098553657532, "rewards/margins": 0.017876255015532173, "rewards/rejected": -0.009642156461874643, "step": 95 }, { "epoch": 0.005088384173005062, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63336437.333333336, "logits/rejected": -5256615.5, "logps/chosen": -321.75990804036456, "logps/rejected": -202.23089599609375, "loss": 0.4945, "rewards/chosen": -0.020580163846413296, "rewards/margins": 0.12953885768850645, "rewards/rejected": -0.15011902153491974, "step": 96 }, { "epoch": 0.005141388174807198, "grad_norm": 96.5, "kl": 0.12411117553710938, "learning_rate": 5e-07, "logits/chosen": -59467721.14285714, "logits/rejected": -5370276.0, "logps/chosen": -635.7439313616071, "logps/rejected": -56.99143981933594, "loss": 0.5081, "rewards/chosen": -0.047311948878424506, "rewards/margins": 0.024239567773682733, "rewards/rejected": -0.07155151665210724, "step": 97 }, { "epoch": 0.005194392176609334, "grad_norm": 81.5, "kl": 0.43912506103515625, "learning_rate": 5e-07, "logits/chosen": -20825048.0, "logits/rejected": -3980985.75, "logps/chosen": -410.7992350260417, "logps/rejected": -249.716796875, "loss": 0.4932, "rewards/chosen": 0.035314430793126426, "rewards/margins": 0.21428400774796805, "rewards/rejected": -0.1789695769548416, "step": 98 }, { "epoch": 0.00524739617841147, "grad_norm": 68.0, "kl": 0.09517860412597656, "learning_rate": 5e-07, "logits/chosen": -15053085.0, "logits/rejected": -47553184.0, "logps/chosen": -190.33351135253906, "logps/rejected": -429.10382080078125, "loss": 0.4715, "rewards/chosen": 0.07568226009607315, "rewards/margins": 0.2461896911263466, "rewards/rejected": -0.17050743103027344, "step": 99 }, { "epoch": 0.0053004001802136064, "grad_norm": 98.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20983996.8, "logits/rejected": -13960036.0, "logps/chosen": -728.4486328125, "logps/rejected": -183.66007486979166, "loss": 0.4787, "rewards/chosen": 0.09491775631904602, "rewards/margins": 0.16423386037349702, "rewards/rejected": -0.06931610405445099, "step": 100 }, { "epoch": 0.005353404182015742, "grad_norm": 51.25, "kl": 0.0054149627685546875, "learning_rate": 5e-07, "logits/chosen": -3766709.6666666665, "logits/rejected": 4953812.8, "logps/chosen": -78.00725809733073, "logps/rejected": -190.9758056640625, "loss": 0.4886, "rewards/chosen": 0.009764734655618668, "rewards/margins": 0.07581347003579139, "rewards/rejected": -0.06604873538017272, "step": 101 }, { "epoch": 0.005406408183817878, "grad_norm": 68.0, "kl": 0.0503692626953125, "learning_rate": 5e-07, "logits/chosen": 957756.5, "logits/rejected": -18581712.0, "logps/chosen": -141.10453287760416, "logps/rejected": -306.6376220703125, "loss": 0.4838, "rewards/chosen": -0.04482422272364298, "rewards/margins": 0.0864178498586019, "rewards/rejected": -0.13124207258224488, "step": 102 }, { "epoch": 0.0054594121856200144, "grad_norm": 91.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26237078.0, "logits/rejected": -10410398.666666666, "logps/chosen": -437.1822509765625, "logps/rejected": -438.9827473958333, "loss": 0.4726, "rewards/chosen": 0.037101078778505325, "rewards/margins": 0.1734416770438353, "rewards/rejected": -0.13634059826533, "step": 103 }, { "epoch": 0.005512416187422151, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44529109.333333336, "logits/rejected": -15319670.4, "logps/chosen": -446.1836751302083, "logps/rejected": -238.2855712890625, "loss": 0.4799, "rewards/chosen": 0.060292561848958336, "rewards/margins": 0.15266462167104086, "rewards/rejected": -0.09237205982208252, "step": 104 }, { "epoch": 0.005565420189224286, "grad_norm": 61.25, "kl": 0.018365859985351562, "learning_rate": 5e-07, "logits/chosen": -35307336.0, "logits/rejected": -38050339.2, "logps/chosen": -146.45596313476562, "logps/rejected": -296.356396484375, "loss": 0.4786, "rewards/chosen": 0.017804082483053207, "rewards/margins": 0.14454563483595848, "rewards/rejected": -0.12674155235290527, "step": 105 }, { "epoch": 0.0056184241910264224, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -117822944.0, "logits/rejected": -35751036.8, "logps/chosen": -385.8606770833333, "logps/rejected": -531.189697265625, "loss": 0.4691, "rewards/chosen": 0.0377787301937739, "rewards/margins": 0.21546657780806222, "rewards/rejected": -0.17768784761428832, "step": 106 }, { "epoch": 0.005671428192828559, "grad_norm": 52.75, "kl": 0.012660980224609375, "learning_rate": 5e-07, "logits/chosen": -42448088.0, "logits/rejected": -15747180.0, "logps/chosen": -244.87960815429688, "logps/rejected": -79.21749877929688, "loss": 0.4897, "rewards/chosen": 0.04024696350097656, "rewards/margins": 0.08236866071820259, "rewards/rejected": -0.04212169721722603, "step": 107 }, { "epoch": 0.005724432194630695, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42640144.0, "logits/rejected": 7505182.4, "logps/chosen": -208.74763997395834, "logps/rejected": -359.06318359375, "loss": 0.4645, "rewards/chosen": 0.04194425046443939, "rewards/margins": 0.24516843259334564, "rewards/rejected": -0.20322418212890625, "step": 108 }, { "epoch": 0.0057774361964328304, "grad_norm": 103.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35313253.333333336, "logits/rejected": -65360620.8, "logps/chosen": -433.5913492838542, "logps/rejected": -429.24384765625, "loss": 0.4484, "rewards/chosen": 0.058449303110440574, "rewards/margins": 0.36964460512002306, "rewards/rejected": -0.3111953020095825, "step": 109 }, { "epoch": 0.005830440198234967, "grad_norm": 70.0, "kl": 0.021846771240234375, "learning_rate": 5e-07, "logits/chosen": -29509610.666666668, "logits/rejected": -15806457.6, "logps/chosen": -402.3097330729167, "logps/rejected": -210.5528076171875, "loss": 0.4771, "rewards/chosen": -0.04159844915072123, "rewards/margins": 0.13067969282468161, "rewards/rejected": -0.17227814197540284, "step": 110 }, { "epoch": 0.005883444200037103, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31692323.2, "logits/rejected": -36139898.666666664, "logps/chosen": -349.824560546875, "logps/rejected": -461.13671875, "loss": 0.4648, "rewards/chosen": 0.023840637505054475, "rewards/margins": 0.36946341544389727, "rewards/rejected": -0.3456227779388428, "step": 111 }, { "epoch": 0.0059364482018392385, "grad_norm": 85.5, "kl": 0.08911895751953125, "learning_rate": 5e-07, "logits/chosen": -42934144.0, "logits/rejected": -25877348.0, "logps/chosen": -564.5782470703125, "logps/rejected": -257.34686279296875, "loss": 0.4854, "rewards/chosen": -0.0044036866165697575, "rewards/margins": 0.11722069373354316, "rewards/rejected": -0.12162438035011292, "step": 112 }, { "epoch": 0.005989452203641375, "grad_norm": 79.5, "kl": 0.00666046142578125, "learning_rate": 5e-07, "logits/chosen": -58436500.0, "logits/rejected": -44113402.666666664, "logps/chosen": -1008.1248779296875, "logps/rejected": -204.0950927734375, "loss": 0.4747, "rewards/chosen": 0.04736023023724556, "rewards/margins": 0.16580810025334358, "rewards/rejected": -0.11844787001609802, "step": 113 }, { "epoch": 0.006042456205443511, "grad_norm": 66.0, "kl": 0.037944793701171875, "learning_rate": 5e-07, "logits/chosen": -36788504.0, "logits/rejected": -13452446.666666666, "logps/chosen": -360.9737548828125, "logps/rejected": -204.62322998046875, "loss": 0.4794, "rewards/chosen": 0.02801818773150444, "rewards/margins": 0.1293599121272564, "rewards/rejected": -0.10134172439575195, "step": 114 }, { "epoch": 0.006095460207245647, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65016505.6, "logits/rejected": -28570954.666666668, "logps/chosen": -421.519140625, "logps/rejected": -309.8743082682292, "loss": 0.4693, "rewards/chosen": -0.003103562444448471, "rewards/margins": 0.3334092018504937, "rewards/rejected": -0.3365127642949422, "step": 115 }, { "epoch": 0.006148464209047783, "grad_norm": 78.5, "kl": 0.173736572265625, "learning_rate": 5e-07, "logits/chosen": -56941734.4, "logits/rejected": -11358072.0, "logps/chosen": -460.662158203125, "logps/rejected": -380.4465738932292, "loss": 0.4677, "rewards/chosen": 0.13259804248809814, "rewards/margins": 0.25751742720603943, "rewards/rejected": -0.12491938471794128, "step": 116 }, { "epoch": 0.006201468210849919, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66874026.666666664, "logits/rejected": 3411719.2, "logps/chosen": -416.5406087239583, "logps/rejected": -225.025732421875, "loss": 0.4919, "rewards/chosen": -0.042465592424074806, "rewards/margins": 0.03469657798608144, "rewards/rejected": -0.07716217041015624, "step": 117 }, { "epoch": 0.006254472212652055, "grad_norm": 88.0, "kl": 0.050617218017578125, "learning_rate": 5e-07, "logits/chosen": -108694080.0, "logits/rejected": -28329331.2, "logps/chosen": -625.019287109375, "logps/rejected": -348.55986328125, "loss": 0.4996, "rewards/chosen": -0.0917470355828603, "rewards/margins": -0.026172234614690135, "rewards/rejected": -0.06557480096817017, "step": 118 }, { "epoch": 0.006307476214454192, "grad_norm": 89.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9676614.4, "logits/rejected": 18270142.666666668, "logps/chosen": -283.9339111328125, "logps/rejected": -679.5097249348959, "loss": 0.4758, "rewards/chosen": -0.00010894797742366791, "rewards/margins": 0.2601775222768386, "rewards/rejected": -0.26028647025426227, "step": 119 }, { "epoch": 0.006360480216256327, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16979862.4, "logits/rejected": -17621922.666666668, "logps/chosen": -192.06982421875, "logps/rejected": -256.5340169270833, "loss": 0.481, "rewards/chosen": 0.02235824763774872, "rewards/margins": 0.1886083573102951, "rewards/rejected": -0.1662501096725464, "step": 120 }, { "epoch": 0.006413484218058463, "grad_norm": 98.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61785976.0, "logits/rejected": -41760411.428571425, "logps/chosen": -186.3748321533203, "logps/rejected": -446.7850864955357, "loss": 0.4589, "rewards/chosen": -0.03685302659869194, "rewards/margins": 0.1573029729936804, "rewards/rejected": -0.19415599959237234, "step": 121 }, { "epoch": 0.0064664882198606, "grad_norm": 69.0, "kl": 0.03118896484375, "learning_rate": 5e-07, "logits/chosen": -7223012.666666667, "logits/rejected": -10741776.0, "logps/chosen": -235.41156005859375, "logps/rejected": -257.4656677246094, "loss": 0.4938, "rewards/chosen": 0.023766296605269115, "rewards/margins": 0.06447980056206386, "rewards/rejected": -0.04071350395679474, "step": 122 }, { "epoch": 0.006519492221662736, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37150948.0, "logits/rejected": -30505358.0, "logps/chosen": -494.2452697753906, "logps/rejected": -335.93548583984375, "loss": 0.4657, "rewards/chosen": -0.0062210094183683395, "rewards/margins": 0.2777374852448702, "rewards/rejected": -0.2839584946632385, "step": 123 }, { "epoch": 0.006572496223464871, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47642392.0, "logits/rejected": -3161964.25, "logps/chosen": -501.87713623046875, "logps/rejected": -233.31979370117188, "loss": 0.481, "rewards/chosen": 0.004676055163145065, "rewards/margins": 0.15265903994441032, "rewards/rejected": -0.14798298478126526, "step": 124 }, { "epoch": 0.006625500225267008, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6089031.333333333, "logits/rejected": -14547537.6, "logps/chosen": -127.27541097005208, "logps/rejected": -159.0401611328125, "loss": 0.4814, "rewards/chosen": 0.005366771171490352, "rewards/margins": 0.1213789629439513, "rewards/rejected": -0.11601219177246094, "step": 125 }, { "epoch": 0.006678504227069144, "grad_norm": 95.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23789389.333333332, "logits/rejected": 86781708.8, "logps/chosen": -199.1682332356771, "logps/rejected": -511.78759765625, "loss": 0.4658, "rewards/chosen": -0.01905619353055954, "rewards/margins": 0.2132045581936836, "rewards/rejected": -0.23226075172424315, "step": 126 }, { "epoch": 0.00673150822887128, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9599990.0, "logits/rejected": -119308512.0, "logps/chosen": -395.40625, "logps/rejected": -398.7372131347656, "loss": 0.4737, "rewards/chosen": 0.005112458020448685, "rewards/margins": 0.2121231146156788, "rewards/rejected": -0.2070106565952301, "step": 127 }, { "epoch": 0.006784512230673416, "grad_norm": 47.25, "kl": 0.04315757751464844, "learning_rate": 5e-07, "logits/chosen": -23412814.4, "logits/rejected": -12976021.333333334, "logps/chosen": -152.96776123046874, "logps/rejected": -172.7933349609375, "loss": 0.4821, "rewards/chosen": 0.025151211023330688, "rewards/margins": 0.17400349974632262, "rewards/rejected": -0.14885228872299194, "step": 128 }, { "epoch": 0.006837516232475552, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12144724.0, "logits/rejected": -37980294.4, "logps/chosen": -249.58648681640625, "logps/rejected": -351.218603515625, "loss": 0.4794, "rewards/chosen": -0.043694496154785156, "rewards/margins": 0.11518261432647706, "rewards/rejected": -0.1588771104812622, "step": 129 }, { "epoch": 0.006890520234277688, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46869813.333333336, "logits/rejected": -13668963.0, "logps/chosen": -268.4782307942708, "logps/rejected": -68.7930679321289, "loss": 0.4994, "rewards/chosen": -0.018105190247297287, "rewards/margins": 0.04523499682545662, "rewards/rejected": -0.0633401870727539, "step": 130 }, { "epoch": 0.006943524236079824, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42351792.0, "logits/rejected": -50634560.0, "logps/chosen": -99.1216812133789, "logps/rejected": -260.1338195800781, "loss": 0.4841, "rewards/chosen": -0.04162192344665527, "rewards/margins": 0.12779316306114197, "rewards/rejected": -0.16941508650779724, "step": 131 }, { "epoch": 0.00699652823788196, "grad_norm": 44.25, "kl": 0.14606475830078125, "learning_rate": 5e-07, "logits/chosen": -13328627.0, "logits/rejected": -19189240.0, "logps/chosen": -183.91659545898438, "logps/rejected": -103.45611572265625, "loss": 0.4934, "rewards/chosen": 0.013442229479551315, "rewards/margins": 0.05259713903069496, "rewards/rejected": -0.039154909551143646, "step": 132 }, { "epoch": 0.007049532239684096, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22674684.0, "logits/rejected": -11585045.333333334, "logps/chosen": -378.1092529296875, "logps/rejected": -256.28428141276044, "loss": 0.4643, "rewards/chosen": -0.0009689321741461754, "rewards/margins": 0.1914522855853041, "rewards/rejected": -0.19242121775945029, "step": 133 }, { "epoch": 0.0071025362414862325, "grad_norm": 66.5, "kl": 0.09212684631347656, "learning_rate": 5e-07, "logits/chosen": -31981386.666666668, "logits/rejected": -7401659.5, "logps/chosen": -346.7313232421875, "logps/rejected": -143.0607147216797, "loss": 0.4906, "rewards/chosen": -0.010650633523861567, "rewards/margins": 0.20969963197906813, "rewards/rejected": -0.2203502655029297, "step": 134 }, { "epoch": 0.007155540243288368, "grad_norm": 75.5, "kl": 0.218292236328125, "learning_rate": 5e-07, "logits/chosen": -6471497.6, "logits/rejected": -28061224.0, "logps/chosen": -413.524072265625, "logps/rejected": -349.37939453125, "loss": 0.4935, "rewards/chosen": -0.07304885983467102, "rewards/margins": 0.17727920413017273, "rewards/rejected": -0.25032806396484375, "step": 135 }, { "epoch": 0.007208544245090504, "grad_norm": 52.0, "kl": 0.024672508239746094, "learning_rate": 5e-07, "logits/chosen": -8826140.8, "logits/rejected": -31116877.333333332, "logps/chosen": -174.340869140625, "logps/rejected": -163.35112508138022, "loss": 0.4896, "rewards/chosen": -0.0008294671773910522, "rewards/margins": 0.11164128879706066, "rewards/rejected": -0.1124707559744517, "step": 136 }, { "epoch": 0.0072615482468926405, "grad_norm": 69.5, "kl": 0.11832427978515625, "learning_rate": 5e-07, "logits/chosen": -27660291.2, "logits/rejected": -3508509.3333333335, "logps/chosen": -297.814111328125, "logps/rejected": -61.91020711263021, "loss": 0.4911, "rewards/chosen": 0.050965577363967896, "rewards/margins": 0.06107249048848947, "rewards/rejected": -0.010106913124521574, "step": 137 }, { "epoch": 0.007314552248694777, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52572860.8, "logits/rejected": -26562805.333333332, "logps/chosen": -276.402880859375, "logps/rejected": -315.19384765625, "loss": 0.4844, "rewards/chosen": 0.05871063470840454, "rewards/margins": 0.12727346519629162, "rewards/rejected": -0.06856283048788707, "step": 138 }, { "epoch": 0.007367556250496912, "grad_norm": 92.0, "kl": 0.07004165649414062, "learning_rate": 5e-07, "logits/chosen": -45893696.0, "logits/rejected": -23039097.6, "logps/chosen": -413.92724609375, "logps/rejected": -538.99560546875, "loss": 0.4559, "rewards/chosen": 0.06489766637484233, "rewards/margins": 0.3230244974295298, "rewards/rejected": -0.2581268310546875, "step": 139 }, { "epoch": 0.0074205602522990485, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7031136.5, "logits/rejected": -21790706.285714287, "logps/chosen": -241.96908569335938, "logps/rejected": -304.67665318080356, "loss": 0.4579, "rewards/chosen": 0.07597046345472336, "rewards/margins": 0.25948034120457514, "rewards/rejected": -0.18350987774985178, "step": 140 }, { "epoch": 0.007473564254101185, "grad_norm": 65.0, "kl": 0.004815101623535156, "learning_rate": 5e-07, "logits/chosen": -21863273.14285714, "logits/rejected": 2879925.0, "logps/chosen": -236.17063685825892, "logps/rejected": -66.67863464355469, "loss": 0.4945, "rewards/chosen": 0.015143475362232752, "rewards/margins": 0.0895987365927015, "rewards/rejected": -0.07445526123046875, "step": 141 }, { "epoch": 0.007526568255903321, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8001958.5, "logits/rejected": -7163849.333333333, "logps/chosen": -246.37750244140625, "logps/rejected": -283.61887613932294, "loss": 0.4668, "rewards/chosen": 0.00164203648455441, "rewards/margins": 0.17929914399671057, "rewards/rejected": -0.17765710751215616, "step": 142 }, { "epoch": 0.0075795722577054565, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15880014.666666666, "logits/rejected": -28644582.4, "logps/chosen": -218.2934366861979, "logps/rejected": -304.3017578125, "loss": 0.4676, "rewards/chosen": 0.012786356111367544, "rewards/margins": 0.21345607390006385, "rewards/rejected": -0.2006697177886963, "step": 143 }, { "epoch": 0.007632576259507593, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -120727893.33333333, "logits/rejected": -36007692.8, "logps/chosen": -855.1647135416666, "logps/rejected": -191.595068359375, "loss": 0.4611, "rewards/chosen": 0.11899465322494507, "rewards/margins": 0.2982691049575806, "rewards/rejected": -0.1792744517326355, "step": 144 }, { "epoch": 0.007685580261309729, "grad_norm": 71.0, "kl": 0.010190963745117188, "learning_rate": 5e-07, "logits/chosen": -33721852.0, "logits/rejected": -43633988.0, "logps/chosen": -258.3011474609375, "logps/rejected": -311.761474609375, "loss": 0.4576, "rewards/chosen": -0.0009996425360441208, "rewards/margins": 0.3440083432942629, "rewards/rejected": -0.345007985830307, "step": 145 }, { "epoch": 0.007738584263111865, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7178117.333333333, "logits/rejected": -23050910.0, "logps/chosen": -263.76991780598956, "logps/rejected": -211.27777099609375, "loss": 0.4892, "rewards/chosen": 0.008930745224157969, "rewards/margins": 0.15496270606915155, "rewards/rejected": -0.1460319608449936, "step": 146 }, { "epoch": 0.007791588264914001, "grad_norm": 84.5, "kl": 0.05232048034667969, "learning_rate": 5e-07, "logits/chosen": -12371356.0, "logits/rejected": -51958892.0, "logps/chosen": -266.03570556640625, "logps/rejected": -448.898681640625, "loss": 0.4581, "rewards/chosen": 0.06487178802490234, "rewards/margins": 0.3515775799751282, "rewards/rejected": -0.28670579195022583, "step": 147 }, { "epoch": 0.007844592266716136, "grad_norm": 66.0, "kl": 0.017160415649414062, "learning_rate": 5e-07, "logits/chosen": -8662536.0, "logits/rejected": 5354660.666666667, "logps/chosen": -131.04716796875, "logps/rejected": -353.3854166666667, "loss": 0.4795, "rewards/chosen": 0.00866546481847763, "rewards/margins": 0.21738453954458237, "rewards/rejected": -0.20871907472610474, "step": 148 }, { "epoch": 0.007897596268518273, "grad_norm": 67.5, "kl": 0.013696670532226562, "learning_rate": 5e-07, "logits/chosen": -19697094.0, "logits/rejected": -23649944.0, "logps/chosen": -485.2366638183594, "logps/rejected": -201.96926879882812, "loss": 0.4788, "rewards/chosen": -0.04928627610206604, "rewards/margins": 0.17340508103370667, "rewards/rejected": -0.2226913571357727, "step": 149 }, { "epoch": 0.007950600270320409, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42268516.571428575, "logits/rejected": -17162206.0, "logps/chosen": -322.70804268973217, "logps/rejected": -202.46746826171875, "loss": 0.4876, "rewards/chosen": 0.03244737642151969, "rewards/margins": 0.2035380836044039, "rewards/rejected": -0.17109070718288422, "step": 150 }, { "epoch": 0.008003604272122546, "grad_norm": 88.0, "kl": 0.0669403076171875, "learning_rate": 5e-07, "logits/chosen": -11933535.0, "logits/rejected": -35419860.0, "logps/chosen": -590.6748046875, "logps/rejected": -288.53594970703125, "loss": 0.4758, "rewards/chosen": -0.06831836700439453, "rewards/margins": 0.2142522931098938, "rewards/rejected": -0.28257066011428833, "step": 151 }, { "epoch": 0.008056608273924681, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22471275.2, "logits/rejected": 905068.5, "logps/chosen": -239.515283203125, "logps/rejected": -72.92588297526042, "loss": 0.4931, "rewards/chosen": -0.00887497067451477, "rewards/margins": 0.07964876492818196, "rewards/rejected": -0.08852373560269673, "step": 152 }, { "epoch": 0.008109612275726817, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21909794.666666668, "logits/rejected": -1559859.4, "logps/chosen": -314.65777587890625, "logps/rejected": -261.8334716796875, "loss": 0.4773, "rewards/chosen": -0.005164082472523053, "rewards/margins": 0.1440563483784596, "rewards/rejected": -0.14922043085098266, "step": 153 }, { "epoch": 0.008162616277528954, "grad_norm": 85.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62422084.0, "logits/rejected": -29199094.85714286, "logps/chosen": -441.62738037109375, "logps/rejected": -316.68418666294644, "loss": 0.4633, "rewards/chosen": 0.01817016676068306, "rewards/margins": 0.18430290637271746, "rewards/rejected": -0.1661327396120344, "step": 154 }, { "epoch": 0.00821562027933109, "grad_norm": 67.0, "kl": 0.0895853042602539, "learning_rate": 5e-07, "logits/chosen": 106233.0, "logits/rejected": -18409708.8, "logps/chosen": -400.4478759765625, "logps/rejected": -181.12393798828126, "loss": 0.4616, "rewards/chosen": 0.1084669828414917, "rewards/margins": 0.2940208077430725, "rewards/rejected": -0.18555382490158082, "step": 155 }, { "epoch": 0.008268624281133225, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47686762.666666664, "logits/rejected": -12786976.8, "logps/chosen": -403.9690348307292, "logps/rejected": -302.058447265625, "loss": 0.4563, "rewards/chosen": 0.02275199939807256, "rewards/margins": 0.2912947898109754, "rewards/rejected": -0.26854279041290285, "step": 156 }, { "epoch": 0.008321628282935362, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44351984.0, "logits/rejected": -30483269.333333332, "logps/chosen": -251.84384765625, "logps/rejected": -298.9681803385417, "loss": 0.4699, "rewards/chosen": 0.0413189709186554, "rewards/margins": 0.2950218896071116, "rewards/rejected": -0.25370291868845624, "step": 157 }, { "epoch": 0.008374632284737497, "grad_norm": 69.0, "kl": 0.0163421630859375, "learning_rate": 5e-07, "logits/chosen": -62690026.666666664, "logits/rejected": -58237980.0, "logps/chosen": -320.5543212890625, "logps/rejected": -509.01251220703125, "loss": 0.4818, "rewards/chosen": -0.01324183369676272, "rewards/margins": 0.3277646241088708, "rewards/rejected": -0.34100645780563354, "step": 158 }, { "epoch": 0.008427636286539635, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7040874.0, "logits/rejected": -5520134.5, "logps/chosen": -381.734375, "logps/rejected": -65.00360107421875, "loss": 0.4789, "rewards/chosen": 0.03864574804902077, "rewards/margins": 0.16975289955735207, "rewards/rejected": -0.1311071515083313, "step": 159 }, { "epoch": 0.00848064028834177, "grad_norm": 68.5, "kl": 0.375, "learning_rate": 5e-07, "logits/chosen": -59545685.333333336, "logits/rejected": -24630430.4, "logps/chosen": -386.0242513020833, "logps/rejected": -196.214306640625, "loss": 0.4719, "rewards/chosen": 0.02799466500679652, "rewards/margins": 0.2523834074536959, "rewards/rejected": -0.2243887424468994, "step": 160 }, { "epoch": 0.008533644290143905, "grad_norm": 91.5, "kl": 0.016129493713378906, "learning_rate": 5e-07, "logits/chosen": -5050259.333333333, "logits/rejected": -8354294.4, "logps/chosen": -107.43712361653645, "logps/rejected": -248.0773681640625, "loss": 0.4765, "rewards/chosen": -0.018286450455586117, "rewards/margins": 0.14610199158390363, "rewards/rejected": -0.16438844203948974, "step": 161 }, { "epoch": 0.008586648291946043, "grad_norm": 63.0, "kl": 0.04361724853515625, "learning_rate": 5e-07, "logits/chosen": -1070479.375, "logits/rejected": -13523897.142857144, "logps/chosen": -196.439453125, "logps/rejected": -148.50373186383928, "loss": 0.4659, "rewards/chosen": -0.01227417029440403, "rewards/margins": 0.14597156351166113, "rewards/rejected": -0.15824573380606516, "step": 162 }, { "epoch": 0.008639652293748178, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45298924.0, "logits/rejected": -29348412.0, "logps/chosen": -332.6632995605469, "logps/rejected": -230.5148162841797, "loss": 0.4787, "rewards/chosen": -0.0136629119515419, "rewards/margins": 0.17164801806211472, "rewards/rejected": -0.18531093001365662, "step": 163 }, { "epoch": 0.008692656295550313, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23301205.333333332, "logits/rejected": -21114118.4, "logps/chosen": -340.5160725911458, "logps/rejected": -207.9490478515625, "loss": 0.4741, "rewards/chosen": -0.013093565901120504, "rewards/margins": 0.16229636768500008, "rewards/rejected": -0.1753899335861206, "step": 164 }, { "epoch": 0.00874566029735245, "grad_norm": 60.0, "kl": 0.057700157165527344, "learning_rate": 5e-07, "logits/chosen": -2635720.1666666665, "logits/rejected": 185781.03125, "logps/chosen": -245.7386678059896, "logps/rejected": -46.25695037841797, "loss": 0.5129, "rewards/chosen": -0.08429606755574544, "rewards/margins": -0.016290215154488877, "rewards/rejected": -0.06800585240125656, "step": 165 }, { "epoch": 0.008798664299154586, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73961168.0, "logits/rejected": -38346864.0, "logps/chosen": -442.900390625, "logps/rejected": -261.98931884765625, "loss": 0.4828, "rewards/chosen": 0.04196891933679581, "rewards/margins": 0.13798070698976517, "rewards/rejected": -0.09601178765296936, "step": 166 }, { "epoch": 0.008851668300956721, "grad_norm": 55.5, "kl": 0.06264495849609375, "learning_rate": 5e-07, "logits/chosen": -12727675.0, "logits/rejected": -44920804.0, "logps/chosen": -111.06575012207031, "logps/rejected": -256.11431884765625, "loss": 0.4739, "rewards/chosen": 0.03583207353949547, "rewards/margins": 0.21042467281222343, "rewards/rejected": -0.17459259927272797, "step": 167 }, { "epoch": 0.008904672302758859, "grad_norm": 96.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32432914.666666668, "logits/rejected": -9923849.6, "logps/chosen": -665.5797526041666, "logps/rejected": -266.785009765625, "loss": 0.4653, "rewards/chosen": -0.02766519784927368, "rewards/margins": 0.214826762676239, "rewards/rejected": -0.2424919605255127, "step": 168 }, { "epoch": 0.008957676304560994, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16157761.333333334, "logits/rejected": -10594167.2, "logps/chosen": -186.87506103515625, "logps/rejected": -217.2740966796875, "loss": 0.4664, "rewards/chosen": -0.024448901414871216, "rewards/margins": 0.20681187510490417, "rewards/rejected": -0.2312607765197754, "step": 169 }, { "epoch": 0.009010680306363131, "grad_norm": 78.5, "kl": 0.28963470458984375, "learning_rate": 5e-07, "logits/chosen": -5583955.6, "logits/rejected": -72434826.66666667, "logps/chosen": -446.77666015625, "logps/rejected": -378.3843587239583, "loss": 0.4621, "rewards/chosen": 0.09522300958633423, "rewards/margins": 0.42839868863423664, "rewards/rejected": -0.3331756790479024, "step": 170 }, { "epoch": 0.009063684308165267, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54578096.0, "logits/rejected": -33309116.0, "logps/chosen": -564.044189453125, "logps/rejected": -318.2647705078125, "loss": 0.4423, "rewards/chosen": 0.10805321484804153, "rewards/margins": 0.46932796388864517, "rewards/rejected": -0.36127474904060364, "step": 171 }, { "epoch": 0.009116688309967402, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13390573.333333334, "logits/rejected": -17770376.0, "logps/chosen": -151.71810913085938, "logps/rejected": -186.121728515625, "loss": 0.4581, "rewards/chosen": 0.08664671579996745, "rewards/margins": 0.30595609347025554, "rewards/rejected": -0.21930937767028807, "step": 172 }, { "epoch": 0.009169692311769539, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18634398.0, "logits/rejected": -12438504.0, "logps/chosen": -382.8365783691406, "logps/rejected": -154.9538116455078, "loss": 0.4707, "rewards/chosen": 0.051278211176395416, "rewards/margins": 0.2355683371424675, "rewards/rejected": -0.18429012596607208, "step": 173 }, { "epoch": 0.009222696313571675, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39317042.666666664, "logits/rejected": -28061730.0, "logps/chosen": -486.622314453125, "logps/rejected": -387.2200622558594, "loss": 0.4761, "rewards/chosen": -0.0015111292401949565, "rewards/margins": 0.3921717380483945, "rewards/rejected": -0.3936828672885895, "step": 174 }, { "epoch": 0.00927570031537381, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4292149.2, "logits/rejected": 8879924.666666666, "logps/chosen": -170.7412353515625, "logps/rejected": -372.078125, "loss": 0.4817, "rewards/chosen": 0.025340193510055543, "rewards/margins": 0.17997137109438577, "rewards/rejected": -0.15463117758433023, "step": 175 }, { "epoch": 0.009328704317175947, "grad_norm": 97.0, "kl": 0.026987075805664062, "learning_rate": 5e-07, "logits/chosen": -67430634.66666667, "logits/rejected": -40629180.0, "logps/chosen": -143.62823486328125, "logps/rejected": -323.6683044433594, "loss": 0.4889, "rewards/chosen": -0.047401746114095054, "rewards/margins": 0.2804790238539378, "rewards/rejected": -0.32788076996803284, "step": 176 }, { "epoch": 0.009381708318978083, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49454392.0, "logits/rejected": -3874976.5, "logps/chosen": -204.65414428710938, "logps/rejected": -146.21566772460938, "loss": 0.4804, "rewards/chosen": 0.005507420748472214, "rewards/margins": 0.15727953240275383, "rewards/rejected": -0.15177211165428162, "step": 177 }, { "epoch": 0.00943471232078022, "grad_norm": 93.5, "kl": 0.03987407684326172, "learning_rate": 5e-07, "logits/chosen": -54086924.8, "logits/rejected": -28586016.0, "logps/chosen": -645.21953125, "logps/rejected": -566.9736735026041, "loss": 0.4484, "rewards/chosen": 0.1224045991897583, "rewards/margins": 0.49052646160125735, "rewards/rejected": -0.368121862411499, "step": 178 }, { "epoch": 0.009487716322582355, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58160565.333333336, "logits/rejected": -24320236.8, "logps/chosen": -252.73164876302084, "logps/rejected": -237.0882568359375, "loss": 0.462, "rewards/chosen": -0.055636594692866005, "rewards/margins": 0.22631685932477316, "rewards/rejected": -0.28195345401763916, "step": 179 }, { "epoch": 0.00954072032438449, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16291330.666666666, "logits/rejected": -39781464.0, "logps/chosen": -399.8694661458333, "logps/rejected": -259.3755187988281, "loss": 0.4785, "rewards/chosen": 0.04739345113436381, "rewards/margins": 0.24952814976374307, "rewards/rejected": -0.20213469862937927, "step": 180 }, { "epoch": 0.009593724326186628, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36902240.0, "logits/rejected": -14103761.6, "logps/chosen": -254.01338704427084, "logps/rejected": -249.3783935546875, "loss": 0.4597, "rewards/chosen": -0.007326254000266393, "rewards/margins": 0.25839974010984107, "rewards/rejected": -0.26572599411010744, "step": 181 }, { "epoch": 0.009646728327988763, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33448632.0, "logits/rejected": -16356572.0, "logps/chosen": -384.92071533203125, "logps/rejected": -267.1578369140625, "loss": 0.4712, "rewards/chosen": -0.00672760047018528, "rewards/margins": 0.23142204247415066, "rewards/rejected": -0.23814964294433594, "step": 182 }, { "epoch": 0.009699732329790899, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42886864.0, "logits/rejected": -11667684.8, "logps/chosen": -295.80047607421875, "logps/rejected": -109.5995361328125, "loss": 0.484, "rewards/chosen": -0.046376546223958336, "rewards/margins": 0.08426630894343057, "rewards/rejected": -0.1306428551673889, "step": 183 }, { "epoch": 0.009752736331593036, "grad_norm": 52.5, "kl": 0.041535377502441406, "learning_rate": 5e-07, "logits/chosen": -11237397.333333334, "logits/rejected": -8035549.0, "logps/chosen": -196.3008829752604, "logps/rejected": -196.43258666992188, "loss": 0.4868, "rewards/chosen": 0.005869420866171519, "rewards/margins": 0.217554601530234, "rewards/rejected": -0.2116851806640625, "step": 184 }, { "epoch": 0.009805740333395171, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7279366.4, "logits/rejected": -9346481.333333334, "logps/chosen": -163.83062744140625, "logps/rejected": -70.2287089029948, "loss": 0.4783, "rewards/chosen": 0.08178657293319702, "rewards/margins": 0.1775860885779063, "rewards/rejected": -0.09579951564470927, "step": 185 }, { "epoch": 0.009858744335197307, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94004138.66666667, "logits/rejected": -51872128.0, "logps/chosen": -590.3675944010416, "logps/rejected": -398.2965087890625, "loss": 0.4733, "rewards/chosen": -0.007401277000705401, "rewards/margins": 0.44973374468584854, "rewards/rejected": -0.45713502168655396, "step": 186 }, { "epoch": 0.009911748336999444, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28146368.0, "logits/rejected": -22299906.666666668, "logps/chosen": -389.1379150390625, "logps/rejected": -182.31575520833334, "loss": 0.4695, "rewards/chosen": 0.0421173095703125, "rewards/margins": 0.30162252187728883, "rewards/rejected": -0.2595052123069763, "step": 187 }, { "epoch": 0.00996475233880158, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26721232.0, "logits/rejected": -59811640.0, "logps/chosen": -354.209716796875, "logps/rejected": -225.01571655273438, "loss": 0.4627, "rewards/chosen": 0.036485958844423294, "rewards/margins": 0.3094303198158741, "rewards/rejected": -0.2729443609714508, "step": 188 }, { "epoch": 0.010017756340603716, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62133412.0, "logits/rejected": -9157100.0, "logps/chosen": -518.8868408203125, "logps/rejected": -112.64855194091797, "loss": 0.4646, "rewards/chosen": 0.07330131530761719, "rewards/margins": 0.29316921532154083, "rewards/rejected": -0.21986790001392365, "step": 189 }, { "epoch": 0.010070760342405852, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14612368.0, "logits/rejected": -30162043.42857143, "logps/chosen": -160.333740234375, "logps/rejected": -352.06162806919644, "loss": 0.4527, "rewards/chosen": -0.02228851430118084, "rewards/margins": 0.20099094483469213, "rewards/rejected": -0.22327945913587297, "step": 190 }, { "epoch": 0.010123764344207987, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24930640.0, "logits/rejected": -11680905.0, "logps/chosen": -250.61618041992188, "logps/rejected": -691.4037475585938, "loss": 0.4214, "rewards/chosen": 0.055348776280879974, "rewards/margins": 0.829714797437191, "rewards/rejected": -0.774366021156311, "step": 191 }, { "epoch": 0.010176768346010124, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26324530.0, "logits/rejected": -17467576.0, "logps/chosen": -128.53932189941406, "logps/rejected": -300.4862060546875, "loss": 0.4752, "rewards/chosen": -0.07591309398412704, "rewards/margins": 0.20093552023172379, "rewards/rejected": -0.27684861421585083, "step": 192 }, { "epoch": 0.01022977234781226, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2189819.6666666665, "logits/rejected": -103945465.6, "logps/chosen": -152.08599853515625, "logps/rejected": -315.4382568359375, "loss": 0.4529, "rewards/chosen": -0.014279811332623163, "rewards/margins": 0.29958768611152964, "rewards/rejected": -0.3138674974441528, "step": 193 }, { "epoch": 0.010282776349614395, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8636503.0, "logits/rejected": -49162160.0, "logps/chosen": -322.6131896972656, "logps/rejected": -279.6025390625, "loss": 0.4644, "rewards/chosen": 0.021471023559570312, "rewards/margins": 0.287484347820282, "rewards/rejected": -0.26601332426071167, "step": 194 }, { "epoch": 0.010335780351416532, "grad_norm": 92.5, "kl": 0.14723968505859375, "learning_rate": 5e-07, "logits/chosen": -23354244.0, "logits/rejected": -18968964.0, "logps/chosen": -762.811767578125, "logps/rejected": -302.7355041503906, "loss": 0.4724, "rewards/chosen": 0.047361090779304504, "rewards/margins": 0.2524675875902176, "rewards/rejected": -0.20510649681091309, "step": 195 }, { "epoch": 0.010388784353218668, "grad_norm": 62.25, "kl": 0.14415740966796875, "learning_rate": 5e-07, "logits/chosen": -23793420.0, "logits/rejected": -20571428.0, "logps/chosen": -216.11830139160156, "logps/rejected": -240.87432861328125, "loss": 0.4873, "rewards/chosen": -0.010664889588952065, "rewards/margins": 0.13098310492932796, "rewards/rejected": -0.14164799451828003, "step": 196 }, { "epoch": 0.010441788355020805, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16570089.0, "logits/rejected": -22986322.0, "logps/chosen": -320.46832275390625, "logps/rejected": -263.1683349609375, "loss": 0.4589, "rewards/chosen": -0.006545448675751686, "rewards/margins": 0.3352819439023733, "rewards/rejected": -0.341827392578125, "step": 197 }, { "epoch": 0.01049479235682294, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21086710.666666668, "logits/rejected": -35873572.0, "logps/chosen": -297.6399739583333, "logps/rejected": -188.3180389404297, "loss": 0.4879, "rewards/chosen": -0.019304083039363224, "rewards/margins": 0.23428593451778093, "rewards/rejected": -0.25359001755714417, "step": 198 }, { "epoch": 0.010547796358625076, "grad_norm": 110.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20892729.14285714, "logits/rejected": -17946156.0, "logps/chosen": -563.3976702008929, "logps/rejected": -202.8084716796875, "loss": 0.5036, "rewards/chosen": -0.04673723663602557, "rewards/margins": 0.16324353430952343, "rewards/rejected": -0.209980770945549, "step": 199 }, { "epoch": 0.010600800360427213, "grad_norm": 106.0, "kl": 0.025879859924316406, "learning_rate": 5e-07, "logits/chosen": 4288366.5, "logits/rejected": 4388008.285714285, "logps/chosen": -40.722782135009766, "logps/rejected": -391.5598842075893, "loss": 0.4434, "rewards/chosen": 0.07161789387464523, "rewards/margins": 0.3233806586691311, "rewards/rejected": -0.2517627647944859, "step": 200 }, { "epoch": 0.010653804362229348, "grad_norm": 59.5, "kl": 0.08112716674804688, "learning_rate": 5e-07, "logits/chosen": -70324730.66666667, "logits/rejected": -7483376.0, "logps/chosen": -324.83705647786456, "logps/rejected": -104.84344482421875, "loss": 0.4817, "rewards/chosen": 0.04127581914265951, "rewards/margins": 0.21178667744000754, "rewards/rejected": -0.17051085829734802, "step": 201 }, { "epoch": 0.010706808364031484, "grad_norm": 71.5, "kl": 0.31627655029296875, "learning_rate": 5e-07, "logits/chosen": -61002080.0, "logits/rejected": -37594764.8, "logps/chosen": -525.0139973958334, "logps/rejected": -287.1287109375, "loss": 0.4507, "rewards/chosen": 0.12824604908625284, "rewards/margins": 0.4207295695940654, "rewards/rejected": -0.2924835205078125, "step": 202 }, { "epoch": 0.010759812365833621, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13936584.0, "logits/rejected": 6312354.8, "logps/chosen": -342.8386637369792, "logps/rejected": -260.44326171875, "loss": 0.4495, "rewards/chosen": -0.07871818542480469, "rewards/margins": 0.3017838954925537, "rewards/rejected": -0.3805020809173584, "step": 203 }, { "epoch": 0.010812816367635756, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40912010.666666664, "logits/rejected": -17326672.0, "logps/chosen": -464.4932047526042, "logps/rejected": -388.570361328125, "loss": 0.4488, "rewards/chosen": 0.04547525942325592, "rewards/margins": 0.35111128389835355, "rewards/rejected": -0.30563602447509763, "step": 204 }, { "epoch": 0.010865820369437892, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106967920.0, "logits/rejected": -20206924.0, "logps/chosen": -348.689453125, "logps/rejected": -214.48825073242188, "loss": 0.4595, "rewards/chosen": 0.022003700956702232, "rewards/margins": 0.3271563556045294, "rewards/rejected": -0.30515265464782715, "step": 205 }, { "epoch": 0.010918824371240029, "grad_norm": 88.0, "kl": 0.085784912109375, "learning_rate": 5e-07, "logits/chosen": -62808518.4, "logits/rejected": -23844181.333333332, "logps/chosen": -351.543212890625, "logps/rejected": -425.4446614583333, "loss": 0.4649, "rewards/chosen": 0.05525757074356079, "rewards/margins": 0.3627984642982483, "rewards/rejected": -0.3075408935546875, "step": 206 }, { "epoch": 0.010971828373042164, "grad_norm": 62.75, "kl": 0.16454124450683594, "learning_rate": 5e-07, "logits/chosen": -20811760.0, "logits/rejected": 6604027.0, "logps/chosen": -308.55702718098956, "logps/rejected": -69.02519226074219, "loss": 0.4951, "rewards/chosen": -0.013177207360665003, "rewards/margins": 0.13182649885614714, "rewards/rejected": -0.14500370621681213, "step": 207 }, { "epoch": 0.011024832374844301, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29995398.4, "logits/rejected": -22010645.333333332, "logps/chosen": -279.0072265625, "logps/rejected": -417.7610677083333, "loss": 0.4501, "rewards/chosen": 0.09282543659210205, "rewards/margins": 0.47973190148671463, "rewards/rejected": -0.3869064648946126, "step": 208 }, { "epoch": 0.011077836376646437, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16230819.0, "logits/rejected": -6219108.0, "logps/chosen": -145.41966247558594, "logps/rejected": -212.61920166015625, "loss": 0.461, "rewards/chosen": -0.010224534198641777, "rewards/margins": 0.32014732249081135, "rewards/rejected": -0.3303718566894531, "step": 209 }, { "epoch": 0.011130840378448572, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4898912.0, "logits/rejected": -22044976.0, "logps/chosen": -193.17164611816406, "logps/rejected": -336.4263000488281, "loss": 0.4744, "rewards/chosen": 0.026673031970858574, "rewards/margins": 0.20699643902480602, "rewards/rejected": -0.18032340705394745, "step": 210 }, { "epoch": 0.01118384438025071, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19931465.333333332, "logits/rejected": -36871161.6, "logps/chosen": -231.24163818359375, "logps/rejected": -386.2158935546875, "loss": 0.4678, "rewards/chosen": -0.01840769499540329, "rewards/margins": 0.20089422017335892, "rewards/rejected": -0.2193019151687622, "step": 211 }, { "epoch": 0.011236848382052845, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 636734.875, "logits/rejected": 2661568.25, "logps/chosen": -117.58097839355469, "logps/rejected": -197.90704345703125, "loss": 0.4725, "rewards/chosen": -0.018796827644109726, "rewards/margins": 0.22233601287007332, "rewards/rejected": -0.24113284051418304, "step": 212 }, { "epoch": 0.01128985238385498, "grad_norm": 74.0, "kl": 0.2894706726074219, "learning_rate": 5e-07, "logits/chosen": -33833640.0, "logits/rejected": -3259414.5, "logps/chosen": -386.8252258300781, "logps/rejected": -236.31298828125, "loss": 0.4519, "rewards/chosen": 0.12657108902931213, "rewards/margins": 0.3877039849758148, "rewards/rejected": -0.2611328959465027, "step": 213 }, { "epoch": 0.011342856385657117, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38008133.333333336, "logits/rejected": -23692704.0, "logps/chosen": -296.735107421875, "logps/rejected": -607.6205444335938, "loss": 0.4523, "rewards/chosen": -0.008110936731100082, "rewards/margins": 0.8615378774702549, "rewards/rejected": -0.869648814201355, "step": 214 }, { "epoch": 0.011395860387459253, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15639737.333333334, "logits/rejected": -38712904.0, "logps/chosen": -318.52378336588544, "logps/rejected": -590.962890625, "loss": 0.4539, "rewards/chosen": 0.0010796884695688884, "rewards/margins": 0.7852380375067393, "rewards/rejected": -0.7841583490371704, "step": 215 }, { "epoch": 0.01144886438926139, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44346648.0, "logits/rejected": -30488052.0, "logps/chosen": -298.8770446777344, "logps/rejected": -228.6018524169922, "loss": 0.4705, "rewards/chosen": 0.02702675200998783, "rewards/margins": 0.2381076905876398, "rewards/rejected": -0.21108093857765198, "step": 216 }, { "epoch": 0.011501868391063525, "grad_norm": 61.25, "kl": 0.014104843139648438, "learning_rate": 5e-07, "logits/chosen": -84999088.0, "logits/rejected": -23836158.0, "logps/chosen": -213.80377197265625, "logps/rejected": -127.70753479003906, "loss": 0.4812, "rewards/chosen": 0.046803414821624756, "rewards/margins": 0.21305975317955017, "rewards/rejected": -0.16625633835792542, "step": 217 }, { "epoch": 0.011554872392865661, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12987020.0, "logits/rejected": -8681344.0, "logps/chosen": -291.154833984375, "logps/rejected": -121.18159993489583, "loss": 0.4886, "rewards/chosen": -0.013806495070457458, "rewards/margins": 0.13111817538738252, "rewards/rejected": -0.14492467045783997, "step": 218 }, { "epoch": 0.011607876394667798, "grad_norm": 98.0, "kl": 0.2056427001953125, "learning_rate": 5e-07, "logits/chosen": -41668068.0, "logits/rejected": 179396112.0, "logps/chosen": -613.4232177734375, "logps/rejected": -227.38217163085938, "loss": 0.4706, "rewards/chosen": -0.029700851067900658, "rewards/margins": 0.2805618289858103, "rewards/rejected": -0.31026268005371094, "step": 219 }, { "epoch": 0.011660880396469933, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54105588.0, "logits/rejected": -17426416.0, "logps/chosen": -55.6715202331543, "logps/rejected": -237.57071940104166, "loss": 0.4396, "rewards/chosen": 0.04441032558679581, "rewards/margins": 0.35908863693475723, "rewards/rejected": -0.3146783113479614, "step": 220 }, { "epoch": 0.011713884398272069, "grad_norm": 50.25, "kl": 0.00091552734375, "learning_rate": 5e-07, "logits/chosen": -7367086.666666667, "logits/rejected": 1296154.5, "logps/chosen": -170.69122314453125, "logps/rejected": -139.46441650390625, "loss": 0.4799, "rewards/chosen": 0.010948116580645243, "rewards/margins": 0.3039794514576594, "rewards/rejected": -0.29303133487701416, "step": 221 }, { "epoch": 0.011766888400074206, "grad_norm": 69.5, "kl": 0.22399139404296875, "learning_rate": 5e-07, "logits/chosen": -33818864.0, "logits/rejected": -62176164.0, "logps/chosen": -284.64486258370533, "logps/rejected": -516.7198486328125, "loss": 0.4624, "rewards/chosen": 0.06557224478040423, "rewards/margins": 1.0668662616184779, "rewards/rejected": -1.0012940168380737, "step": 222 }, { "epoch": 0.011819892401876341, "grad_norm": 70.5, "kl": 0.033966064453125, "learning_rate": 5e-07, "logits/chosen": -36295077.333333336, "logits/rejected": -33619132.8, "logps/chosen": -500.1167805989583, "logps/rejected": -233.295654296875, "loss": 0.4587, "rewards/chosen": 0.025765985250473022, "rewards/margins": 0.2922972857952118, "rewards/rejected": -0.2665313005447388, "step": 223 }, { "epoch": 0.011872896403678477, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10710830.4, "logits/rejected": -46309845.333333336, "logps/chosen": -164.98770751953126, "logps/rejected": -623.9063313802084, "loss": 0.4249, "rewards/chosen": 0.04663802981376648, "rewards/margins": 0.8176280836264292, "rewards/rejected": -0.7709900538126627, "step": 224 }, { "epoch": 0.011925900405480614, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2990855.0, "logits/rejected": -1593528.2, "logps/chosen": -59.17572021484375, "logps/rejected": -180.8207763671875, "loss": 0.4719, "rewards/chosen": 0.026396306852499645, "rewards/margins": 0.19185357938210168, "rewards/rejected": -0.16545727252960205, "step": 225 }, { "epoch": 0.01197890440728275, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10973262.0, "logits/rejected": -21984545.333333332, "logps/chosen": -893.2841186523438, "logps/rejected": -281.7616373697917, "loss": 0.4535, "rewards/chosen": -0.08038330078125, "rewards/margins": 0.19906820853551227, "rewards/rejected": -0.27945150931676227, "step": 226 }, { "epoch": 0.012031908409084887, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22643763.2, "logits/rejected": -1078565.0, "logps/chosen": -167.5926513671875, "logps/rejected": -341.9405924479167, "loss": 0.4718, "rewards/chosen": 0.005058595538139343, "rewards/margins": 0.30167471269766494, "rewards/rejected": -0.29661611715952557, "step": 227 }, { "epoch": 0.012084912410887022, "grad_norm": 94.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -446422.5, "logits/rejected": -70112480.0, "logps/chosen": -262.8088073730469, "logps/rejected": -409.8620198567708, "loss": 0.4373, "rewards/chosen": -0.003384590381756425, "rewards/margins": 0.34236089365246397, "rewards/rejected": -0.3457454840342204, "step": 228 }, { "epoch": 0.012137916412689157, "grad_norm": 74.5, "kl": 0.1107177734375, "learning_rate": 5e-07, "logits/chosen": -11020858.0, "logits/rejected": -35227577.6, "logps/chosen": -529.1941731770834, "logps/rejected": -287.282470703125, "loss": 0.4312, "rewards/chosen": 0.08266143997510274, "rewards/margins": 0.4911364575227101, "rewards/rejected": -0.4084750175476074, "step": 229 }, { "epoch": 0.012190920414491295, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22217852.0, "logits/rejected": -32842480.0, "logps/chosen": -157.04238891601562, "logps/rejected": -244.0919677734375, "loss": 0.4555, "rewards/chosen": 7.985532283782959e-05, "rewards/margins": 0.2929403394460678, "rewards/rejected": -0.29286048412322996, "step": 230 }, { "epoch": 0.01224392441629343, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4729172.0, "logits/rejected": -31279360.0, "logps/chosen": -79.53651428222656, "logps/rejected": -429.1558837890625, "loss": 0.4016, "rewards/chosen": -0.03650627285242081, "rewards/margins": 0.5327313169836998, "rewards/rejected": -0.5692375898361206, "step": 231 }, { "epoch": 0.012296928418095565, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26449894.85714286, "logits/rejected": -25017296.0, "logps/chosen": -249.08879743303572, "logps/rejected": -406.3746032714844, "loss": 0.4819, "rewards/chosen": -0.028425816978727068, "rewards/margins": 0.7932874517781394, "rewards/rejected": -0.8217132687568665, "step": 232 }, { "epoch": 0.012349932419897703, "grad_norm": 66.5, "kl": 0.14812850952148438, "learning_rate": 5e-07, "logits/chosen": -23940454.0, "logits/rejected": -27416008.0, "logps/chosen": -143.9805908203125, "logps/rejected": -326.61199951171875, "loss": 0.4579, "rewards/chosen": 0.006283331662416458, "rewards/margins": 0.34330855682492256, "rewards/rejected": -0.3370252251625061, "step": 233 }, { "epoch": 0.012402936421699838, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34550448.0, "logits/rejected": -32090504.0, "logps/chosen": -178.62884521484375, "logps/rejected": -165.77469889322916, "loss": 0.468, "rewards/chosen": 0.06443935632705688, "rewards/margins": 0.30371751387914026, "rewards/rejected": -0.23927815755208334, "step": 234 }, { "epoch": 0.012455940423501975, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16834216.0, "logits/rejected": -19317171.2, "logps/chosen": -155.56004842122397, "logps/rejected": -322.845166015625, "loss": 0.4598, "rewards/chosen": 0.08502986033757527, "rewards/margins": 0.2928579290707906, "rewards/rejected": -0.20782806873321533, "step": 235 }, { "epoch": 0.01250894442530411, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23688380.0, "logits/rejected": -56778900.0, "logps/chosen": -564.720703125, "logps/rejected": -503.4051208496094, "loss": 0.4594, "rewards/chosen": -0.045243073254823685, "rewards/margins": 0.33143578097224236, "rewards/rejected": -0.37667885422706604, "step": 236 }, { "epoch": 0.012561948427106246, "grad_norm": 68.0, "kl": 0.017253875732421875, "learning_rate": 5e-07, "logits/chosen": -36196746.666666664, "logits/rejected": 89524364.8, "logps/chosen": -366.8970947265625, "logps/rejected": -313.760302734375, "loss": 0.4591, "rewards/chosen": 0.07116038103898366, "rewards/margins": 0.29362484713395437, "rewards/rejected": -0.2224644660949707, "step": 237 }, { "epoch": 0.012614952428908383, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63175093.333333336, "logits/rejected": -13880003.2, "logps/chosen": -402.6424560546875, "logps/rejected": -309.202392578125, "loss": 0.4556, "rewards/chosen": -0.01901194453239441, "rewards/margins": 0.28017125725746156, "rewards/rejected": -0.29918320178985597, "step": 238 }, { "epoch": 0.012667956430710519, "grad_norm": 90.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51581653.333333336, "logits/rejected": 46807692.8, "logps/chosen": -334.8542073567708, "logps/rejected": -374.0042724609375, "loss": 0.4261, "rewards/chosen": 0.016090897222359974, "rewards/margins": 0.4976903234918912, "rewards/rejected": -0.48159942626953123, "step": 239 }, { "epoch": 0.012720960432512654, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22956072.0, "logits/rejected": -29310732.0, "logps/chosen": -221.98239135742188, "logps/rejected": -144.1638946533203, "loss": 0.4625, "rewards/chosen": 0.06061401590704918, "rewards/margins": 0.3041440211236477, "rewards/rejected": -0.2435300052165985, "step": 240 }, { "epoch": 0.012773964434314791, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37485030.4, "logits/rejected": -35189546.666666664, "logps/chosen": -257.291455078125, "logps/rejected": -165.1294962565104, "loss": 0.4948, "rewards/chosen": -0.04349777102470398, "rewards/margins": 0.084178626537323, "rewards/rejected": -0.12767639756202698, "step": 241 }, { "epoch": 0.012826968436116927, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18756994.666666668, "logits/rejected": -34877352.0, "logps/chosen": -315.58123779296875, "logps/rejected": -246.90016174316406, "loss": 0.4767, "rewards/chosen": 0.01340090607603391, "rewards/margins": 0.35184162234266597, "rewards/rejected": -0.3384407162666321, "step": 242 }, { "epoch": 0.012879972437919062, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24432140.8, "logits/rejected": -16543670.666666666, "logps/chosen": -377.5432861328125, "logps/rejected": -235.17154947916666, "loss": 0.4721, "rewards/chosen": -0.0247745156288147, "rewards/margins": 0.321756915251414, "rewards/rejected": -0.3465314308802287, "step": 243 }, { "epoch": 0.0129329764397212, "grad_norm": 97.5, "kl": 0.7881927490234375, "learning_rate": 5e-07, "logits/chosen": -9125038.666666666, "logits/rejected": -23030220.8, "logps/chosen": -1027.35986328125, "logps/rejected": -226.1473876953125, "loss": 0.4428, "rewards/chosen": 0.19681191444396973, "rewards/margins": 0.585518193244934, "rewards/rejected": -0.38870627880096437, "step": 244 }, { "epoch": 0.012985980441523335, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9268054.666666666, "logits/rejected": -12567982.4, "logps/chosen": -163.26996866861978, "logps/rejected": -136.584228515625, "loss": 0.4726, "rewards/chosen": -0.06296692291895549, "rewards/margins": 0.151970907052358, "rewards/rejected": -0.21493782997131347, "step": 245 }, { "epoch": 0.013038984443325472, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17735848.0, "logits/rejected": -31334372.0, "logps/chosen": -284.6527099609375, "logps/rejected": -327.63226318359375, "loss": 0.4724, "rewards/chosen": -0.02417583577334881, "rewards/margins": 0.22617720253765583, "rewards/rejected": -0.25035303831100464, "step": 246 }, { "epoch": 0.013091988445127607, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57800136.0, "logits/rejected": -2897966.0, "logps/chosen": -778.3128051757812, "logps/rejected": -151.1055908203125, "loss": 0.4403, "rewards/chosen": 0.183247372508049, "rewards/margins": 0.48301972448825836, "rewards/rejected": -0.29977235198020935, "step": 247 }, { "epoch": 0.013144992446929743, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3665720.5, "logits/rejected": -67252042.66666667, "logps/chosen": -128.36822509765625, "logps/rejected": -558.2283528645834, "loss": 0.3976, "rewards/chosen": 0.05428295210003853, "rewards/margins": 0.6136614171167215, "rewards/rejected": -0.559378465016683, "step": 248 }, { "epoch": 0.01319799644873188, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1440581.3333333333, "logits/rejected": -542127.375, "logps/chosen": -238.26822916666666, "logps/rejected": -47.017066955566406, "loss": 0.4771, "rewards/chosen": 0.07991065581639607, "rewards/margins": 0.20773734649022418, "rewards/rejected": -0.12782669067382812, "step": 249 }, { "epoch": 0.013251000450534015, "grad_norm": 71.5, "kl": 0.02182769775390625, "learning_rate": 5e-07, "logits/chosen": -38308386.666666664, "logits/rejected": -44966056.0, "logps/chosen": -293.211181640625, "logps/rejected": -464.9405517578125, "loss": 0.5007, "rewards/chosen": -0.08103740215301514, "rewards/margins": 0.1603246033191681, "rewards/rejected": -0.24136200547218323, "step": 250 }, { "epoch": 0.01330400445233615, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43796932.0, "logits/rejected": -26814464.0, "logps/chosen": -217.37945556640625, "logps/rejected": -420.0369873046875, "loss": 0.443, "rewards/chosen": -0.0697721540927887, "rewards/margins": 0.49920853972435, "rewards/rejected": -0.5689806938171387, "step": 251 }, { "epoch": 0.013357008454138288, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8981420.8, "logits/rejected": -682394.3333333334, "logps/chosen": -266.830126953125, "logps/rejected": -479.2952067057292, "loss": 0.4662, "rewards/chosen": 0.023136258125305176, "rewards/margins": 0.3633588155110677, "rewards/rejected": -0.3402225573857625, "step": 252 }, { "epoch": 0.013410012455940423, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64702886.4, "logits/rejected": -265594.6666666667, "logps/chosen": -327.5450439453125, "logps/rejected": -457.8771158854167, "loss": 0.4481, "rewards/chosen": 0.02887992560863495, "rewards/margins": 0.5687631418307623, "rewards/rejected": -0.5398832162221273, "step": 253 }, { "epoch": 0.01346301645774256, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35345280.0, "logits/rejected": -27176485.333333332, "logps/chosen": -258.952392578125, "logps/rejected": -309.98695882161456, "loss": 0.4377, "rewards/chosen": -0.0699920728802681, "rewards/margins": 0.2941749816139539, "rewards/rejected": -0.364167054494222, "step": 254 }, { "epoch": 0.013516020459544696, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16279342.0, "logits/rejected": -24815293.333333332, "logps/chosen": -135.9296112060547, "logps/rejected": -285.7713216145833, "loss": 0.4233, "rewards/chosen": -0.012652203440666199, "rewards/margins": 0.4101097136735916, "rewards/rejected": -0.4227619171142578, "step": 255 }, { "epoch": 0.013569024461346831, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50923608.0, "logits/rejected": -50835928.0, "logps/chosen": -573.211181640625, "logps/rejected": -370.2464599609375, "loss": 0.421, "rewards/chosen": 0.09757985919713974, "rewards/margins": 0.6808011159300804, "rewards/rejected": -0.5832212567329407, "step": 256 }, { "epoch": 0.013622028463148968, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41249560.0, "logits/rejected": -29501668.0, "logps/chosen": -358.91278076171875, "logps/rejected": -318.47540283203125, "loss": 0.4395, "rewards/chosen": 0.10451850295066833, "rewards/margins": 0.4969533681869507, "rewards/rejected": -0.39243486523628235, "step": 257 }, { "epoch": 0.013675032464951104, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27871336.0, "logits/rejected": -37384390.4, "logps/chosen": -331.7604166666667, "logps/rejected": -377.889208984375, "loss": 0.4387, "rewards/chosen": 0.0003854110836982727, "rewards/margins": 0.4033743217587471, "rewards/rejected": -0.40298891067504883, "step": 258 }, { "epoch": 0.01372803646675324, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9019840.0, "logits/rejected": -18863142.0, "logps/chosen": -173.70657348632812, "logps/rejected": -423.1736145019531, "loss": 0.4392, "rewards/chosen": 0.010538198053836823, "rewards/margins": 0.5062924399971962, "rewards/rejected": -0.4957542419433594, "step": 259 }, { "epoch": 0.013781040468555376, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26262641.6, "logits/rejected": -48623205.333333336, "logps/chosen": -285.548828125, "logps/rejected": -378.1095784505208, "loss": 0.4604, "rewards/chosen": 0.010682372748851776, "rewards/margins": 0.43980609228213624, "rewards/rejected": -0.4291237195332845, "step": 260 }, { "epoch": 0.013834044470357512, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56814664.0, "logits/rejected": -33608144.0, "logps/chosen": -245.60179138183594, "logps/rejected": -392.8314615885417, "loss": 0.4259, "rewards/chosen": -0.06487731635570526, "rewards/margins": 0.36785927911599475, "rewards/rejected": -0.4327365954717, "step": 261 }, { "epoch": 0.013887048472159647, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32453157.333333332, "logits/rejected": 4507688.0, "logps/chosen": -283.19093831380206, "logps/rejected": -480.9465637207031, "loss": 0.4877, "rewards/chosen": -0.017523507277170818, "rewards/margins": 0.23334320386250815, "rewards/rejected": -0.25086671113967896, "step": 262 }, { "epoch": 0.013940052473961784, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24002433.6, "logits/rejected": -3226482.0, "logps/chosen": -476.395068359375, "logps/rejected": -112.917236328125, "loss": 0.4695, "rewards/chosen": 0.013011321425437927, "rewards/margins": 0.32032792270183563, "rewards/rejected": -0.3073166012763977, "step": 263 }, { "epoch": 0.01399305647576392, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1709499.2, "logits/rejected": -24099506.666666668, "logps/chosen": -226.5341796875, "logps/rejected": -402.1429443359375, "loss": 0.4745, "rewards/chosen": 0.016514015197753907, "rewards/margins": 0.2622804562250773, "rewards/rejected": -0.2457664410273234, "step": 264 }, { "epoch": 0.014046060477566057, "grad_norm": 87.0, "kl": 0.1568603515625, "learning_rate": 5e-07, "logits/chosen": -25338996.0, "logits/rejected": -37862888.0, "logps/chosen": -826.2267456054688, "logps/rejected": -344.515380859375, "loss": 0.4252, "rewards/chosen": 0.02278747782111168, "rewards/margins": 0.6767896674573421, "rewards/rejected": -0.6540021896362305, "step": 265 }, { "epoch": 0.014099064479368192, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10297160.8, "logits/rejected": -16872458.666666668, "logps/chosen": -355.299951171875, "logps/rejected": -341.911376953125, "loss": 0.4311, "rewards/chosen": 0.09580001831054688, "rewards/margins": 0.7203361749649048, "rewards/rejected": -0.6245361566543579, "step": 266 }, { "epoch": 0.014152068481170328, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12210472.8, "logits/rejected": -28424712.0, "logps/chosen": -254.72626953125, "logps/rejected": -246.9605916341146, "loss": 0.4589, "rewards/chosen": -0.05813514590263367, "rewards/margins": 0.49161005616188047, "rewards/rejected": -0.5497452020645142, "step": 267 }, { "epoch": 0.014205072482972465, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54820664.0, "logits/rejected": -44010616.0, "logps/chosen": -496.000732421875, "logps/rejected": -402.54193115234375, "loss": 0.4287, "rewards/chosen": 0.013754650950431824, "rewards/margins": 0.5944435447454453, "rewards/rejected": -0.5806888937950134, "step": 268 }, { "epoch": 0.0142580764847746, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13277560.0, "logits/rejected": 1204866.0, "logps/chosen": -150.65291341145834, "logps/rejected": -299.58062744140625, "loss": 0.4783, "rewards/chosen": -0.004296398411194484, "rewards/margins": 0.37328463171919185, "rewards/rejected": -0.37758103013038635, "step": 269 }, { "epoch": 0.014311080486576736, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21951838.0, "logits/rejected": -18778020.0, "logps/chosen": -229.2898712158203, "logps/rejected": -176.1447296142578, "loss": 0.4563, "rewards/chosen": 0.011684799566864967, "rewards/margins": 0.3539207521826029, "rewards/rejected": -0.3422359526157379, "step": 270 }, { "epoch": 0.014364084488378873, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49977046.4, "logits/rejected": -24828573.333333332, "logps/chosen": -300.0943603515625, "logps/rejected": -240.05659993489584, "loss": 0.4587, "rewards/chosen": 0.017740631103515626, "rewards/margins": 0.4417653322219849, "rewards/rejected": -0.42402470111846924, "step": 271 }, { "epoch": 0.014417088490181008, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23698694.0, "logits/rejected": -24435434.666666668, "logps/chosen": -357.4344787597656, "logps/rejected": -342.0044352213542, "loss": 0.4421, "rewards/chosen": -0.10913009941577911, "rewards/margins": 0.2451062649488449, "rewards/rejected": -0.354236364364624, "step": 272 }, { "epoch": 0.014470092491983146, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21854665.6, "logits/rejected": -16828397.333333332, "logps/chosen": -365.302197265625, "logps/rejected": -182.3519490559896, "loss": 0.4754, "rewards/chosen": 0.0069947801530361176, "rewards/margins": 0.2614289882282416, "rewards/rejected": -0.2544342080752055, "step": 273 }, { "epoch": 0.014523096493785281, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22191954.0, "logits/rejected": -9342896.0, "logps/chosen": -496.6129150390625, "logps/rejected": -168.66909790039062, "loss": 0.4794, "rewards/chosen": -0.09420166164636612, "rewards/margins": 0.16658630222082138, "rewards/rejected": -0.2607879638671875, "step": 274 }, { "epoch": 0.014576100495587416, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2451939.1428571427, "logits/rejected": -56132376.0, "logps/chosen": -65.06648908342633, "logps/rejected": -343.67449951171875, "loss": 0.4895, "rewards/chosen": -0.005446625607354301, "rewards/margins": 0.3737648001738957, "rewards/rejected": -0.37921142578125, "step": 275 }, { "epoch": 0.014629104497389554, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19378113.6, "logits/rejected": -18230438.666666668, "logps/chosen": -222.583837890625, "logps/rejected": -160.53589884440103, "loss": 0.4542, "rewards/chosen": 0.052214431762695315, "rewards/margins": 0.4613587935765584, "rewards/rejected": -0.4091443618138631, "step": 276 }, { "epoch": 0.014682108499191689, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 149762176.0, "logits/rejected": -30215258.666666668, "logps/chosen": -602.7608642578125, "logps/rejected": -232.67401123046875, "loss": 0.4414, "rewards/chosen": 0.0987091064453125, "rewards/margins": 0.38431787490844727, "rewards/rejected": -0.28560876846313477, "step": 277 }, { "epoch": 0.014735112500993824, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11531433.6, "logits/rejected": 3783734.3333333335, "logps/chosen": -273.430517578125, "logps/rejected": -250.39200846354166, "loss": 0.4604, "rewards/chosen": 0.009832113981246948, "rewards/margins": 0.4263288676738739, "rewards/rejected": -0.41649675369262695, "step": 278 }, { "epoch": 0.014788116502795962, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72468560.0, "logits/rejected": -28368761.6, "logps/chosen": -1248.02197265625, "logps/rejected": -315.12265625, "loss": 0.4394, "rewards/chosen": 0.19978535175323486, "rewards/margins": 0.47517826557159426, "rewards/rejected": -0.2753929138183594, "step": 279 }, { "epoch": 0.014841120504598097, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18361020.0, "logits/rejected": 14272820.0, "logps/chosen": -278.5992736816406, "logps/rejected": -493.49639892578125, "loss": 0.4319, "rewards/chosen": 0.008032798767089844, "rewards/margins": 0.562368631362915, "rewards/rejected": -0.5543358325958252, "step": 280 }, { "epoch": 0.014894124506400232, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41517140.0, "logits/rejected": -47594008.0, "logps/chosen": -150.50457763671875, "logps/rejected": -495.9222106933594, "loss": 0.4322, "rewards/chosen": 0.008515692315995693, "rewards/margins": 0.5572415953502059, "rewards/rejected": -0.5487259030342102, "step": 281 }, { "epoch": 0.01494712850820237, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20495409.333333332, "logits/rejected": -21981699.2, "logps/chosen": -252.5577392578125, "logps/rejected": -330.66015625, "loss": 0.4126, "rewards/chosen": 0.029043133060137432, "rewards/margins": 0.5886123011509578, "rewards/rejected": -0.5595691680908204, "step": 282 }, { "epoch": 0.015000132510004505, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18717078.0, "logits/rejected": -3803666.75, "logps/chosen": -164.943359375, "logps/rejected": -319.41656494140625, "loss": 0.4664, "rewards/chosen": 0.08836416900157928, "rewards/margins": 0.2697437256574631, "rewards/rejected": -0.1813795566558838, "step": 283 }, { "epoch": 0.015053136511806642, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12592256.0, "logits/rejected": -40909152.0, "logps/chosen": -147.1200714111328, "logps/rejected": -211.1006622314453, "loss": 0.4531, "rewards/chosen": -0.043477632105350494, "rewards/margins": 0.3950679823756218, "rewards/rejected": -0.4385456144809723, "step": 284 }, { "epoch": 0.015106140513608778, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32971098.666666668, "logits/rejected": -22130364.0, "logps/chosen": -145.66612752278647, "logps/rejected": -264.9101257324219, "loss": 0.4477, "rewards/chosen": 0.10155334075291951, "rewards/margins": 0.6500705679257711, "rewards/rejected": -0.5485172271728516, "step": 285 }, { "epoch": 0.015159144515410913, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -99503840.0, "logits/rejected": -33105702.4, "logps/chosen": -619.930908203125, "logps/rejected": -416.558154296875, "loss": 0.4245, "rewards/chosen": 0.046834309895833336, "rewards/margins": 0.5176740010579427, "rewards/rejected": -0.4708396911621094, "step": 286 }, { "epoch": 0.01521214851721305, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46494640.0, "logits/rejected": -13799438.857142856, "logps/chosen": -769.8773193359375, "logps/rejected": -197.86129324776786, "loss": 0.4578, "rewards/chosen": -0.06949462741613388, "rewards/margins": 0.13474480914218084, "rewards/rejected": -0.20423943655831472, "step": 287 }, { "epoch": 0.015265152519015186, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6572986.8, "logits/rejected": -35985733.333333336, "logps/chosen": -193.61524658203126, "logps/rejected": -290.99566650390625, "loss": 0.4582, "rewards/chosen": 0.001634746789932251, "rewards/margins": 0.49123608072598773, "rewards/rejected": -0.4896013339360555, "step": 288 }, { "epoch": 0.015318156520817321, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32387032.0, "logits/rejected": -5732176.5, "logps/chosen": -283.7329915364583, "logps/rejected": -235.1036834716797, "loss": 0.4586, "rewards/chosen": 0.06903492907683055, "rewards/margins": 0.5461982736984888, "rewards/rejected": -0.4771633446216583, "step": 289 }, { "epoch": 0.015371160522619458, "grad_norm": 81.0, "kl": 0.1058807373046875, "learning_rate": 5e-07, "logits/chosen": -55687408.0, "logits/rejected": -31207712.0, "logps/chosen": -469.1820068359375, "logps/rejected": -334.2898356119792, "loss": 0.4117, "rewards/chosen": 0.09795990586280823, "rewards/margins": 0.5608119467894237, "rewards/rejected": -0.4628520409266154, "step": 290 }, { "epoch": 0.015424164524421594, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46139750.4, "logits/rejected": -33758552.0, "logps/chosen": -307.2725341796875, "logps/rejected": -301.05348714192706, "loss": 0.4711, "rewards/chosen": -0.014427947998046874, "rewards/margins": 0.32360791365305586, "rewards/rejected": -0.3380358616511027, "step": 291 }, { "epoch": 0.01547716852622373, "grad_norm": 80.5, "kl": 0.2974700927734375, "learning_rate": 5e-07, "logits/chosen": -27716684.8, "logits/rejected": -3527415.3333333335, "logps/chosen": -659.1876953125, "logps/rejected": -141.73182169596353, "loss": 0.464, "rewards/chosen": 0.12016674280166625, "rewards/margins": 0.3992377956708272, "rewards/rejected": -0.27907105286916095, "step": 292 }, { "epoch": 0.015530172528025866, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9449289.6, "logits/rejected": -39112541.333333336, "logps/chosen": -98.42174072265625, "logps/rejected": -448.8753255208333, "loss": 0.4496, "rewards/chosen": 0.0012411490082740785, "rewards/margins": 0.5556529973944029, "rewards/rejected": -0.5544118483861288, "step": 293 }, { "epoch": 0.015583176529828002, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22524600.0, "logits/rejected": -37511908.0, "logps/chosen": -239.334228515625, "logps/rejected": -305.1379699707031, "loss": 0.4521, "rewards/chosen": -0.012230206280946732, "rewards/margins": 0.4094991497695446, "rewards/rejected": -0.42172935605049133, "step": 294 }, { "epoch": 0.01563618053163014, "grad_norm": 79.0, "kl": 0.38536834716796875, "learning_rate": 5e-07, "logits/chosen": -24995808.0, "logits/rejected": -42150458.666666664, "logps/chosen": -510.444921875, "logps/rejected": -589.4788004557291, "loss": 0.4222, "rewards/chosen": 0.14789073467254638, "rewards/margins": 0.7729975461959839, "rewards/rejected": -0.6251068115234375, "step": 295 }, { "epoch": 0.015689184533432272, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11785161.0, "logits/rejected": -16189058.666666666, "logps/chosen": -797.0465087890625, "logps/rejected": -252.79508463541666, "loss": 0.4232, "rewards/chosen": 0.17367401719093323, "rewards/margins": 0.5415506263573964, "rewards/rejected": -0.3678766091664632, "step": 296 }, { "epoch": 0.01574218853523441, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72114197.33333333, "logits/rejected": -1359304.8, "logps/chosen": -291.08465576171875, "logps/rejected": -365.585546875, "loss": 0.4439, "rewards/chosen": -0.009069502353668213, "rewards/margins": 0.36301096677780154, "rewards/rejected": -0.37208046913146975, "step": 297 }, { "epoch": 0.015795192537036547, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30802464.0, "logits/rejected": -38960848.0, "logps/chosen": -233.3116455078125, "logps/rejected": -582.2696533203125, "loss": 0.4191, "rewards/chosen": 0.043423935770988464, "rewards/margins": 0.6826251000165939, "rewards/rejected": -0.6392011642456055, "step": 298 }, { "epoch": 0.015848196538838684, "grad_norm": 81.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26770821.333333332, "logits/rejected": -29664912.0, "logps/chosen": -473.266845703125, "logps/rejected": -473.909619140625, "loss": 0.4302, "rewards/chosen": -0.15969340006510416, "rewards/margins": 0.426233990987142, "rewards/rejected": -0.5859273910522461, "step": 299 }, { "epoch": 0.015901200540640818, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42779013.333333336, "logits/rejected": 46898265.6, "logps/chosen": -196.5276082356771, "logps/rejected": -193.3817626953125, "loss": 0.4518, "rewards/chosen": 0.03140748292207718, "rewards/margins": 0.3307070568203926, "rewards/rejected": -0.2992995738983154, "step": 300 }, { "epoch": 0.015954204542442955, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43768650.666666664, "logits/rejected": -93813427.2, "logps/chosen": -278.4853922526042, "logps/rejected": -431.638427734375, "loss": 0.4002, "rewards/chosen": -0.06134275098641714, "rewards/margins": 0.6606156001488367, "rewards/rejected": -0.7219583511352539, "step": 301 }, { "epoch": 0.016007208544245092, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29510441.6, "logits/rejected": 5726576.0, "logps/chosen": -331.16201171875, "logps/rejected": -135.11383056640625, "loss": 0.4988, "rewards/chosen": -0.11089150905609131, "rewards/margins": 0.08666747013727823, "rewards/rejected": -0.19755897919336954, "step": 302 }, { "epoch": 0.016060212546047226, "grad_norm": 83.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40231664.0, "logits/rejected": -44813328.0, "logps/chosen": -161.38509114583334, "logps/rejected": -662.17646484375, "loss": 0.3718, "rewards/chosen": -0.03307406107584635, "rewards/margins": 0.8912204106648763, "rewards/rejected": -0.9242944717407227, "step": 303 }, { "epoch": 0.016113216547849363, "grad_norm": 72.0, "kl": 0.04175758361816406, "learning_rate": 5e-07, "logits/chosen": -14181386.666666666, "logits/rejected": 4025645.75, "logps/chosen": -210.79496256510416, "logps/rejected": -74.69347381591797, "loss": 0.4794, "rewards/chosen": 0.024692279597123463, "rewards/margins": 0.29914067437251407, "rewards/rejected": -0.2744483947753906, "step": 304 }, { "epoch": 0.0161662205496515, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17635600.0, "logits/rejected": -33304068.0, "logps/chosen": -209.4668426513672, "logps/rejected": -471.9109191894531, "loss": 0.4252, "rewards/chosen": -0.005862045101821423, "rewards/margins": 0.6466797115281224, "rewards/rejected": -0.6525417566299438, "step": 305 }, { "epoch": 0.016219224551453634, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55761368.0, "logits/rejected": -44568024.0, "logps/chosen": -350.9626159667969, "logps/rejected": -471.42291259765625, "loss": 0.4181, "rewards/chosen": 0.14389744400978088, "rewards/margins": 0.6730081737041473, "rewards/rejected": -0.5291107296943665, "step": 306 }, { "epoch": 0.01627222855325577, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29372796.0, "logits/rejected": -22131992.0, "logps/chosen": -231.60693359375, "logps/rejected": -101.65147908528645, "loss": 0.4508, "rewards/chosen": -0.11084633320569992, "rewards/margins": 0.1908196086684863, "rewards/rejected": -0.3016659418741862, "step": 307 }, { "epoch": 0.016325232555057908, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30298906.0, "logits/rejected": -9380482.0, "logps/chosen": -256.4494934082031, "logps/rejected": -317.107421875, "loss": 0.4481, "rewards/chosen": -0.0002716071903705597, "rewards/margins": 0.44737015292048454, "rewards/rejected": -0.4476417601108551, "step": 308 }, { "epoch": 0.01637823655686004, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17812496.0, "logits/rejected": -8047858.666666667, "logps/chosen": -149.3137451171875, "logps/rejected": -298.18467203776044, "loss": 0.4836, "rewards/chosen": -0.05833904147148132, "rewards/margins": 0.21794239481290179, "rewards/rejected": -0.2762814362843831, "step": 309 }, { "epoch": 0.01643124055866218, "grad_norm": 88.5, "kl": 0.22072601318359375, "learning_rate": 5e-07, "logits/chosen": -42894752.0, "logits/rejected": -44661834.666666664, "logps/chosen": -479.62666015625, "logps/rejected": -451.5084228515625, "loss": 0.4248, "rewards/chosen": 0.13626198768615722, "rewards/margins": 0.7325042565663655, "rewards/rejected": -0.5962422688802084, "step": 310 }, { "epoch": 0.016484244560464316, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42365112.0, "logits/rejected": -11543668.0, "logps/chosen": -365.5145263671875, "logps/rejected": -436.9675598144531, "loss": 0.4296, "rewards/chosen": 0.10586896538734436, "rewards/margins": 0.5859463214874268, "rewards/rejected": -0.4800773561000824, "step": 311 }, { "epoch": 0.01653724856226645, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34857226.666666664, "logits/rejected": -62104460.8, "logps/chosen": -321.7819010416667, "logps/rejected": -288.3716552734375, "loss": 0.4348, "rewards/chosen": -0.001257320245107015, "rewards/margins": 0.43008971611658736, "rewards/rejected": -0.43134703636169436, "step": 312 }, { "epoch": 0.016590252564068587, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11031312.0, "logits/rejected": -44498348.0, "logps/chosen": -328.8202427455357, "logps/rejected": -252.7349395751953, "loss": 0.4772, "rewards/chosen": 0.044689561639513285, "rewards/margins": 0.46994745305606295, "rewards/rejected": -0.4252578914165497, "step": 313 }, { "epoch": 0.016643256565870724, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24344728.0, "logits/rejected": -21791352.0, "logps/chosen": -211.3268839518229, "logps/rejected": -226.05185546875, "loss": 0.4204, "rewards/chosen": 0.1174774169921875, "rewards/margins": 0.5810863971710205, "rewards/rejected": -0.463608980178833, "step": 314 }, { "epoch": 0.016696260567672858, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24051963.2, "logits/rejected": -16930378.666666668, "logps/chosen": -151.2471435546875, "logps/rejected": -371.2225341796875, "loss": 0.4383, "rewards/chosen": 0.019984132051467894, "rewards/margins": 0.6844553212324778, "rewards/rejected": -0.6644711891810099, "step": 315 }, { "epoch": 0.016749264569474995, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35068438.85714286, "logits/rejected": -3929744.75, "logps/chosen": -236.44667271205358, "logps/rejected": -137.1845703125, "loss": 0.4826, "rewards/chosen": 0.05788977657045637, "rewards/margins": 0.21082404468740734, "rewards/rejected": -0.152934268116951, "step": 316 }, { "epoch": 0.016802268571277132, "grad_norm": 65.0, "kl": 0.0827646255493164, "learning_rate": 5e-07, "logits/chosen": 229680.5, "logits/rejected": -44255120.0, "logps/chosen": -114.66843668619792, "logps/rejected": -545.6293334960938, "loss": 0.4575, "rewards/chosen": 0.00906985749801, "rewards/margins": 0.7275772665937742, "rewards/rejected": -0.7185074090957642, "step": 317 }, { "epoch": 0.01685527257307927, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35772520.0, "logits/rejected": -5270846.5, "logps/chosen": -360.0618082682292, "logps/rejected": -49.31640625, "loss": 0.502, "rewards/chosen": -0.021774580081303913, "rewards/margins": 0.010505578170220058, "rewards/rejected": -0.03228015825152397, "step": 318 }, { "epoch": 0.016908276574881403, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11810470.666666666, "logits/rejected": -5935400.0, "logps/chosen": -140.55207316080728, "logps/rejected": -99.1527587890625, "loss": 0.4647, "rewards/chosen": -0.01967074101169904, "rewards/margins": 0.22078703219691911, "rewards/rejected": -0.24045777320861816, "step": 319 }, { "epoch": 0.01696128057668354, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39221798.4, "logits/rejected": -11442009.333333334, "logps/chosen": -572.83662109375, "logps/rejected": -78.66781616210938, "loss": 0.4663, "rewards/chosen": 0.10429047346115113, "rewards/margins": 0.2947511951128642, "rewards/rejected": -0.19046072165171304, "step": 320 }, { "epoch": 0.017014284578485677, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45787896.0, "logits/rejected": 12390154.0, "logps/chosen": -125.80955505371094, "logps/rejected": -146.26025390625, "loss": 0.4727, "rewards/chosen": 0.014747712761163712, "rewards/margins": 0.21925098076462746, "rewards/rejected": -0.20450326800346375, "step": 321 }, { "epoch": 0.01706728858028781, "grad_norm": 81.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37262682.666666664, "logits/rejected": 167697056.0, "logps/chosen": -532.793212890625, "logps/rejected": -740.162109375, "loss": 0.4627, "rewards/chosen": -0.06523481508096059, "rewards/margins": 0.7759791860977808, "rewards/rejected": -0.8412140011787415, "step": 322 }, { "epoch": 0.017120292582089948, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16209852.0, "logits/rejected": -46870176.0, "logps/chosen": -110.94216918945312, "logps/rejected": -407.3418273925781, "loss": 0.4245, "rewards/chosen": -0.010661270469427109, "rewards/margins": 0.6357906796038151, "rewards/rejected": -0.6464519500732422, "step": 323 }, { "epoch": 0.017173296583892085, "grad_norm": 64.5, "kl": 0.03521728515625, "learning_rate": 5e-07, "logits/chosen": -80751600.0, "logits/rejected": -19534384.0, "logps/chosen": -253.91641235351562, "logps/rejected": -255.25442504882812, "loss": 0.4677, "rewards/chosen": 0.0006580352783203125, "rewards/margins": 0.2633412480354309, "rewards/rejected": -0.2626832127571106, "step": 324 }, { "epoch": 0.01722630058569422, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1339304.1666666667, "logits/rejected": 4501915.5, "logps/chosen": -262.689208984375, "logps/rejected": -203.14154052734375, "loss": 0.4934, "rewards/chosen": -0.017714375009139378, "rewards/margins": 0.1416186752418677, "rewards/rejected": -0.15933305025100708, "step": 325 }, { "epoch": 0.017279304587496356, "grad_norm": 52.0, "kl": 0.08033943176269531, "learning_rate": 5e-07, "logits/chosen": -17717270.0, "logits/rejected": -22567712.0, "logps/chosen": -105.25981903076172, "logps/rejected": -307.2941589355469, "loss": 0.4291, "rewards/chosen": 0.029785826802253723, "rewards/margins": 0.6369626671075821, "rewards/rejected": -0.6071768403053284, "step": 326 }, { "epoch": 0.017332308589298493, "grad_norm": 92.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45275504.0, "logits/rejected": -11994624.0, "logps/chosen": -712.8599243164062, "logps/rejected": -393.72344970703125, "loss": 0.4025, "rewards/chosen": 0.12609033286571503, "rewards/margins": 0.8250893801450729, "rewards/rejected": -0.6989990472793579, "step": 327 }, { "epoch": 0.017385312591100627, "grad_norm": 74.0, "kl": 0.020721435546875, "learning_rate": 5e-07, "logits/chosen": -22799789.333333332, "logits/rejected": -43299353.6, "logps/chosen": -405.1298421223958, "logps/rejected": -279.4724365234375, "loss": 0.4104, "rewards/chosen": 0.12854613860448202, "rewards/margins": 0.6473462065060934, "rewards/rejected": -0.5188000679016114, "step": 328 }, { "epoch": 0.017438316592902764, "grad_norm": 72.0, "kl": 0.2347583770751953, "learning_rate": 5e-07, "logits/chosen": -2523056.3333333335, "logits/rejected": -25816088.0, "logps/chosen": -458.9813639322917, "logps/rejected": -212.2200927734375, "loss": 0.437, "rewards/chosen": 0.0866166353225708, "rewards/margins": 0.45387542247772217, "rewards/rejected": -0.36725878715515137, "step": 329 }, { "epoch": 0.0174913205947049, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4280739.666666667, "logits/rejected": -5979434.4, "logps/chosen": -39.481091817220054, "logps/rejected": -631.075927734375, "loss": 0.3873, "rewards/chosen": -0.064712588985761, "rewards/margins": 0.824822265903155, "rewards/rejected": -0.889534854888916, "step": 330 }, { "epoch": 0.017544324596507035, "grad_norm": 54.25, "kl": 0.044281005859375, "learning_rate": 5e-07, "logits/chosen": -34251336.0, "logits/rejected": -26561204.0, "logps/chosen": -254.611083984375, "logps/rejected": -224.9617156982422, "loss": 0.4414, "rewards/chosen": 0.03368225321173668, "rewards/margins": 0.48069652542471886, "rewards/rejected": -0.4470142722129822, "step": 331 }, { "epoch": 0.017597328598309172, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34684160.0, "logits/rejected": -11912594.0, "logps/chosen": -251.96810913085938, "logps/rejected": -368.304931640625, "loss": 0.4373, "rewards/chosen": -0.05080890655517578, "rewards/margins": 0.5404789447784424, "rewards/rejected": -0.5912878513336182, "step": 332 }, { "epoch": 0.01765033260011131, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30914777.6, "logits/rejected": -15920941.333333334, "logps/chosen": -373.086181640625, "logps/rejected": -213.0046183268229, "loss": 0.4778, "rewards/chosen": -0.0603466808795929, "rewards/margins": 0.28541887005170186, "rewards/rejected": -0.34576555093129474, "step": 333 }, { "epoch": 0.017703336601913443, "grad_norm": 65.5, "kl": 0.018123626708984375, "learning_rate": 5e-07, "logits/chosen": -19511468.0, "logits/rejected": -32263635.2, "logps/chosen": -400.4478759765625, "logps/rejected": -182.9501220703125, "loss": 0.466, "rewards/chosen": -0.08556721607844035, "rewards/margins": 0.19141933520634968, "rewards/rejected": -0.27698655128479005, "step": 334 }, { "epoch": 0.01775634060371558, "grad_norm": 65.5, "kl": 0.2534141540527344, "learning_rate": 5e-07, "logits/chosen": -52341380.0, "logits/rejected": -8531404.0, "logps/chosen": -358.19158935546875, "logps/rejected": -251.5716094970703, "loss": 0.4288, "rewards/chosen": 0.06196317449212074, "rewards/margins": 0.6495942212641239, "rewards/rejected": -0.5876310467720032, "step": 335 }, { "epoch": 0.017809344605517717, "grad_norm": 80.5, "kl": 0.3271608352661133, "learning_rate": 5e-07, "logits/chosen": -22070236.8, "logits/rejected": -28712698.666666668, "logps/chosen": -391.305810546875, "logps/rejected": -686.8938802083334, "loss": 0.4343, "rewards/chosen": 0.09169247150421142, "rewards/margins": 0.768282437324524, "rewards/rejected": -0.6765899658203125, "step": 336 }, { "epoch": 0.017862348607319854, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23861277.333333332, "logits/rejected": -34235235.2, "logps/chosen": -363.3868815104167, "logps/rejected": -235.3373046875, "loss": 0.4097, "rewards/chosen": 0.014455159505208334, "rewards/margins": 0.6439982732137045, "rewards/rejected": -0.6295431137084961, "step": 337 }, { "epoch": 0.017915352609121988, "grad_norm": 113.5, "kl": 0.014375686645507812, "learning_rate": 5e-07, "logits/chosen": -19603157.333333332, "logits/rejected": 85505504.0, "logps/chosen": -234.1588134765625, "logps/rejected": -456.8072265625, "loss": 0.4196, "rewards/chosen": -0.18159600098927817, "rewards/margins": 0.49909061590830484, "rewards/rejected": -0.680686616897583, "step": 338 }, { "epoch": 0.017968356610924125, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34955160.0, "logits/rejected": 3743915.6, "logps/chosen": -369.5265299479167, "logps/rejected": -369.069140625, "loss": 0.4476, "rewards/chosen": -0.0373370498418808, "rewards/margins": 0.34049603044986726, "rewards/rejected": -0.37783308029174806, "step": 339 }, { "epoch": 0.018021360612726262, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39706851.2, "logits/rejected": -12925478.666666666, "logps/chosen": -395.7193603515625, "logps/rejected": -283.8338216145833, "loss": 0.4552, "rewards/chosen": -0.034234237670898435, "rewards/margins": 0.5202866872151694, "rewards/rejected": -0.5545209248860677, "step": 340 }, { "epoch": 0.018074364614528396, "grad_norm": 62.75, "kl": 0.03842735290527344, "learning_rate": 5e-07, "logits/chosen": -52917064.0, "logits/rejected": -15742312.0, "logps/chosen": -342.439453125, "logps/rejected": -95.38087463378906, "loss": 0.4685, "rewards/chosen": 0.06857413798570633, "rewards/margins": 0.25412408262491226, "rewards/rejected": -0.18554994463920593, "step": 341 }, { "epoch": 0.018127368616330533, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3275600.0, "logits/rejected": -13367302.666666666, "logps/chosen": -556.4046630859375, "logps/rejected": -190.3529256184896, "loss": 0.4213, "rewards/chosen": 0.23032836616039276, "rewards/margins": 0.5820973763863246, "rewards/rejected": -0.3517690102259318, "step": 342 }, { "epoch": 0.01818037261813267, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15552917.333333334, "logits/rejected": -26225830.4, "logps/chosen": -450.9894205729167, "logps/rejected": -259.6356201171875, "loss": 0.4111, "rewards/chosen": 0.09252727031707764, "rewards/margins": 0.6216080904006958, "rewards/rejected": -0.5290808200836181, "step": 343 }, { "epoch": 0.018233376619934804, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3636444.0, "logits/rejected": -35082076.8, "logps/chosen": -58.46277872721354, "logps/rejected": -473.597412109375, "loss": 0.4154, "rewards/chosen": -0.0622423787911733, "rewards/margins": 0.5529705385367075, "rewards/rejected": -0.6152129173278809, "step": 344 }, { "epoch": 0.01828638062173694, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5595756.0, "logits/rejected": -41733877.333333336, "logps/chosen": -37.8628044128418, "logps/rejected": -323.36118570963544, "loss": 0.3991, "rewards/chosen": -0.053656768053770065, "rewards/margins": 0.5301692175368468, "rewards/rejected": -0.5838259855906168, "step": 345 }, { "epoch": 0.018339384623539078, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2289410.8333333335, "logits/rejected": 39582372.0, "logps/chosen": -364.6496988932292, "logps/rejected": -144.85406494140625, "loss": 0.4967, "rewards/chosen": -0.025970615446567535, "rewards/margins": 0.10296129435300827, "rewards/rejected": -0.1289319097995758, "step": 346 }, { "epoch": 0.018392388625341212, "grad_norm": 67.5, "kl": 0.62030029296875, "learning_rate": 5e-07, "logits/chosen": -35543268.0, "logits/rejected": -56369520.0, "logps/chosen": -501.2886657714844, "logps/rejected": -245.59637451171875, "loss": 0.4581, "rewards/chosen": 0.07223339378833771, "rewards/margins": 0.4651598185300827, "rewards/rejected": -0.392926424741745, "step": 347 }, { "epoch": 0.01844539262714335, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1526802.75, "logits/rejected": -9326025.0, "logps/chosen": -64.91675567626953, "logps/rejected": -319.45355224609375, "loss": 0.4594, "rewards/chosen": 0.08201360702514648, "rewards/margins": 0.3270437717437744, "rewards/rejected": -0.24503016471862793, "step": 348 }, { "epoch": 0.018498396628945486, "grad_norm": 76.5, "kl": 0.02004241943359375, "learning_rate": 5e-07, "logits/chosen": -40261592.0, "logits/rejected": -41068820.0, "logps/chosen": -533.9874267578125, "logps/rejected": -321.6602783203125, "loss": 0.4516, "rewards/chosen": 0.026200359066327412, "rewards/margins": 0.787511353691419, "rewards/rejected": -0.7613109946250916, "step": 349 }, { "epoch": 0.01855140063074762, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29479610.666666668, "logits/rejected": -16240489.6, "logps/chosen": -297.51747639973956, "logps/rejected": -357.8765380859375, "loss": 0.4252, "rewards/chosen": -0.004216512044270833, "rewards/margins": 0.4903272787729899, "rewards/rejected": -0.49454379081726074, "step": 350 }, { "epoch": 0.018604404632549757, "grad_norm": 58.0, "kl": 0.19256973266601562, "learning_rate": 5e-07, "logits/chosen": 2056082.875, "logits/rejected": -8161508.0, "logps/chosen": -191.3944549560547, "logps/rejected": -167.36004638671875, "loss": 0.471, "rewards/chosen": -0.041132260113954544, "rewards/margins": 0.23460040614008904, "rewards/rejected": -0.2757326662540436, "step": 351 }, { "epoch": 0.018657408634351894, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54761600.0, "logits/rejected": -38782277.333333336, "logps/chosen": -444.250537109375, "logps/rejected": -368.0337320963542, "loss": 0.4472, "rewards/chosen": -0.058661806583404544, "rewards/margins": 0.746017030874888, "rewards/rejected": -0.8046788374582926, "step": 352 }, { "epoch": 0.018710412636154028, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56938073.6, "logits/rejected": -21747872.0, "logps/chosen": -243.5123291015625, "logps/rejected": -123.00437418619792, "loss": 0.4726, "rewards/chosen": 0.03036949336528778, "rewards/margins": 0.2763592332601547, "rewards/rejected": -0.24598973989486694, "step": 353 }, { "epoch": 0.018763416637956165, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25226786.0, "logits/rejected": 979680.375, "logps/chosen": -299.34735107421875, "logps/rejected": -162.35195922851562, "loss": 0.462, "rewards/chosen": -0.04295501857995987, "rewards/margins": 0.30924131721258163, "rewards/rejected": -0.3521963357925415, "step": 354 }, { "epoch": 0.018816420639758302, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53454378.666666664, "logits/rejected": -28186281.6, "logps/chosen": -344.2746175130208, "logps/rejected": -295.269384765625, "loss": 0.4089, "rewards/chosen": 0.16220295429229736, "rewards/margins": 0.6692878484725953, "rewards/rejected": -0.5070848941802979, "step": 355 }, { "epoch": 0.01886942464156044, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91015769.6, "logits/rejected": -43111178.666666664, "logps/chosen": -169.5058837890625, "logps/rejected": -718.4611002604166, "loss": 0.4274, "rewards/chosen": -0.019625018537044524, "rewards/margins": 0.8432920868198076, "rewards/rejected": -0.8629171053568522, "step": 356 }, { "epoch": 0.018922428643362573, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77241792.0, "logits/rejected": -17581793.6, "logps/chosen": -288.48712158203125, "logps/rejected": -210.18212890625, "loss": 0.4471, "rewards/chosen": -0.03195953369140625, "rewards/margins": 0.3359438180923462, "rewards/rejected": -0.36790335178375244, "step": 357 }, { "epoch": 0.01897543264516471, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84105638.4, "logits/rejected": -4317116.666666667, "logps/chosen": -252.437841796875, "logps/rejected": -199.81778971354166, "loss": 0.4554, "rewards/chosen": -0.005957642197608947, "rewards/margins": 0.5049897839625677, "rewards/rejected": -0.5109474261601766, "step": 358 }, { "epoch": 0.019028436646966847, "grad_norm": 91.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34479536.0, "logits/rejected": -2304460.0, "logps/chosen": -376.53533935546875, "logps/rejected": -475.9085693359375, "loss": 0.3843, "rewards/chosen": 0.20842896401882172, "rewards/margins": 0.8062561204036077, "rewards/rejected": -0.597827156384786, "step": 359 }, { "epoch": 0.01908144064876898, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18059470.4, "logits/rejected": -73391232.0, "logps/chosen": -274.4770263671875, "logps/rejected": -484.593505859375, "loss": 0.4275, "rewards/chosen": 0.03239635527133942, "rewards/margins": 0.7984068781137467, "rewards/rejected": -0.7660105228424072, "step": 360 }, { "epoch": 0.019134444650571118, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10294434.0, "logps/chosen": -222.625732421875, "loss": 0.5119, "rewards/chosen": -0.04756560176610947, "step": 361 }, { "epoch": 0.019187448652373255, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3984858.0, "logits/rejected": -16505024.0, "logps/chosen": -178.5634562174479, "logps/rejected": -174.1970703125, "loss": 0.434, "rewards/chosen": 0.1079694132010142, "rewards/margins": 0.47220548192660017, "rewards/rejected": -0.36423606872558595, "step": 362 }, { "epoch": 0.01924045265417539, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28640701.333333332, "logits/rejected": -107646624.0, "logps/chosen": -314.54323323567706, "logps/rejected": -403.0855712890625, "loss": 0.4601, "rewards/chosen": -0.031162006159623463, "rewards/margins": 0.7604361350337664, "rewards/rejected": -0.7915981411933899, "step": 363 }, { "epoch": 0.019293456655977526, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3139033.0, "logits/rejected": -48626688.0, "logps/chosen": -133.3601837158203, "logps/rejected": -381.8675130208333, "loss": 0.4053, "rewards/chosen": -0.06195030361413956, "rewards/margins": 0.4909677331646284, "rewards/rejected": -0.5529180367787679, "step": 364 }, { "epoch": 0.019346460657779663, "grad_norm": 61.75, "kl": 0.2915458679199219, "learning_rate": 5e-07, "logits/chosen": -20759644.0, "logits/rejected": 3537990.0, "logps/chosen": -196.59442138671875, "logps/rejected": -265.37762451171875, "loss": 0.4487, "rewards/chosen": 0.09319153428077698, "rewards/margins": 0.41943562030792236, "rewards/rejected": -0.3262440860271454, "step": 365 }, { "epoch": 0.019399464659581797, "grad_norm": 81.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26301642.666666668, "logits/rejected": -27205440.0, "logps/chosen": -525.2674153645834, "logps/rejected": -381.10675048828125, "loss": 0.4441, "rewards/chosen": 0.11644439895947774, "rewards/margins": 0.7091388603051504, "rewards/rejected": -0.5926944613456726, "step": 366 }, { "epoch": 0.019452468661383934, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7879473.5, "logits/rejected": -71909264.0, "logps/chosen": -107.2363510131836, "logps/rejected": -327.97039794921875, "loss": 0.4175, "rewards/chosen": 0.040384817868471146, "rewards/margins": 0.7008442170917988, "rewards/rejected": -0.6604593992233276, "step": 367 }, { "epoch": 0.01950547266318607, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39062616.0, "logits/rejected": -18818298.0, "logps/chosen": -318.74725341796875, "logps/rejected": -134.37356567382812, "loss": 0.4672, "rewards/chosen": -0.0005099298432469368, "rewards/margins": 0.26583491545170546, "rewards/rejected": -0.2663448452949524, "step": 368 }, { "epoch": 0.019558476664988205, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -64705668.0, "logps/rejected": -418.1453857421875, "loss": 0.3231, "rewards/rejected": -0.7578790187835693, "step": 369 }, { "epoch": 0.019611480666790342, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10356847.333333334, "logits/rejected": -23622035.2, "logps/chosen": -271.0858561197917, "logps/rejected": -174.4809326171875, "loss": 0.4296, "rewards/chosen": -0.10287029544512431, "rewards/margins": 0.42205250461896254, "rewards/rejected": -0.5249228000640869, "step": 370 }, { "epoch": 0.01966448466859248, "grad_norm": 77.5, "kl": 0.028240203857421875, "learning_rate": 5e-07, "logits/chosen": -42120568.0, "logits/rejected": -85112872.0, "logps/chosen": -499.8033447265625, "logps/rejected": -257.3038330078125, "loss": 0.4489, "rewards/chosen": 0.005297851748764515, "rewards/margins": 0.4193019988015294, "rewards/rejected": -0.4140041470527649, "step": 371 }, { "epoch": 0.019717488670394613, "grad_norm": 90.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52430344.0, "logits/rejected": -10375277.333333334, "logps/chosen": -466.3455505371094, "logps/rejected": -467.9132486979167, "loss": 0.3566, "rewards/chosen": 0.24803009629249573, "rewards/margins": 0.9901242951552073, "rewards/rejected": -0.7420941988627116, "step": 372 }, { "epoch": 0.01977049267219675, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50559924.0, "logits/rejected": -10154847.0, "logps/chosen": -447.54571533203125, "logps/rejected": -155.61375427246094, "loss": 0.4425, "rewards/chosen": 0.06935768574476242, "rewards/margins": 0.5050451382994652, "rewards/rejected": -0.43568745255470276, "step": 373 }, { "epoch": 0.019823496673998887, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -131200888.0, "logits/rejected": -25603339.42857143, "logps/chosen": -486.6409912109375, "logps/rejected": -424.1015625, "loss": 0.3545, "rewards/chosen": -0.17214356362819672, "rewards/margins": 0.5716063273804528, "rewards/rejected": -0.7437498910086495, "step": 374 }, { "epoch": 0.019876500675801025, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24968349.333333332, "logits/rejected": 13148275.2, "logps/chosen": -110.76971435546875, "logps/rejected": -337.4902099609375, "loss": 0.4494, "rewards/chosen": -0.012884584565957388, "rewards/margins": 0.33006802449623746, "rewards/rejected": -0.3429526090621948, "step": 375 }, { "epoch": 0.01992950467760316, "grad_norm": 83.5, "kl": 0.6249942779541016, "learning_rate": 5e-07, "logits/chosen": -62085536.0, "logits/rejected": -18753960.0, "logps/chosen": -654.0655517578125, "logps/rejected": -201.16690063476562, "loss": 0.4229, "rewards/chosen": 0.1967211365699768, "rewards/margins": 0.6407715678215027, "rewards/rejected": -0.4440504312515259, "step": 376 }, { "epoch": 0.019982508679405295, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5274590.0, "logits/rejected": -10960161.333333334, "logps/chosen": -202.7703857421875, "logps/rejected": -269.2037760416667, "loss": 0.3847, "rewards/chosen": -0.12880706787109375, "rewards/margins": 0.6059579849243164, "rewards/rejected": -0.7347650527954102, "step": 377 }, { "epoch": 0.020035512681207433, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10345746.0, "logits/rejected": -24090026.0, "logps/chosen": -68.57942199707031, "logps/rejected": -289.7344970703125, "loss": 0.4491, "rewards/chosen": -0.027217673137784004, "rewards/margins": 0.42625446431338787, "rewards/rejected": -0.4534721374511719, "step": 378 }, { "epoch": 0.020088516683009566, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26731776.0, "logits/rejected": -82804928.0, "logps/chosen": -289.4783203125, "logps/rejected": -620.4586588541666, "loss": 0.4243, "rewards/chosen": 0.04752475917339325, "rewards/margins": 0.8328905791044235, "rewards/rejected": -0.7853658199310303, "step": 379 }, { "epoch": 0.020141520684811703, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18959125.333333332, "logits/rejected": -31828320.0, "logps/chosen": -143.31939697265625, "logps/rejected": -548.422265625, "loss": 0.3933, "rewards/chosen": 0.041215007503827415, "rewards/margins": 0.7474432637294134, "rewards/rejected": -0.7062282562255859, "step": 380 }, { "epoch": 0.02019452468661384, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51558692.0, "logits/rejected": -16208697.0, "logps/chosen": -374.8773193359375, "logps/rejected": -181.70631408691406, "loss": 0.4361, "rewards/chosen": 0.0698772445321083, "rewards/margins": 0.5283055678009987, "rewards/rejected": -0.4584283232688904, "step": 381 }, { "epoch": 0.020247528688415974, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75139910.4, "logits/rejected": -38245677.333333336, "logps/chosen": -149.8309814453125, "logps/rejected": -241.76568603515625, "loss": 0.4718, "rewards/chosen": -0.07200260162353515, "rewards/margins": 0.3754549105962117, "rewards/rejected": -0.4474575122197469, "step": 382 }, { "epoch": 0.02030053269021811, "grad_norm": 58.75, "kl": 0.01817798614501953, "learning_rate": 5e-07, "logits/chosen": -13727493.714285715, "logits/rejected": -5025567.5, "logps/chosen": -140.41321672712053, "logps/rejected": -509.373291015625, "loss": 0.4628, "rewards/chosen": 0.0724812660898481, "rewards/margins": 0.8030415688242231, "rewards/rejected": -0.730560302734375, "step": 383 }, { "epoch": 0.02035353669202025, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14876750.0, "logits/rejected": -359196.65625, "logps/chosen": -279.6886901855469, "logps/rejected": -65.86422729492188, "loss": 0.4724, "rewards/chosen": 0.026459932327270508, "rewards/margins": 0.22214867174625397, "rewards/rejected": -0.19568873941898346, "step": 384 }, { "epoch": 0.020406540693822382, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25188816.0, "logits/rejected": -16873224.0, "logps/chosen": -310.3221130371094, "logps/rejected": -206.78369140625, "loss": 0.4631, "rewards/chosen": 0.14151859283447266, "rewards/margins": 0.2988584339618683, "rewards/rejected": -0.15733984112739563, "step": 385 }, { "epoch": 0.02045954469562452, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 960265.6, "logits/rejected": -24361970.666666668, "logps/chosen": -291.365087890625, "logps/rejected": -400.2229410807292, "loss": 0.4235, "rewards/chosen": 0.02271369993686676, "rewards/margins": 0.9164008686939875, "rewards/rejected": -0.8936871687571207, "step": 386 }, { "epoch": 0.020512548697426657, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19677646.4, "logits/rejected": -20716056.0, "logps/chosen": -271.15380859375, "logps/rejected": -243.77762858072916, "loss": 0.4692, "rewards/chosen": -0.1350764513015747, "rewards/margins": 0.46816769440968836, "rewards/rejected": -0.6032441457112631, "step": 387 }, { "epoch": 0.02056555269922879, "grad_norm": 96.0, "kl": 0.16085433959960938, "learning_rate": 5e-07, "logits/chosen": -31198076.0, "logits/rejected": -9548788.0, "logps/chosen": -568.156982421875, "logps/rejected": -435.689697265625, "loss": 0.3987, "rewards/chosen": 0.14538878202438354, "rewards/margins": 0.8624809384346008, "rewards/rejected": -0.7170921564102173, "step": 388 }, { "epoch": 0.020618556701030927, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11531476.0, "logits/rejected": -3615587.75, "logps/chosen": -300.837646484375, "logps/rejected": -94.19288635253906, "loss": 0.455, "rewards/chosen": -0.019327161833643913, "rewards/margins": 0.3668459076434374, "rewards/rejected": -0.3861730694770813, "step": 389 }, { "epoch": 0.020671560702833065, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12036202.285714285, "logits/rejected": -39568748.0, "logps/chosen": -345.21250697544644, "logps/rejected": -351.1572570800781, "loss": 0.4784, "rewards/chosen": 0.05417905535016741, "rewards/margins": 0.36973996247564045, "rewards/rejected": -0.315560907125473, "step": 390 }, { "epoch": 0.0207245647046352, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17099088.0, "logits/rejected": -25323413.333333332, "logps/chosen": -289.7099365234375, "logps/rejected": -295.4734700520833, "loss": 0.4277, "rewards/chosen": 0.03679592311382294, "rewards/margins": 0.8609338492155075, "rewards/rejected": -0.8241379261016846, "step": 391 }, { "epoch": 0.020777568706437335, "grad_norm": 85.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38732464.0, "logits/rejected": 49804448.0, "logps/chosen": -598.7952473958334, "logps/rejected": -303.01318359375, "loss": 0.3934, "rewards/chosen": -0.042884066700935364, "rewards/margins": 0.7316930741071701, "rewards/rejected": -0.7745771408081055, "step": 392 }, { "epoch": 0.020830572708239473, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38940106.666666664, "logits/rejected": -20325324.8, "logps/chosen": -519.5928548177084, "logps/rejected": -229.6139404296875, "loss": 0.4124, "rewards/chosen": 0.13494110107421875, "rewards/margins": 0.6391493320465088, "rewards/rejected": -0.5042082309722901, "step": 393 }, { "epoch": 0.02088357671004161, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2247668.6666666665, "logits/rejected": -7077576.8, "logps/chosen": -95.5233662923177, "logps/rejected": -276.30439453125, "loss": 0.3885, "rewards/chosen": 0.06778430938720703, "rewards/margins": 0.7767674446105957, "rewards/rejected": -0.7089831352233886, "step": 394 }, { "epoch": 0.020936580711843743, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17968970.666666668, "logits/rejected": -34569004.8, "logps/chosen": -298.87526448567706, "logps/rejected": -401.3975830078125, "loss": 0.369, "rewards/chosen": 0.11745471755663554, "rewards/margins": 0.967422483364741, "rewards/rejected": -0.8499677658081055, "step": 395 }, { "epoch": 0.02098958471364588, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24294638.0, "logits/rejected": -21321622.0, "logps/chosen": -188.80848693847656, "logps/rejected": -197.06723022460938, "loss": 0.471, "rewards/chosen": -0.14254407584667206, "rewards/margins": 0.24890576303005219, "rewards/rejected": -0.39144983887672424, "step": 396 }, { "epoch": 0.021042588715448018, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19673296.0, "logits/rejected": -6787927.5, "logps/chosen": -225.03167724609375, "logps/rejected": -82.27018737792969, "loss": 0.4809, "rewards/chosen": -0.057901959866285324, "rewards/margins": 0.15439261868596077, "rewards/rejected": -0.2122945785522461, "step": 397 }, { "epoch": 0.02109559271725015, "grad_norm": 60.0, "kl": 0.03112030029296875, "learning_rate": 5e-07, "logits/chosen": -4103291.0, "logits/rejected": -6653146.0, "logps/chosen": -279.574462890625, "logps/rejected": -191.78211975097656, "loss": 0.4248, "rewards/chosen": 0.0037834178656339645, "rewards/margins": 0.6299510728567839, "rewards/rejected": -0.6261676549911499, "step": 398 }, { "epoch": 0.02114859671905229, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7817314.0, "logits/rejected": -37418778.666666664, "logps/chosen": -80.37163543701172, "logps/rejected": -371.1195068359375, "loss": 0.3637, "rewards/chosen": 0.024605751037597656, "rewards/margins": 0.8012504577636719, "rewards/rejected": -0.7766447067260742, "step": 399 }, { "epoch": 0.021201600720854426, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6192934.0, "logits/rejected": -2490219.2, "logps/chosen": -47.43181864420573, "logps/rejected": -212.2238525390625, "loss": 0.4076, "rewards/chosen": 0.18975067138671875, "rewards/margins": 0.6863822937011719, "rewards/rejected": -0.4966316223144531, "step": 400 }, { "epoch": 0.02125460472265656, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -973905.875, "logits/rejected": -14764161.0, "logps/chosen": -278.78961181640625, "logps/rejected": -314.5852355957031, "loss": 0.4335, "rewards/chosen": 0.04402885586023331, "rewards/margins": 0.5662634149193764, "rewards/rejected": -0.5222345590591431, "step": 401 }, { "epoch": 0.021307608724458697, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28374000.0, "logits/rejected": -26911456.0, "logps/chosen": -177.44148763020834, "logps/rejected": -211.70986328125, "loss": 0.4424, "rewards/chosen": -0.08265253901481628, "rewards/margins": 0.35374091267585756, "rewards/rejected": -0.43639345169067384, "step": 402 }, { "epoch": 0.021360612726260834, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41188232.0, "logits/rejected": -6489727.333333333, "logps/chosen": -610.5874633789062, "logps/rejected": -301.0443929036458, "loss": 0.3983, "rewards/chosen": 0.01578369364142418, "rewards/margins": 0.5958133799334367, "rewards/rejected": -0.5800296862920126, "step": 403 }, { "epoch": 0.021413616728062967, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42527176.0, "logits/rejected": -79891512.0, "logps/chosen": -225.17617797851562, "logps/rejected": -383.6042785644531, "loss": 0.3994, "rewards/chosen": 0.10818805545568466, "rewards/margins": 0.8575666770339012, "rewards/rejected": -0.7493786215782166, "step": 404 }, { "epoch": 0.021466620729865105, "grad_norm": 69.0, "kl": 0.032840728759765625, "learning_rate": 5e-07, "logits/chosen": -4114084.0, "logits/rejected": -28036268.0, "logps/chosen": -397.7936604817708, "logps/rejected": -550.5650024414062, "loss": 0.4189, "rewards/chosen": 0.07236941655476888, "rewards/margins": 1.3228729565938313, "rewards/rejected": -1.2505035400390625, "step": 405 }, { "epoch": 0.021519624731667242, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6364317.5, "logits/rejected": -33727114.666666664, "logps/chosen": -63.349124908447266, "logps/rejected": -290.42624918619794, "loss": 0.4028, "rewards/chosen": -0.06589203327894211, "rewards/margins": 0.5085819040735563, "rewards/rejected": -0.5744739373524984, "step": 406 }, { "epoch": 0.021572628733469375, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13173482.0, "logits/rejected": -17129910.666666668, "logps/chosen": -151.9967041015625, "logps/rejected": -250.8013916015625, "loss": 0.4257, "rewards/chosen": -0.05990028381347656, "rewards/margins": 0.37482790152231854, "rewards/rejected": -0.4347281853357951, "step": 407 }, { "epoch": 0.021625632735271513, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10076497.6, "logits/rejected": -15693513.333333334, "logps/chosen": -182.226025390625, "logps/rejected": -399.7076416015625, "loss": 0.4333, "rewards/chosen": -0.02379951477050781, "rewards/margins": 0.8069734414418539, "rewards/rejected": -0.8307729562123617, "step": 408 }, { "epoch": 0.02167863673707365, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45812611.2, "logits/rejected": -21665921.333333332, "logps/chosen": -482.29072265625, "logps/rejected": -100.13185628255208, "loss": 0.469, "rewards/chosen": -0.037218934297561644, "rewards/margins": 0.3672459463278453, "rewards/rejected": -0.4044648806254069, "step": 409 }, { "epoch": 0.021731640738875783, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28836892.0, "logps/chosen": -230.21971130371094, "loss": 0.4922, "rewards/chosen": 0.031534574925899506, "step": 410 }, { "epoch": 0.02178464474067792, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53499241.6, "logits/rejected": -20532365.333333332, "logps/chosen": -379.992529296875, "logps/rejected": -495.8313395182292, "loss": 0.4456, "rewards/chosen": 0.0012478642165660858, "rewards/margins": 0.6615696561833223, "rewards/rejected": -0.6603217919667562, "step": 411 }, { "epoch": 0.021837648742480058, "grad_norm": 95.5, "kl": 0.146942138671875, "learning_rate": 5e-07, "logits/chosen": -3934529.0, "logits/rejected": -40289731.2, "logps/chosen": -720.50537109375, "logps/rejected": -380.7026611328125, "loss": 0.383, "rewards/chosen": 0.05064798891544342, "rewards/margins": 0.8371876388788223, "rewards/rejected": -0.7865396499633789, "step": 412 }, { "epoch": 0.021890652744282195, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1310112.0, "logits/rejected": -20538931.2, "logps/chosen": -387.2154134114583, "logps/rejected": -321.22919921875, "loss": 0.425, "rewards/chosen": -0.06802966197331746, "rewards/margins": 0.5007226745287577, "rewards/rejected": -0.5687523365020752, "step": 413 }, { "epoch": 0.02194365674608433, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41977760.0, "logits/rejected": -10605374.0, "logps/chosen": -257.8391520182292, "logps/rejected": -174.18460083007812, "loss": 0.4782, "rewards/chosen": -0.07107404867808025, "rewards/margins": 0.5328287680943807, "rewards/rejected": -0.6039028167724609, "step": 414 }, { "epoch": 0.021996660747886466, "grad_norm": 71.5, "kl": 0.3879547119140625, "learning_rate": 5e-07, "logits/chosen": -44735616.0, "logits/rejected": -84637424.0, "logps/chosen": -324.92626953125, "logps/rejected": -554.1691284179688, "loss": 0.4028, "rewards/chosen": 0.11632709205150604, "rewards/margins": 0.9013303965330124, "rewards/rejected": -0.7850033044815063, "step": 415 }, { "epoch": 0.022049664749688603, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39467044.0, "logits/rejected": -10037637.0, "logps/chosen": -367.2538757324219, "logps/rejected": -184.92080688476562, "loss": 0.4493, "rewards/chosen": -0.04437980800867081, "rewards/margins": 0.41922225803136826, "rewards/rejected": -0.46360206604003906, "step": 416 }, { "epoch": 0.022102668751490737, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41402288.0, "logits/rejected": -43558186.666666664, "logps/chosen": -239.527685546875, "logps/rejected": -206.47943115234375, "loss": 0.4345, "rewards/chosen": 0.14327514171600342, "rewards/margins": 0.6170681715011597, "rewards/rejected": -0.47379302978515625, "step": 417 }, { "epoch": 0.022155672753292874, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17808153.6, "logits/rejected": -826470.6666666666, "logps/chosen": -352.37275390625, "logps/rejected": -127.57271321614583, "loss": 0.4744, "rewards/chosen": 0.017140315473079683, "rewards/margins": 0.2657505139708519, "rewards/rejected": -0.24861019849777222, "step": 418 }, { "epoch": 0.02220867675509501, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25565686.0, "logits/rejected": -12730822.0, "logps/chosen": -335.50933837890625, "logps/rejected": -483.1513671875, "loss": 0.423, "rewards/chosen": -0.23401838541030884, "rewards/margins": 0.706788182258606, "rewards/rejected": -0.9408065676689148, "step": 419 }, { "epoch": 0.022261680756897145, "grad_norm": 92.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86826933.33333333, "logits/rejected": -11189172.0, "logps/chosen": -491.4039713541667, "logps/rejected": -286.23992919921875, "loss": 0.4617, "rewards/chosen": 0.009617935866117477, "rewards/margins": 0.6109225861728191, "rewards/rejected": -0.6013046503067017, "step": 420 }, { "epoch": 0.022314684758699282, "grad_norm": 48.0, "kl": 0.060314178466796875, "learning_rate": 5e-07, "logits/chosen": -18827196.0, "logits/rejected": -17822924.8, "logps/chosen": -256.21250406901044, "logps/rejected": -113.618408203125, "loss": 0.4523, "rewards/chosen": -0.019370269030332565, "rewards/margins": 0.3141986854374409, "rewards/rejected": -0.33356895446777346, "step": 421 }, { "epoch": 0.02236768876050142, "grad_norm": 78.5, "kl": 0.28844451904296875, "learning_rate": 5e-07, "logits/chosen": -32311286.4, "logits/rejected": -47690773.333333336, "logps/chosen": -570.103173828125, "logps/rejected": -204.77909342447916, "loss": 0.4442, "rewards/chosen": 0.09263489246368409, "rewards/margins": 0.6329409837722778, "rewards/rejected": -0.5403060913085938, "step": 422 }, { "epoch": 0.022420692762303553, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12874522.666666666, "logits/rejected": -25224084.8, "logps/chosen": -128.2171834309896, "logps/rejected": -201.83294677734375, "loss": 0.4091, "rewards/chosen": -0.015941111991802853, "rewards/margins": 0.5996646237870058, "rewards/rejected": -0.6156057357788086, "step": 423 }, { "epoch": 0.02247369676410569, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30407760.0, "logits/rejected": 4535120.666666667, "logps/chosen": -231.31630859375, "logps/rejected": -424.3621012369792, "loss": 0.4473, "rewards/chosen": -0.1146356225013733, "rewards/margins": 0.6995588978131613, "rewards/rejected": -0.8141945203145345, "step": 424 }, { "epoch": 0.022526700765907827, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7534450.0, "logits/rejected": -16635844.0, "logps/chosen": -474.71502685546875, "logps/rejected": -380.7734069824219, "loss": 0.4323, "rewards/chosen": -0.027787208557128906, "rewards/margins": 0.5616574883460999, "rewards/rejected": -0.5894446969032288, "step": 425 }, { "epoch": 0.02257970476770996, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7606436.0, "logits/rejected": -4086686.0, "logps/chosen": -221.87544759114584, "logps/rejected": -269.54400634765625, "loss": 0.4513, "rewards/chosen": 0.00026225919524828595, "rewards/margins": 0.8726637351016203, "rewards/rejected": -0.8724014759063721, "step": 426 }, { "epoch": 0.022632708769512098, "grad_norm": 79.0, "kl": 0.7691078186035156, "learning_rate": 5e-07, "logits/chosen": -33942715.428571425, "logits/rejected": -66934.0625, "logps/chosen": -431.828369140625, "logps/rejected": -161.18487548828125, "loss": 0.4816, "rewards/chosen": 0.13139672790254867, "rewards/margins": 0.42946498308862957, "rewards/rejected": -0.29806825518608093, "step": 427 }, { "epoch": 0.022685712771314235, "grad_norm": 75.5, "kl": 0.1919097900390625, "learning_rate": 5e-07, "logits/chosen": -43530812.0, "logits/rejected": -53873128.0, "logps/chosen": -571.67333984375, "logps/rejected": -347.0289001464844, "loss": 0.4094, "rewards/chosen": 0.16817094385623932, "rewards/margins": 0.7682297378778458, "rewards/rejected": -0.6000587940216064, "step": 428 }, { "epoch": 0.02273871677311637, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18458286.666666668, "logits/rejected": -27981424.0, "logps/chosen": -196.31439208984375, "logps/rejected": -297.0066650390625, "loss": 0.3985, "rewards/chosen": -0.19288776318232217, "rewards/margins": 0.6494279106458029, "rewards/rejected": -0.842315673828125, "step": 429 }, { "epoch": 0.022791720774918506, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37294328.0, "logits/rejected": -5585015.0, "logps/chosen": -404.26251220703125, "logps/rejected": -229.69827270507812, "loss": 0.4562, "rewards/chosen": 0.002262115478515625, "rewards/margins": 0.3599507212638855, "rewards/rejected": -0.3576886057853699, "step": 430 }, { "epoch": 0.022844724776720643, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11562297.333333334, "logits/rejected": -6815285.0, "logps/chosen": -304.9239908854167, "logps/rejected": -75.62306213378906, "loss": 0.4827, "rewards/chosen": 0.02373682955900828, "rewards/margins": 0.23357723156611124, "rewards/rejected": -0.20984040200710297, "step": 431 }, { "epoch": 0.02289772877852278, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89499368.0, "logits/rejected": -27535206.0, "logps/chosen": -477.915283203125, "logps/rejected": -184.5809783935547, "loss": 0.4302, "rewards/chosen": 0.069158174097538, "rewards/margins": 0.5746131911873817, "rewards/rejected": -0.5054550170898438, "step": 432 }, { "epoch": 0.022950732780324914, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38460134.4, "logits/rejected": -37671010.666666664, "logps/chosen": -297.62509765625, "logps/rejected": -398.5437825520833, "loss": 0.4296, "rewards/chosen": 0.06331665515899658, "rewards/margins": 0.7742384195327758, "rewards/rejected": -0.7109217643737793, "step": 433 }, { "epoch": 0.02300373678212705, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72154240.0, "logits/rejected": -21860480.0, "logps/chosen": -332.23077392578125, "logps/rejected": -309.704345703125, "loss": 0.4379, "rewards/chosen": 0.03649158403277397, "rewards/margins": 0.5126171223819256, "rewards/rejected": -0.4761255383491516, "step": 434 }, { "epoch": 0.023056740783929188, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -891757.0, "logits/rejected": 2248922.75, "logps/chosen": -139.74819946289062, "logps/rejected": -313.8717956542969, "loss": 0.4082, "rewards/chosen": 0.024893667548894882, "rewards/margins": 0.8064090274274349, "rewards/rejected": -0.78151535987854, "step": 435 }, { "epoch": 0.023109744785731322, "grad_norm": 57.25, "kl": 0.0064239501953125, "learning_rate": 5e-07, "logits/chosen": -42394976.0, "logits/rejected": -32329616.0, "logps/chosen": -306.06719970703125, "logps/rejected": -303.2636474609375, "loss": 0.4014, "rewards/chosen": 0.05024490753809611, "rewards/margins": 0.7123058835665385, "rewards/rejected": -0.6620609760284424, "step": 436 }, { "epoch": 0.02316274878753346, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6255558.5, "logits/rejected": -18879380.57142857, "logps/chosen": -351.9159851074219, "logps/rejected": -281.33888462611606, "loss": 0.4022, "rewards/chosen": -0.5170807242393494, "rewards/margins": 0.04225995710917885, "rewards/rejected": -0.5593406813485282, "step": 437 }, { "epoch": 0.023215752789335596, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80507402.66666667, "logits/rejected": -34122809.6, "logps/chosen": -203.46671549479166, "logps/rejected": -264.9173583984375, "loss": 0.4006, "rewards/chosen": 0.04808197418848673, "rewards/margins": 0.7643447915712992, "rewards/rejected": -0.7162628173828125, "step": 438 }, { "epoch": 0.02326875679113773, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37272160.0, "logits/rejected": -33068665.6, "logps/chosen": -308.0755208333333, "logps/rejected": -359.210205078125, "loss": 0.3882, "rewards/chosen": 0.01906178891658783, "rewards/margins": 0.8299192756414413, "rewards/rejected": -0.8108574867248535, "step": 439 }, { "epoch": 0.023321760792939867, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -152285568.0, "logits/rejected": -45908292.571428575, "logps/chosen": -34.98101806640625, "logps/rejected": -279.62594168526783, "loss": 0.3524, "rewards/chosen": -0.088018037378788, "rewards/margins": 0.6561353259852954, "rewards/rejected": -0.7441533633640834, "step": 440 }, { "epoch": 0.023374764794742004, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31559574.0, "logits/rejected": -32971812.0, "logps/chosen": -310.81573486328125, "logps/rejected": -241.9796142578125, "loss": 0.4133, "rewards/chosen": -0.033008478581905365, "rewards/margins": 0.7537583485245705, "rewards/rejected": -0.7867668271064758, "step": 441 }, { "epoch": 0.023427768796544138, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22383184.0, "logits/rejected": -37583018.666666664, "logps/chosen": -317.49761962890625, "logps/rejected": -331.1689046223958, "loss": 0.4037, "rewards/chosen": -0.06507740169763565, "rewards/margins": 0.5102059667309126, "rewards/rejected": -0.5752833684285482, "step": 442 }, { "epoch": 0.023480772798346275, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38565645.333333336, "logits/rejected": -18704142.4, "logps/chosen": -424.3050130208333, "logps/rejected": -154.87701416015625, "loss": 0.4381, "rewards/chosen": 0.08111470937728882, "rewards/margins": 0.44086905717849734, "rewards/rejected": -0.3597543478012085, "step": 443 }, { "epoch": 0.023533776800148412, "grad_norm": 58.5, "kl": 0.00408935546875, "learning_rate": 5e-07, "logits/chosen": -33303817.6, "logits/rejected": -7760930.666666667, "logps/chosen": -283.18388671875, "logps/rejected": -88.3270772298177, "loss": 0.4359, "rewards/chosen": 0.2247490406036377, "rewards/margins": 0.5452787399291992, "rewards/rejected": -0.3205296993255615, "step": 444 }, { "epoch": 0.023586780801950546, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36100384.0, "logits/rejected": -37662360.0, "logps/chosen": -235.06353759765625, "logps/rejected": -322.8015543619792, "loss": 0.3482, "rewards/chosen": -0.02340555191040039, "rewards/margins": 0.8502848943074545, "rewards/rejected": -0.8736904462178549, "step": 445 }, { "epoch": 0.023639784803752683, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21130490.666666668, "logits/rejected": -14909500.0, "logps/chosen": -150.76155598958334, "logps/rejected": -276.44390869140625, "loss": 0.4589, "rewards/chosen": 0.036405655244986214, "rewards/margins": 0.6221021140615145, "rewards/rejected": -0.5856964588165283, "step": 446 }, { "epoch": 0.02369278880555482, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27212269.333333332, "logits/rejected": -4056763.75, "logps/chosen": -392.9239095052083, "logps/rejected": -182.00250244140625, "loss": 0.446, "rewards/chosen": 0.11196146408716838, "rewards/margins": 0.6547959844271342, "rewards/rejected": -0.5428345203399658, "step": 447 }, { "epoch": 0.023745792807356954, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25277469.333333332, "logits/rejected": -10792824.0, "logps/chosen": -187.2939453125, "logps/rejected": -179.718408203125, "loss": 0.4206, "rewards/chosen": -0.08716805775960286, "rewards/margins": 0.49833957354227704, "rewards/rejected": -0.5855076313018799, "step": 448 }, { "epoch": 0.02379879680915909, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12552164.57142857, "logits/rejected": -19525434.0, "logps/chosen": -216.09514508928572, "logps/rejected": -80.52861022949219, "loss": 0.4946, "rewards/chosen": -0.01794507248061044, "rewards/margins": 0.28232882278306143, "rewards/rejected": -0.3002738952636719, "step": 449 }, { "epoch": 0.023851800810961228, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28556597.333333332, "logits/rejected": -8320995.5, "logps/chosen": -251.28314208984375, "logps/rejected": -138.7413787841797, "loss": 0.4842, "rewards/chosen": -0.05345567067464193, "rewards/margins": 0.3663002649943034, "rewards/rejected": -0.4197559356689453, "step": 450 }, { "epoch": 0.023904804812763365, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59754485.333333336, "logits/rejected": -9535616.8, "logps/chosen": -262.0679117838542, "logps/rejected": -207.2813232421875, "loss": 0.3901, "rewards/chosen": 0.023492942253748577, "rewards/margins": 0.7798546175161997, "rewards/rejected": -0.7563616752624511, "step": 451 }, { "epoch": 0.0239578088145655, "grad_norm": 86.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47475944.0, "logits/rejected": -37141400.0, "logps/chosen": -386.9144592285156, "logps/rejected": -327.857421875, "loss": 0.3953, "rewards/chosen": 0.04082965850830078, "rewards/margins": 0.8943386077880859, "rewards/rejected": -0.8535089492797852, "step": 452 }, { "epoch": 0.024010812816367636, "grad_norm": 79.0, "kl": 0.4444923400878906, "learning_rate": 5e-07, "logits/chosen": -43515177.6, "logits/rejected": 6597239.333333333, "logps/chosen": -470.84765625, "logps/rejected": -200.4901123046875, "loss": 0.3965, "rewards/chosen": 0.26208679676055907, "rewards/margins": 1.0002006769180298, "rewards/rejected": -0.7381138801574707, "step": 453 }, { "epoch": 0.024063816818169773, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21598548.8, "logits/rejected": -28829344.0, "logps/chosen": -286.84453125, "logps/rejected": -190.7711181640625, "loss": 0.4129, "rewards/chosen": 0.15739777088165283, "rewards/margins": 0.9654844045639038, "rewards/rejected": -0.808086633682251, "step": 454 }, { "epoch": 0.024116820819971907, "grad_norm": 69.5, "kl": 0.08018684387207031, "learning_rate": 5e-07, "logits/chosen": -22274058.0, "logits/rejected": -14408627.0, "logps/chosen": -470.8759765625, "logps/rejected": -219.98422241210938, "loss": 0.4301, "rewards/chosen": 0.147449791431427, "rewards/margins": 0.5746360123157501, "rewards/rejected": -0.4271862208843231, "step": 455 }, { "epoch": 0.024169824821774044, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75928128.0, "logits/rejected": -10899472.0, "logps/chosen": -258.71844482421875, "logps/rejected": -251.53567504882812, "loss": 0.4555, "rewards/chosen": -0.010174942202866077, "rewards/margins": 0.3951296089217067, "rewards/rejected": -0.40530455112457275, "step": 456 }, { "epoch": 0.02422282882357618, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46229136.0, "logits/rejected": -9513356.666666666, "logps/chosen": -341.07012939453125, "logps/rejected": -339.39272054036456, "loss": 0.3749, "rewards/chosen": 0.0633087158203125, "rewards/margins": 0.7591404914855957, "rewards/rejected": -0.6958317756652832, "step": 457 }, { "epoch": 0.024275832825378315, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8615694.0, "logits/rejected": -6536347.5, "logps/chosen": -146.157958984375, "logps/rejected": -239.26998901367188, "loss": 0.4269, "rewards/chosen": 0.04427070915699005, "rewards/margins": 0.611180379986763, "rewards/rejected": -0.566909670829773, "step": 458 }, { "epoch": 0.024328836827180452, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6291564.8, "logits/rejected": -10095698.666666666, "logps/chosen": -343.8064208984375, "logps/rejected": -132.54645792643228, "loss": 0.4871, "rewards/chosen": -0.16444457769393922, "rewards/margins": 0.2529690146446228, "rewards/rejected": -0.417413592338562, "step": 459 }, { "epoch": 0.02438184082898259, "grad_norm": 80.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78009828.57142857, "logits/rejected": -2652648.25, "logps/chosen": -363.15283203125, "logps/rejected": -41.159584045410156, "loss": 0.5019, "rewards/chosen": -0.04240351915359497, "rewards/margins": 0.19407758116722107, "rewards/rejected": -0.23648110032081604, "step": 460 }, { "epoch": 0.024434844830784723, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22273869.333333332, "logits/rejected": -22465472.0, "logps/chosen": -108.98923746744792, "logps/rejected": -368.5155029296875, "loss": 0.3884, "rewards/chosen": 0.2213141123453776, "rewards/margins": 0.8315817515055338, "rewards/rejected": -0.6102676391601562, "step": 461 }, { "epoch": 0.02448784883258686, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25761205.333333332, "logits/rejected": -54200092.0, "logps/chosen": -298.5270182291667, "logps/rejected": -253.4588623046875, "loss": 0.4626, "rewards/chosen": 0.02677253137032191, "rewards/margins": 0.5852060640851656, "rewards/rejected": -0.5584335327148438, "step": 462 }, { "epoch": 0.024540852834388997, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9105842.0, "logits/rejected": -23650845.714285713, "logps/chosen": -111.46673583984375, "logps/rejected": -253.16423688616072, "loss": 0.3485, "rewards/chosen": -0.07757721096277237, "rewards/margins": 0.6722319179347583, "rewards/rejected": -0.7498091288975307, "step": 463 }, { "epoch": 0.02459385683619113, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -196445.375, "logits/rejected": -35187632.0, "logps/chosen": -763.397216796875, "logps/rejected": -704.77880859375, "loss": 0.4129, "rewards/chosen": -0.2226146161556244, "rewards/margins": 1.5574388802051544, "rewards/rejected": -1.7800534963607788, "step": 464 }, { "epoch": 0.024646860837993268, "grad_norm": 66.5, "kl": 0.016998291015625, "learning_rate": 5e-07, "logits/chosen": -53893877.333333336, "logits/rejected": -73942016.0, "logps/chosen": -517.785400390625, "logps/rejected": -163.612939453125, "loss": 0.4396, "rewards/chosen": -0.055889894564946495, "rewards/margins": 0.3820322016874949, "rewards/rejected": -0.4379220962524414, "step": 465 }, { "epoch": 0.024699864839795405, "grad_norm": 76.0, "kl": 0.1502084732055664, "learning_rate": 5e-07, "logits/chosen": 7655844.666666667, "logits/rejected": -41579872.0, "logps/chosen": -442.2077229817708, "logps/rejected": -433.56982421875, "loss": 0.3839, "rewards/chosen": 0.06988958021004994, "rewards/margins": 0.855507027109464, "rewards/rejected": -0.785617446899414, "step": 466 }, { "epoch": 0.02475286884159754, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30165456.0, "logits/rejected": -34835768.0, "logps/chosen": -197.8447469075521, "logps/rejected": -503.78558349609375, "loss": 0.4641, "rewards/chosen": -0.07973817487557729, "rewards/margins": 0.8956005821625391, "rewards/rejected": -0.9753387570381165, "step": 467 }, { "epoch": 0.024805872843399676, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17764617.6, "logits/rejected": -23614968.0, "logps/chosen": -205.795068359375, "logps/rejected": -129.4226277669271, "loss": 0.4601, "rewards/chosen": -0.024918897449970244, "rewards/margins": 0.45166764309008917, "rewards/rejected": -0.4765865405400594, "step": 468 }, { "epoch": 0.024858876845201813, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10440355.0, "logits/rejected": -33795451.428571425, "logps/chosen": -496.04437255859375, "logps/rejected": -410.711669921875, "loss": 0.3242, "rewards/chosen": 0.32648926973342896, "rewards/margins": 1.1719336083957126, "rewards/rejected": -0.8454443386622837, "step": 469 }, { "epoch": 0.02491188084700395, "grad_norm": 78.5, "kl": 0.5521316528320312, "learning_rate": 5e-07, "logits/chosen": -40747942.4, "logits/rejected": -23258640.0, "logps/chosen": -489.5056640625, "logps/rejected": -167.95585123697916, "loss": 0.4439, "rewards/chosen": 0.19955703020095825, "rewards/margins": 0.6447794079780579, "rewards/rejected": -0.4452223777770996, "step": 470 }, { "epoch": 0.024964884848806084, "grad_norm": 78.0, "kl": 0.009165763854980469, "learning_rate": 5e-07, "logits/chosen": 3780705.0, "logits/rejected": -88376976.0, "logps/chosen": -364.1873372395833, "logps/rejected": -938.0186157226562, "loss": 0.4127, "rewards/chosen": -0.025679017106691997, "rewards/margins": 2.202592467268308, "rewards/rejected": -2.228271484375, "step": 471 }, { "epoch": 0.02501788885060822, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9867545.0, "logits/rejected": -18814360.0, "logps/chosen": -195.31634521484375, "logps/rejected": -541.7720336914062, "loss": 0.3829, "rewards/chosen": 0.038860030472278595, "rewards/margins": 1.0257229879498482, "rewards/rejected": -0.9868629574775696, "step": 472 }, { "epoch": 0.02507089285241036, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72872528.0, "logits/rejected": -9595417.0, "logps/chosen": -541.7188720703125, "logps/rejected": -96.162353515625, "loss": 0.4709, "rewards/chosen": -0.020124807953834534, "rewards/margins": 0.23797504603862762, "rewards/rejected": -0.25809985399246216, "step": 473 }, { "epoch": 0.025123896854212492, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30889864.0, "logits/rejected": -13030342.0, "logps/chosen": -255.17268880208334, "logps/rejected": -222.6920166015625, "loss": 0.5009, "rewards/chosen": -0.14025014638900757, "rewards/margins": 0.27591273188591003, "rewards/rejected": -0.4161628782749176, "step": 474 }, { "epoch": 0.02517690085601463, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7944582.4, "logits/rejected": -15863125.333333334, "logps/chosen": -299.545068359375, "logps/rejected": -198.99104817708334, "loss": 0.4446, "rewards/chosen": 0.013850519061088562, "rewards/margins": 0.6180656641721726, "rewards/rejected": -0.604215145111084, "step": 475 }, { "epoch": 0.025229904857816766, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13400160.0, "logits/rejected": -53012736.0, "logps/chosen": -195.9812469482422, "logps/rejected": -427.6076965332031, "loss": 0.3908, "rewards/chosen": 0.04355897754430771, "rewards/margins": 0.9838116392493248, "rewards/rejected": -0.9402526617050171, "step": 476 }, { "epoch": 0.0252829088596189, "grad_norm": 65.5, "kl": 0.19766807556152344, "learning_rate": 5e-07, "logits/chosen": -58127984.0, "logits/rejected": -54368720.0, "logps/chosen": -494.900634765625, "logps/rejected": -405.435302734375, "loss": 0.3722, "rewards/chosen": 0.18289947509765625, "rewards/margins": 1.238006830215454, "rewards/rejected": -1.0551073551177979, "step": 477 }, { "epoch": 0.025335912861421037, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59606480.0, "logits/rejected": -23520558.0, "logps/chosen": -142.5194091796875, "logps/rejected": -268.4916687011719, "loss": 0.425, "rewards/chosen": -0.16148261725902557, "rewards/margins": 0.6882274895906448, "rewards/rejected": -0.8497101068496704, "step": 478 }, { "epoch": 0.025388916863223174, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23979875.2, "logits/rejected": -20300496.0, "logps/chosen": -143.4165771484375, "logps/rejected": -165.4419962565104, "loss": 0.442, "rewards/chosen": -0.035958591103553775, "rewards/margins": 0.6763231923182805, "rewards/rejected": -0.7122817834218343, "step": 479 }, { "epoch": 0.025441920865025308, "grad_norm": 60.0, "kl": 0.13114166259765625, "learning_rate": 5e-07, "logits/chosen": -33820980.0, "logits/rejected": -75289168.0, "logps/chosen": -221.56907653808594, "logps/rejected": -348.69940185546875, "loss": 0.4334, "rewards/chosen": -0.08272343128919601, "rewards/margins": 0.6237928345799446, "rewards/rejected": -0.7065162658691406, "step": 480 }, { "epoch": 0.025494924866827445, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53694480.0, "logits/rejected": -9735324.0, "logps/chosen": -347.39849853515625, "logps/rejected": -153.6577911376953, "loss": 0.4418, "rewards/chosen": -0.04207067936658859, "rewards/margins": 0.49140841513872147, "rewards/rejected": -0.5334790945053101, "step": 481 }, { "epoch": 0.025547928868629582, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24527401.6, "logits/rejected": -25151504.0, "logps/chosen": -249.95537109375, "logps/rejected": -114.98826090494792, "loss": 0.4495, "rewards/chosen": 0.0480725109577179, "rewards/margins": 0.5376840968926747, "rewards/rejected": -0.48961158593495685, "step": 482 }, { "epoch": 0.025600932870431716, "grad_norm": 66.0, "kl": 0.24271392822265625, "learning_rate": 5e-07, "logits/chosen": -69032646.4, "logits/rejected": -98897024.0, "logps/chosen": -419.76162109375, "logps/rejected": -620.9202067057291, "loss": 0.3916, "rewards/chosen": 0.0980429470539093, "rewards/margins": 1.314702969789505, "rewards/rejected": -1.2166600227355957, "step": 483 }, { "epoch": 0.025653936872233853, "grad_norm": 91.5, "kl": 0.27640533447265625, "learning_rate": 5e-07, "logits/chosen": -3793074.75, "logits/rejected": -40539712.0, "logps/chosen": -908.04541015625, "logps/rejected": -506.5746154785156, "loss": 0.3523, "rewards/chosen": 0.16487999260425568, "rewards/margins": 1.4519178420305252, "rewards/rejected": -1.2870378494262695, "step": 484 }, { "epoch": 0.02570694087403599, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9401160.8, "logits/rejected": -45022965.333333336, "logps/chosen": -411.19814453125, "logps/rejected": -418.7001546223958, "loss": 0.3996, "rewards/chosen": 0.06529075503349305, "rewards/margins": 1.1207249025503796, "rewards/rejected": -1.0554341475168865, "step": 485 }, { "epoch": 0.025759944875838124, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40500893.333333336, "logits/rejected": -23099715.2, "logps/chosen": -360.1653238932292, "logps/rejected": -404.5396484375, "loss": 0.3792, "rewards/chosen": 0.007448842128117879, "rewards/margins": 0.9259454508622488, "rewards/rejected": -0.9184966087341309, "step": 486 }, { "epoch": 0.02581294887764026, "grad_norm": 64.5, "kl": 0.01668548583984375, "learning_rate": 5e-07, "logits/chosen": -16748406.0, "logits/rejected": -36189328.0, "logps/chosen": -411.4163818359375, "logps/rejected": -417.6434631347656, "loss": 0.3943, "rewards/chosen": 0.0676983892917633, "rewards/margins": 0.9277023375034332, "rewards/rejected": -0.8600039482116699, "step": 487 }, { "epoch": 0.0258659528794424, "grad_norm": 102.5, "kl": 0.0029850006103515625, "learning_rate": 5e-07, "logits/chosen": -38504704.0, "logits/rejected": -5105646.0, "logps/chosen": -684.830078125, "logps/rejected": -349.1849365234375, "loss": 0.4514, "rewards/chosen": 0.10374422868092854, "rewards/margins": 0.6040330131848654, "rewards/rejected": -0.5002887845039368, "step": 488 }, { "epoch": 0.025918956881244536, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70156405.33333333, "logits/rejected": -4551966.4, "logps/chosen": -339.03550211588544, "logps/rejected": -162.17802734375, "loss": 0.4521, "rewards/chosen": 0.008129882315794626, "rewards/margins": 0.3138523335258166, "rewards/rejected": -0.30572245121002195, "step": 489 }, { "epoch": 0.02597196088304667, "grad_norm": 62.75, "kl": 0.07387542724609375, "learning_rate": 5e-07, "logits/chosen": -35340000.0, "logits/rejected": -11930809.333333334, "logps/chosen": -221.477978515625, "logps/rejected": -413.0591634114583, "loss": 0.4339, "rewards/chosen": -0.021790924668312072, "rewards/margins": 0.8922403424978256, "rewards/rejected": -0.9140312671661377, "step": 490 }, { "epoch": 0.026024964884848806, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33684400.0, "logits/rejected": -23609276.8, "logps/chosen": -261.32379150390625, "logps/rejected": -188.894384765625, "loss": 0.4239, "rewards/chosen": 0.1215924620628357, "rewards/margins": 0.5447365641593933, "rewards/rejected": -0.4231441020965576, "step": 491 }, { "epoch": 0.026077968886650944, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13818631.0, "logits/rejected": -42389836.0, "logps/chosen": -303.5613708496094, "logps/rejected": -294.3352966308594, "loss": 0.4107, "rewards/chosen": -0.07628097385168076, "rewards/margins": 0.7711601033806801, "rewards/rejected": -0.8474410772323608, "step": 492 }, { "epoch": 0.026130972888453077, "grad_norm": 61.75, "kl": 0.27758026123046875, "learning_rate": 5e-07, "logits/chosen": -47058632.0, "logits/rejected": -6827596.0, "logps/chosen": -392.9112243652344, "logps/rejected": -281.8734130859375, "loss": 0.3895, "rewards/chosen": 0.23024465143680573, "rewards/margins": 1.0033697038888931, "rewards/rejected": -0.7731250524520874, "step": 493 }, { "epoch": 0.026183976890255214, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39158582.4, "logits/rejected": -29165818.666666668, "logps/chosen": -284.05654296875, "logps/rejected": -192.99686686197916, "loss": 0.4415, "rewards/chosen": 0.10379676818847657, "rewards/margins": 0.5649735609690348, "rewards/rejected": -0.4611767927805583, "step": 494 }, { "epoch": 0.02623698089205735, "grad_norm": 71.0, "kl": 0.12679481506347656, "learning_rate": 5e-07, "logits/chosen": -25819380.8, "logits/rejected": -33775301.333333336, "logps/chosen": -417.5537109375, "logps/rejected": -467.2754313151042, "loss": 0.4006, "rewards/chosen": 0.09277679920196533, "rewards/margins": 1.2872668345769245, "rewards/rejected": -1.1944900353749592, "step": 495 }, { "epoch": 0.026289984893859485, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27856152.0, "logits/rejected": -83610224.0, "logps/chosen": -192.82000732421875, "logps/rejected": -447.25164794921875, "loss": 0.3955, "rewards/chosen": 0.10804887115955353, "rewards/margins": 0.9048636704683304, "rewards/rejected": -0.7968147993087769, "step": 496 }, { "epoch": 0.026342988895661622, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -809340.0, "logits/rejected": -22764037.333333332, "logps/chosen": -460.107275390625, "logps/rejected": -211.84004720052084, "loss": 0.4506, "rewards/chosen": 0.040740966796875, "rewards/margins": 0.5391701062520344, "rewards/rejected": -0.4984291394551595, "step": 497 }, { "epoch": 0.02639599289746376, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6201214.666666667, "logits/rejected": -36216102.4, "logps/chosen": -280.10546875, "logps/rejected": -335.33876953125, "loss": 0.3767, "rewards/chosen": -0.1557133992513021, "rewards/margins": 0.8048887888590495, "rewards/rejected": -0.9606021881103516, "step": 498 }, { "epoch": 0.026448996899265893, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31992921.6, "logits/rejected": 36345442.666666664, "logps/chosen": -462.9736328125, "logps/rejected": -490.2316080729167, "loss": 0.3798, "rewards/chosen": 0.10575733184814454, "rewards/margins": 1.370092010498047, "rewards/rejected": -1.2643346786499023, "step": 499 }, { "epoch": 0.02650200090106803, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45810044.0, "logits/rejected": -25185116.0, "logps/chosen": -347.9161376953125, "logps/rejected": -348.93878173828125, "loss": 0.4008, "rewards/chosen": -0.01025695726275444, "rewards/margins": 0.8438888676464558, "rewards/rejected": -0.8541458249092102, "step": 500 }, { "epoch": 0.026555004902870168, "grad_norm": 60.75, "kl": 0.4456520080566406, "learning_rate": 5e-07, "logits/chosen": -54132152.0, "logits/rejected": -38384664.0, "logps/chosen": -198.98463439941406, "logps/rejected": -374.2876281738281, "loss": 0.3948, "rewards/chosen": 0.09614171832799911, "rewards/margins": 1.026691697537899, "rewards/rejected": -0.9305499792098999, "step": 501 }, { "epoch": 0.0266080089046723, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22965282.666666668, "logits/rejected": -65245708.0, "logps/chosen": -249.34735107421875, "logps/rejected": -211.40798950195312, "loss": 0.4352, "rewards/chosen": 0.1833156148592631, "rewards/margins": 0.7026248375574747, "rewards/rejected": -0.5193092226982117, "step": 502 }, { "epoch": 0.02666101290647444, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13051294.666666666, "logits/rejected": -21945096.0, "logps/chosen": -320.26123046875, "logps/rejected": -214.24830627441406, "loss": 0.4718, "rewards/chosen": -0.06264909108479817, "rewards/margins": 0.613375167051951, "rewards/rejected": -0.6760242581367493, "step": 503 }, { "epoch": 0.026714016908276576, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19377010.0, "logits/rejected": -38933664.0, "logps/chosen": -145.78176879882812, "logps/rejected": -236.5196533203125, "loss": 0.4026, "rewards/chosen": 0.09132423996925354, "rewards/margins": 0.8230910003185272, "rewards/rejected": -0.7317667603492737, "step": 504 }, { "epoch": 0.02676702091007871, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35965253.333333336, "logits/rejected": -18642756.0, "logps/chosen": -206.52189127604166, "logps/rejected": -310.4316101074219, "loss": 0.4547, "rewards/chosen": -0.01862703760464986, "rewards/margins": 0.9180554846922556, "rewards/rejected": -0.9366825222969055, "step": 505 }, { "epoch": 0.026820024911880846, "grad_norm": 77.5, "kl": 0.23147964477539062, "learning_rate": 5e-07, "logits/chosen": 6994088.0, "logits/rejected": -24380466.666666668, "logps/chosen": -456.77587890625, "logps/rejected": -231.80794270833334, "loss": 0.4032, "rewards/chosen": 0.25792951583862306, "rewards/margins": 1.0455256621042888, "rewards/rejected": -0.7875961462656657, "step": 506 }, { "epoch": 0.026873028913682984, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56862309.333333336, "logits/rejected": -39304976.0, "logps/chosen": -517.559814453125, "logps/rejected": -259.5279296875, "loss": 0.3725, "rewards/chosen": -0.02453511705001195, "rewards/margins": 0.9373993540803591, "rewards/rejected": -0.9619344711303711, "step": 507 }, { "epoch": 0.02692603291548512, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34752944.0, "logits/rejected": -38689768.0, "logps/chosen": -270.7983093261719, "logps/rejected": -532.5167236328125, "loss": 0.3955, "rewards/chosen": -0.2065322995185852, "rewards/margins": 0.9881554245948792, "rewards/rejected": -1.1946877241134644, "step": 508 }, { "epoch": 0.026979036917287254, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40829152.0, "logits/rejected": -14802548.0, "logps/chosen": -403.71746826171875, "logps/rejected": -102.79901123046875, "loss": 0.4051, "rewards/chosen": 0.21969376504421234, "rewards/margins": 0.6620666434367497, "rewards/rejected": -0.4423728783925374, "step": 509 }, { "epoch": 0.02703204091908939, "grad_norm": 66.5, "kl": 0.03461456298828125, "learning_rate": 5e-07, "logits/chosen": -16185524.0, "logits/rejected": -32011372.8, "logps/chosen": -344.8957112630208, "logps/rejected": -219.75390625, "loss": 0.431, "rewards/chosen": 0.0032848368088404336, "rewards/margins": 0.47502808670202895, "rewards/rejected": -0.4717432498931885, "step": 510 }, { "epoch": 0.02708504492089153, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95743680.0, "logits/rejected": -17003726.4, "logps/chosen": -443.5765787760417, "logps/rejected": -253.5253173828125, "loss": 0.3943, "rewards/chosen": -0.1038625140984853, "rewards/margins": 0.7655092815558115, "rewards/rejected": -0.8693717956542969, "step": 511 }, { "epoch": 0.027138048922693662, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24244770.666666668, "logits/rejected": -11343353.0, "logps/chosen": -192.6620890299479, "logps/rejected": -469.93841552734375, "loss": 0.4286, "rewards/chosen": 0.035784244537353516, "rewards/margins": 1.199363112449646, "rewards/rejected": -1.1635788679122925, "step": 512 }, { "epoch": 0.0271910529244958, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4089860.0, "logits/rejected": -43290536.0, "logps/chosen": -81.478271484375, "logps/rejected": -557.7687377929688, "loss": 0.3911, "rewards/chosen": 0.1421807607014974, "rewards/margins": 1.7337395747502644, "rewards/rejected": -1.591558814048767, "step": 513 }, { "epoch": 0.027244056926297937, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39183709.333333336, "logits/rejected": -27550868.0, "logps/chosen": -475.932861328125, "logps/rejected": -267.5281066894531, "loss": 0.4426, "rewards/chosen": 0.09132214387257893, "rewards/margins": 0.8157237966855367, "rewards/rejected": -0.7244016528129578, "step": 514 }, { "epoch": 0.02729706092810007, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1338573.0, "logits/rejected": -49463492.0, "logps/chosen": -199.8595174153646, "logps/rejected": -582.1170043945312, "loss": 0.4432, "rewards/chosen": -0.08444156249364217, "rewards/margins": 1.2459280292193096, "rewards/rejected": -1.3303695917129517, "step": 515 }, { "epoch": 0.027350064929902208, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36806578.666666664, "logits/rejected": -6448091.2, "logps/chosen": -359.0018310546875, "logps/rejected": -250.0943603515625, "loss": 0.4022, "rewards/chosen": -0.009187246362368265, "rewards/margins": 0.650400518377622, "rewards/rejected": -0.6595877647399903, "step": 516 }, { "epoch": 0.027403068931704345, "grad_norm": 55.25, "kl": 0.24188518524169922, "learning_rate": 5e-07, "logits/chosen": -54939984.0, "logits/rejected": -9690042.0, "logps/chosen": -419.3828125, "logps/rejected": -90.08976745605469, "loss": 0.4428, "rewards/chosen": 0.155162051320076, "rewards/margins": 0.5064456015825272, "rewards/rejected": -0.35128355026245117, "step": 517 }, { "epoch": 0.02745607293350648, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2025331.6666666667, "logits/rejected": -9109902.4, "logps/chosen": -64.65957641601562, "logps/rejected": -362.769921875, "loss": 0.3899, "rewards/chosen": 0.033389473954836525, "rewards/margins": 0.7694681177536646, "rewards/rejected": -0.7360786437988281, "step": 518 }, { "epoch": 0.027509076935308616, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30626018.666666668, "logits/rejected": -66670700.8, "logps/chosen": -392.9549153645833, "logps/rejected": -480.88662109375, "loss": 0.3171, "rewards/chosen": -0.02049458771944046, "rewards/margins": 1.4098065063357352, "rewards/rejected": -1.4303010940551757, "step": 519 }, { "epoch": 0.027562080937110753, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15303228.8, "logits/rejected": -34302741.333333336, "logps/chosen": -315.1672119140625, "logps/rejected": -341.2999267578125, "loss": 0.3984, "rewards/chosen": 0.10834929943084717, "rewards/margins": 1.1421592791875204, "rewards/rejected": -1.033809979756673, "step": 520 }, { "epoch": 0.027615084938912886, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28231426.0, "logits/rejected": -28564002.666666668, "logps/chosen": -404.0678405761719, "logps/rejected": -364.1532796223958, "loss": 0.2804, "rewards/chosen": 0.16050797700881958, "rewards/margins": 1.5787946184476216, "rewards/rejected": -1.418286641438802, "step": 521 }, { "epoch": 0.027668088940715024, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69522016.0, "logits/rejected": -13893739.0, "logps/chosen": -249.93499755859375, "logps/rejected": -145.10745239257812, "loss": 0.4397, "rewards/chosen": 0.11577829718589783, "rewards/margins": 0.48779234290122986, "rewards/rejected": -0.37201404571533203, "step": 522 }, { "epoch": 0.02772109294251716, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19185720.0, "logits/rejected": -903212.0, "logps/chosen": -267.4584655761719, "logps/rejected": -58.49989700317383, "loss": 0.4659, "rewards/chosen": 0.026094242930412292, "rewards/margins": 0.27538882195949554, "rewards/rejected": -0.24929457902908325, "step": 523 }, { "epoch": 0.027774096944319294, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59173676.8, "logits/rejected": -25323109.333333332, "logps/chosen": -416.46201171875, "logps/rejected": -452.603271484375, "loss": 0.3996, "rewards/chosen": 0.17069060802459718, "rewards/margins": 1.0672625621159872, "rewards/rejected": -0.89657195409139, "step": 524 }, { "epoch": 0.02782710094612143, "grad_norm": 77.0, "kl": 0.6885147094726562, "learning_rate": 5e-07, "logits/chosen": -29005092.0, "logits/rejected": 35597552.0, "logps/chosen": -520.4542236328125, "logps/rejected": -476.7376403808594, "loss": 0.3922, "rewards/chosen": 0.18073798716068268, "rewards/margins": 1.0617621093988419, "rewards/rejected": -0.8810241222381592, "step": 525 }, { "epoch": 0.02788010494792357, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28712098.666666668, "logits/rejected": -38557667.2, "logps/chosen": -306.8104654947917, "logps/rejected": -384.600830078125, "loss": 0.3504, "rewards/chosen": 0.07416534423828125, "rewards/margins": 1.1715725898742675, "rewards/rejected": -1.0974072456359862, "step": 526 }, { "epoch": 0.027933108949725706, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10939454.0, "logits/rejected": -41933856.0, "logps/chosen": -458.3921305338542, "logps/rejected": -398.17294921875, "loss": 0.3708, "rewards/chosen": -0.13722026348114014, "rewards/margins": 0.870028805732727, "rewards/rejected": -1.0072490692138671, "step": 527 }, { "epoch": 0.02798611295152784, "grad_norm": 99.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13983599.0, "logits/rejected": -19687824.0, "logps/chosen": -865.3206176757812, "logps/rejected": -418.37603759765625, "loss": 0.3984, "rewards/chosen": 0.102716825902462, "rewards/margins": 0.868592880666256, "rewards/rejected": -0.765876054763794, "step": 528 }, { "epoch": 0.028039116953329977, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84737280.0, "logits/rejected": 13158309.333333334, "logps/chosen": -415.1881408691406, "logps/rejected": -210.57157389322916, "loss": 0.4283, "rewards/chosen": -0.05759277939796448, "rewards/margins": 0.3575735191504161, "rewards/rejected": -0.41516629854838055, "step": 529 }, { "epoch": 0.028092120955132114, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67333605.33333333, "logits/rejected": -19886984.0, "logps/chosen": -324.5364990234375, "logps/rejected": -192.74345703125, "loss": 0.3676, "rewards/chosen": 0.11831939220428467, "rewards/margins": 0.983943247795105, "rewards/rejected": -0.8656238555908203, "step": 530 }, { "epoch": 0.028145124956934248, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12547392.0, "logits/rejected": -41225104.0, "logps/chosen": -173.191015625, "logps/rejected": -247.76322428385416, "loss": 0.4392, "rewards/chosen": -0.08038231134414672, "rewards/margins": 0.7655537486076355, "rewards/rejected": -0.8459360599517822, "step": 531 }, { "epoch": 0.028198128958736385, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6528104.0, "logits/rejected": 2127051.3333333335, "logps/chosen": -444.5689697265625, "logps/rejected": -303.28245035807294, "loss": 0.3674, "rewards/chosen": 0.002386856824159622, "rewards/margins": 0.8335554289321104, "rewards/rejected": -0.8311685721079508, "step": 532 }, { "epoch": 0.028251132960538522, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9401600.0, "logits/rejected": -86167672.0, "logps/chosen": -243.3358154296875, "logps/rejected": -306.9985656738281, "loss": 0.3833, "rewards/chosen": 0.10254477709531784, "rewards/margins": 1.1022831127047539, "rewards/rejected": -0.999738335609436, "step": 533 }, { "epoch": 0.028304136962340656, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14892441.333333334, "logits/rejected": 3643252.0, "logps/chosen": -141.42369588216147, "logps/rejected": -271.3794921875, "loss": 0.4009, "rewards/chosen": -0.14428838094075522, "rewards/margins": 0.6257627646128336, "rewards/rejected": -0.7700511455535889, "step": 534 }, { "epoch": 0.028357140964142793, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16838162.285714287, "logits/rejected": -1266610.5, "logps/chosen": -169.28920200892858, "logps/rejected": -205.3192138671875, "loss": 0.4884, "rewards/chosen": 0.03894078731536865, "rewards/margins": 0.1407855749130249, "rewards/rejected": -0.10184478759765625, "step": 535 }, { "epoch": 0.02841014496594493, "grad_norm": 70.5, "kl": 0.3985424041748047, "learning_rate": 5e-07, "logits/chosen": -22014032.0, "logits/rejected": -63803980.0, "logps/chosen": -335.6688755580357, "logps/rejected": -165.93682861328125, "loss": 0.4698, "rewards/chosen": 0.08305950675691877, "rewards/margins": 0.8221769503184727, "rewards/rejected": -0.739117443561554, "step": 536 }, { "epoch": 0.028463148967747064, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16809812.0, "logits/rejected": -39383272.0, "logps/chosen": -180.22274780273438, "logps/rejected": -509.1372985839844, "loss": 0.3782, "rewards/chosen": 0.02434377744793892, "rewards/margins": 1.276333499699831, "rewards/rejected": -1.251989722251892, "step": 537 }, { "epoch": 0.0285161529695492, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2888655.75, "logits/rejected": -48268946.28571428, "logps/chosen": -31.42837905883789, "logps/rejected": -233.88106863839286, "loss": 0.3445, "rewards/chosen": -0.028352737426757812, "rewards/margins": 0.7487207821437291, "rewards/rejected": -0.7770735195704869, "step": 538 }, { "epoch": 0.028569156971351338, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5053152.5, "logits/rejected": -25559716.0, "logps/chosen": -181.8839111328125, "logps/rejected": -310.4332275390625, "loss": 0.4075, "rewards/chosen": 0.0497802309691906, "rewards/margins": 0.782241802662611, "rewards/rejected": -0.7324615716934204, "step": 539 }, { "epoch": 0.02862216097315347, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31197248.0, "logits/rejected": -8875729.333333334, "logps/chosen": -308.88787841796875, "logps/rejected": -274.63120524088544, "loss": 0.4089, "rewards/chosen": 0.04205017536878586, "rewards/margins": 0.5852252766489983, "rewards/rejected": -0.5431751012802124, "step": 540 }, { "epoch": 0.02867516497495561, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27830035.2, "logits/rejected": -82496437.33333333, "logps/chosen": -301.0470947265625, "logps/rejected": -521.1018880208334, "loss": 0.3707, "rewards/chosen": 0.16786850690841676, "rewards/margins": 1.4143041491508483, "rewards/rejected": -1.2464356422424316, "step": 541 }, { "epoch": 0.028728168976757746, "grad_norm": 55.0, "kl": 0.13103199005126953, "learning_rate": 5e-07, "logits/chosen": 4205817.0, "logits/rejected": -10590649.0, "logps/chosen": -175.74197387695312, "logps/rejected": -353.65478515625, "loss": 0.4293, "rewards/chosen": 0.0011762678623199463, "rewards/margins": 0.6601645052433014, "rewards/rejected": -0.6589882373809814, "step": 542 }, { "epoch": 0.02878117297855988, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38573164.0, "logits/rejected": -25018702.0, "logps/chosen": -144.5326385498047, "logps/rejected": -583.63330078125, "loss": 0.3857, "rewards/chosen": 0.01930389180779457, "rewards/margins": 1.4927100874483585, "rewards/rejected": -1.473406195640564, "step": 543 }, { "epoch": 0.028834176980362017, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29583658.0, "logits/rejected": -11015514.0, "logps/chosen": -161.82199096679688, "logps/rejected": -307.01123046875, "loss": 0.3888, "rewards/chosen": 0.08873462677001953, "rewards/margins": 1.0118663311004639, "rewards/rejected": -0.9231317043304443, "step": 544 }, { "epoch": 0.028887180982164154, "grad_norm": 51.75, "kl": 0.16963577270507812, "learning_rate": 5e-07, "logits/chosen": -6495756.0, "logits/rejected": -9970523.0, "logps/chosen": -197.28585815429688, "logps/rejected": -126.89134216308594, "loss": 0.4164, "rewards/chosen": 0.21181049942970276, "rewards/margins": 0.7188500463962555, "rewards/rejected": -0.5070395469665527, "step": 545 }, { "epoch": 0.02894018498396629, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17342634.0, "logits/rejected": -17431510.0, "logps/chosen": -236.95498657226562, "logps/rejected": -234.14385986328125, "loss": 0.425, "rewards/chosen": 0.004236603155732155, "rewards/margins": 0.6402541641145945, "rewards/rejected": -0.6360175609588623, "step": 546 }, { "epoch": 0.028993188985768425, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6362112.4, "logits/rejected": -14587990.666666666, "logps/chosen": -155.8077880859375, "logps/rejected": -349.444580078125, "loss": 0.4231, "rewards/chosen": 0.0402135044336319, "rewards/margins": 0.8833071380853653, "rewards/rejected": -0.8430936336517334, "step": 547 }, { "epoch": 0.029046192987570562, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46982088.0, "logits/rejected": -42865876.0, "logps/chosen": -302.7421875, "logps/rejected": -412.15228271484375, "loss": 0.4109, "rewards/chosen": -0.24476434290409088, "rewards/margins": 0.8794905990362167, "rewards/rejected": -1.1242549419403076, "step": 548 }, { "epoch": 0.0290991969893727, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35120716.8, "logits/rejected": -152394229.33333334, "logps/chosen": -273.3775390625, "logps/rejected": -428.982666015625, "loss": 0.4151, "rewards/chosen": -0.05833171606063843, "rewards/margins": 1.1895625710487365, "rewards/rejected": -1.247894287109375, "step": 549 }, { "epoch": 0.029152200991174833, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17905892.8, "logits/rejected": -33181770.666666668, "logps/chosen": -129.1954833984375, "logps/rejected": -507.9973958333333, "loss": 0.3924, "rewards/chosen": 0.057004863023757936, "rewards/margins": 1.5503871579964954, "rewards/rejected": -1.4933822949727376, "step": 550 }, { "epoch": 0.02920520499297697, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42029587.2, "logits/rejected": 10205129.333333334, "logps/chosen": -447.056640625, "logps/rejected": -387.6676025390625, "loss": 0.3991, "rewards/chosen": 0.013500362634658813, "rewards/margins": 1.278725375731786, "rewards/rejected": -1.2652250130971272, "step": 551 }, { "epoch": 0.029258208994779107, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41648716.0, "logits/rejected": -16177070.857142856, "logps/chosen": -178.02734375, "logps/rejected": -217.55414690290178, "loss": 0.318, "rewards/chosen": -0.02277221716940403, "rewards/margins": 0.9377038407006434, "rewards/rejected": -0.9604760578700474, "step": 552 }, { "epoch": 0.02931121299658124, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35584058.666666664, "logits/rejected": -6477673.0, "logps/chosen": -256.77329508463544, "logps/rejected": -108.33985900878906, "loss": 0.4711, "rewards/chosen": 0.03501679003238678, "rewards/margins": 0.3967508226633072, "rewards/rejected": -0.3617340326309204, "step": 553 }, { "epoch": 0.029364216998383378, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17919453.333333332, "logits/rejected": -55087667.2, "logps/chosen": -273.0784098307292, "logps/rejected": -259.2543701171875, "loss": 0.3754, "rewards/chosen": -0.04271748661994934, "rewards/margins": 0.8666777312755585, "rewards/rejected": -0.9093952178955078, "step": 554 }, { "epoch": 0.029417221000185515, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69838245.33333333, "logits/rejected": -41340627.2, "logps/chosen": -593.8429361979166, "logps/rejected": -459.60771484375, "loss": 0.3386, "rewards/chosen": 0.251336673895518, "rewards/margins": 1.2314138452212016, "rewards/rejected": -0.9800771713256836, "step": 555 }, { "epoch": 0.02947022500198765, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15805636.0, "logits/rejected": -26263992.0, "logps/chosen": -228.47784423828125, "logps/rejected": -234.91189575195312, "loss": 0.4391, "rewards/chosen": -0.03937134891748428, "rewards/margins": 0.5320884808897972, "rewards/rejected": -0.5714598298072815, "step": 556 }, { "epoch": 0.029523229003789786, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49308808.0, "logits/rejected": -17969374.0, "logps/chosen": -332.25634765625, "logps/rejected": -271.3902282714844, "loss": 0.4102, "rewards/chosen": -0.04055023193359375, "rewards/margins": 0.931312084197998, "rewards/rejected": -0.9718623161315918, "step": 557 }, { "epoch": 0.029576233005591923, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81412216.0, "logits/rejected": -30797526.0, "logps/chosen": -217.86293029785156, "logps/rejected": -321.83819580078125, "loss": 0.4112, "rewards/chosen": 0.06633572280406952, "rewards/margins": 0.7629366368055344, "rewards/rejected": -0.6966009140014648, "step": 558 }, { "epoch": 0.029629237007394057, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -435381.6, "logits/rejected": -45311226.666666664, "logps/chosen": -154.11102294921875, "logps/rejected": -386.5876871744792, "loss": 0.4269, "rewards/chosen": 0.05299408435821533, "rewards/margins": 0.7984615882237752, "rewards/rejected": -0.7454675038655599, "step": 559 }, { "epoch": 0.029682241009196194, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12472772.0, "logits/rejected": -8953463.0, "logps/chosen": -322.3165283203125, "logps/rejected": -186.131103515625, "loss": 0.4186, "rewards/chosen": 0.04211263358592987, "rewards/margins": 0.7085688561201096, "rewards/rejected": -0.6664562225341797, "step": 560 }, { "epoch": 0.02973524501099833, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4356031.0, "logits/rejected": -32749846.0, "logps/chosen": -138.98252868652344, "logps/rejected": -501.9600830078125, "loss": 0.379, "rewards/chosen": 0.06097226217389107, "rewards/margins": 1.041471529752016, "rewards/rejected": -0.980499267578125, "step": 561 }, { "epoch": 0.029788249012800465, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26864317.333333332, "logits/rejected": -30939347.2, "logps/chosen": -191.53179931640625, "logps/rejected": -348.1050048828125, "loss": 0.4184, "rewards/chosen": 0.015769203503926594, "rewards/margins": 0.6101417620976767, "rewards/rejected": -0.59437255859375, "step": 562 }, { "epoch": 0.029841253014602602, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5295692.5, "logits/rejected": -23617253.333333332, "logps/chosen": -86.78425598144531, "logps/rejected": -214.7738037109375, "loss": 0.4019, "rewards/chosen": 0.013232136145234108, "rewards/margins": 0.5551155253003041, "rewards/rejected": -0.54188338915507, "step": 563 }, { "epoch": 0.02989425701640474, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33445860.0, "logits/rejected": -13319069.0, "logps/chosen": -224.5033721923828, "logps/rejected": -479.6341857910156, "loss": 0.3901, "rewards/chosen": 0.03331485018134117, "rewards/margins": 1.1748169921338558, "rewards/rejected": -1.1415021419525146, "step": 564 }, { "epoch": 0.029947261018206876, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26384052.8, "logits/rejected": -45403408.0, "logps/chosen": -209.5862548828125, "logps/rejected": -480.1739501953125, "loss": 0.4006, "rewards/chosen": -0.04432327151298523, "rewards/margins": 1.5494734585285186, "rewards/rejected": -1.593796730041504, "step": 565 }, { "epoch": 0.03000026502000901, "grad_norm": 45.0, "kl": 0.09154510498046875, "learning_rate": 5e-07, "logits/chosen": -26961830.0, "logits/rejected": -3396384.5, "logps/chosen": -206.16680908203125, "logps/rejected": -159.79551696777344, "loss": 0.4406, "rewards/chosen": 0.029206138104200363, "rewards/margins": 0.5780344419181347, "rewards/rejected": -0.5488283038139343, "step": 566 }, { "epoch": 0.030053269021811147, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 30063970.666666668, "logits/rejected": 5311596.5, "logps/chosen": -539.6424967447916, "logps/rejected": -125.45450592041016, "loss": 0.4958, "rewards/chosen": -0.13584924737612405, "rewards/margins": 0.3284795184930166, "rewards/rejected": -0.4643287658691406, "step": 567 }, { "epoch": 0.030106273023613284, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16166659.2, "logits/rejected": -31851256.0, "logps/chosen": -238.25615234375, "logps/rejected": -324.84474690755206, "loss": 0.4433, "rewards/chosen": 0.012475892901420593, "rewards/margins": 0.6857168724139532, "rewards/rejected": -0.6732409795125326, "step": 568 }, { "epoch": 0.030159277025415418, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44360245.333333336, "logits/rejected": -7961452.0, "logps/chosen": -231.55562337239584, "logps/rejected": -104.39654541015625, "loss": 0.4321, "rewards/chosen": 0.016622791687647503, "rewards/margins": 0.4529668430487315, "rewards/rejected": -0.436344051361084, "step": 569 }, { "epoch": 0.030212281027217555, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49135148.8, "logits/rejected": -15376077.333333334, "logps/chosen": -170.11153564453124, "logps/rejected": -220.3667195638021, "loss": 0.4316, "rewards/chosen": 0.003664398193359375, "rewards/margins": 0.76578369140625, "rewards/rejected": -0.7621192932128906, "step": 570 }, { "epoch": 0.030265285029019692, "grad_norm": 62.5, "kl": 0.5825576782226562, "learning_rate": 5e-07, "logits/chosen": -28106954.0, "logits/rejected": -13062086.0, "logps/chosen": -349.38555908203125, "logps/rejected": -233.5552978515625, "loss": 0.4007, "rewards/chosen": 0.01856536976993084, "rewards/margins": 0.9855277072638273, "rewards/rejected": -0.9669623374938965, "step": 571 }, { "epoch": 0.030318289030821826, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17189550.666666668, "logits/rejected": -40248630.4, "logps/chosen": -233.41680908203125, "logps/rejected": -197.908642578125, "loss": 0.4249, "rewards/chosen": 0.0030850724627574286, "rewards/margins": 0.5139979676653942, "rewards/rejected": -0.5109128952026367, "step": 572 }, { "epoch": 0.030371293032623963, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21563272.0, "logits/rejected": -11403731.2, "logps/chosen": -622.9627278645834, "logps/rejected": -150.7548583984375, "loss": 0.4278, "rewards/chosen": -0.056059141953786217, "rewards/margins": 0.47988279263178507, "rewards/rejected": -0.5359419345855713, "step": 573 }, { "epoch": 0.0304242970344261, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9599754.666666666, "logits/rejected": -16420676.0, "logps/chosen": -102.05415852864583, "logps/rejected": -249.1493682861328, "loss": 0.4312, "rewards/chosen": 0.13906302054723105, "rewards/margins": 0.903857966264089, "rewards/rejected": -0.7647949457168579, "step": 574 }, { "epoch": 0.030477301036228234, "grad_norm": 80.0, "kl": 0.13950347900390625, "learning_rate": 5e-07, "logits/chosen": -30681798.4, "logits/rejected": -25281440.0, "logps/chosen": -447.01640625, "logps/rejected": -197.74495442708334, "loss": 0.4318, "rewards/chosen": 0.177489173412323, "rewards/margins": 0.6798957705497741, "rewards/rejected": -0.5024065971374512, "step": 575 }, { "epoch": 0.03053030503803037, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21536470.666666668, "logits/rejected": 7089828.0, "logps/chosen": -56.94056193033854, "logps/rejected": -363.48638916015625, "loss": 0.4662, "rewards/chosen": -0.02031075209379196, "rewards/margins": 0.600929506123066, "rewards/rejected": -0.6212402582168579, "step": 576 }, { "epoch": 0.03058330903983251, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33356922.666666668, "logits/rejected": -32728275.2, "logps/chosen": -188.14424641927084, "logps/rejected": -302.757861328125, "loss": 0.3758, "rewards/chosen": 0.05620149274667104, "rewards/margins": 0.8963523258765539, "rewards/rejected": -0.8401508331298828, "step": 577 }, { "epoch": 0.030636313041634642, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90258090.66666667, "logits/rejected": -21371585.6, "logps/chosen": -376.190185546875, "logps/rejected": -360.4702880859375, "loss": 0.3884, "rewards/chosen": -0.06736501057942708, "rewards/margins": 0.838049570719401, "rewards/rejected": -0.9054145812988281, "step": 578 }, { "epoch": 0.03068931704343678, "grad_norm": 92.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5072565.0, "logits/rejected": -2280697.8571428573, "logps/chosen": -55.48768997192383, "logps/rejected": -369.00687081473217, "loss": 0.3518, "rewards/chosen": -0.03625946119427681, "rewards/margins": 0.6813788270311696, "rewards/rejected": -0.7176382882254464, "step": 579 }, { "epoch": 0.030742321045238916, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -104120224.0, "logits/rejected": -20088585.333333332, "logps/chosen": -349.28912353515625, "logps/rejected": -211.13191731770834, "loss": 0.379, "rewards/chosen": -0.06534881889820099, "rewards/margins": 0.6661735822757086, "rewards/rejected": -0.7315224011739095, "step": 580 }, { "epoch": 0.03079532504704105, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6349958.0, "logits/rejected": -46192005.333333336, "logps/chosen": -99.0500717163086, "logps/rejected": -420.431884765625, "loss": 0.3416, "rewards/chosen": 0.1635454148054123, "rewards/margins": 1.018165792028109, "rewards/rejected": -0.8546203772226969, "step": 581 }, { "epoch": 0.030848329048843187, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51527436.8, "logits/rejected": -32414912.0, "logps/chosen": -447.44453125, "logps/rejected": -388.1177571614583, "loss": 0.3917, "rewards/chosen": 0.15311390161514282, "rewards/margins": 1.2359386086463928, "rewards/rejected": -1.08282470703125, "step": 582 }, { "epoch": 0.030901333050645324, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39786356.0, "logits/rejected": -6551041.5, "logps/chosen": -252.77276611328125, "logps/rejected": -235.88113403320312, "loss": 0.4145, "rewards/chosen": 0.07119408249855042, "rewards/margins": 0.7799305021762848, "rewards/rejected": -0.7087364196777344, "step": 583 }, { "epoch": 0.03095433705244746, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20712860.8, "logits/rejected": -25277418.666666668, "logps/chosen": -150.579736328125, "logps/rejected": -554.0349527994791, "loss": 0.3877, "rewards/chosen": -0.10142666101455688, "rewards/margins": 1.6358430584271748, "rewards/rejected": -1.7372697194417317, "step": 584 }, { "epoch": 0.031007341054249595, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4168303.0, "logits/rejected": -8935384.0, "logps/chosen": -163.84989420572916, "logps/rejected": -348.126953125, "loss": 0.4346, "rewards/chosen": 0.040519148111343384, "rewards/margins": 1.0459512770175934, "rewards/rejected": -1.00543212890625, "step": 585 }, { "epoch": 0.031060345056051732, "grad_norm": 66.5, "kl": 0.01111602783203125, "learning_rate": 5e-07, "logits/chosen": -36678950.4, "logits/rejected": -21158376.0, "logps/chosen": -392.871435546875, "logps/rejected": -291.1750895182292, "loss": 0.3905, "rewards/chosen": 0.2679141521453857, "rewards/margins": 1.0914321104685465, "rewards/rejected": -0.8235179583231608, "step": 586 }, { "epoch": 0.03111334905785387, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3464989.6666666665, "logits/rejected": -18700084.8, "logps/chosen": -71.06541442871094, "logps/rejected": -290.3623046875, "loss": 0.4107, "rewards/chosen": -0.19317328929901123, "rewards/margins": 0.644259762763977, "rewards/rejected": -0.8374330520629882, "step": 587 }, { "epoch": 0.031166353059656003, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2851435.0, "logits/rejected": -17311748.0, "logps/chosen": -112.35130310058594, "logps/rejected": -89.56629943847656, "loss": 0.4563, "rewards/chosen": -0.017580842599272728, "rewards/margins": 0.35792642273008823, "rewards/rejected": -0.37550726532936096, "step": 588 }, { "epoch": 0.03121935706145814, "grad_norm": 70.5, "kl": 0.11282730102539062, "learning_rate": 5e-07, "logits/chosen": 45865424.0, "logits/rejected": -48720792.0, "logps/chosen": -298.1995442708333, "logps/rejected": -205.7908935546875, "loss": 0.4274, "rewards/chosen": 0.13810483614603677, "rewards/margins": 0.9785547653834025, "rewards/rejected": -0.8404499292373657, "step": 589 }, { "epoch": 0.03127236106326028, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37544362.666666664, "logits/rejected": -23899059.2, "logps/chosen": -132.32450358072916, "logps/rejected": -343.96650390625, "loss": 0.385, "rewards/chosen": -0.18231958150863647, "rewards/margins": 0.7370130181312561, "rewards/rejected": -0.9193325996398926, "step": 590 }, { "epoch": 0.031325365065062415, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21954821.333333332, "logits/rejected": -28352592.0, "logps/chosen": -98.72456868489583, "logps/rejected": -121.43154296875, "loss": 0.435, "rewards/chosen": -0.02598412831624349, "rewards/margins": 0.4259481112162272, "rewards/rejected": -0.4519322395324707, "step": 591 }, { "epoch": 0.031378369066864545, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23701560.0, "logits/rejected": -2632782.6, "logps/chosen": -199.77396647135416, "logps/rejected": -357.0741455078125, "loss": 0.39, "rewards/chosen": -0.0009923279285430908, "rewards/margins": 0.8351362884044647, "rewards/rejected": -0.8361286163330078, "step": 592 }, { "epoch": 0.03143137306866668, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40391045.333333336, "logits/rejected": -16093094.4, "logps/chosen": -254.05108642578125, "logps/rejected": -631.4546875, "loss": 0.3241, "rewards/chosen": 0.0053499191999435425, "rewards/margins": 1.5980501145124435, "rewards/rejected": -1.5927001953125, "step": 593 }, { "epoch": 0.03148437707046882, "grad_norm": 73.5, "kl": 0.06058692932128906, "learning_rate": 5e-07, "logits/chosen": -14692003.2, "logits/rejected": -3783009.0, "logps/chosen": -388.164794921875, "logps/rejected": -240.99430338541666, "loss": 0.3626, "rewards/chosen": 0.3507408142089844, "rewards/margins": 1.3294039408365885, "rewards/rejected": -0.9786631266276041, "step": 594 }, { "epoch": 0.031537381072270956, "grad_norm": 77.5, "kl": 0.14307117462158203, "learning_rate": 5e-07, "logits/chosen": -1099933.5714285714, "logits/rejected": -1694694.875, "logps/chosen": -356.62949916294644, "logps/rejected": -97.79824829101562, "loss": 0.4875, "rewards/chosen": -0.0314491902078901, "rewards/margins": 0.7390998091016497, "rewards/rejected": -0.7705489993095398, "step": 595 }, { "epoch": 0.031590385074073093, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20524676.0, "logits/rejected": -25507292.0, "logps/chosen": -434.70599365234375, "logps/rejected": -313.76898193359375, "loss": 0.3796, "rewards/chosen": 0.12445011734962463, "rewards/margins": 1.0685561001300812, "rewards/rejected": -0.9441059827804565, "step": 596 }, { "epoch": 0.03164338907587523, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1583310.25, "logits/rejected": -13416326.666666666, "logps/chosen": -909.1705322265625, "logps/rejected": -235.41654459635416, "loss": 0.3734, "rewards/chosen": 0.19409066438674927, "rewards/margins": 0.8316023945808411, "rewards/rejected": -0.6375117301940918, "step": 597 }, { "epoch": 0.03169639307767737, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -105156864.0, "logits/rejected": -35244416.0, "logps/chosen": -202.41668701171875, "logps/rejected": -246.4278564453125, "loss": 0.3593, "rewards/chosen": 0.2409515380859375, "rewards/margins": 1.0182620684305825, "rewards/rejected": -0.7773105303446451, "step": 598 }, { "epoch": 0.0317493970794795, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16146980.0, "logps/chosen": -156.5074462890625, "loss": 0.4653, "rewards/chosen": 0.139863520860672, "step": 599 }, { "epoch": 0.031802401081281635, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33877552.0, "logits/rejected": -32808938.0, "logps/chosen": -241.50474548339844, "logps/rejected": -283.73272705078125, "loss": 0.3982, "rewards/chosen": 0.15216989815235138, "rewards/margins": 0.9139348417520523, "rewards/rejected": -0.7617649435997009, "step": 600 }, { "epoch": 0.03185540508308377, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44043904.0, "logits/rejected": -18144853.333333332, "logps/chosen": -302.4931640625, "logps/rejected": -142.88433837890625, "loss": 0.4548, "rewards/chosen": 0.012413783371448517, "rewards/margins": 0.4975131784876187, "rewards/rejected": -0.48509939511617023, "step": 601 }, { "epoch": 0.03190840908488591, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48165589.333333336, "logits/rejected": -17384643.2, "logps/chosen": -494.9535725911458, "logps/rejected": -122.3329345703125, "loss": 0.4187, "rewards/chosen": -0.03857981661955515, "rewards/margins": 0.5363579084475836, "rewards/rejected": -0.5749377250671387, "step": 602 }, { "epoch": 0.03196141308668805, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12291118.666666666, "logits/rejected": -805000.0, "logps/chosen": -56.83216857910156, "logps/rejected": -207.257373046875, "loss": 0.4192, "rewards/chosen": -0.1836301883061727, "rewards/margins": 0.5408159176508586, "rewards/rejected": -0.7244461059570313, "step": 603 }, { "epoch": 0.032014417088490184, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 85400954.66666667, "logits/rejected": -50572952.0, "logps/chosen": -309.76043701171875, "logps/rejected": -364.8819274902344, "loss": 0.4583, "rewards/chosen": -0.02255725860595703, "rewards/margins": 0.7877897620201111, "rewards/rejected": -0.8103470206260681, "step": 604 }, { "epoch": 0.032067421090292314, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26460610.0, "logits/rejected": -18316914.666666668, "logps/chosen": -136.84230041503906, "logps/rejected": -236.46822102864584, "loss": 0.3593, "rewards/chosen": 0.10502509772777557, "rewards/margins": 0.8750054488579432, "rewards/rejected": -0.7699803511301676, "step": 605 }, { "epoch": 0.03212042509209445, "grad_norm": 74.5, "kl": 0.013919830322265625, "learning_rate": 5e-07, "logits/chosen": -10943871.0, "logits/rejected": -38037248.0, "logps/chosen": -367.7221374511719, "logps/rejected": -337.6059875488281, "loss": 0.3663, "rewards/chosen": 0.19515685737133026, "rewards/margins": 1.1883144825696945, "rewards/rejected": -0.9931576251983643, "step": 606 }, { "epoch": 0.03217342909389659, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43666598.4, "logits/rejected": -43643085.333333336, "logps/chosen": -288.354736328125, "logps/rejected": -676.0337320963541, "loss": 0.3828, "rewards/chosen": -0.08085387945175171, "rewards/margins": 1.728672722975413, "rewards/rejected": -1.8095266024271648, "step": 607 }, { "epoch": 0.032226433095698725, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18540652.8, "logits/rejected": 1014114.0, "logps/chosen": -164.9349609375, "logps/rejected": -44.35332743326823, "loss": 0.494, "rewards/chosen": -0.11205060482025146, "rewards/margins": 0.14095664819081621, "rewards/rejected": -0.2530072530110677, "step": 608 }, { "epoch": 0.03227943709750086, "grad_norm": 53.0, "kl": 0.06277847290039062, "learning_rate": 5e-07, "logits/chosen": -22739166.4, "logits/rejected": -24550922.666666668, "logps/chosen": -235.7229736328125, "logps/rejected": -109.19047037760417, "loss": 0.4184, "rewards/chosen": 0.2766761779785156, "rewards/margins": 0.7252941131591797, "rewards/rejected": -0.44861793518066406, "step": 609 }, { "epoch": 0.032332441099303, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17201318.666666668, "logits/rejected": -72938728.0, "logps/chosen": -164.68647257486978, "logps/rejected": -473.4793395996094, "loss": 0.4182, "rewards/chosen": 0.06636165579160054, "rewards/margins": 1.3861515422662098, "rewards/rejected": -1.3197898864746094, "step": 610 }, { "epoch": 0.03238544510110513, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11544070.0, "logits/rejected": -29367110.0, "logps/chosen": -319.86334228515625, "logps/rejected": -406.4745178222656, "loss": 0.3644, "rewards/chosen": 0.1796410232782364, "rewards/margins": 1.2209358364343643, "rewards/rejected": -1.041294813156128, "step": 611 }, { "epoch": 0.03243844910290727, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13761507.2, "logits/rejected": -12839930.666666666, "logps/chosen": -161.94053955078124, "logps/rejected": -257.2406005859375, "loss": 0.4568, "rewards/chosen": -0.1229780673980713, "rewards/margins": 0.5732499599456787, "rewards/rejected": -0.69622802734375, "step": 612 }, { "epoch": 0.032491453104709404, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31235834.0, "logits/rejected": -17335498.0, "logps/chosen": -193.40667724609375, "logps/rejected": -297.6540832519531, "loss": 0.4152, "rewards/chosen": -0.054676540195941925, "rewards/margins": 0.8587004467844963, "rewards/rejected": -0.9133769869804382, "step": 613 }, { "epoch": 0.03254445710651154, "grad_norm": 111.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59676947.2, "logits/rejected": -13508720.0, "logps/chosen": -595.68896484375, "logps/rejected": -233.8641153971354, "loss": 0.4566, "rewards/chosen": -0.14534057378768922, "rewards/margins": 0.6414583007494608, "rewards/rejected": -0.78679887453715, "step": 614 }, { "epoch": 0.03259746110831368, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78297888.0, "logits/rejected": -20717040.0, "logps/chosen": -486.5995788574219, "logps/rejected": -342.00125558035717, "loss": 0.3556, "rewards/chosen": 0.014248657040297985, "rewards/margins": 0.7450044630095363, "rewards/rejected": -0.7307558059692383, "step": 615 }, { "epoch": 0.032650465110115816, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -418420.3125, "logits/rejected": -2623574.25, "logps/chosen": -241.42005920410156, "logps/rejected": -107.15083312988281, "loss": 0.4579, "rewards/chosen": 0.1849079132080078, "rewards/margins": 0.3393250107765198, "rewards/rejected": -0.15441709756851196, "step": 616 }, { "epoch": 0.03270346911191795, "grad_norm": 76.5, "kl": 0.7714004516601562, "learning_rate": 5e-07, "logits/chosen": -28378988.8, "logits/rejected": -46235989.333333336, "logps/chosen": -372.3365234375, "logps/rejected": -430.0008951822917, "loss": 0.3826, "rewards/chosen": 0.34618301391601564, "rewards/margins": 1.3432940165201823, "rewards/rejected": -0.9971110026041666, "step": 617 }, { "epoch": 0.03275647311372008, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45188708.571428575, "logits/rejected": -2117646.5, "logps/chosen": -454.59068080357144, "logps/rejected": -42.793434143066406, "loss": 0.495, "rewards/chosen": -0.012136378458568029, "rewards/margins": 0.23424882335322245, "rewards/rejected": -0.24638520181179047, "step": 618 }, { "epoch": 0.03280947711552222, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9143400.0, "logits/rejected": -25245344.0, "logps/chosen": -194.77701416015626, "logps/rejected": -173.1321818033854, "loss": 0.427, "rewards/chosen": 0.08599998950958251, "rewards/margins": 0.7468796014785767, "rewards/rejected": -0.6608796119689941, "step": 619 }, { "epoch": 0.03286248111732436, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11692448.0, "logits/rejected": -5038158.8, "logps/chosen": -250.11543782552084, "logps/rejected": -443.39677734375, "loss": 0.3317, "rewards/chosen": 0.1667874058087667, "rewards/margins": 1.3273439129193623, "rewards/rejected": -1.1605565071105957, "step": 620 }, { "epoch": 0.032915485119126495, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18844864.0, "logits/rejected": -44048296.0, "logps/chosen": -221.1116485595703, "logps/rejected": -357.6378173828125, "loss": 0.4111, "rewards/chosen": -0.1958560049533844, "rewards/margins": 0.9192417562007904, "rewards/rejected": -1.1150977611541748, "step": 621 }, { "epoch": 0.03296848912092863, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5554030.0, "logits/rejected": -12693762.666666666, "logps/chosen": -389.9486572265625, "logps/rejected": -72.64151000976562, "loss": 0.448, "rewards/chosen": 0.15585533380508423, "rewards/margins": 0.4548224568367004, "rewards/rejected": -0.2989671230316162, "step": 622 }, { "epoch": 0.03302149312273077, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31850400.0, "logits/rejected": 957250.0833333334, "logps/chosen": -261.670068359375, "logps/rejected": -316.9216715494792, "loss": 0.4169, "rewards/chosen": 0.10692307949066163, "rewards/margins": 0.9334418535232544, "rewards/rejected": -0.8265187740325928, "step": 623 }, { "epoch": 0.0330744971245329, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17662838.0, "logits/rejected": -34250200.0, "logps/chosen": -135.53273010253906, "logps/rejected": -255.07069396972656, "loss": 0.4053, "rewards/chosen": 0.07193098217248917, "rewards/margins": 0.8188394084572792, "rewards/rejected": -0.74690842628479, "step": 624 }, { "epoch": 0.033127501126335036, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13874699.0, "logits/rejected": -30933628.0, "logps/chosen": -161.74095153808594, "logps/rejected": -440.8947448730469, "loss": 0.427, "rewards/chosen": -0.23115701973438263, "rewards/margins": 0.6713474243879318, "rewards/rejected": -0.9025044441223145, "step": 625 }, { "epoch": 0.033180505128137174, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76368666.66666667, "logits/rejected": -34020000.0, "logps/chosen": -365.9628499348958, "logps/rejected": -286.261962890625, "loss": 0.419, "rewards/chosen": -0.20446829001108804, "rewards/margins": 0.5031902869542441, "rewards/rejected": -0.7076585769653321, "step": 626 }, { "epoch": 0.03323350912993931, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11837715.0, "logits/rejected": -23626360.0, "logps/chosen": -207.53872680664062, "logps/rejected": -401.3145751953125, "loss": 0.3923, "rewards/chosen": -0.10524959117174149, "rewards/margins": 1.0798974558711052, "rewards/rejected": -1.1851470470428467, "step": 627 }, { "epoch": 0.03328651313174145, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40302912.0, "logits/rejected": -12574880.0, "logps/chosen": -547.6168212890625, "logps/rejected": -394.7420247395833, "loss": 0.3466, "rewards/chosen": -0.304647833108902, "rewards/margins": 0.865266094605128, "rewards/rejected": -1.16991392771403, "step": 628 }, { "epoch": 0.033339517133543585, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23377618.666666668, "logits/rejected": -3816904.8, "logps/chosen": -196.9312744140625, "logps/rejected": -138.47310791015624, "loss": 0.426, "rewards/chosen": -0.07285181681315105, "rewards/margins": 0.47834475835164386, "rewards/rejected": -0.5511965751647949, "step": 629 }, { "epoch": 0.033392521135345715, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14007554.666666666, "logits/rejected": -21437196.8, "logps/chosen": -363.3710123697917, "logps/rejected": -241.5731689453125, "loss": 0.447, "rewards/chosen": 0.09424336751302083, "rewards/margins": 0.3840984185536702, "rewards/rejected": -0.2898550510406494, "step": 630 }, { "epoch": 0.03344552513714785, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17953708.8, "logits/rejected": 50729685.333333336, "logps/chosen": -191.8569091796875, "logps/rejected": -99.80641682942708, "loss": 0.4935, "rewards/chosen": -0.11404914855957031, "rewards/margins": 0.1466030756632487, "rewards/rejected": -0.26065222422281903, "step": 631 }, { "epoch": 0.03349852913894999, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17457000.0, "logits/rejected": -60931000.0, "logps/chosen": -179.46475219726562, "logps/rejected": -209.93032836914062, "loss": 0.4523, "rewards/chosen": -0.24827560782432556, "rewards/margins": 0.3963601887226105, "rewards/rejected": -0.644635796546936, "step": 632 }, { "epoch": 0.03355153314075213, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24296096.0, "logps/chosen": -473.578857421875, "loss": 0.4642, "rewards/chosen": 0.14532776176929474, "step": 633 }, { "epoch": 0.033604537142554264, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7119252.0, "logits/rejected": -27723602.0, "logps/chosen": -351.8575439453125, "logps/rejected": -220.8289337158203, "loss": 0.4552, "rewards/chosen": -0.003981398418545723, "rewards/margins": 0.37744267098605633, "rewards/rejected": -0.38142406940460205, "step": 634 }, { "epoch": 0.0336575411443564, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39132768.0, "logits/rejected": -32486602.0, "logps/chosen": -398.0277404785156, "logps/rejected": -529.5701904296875, "loss": 0.4025, "rewards/chosen": -0.17955532670021057, "rewards/margins": 1.0400818288326263, "rewards/rejected": -1.219637155532837, "step": 635 }, { "epoch": 0.03371054514615854, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21607485.333333332, "logits/rejected": -18895795.2, "logps/chosen": -273.62042236328125, "logps/rejected": -156.1264892578125, "loss": 0.4171, "rewards/chosen": -0.028805668155352276, "rewards/margins": 0.5631322552760443, "rewards/rejected": -0.5919379234313965, "step": 636 }, { "epoch": 0.03376354914796067, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15303556.8, "logits/rejected": -31308992.0, "logps/chosen": -312.4174560546875, "logps/rejected": -346.1139322916667, "loss": 0.4159, "rewards/chosen": -0.04701938629150391, "rewards/margins": 1.0420896371205648, "rewards/rejected": -1.0891090234120686, "step": 637 }, { "epoch": 0.033816553149762806, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63124016.0, "logits/rejected": -23822666.0, "logps/chosen": -269.4638366699219, "logps/rejected": -275.21563720703125, "loss": 0.4001, "rewards/chosen": 0.02100391313433647, "rewards/margins": 0.8889653198421001, "rewards/rejected": -0.8679614067077637, "step": 638 }, { "epoch": 0.03386955715156494, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9980917.333333334, "logits/rejected": -64308972.8, "logps/chosen": -183.54473876953125, "logps/rejected": -397.8901611328125, "loss": 0.3285, "rewards/chosen": 0.20682132244110107, "rewards/margins": 1.4016873121261597, "rewards/rejected": -1.1948659896850586, "step": 639 }, { "epoch": 0.03392256115336708, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15471241.6, "logits/rejected": -2767296.0, "logps/chosen": -278.8256103515625, "logps/rejected": -135.99809773763022, "loss": 0.441, "rewards/chosen": 0.07765579223632812, "rewards/margins": 0.6316356261571249, "rewards/rejected": -0.5539798339207967, "step": 640 }, { "epoch": 0.03397556515516922, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51929324.0, "logits/rejected": -17198828.0, "logps/chosen": -363.0469665527344, "logps/rejected": -348.206298828125, "loss": 0.3272, "rewards/chosen": 0.04209290072321892, "rewards/margins": 1.0490687899291515, "rewards/rejected": -1.0069758892059326, "step": 641 }, { "epoch": 0.034028569156971354, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12762581.333333334, "logits/rejected": -18242492.8, "logps/chosen": -164.136474609375, "logps/rejected": -193.4324462890625, "loss": 0.3802, "rewards/chosen": 0.0270846684773763, "rewards/margins": 0.8786977132161459, "rewards/rejected": -0.8516130447387695, "step": 642 }, { "epoch": 0.034081573158773484, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21643920.0, "logits/rejected": -48658952.0, "logps/chosen": -331.884033203125, "logps/rejected": -381.3204040527344, "loss": 0.4221, "rewards/chosen": -0.026938628405332565, "rewards/margins": 0.7493809349834919, "rewards/rejected": -0.7763195633888245, "step": 643 }, { "epoch": 0.03413457716057562, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 30239690.666666668, "logits/rejected": -44013049.6, "logps/chosen": -447.46240234375, "logps/rejected": -474.858935546875, "loss": 0.3141, "rewards/chosen": 0.06660766402880351, "rewards/margins": 1.6180101374785105, "rewards/rejected": -1.551402473449707, "step": 644 }, { "epoch": 0.03418758116237776, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3204370.6666666665, "logits/rejected": 889731.0, "logps/chosen": -225.69193522135416, "logps/rejected": -42.500980377197266, "loss": 0.5057, "rewards/chosen": -0.07875888546307881, "rewards/margins": 0.06762115160624187, "rewards/rejected": -0.14638003706932068, "step": 645 }, { "epoch": 0.034240585164179896, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7107100.0, "logits/rejected": 2533118.25, "logps/chosen": -158.48617553710938, "logps/rejected": -316.74334716796875, "loss": 0.4221, "rewards/chosen": -0.1765357404947281, "rewards/margins": 0.7942993491888046, "rewards/rejected": -0.9708350896835327, "step": 646 }, { "epoch": 0.03429358916598203, "grad_norm": 88.5, "kl": 0.2033843994140625, "learning_rate": 5e-07, "logits/chosen": -55062732.8, "logits/rejected": -11445970.666666666, "logps/chosen": -370.278125, "logps/rejected": -221.0246785481771, "loss": 0.4638, "rewards/chosen": -0.004641002416610718, "rewards/margins": 0.40564904014269515, "rewards/rejected": -0.41029004255930585, "step": 647 }, { "epoch": 0.03434659316778417, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22926693.333333332, "logits/rejected": 98754336.0, "logps/chosen": -323.55322265625, "logps/rejected": -239.1763671875, "loss": 0.3886, "rewards/chosen": 0.24890939394632974, "rewards/margins": 0.8338503042856852, "rewards/rejected": -0.5849409103393555, "step": 648 }, { "epoch": 0.0343995971695863, "grad_norm": 52.75, "kl": 0.03039264678955078, "learning_rate": 5e-07, "logits/chosen": -35932730.666666664, "logits/rejected": -18593382.4, "logps/chosen": -432.0434163411458, "logps/rejected": -271.75166015625, "loss": 0.4048, "rewards/chosen": 0.012450598180294037, "rewards/margins": 0.7142543777823448, "rewards/rejected": -0.7018037796020508, "step": 649 }, { "epoch": 0.03445260117138844, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36969936.0, "logits/rejected": -7229206.0, "logps/chosen": -469.4502258300781, "logps/rejected": -326.2466735839844, "loss": 0.4061, "rewards/chosen": 0.17990227043628693, "rewards/margins": 0.7890575677156448, "rewards/rejected": -0.6091552972793579, "step": 650 }, { "epoch": 0.034505605173190575, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7515311.2, "logits/rejected": -16240802.666666666, "logps/chosen": -225.7494140625, "logps/rejected": -368.4275716145833, "loss": 0.4054, "rewards/chosen": -0.0333199679851532, "rewards/margins": 1.3133503417174022, "rewards/rejected": -1.3466703097025554, "step": 651 }, { "epoch": 0.03455860917499271, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3270363.0, "logits/rejected": -61414612.0, "logps/chosen": -760.11328125, "logps/rejected": -505.032470703125, "loss": 0.3189, "rewards/chosen": 0.13312950730323792, "rewards/margins": 1.7992701828479767, "rewards/rejected": -1.6661406755447388, "step": 652 }, { "epoch": 0.03461161317679485, "grad_norm": 54.75, "kl": 0.24458694458007812, "learning_rate": 5e-07, "logits/chosen": -25760348.8, "logits/rejected": 3125389.3333333335, "logps/chosen": -185.399853515625, "logps/rejected": -164.84957885742188, "loss": 0.4129, "rewards/chosen": 0.2911952257156372, "rewards/margins": 0.8317599614461264, "rewards/rejected": -0.5405647357304891, "step": 653 }, { "epoch": 0.034664617178596986, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32099860.0, "logits/rejected": -23068312.0, "logps/chosen": -293.9619140625, "logps/rejected": -367.7410074869792, "loss": 0.3662, "rewards/chosen": -0.01949005201458931, "rewards/margins": 0.8062734119594097, "rewards/rejected": -0.825763463973999, "step": 654 }, { "epoch": 0.03471762118039912, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9028975.0, "logits/rejected": -17007680.0, "logps/chosen": -59.09408950805664, "logps/rejected": -472.50230189732144, "loss": 0.3012, "rewards/chosen": -0.021718217059969902, "rewards/margins": 1.3643110264092684, "rewards/rejected": -1.3860292434692383, "step": 655 }, { "epoch": 0.034770625182201254, "grad_norm": 86.5, "kl": 0.4919548034667969, "learning_rate": 5e-07, "logits/chosen": -16613805.333333334, "logits/rejected": -21500018.0, "logps/chosen": -537.8485514322916, "logps/rejected": -363.5693664550781, "loss": 0.4419, "rewards/chosen": 0.22534420092900595, "rewards/margins": 0.7028191884358724, "rewards/rejected": -0.47747498750686646, "step": 656 }, { "epoch": 0.03482362918400339, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59433446.4, "logits/rejected": -23104050.666666668, "logps/chosen": -354.38310546875, "logps/rejected": -325.7141927083333, "loss": 0.4588, "rewards/chosen": -0.08037460446357728, "rewards/margins": 0.5416951914628347, "rewards/rejected": -0.622069795926412, "step": 657 }, { "epoch": 0.03487663318580553, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -232816912.0, "logits/rejected": -39185572.571428575, "logps/chosen": -319.1783752441406, "logps/rejected": -416.20521763392856, "loss": 0.34, "rewards/chosen": -0.26638489961624146, "rewards/margins": 0.5724542226110186, "rewards/rejected": -0.8388391222272601, "step": 658 }, { "epoch": 0.034929637187607665, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29255794.666666668, "logits/rejected": 14112754.0, "logps/chosen": -192.1054890950521, "logps/rejected": -310.5571594238281, "loss": 0.4695, "rewards/chosen": 0.037786200642585754, "rewards/margins": 0.4398607462644577, "rewards/rejected": -0.40207454562187195, "step": 659 }, { "epoch": 0.0349826411894098, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27766712.0, "logits/rejected": -7984413.6, "logps/chosen": -143.2481892903646, "logps/rejected": -161.170849609375, "loss": 0.4095, "rewards/chosen": -0.15062389771143594, "rewards/margins": 0.5716779192288717, "rewards/rejected": -0.7223018169403076, "step": 660 }, { "epoch": 0.03503564519121194, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3414986.6666666665, "logits/rejected": -24747827.2, "logps/chosen": -267.7166341145833, "logps/rejected": -351.1904296875, "loss": 0.363, "rewards/chosen": 0.08815561731656392, "rewards/margins": 1.0140497823556265, "rewards/rejected": -0.9258941650390625, "step": 661 }, { "epoch": 0.03508864919301407, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10649129.6, "logits/rejected": 8267314.0, "logps/chosen": -233.0211181640625, "logps/rejected": -60.394063313802086, "loss": 0.4788, "rewards/chosen": -0.05323379635810852, "rewards/margins": 0.2656423509120941, "rewards/rejected": -0.31887614727020264, "step": 662 }, { "epoch": 0.03514165319481621, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55540678.4, "logits/rejected": -87859296.0, "logps/chosen": -347.0560791015625, "logps/rejected": -337.60927327473956, "loss": 0.4151, "rewards/chosen": 0.05437072515487671, "rewards/margins": 0.9340581933657328, "rewards/rejected": -0.8796874682108561, "step": 663 }, { "epoch": 0.035194657196618344, "grad_norm": 67.0, "kl": 0.01911163330078125, "learning_rate": 5e-07, "logits/chosen": -18547212.0, "logits/rejected": -44745964.0, "logps/chosen": -398.73870849609375, "logps/rejected": -345.3180236816406, "loss": 0.3887, "rewards/chosen": 0.16849365830421448, "rewards/margins": 1.07608363032341, "rewards/rejected": -0.9075899720191956, "step": 664 }, { "epoch": 0.03524766119842048, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -255994816.0, "logits/rejected": -20265220.57142857, "logps/chosen": -293.2882080078125, "logps/rejected": -267.99619838169644, "loss": 0.3783, "rewards/chosen": -0.08744201809167862, "rewards/margins": 0.6172806588666779, "rewards/rejected": -0.7047226769583566, "step": 665 }, { "epoch": 0.03530066520022262, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19231377.333333332, "logits/rejected": -19180787.2, "logps/chosen": -123.24192301432292, "logps/rejected": -526.88701171875, "loss": 0.3193, "rewards/chosen": -0.14589786529541016, "rewards/margins": 1.3406890869140624, "rewards/rejected": -1.4865869522094726, "step": 666 }, { "epoch": 0.035353669202024755, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16880981.333333332, "logits/rejected": -2209984.4, "logps/chosen": -291.9545084635417, "logps/rejected": -157.79991455078124, "loss": 0.4304, "rewards/chosen": 0.057461549838383995, "rewards/margins": 0.4953331967194875, "rewards/rejected": -0.4378716468811035, "step": 667 }, { "epoch": 0.035406673203826886, "grad_norm": 86.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25262338.285714287, "logits/rejected": -65865836.0, "logps/chosen": -406.05970982142856, "logps/rejected": -628.2703857421875, "loss": 0.46, "rewards/chosen": 0.015591757638113839, "rewards/margins": 1.3571323326655798, "rewards/rejected": -1.3415405750274658, "step": 668 }, { "epoch": 0.03545967720562902, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21590558.0, "logits/rejected": -22024652.0, "logps/chosen": -188.4250946044922, "logps/rejected": -319.4568176269531, "loss": 0.4058, "rewards/chosen": 0.10553435981273651, "rewards/margins": 0.8108486384153366, "rewards/rejected": -0.7053142786026001, "step": 669 }, { "epoch": 0.03551268120743116, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61130661.333333336, "logits/rejected": -17315145.6, "logps/chosen": -533.389892578125, "logps/rejected": -144.08543701171874, "loss": 0.3812, "rewards/chosen": 0.2638412316640218, "rewards/margins": 0.8927178700764973, "rewards/rejected": -0.6288766384124755, "step": 670 }, { "epoch": 0.0355656852092333, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53389094.4, "logits/rejected": -37711832.0, "logps/chosen": -503.214794921875, "logps/rejected": -412.5550130208333, "loss": 0.3818, "rewards/chosen": 0.23166451454162598, "rewards/margins": 1.2331405639648438, "rewards/rejected": -1.0014760494232178, "step": 671 }, { "epoch": 0.035618689211035434, "grad_norm": 69.5, "kl": 0.14501190185546875, "learning_rate": 5e-07, "logits/chosen": -38809624.0, "logits/rejected": -1866630.25, "logps/chosen": -390.63525390625, "logps/rejected": -294.9077453613281, "loss": 0.3951, "rewards/chosen": -0.016902877017855644, "rewards/margins": 0.9889023769646883, "rewards/rejected": -1.005805253982544, "step": 672 }, { "epoch": 0.03567169321283757, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18025629.333333332, "logits/rejected": -66399616.0, "logps/chosen": -227.9113972981771, "logps/rejected": -365.357470703125, "loss": 0.3239, "rewards/chosen": 0.10283050934473674, "rewards/margins": 1.3645538369814556, "rewards/rejected": -1.2617233276367188, "step": 673 }, { "epoch": 0.03572469721463971, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32966400.0, "logits/rejected": 6190867.0, "logps/chosen": -184.39020647321428, "logps/rejected": -211.79647827148438, "loss": 0.5127, "rewards/chosen": -0.1318299514906747, "rewards/margins": 0.37333669832774574, "rewards/rejected": -0.5051666498184204, "step": 674 }, { "epoch": 0.03577770121644184, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9138421.333333334, "logits/rejected": -57770992.0, "logps/chosen": -191.4817097981771, "logps/rejected": -419.337646484375, "loss": 0.4189, "rewards/chosen": 0.12581841150919595, "rewards/margins": 1.127431313196818, "rewards/rejected": -1.001612901687622, "step": 675 }, { "epoch": 0.035830705218243976, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17505344.0, "logits/rejected": -18134812.0, "logps/chosen": -385.9635009765625, "logps/rejected": -296.0735778808594, "loss": 0.4304, "rewards/chosen": -0.11080951988697052, "rewards/margins": 0.6824412792921066, "rewards/rejected": -0.7932507991790771, "step": 676 }, { "epoch": 0.03588370922004611, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57277386.666666664, "logits/rejected": -32182598.4, "logps/chosen": -97.34237670898438, "logps/rejected": -422.083935546875, "loss": 0.3541, "rewards/chosen": 0.1771265665690104, "rewards/margins": 1.1485901514689127, "rewards/rejected": -0.9714635848999024, "step": 677 }, { "epoch": 0.03593671322184825, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11066180.8, "logits/rejected": -39727482.666666664, "logps/chosen": -161.36654052734374, "logps/rejected": -326.69232177734375, "loss": 0.4231, "rewards/chosen": -0.07651207447052003, "rewards/margins": 1.0623622179031371, "rewards/rejected": -1.1388742923736572, "step": 678 }, { "epoch": 0.03598971722365039, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10221988.0, "logits/rejected": -2711943.0, "logps/chosen": -212.45712280273438, "logps/rejected": -210.3036651611328, "loss": 0.423, "rewards/chosen": 0.004283145070075989, "rewards/margins": 0.6779250651597977, "rewards/rejected": -0.6736419200897217, "step": 679 }, { "epoch": 0.036042721225452524, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7398726.0, "logits/rejected": -9204068.0, "logps/chosen": -174.70281982421875, "logps/rejected": -294.3570251464844, "loss": 0.4184, "rewards/chosen": -0.004016208462417126, "rewards/margins": 0.6909439684823155, "rewards/rejected": -0.6949601769447327, "step": 680 }, { "epoch": 0.036095725227254655, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30477829.333333332, "logits/rejected": -14119758.4, "logps/chosen": -149.66353352864584, "logps/rejected": -188.77939453125, "loss": 0.4015, "rewards/chosen": 0.01845703274011612, "rewards/margins": 0.7592937484383583, "rewards/rejected": -0.7408367156982422, "step": 681 }, { "epoch": 0.03614872922905679, "grad_norm": 55.75, "kl": 0.12373924255371094, "learning_rate": 5e-07, "logits/chosen": -50727628.8, "logits/rejected": -21619605.333333332, "logps/chosen": -292.52744140625, "logps/rejected": -523.2823079427084, "loss": 0.4336, "rewards/chosen": -0.09224761724472046, "rewards/margins": 1.0004993557929993, "rewards/rejected": -1.0927469730377197, "step": 682 }, { "epoch": 0.03620173323085893, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15528574.0, "logits/rejected": -49656980.0, "logps/chosen": -72.28872680664062, "logps/rejected": -240.12112426757812, "loss": 0.3569, "rewards/chosen": 0.028101257979869843, "rewards/margins": 1.390407495200634, "rewards/rejected": -1.3623062372207642, "step": 683 }, { "epoch": 0.036254737232661066, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22150320.0, "logits/rejected": -45888784.0, "logps/chosen": -208.11386108398438, "logps/rejected": -373.8597717285156, "loss": 0.3906, "rewards/chosen": -0.008417321369051933, "rewards/margins": 1.1155021656304598, "rewards/rejected": -1.1239194869995117, "step": 684 }, { "epoch": 0.0363077412344632, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18412129.333333332, "logits/rejected": -25495049.6, "logps/chosen": -110.67886352539062, "logps/rejected": -392.746142578125, "loss": 0.3416, "rewards/chosen": 0.09063504139582317, "rewards/margins": 1.296407441298167, "rewards/rejected": -1.2057723999023438, "step": 685 }, { "epoch": 0.03636074523626534, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39754474.666666664, "logits/rejected": 4242111.0, "logps/chosen": -225.58939615885416, "logps/rejected": -167.53524780273438, "loss": 0.4798, "rewards/chosen": 0.0016069461901982625, "rewards/margins": 0.324235921104749, "rewards/rejected": -0.3226289749145508, "step": 686 }, { "epoch": 0.03641374923806747, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37638880.0, "logits/rejected": -5610362.8, "logps/chosen": -248.1663614908854, "logps/rejected": -187.1699462890625, "loss": 0.3596, "rewards/chosen": -0.09800770878791809, "rewards/margins": 0.988810795545578, "rewards/rejected": -1.0868185043334961, "step": 687 }, { "epoch": 0.03646675323986961, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86686016.0, "logits/rejected": -26719344.0, "logps/chosen": -413.1573181152344, "logps/rejected": -344.0547688802083, "loss": 0.3867, "rewards/chosen": -0.0527038611471653, "rewards/margins": 0.6545880598326524, "rewards/rejected": -0.7072919209798177, "step": 688 }, { "epoch": 0.036519757241671745, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54137840.0, "logits/rejected": -88800656.0, "logps/chosen": -472.6872253417969, "logps/rejected": -260.7184753417969, "loss": 0.4438, "rewards/chosen": -0.07998428493738174, "rewards/margins": 0.47091641277074814, "rewards/rejected": -0.5509006977081299, "step": 689 }, { "epoch": 0.03657276124347388, "grad_norm": 55.5, "kl": 0.0969858169555664, "learning_rate": 5e-07, "logits/chosen": 5677908.0, "logits/rejected": -29354282.0, "logps/chosen": -256.71649169921875, "logps/rejected": -143.3745880126953, "loss": 0.4967, "rewards/chosen": -0.11438255508740743, "rewards/margins": 0.3232532540957133, "rewards/rejected": -0.4376358091831207, "step": 690 }, { "epoch": 0.03662576524527602, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14732926.0, "logits/rejected": -16596372.0, "logps/chosen": -265.59661865234375, "logps/rejected": -160.27872721354166, "loss": 0.4067, "rewards/chosen": -0.041725922375917435, "rewards/margins": 0.4947974197566509, "rewards/rejected": -0.5365233421325684, "step": 691 }, { "epoch": 0.036678769247078157, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44142186.666666664, "logits/rejected": -52605190.4, "logps/chosen": -393.4395345052083, "logps/rejected": -427.97578125, "loss": 0.3862, "rewards/chosen": 0.00524597242474556, "rewards/margins": 0.8098467834293842, "rewards/rejected": -0.8046008110046386, "step": 692 }, { "epoch": 0.036731773248880294, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35427572.0, "logits/rejected": -55133700.0, "logps/chosen": -417.97747802734375, "logps/rejected": -516.2739868164062, "loss": 0.3621, "rewards/chosen": -0.208445742726326, "rewards/margins": 1.369404599070549, "rewards/rejected": -1.577850341796875, "step": 693 }, { "epoch": 0.036784777250682424, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60855368.0, "logits/rejected": -10218364.0, "logps/chosen": -364.52777099609375, "logps/rejected": -109.86225891113281, "loss": 0.4665, "rewards/chosen": -0.1252037137746811, "rewards/margins": 0.28095240890979767, "rewards/rejected": -0.40615612268447876, "step": 694 }, { "epoch": 0.03683778125248456, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1554376.0, "logits/rejected": -24422248.0, "logps/chosen": -248.7494354248047, "logps/rejected": -399.8038635253906, "loss": 0.3662, "rewards/chosen": 0.16053663194179535, "rewards/margins": 1.2166260331869125, "rewards/rejected": -1.0560894012451172, "step": 695 }, { "epoch": 0.0368907852542867, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28659144.0, "logits/rejected": -20984186.0, "logps/chosen": -39.018795013427734, "logps/rejected": -293.52691650390625, "loss": 0.3805, "rewards/chosen": 0.030042747035622597, "rewards/margins": 1.1363378558307886, "rewards/rejected": -1.106295108795166, "step": 696 }, { "epoch": 0.036943789256088835, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25723186.666666668, "logits/rejected": -1756001.6, "logps/chosen": -195.80572509765625, "logps/rejected": -314.6142578125, "loss": 0.3955, "rewards/chosen": 0.0982205073038737, "rewards/margins": 0.7464964548746744, "rewards/rejected": -0.6482759475708008, "step": 697 }, { "epoch": 0.03699679325789097, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33375200.0, "logits/rejected": -19980776.0, "logps/chosen": -186.56036376953125, "logps/rejected": -222.58610026041666, "loss": 0.3546, "rewards/chosen": 0.11212673783302307, "rewards/margins": 1.0679619212945304, "rewards/rejected": -0.9558351834615072, "step": 698 }, { "epoch": 0.03704979725969311, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35202776.0, "logits/rejected": -10565265.6, "logps/chosen": -412.1610514322917, "logps/rejected": -258.57060546875, "loss": 0.3996, "rewards/chosen": 0.05807444453239441, "rewards/margins": 0.731516855955124, "rewards/rejected": -0.6734424114227295, "step": 699 }, { "epoch": 0.03710280126149524, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22757248.0, "logits/rejected": -58624732.0, "logps/chosen": -286.8244934082031, "logps/rejected": -377.93988037109375, "loss": 0.3828, "rewards/chosen": 0.12474747002124786, "rewards/margins": 1.0504253655672073, "rewards/rejected": -0.9256778955459595, "step": 700 }, { "epoch": 0.03715580526329738, "grad_norm": 70.5, "kl": 0.8805923461914062, "learning_rate": 5e-07, "logits/chosen": -57825380.0, "logits/rejected": -53434948.0, "logps/chosen": -414.70404052734375, "logps/rejected": -369.7750549316406, "loss": 0.4163, "rewards/chosen": 0.03956948593258858, "rewards/margins": 0.9006249196827412, "rewards/rejected": -0.8610554337501526, "step": 701 }, { "epoch": 0.037208809265099514, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48851846.4, "logits/rejected": -18023136.0, "logps/chosen": -149.54306640625, "logps/rejected": -230.31217447916666, "loss": 0.4768, "rewards/chosen": -0.14488396644592286, "rewards/margins": 0.40449323654174807, "rewards/rejected": -0.5493772029876709, "step": 702 }, { "epoch": 0.03726181326690165, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19445937.6, "logits/rejected": -13984229.333333334, "logps/chosen": -332.33935546875, "logps/rejected": -151.83868408203125, "loss": 0.437, "rewards/chosen": 0.017268443107604982, "rewards/margins": 0.7169580698013306, "rewards/rejected": -0.6996896266937256, "step": 703 }, { "epoch": 0.03731481726870379, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16086804.0, "logits/rejected": -29412818.0, "logps/chosen": -411.5627746582031, "logps/rejected": -314.1609191894531, "loss": 0.3905, "rewards/chosen": -0.05159169062972069, "rewards/margins": 1.064875666052103, "rewards/rejected": -1.1164673566818237, "step": 704 }, { "epoch": 0.037367821270505926, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8492788.0, "logits/rejected": -56450021.333333336, "logps/chosen": -146.1593475341797, "logps/rejected": -398.3740234375, "loss": 0.3154, "rewards/chosen": 0.018029116094112396, "rewards/margins": 1.224891406794389, "rewards/rejected": -1.2068622907002766, "step": 705 }, { "epoch": 0.037420825272308056, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 123024752.0, "logits/rejected": -54843696.0, "logps/chosen": -332.4830017089844, "logps/rejected": -411.5065612792969, "loss": 0.3658, "rewards/chosen": -0.1155981570482254, "rewards/margins": 1.2713082283735275, "rewards/rejected": -1.386906385421753, "step": 706 }, { "epoch": 0.03747382927411019, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47152256.0, "logits/rejected": -45557176.0, "logps/chosen": -329.40472412109375, "logps/rejected": -449.2068786621094, "loss": 0.3762, "rewards/chosen": -0.24994468688964844, "rewards/margins": 1.2890567779541016, "rewards/rejected": -1.53900146484375, "step": 707 }, { "epoch": 0.03752683327591233, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79646656.0, "logits/rejected": -50232245.333333336, "logps/chosen": -242.7001953125, "logps/rejected": -538.770263671875, "loss": 0.3747, "rewards/chosen": 0.0406948059797287, "rewards/margins": 1.669171902537346, "rewards/rejected": -1.6284770965576172, "step": 708 }, { "epoch": 0.03757983727771447, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3559488.6666666665, "logits/rejected": -54223820.8, "logps/chosen": -164.5466105143229, "logps/rejected": -175.83040771484374, "loss": 0.4007, "rewards/chosen": 0.01196238398551941, "rewards/margins": 0.6928213775157929, "rewards/rejected": -0.6808589935302735, "step": 709 }, { "epoch": 0.037632841279516605, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78679120.0, "logits/rejected": -23090592.0, "logps/chosen": -376.60455322265625, "logps/rejected": -383.76416015625, "loss": 0.3087, "rewards/chosen": 0.1663772612810135, "rewards/margins": 1.3137341688076656, "rewards/rejected": -1.147356907526652, "step": 710 }, { "epoch": 0.03768584528131874, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29937114.0, "logits/rejected": -2358585.25, "logps/chosen": -358.0251159667969, "logps/rejected": -123.95840454101562, "loss": 0.4384, "rewards/chosen": 0.20120197534561157, "rewards/margins": 0.5019242763519287, "rewards/rejected": -0.30072230100631714, "step": 711 }, { "epoch": 0.03773884928312088, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55253962.666666664, "logits/rejected": -4427884.4, "logps/chosen": -334.55078125, "logps/rejected": -247.0797607421875, "loss": 0.3807, "rewards/chosen": 0.09985097249348958, "rewards/margins": 0.9290410359700522, "rewards/rejected": -0.8291900634765625, "step": 712 }, { "epoch": 0.03779185328492301, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55061580.8, "logits/rejected": -26474709.333333332, "logps/chosen": -455.095849609375, "logps/rejected": -436.8805745442708, "loss": 0.3894, "rewards/chosen": 0.05776122808456421, "rewards/margins": 1.3283203800519308, "rewards/rejected": -1.2705591519673665, "step": 713 }, { "epoch": 0.037844857286725146, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74547344.0, "logits/rejected": -12717614.857142856, "logps/chosen": -234.35037231445312, "logps/rejected": -187.85628836495536, "loss": 0.3714, "rewards/chosen": -0.05724639818072319, "rewards/margins": 0.6137631423771381, "rewards/rejected": -0.6710095405578613, "step": 714 }, { "epoch": 0.03789786128852728, "grad_norm": 90.0, "kl": 0.026235580444335938, "learning_rate": 5e-07, "logits/chosen": -48781013.333333336, "logits/rejected": -41655264.0, "logps/chosen": -702.8733723958334, "logps/rejected": -758.5936279296875, "loss": 0.4144, "rewards/chosen": 0.07537003358205159, "rewards/margins": 1.3954505523045857, "rewards/rejected": -1.3200805187225342, "step": 715 }, { "epoch": 0.03795086529032942, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24287272.0, "logits/rejected": -28866446.0, "logps/chosen": -349.8610026041667, "logps/rejected": -627.2496337890625, "loss": 0.4052, "rewards/chosen": 0.17143738269805908, "rewards/margins": 1.352980375289917, "rewards/rejected": -1.181542992591858, "step": 716 }, { "epoch": 0.03800386929213156, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9069919.0, "logits/rejected": -26395096.0, "logps/chosen": -334.92449951171875, "logps/rejected": -322.9012451171875, "loss": 0.3237, "rewards/chosen": 0.2342788726091385, "rewards/margins": 1.216087058186531, "rewards/rejected": -0.9818081855773926, "step": 717 }, { "epoch": 0.038056873293933695, "grad_norm": 98.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14456940.8, "logits/rejected": -23375938.666666668, "logps/chosen": -681.442431640625, "logps/rejected": -487.454345703125, "loss": 0.3727, "rewards/chosen": 0.265325927734375, "rewards/margins": 1.3156015714009603, "rewards/rejected": -1.0502756436665852, "step": 718 }, { "epoch": 0.038109877295735825, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -197328597.33333334, "logits/rejected": -45416108.8, "logps/chosen": -290.5971272786458, "logps/rejected": -251.203515625, "loss": 0.386, "rewards/chosen": 0.11893107493718465, "rewards/margins": 0.8271522243817647, "rewards/rejected": -0.70822114944458, "step": 719 }, { "epoch": 0.03816288129753796, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42488352.0, "logits/rejected": -34038968.0, "logps/chosen": -429.31854248046875, "logps/rejected": -272.244384765625, "loss": 0.3964, "rewards/chosen": -0.10337285697460175, "rewards/margins": 0.9889384061098099, "rewards/rejected": -1.0923112630844116, "step": 720 }, { "epoch": 0.0382158852993401, "grad_norm": 61.75, "kl": 0.17721843719482422, "learning_rate": 5e-07, "logits/chosen": 1012182.5, "logits/rejected": -36694048.0, "logps/chosen": -333.88938395182294, "logps/rejected": -331.02802734375, "loss": 0.3367, "rewards/chosen": 0.23730526367823282, "rewards/margins": 1.397153667608897, "rewards/rejected": -1.159848403930664, "step": 721 }, { "epoch": 0.038268889301142237, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3078763.0, "logits/rejected": -6115173.5, "logps/chosen": -100.88650512695312, "logps/rejected": -277.60345458984375, "loss": 0.4293, "rewards/chosen": 0.03912530466914177, "rewards/margins": 0.5936994962394238, "rewards/rejected": -0.554574191570282, "step": 722 }, { "epoch": 0.038321893302944374, "grad_norm": 94.5, "kl": 0.090240478515625, "learning_rate": 5e-07, "logits/chosen": -68632592.0, "logits/rejected": -8377159.0, "logps/chosen": -906.1836547851562, "logps/rejected": -361.7192687988281, "loss": 0.4335, "rewards/chosen": 0.11964111775159836, "rewards/margins": 0.5928531214594841, "rewards/rejected": -0.47321200370788574, "step": 723 }, { "epoch": 0.03837489730474651, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54701322.666666664, "logits/rejected": -23562328.0, "logps/chosen": -181.419921875, "logps/rejected": -278.103857421875, "loss": 0.3369, "rewards/chosen": 0.0674527535835902, "rewards/margins": 1.240246523420016, "rewards/rejected": -1.1727937698364257, "step": 724 }, { "epoch": 0.03842790130654864, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26187678.0, "logits/rejected": -32822602.0, "logps/chosen": -147.2847137451172, "logps/rejected": -280.20989990234375, "loss": 0.3905, "rewards/chosen": 0.023198410868644714, "rewards/margins": 0.952608272433281, "rewards/rejected": -0.9294098615646362, "step": 725 }, { "epoch": 0.03848090530835078, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6555406.0, "logits/rejected": -5592041.0, "logps/chosen": -384.53948974609375, "logps/rejected": -125.95219421386719, "loss": 0.4138, "rewards/chosen": 0.01196594163775444, "rewards/margins": 0.7657994739711285, "rewards/rejected": -0.753833532333374, "step": 726 }, { "epoch": 0.038533909310152915, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46917411.2, "logits/rejected": -44897610.666666664, "logps/chosen": -403.944775390625, "logps/rejected": -319.7852783203125, "loss": 0.423, "rewards/chosen": 0.07693581581115723, "rewards/margins": 0.821923303604126, "rewards/rejected": -0.7449874877929688, "step": 727 }, { "epoch": 0.03858691331195505, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17274744.0, "logits/rejected": 5930842.0, "logps/chosen": -172.8734893798828, "logps/rejected": -377.50469970703125, "loss": 0.3836, "rewards/chosen": 0.11266545951366425, "rewards/margins": 0.9853678792715073, "rewards/rejected": -0.872702419757843, "step": 728 }, { "epoch": 0.03863991731375719, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22032456.0, "logits/rejected": -34335657.6, "logps/chosen": -181.60909016927084, "logps/rejected": -278.909130859375, "loss": 0.3818, "rewards/chosen": 0.22815303007761636, "rewards/margins": 0.8928423802057902, "rewards/rejected": -0.6646893501281739, "step": 729 }, { "epoch": 0.03869292131555933, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24167368.0, "logits/rejected": -16239704.0, "logps/chosen": -314.7174072265625, "logps/rejected": -247.19816080729166, "loss": 0.4327, "rewards/chosen": 0.029607999324798583, "rewards/margins": 0.7412613908449809, "rewards/rejected": -0.7116533915201823, "step": 730 }, { "epoch": 0.038745925317361464, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26199988.0, "logits/rejected": -16319931.0, "logps/chosen": -338.690185546875, "logps/rejected": -289.3343811035156, "loss": 0.4157, "rewards/chosen": -0.11205806583166122, "rewards/margins": 0.8809941783547401, "rewards/rejected": -0.9930522441864014, "step": 731 }, { "epoch": 0.038798929319163594, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42257688.0, "logits/rejected": 88969248.0, "logps/chosen": -345.91839599609375, "logps/rejected": -454.88507080078125, "loss": 0.3912, "rewards/chosen": -0.06479434669017792, "rewards/margins": 1.0214182883501053, "rewards/rejected": -1.0862126350402832, "step": 732 }, { "epoch": 0.03885193332096573, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25808261.333333332, "logits/rejected": -2411711.0, "logps/chosen": -553.7288411458334, "logps/rejected": -66.31159210205078, "loss": 0.4515, "rewards/chosen": 0.07330182194709778, "rewards/margins": 0.6506730616092682, "rewards/rejected": -0.5773712396621704, "step": 733 }, { "epoch": 0.03890493732276787, "grad_norm": 94.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19937654.666666668, "logits/rejected": 74984838.4, "logps/chosen": -276.9227294921875, "logps/rejected": -420.755078125, "loss": 0.3987, "rewards/chosen": -0.34284452597300213, "rewards/margins": 0.6536510070164998, "rewards/rejected": -0.996495532989502, "step": 734 }, { "epoch": 0.038957941324570006, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3986849.25, "logits/rejected": -35856328.0, "logps/chosen": -93.133544921875, "logps/rejected": -287.30133056640625, "loss": 0.4243, "rewards/chosen": -0.11074233055114746, "rewards/margins": 0.691931962966919, "rewards/rejected": -0.8026742935180664, "step": 735 }, { "epoch": 0.03901094532637214, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11337728.0, "logits/rejected": -84869843.2, "logps/chosen": -190.83304850260416, "logps/rejected": -468.0583984375, "loss": 0.3271, "rewards/chosen": -0.0836954116821289, "rewards/margins": 1.3049236297607423, "rewards/rejected": -1.3886190414428712, "step": 736 }, { "epoch": 0.03906394932817428, "grad_norm": 72.5, "kl": 0.124603271484375, "learning_rate": 5e-07, "logits/chosen": -79563992.0, "logits/rejected": -36435864.0, "logps/chosen": -470.20166015625, "logps/rejected": -188.87762451171875, "loss": 0.4276, "rewards/chosen": -0.005344398319721222, "rewards/margins": 0.6472350284457207, "rewards/rejected": -0.6525794267654419, "step": 737 }, { "epoch": 0.03911695332997641, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43660595.2, "logits/rejected": -6393136.0, "logps/chosen": -277.08154296875, "logps/rejected": -284.6072998046875, "loss": 0.3989, "rewards/chosen": 0.03215896487236023, "rewards/margins": 1.294281357526779, "rewards/rejected": -1.262122392654419, "step": 738 }, { "epoch": 0.03916995733177855, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13121730.666666666, "logits/rejected": -22974051.2, "logps/chosen": -227.0632527669271, "logps/rejected": -375.8521728515625, "loss": 0.3258, "rewards/chosen": -0.001040900746981303, "rewards/margins": 1.560031830271085, "rewards/rejected": -1.5610727310180663, "step": 739 }, { "epoch": 0.039222961333580685, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19550576.0, "logits/rejected": -66591208.0, "logps/chosen": -172.849365234375, "logps/rejected": -476.934814453125, "loss": 0.337, "rewards/chosen": 0.19380122423171997, "rewards/margins": 1.5771299004554749, "rewards/rejected": -1.3833286762237549, "step": 740 }, { "epoch": 0.03927596533538282, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28915322.666666668, "logits/rejected": -48585792.0, "logps/chosen": -461.4796142578125, "logps/rejected": -459.6673828125, "loss": 0.3415, "rewards/chosen": 0.038996378580729164, "rewards/margins": 1.21144863764445, "rewards/rejected": -1.1724522590637207, "step": 741 }, { "epoch": 0.03932896933718496, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62211178.666666664, "logits/rejected": -32061900.8, "logps/chosen": -518.6190999348959, "logps/rejected": -314.523779296875, "loss": 0.3671, "rewards/chosen": -0.03465677797794342, "rewards/margins": 1.005587324500084, "rewards/rejected": -1.0402441024780273, "step": 742 }, { "epoch": 0.039381973338987096, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51913941.333333336, "logits/rejected": -45270150.4, "logps/chosen": -182.4579060872396, "logps/rejected": -515.5333984375, "loss": 0.3554, "rewards/chosen": 0.010736207167307535, "rewards/margins": 1.0319117824236552, "rewards/rejected": -1.0211755752563476, "step": 743 }, { "epoch": 0.039434977340789226, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34925740.8, "logits/rejected": -46751232.0, "logps/chosen": -306.767919921875, "logps/rejected": -367.1941324869792, "loss": 0.3733, "rewards/chosen": 0.10886390209197998, "rewards/margins": 1.4703559001286823, "rewards/rejected": -1.3614919980367024, "step": 744 }, { "epoch": 0.03948798134259136, "grad_norm": 78.0, "kl": 0.397216796875, "learning_rate": 5e-07, "logits/chosen": -88032008.0, "logits/rejected": -6414995.0, "logps/chosen": -537.226318359375, "logps/rejected": -382.11260986328125, "loss": 0.3748, "rewards/chosen": 0.07094039767980576, "rewards/margins": 1.1365963444113731, "rewards/rejected": -1.0656559467315674, "step": 745 }, { "epoch": 0.0395409853443935, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16965056.0, "logits/rejected": -15978142.666666666, "logps/chosen": -261.4190368652344, "logps/rejected": -225.5769246419271, "loss": 0.3581, "rewards/chosen": 0.11368294060230255, "rewards/margins": 0.8860799819231033, "rewards/rejected": -0.7723970413208008, "step": 746 }, { "epoch": 0.03959398934619564, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5877056.0, "logits/rejected": -9457289.6, "logps/chosen": -61.598653157552086, "logps/rejected": -186.957666015625, "loss": 0.362, "rewards/chosen": -0.1051898996035258, "rewards/margins": 0.9832563360532126, "rewards/rejected": -1.0884462356567384, "step": 747 }, { "epoch": 0.039646993347997775, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85942528.0, "logits/rejected": -23333170.666666668, "logps/chosen": -310.694580078125, "logps/rejected": -253.79459635416666, "loss": 0.3451, "rewards/chosen": -0.5976364612579346, "rewards/margins": 0.6250919500986736, "rewards/rejected": -1.2227284113566081, "step": 748 }, { "epoch": 0.03969999734979991, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65908842.666666664, "logits/rejected": -11773973.6, "logps/chosen": -523.4549967447916, "logps/rejected": -398.31220703125, "loss": 0.3609, "rewards/chosen": -0.01322733610868454, "rewards/margins": 1.1887018516659738, "rewards/rejected": -1.2019291877746583, "step": 749 }, { "epoch": 0.03975300135160205, "grad_norm": 51.75, "kl": 0.3307533264160156, "learning_rate": 5e-07, "logits/chosen": -40086380.0, "logits/rejected": 657092.125, "logps/chosen": -201.90512084960938, "logps/rejected": -123.79104614257812, "loss": 0.4146, "rewards/chosen": 0.26892322301864624, "rewards/margins": 0.7773075103759766, "rewards/rejected": -0.5083842873573303, "step": 750 }, { "epoch": 0.03980600535340418, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16692140.0, "logits/rejected": -34964036.0, "logps/chosen": -242.7214813232422, "logps/rejected": -384.4458923339844, "loss": 0.3086, "rewards/chosen": 0.1595126986503601, "rewards/margins": 1.922410786151886, "rewards/rejected": -1.7628980875015259, "step": 751 }, { "epoch": 0.03985900935520632, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3133025.6666666665, "logits/rejected": 25031048.0, "logps/chosen": -79.76145935058594, "logps/rejected": -380.0476989746094, "loss": 0.4405, "rewards/chosen": -0.021757061282793682, "rewards/margins": 1.1810345103343327, "rewards/rejected": -1.2027915716171265, "step": 752 }, { "epoch": 0.039912013357008454, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37764906.666666664, "logits/rejected": -80765964.8, "logps/chosen": -254.85286458333334, "logps/rejected": -141.598095703125, "loss": 0.3925, "rewards/chosen": 0.034035682678222656, "rewards/margins": 0.7331232070922852, "rewards/rejected": -0.6990875244140625, "step": 753 }, { "epoch": 0.03996501735881059, "grad_norm": 61.25, "kl": 0.0310821533203125, "learning_rate": 5e-07, "logits/chosen": -80808182.85714285, "logits/rejected": -4340365.0, "logps/chosen": -310.30684988839283, "logps/rejected": -58.176937103271484, "loss": 0.4766, "rewards/chosen": 0.01858318703515189, "rewards/margins": 0.6876357282911029, "rewards/rejected": -0.6690525412559509, "step": 754 }, { "epoch": 0.04001802136061273, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35647696.0, "logits/rejected": -50453456.0, "logps/chosen": -365.3277282714844, "logps/rejected": -371.910400390625, "loss": 0.328, "rewards/chosen": 0.19342002272605896, "rewards/margins": 1.567243903875351, "rewards/rejected": -1.373823881149292, "step": 755 }, { "epoch": 0.040071025362414865, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8022648.0, "logits/rejected": -44146528.0, "logps/chosen": -1422.0174560546875, "logps/rejected": -307.0203450520833, "loss": 0.3753, "rewards/chosen": -0.049285903573036194, "rewards/margins": 0.6909304310878118, "rewards/rejected": -0.740216334660848, "step": 756 }, { "epoch": 0.040124029364216995, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39589744.0, "logits/rejected": -38262210.666666664, "logps/chosen": -562.66708984375, "logps/rejected": -733.2928059895834, "loss": 0.4313, "rewards/chosen": -0.2675281524658203, "rewards/margins": 1.3212037722269696, "rewards/rejected": -1.5887319246927898, "step": 757 }, { "epoch": 0.04017703336601913, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3591135.3333333335, "logits/rejected": -32870668.8, "logps/chosen": -163.75581868489584, "logps/rejected": -341.6419921875, "loss": 0.3837, "rewards/chosen": -0.09059410293896993, "rewards/margins": 0.7855475405851999, "rewards/rejected": -0.8761416435241699, "step": 758 }, { "epoch": 0.04023003736782127, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16653680.0, "logits/rejected": 231715.5625, "logps/chosen": -281.82562255859375, "logps/rejected": -87.06769561767578, "loss": 0.4285, "rewards/chosen": 0.23607182502746582, "rewards/margins": 0.7027777135372162, "rewards/rejected": -0.46670588850975037, "step": 759 }, { "epoch": 0.04028304136962341, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57621464.0, "logits/rejected": -44971312.0, "logps/chosen": -312.2678527832031, "logps/rejected": -378.06561279296875, "loss": 0.3644, "rewards/chosen": 0.12293072044849396, "rewards/margins": 1.2224874049425125, "rewards/rejected": -1.0995566844940186, "step": 760 }, { "epoch": 0.040336045371425544, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11171650.0, "logits/rejected": -15417830.666666666, "logps/chosen": -142.9630584716797, "logps/rejected": -300.0606282552083, "loss": 0.3484, "rewards/chosen": -0.07597503811120987, "rewards/margins": 0.9573664491375287, "rewards/rejected": -1.0333414872487385, "step": 761 }, { "epoch": 0.04038904937322768, "grad_norm": 62.0, "kl": 0.12875747680664062, "learning_rate": 5e-07, "logits/chosen": -76609318.4, "logits/rejected": -72889952.0, "logps/chosen": -300.1365234375, "logps/rejected": -534.4321695963541, "loss": 0.3926, "rewards/chosen": -0.06805450320243836, "rewards/margins": 1.473667158683141, "rewards/rejected": -1.5417216618855794, "step": 762 }, { "epoch": 0.04044205337502981, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11783923.2, "logits/rejected": -17528710.666666668, "logps/chosen": -257.707373046875, "logps/rejected": -254.1546427408854, "loss": 0.3913, "rewards/chosen": 0.15219597816467284, "rewards/margins": 1.2343136946360271, "rewards/rejected": -1.0821177164713542, "step": 763 }, { "epoch": 0.04049505737683195, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2257682.75, "logits/rejected": -38591808.0, "logps/chosen": -120.75433349609375, "logps/rejected": -378.034423828125, "loss": 0.3603, "rewards/chosen": -0.08396968990564346, "rewards/margins": 0.9165177966157596, "rewards/rejected": -1.000487486521403, "step": 764 }, { "epoch": 0.040548061378634086, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68956808.0, "logits/rejected": -7928754.0, "logps/chosen": -905.4127197265625, "logps/rejected": -237.6390177408854, "loss": 0.292, "rewards/chosen": 0.17901915311813354, "rewards/margins": 1.3866578936576843, "rewards/rejected": -1.2076387405395508, "step": 765 }, { "epoch": 0.04060106538043622, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20284612.0, "logits/rejected": -9915187.0, "logps/chosen": -285.2510070800781, "logps/rejected": -85.89314270019531, "loss": 0.4151, "rewards/chosen": 0.00911615788936615, "rewards/margins": 0.7543155997991562, "rewards/rejected": -0.74519944190979, "step": 766 }, { "epoch": 0.04065406938223836, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8020554.666666667, "logits/rejected": -23230978.0, "logps/chosen": -181.5270792643229, "logps/rejected": -146.07786560058594, "loss": 0.4678, "rewards/chosen": -0.009588209291299185, "rewards/margins": 0.5497915471593539, "rewards/rejected": -0.5593797564506531, "step": 767 }, { "epoch": 0.0407070733840405, "grad_norm": 71.0, "kl": 0.11925888061523438, "learning_rate": 5e-07, "logits/chosen": -45497478.4, "logits/rejected": -9028810.666666666, "logps/chosen": -317.863134765625, "logps/rejected": -361.6653238932292, "loss": 0.393, "rewards/chosen": 0.04877914488315582, "rewards/margins": 1.4788552671670914, "rewards/rejected": -1.4300761222839355, "step": 768 }, { "epoch": 0.040760077385842634, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33938288.0, "logits/rejected": -7621614.0, "logps/chosen": -221.90098571777344, "logps/rejected": -155.85855102539062, "loss": 0.4248, "rewards/chosen": 0.05708923190832138, "rewards/margins": 0.646971307694912, "rewards/rejected": -0.5898820757865906, "step": 769 }, { "epoch": 0.040813081387644765, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40784080.0, "logits/rejected": -16019006.4, "logps/chosen": -322.6855061848958, "logps/rejected": -207.710546875, "loss": 0.3692, "rewards/chosen": -0.10570500294367473, "rewards/margins": 0.8988146503766378, "rewards/rejected": -1.0045196533203125, "step": 770 }, { "epoch": 0.0408660853894469, "grad_norm": 47.75, "kl": 0.4144706726074219, "learning_rate": 5e-07, "logits/chosen": 72904312.0, "logits/rejected": -31547168.0, "logps/chosen": -199.36636352539062, "logps/rejected": -312.6044616699219, "loss": 0.3797, "rewards/chosen": 0.002897031605243683, "rewards/margins": 1.3075972869992256, "rewards/rejected": -1.304700255393982, "step": 771 }, { "epoch": 0.04091908939124904, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37236613.333333336, "logits/rejected": -44493888.0, "logps/chosen": -485.1399739583333, "logps/rejected": -318.3490234375, "loss": 0.3339, "rewards/chosen": 0.08753357330958049, "rewards/margins": 1.286886219183604, "rewards/rejected": -1.1993526458740233, "step": 772 }, { "epoch": 0.040972093393051176, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10006616.0, "logits/rejected": -36007176.0, "logps/chosen": -88.03460693359375, "logps/rejected": -272.6976318359375, "loss": 0.3933, "rewards/chosen": 0.01237541139125824, "rewards/margins": 1.5119688282410304, "rewards/rejected": -1.4995934168497722, "step": 773 }, { "epoch": 0.04102509739485331, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23035483.2, "logits/rejected": -11104372.0, "logps/chosen": -206.737939453125, "logps/rejected": -224.4908447265625, "loss": 0.4507, "rewards/chosen": -0.15439788103103638, "rewards/margins": 0.6910529971122742, "rewards/rejected": -0.8454508781433105, "step": 774 }, { "epoch": 0.04107810139665545, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47950560.0, "logits/rejected": -30850536.0, "logps/chosen": -460.2121887207031, "logps/rejected": -321.12451171875, "loss": 0.3741, "rewards/chosen": 0.07599639892578125, "rewards/margins": 1.1931613683700562, "rewards/rejected": -1.117164969444275, "step": 775 }, { "epoch": 0.04113110539845758, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2234659.6666666665, "logits/rejected": -47076339.2, "logps/chosen": -148.4325968424479, "logps/rejected": -583.88984375, "loss": 0.3486, "rewards/chosen": 0.11083908875783284, "rewards/margins": 1.2844233592351277, "rewards/rejected": -1.173584270477295, "step": 776 }, { "epoch": 0.04118410940025972, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46116640.0, "logits/rejected": -316973.2916666667, "logps/chosen": -253.7784912109375, "logps/rejected": -120.13614908854167, "loss": 0.4383, "rewards/chosen": 0.12177642583847045, "rewards/margins": 0.6363865971565247, "rewards/rejected": -0.5146101713180542, "step": 777 }, { "epoch": 0.041237113402061855, "grad_norm": 92.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106111520.0, "logits/rejected": -24125901.333333332, "logps/chosen": -1805.0087890625, "logps/rejected": -346.2995198567708, "loss": 0.2859, "rewards/chosen": 0.21769410371780396, "rewards/margins": 1.559541364510854, "rewards/rejected": -1.34184726079305, "step": 778 }, { "epoch": 0.04129011740386399, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22119434.0, "logits/rejected": -13970069.0, "logps/chosen": -129.55946350097656, "logps/rejected": -311.77020263671875, "loss": 0.3759, "rewards/chosen": -0.17864704132080078, "rewards/margins": 1.204380989074707, "rewards/rejected": -1.3830280303955078, "step": 779 }, { "epoch": 0.04134312140566613, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49584693.333333336, "logits/rejected": -42499366.4, "logps/chosen": -443.3382161458333, "logps/rejected": -233.3241455078125, "loss": 0.3528, "rewards/chosen": 0.44780019919077557, "rewards/margins": 1.2098588864008586, "rewards/rejected": -0.762058687210083, "step": 780 }, { "epoch": 0.041396125407468266, "grad_norm": 58.25, "kl": 0.09413528442382812, "learning_rate": 5e-07, "logits/chosen": -37063674.666666664, "logits/rejected": -30660014.0, "logps/chosen": -267.16908772786456, "logps/rejected": -327.6402893066406, "loss": 0.4367, "rewards/chosen": 0.00806604822476705, "rewards/margins": 1.1440693040688832, "rewards/rejected": -1.1360032558441162, "step": 781 }, { "epoch": 0.0414491294092704, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 378067.9375, "logits/rejected": -7798548.0, "logps/chosen": -171.80702209472656, "logps/rejected": -64.96937561035156, "loss": 0.4331, "rewards/chosen": 0.03664703667163849, "rewards/margins": 0.5507795363664627, "rewards/rejected": -0.5141324996948242, "step": 782 }, { "epoch": 0.041502133411072534, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13489873.6, "logits/rejected": -18337716.0, "logps/chosen": -159.9955322265625, "logps/rejected": -229.22786458333334, "loss": 0.4039, "rewards/chosen": 0.040617871284484866, "rewards/margins": 1.1183428049087525, "rewards/rejected": -1.0777249336242676, "step": 783 }, { "epoch": 0.04155513741287467, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15635971.2, "logits/rejected": -12558478.666666666, "logps/chosen": -355.95322265625, "logps/rejected": -274.8190511067708, "loss": 0.4509, "rewards/chosen": -0.054696661233901975, "rewards/margins": 0.6425698379675547, "rewards/rejected": -0.6972664992014567, "step": 784 }, { "epoch": 0.04160814141467681, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24557222.0, "logits/rejected": -38742124.0, "logps/chosen": -290.4659729003906, "logps/rejected": -446.20806884765625, "loss": 0.4068, "rewards/chosen": -0.37064629793167114, "rewards/margins": 1.2859821915626526, "rewards/rejected": -1.6566284894943237, "step": 785 }, { "epoch": 0.041661145416478945, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29690018.0, "logits/rejected": -22428158.0, "logps/chosen": -193.09820556640625, "logps/rejected": -187.6837921142578, "loss": 0.4208, "rewards/chosen": -0.05465611070394516, "rewards/margins": 0.8122842088341713, "rewards/rejected": -0.8669403195381165, "step": 786 }, { "epoch": 0.04171414941828108, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57730538.666666664, "logits/rejected": -10379275.2, "logps/chosen": -403.3902994791667, "logps/rejected": -275.43662109375, "loss": 0.3378, "rewards/chosen": 0.2268290321032206, "rewards/margins": 1.2405901710192364, "rewards/rejected": -1.0137611389160157, "step": 787 }, { "epoch": 0.04176715342008322, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103973752.0, "logits/rejected": -49426012.0, "logps/chosen": -256.30157470703125, "logps/rejected": -597.4227905273438, "loss": 0.2971, "rewards/chosen": -0.026656903326511383, "rewards/margins": 2.4998933896422386, "rewards/rejected": -2.52655029296875, "step": 788 }, { "epoch": 0.04182015742188535, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27236508.0, "logits/rejected": -8464389.0, "logps/chosen": -185.41403198242188, "logps/rejected": -182.83554077148438, "loss": 0.4169, "rewards/chosen": 0.07049807906150818, "rewards/margins": 0.6977616250514984, "rewards/rejected": -0.6272635459899902, "step": 789 }, { "epoch": 0.04187316142368749, "grad_norm": 64.5, "kl": 0.17515945434570312, "learning_rate": 5e-07, "logits/chosen": -14951490.666666666, "logits/rejected": -180954416.0, "logps/chosen": -289.3210042317708, "logps/rejected": -474.5202331542969, "loss": 0.4097, "rewards/chosen": 0.1416622201601664, "rewards/margins": 1.432597021261851, "rewards/rejected": -1.2909348011016846, "step": 790 }, { "epoch": 0.041926165425489624, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14509205.0, "logits/rejected": -23954648.0, "logps/chosen": -149.93711853027344, "logps/rejected": -445.8136800130208, "loss": 0.3163, "rewards/chosen": 0.1645715832710266, "rewards/margins": 1.649194061756134, "rewards/rejected": -1.4846224784851074, "step": 791 }, { "epoch": 0.04197916942729176, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23518909.333333332, "logits/rejected": -44076848.0, "logps/chosen": -332.70237223307294, "logps/rejected": -262.36456298828125, "loss": 0.4562, "rewards/chosen": -0.05506961544354757, "rewards/margins": 0.9520314435164133, "rewards/rejected": -1.007101058959961, "step": 792 }, { "epoch": 0.0420321734290939, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21314546.666666668, "logits/rejected": -10090160.0, "logps/chosen": -202.5071004231771, "logps/rejected": -207.275927734375, "loss": 0.4075, "rewards/chosen": 0.02295595407485962, "rewards/margins": 0.6958192467689515, "rewards/rejected": -0.6728632926940918, "step": 793 }, { "epoch": 0.042085177430896036, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31215026.666666668, "logits/rejected": -35503040.0, "logps/chosen": -398.6709798177083, "logps/rejected": -270.6338134765625, "loss": 0.4003, "rewards/chosen": -0.02274678647518158, "rewards/margins": 0.6731155067682266, "rewards/rejected": -0.6958622932434082, "step": 794 }, { "epoch": 0.042138181432698166, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16429038.0, "logits/rejected": -17825804.0, "logps/chosen": -268.15203857421875, "logps/rejected": -155.71649169921875, "loss": 0.4002, "rewards/chosen": 0.20789781212806702, "rewards/margins": 0.8704722225666046, "rewards/rejected": -0.6625744104385376, "step": 795 }, { "epoch": 0.0421911854345003, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7077610.666666667, "logits/rejected": -42214374.4, "logps/chosen": -169.5943806966146, "logps/rejected": -335.607275390625, "loss": 0.3162, "rewards/chosen": -0.028473665316899616, "rewards/margins": 1.4062376002470653, "rewards/rejected": -1.4347112655639649, "step": 796 }, { "epoch": 0.04224418943630244, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69079308.8, "logits/rejected": -47539738.666666664, "logps/chosen": -466.56337890625, "logps/rejected": -297.71327718098956, "loss": 0.4098, "rewards/chosen": -0.03449585139751434, "rewards/margins": 1.2650693049033481, "rewards/rejected": -1.2995651563008626, "step": 797 }, { "epoch": 0.04229719343810458, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25978322.666666668, "logits/rejected": -16347505.6, "logps/chosen": -135.97720336914062, "logps/rejected": -310.9434814453125, "loss": 0.3827, "rewards/chosen": -0.23424363136291504, "rewards/margins": 0.8002966403961183, "rewards/rejected": -1.0345402717590333, "step": 798 }, { "epoch": 0.042350197439906714, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10270994.666666666, "logits/rejected": -31668691.2, "logps/chosen": -192.5303955078125, "logps/rejected": -431.7361328125, "loss": 0.2956, "rewards/chosen": 0.1092694600423177, "rewards/margins": 1.6785338719685872, "rewards/rejected": -1.5692644119262695, "step": 799 }, { "epoch": 0.04240320144170885, "grad_norm": 60.0, "kl": 0.06104278564453125, "learning_rate": 5e-07, "logits/chosen": -46267882.666666664, "logits/rejected": -18177820.0, "logps/chosen": -301.99586995442706, "logps/rejected": -497.9830017089844, "loss": 0.396, "rewards/chosen": 0.1500017245610555, "rewards/margins": 1.7170122464497883, "rewards/rejected": -1.567010521888733, "step": 800 }, { "epoch": 0.04245620544351098, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5313506.5, "logits/rejected": -12324898.666666666, "logps/chosen": -37.63868713378906, "logps/rejected": -243.80098470052084, "loss": 0.3783, "rewards/chosen": -0.18623018264770508, "rewards/margins": 0.6085762977600098, "rewards/rejected": -0.7948064804077148, "step": 801 }, { "epoch": 0.04250920944531312, "grad_norm": 55.5, "kl": 0.3043670654296875, "learning_rate": 5e-07, "logits/chosen": 8411934.0, "logits/rejected": -41020828.0, "logps/chosen": -195.5248260498047, "logps/rejected": -337.1378173828125, "loss": 0.3789, "rewards/chosen": 0.047616906464099884, "rewards/margins": 1.1377247050404549, "rewards/rejected": -1.090107798576355, "step": 802 }, { "epoch": 0.042562213447115256, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 557633.8125, "logits/rejected": -3270957.0, "logps/chosen": -126.98517608642578, "logps/rejected": -143.1565704345703, "loss": 0.3911, "rewards/chosen": 0.1960892230272293, "rewards/margins": 0.918209508061409, "rewards/rejected": -0.7221202850341797, "step": 803 }, { "epoch": 0.04261521744891739, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 173011520.0, "logits/rejected": -20586547.2, "logps/chosen": -276.08221435546875, "logps/rejected": -324.133642578125, "loss": 0.3099, "rewards/chosen": 0.13131014506022134, "rewards/margins": 1.5359948794047038, "rewards/rejected": -1.4046847343444824, "step": 804 }, { "epoch": 0.04266822145071953, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34325968.0, "logits/rejected": -43985608.0, "logps/chosen": -312.5453186035156, "logps/rejected": -298.0598449707031, "loss": 0.3674, "rewards/chosen": 0.03895911946892738, "rewards/margins": 1.2387674786150455, "rewards/rejected": -1.1998083591461182, "step": 805 }, { "epoch": 0.04272122545252167, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24782128.0, "logits/rejected": -32074157.333333332, "logps/chosen": -482.17333984375, "logps/rejected": -381.8129069010417, "loss": 0.3363, "rewards/chosen": -0.016500860452651978, "rewards/margins": 0.9755028982957205, "rewards/rejected": -0.9920037587483724, "step": 806 }, { "epoch": 0.042774229454323805, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10919243.2, "logits/rejected": -26319573.333333332, "logps/chosen": -288.39169921875, "logps/rejected": -259.9099934895833, "loss": 0.3947, "rewards/chosen": 0.06793243885040283, "rewards/margins": 1.2227105220158894, "rewards/rejected": -1.1547780831654866, "step": 807 }, { "epoch": 0.042827233456125935, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -191754608.0, "logits/rejected": -36132525.71428572, "logps/chosen": -889.400146484375, "logps/rejected": -410.46756417410717, "loss": 0.2705, "rewards/chosen": -0.15233154594898224, "rewards/margins": 1.1570341352905547, "rewards/rejected": -1.309365681239537, "step": 808 }, { "epoch": 0.04288023745792807, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58203196.0, "logits/rejected": -66817496.0, "logps/chosen": -475.7894592285156, "logps/rejected": -457.02532958984375, "loss": 0.3184, "rewards/chosen": 0.24713516235351562, "rewards/margins": 1.799111247062683, "rewards/rejected": -1.5519760847091675, "step": 809 }, { "epoch": 0.04293324145973021, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52078025.14285714, "logits/rejected": -29327746.0, "logps/chosen": -368.10177176339283, "logps/rejected": -349.044921875, "loss": 0.4925, "rewards/chosen": -0.016972298068659648, "rewards/margins": 0.3455277138522693, "rewards/rejected": -0.36250001192092896, "step": 810 }, { "epoch": 0.042986245461532346, "grad_norm": 80.0, "kl": 0.5573883056640625, "learning_rate": 5e-07, "logits/chosen": -23908641.6, "logits/rejected": -7151494.666666667, "logps/chosen": -645.033349609375, "logps/rejected": -548.50439453125, "loss": 0.3358, "rewards/chosen": 0.25858273506164553, "rewards/margins": 2.737192932764689, "rewards/rejected": -2.4786101977030435, "step": 811 }, { "epoch": 0.043039249463334484, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52756246.4, "logits/rejected": -37306336.0, "logps/chosen": -539.17958984375, "logps/rejected": -355.2698567708333, "loss": 0.3847, "rewards/chosen": 0.2183095932006836, "rewards/margins": 1.3161036491394043, "rewards/rejected": -1.0977940559387207, "step": 812 }, { "epoch": 0.04309225346513662, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45965440.0, "logits/rejected": -18585436.0, "logps/chosen": -411.2674865722656, "logps/rejected": -287.01336669921875, "loss": 0.4593, "rewards/chosen": -0.19419345259666443, "rewards/margins": 0.3687876760959625, "rewards/rejected": -0.562981128692627, "step": 813 }, { "epoch": 0.04314525746693875, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42832394.666666664, "logits/rejected": -28743635.2, "logps/chosen": -395.0847981770833, "logps/rejected": -253.8419677734375, "loss": 0.3996, "rewards/chosen": -0.1598226030667623, "rewards/margins": 0.6528101483980814, "rewards/rejected": -0.8126327514648437, "step": 814 }, { "epoch": 0.04319826146874089, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41809408.0, "logits/rejected": -6052140.0, "logps/chosen": -250.47503662109375, "logps/rejected": -189.517822265625, "loss": 0.3946, "rewards/chosen": 0.16347312927246094, "rewards/margins": 0.9064720273017883, "rewards/rejected": -0.7429988980293274, "step": 815 }, { "epoch": 0.043251265470543025, "grad_norm": 66.5, "kl": 0.30042266845703125, "learning_rate": 5e-07, "logits/chosen": -13798817.333333334, "logits/rejected": -14645374.0, "logps/chosen": -342.0240071614583, "logps/rejected": -139.92620849609375, "loss": 0.4329, "rewards/chosen": -0.015520731608072916, "rewards/margins": 1.4308888514836628, "rewards/rejected": -1.4464095830917358, "step": 816 }, { "epoch": 0.04330426947234516, "grad_norm": 46.0, "kl": 0.07606315612792969, "learning_rate": 5e-07, "logits/chosen": 7306988.0, "logits/rejected": -33512640.0, "logps/chosen": -131.773876953125, "logps/rejected": -186.27205403645834, "loss": 0.4767, "rewards/chosen": 0.020109404623508454, "rewards/margins": 0.2570522124568621, "rewards/rejected": -0.23694280783335367, "step": 817 }, { "epoch": 0.0433572734741473, "grad_norm": 89.0, "kl": 0.14348983764648438, "learning_rate": 5e-07, "logits/chosen": -16413394.0, "logits/rejected": -14182512.0, "logps/chosen": -881.396728515625, "logps/rejected": -232.14328002929688, "loss": 0.3534, "rewards/chosen": 0.3643386662006378, "rewards/margins": 1.3200136125087738, "rewards/rejected": -0.955674946308136, "step": 818 }, { "epoch": 0.04341027747594944, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 38423336.0, "logits/rejected": -24372413.333333332, "logps/chosen": -484.1325378417969, "logps/rejected": -367.4293619791667, "loss": 0.3375, "rewards/chosen": -0.06859435886144638, "rewards/margins": 1.0086864804228146, "rewards/rejected": -1.077280839284261, "step": 819 }, { "epoch": 0.04346328147775157, "grad_norm": 61.75, "kl": 0.018035888671875, "learning_rate": 5e-07, "logits/chosen": -13587094.4, "logits/rejected": -69125664.0, "logps/chosen": -302.0971923828125, "logps/rejected": -195.45916748046875, "loss": 0.423, "rewards/chosen": 0.04025872349739075, "rewards/margins": 0.8457869668801626, "rewards/rejected": -0.8055282433827718, "step": 820 }, { "epoch": 0.043516285479553704, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2180553.75, "logits/rejected": -32595588.57142857, "logps/chosen": -74.80616760253906, "logps/rejected": -387.5234375, "loss": 0.2807, "rewards/chosen": 0.039752960205078125, "rewards/margins": 1.2373783929007394, "rewards/rejected": -1.1976254326956612, "step": 821 }, { "epoch": 0.04356928948135584, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1264004.0, "logits/rejected": -33954003.2, "logps/chosen": -340.0978597005208, "logps/rejected": -466.790087890625, "loss": 0.3333, "rewards/chosen": -0.03110809127489726, "rewards/margins": 1.3103262921174366, "rewards/rejected": -1.341434383392334, "step": 822 }, { "epoch": 0.04362229348315798, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7627190.0, "logits/rejected": -1140114.8, "logps/chosen": -78.50473022460938, "logps/rejected": -303.051318359375, "loss": 0.3623, "rewards/chosen": 0.003908028205235799, "rewards/margins": 1.0111502309640248, "rewards/rejected": -1.007242202758789, "step": 823 }, { "epoch": 0.043675297484960116, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16261377.0, "logits/rejected": -24345325.333333332, "logps/chosen": -235.61178588867188, "logps/rejected": -174.60137939453125, "loss": 0.381, "rewards/chosen": -0.13528481125831604, "rewards/margins": 0.7556927303473154, "rewards/rejected": -0.8909775416056315, "step": 824 }, { "epoch": 0.04372830148676225, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52253536.0, "logits/rejected": -35728512.0, "logps/chosen": -609.8384602864584, "logps/rejected": -272.159033203125, "loss": 0.3743, "rewards/chosen": -0.10822143157323201, "rewards/margins": 0.9073228875795999, "rewards/rejected": -1.015544319152832, "step": 825 }, { "epoch": 0.04378130548856439, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55111424.0, "logits/rejected": -30988124.0, "logps/chosen": -302.614501953125, "logps/rejected": -439.1798095703125, "loss": 0.3359, "rewards/chosen": 0.056052349507808685, "rewards/margins": 1.577861450612545, "rewards/rejected": -1.5218091011047363, "step": 826 }, { "epoch": 0.04383430949036652, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15441138.666666666, "logits/rejected": -32110316.0, "logps/chosen": -457.6693522135417, "logps/rejected": -490.9873046875, "loss": 0.3899, "rewards/chosen": 0.16436869899431863, "rewards/margins": 2.1839228371779122, "rewards/rejected": -2.0195541381835938, "step": 827 }, { "epoch": 0.04388731349216866, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53810240.0, "logits/rejected": -9706956.0, "logps/chosen": -241.0330810546875, "logps/rejected": -156.5283203125, "loss": 0.4317, "rewards/chosen": -0.0010655373334884644, "rewards/margins": 0.5777517110109329, "rewards/rejected": -0.5788172483444214, "step": 828 }, { "epoch": 0.043940317493970794, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22229451.2, "logits/rejected": -31707373.333333332, "logps/chosen": -202.045263671875, "logps/rejected": -467.1298828125, "loss": 0.3945, "rewards/chosen": -0.1110230803489685, "rewards/margins": 1.559807328383128, "rewards/rejected": -1.6708304087320964, "step": 829 }, { "epoch": 0.04399332149577293, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3471915.5, "logits/rejected": -4981266.5, "logps/chosen": -213.21090698242188, "logps/rejected": -267.249267578125, "loss": 0.3897, "rewards/chosen": 0.11435041576623917, "rewards/margins": 0.9480019584298134, "rewards/rejected": -0.8336515426635742, "step": 830 }, { "epoch": 0.04404632549757507, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30973478.4, "logits/rejected": -72795717.33333333, "logps/chosen": -426.612060546875, "logps/rejected": -332.82090250651044, "loss": 0.3932, "rewards/chosen": 0.22378723621368407, "rewards/margins": 1.076027798652649, "rewards/rejected": -0.8522405624389648, "step": 831 }, { "epoch": 0.044099329499377206, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19735344.0, "logits/rejected": -44233696.0, "logps/chosen": -333.0643310546875, "logps/rejected": -213.900634765625, "loss": 0.3228, "rewards/chosen": -0.18404541909694672, "rewards/margins": 0.7429202262844358, "rewards/rejected": -0.9269656453813825, "step": 832 }, { "epoch": 0.044152333501179336, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23965762.0, "logits/rejected": -12936328.0, "logps/chosen": -218.475341796875, "logps/rejected": -214.91363525390625, "loss": 0.3888, "rewards/chosen": 0.009407900273799896, "rewards/margins": 0.6425084099173546, "rewards/rejected": -0.6331005096435547, "step": 833 }, { "epoch": 0.04420533750298147, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40459917.71428572, "logits/rejected": -21890134.0, "logps/chosen": -388.9193638392857, "logps/rejected": -436.9042663574219, "loss": 0.4263, "rewards/chosen": 0.12084536041532244, "rewards/margins": 2.1745074221066067, "rewards/rejected": -2.053662061691284, "step": 834 }, { "epoch": 0.04425834150478361, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13730478.666666666, "logits/rejected": -13835496.0, "logps/chosen": -225.13374837239584, "logps/rejected": -248.1458740234375, "loss": 0.4107, "rewards/chosen": 0.10472209254900615, "rewards/margins": 0.7124354104200998, "rewards/rejected": -0.6077133178710937, "step": 835 }, { "epoch": 0.04431134550658575, "grad_norm": 66.5, "kl": 0.20557022094726562, "learning_rate": 5e-07, "logits/chosen": -31429242.0, "logits/rejected": -16594375.0, "logps/chosen": -600.2833251953125, "logps/rejected": -347.0199890136719, "loss": 0.3192, "rewards/chosen": 0.4669734537601471, "rewards/margins": 1.7256633341312408, "rewards/rejected": -1.2586898803710938, "step": 836 }, { "epoch": 0.044364349508387885, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -124109232.0, "logits/rejected": -9305689.333333334, "logps/chosen": -586.2725830078125, "logps/rejected": -325.94065348307294, "loss": 0.3422, "rewards/chosen": 0.16856002807617188, "rewards/margins": 1.0709026654561362, "rewards/rejected": -0.9023426373799642, "step": 837 }, { "epoch": 0.04441735351019002, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28308572.0, "logits/rejected": -17196624.0, "logps/chosen": -275.7161865234375, "logps/rejected": -229.9454345703125, "loss": 0.4182, "rewards/chosen": 0.003988074138760567, "rewards/margins": 0.8359126318246126, "rewards/rejected": -0.831924557685852, "step": 838 }, { "epoch": 0.04447035751199215, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34334008.0, "logits/rejected": -47097252.0, "logps/chosen": -42.537506103515625, "logps/rejected": -423.47259521484375, "loss": 0.3527, "rewards/chosen": 0.048282478004693985, "rewards/margins": 1.4121275879442692, "rewards/rejected": -1.3638451099395752, "step": 839 }, { "epoch": 0.04452336151379429, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40647392.0, "logits/rejected": -33742653.333333336, "logps/chosen": -154.6840057373047, "logps/rejected": -425.8055419921875, "loss": 0.3096, "rewards/chosen": 0.03211822360754013, "rewards/margins": 1.2255351369579632, "rewards/rejected": -1.193416913350423, "step": 840 }, { "epoch": 0.044576365515596426, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38371364.571428575, "logits/rejected": -55580000.0, "logps/chosen": -167.14327566964286, "logps/rejected": -379.896240234375, "loss": 0.4371, "rewards/chosen": 0.08868140833718437, "rewards/margins": 1.8159092153821672, "rewards/rejected": -1.727227807044983, "step": 841 }, { "epoch": 0.044629369517398564, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74977808.0, "logits/rejected": -22903475.2, "logps/chosen": -428.7980143229167, "logps/rejected": -598.38125, "loss": 0.2807, "rewards/chosen": 0.17688904205958048, "rewards/margins": 2.007541278998057, "rewards/rejected": -1.8306522369384766, "step": 842 }, { "epoch": 0.0446823735192007, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38031500.8, "logits/rejected": -3395698.6666666665, "logps/chosen": -183.165234375, "logps/rejected": -75.37305704752605, "loss": 0.4564, "rewards/chosen": 0.02821533679962158, "rewards/margins": 0.45880171457926433, "rewards/rejected": -0.43058637777964276, "step": 843 }, { "epoch": 0.04473537752100284, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17423184.0, "logits/rejected": -34458468.571428575, "logps/chosen": -541.16796875, "logps/rejected": -195.83119419642858, "loss": 0.2869, "rewards/chosen": 0.22382812201976776, "rewards/margins": 1.3438321628740855, "rewards/rejected": -1.1200040408543177, "step": 844 }, { "epoch": 0.044788381522804975, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78552426.66666667, "logits/rejected": -24471128.0, "logps/chosen": -248.563720703125, "logps/rejected": -491.3978515625, "loss": 0.3238, "rewards/chosen": 0.3132942318916321, "rewards/margins": 2.274003040790558, "rewards/rejected": -1.9607088088989257, "step": 845 }, { "epoch": 0.044841385524607105, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1629153.0, "logits/rejected": -10668348.8, "logps/chosen": -143.16455078125, "logps/rejected": -207.981591796875, "loss": 0.3564, "rewards/chosen": 0.31900501251220703, "rewards/margins": 1.095127534866333, "rewards/rejected": -0.7761225223541259, "step": 846 }, { "epoch": 0.04489438952640924, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2849342.5, "logits/rejected": -14890806.0, "logps/chosen": -136.5152587890625, "logps/rejected": -276.99169921875, "loss": 0.3907, "rewards/chosen": 0.15509939193725586, "rewards/margins": 1.1117583513259888, "rewards/rejected": -0.9566589593887329, "step": 847 }, { "epoch": 0.04494739352821138, "grad_norm": 62.0, "kl": 0.07132720947265625, "learning_rate": 5e-07, "logits/chosen": -26361216.0, "logits/rejected": -15370950.0, "logps/chosen": -300.83233642578125, "logps/rejected": -128.13967895507812, "loss": 0.3865, "rewards/chosen": 0.13108883798122406, "rewards/margins": 1.000498041510582, "rewards/rejected": -0.8694092035293579, "step": 848 }, { "epoch": 0.04500039753001352, "grad_norm": 59.75, "kl": 0.6202373504638672, "learning_rate": 5e-07, "logits/chosen": -25543124.0, "logits/rejected": -14667045.0, "logps/chosen": -568.14892578125, "logps/rejected": -126.83366394042969, "loss": 0.398, "rewards/chosen": 0.2852174639701843, "rewards/margins": 0.8686002492904663, "rewards/rejected": -0.583382785320282, "step": 849 }, { "epoch": 0.045053401531815654, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55825152.0, "logits/rejected": -14326641.0, "logps/chosen": -304.1975402832031, "logps/rejected": -193.53921508789062, "loss": 0.401, "rewards/chosen": 0.12251468002796173, "rewards/margins": 0.8247344046831131, "rewards/rejected": -0.7022197246551514, "step": 850 }, { "epoch": 0.04510640553361779, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25515298.666666668, "logits/rejected": -5234553.5, "logps/chosen": -290.24680582682294, "logps/rejected": -329.93890380859375, "loss": 0.4152, "rewards/chosen": 0.15350624918937683, "rewards/margins": 1.1596815288066864, "rewards/rejected": -1.0061752796173096, "step": 851 }, { "epoch": 0.04515940953541992, "grad_norm": 53.25, "kl": 0.06816291809082031, "learning_rate": 5e-07, "logits/chosen": -35201420.8, "logits/rejected": 6820584.666666667, "logps/chosen": -167.125830078125, "logps/rejected": -104.06824747721355, "loss": 0.4904, "rewards/chosen": -0.1860111713409424, "rewards/margins": 0.2251036802927653, "rewards/rejected": -0.4111148516337077, "step": 852 }, { "epoch": 0.04521241353722206, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23910376.0, "logits/rejected": -27746608.0, "logps/chosen": -126.64274088541667, "logps/rejected": -186.17713623046876, "loss": 0.3369, "rewards/chosen": 0.20159403483072916, "rewards/margins": 1.2350199381510418, "rewards/rejected": -1.0334259033203126, "step": 853 }, { "epoch": 0.045265417539024196, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 776351.75, "logits/rejected": -6441449.5, "logps/chosen": -160.50048828125, "logps/rejected": -152.8829345703125, "loss": 0.4357, "rewards/chosen": 0.15256467461585999, "rewards/margins": 0.5254429280757904, "rewards/rejected": -0.3728782534599304, "step": 854 }, { "epoch": 0.04531842154082633, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33757640.0, "logits/rejected": -16796860.0, "logps/chosen": -215.39825439453125, "logps/rejected": -328.82635498046875, "loss": 0.3375, "rewards/chosen": 0.23447436094284058, "rewards/margins": 1.7087362408638, "rewards/rejected": -1.4742618799209595, "step": 855 }, { "epoch": 0.04537142554262847, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -409071104.0, "logits/rejected": -22365213.714285713, "logps/chosen": -482.44586181640625, "logps/rejected": -308.37583705357144, "loss": 0.296, "rewards/chosen": 0.14732666313648224, "rewards/margins": 1.2063093760183878, "rewards/rejected": -1.0589827128819056, "step": 856 }, { "epoch": 0.04542442954443061, "grad_norm": 80.5, "kl": 0.17990684509277344, "learning_rate": 5e-07, "logits/chosen": -11244488.8, "logits/rejected": 39633312.0, "logps/chosen": -762.206640625, "logps/rejected": -259.6763102213542, "loss": 0.4198, "rewards/chosen": 0.2096173048019409, "rewards/margins": 0.7604008595148721, "rewards/rejected": -0.5507835547129313, "step": 857 }, { "epoch": 0.04547743354623274, "grad_norm": 82.0, "kl": 0.12969970703125, "learning_rate": 5e-07, "logits/chosen": -10780862.4, "logits/rejected": -9060237.333333334, "logps/chosen": -460.083056640625, "logps/rejected": -247.2447509765625, "loss": 0.4036, "rewards/chosen": 0.06368317008018494, "rewards/margins": 1.1098278144995373, "rewards/rejected": -1.0461446444193523, "step": 858 }, { "epoch": 0.045530437548034874, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57761075.2, "logits/rejected": 34086485.333333336, "logps/chosen": -696.681982421875, "logps/rejected": -446.4600423177083, "loss": 0.3968, "rewards/chosen": -0.00938194990158081, "rewards/margins": 1.3498573104540508, "rewards/rejected": -1.3592392603556316, "step": 859 }, { "epoch": 0.04558344154983701, "grad_norm": 65.5, "kl": 0.3552894592285156, "learning_rate": 5e-07, "logits/chosen": -35030496.0, "logits/rejected": -30916149.333333332, "logps/chosen": -684.35498046875, "logps/rejected": -310.8985595703125, "loss": 0.2959, "rewards/chosen": 0.0029449462890625, "rewards/margins": 1.3999557495117188, "rewards/rejected": -1.3970108032226562, "step": 860 }, { "epoch": 0.04563644555163915, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59244288.0, "logits/rejected": -37167858.666666664, "logps/chosen": -331.1151123046875, "logps/rejected": -438.0462239583333, "loss": 0.3858, "rewards/chosen": -0.0007375359535217285, "rewards/margins": 1.6444205959637959, "rewards/rejected": -1.6451581319173176, "step": 861 }, { "epoch": 0.045689449553441286, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48141256.0, "logits/rejected": -13377846.0, "logps/chosen": -675.2970581054688, "logps/rejected": -220.47988891601562, "loss": 0.3845, "rewards/chosen": 0.06329269707202911, "rewards/margins": 1.0457146912813187, "rewards/rejected": -0.9824219942092896, "step": 862 }, { "epoch": 0.04574245355524342, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22753138.0, "logits/rejected": -19650293.333333332, "logps/chosen": -308.63787841796875, "logps/rejected": -263.7410888671875, "loss": 0.3323, "rewards/chosen": 0.25518110394477844, "rewards/margins": 1.1630919277668, "rewards/rejected": -0.9079108238220215, "step": 863 }, { "epoch": 0.04579545755704556, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24920306.666666668, "logits/rejected": -20170188.8, "logps/chosen": -325.3438313802083, "logps/rejected": -207.566015625, "loss": 0.3669, "rewards/chosen": 0.06977132459481557, "rewards/margins": 0.9655198444922766, "rewards/rejected": -0.895748519897461, "step": 864 }, { "epoch": 0.04584846155884769, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39945608.0, "logits/rejected": -26594942.0, "logps/chosen": -388.2631429036458, "logps/rejected": -541.337158203125, "loss": 0.4302, "rewards/chosen": -0.027722671627998352, "rewards/margins": 1.5058116167783737, "rewards/rejected": -1.533534288406372, "step": 865 }, { "epoch": 0.04590146556064983, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67090160.0, "logits/rejected": -27811117.333333332, "logps/chosen": -814.2501831054688, "logps/rejected": -366.3517659505208, "loss": 0.3077, "rewards/chosen": -0.04055481031537056, "rewards/margins": 1.312701415270567, "rewards/rejected": -1.3532562255859375, "step": 866 }, { "epoch": 0.045954469562451965, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13920923.0, "logits/rejected": -18179088.0, "logps/chosen": -620.8251342773438, "logps/rejected": -154.5585174560547, "loss": 0.3805, "rewards/chosen": 0.17159277200698853, "rewards/margins": 1.0481602549552917, "rewards/rejected": -0.8765674829483032, "step": 867 }, { "epoch": 0.0460074735642541, "grad_norm": 43.0, "kl": 0.0477142333984375, "learning_rate": 5e-07, "logits/chosen": -32996675.2, "logits/rejected": -22626568.0, "logps/chosen": -162.310009765625, "logps/rejected": -296.3148193359375, "loss": 0.3815, "rewards/chosen": 0.02123173475265503, "rewards/margins": 1.530856692790985, "rewards/rejected": -1.50962495803833, "step": 868 }, { "epoch": 0.04606047756605624, "grad_norm": 61.75, "kl": 0.5061626434326172, "learning_rate": 5e-07, "logits/chosen": -37627685.333333336, "logits/rejected": -8163346.4, "logps/chosen": -325.2049967447917, "logps/rejected": -196.184912109375, "loss": 0.3854, "rewards/chosen": 0.2321858008702596, "rewards/margins": 0.9734038432439168, "rewards/rejected": -0.7412180423736572, "step": 869 }, { "epoch": 0.046113481567858376, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -104622122.66666667, "logits/rejected": -40989481.6, "logps/chosen": -440.746826171875, "logps/rejected": -222.87373046875, "loss": 0.3962, "rewards/chosen": 0.013361612955729166, "rewards/margins": 0.730449644724528, "rewards/rejected": -0.7170880317687989, "step": 870 }, { "epoch": 0.046166485569660506, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24034782.0, "logits/rejected": -10695664.0, "logps/chosen": -376.87152099609375, "logps/rejected": -154.65785217285156, "loss": 0.4202, "rewards/chosen": 0.06517629325389862, "rewards/margins": 0.6868126839399338, "rewards/rejected": -0.6216363906860352, "step": 871 }, { "epoch": 0.046219489571462644, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1866660.0, "logits/rejected": -21490734.0, "logps/chosen": -317.6517028808594, "logps/rejected": -416.1939697265625, "loss": 0.3689, "rewards/chosen": 0.047801923006772995, "rewards/margins": 1.421863865107298, "rewards/rejected": -1.374061942100525, "step": 872 }, { "epoch": 0.04627249357326478, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19182376.0, "logits/rejected": -11501509.333333334, "logps/chosen": -376.75018310546875, "logps/rejected": -248.70892333984375, "loss": 0.3526, "rewards/chosen": -0.06109008193016052, "rewards/margins": 0.9290177722771963, "rewards/rejected": -0.9901078542073568, "step": 873 }, { "epoch": 0.04632549757506692, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29062058.0, "logits/rejected": -23140180.57142857, "logps/chosen": -74.01029205322266, "logps/rejected": -284.84151785714283, "loss": 0.2934, "rewards/chosen": 0.008578491397202015, "rewards/margins": 1.1041827067466718, "rewards/rejected": -1.0956042153494698, "step": 874 }, { "epoch": 0.046378501576869055, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55077776.0, "logits/rejected": -10102001.6, "logps/chosen": -415.5544026692708, "logps/rejected": -223.504931640625, "loss": 0.391, "rewards/chosen": 0.21419519186019897, "rewards/margins": 0.8234341979026795, "rewards/rejected": -0.6092390060424805, "step": 875 }, { "epoch": 0.04643150557867119, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11435016.0, "logits/rejected": -32021140.0, "logps/chosen": -205.76547241210938, "logps/rejected": -217.3858642578125, "loss": 0.3948, "rewards/chosen": 0.10229606926441193, "rewards/margins": 1.1755752116441727, "rewards/rejected": -1.0732791423797607, "step": 876 }, { "epoch": 0.04648450958047332, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16391084.8, "logits/rejected": -45839696.0, "logps/chosen": -224.1462890625, "logps/rejected": -373.345703125, "loss": 0.3608, "rewards/chosen": 0.38753952980041506, "rewards/margins": 1.393782122929891, "rewards/rejected": -1.0062425931294758, "step": 877 }, { "epoch": 0.04653751358227546, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11678141.333333334, "logits/rejected": -20527875.2, "logps/chosen": -349.7694905598958, "logps/rejected": -363.2597412109375, "loss": 0.3673, "rewards/chosen": -0.23712233702341715, "rewards/margins": 0.9142734607060751, "rewards/rejected": -1.1513957977294922, "step": 878 }, { "epoch": 0.0465905175840776, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 209870.75, "logits/rejected": -25437307.2, "logps/chosen": -13.631483713785807, "logps/rejected": -157.18135986328124, "loss": 0.3337, "rewards/chosen": 0.20783982674280801, "rewards/margins": 1.2563342610994976, "rewards/rejected": -1.0484944343566895, "step": 879 }, { "epoch": 0.046643521585879734, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48596474.666666664, "logits/rejected": -34329084.8, "logps/chosen": -460.4562174479167, "logps/rejected": -485.7044921875, "loss": 0.2868, "rewards/chosen": 0.08602396647135417, "rewards/margins": 1.733076349894206, "rewards/rejected": -1.6470523834228517, "step": 880 }, { "epoch": 0.04669652558768187, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31977382.4, "logits/rejected": -15752948.0, "logps/chosen": -317.78076171875, "logps/rejected": -289.75242106119794, "loss": 0.4319, "rewards/chosen": -0.03517246842384338, "rewards/margins": 0.8596189757188162, "rewards/rejected": -0.8947914441426595, "step": 881 }, { "epoch": 0.04674952958948401, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40207972.0, "logits/rejected": -1735218.6666666667, "logps/chosen": -391.9004821777344, "logps/rejected": -257.8690592447917, "loss": 0.3635, "rewards/chosen": 0.17895175516605377, "rewards/margins": 0.976763978600502, "rewards/rejected": -0.7978122234344482, "step": 882 }, { "epoch": 0.046802533591286145, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38852550.4, "logits/rejected": -19779234.666666668, "logps/chosen": -169.92491455078124, "logps/rejected": -301.5673421223958, "loss": 0.424, "rewards/chosen": -0.15444483757019042, "rewards/margins": 1.0579390684763592, "rewards/rejected": -1.2123839060465496, "step": 883 }, { "epoch": 0.046855537593088276, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36009546.666666664, "logits/rejected": -24483472.0, "logps/chosen": -350.1937662760417, "logps/rejected": -294.3946533203125, "loss": 0.4169, "rewards/chosen": 0.01315237581729889, "rewards/margins": 1.570518508553505, "rewards/rejected": -1.557366132736206, "step": 884 }, { "epoch": 0.04690854159489041, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41963046.4, "logits/rejected": -32296586.666666668, "logps/chosen": -248.7151123046875, "logps/rejected": -293.5743408203125, "loss": 0.4109, "rewards/chosen": -0.21655843257904053, "rewards/margins": 1.28352898756663, "rewards/rejected": -1.5000874201456706, "step": 885 }, { "epoch": 0.04696154559669255, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6783499.5, "logits/rejected": -18242308.0, "logps/chosen": -184.77517700195312, "logps/rejected": -244.34036254882812, "loss": 0.3723, "rewards/chosen": 0.08934865146875381, "rewards/margins": 1.1099362149834633, "rewards/rejected": -1.0205875635147095, "step": 886 }, { "epoch": 0.04701454959849469, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22613933.333333332, "logits/rejected": -13778351.0, "logps/chosen": -248.85196940104166, "logps/rejected": -176.1649932861328, "loss": 0.4567, "rewards/chosen": -0.07044642170270284, "rewards/margins": 1.0362960795561473, "rewards/rejected": -1.10674250125885, "step": 887 }, { "epoch": 0.047067553600296824, "grad_norm": 65.5, "kl": 0.5964469909667969, "learning_rate": 5e-07, "logits/chosen": -68854816.0, "logits/rejected": -65984680.0, "logps/chosen": -460.5384826660156, "logps/rejected": -418.6734619140625, "loss": 0.3525, "rewards/chosen": 0.24292373657226562, "rewards/margins": 1.4782437086105347, "rewards/rejected": -1.235319972038269, "step": 888 }, { "epoch": 0.04712055760209896, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5580903.333333333, "logits/rejected": -9435653.6, "logps/chosen": -88.52133178710938, "logps/rejected": -154.655712890625, "loss": 0.3737, "rewards/chosen": -0.03730977326631546, "rewards/margins": 0.9381395027041435, "rewards/rejected": -0.9754492759704589, "step": 889 }, { "epoch": 0.04717356160390109, "grad_norm": 55.75, "kl": 0.020982742309570312, "learning_rate": 5e-07, "logits/chosen": -916581.3333333334, "logits/rejected": -20188460.8, "logps/chosen": -268.3887532552083, "logps/rejected": -324.61064453125, "loss": 0.4038, "rewards/chosen": -0.03130010018746058, "rewards/margins": 0.7325678502519926, "rewards/rejected": -0.7638679504394531, "step": 890 }, { "epoch": 0.04722656560570323, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4024163.6666666665, "logits/rejected": -16141163.2, "logps/chosen": -142.1269734700521, "logps/rejected": -213.108837890625, "loss": 0.3497, "rewards/chosen": -0.08346811930338542, "rewards/margins": 1.3210584004720052, "rewards/rejected": -1.4045265197753907, "step": 891 }, { "epoch": 0.047279569607505366, "grad_norm": 59.25, "kl": 1.15753173828125, "learning_rate": 5e-07, "logits/chosen": -43865304.0, "logits/rejected": -87607189.33333333, "logps/chosen": -836.873046875, "logps/rejected": -153.920654296875, "loss": 0.3845, "rewards/chosen": 0.2267906218767166, "rewards/margins": 0.8826097200314204, "rewards/rejected": -0.6558190981547037, "step": 892 }, { "epoch": 0.0473325736093075, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17255086.0, "logits/rejected": -78431968.0, "logps/chosen": -186.15994262695312, "logps/rejected": -270.5146179199219, "loss": 0.3979, "rewards/chosen": -0.09821557998657227, "rewards/margins": 0.9602949619293213, "rewards/rejected": -1.0585105419158936, "step": 893 }, { "epoch": 0.04738557761110964, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8354466.0, "logits/rejected": -35367733.333333336, "logps/chosen": -595.1943969726562, "logps/rejected": -195.3125, "loss": 0.3341, "rewards/chosen": -0.0074630677700042725, "rewards/margins": 1.012555827697118, "rewards/rejected": -1.0200188954671223, "step": 894 }, { "epoch": 0.04743858161291178, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -182313.33333333334, "logits/rejected": -53035891.2, "logps/chosen": -246.37103271484375, "logps/rejected": -486.522998046875, "loss": 0.3367, "rewards/chosen": 0.022962316870689392, "rewards/margins": 1.179896292090416, "rewards/rejected": -1.1569339752197265, "step": 895 }, { "epoch": 0.04749158561471391, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66960800.0, "logits/rejected": -14152547.0, "logps/chosen": -298.87652587890625, "logps/rejected": -96.91261291503906, "loss": 0.4513, "rewards/chosen": -0.03206062316894531, "rewards/margins": 0.39804065227508545, "rewards/rejected": -0.43010127544403076, "step": 896 }, { "epoch": 0.047544589616516045, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22503568.0, "logits/rejected": 9845532.666666666, "logps/chosen": -253.52705078125, "logps/rejected": -93.26295979817708, "loss": 0.4158, "rewards/chosen": 0.2862274169921875, "rewards/margins": 0.7530792474746704, "rewards/rejected": -0.4668518304824829, "step": 897 }, { "epoch": 0.04759759361831818, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17471541.333333332, "logits/rejected": -37887193.6, "logps/chosen": -176.6881103515625, "logps/rejected": -283.7299072265625, "loss": 0.3884, "rewards/chosen": -0.1259684960047404, "rewards/margins": 0.7739368041356405, "rewards/rejected": -0.8999053001403808, "step": 898 }, { "epoch": 0.04765059762012032, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44327254.4, "logits/rejected": -3142491.0, "logps/chosen": -403.8743408203125, "logps/rejected": -160.98116048177084, "loss": 0.428, "rewards/chosen": 0.06766006350517273, "rewards/margins": 0.7724748949209849, "rewards/rejected": -0.7048148314158121, "step": 899 }, { "epoch": 0.047703601621922456, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55850608.0, "logits/rejected": -4201247.0, "logps/chosen": -169.6449737548828, "logps/rejected": -380.48516845703125, "loss": 0.3689, "rewards/chosen": -0.062081627547740936, "rewards/margins": 1.422387309372425, "rewards/rejected": -1.484468936920166, "step": 900 }, { "epoch": 0.04775660562372459, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22029088.0, "logits/rejected": -274790.75, "logps/chosen": -166.2646687825521, "logps/rejected": -95.86250305175781, "loss": 0.4742, "rewards/chosen": -0.05328947305679321, "rewards/margins": 0.5335304141044617, "rewards/rejected": -0.5868198871612549, "step": 901 }, { "epoch": 0.04780960962552673, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20571468.0, "logits/rejected": -11078405.0, "logps/chosen": -240.95083618164062, "logps/rejected": -187.83450317382812, "loss": 0.382, "rewards/chosen": 0.12942036986351013, "rewards/margins": 1.1151452362537384, "rewards/rejected": -0.9857248663902283, "step": 902 }, { "epoch": 0.04786261362732886, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10800946.0, "logits/rejected": -33708216.0, "logps/chosen": -407.8376770019531, "logps/rejected": -407.18280029296875, "loss": 0.3464, "rewards/chosen": 0.01486067846417427, "rewards/margins": 1.510251808911562, "rewards/rejected": -1.4953911304473877, "step": 903 }, { "epoch": 0.047915617629131, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1336432.75, "logits/rejected": -13691756.8, "logps/chosen": -26.49572245279948, "logps/rejected": -233.26513671875, "loss": 0.3504, "rewards/chosen": 0.061699102322260536, "rewards/margins": 1.0982978800932568, "rewards/rejected": -1.0365987777709962, "step": 904 }, { "epoch": 0.047968621630933135, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27936196.0, "logits/rejected": -25166992.0, "logps/chosen": -602.2552490234375, "logps/rejected": -267.4632568359375, "loss": 0.2896, "rewards/chosen": 0.12002868950366974, "rewards/margins": 1.383988653620084, "rewards/rejected": -1.2639599641164143, "step": 905 }, { "epoch": 0.04802162563273527, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18702416.0, "logits/rejected": -9866182.0, "logps/chosen": -330.462158203125, "logps/rejected": -173.5424346923828, "loss": 0.4457, "rewards/chosen": -0.019018178805708885, "rewards/margins": 0.461078280583024, "rewards/rejected": -0.4800964593887329, "step": 906 }, { "epoch": 0.04807462963453741, "grad_norm": 74.0, "kl": 0.1294403076171875, "learning_rate": 5e-07, "logits/chosen": -27180473.6, "logits/rejected": -35410461.333333336, "logps/chosen": -311.347119140625, "logps/rejected": -201.17146809895834, "loss": 0.4063, "rewards/chosen": 0.0948147714138031, "rewards/margins": 1.0076019704341888, "rewards/rejected": -0.9127871990203857, "step": 907 }, { "epoch": 0.04812763363633955, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76823397.33333333, "logits/rejected": -30055910.4, "logps/chosen": -481.5327962239583, "logps/rejected": -432.226953125, "loss": 0.3359, "rewards/chosen": 0.062011723717053734, "rewards/margins": 1.2999322940905889, "rewards/rejected": -1.2379205703735352, "step": 908 }, { "epoch": 0.04818063763814168, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6475073.0, "logits/rejected": -16496264.0, "logps/chosen": -45.56565856933594, "logps/rejected": -278.8900451660156, "loss": 0.3921, "rewards/chosen": -0.007582094520330429, "rewards/margins": 0.997785422950983, "rewards/rejected": -1.0053675174713135, "step": 909 }, { "epoch": 0.048233641639943814, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38246469.333333336, "logits/rejected": -68512824.0, "logps/chosen": -243.89034016927084, "logps/rejected": -501.9886474609375, "loss": 0.4159, "rewards/chosen": -0.1137582262357076, "rewards/margins": 2.3356483976046243, "rewards/rejected": -2.449406623840332, "step": 910 }, { "epoch": 0.04828664564174595, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11978740.8, "logits/rejected": -21090972.0, "logps/chosen": -280.1271728515625, "logps/rejected": -519.1109212239584, "loss": 0.3506, "rewards/chosen": 0.19764820337295533, "rewards/margins": 1.8374075849850973, "rewards/rejected": -1.6397593816121419, "step": 911 }, { "epoch": 0.04833964964354809, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3081475.2, "logits/rejected": -19236637.333333332, "logps/chosen": -215.74833984375, "logps/rejected": -734.603759765625, "loss": 0.4043, "rewards/chosen": -0.17116117477416992, "rewards/margins": 1.4066912333170574, "rewards/rejected": -1.5778524080912273, "step": 912 }, { "epoch": 0.048392653645350225, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11130154.0, "logits/rejected": -21161056.0, "logps/chosen": -131.76043701171875, "logps/rejected": -286.275390625, "loss": 0.3578, "rewards/chosen": -0.26260071992874146, "rewards/margins": 0.5120893631662641, "rewards/rejected": -0.7746900830950055, "step": 913 }, { "epoch": 0.04844565764715236, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52445760.0, "logits/rejected": -43263696.0, "logps/chosen": -324.1822998046875, "logps/rejected": -442.8658040364583, "loss": 0.3672, "rewards/chosen": 0.005160564184188842, "rewards/margins": 1.8439956446488697, "rewards/rejected": -1.838835080464681, "step": 914 }, { "epoch": 0.04849866164895449, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18823837.333333332, "logits/rejected": -2831938.4, "logps/chosen": -199.900146484375, "logps/rejected": -347.1787353515625, "loss": 0.3377, "rewards/chosen": 0.020266151676575344, "rewards/margins": 1.1774663927654425, "rewards/rejected": -1.157200241088867, "step": 915 }, { "epoch": 0.04855166565075663, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34991000.0, "logits/rejected": -67677174.85714285, "logps/chosen": -286.75457763671875, "logps/rejected": -307.905029296875, "loss": 0.3136, "rewards/chosen": -0.21136474609375, "rewards/margins": 0.8558275359017509, "rewards/rejected": -1.0671922819955009, "step": 916 }, { "epoch": 0.04860466965255877, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9609730.666666666, "logits/rejected": -20588428.8, "logps/chosen": -404.8861897786458, "logps/rejected": -123.78203125, "loss": 0.3995, "rewards/chosen": 0.12753886977831522, "rewards/margins": 0.7574264506498972, "rewards/rejected": -0.629887580871582, "step": 917 }, { "epoch": 0.048657673654360904, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7868050.285714285, "logits/rejected": 4388727.0, "logps/chosen": -172.96833147321428, "logps/rejected": -36.98727798461914, "loss": 0.4825, "rewards/chosen": 0.04552789671080453, "rewards/margins": 0.2935446415628706, "rewards/rejected": -0.24801674485206604, "step": 918 }, { "epoch": 0.04871067765616304, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43390441.6, "logits/rejected": -6373026.666666667, "logps/chosen": -340.1189453125, "logps/rejected": -96.54392496744792, "loss": 0.4292, "rewards/chosen": -0.15400938987731932, "rewards/margins": 0.9652092774709065, "rewards/rejected": -1.1192186673482258, "step": 919 }, { "epoch": 0.04876368165796518, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51811920.0, "logits/rejected": -38752432.0, "logps/chosen": -351.44451904296875, "logps/rejected": -349.3455403645833, "loss": 0.3298, "rewards/chosen": -0.05523681640625, "rewards/margins": 1.1433526674906414, "rewards/rejected": -1.1985894838968914, "step": 920 }, { "epoch": 0.048816685659767316, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15060602.666666666, "logits/rejected": -19748880.0, "logps/chosen": -215.4559122721354, "logps/rejected": -310.0156494140625, "loss": 0.3535, "rewards/chosen": 0.09028701980908711, "rewards/margins": 1.0941470166047413, "rewards/rejected": -1.0038599967956543, "step": 921 }, { "epoch": 0.048869689661569446, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34071257.6, "logits/rejected": -105482144.0, "logps/chosen": -258.9715576171875, "logps/rejected": -335.43446858723956, "loss": 0.4259, "rewards/chosen": -0.21685714721679689, "rewards/margins": 1.2033862749735516, "rewards/rejected": -1.4202434221903484, "step": 922 }, { "epoch": 0.04892269366337158, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60487368.0, "logits/rejected": -21579708.0, "logps/chosen": -291.0906066894531, "logps/rejected": -175.1020050048828, "loss": 0.4059, "rewards/chosen": 0.09705333411693573, "rewards/margins": 0.8553386777639389, "rewards/rejected": -0.7582853436470032, "step": 923 }, { "epoch": 0.04897569766517372, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4669809.0, "logits/rejected": -74631061.33333333, "logps/chosen": -259.02691650390625, "logps/rejected": -386.1641438802083, "loss": 0.2656, "rewards/chosen": 0.09609679877758026, "rewards/margins": 1.750013475616773, "rewards/rejected": -1.6539166768391926, "step": 924 }, { "epoch": 0.04902870166697586, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22434101.333333332, "logits/rejected": -22602080.0, "logps/chosen": -291.14040120442706, "logps/rejected": -254.2545654296875, "loss": 0.3373, "rewards/chosen": -0.03185424953699112, "rewards/margins": 1.166600988805294, "rewards/rejected": -1.198455238342285, "step": 925 }, { "epoch": 0.049081705668777995, "grad_norm": 105.5, "kl": 0.03666877746582031, "learning_rate": 5e-07, "logits/chosen": -25492484.57142857, "logits/rejected": -516098.9375, "logps/chosen": -495.80751255580356, "logps/rejected": -32.02735137939453, "loss": 0.4667, "rewards/chosen": 0.05442485639027187, "rewards/margins": 0.770857070173536, "rewards/rejected": -0.7164322137832642, "step": 926 }, { "epoch": 0.04913470967058013, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8081896.0, "logits/rejected": -45027104.0, "logps/chosen": -177.65283203125, "logps/rejected": -358.7569986979167, "loss": 0.4057, "rewards/chosen": 0.08224297165870667, "rewards/margins": 1.017909453312556, "rewards/rejected": -0.9356664816538492, "step": 927 }, { "epoch": 0.04918771367238226, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49343736.0, "logits/rejected": -60197120.0, "logps/chosen": -516.7463989257812, "logps/rejected": -470.52703857421875, "loss": 0.3168, "rewards/chosen": 0.11847281455993652, "rewards/margins": 2.3911445140838623, "rewards/rejected": -2.272671699523926, "step": 928 }, { "epoch": 0.0492407176741844, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17827758.0, "logits/rejected": -29050108.0, "logps/chosen": -326.952392578125, "logps/rejected": -248.72706604003906, "loss": 0.3899, "rewards/chosen": 0.118499755859375, "rewards/margins": 0.9857821464538574, "rewards/rejected": -0.8672823905944824, "step": 929 }, { "epoch": 0.049293721675986536, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6923066.666666667, "logits/rejected": -9155710.4, "logps/chosen": -284.09588623046875, "logps/rejected": -119.6180419921875, "loss": 0.4328, "rewards/chosen": 0.09657440582911174, "rewards/margins": 0.48324358860651656, "rewards/rejected": -0.3866691827774048, "step": 930 }, { "epoch": 0.04934672567778867, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1789450.6666666667, "logits/rejected": -7143416.8, "logps/chosen": -416.2160237630208, "logps/rejected": -346.0050048828125, "loss": 0.3835, "rewards/chosen": -0.31261595090230304, "rewards/margins": 0.8166719595591228, "rewards/rejected": -1.1292879104614257, "step": 931 }, { "epoch": 0.04939972967959081, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44399769.6, "logits/rejected": -43594408.0, "logps/chosen": -363.5275146484375, "logps/rejected": -298.218017578125, "loss": 0.3825, "rewards/chosen": 0.09183366298675537, "rewards/margins": 1.6212287664413452, "rewards/rejected": -1.5293951034545898, "step": 932 }, { "epoch": 0.04945273368139295, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28955603.2, "logits/rejected": -35010160.0, "logps/chosen": -428.6255859375, "logps/rejected": -420.350341796875, "loss": 0.3648, "rewards/chosen": 0.20381882190704345, "rewards/margins": 1.505239987373352, "rewards/rejected": -1.3014211654663086, "step": 933 }, { "epoch": 0.04950573768319508, "grad_norm": 71.5, "kl": 0.00304412841796875, "learning_rate": 5e-07, "logits/chosen": -42880869.333333336, "logits/rejected": -34121504.0, "logps/chosen": -434.4812825520833, "logps/rejected": -260.99298095703125, "loss": 0.43, "rewards/chosen": 0.1618866721789042, "rewards/margins": 0.8952035109202067, "rewards/rejected": -0.7333168387413025, "step": 934 }, { "epoch": 0.049558741684997215, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25166892.0, "logits/rejected": -36941292.0, "logps/chosen": -679.2896118164062, "logps/rejected": -721.6875, "loss": 0.301, "rewards/chosen": 0.18109750747680664, "rewards/margins": 2.2657310962677, "rewards/rejected": -2.0846335887908936, "step": 935 }, { "epoch": 0.04961174568679935, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38158296.0, "logits/rejected": -33283094.85714286, "logps/chosen": -316.1295471191406, "logps/rejected": -370.7942592075893, "loss": 0.2871, "rewards/chosen": 0.04551086574792862, "rewards/margins": 1.2937024267656463, "rewards/rejected": -1.2481915610177177, "step": 936 }, { "epoch": 0.04966474968860149, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66432100.0, "logits/rejected": -42875442.666666664, "logps/chosen": -371.85601806640625, "logps/rejected": -440.8137613932292, "loss": 0.3206, "rewards/chosen": -0.10007324069738388, "rewards/margins": 1.3230545694629352, "rewards/rejected": -1.423127810160319, "step": 937 }, { "epoch": 0.04971775369040363, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4511462.0, "logits/rejected": -9539737.6, "logps/chosen": -181.7737019856771, "logps/rejected": -150.0636962890625, "loss": 0.3987, "rewards/chosen": 0.2714605927467346, "rewards/margins": 0.7902093052864074, "rewards/rejected": -0.5187487125396728, "step": 938 }, { "epoch": 0.049770757692205764, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14084182.0, "logits/rejected": -19781414.0, "logps/chosen": -146.1806182861328, "logps/rejected": -387.81695556640625, "loss": 0.3829, "rewards/chosen": -0.08589201420545578, "rewards/margins": 1.2310876324772835, "rewards/rejected": -1.3169796466827393, "step": 939 }, { "epoch": 0.0498237616940079, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23375731.2, "logits/rejected": 1291720.0, "logps/chosen": -244.2773681640625, "logps/rejected": -363.9246419270833, "loss": 0.4101, "rewards/chosen": 0.03759319186210632, "rewards/margins": 1.0982357521851858, "rewards/rejected": -1.0606425603230794, "step": 940 }, { "epoch": 0.04987676569581003, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38565304.0, "logits/rejected": -113104120.0, "logps/chosen": -254.1380157470703, "logps/rejected": -137.85208129882812, "loss": 0.4142, "rewards/chosen": -0.05580615624785423, "rewards/margins": 0.8336247242987156, "rewards/rejected": -0.8894308805465698, "step": 941 }, { "epoch": 0.04992976969761217, "grad_norm": 63.25, "kl": 0.27426910400390625, "learning_rate": 5e-07, "logits/chosen": -70428640.0, "logits/rejected": -36764592.0, "logps/chosen": -654.9813639322916, "logps/rejected": -327.498388671875, "loss": 0.3579, "rewards/chosen": 0.2652486364046733, "rewards/margins": 1.2926766912142436, "rewards/rejected": -1.0274280548095702, "step": 942 }, { "epoch": 0.049982773699414305, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8744875.0, "logits/rejected": -44040592.0, "logps/chosen": -375.0894470214844, "logps/rejected": -278.9850260416667, "loss": 0.3517, "rewards/chosen": 0.15504151582717896, "rewards/margins": 1.030769964059194, "rewards/rejected": -0.875728448232015, "step": 943 }, { "epoch": 0.05003577770121644, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54896486.4, "logits/rejected": -23629530.666666668, "logps/chosen": -461.5453125, "logps/rejected": -161.1572265625, "loss": 0.4462, "rewards/chosen": -0.04045349061489105, "rewards/margins": 0.6541126002868017, "rewards/rejected": -0.6945660909016927, "step": 944 }, { "epoch": 0.05008878170301858, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55846165.333333336, "logits/rejected": -42604259.2, "logps/chosen": -427.4921061197917, "logps/rejected": -344.283154296875, "loss": 0.3076, "rewards/chosen": 0.2119556466738383, "rewards/margins": 1.6487022439638774, "rewards/rejected": -1.436746597290039, "step": 945 }, { "epoch": 0.05014178570482072, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18232979.2, "logits/rejected": -38998010.666666664, "logps/chosen": -363.8970703125, "logps/rejected": -238.1491902669271, "loss": 0.4206, "rewards/chosen": 0.059683531522750854, "rewards/margins": 0.8589717249075571, "rewards/rejected": -0.7992881933848063, "step": 946 }, { "epoch": 0.05019478970662285, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26521305.6, "logits/rejected": -10733128.666666666, "logps/chosen": -252.3950439453125, "logps/rejected": -101.57010904947917, "loss": 0.4497, "rewards/chosen": -0.031734317541122437, "rewards/margins": 0.5779741108417511, "rewards/rejected": -0.6097084283828735, "step": 947 }, { "epoch": 0.050247793708424984, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47350612.0, "logits/rejected": -21140778.666666668, "logps/chosen": -246.8099822998047, "logps/rejected": -323.8865966796875, "loss": 0.3371, "rewards/chosen": -0.2644355893135071, "rewards/margins": 0.9815736015637715, "rewards/rejected": -1.2460091908772786, "step": 948 }, { "epoch": 0.05030079771022712, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20736334.666666668, "logits/rejected": 3540300.0, "logps/chosen": -318.1142171223958, "logps/rejected": -638.068115234375, "loss": 0.3563, "rewards/chosen": 0.17351178328196207, "rewards/margins": 3.464365760485331, "rewards/rejected": -3.290853977203369, "step": 949 }, { "epoch": 0.05035380171202926, "grad_norm": 90.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41664597.333333336, "logits/rejected": -30941496.0, "logps/chosen": -634.0284830729166, "logps/rejected": -298.8666687011719, "loss": 0.3915, "rewards/chosen": 0.2465229034423828, "rewards/margins": 1.4075138568878174, "rewards/rejected": -1.1609909534454346, "step": 950 }, { "epoch": 0.050406805713831396, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3521562.6666666665, "logits/rejected": 1284348.0, "logps/chosen": -306.40663655598956, "logps/rejected": -358.907470703125, "loss": 0.4287, "rewards/chosen": 0.0460992157459259, "rewards/margins": 1.1535150110721588, "rewards/rejected": -1.107415795326233, "step": 951 }, { "epoch": 0.05045980971563353, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30012982.4, "logits/rejected": -23476976.0, "logps/chosen": -273.7015869140625, "logps/rejected": -428.1875, "loss": 0.3708, "rewards/chosen": 0.0473275363445282, "rewards/margins": 1.7056360423564911, "rewards/rejected": -1.658308506011963, "step": 952 }, { "epoch": 0.05051281371743566, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82137784.0, "logits/rejected": -41459803.428571425, "logps/chosen": -371.2620849609375, "logps/rejected": -393.0044642857143, "loss": 0.2619, "rewards/chosen": 0.006045532412827015, "rewards/margins": 1.298791122622788, "rewards/rejected": -1.292745590209961, "step": 953 }, { "epoch": 0.0505658177192378, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11211784.0, "logits/rejected": -28881838.0, "logps/chosen": -138.94020080566406, "logps/rejected": -246.9486083984375, "loss": 0.394, "rewards/chosen": -0.11909495294094086, "rewards/margins": 1.007318153977394, "rewards/rejected": -1.126413106918335, "step": 954 }, { "epoch": 0.05061882172103994, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13295114.666666666, "logits/rejected": -87058208.0, "logps/chosen": -230.2001953125, "logps/rejected": -331.126708984375, "loss": 0.428, "rewards/chosen": 0.09936304887135823, "rewards/margins": 1.088727315266927, "rewards/rejected": -0.9893642663955688, "step": 955 }, { "epoch": 0.050671825722842075, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6889210.0, "logits/rejected": -7783910.0, "logps/chosen": -382.819580078125, "logps/rejected": -297.650146484375, "loss": 0.4009, "rewards/chosen": -0.14979678392410278, "rewards/margins": 0.8872169852256775, "rewards/rejected": -1.0370137691497803, "step": 956 }, { "epoch": 0.05072482972464421, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8457650.0, "logits/rejected": -30513536.0, "logps/chosen": -159.5579833984375, "logps/rejected": -337.5986022949219, "loss": 0.3957, "rewards/chosen": -0.29422932863235474, "rewards/margins": 1.0224961638450623, "rewards/rejected": -1.316725492477417, "step": 957 }, { "epoch": 0.05077783372644635, "grad_norm": 79.5, "kl": 0.9141616821289062, "learning_rate": 5e-07, "logits/chosen": -20632448.0, "logits/rejected": -22541004.0, "logps/chosen": -520.7411499023438, "logps/rejected": -504.7378234863281, "loss": 0.3609, "rewards/chosen": 0.2266073226928711, "rewards/margins": 1.4743348360061646, "rewards/rejected": -1.2477275133132935, "step": 958 }, { "epoch": 0.050830837728248486, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 231824.25, "logits/rejected": 2555866.0, "logps/chosen": -191.27671813964844, "logps/rejected": -497.36883544921875, "loss": 0.3329, "rewards/chosen": 0.15104298293590546, "rewards/margins": 1.7046773880720139, "rewards/rejected": -1.5536344051361084, "step": 959 }, { "epoch": 0.050883841730050616, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13851009.333333334, "logits/rejected": -21330425.6, "logps/chosen": -194.52665201822916, "logps/rejected": -336.101611328125, "loss": 0.302, "rewards/chosen": 0.2254810929298401, "rewards/margins": 1.5769280076026917, "rewards/rejected": -1.3514469146728516, "step": 960 }, { "epoch": 0.05093684573185275, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7373968.0, "logits/rejected": -2972940.0, "logps/chosen": -214.382568359375, "logps/rejected": -389.2899983723958, "loss": 0.3899, "rewards/chosen": 0.04892609119415283, "rewards/margins": 1.3326638301213583, "rewards/rejected": -1.2837377389272053, "step": 961 }, { "epoch": 0.05098984973365489, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56858328.0, "logits/rejected": -23751725.333333332, "logps/chosen": -561.6233520507812, "logps/rejected": -284.6805013020833, "loss": 0.3455, "rewards/chosen": 0.13454285264015198, "rewards/margins": 1.01101882259051, "rewards/rejected": -0.876475969950358, "step": 962 }, { "epoch": 0.05104285373545703, "grad_norm": 73.5, "kl": 1.0437240600585938, "learning_rate": 5e-07, "logits/chosen": -65948448.0, "logits/rejected": 4619874.0, "logps/chosen": -485.2051595052083, "logps/rejected": -423.2056579589844, "loss": 0.4523, "rewards/chosen": 0.10509267449378967, "rewards/margins": 1.1103356182575226, "rewards/rejected": -1.005242943763733, "step": 963 }, { "epoch": 0.051095857737259165, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21512574.4, "logits/rejected": -20929350.666666668, "logps/chosen": -115.56121826171875, "logps/rejected": -403.4501139322917, "loss": 0.3976, "rewards/chosen": -0.0009920187294483185, "rewards/margins": 1.2761742683748405, "rewards/rejected": -1.2771662871042888, "step": 964 }, { "epoch": 0.0511488617390613, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57108136.0, "logits/rejected": -25187882.0, "logps/chosen": -435.7178955078125, "logps/rejected": -184.77992248535156, "loss": 0.4158, "rewards/chosen": -0.05219726637005806, "rewards/margins": 0.7917001359164715, "rewards/rejected": -0.8438974022865295, "step": 965 }, { "epoch": 0.05120186574086343, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79064085.33333333, "logits/rejected": -24410566.4, "logps/chosen": -547.2476399739584, "logps/rejected": -318.19892578125, "loss": 0.3159, "rewards/chosen": 0.09354044993718465, "rewards/margins": 1.3884053905804952, "rewards/rejected": -1.2948649406433106, "step": 966 }, { "epoch": 0.05125486974266557, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22684272.0, "logits/rejected": -33328614.0, "logps/chosen": -382.547607421875, "logps/rejected": -318.90985107421875, "loss": 0.3555, "rewards/chosen": 0.2464607208967209, "rewards/margins": 1.3013918846845627, "rewards/rejected": -1.0549311637878418, "step": 967 }, { "epoch": 0.05130787374446771, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12963360.0, "logits/rejected": 202957632.0, "logps/chosen": -184.46119689941406, "logps/rejected": -361.4192199707031, "loss": 0.3905, "rewards/chosen": -0.21509304642677307, "rewards/margins": 1.069962590932846, "rewards/rejected": -1.2850556373596191, "step": 968 }, { "epoch": 0.051360877746269844, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21046068.0, "logits/rejected": -35874388.0, "logps/chosen": -212.56784057617188, "logps/rejected": -289.88568115234375, "loss": 0.3914, "rewards/chosen": -0.1682482659816742, "rewards/margins": 1.2722852528095245, "rewards/rejected": -1.4405335187911987, "step": 969 }, { "epoch": 0.05141388174807198, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28118928.0, "logits/rejected": -20341428.0, "logps/chosen": -543.77822265625, "logps/rejected": -376.2530924479167, "loss": 0.3834, "rewards/chosen": 0.12937484979629515, "rewards/margins": 1.2998215635617574, "rewards/rejected": -1.1704467137654622, "step": 970 }, { "epoch": 0.05146688574987412, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3305634.5, "logits/rejected": -26409792.0, "logps/chosen": -79.1991195678711, "logps/rejected": -271.3103724888393, "loss": 0.3732, "rewards/chosen": -0.46043166518211365, "rewards/margins": 0.2328288001673562, "rewards/rejected": -0.6932604653494698, "step": 971 }, { "epoch": 0.05151988975167625, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11320904.0, "logits/rejected": -57629440.0, "logps/chosen": -264.337158203125, "logps/rejected": -475.4710388183594, "loss": 0.3486, "rewards/chosen": 0.03726950287818909, "rewards/margins": 1.615185171365738, "rewards/rejected": -1.5779156684875488, "step": 972 }, { "epoch": 0.051572893753478385, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72358028.8, "logits/rejected": -13173165.333333334, "logps/chosen": -353.7701171875, "logps/rejected": -269.6872151692708, "loss": 0.4436, "rewards/chosen": -0.09955687522888183, "rewards/margins": 0.7915175437927247, "rewards/rejected": -0.8910744190216064, "step": 973 }, { "epoch": 0.05162589775528052, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32126656.0, "logits/rejected": -22673650.666666668, "logps/chosen": -561.1334838867188, "logps/rejected": -239.58479817708334, "loss": 0.3143, "rewards/chosen": 0.04500466585159302, "rewards/margins": 1.2184819181760151, "rewards/rejected": -1.1734772523244221, "step": 974 }, { "epoch": 0.05167890175708266, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13250529.0, "logits/rejected": -30905850.0, "logps/chosen": -143.26454162597656, "logps/rejected": -259.70404052734375, "loss": 0.3743, "rewards/chosen": 0.06300154328346252, "rewards/margins": 1.2187128365039825, "rewards/rejected": -1.15571129322052, "step": 975 }, { "epoch": 0.0517319057588848, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21533366.85714286, "logits/rejected": -11703622.0, "logps/chosen": -577.5802176339286, "logps/rejected": -196.326416015625, "loss": 0.4268, "rewards/chosen": 0.19293345723833358, "rewards/margins": 1.323667781693595, "rewards/rejected": -1.1307343244552612, "step": 976 }, { "epoch": 0.051784909760686934, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43100058.666666664, "logits/rejected": -40275440.0, "logps/chosen": -400.615234375, "logps/rejected": -320.110986328125, "loss": 0.3557, "rewards/chosen": 0.04131952921549479, "rewards/margins": 1.1172953287760417, "rewards/rejected": -1.0759757995605468, "step": 977 }, { "epoch": 0.05183791376248907, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22316156.0, "logits/rejected": -21245440.0, "logps/chosen": -375.22821044921875, "logps/rejected": -184.2974853515625, "loss": 0.3665, "rewards/chosen": 0.2299930602312088, "rewards/margins": 1.1401798278093338, "rewards/rejected": -0.910186767578125, "step": 978 }, { "epoch": 0.0518909177642912, "grad_norm": 54.75, "kl": 0.15881919860839844, "learning_rate": 5e-07, "logits/chosen": -22886608.0, "logits/rejected": -29391180.0, "logps/chosen": -368.927978515625, "logps/rejected": -272.4889221191406, "loss": 0.3701, "rewards/chosen": 0.3451904356479645, "rewards/margins": 1.263023167848587, "rewards/rejected": -0.9178327322006226, "step": 979 }, { "epoch": 0.05194392176609334, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9951346.0, "logits/rejected": -39914395.428571425, "logps/chosen": -428.1072082519531, "logps/rejected": -321.44224330357144, "loss": 0.3354, "rewards/chosen": 0.1734161376953125, "rewards/margins": 0.9763547352382115, "rewards/rejected": -0.802938597542899, "step": 980 }, { "epoch": 0.051996925767895476, "grad_norm": 56.0, "kl": 0.32384777069091797, "learning_rate": 5e-07, "logits/chosen": -2746100.0, "logits/rejected": -7994648.0, "logps/chosen": -216.2145263671875, "logps/rejected": -56.06707763671875, "loss": 0.4711, "rewards/chosen": -0.10888595581054687, "rewards/margins": 0.5018834749857585, "rewards/rejected": -0.6107694307963053, "step": 981 }, { "epoch": 0.05204992976969761, "grad_norm": 86.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53873301.333333336, "logits/rejected": -17950932.0, "logps/chosen": -444.9375813802083, "logps/rejected": -81.17508697509766, "loss": 0.473, "rewards/chosen": -0.06847534577051799, "rewards/margins": 0.6272584994633993, "rewards/rejected": -0.6957338452339172, "step": 982 }, { "epoch": 0.05210293377149975, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11985794.666666666, "logits/rejected": -2857585.0, "logps/chosen": -254.95332845052084, "logps/rejected": -69.66917419433594, "loss": 0.4854, "rewards/chosen": -0.1741008758544922, "rewards/margins": 0.6122490167617798, "rewards/rejected": -0.786349892616272, "step": 983 }, { "epoch": 0.05215593777330189, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 47348064.0, "logits/rejected": -25576317.333333332, "logps/chosen": -311.04193115234375, "logps/rejected": -258.0461018880208, "loss": 0.3485, "rewards/chosen": 0.17111483216285706, "rewards/margins": 1.1168554524580636, "rewards/rejected": -0.9457406202952067, "step": 984 }, { "epoch": 0.05220894177510402, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6735472.0, "logits/rejected": -3857612.0, "logps/chosen": -475.2250061035156, "logps/rejected": -268.5066731770833, "loss": 0.3464, "rewards/chosen": 0.17119446396827698, "rewards/margins": 1.02458984653155, "rewards/rejected": -0.8533953825632731, "step": 985 }, { "epoch": 0.052261945776906155, "grad_norm": 82.5, "kl": 0.12079620361328125, "learning_rate": 5e-07, "logits/chosen": -874020.0, "logits/rejected": -41524122.666666664, "logps/chosen": -897.72861328125, "logps/rejected": -463.4126383463542, "loss": 0.3517, "rewards/chosen": 0.29111082553863527, "rewards/margins": 2.0667246103286745, "rewards/rejected": -1.775613784790039, "step": 986 }, { "epoch": 0.05231494977870829, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53151032.0, "logits/rejected": -20402748.0, "logps/chosen": -410.6642761230469, "logps/rejected": -251.34747314453125, "loss": 0.3587, "rewards/chosen": -0.05875817686319351, "rewards/margins": 1.3740654811263084, "rewards/rejected": -1.432823657989502, "step": 987 }, { "epoch": 0.05236795378051043, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70301978.66666667, "logits/rejected": -17545784.0, "logps/chosen": -268.7937825520833, "logps/rejected": -158.504248046875, "loss": 0.3689, "rewards/chosen": 0.1921344796816508, "rewards/margins": 0.9991022149721781, "rewards/rejected": -0.8069677352905273, "step": 988 }, { "epoch": 0.052420957782312566, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88841632.0, "logits/rejected": -34004297.6, "logps/chosen": -283.84796142578125, "logps/rejected": -340.9876953125, "loss": 0.3257, "rewards/chosen": -0.09441128373146057, "rewards/margins": 1.3568008363246917, "rewards/rejected": -1.4512121200561523, "step": 989 }, { "epoch": 0.0524739617841147, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16781268.0, "logits/rejected": -14804493.0, "logps/chosen": -112.724365234375, "logps/rejected": -177.57745361328125, "loss": 0.3442, "rewards/chosen": 0.08644543588161469, "rewards/margins": 1.4657763093709946, "rewards/rejected": -1.3793308734893799, "step": 990 }, { "epoch": 0.05252696578591683, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33724272.0, "logits/rejected": -25723235.2, "logps/chosen": -587.2709147135416, "logps/rejected": -347.9071533203125, "loss": 0.3054, "rewards/chosen": 0.42655893166859943, "rewards/margins": 1.678116758664449, "rewards/rejected": -1.2515578269958496, "step": 991 }, { "epoch": 0.05257996978771897, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31121340.0, "logits/rejected": -2607162.0, "logps/chosen": -451.7406005859375, "logps/rejected": -642.1546630859375, "loss": 0.2619, "rewards/chosen": 0.372567743062973, "rewards/margins": 2.6035448014736176, "rewards/rejected": -2.2309770584106445, "step": 992 }, { "epoch": 0.05263297378952111, "grad_norm": 64.5, "kl": 0.13332748413085938, "learning_rate": 5e-07, "logits/chosen": -20852102.4, "logits/rejected": -20922726.666666668, "logps/chosen": -319.37724609375, "logps/rejected": -204.81937662760416, "loss": 0.4412, "rewards/chosen": 0.0007686614990234375, "rewards/margins": 0.8336071173350016, "rewards/rejected": -0.8328384558359782, "step": 993 }, { "epoch": 0.052685977791323245, "grad_norm": 62.0, "kl": 0.21531963348388672, "learning_rate": 5e-07, "logits/chosen": -29528324.0, "logits/rejected": -30815412.0, "logps/chosen": -448.5677185058594, "logps/rejected": -410.7911071777344, "loss": 0.3547, "rewards/chosen": 0.2971143126487732, "rewards/margins": 1.7697914242744446, "rewards/rejected": -1.4726771116256714, "step": 994 }, { "epoch": 0.05273898179312538, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26885685.333333332, "logits/rejected": -20872465.6, "logps/chosen": -101.11224365234375, "logps/rejected": -197.6864501953125, "loss": 0.3242, "rewards/chosen": 0.24158593018849692, "rewards/margins": 1.3696155627568563, "rewards/rejected": -1.1280296325683594, "step": 995 }, { "epoch": 0.05279198579492752, "grad_norm": 86.0, "kl": 0.49337196350097656, "learning_rate": 5e-07, "logits/chosen": -14208116.57142857, "logits/rejected": -839044.125, "logps/chosen": -377.3433314732143, "logps/rejected": -112.27490234375, "loss": 0.5108, "rewards/chosen": -0.08275394780295235, "rewards/margins": 0.5431326116834369, "rewards/rejected": -0.6258865594863892, "step": 996 }, { "epoch": 0.052844989796729656, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84141256.0, "logits/rejected": 5607191.0, "logps/chosen": -218.82769775390625, "logps/rejected": -281.1037292480469, "loss": 0.4477, "rewards/chosen": -0.29076749086380005, "rewards/margins": 0.4842931628227234, "rewards/rejected": -0.7750606536865234, "step": 997 }, { "epoch": 0.05289799379853179, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30494096.0, "logits/rejected": -14795884.8, "logps/chosen": -261.56317138671875, "logps/rejected": -101.58062744140625, "loss": 0.4056, "rewards/chosen": -0.11257146795590718, "rewards/margins": 0.6059552292029062, "rewards/rejected": -0.7185266971588135, "step": 998 }, { "epoch": 0.052950997800333924, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67469568.0, "logits/rejected": -7673823.333333333, "logps/chosen": -348.67315673828125, "logps/rejected": -228.626220703125, "loss": 0.381, "rewards/chosen": -0.22083130478858948, "rewards/margins": 0.5904294749101003, "rewards/rejected": -0.8112607796986898, "step": 999 }, { "epoch": 0.05300400180213606, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32295054.0, "logits/rejected": -44224269.333333336, "logps/chosen": -239.4865264892578, "logps/rejected": -382.9202880859375, "loss": 0.3118, "rewards/chosen": 0.005410764366388321, "rewards/margins": 1.1428265708188217, "rewards/rejected": -1.1374158064524333, "step": 1000 }, { "epoch": 0.0530570058039382, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1963066.5, "logits/rejected": -37608224.0, "logps/chosen": -194.085205078125, "logps/rejected": -356.5828552246094, "loss": 0.3744, "rewards/chosen": 0.10395193845033646, "rewards/margins": 1.1603317335247993, "rewards/rejected": -1.056379795074463, "step": 1001 }, { "epoch": 0.053110009805740335, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41535532.0, "logits/rejected": -48258341.333333336, "logps/chosen": -275.692626953125, "logps/rejected": -276.94097900390625, "loss": 0.3028, "rewards/chosen": 0.12252512574195862, "rewards/margins": 1.5482572615146637, "rewards/rejected": -1.425732135772705, "step": 1002 }, { "epoch": 0.05316301380754247, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39705881.6, "logits/rejected": -13419665.333333334, "logps/chosen": -304.27568359375, "logps/rejected": -210.9749755859375, "loss": 0.4346, "rewards/chosen": 0.01116935908794403, "rewards/margins": 0.7859486788511276, "rewards/rejected": -0.7747793197631836, "step": 1003 }, { "epoch": 0.0532160178093446, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23138832.0, "logits/rejected": 25432306.0, "logps/chosen": -305.2870178222656, "logps/rejected": -265.167236328125, "loss": 0.3684, "rewards/chosen": 0.004700187593698502, "rewards/margins": 1.253372672945261, "rewards/rejected": -1.2486724853515625, "step": 1004 }, { "epoch": 0.05326902181114674, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23423150.0, "logits/rejected": -22550197.333333332, "logps/chosen": -472.78265380859375, "logps/rejected": -341.7607828776042, "loss": 0.3231, "rewards/chosen": 0.25545692443847656, "rewards/margins": 1.3373899459838867, "rewards/rejected": -1.0819330215454102, "step": 1005 }, { "epoch": 0.05332202581294888, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22500464.0, "logits/rejected": -53031764.0, "logps/chosen": -190.50935872395834, "logps/rejected": -83.92914581298828, "loss": 0.4876, "rewards/chosen": -0.04564832150936127, "rewards/margins": 0.2927031069993973, "rewards/rejected": -0.33835142850875854, "step": 1006 }, { "epoch": 0.053375029814751014, "grad_norm": 84.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70909600.0, "logits/rejected": -32728534.4, "logps/chosen": -832.5166015625, "logps/rejected": -511.46005859375, "loss": 0.2995, "rewards/chosen": 0.26785685618718463, "rewards/margins": 1.7052298267682393, "rewards/rejected": -1.4373729705810547, "step": 1007 }, { "epoch": 0.05342803381655315, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5079267.5, "logits/rejected": -53199858.28571428, "logps/chosen": -30.699623107910156, "logps/rejected": -643.8791155133929, "loss": 0.2142, "rewards/chosen": -0.07886924594640732, "rewards/margins": 1.9472472614475658, "rewards/rejected": -2.026116507393973, "step": 1008 }, { "epoch": 0.05348103781835529, "grad_norm": 65.0, "kl": 0.3032035827636719, "learning_rate": 5e-07, "logits/chosen": -35362752.0, "logits/rejected": -30877386.666666668, "logps/chosen": -955.5440673828125, "logps/rejected": -187.03116861979166, "loss": 0.3574, "rewards/chosen": 0.32814332842826843, "rewards/margins": 1.048805763324102, "rewards/rejected": -0.7206624348958334, "step": 1009 }, { "epoch": 0.05353404182015742, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53339658.666666664, "logits/rejected": -19653680.0, "logps/chosen": -369.4543863932292, "logps/rejected": -447.23017578125, "loss": 0.2616, "rewards/chosen": 0.17716383934020996, "rewards/margins": 2.126952123641968, "rewards/rejected": -1.9497882843017578, "step": 1010 }, { "epoch": 0.053587045821959556, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15884857.6, "logits/rejected": -6114449.333333333, "logps/chosen": -401.4314208984375, "logps/rejected": -176.50675455729166, "loss": 0.3971, "rewards/chosen": 0.04728989005088806, "rewards/margins": 1.1765542447566986, "rewards/rejected": -1.1292643547058105, "step": 1011 }, { "epoch": 0.05364004982376169, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36520693.333333336, "logits/rejected": -23989451.2, "logps/chosen": -350.2607828776042, "logps/rejected": -221.168994140625, "loss": 0.3929, "rewards/chosen": -0.17857335011164346, "rewards/margins": 0.6961894710858664, "rewards/rejected": -0.8747628211975098, "step": 1012 }, { "epoch": 0.05369305382556383, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56368005.333333336, "logits/rejected": -11664110.4, "logps/chosen": -253.36370849609375, "logps/rejected": -205.474951171875, "loss": 0.4228, "rewards/chosen": -0.06805089116096497, "rewards/margins": 0.5523994147777558, "rewards/rejected": -0.6204503059387207, "step": 1013 }, { "epoch": 0.05374605782736597, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16861061.333333332, "logits/rejected": -9972306.4, "logps/chosen": -741.93603515625, "logps/rejected": -606.785986328125, "loss": 0.325, "rewards/chosen": 0.26352185010910034, "rewards/margins": 1.5810924410820006, "rewards/rejected": -1.3175705909729003, "step": 1014 }, { "epoch": 0.053799061829168104, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20650350.666666668, "logits/rejected": -46814825.6, "logps/chosen": -196.91365559895834, "logps/rejected": -360.6360595703125, "loss": 0.3524, "rewards/chosen": 0.04640775918960571, "rewards/margins": 1.2738303780555724, "rewards/rejected": -1.2274226188659667, "step": 1015 }, { "epoch": 0.05385206583097024, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23157498.666666668, "logits/rejected": -32382970.0, "logps/chosen": -253.75679524739584, "logps/rejected": -451.26641845703125, "loss": 0.3697, "rewards/chosen": 0.16960789759953818, "rewards/margins": 2.6061039169629416, "rewards/rejected": -2.4364960193634033, "step": 1016 }, { "epoch": 0.05390506983277237, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41163449.6, "logits/rejected": 18817416.0, "logps/chosen": -496.21279296875, "logps/rejected": -284.6124674479167, "loss": 0.3939, "rewards/chosen": 0.2701117038726807, "rewards/margins": 1.0109867095947265, "rewards/rejected": -0.7408750057220459, "step": 1017 }, { "epoch": 0.05395807383457451, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47856665.6, "logits/rejected": -4254723.666666667, "logps/chosen": -285.3930908203125, "logps/rejected": -84.98336791992188, "loss": 0.4825, "rewards/chosen": -0.24055724143981932, "rewards/margins": 0.3804342667261759, "rewards/rejected": -0.6209915081659952, "step": 1018 }, { "epoch": 0.054011077836376646, "grad_norm": 77.5, "kl": 0.43602752685546875, "learning_rate": 5e-07, "logits/chosen": -33676243.2, "logits/rejected": 8655532.666666666, "logps/chosen": -343.4998291015625, "logps/rejected": -224.89310709635416, "loss": 0.3899, "rewards/chosen": 0.31243584156036375, "rewards/margins": 1.145509457588196, "rewards/rejected": -0.833073616027832, "step": 1019 }, { "epoch": 0.05406408183817878, "grad_norm": 54.5, "kl": 0.12259864807128906, "learning_rate": 5e-07, "logits/chosen": -42506757.333333336, "logits/rejected": -8664016.0, "logps/chosen": -429.9383951822917, "logps/rejected": -198.730322265625, "loss": 0.3473, "rewards/chosen": 0.20736084381739298, "rewards/margins": 1.2008941690127055, "rewards/rejected": -0.9935333251953125, "step": 1020 }, { "epoch": 0.05411708583998092, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42558840.0, "logits/rejected": -34945772.8, "logps/chosen": -286.4354248046875, "logps/rejected": -395.5748779296875, "loss": 0.3776, "rewards/chosen": 0.2358678181966146, "rewards/margins": 1.192924435933431, "rewards/rejected": -0.9570566177368164, "step": 1021 }, { "epoch": 0.05417008984178306, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 802376.0, "logits/rejected": -6553314.0, "logps/chosen": -203.97079467773438, "logps/rejected": -272.7427673339844, "loss": 0.3682, "rewards/chosen": 0.1238580197095871, "rewards/margins": 1.189827486872673, "rewards/rejected": -1.065969467163086, "step": 1022 }, { "epoch": 0.05422309384358519, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23476992.0, "logits/rejected": -65842648.0, "logps/chosen": -156.94351196289062, "logps/rejected": -356.88519287109375, "loss": 0.3791, "rewards/chosen": -0.20609378814697266, "rewards/margins": 1.3130362033843994, "rewards/rejected": -1.519129991531372, "step": 1023 }, { "epoch": 0.054276097845387325, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45477176.0, "logits/rejected": -50640368.0, "logps/chosen": -307.193359375, "logps/rejected": -247.5477294921875, "loss": 0.3633, "rewards/chosen": -0.020506009459495544, "rewards/margins": 1.4317186027765274, "rewards/rejected": -1.452224612236023, "step": 1024 }, { "epoch": 0.05432910184718946, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35082630.4, "logits/rejected": -32630250.666666668, "logps/chosen": -296.730126953125, "logps/rejected": -521.171630859375, "loss": 0.4184, "rewards/chosen": -0.0055610060691833494, "rewards/margins": 1.1466059962908428, "rewards/rejected": -1.1521670023600261, "step": 1025 }, { "epoch": 0.0543821058489916, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32617978.0, "logits/rejected": -29433954.0, "logps/chosen": -365.02618408203125, "logps/rejected": -447.1971130371094, "loss": 0.3742, "rewards/chosen": -0.06903108954429626, "rewards/margins": 1.2006386816501617, "rewards/rejected": -1.269669771194458, "step": 1026 }, { "epoch": 0.054435109850793736, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57669348.0, "logits/rejected": -44779162.666666664, "logps/chosen": -465.92974853515625, "logps/rejected": -440.9384358723958, "loss": 0.2536, "rewards/chosen": 0.258657842874527, "rewards/margins": 2.148264477650325, "rewards/rejected": -1.8896066347757976, "step": 1027 }, { "epoch": 0.054488113852595874, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7856956.0, "logits/rejected": -24743352.0, "logps/chosen": -130.0452677408854, "logps/rejected": -320.59039306640625, "loss": 0.3996, "rewards/chosen": 0.08079216877619426, "rewards/margins": 1.7841628591219585, "rewards/rejected": -1.7033706903457642, "step": 1028 }, { "epoch": 0.054541117854398004, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35112261.333333336, "logits/rejected": -18136948.8, "logps/chosen": -300.08933512369794, "logps/rejected": -380.261572265625, "loss": 0.3495, "rewards/chosen": 0.19592666625976562, "rewards/margins": 1.3197601318359375, "rewards/rejected": -1.123833465576172, "step": 1029 }, { "epoch": 0.05459412185620014, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28851520.0, "logits/rejected": -21043763.2, "logps/chosen": -422.0107421875, "logps/rejected": -292.765283203125, "loss": 0.3751, "rewards/chosen": 0.04672547181447347, "rewards/margins": 0.9443267901738485, "rewards/rejected": -0.897601318359375, "step": 1030 }, { "epoch": 0.05464712585800228, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39994572.0, "logits/rejected": -43937949.333333336, "logps/chosen": -150.39395141601562, "logps/rejected": -321.8520100911458, "loss": 0.3101, "rewards/chosen": -0.07678356021642685, "rewards/margins": 1.162815588215987, "rewards/rejected": -1.2395991484324138, "step": 1031 }, { "epoch": 0.054700129859804415, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -133352352.0, "logits/rejected": -12519364.0, "logps/chosen": -461.9473876953125, "logps/rejected": -330.9490966796875, "loss": 0.3609, "rewards/chosen": 0.27362060546875, "rewards/margins": 1.1903085708618164, "rewards/rejected": -0.9166879653930664, "step": 1032 }, { "epoch": 0.05475313386160655, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35016164.0, "logits/rejected": -50553996.0, "logps/chosen": -509.03851318359375, "logps/rejected": -306.26806640625, "loss": 0.3636, "rewards/chosen": 0.18813934922218323, "rewards/margins": 1.2074337303638458, "rewards/rejected": -1.0192943811416626, "step": 1033 }, { "epoch": 0.05480613786340869, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39478965.333333336, "logits/rejected": -19176712.0, "logps/chosen": -486.39794921875, "logps/rejected": -354.8323059082031, "loss": 0.4154, "rewards/chosen": 0.05331277847290039, "rewards/margins": 1.6850206851959229, "rewards/rejected": -1.6317079067230225, "step": 1034 }, { "epoch": 0.05485914186521083, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57029989.333333336, "logits/rejected": -127654352.0, "logps/chosen": -363.5231526692708, "logps/rejected": -363.58782958984375, "loss": 0.471, "rewards/chosen": -0.27001341183980304, "rewards/margins": 1.2130905787150066, "rewards/rejected": -1.4831039905548096, "step": 1035 }, { "epoch": 0.05491214586701296, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6777413.333333333, "logits/rejected": 21489770.0, "logps/chosen": -199.1366984049479, "logps/rejected": -144.10690307617188, "loss": 0.4575, "rewards/chosen": 0.056958287954330444, "rewards/margins": 0.5917468369007111, "rewards/rejected": -0.5347885489463806, "step": 1036 }, { "epoch": 0.054965149868815094, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -681778.0, "logits/rejected": -11943305.0, "logps/chosen": -93.51133728027344, "logps/rejected": -203.2213134765625, "loss": 0.3321, "rewards/chosen": 0.2883037328720093, "rewards/margins": 1.5804177522659302, "rewards/rejected": -1.292114019393921, "step": 1037 }, { "epoch": 0.05501815387061723, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45576613.333333336, "logits/rejected": -40548230.4, "logps/chosen": -448.773193359375, "logps/rejected": -307.565673828125, "loss": 0.3263, "rewards/chosen": 0.15753480792045593, "rewards/margins": 1.353301066160202, "rewards/rejected": -1.1957662582397461, "step": 1038 }, { "epoch": 0.05507115787241937, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20734097.6, "logits/rejected": 11768485.333333334, "logps/chosen": -180.57474365234376, "logps/rejected": -431.6039225260417, "loss": 0.3982, "rewards/chosen": 0.035008543729782106, "rewards/margins": 1.1957025070985157, "rewards/rejected": -1.1606939633687336, "step": 1039 }, { "epoch": 0.055124161874221506, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8125812.5, "logits/rejected": -72896736.0, "logps/chosen": -62.96287155151367, "logps/rejected": -242.8367919921875, "loss": 0.336, "rewards/chosen": 0.02417716383934021, "rewards/margins": 0.9586702287197113, "rewards/rejected": -0.9344930648803711, "step": 1040 }, { "epoch": 0.05517716587602364, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29353432.0, "logits/rejected": -9727886.0, "logps/chosen": -357.2797037760417, "logps/rejected": -126.68156433105469, "loss": 0.4081, "rewards/chosen": 0.1898716688156128, "rewards/margins": 1.3172361850738525, "rewards/rejected": -1.1273645162582397, "step": 1041 }, { "epoch": 0.05523016987782577, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31744478.0, "logits/rejected": -10541267.0, "logps/chosen": -394.705810546875, "logps/rejected": -162.1546630859375, "loss": 0.4051, "rewards/chosen": 0.14980773627758026, "rewards/margins": 0.8363870829343796, "rewards/rejected": -0.6865793466567993, "step": 1042 }, { "epoch": 0.05528317387962791, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16626042.0, "logits/rejected": -6035608.666666667, "logps/chosen": -427.4546813964844, "logps/rejected": -225.67814127604166, "loss": 0.3402, "rewards/chosen": 0.4166702330112457, "rewards/margins": 1.216782957315445, "rewards/rejected": -0.8001127243041992, "step": 1043 }, { "epoch": 0.05533617788143005, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10013977.333333334, "logits/rejected": -14073088.0, "logps/chosen": -322.80251057942706, "logps/rejected": -344.326025390625, "loss": 0.3637, "rewards/chosen": -0.317923108736674, "rewards/margins": 0.9908693869908649, "rewards/rejected": -1.308792495727539, "step": 1044 }, { "epoch": 0.055389181883232184, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59819644.0, "logits/rejected": -88260544.0, "logps/chosen": -288.90606689453125, "logps/rejected": -615.141845703125, "loss": 0.295, "rewards/chosen": 0.04331245273351669, "rewards/margins": 2.2374624237418175, "rewards/rejected": -2.194149971008301, "step": 1045 }, { "epoch": 0.05544218588503432, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58187192.0, "logits/rejected": -27053064.0, "logps/chosen": -352.59912109375, "logps/rejected": -198.67361450195312, "loss": 0.4339, "rewards/chosen": -0.0864463821053505, "rewards/margins": 0.6130796298384666, "rewards/rejected": -0.6995260119438171, "step": 1046 }, { "epoch": 0.05549518988683646, "grad_norm": 48.5, "kl": 0.3251218795776367, "learning_rate": 5e-07, "logits/chosen": -1987271.2, "logits/rejected": -43996218.666666664, "logps/chosen": -109.48154296875, "logps/rejected": -357.0055338541667, "loss": 0.4851, "rewards/chosen": -0.3335805654525757, "rewards/margins": 0.7472943544387818, "rewards/rejected": -1.0808749198913574, "step": 1047 }, { "epoch": 0.05554819388863859, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28377939.2, "logits/rejected": -20323304.0, "logps/chosen": -444.208349609375, "logps/rejected": -394.1846516927083, "loss": 0.3833, "rewards/chosen": 0.1887895345687866, "rewards/margins": 1.3183786153793335, "rewards/rejected": -1.1295890808105469, "step": 1048 }, { "epoch": 0.055601197890440726, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50854613.333333336, "logits/rejected": -39811433.6, "logps/chosen": -447.1923828125, "logps/rejected": -347.16865234375, "loss": 0.3302, "rewards/chosen": 0.053884377082188926, "rewards/margins": 1.3440713544686635, "rewards/rejected": -1.2901869773864747, "step": 1049 }, { "epoch": 0.05565420189224286, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59491996.0, "logits/rejected": -21265769.333333332, "logps/chosen": -312.4566650390625, "logps/rejected": -171.4091593424479, "loss": 0.339, "rewards/chosen": -0.011036679148674011, "rewards/margins": 0.9330067982276281, "rewards/rejected": -0.9440434773763021, "step": 1050 }, { "epoch": 0.055707205894045, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14959600.0, "logits/rejected": -18138548.8, "logps/chosen": -119.57198079427083, "logps/rejected": -84.05302734375, "loss": 0.376, "rewards/chosen": -0.002107431491216024, "rewards/margins": 0.873641965786616, "rewards/rejected": -0.8757493972778321, "step": 1051 }, { "epoch": 0.05576020989584714, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16345242.666666666, "logits/rejected": -34990392.0, "logps/chosen": -298.19215901692706, "logps/rejected": -227.85714721679688, "loss": 0.4368, "rewards/chosen": 0.23434207836786905, "rewards/margins": 0.5796614686648051, "rewards/rejected": -0.34531939029693604, "step": 1052 }, { "epoch": 0.055813213897649275, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44357248.0, "logits/rejected": -29874313.6, "logps/chosen": -197.52530924479166, "logps/rejected": -197.681884765625, "loss": 0.3727, "rewards/chosen": 0.03203271081050237, "rewards/margins": 0.9801245038708052, "rewards/rejected": -0.9480917930603028, "step": 1053 }, { "epoch": 0.05586621789945141, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61828458.666666664, "logits/rejected": 5770196.0, "logps/chosen": -242.3734130859375, "logps/rejected": -531.1138671875, "loss": 0.3526, "rewards/chosen": 0.0002695719401041667, "rewards/margins": 1.2084977785746258, "rewards/rejected": -1.2082282066345216, "step": 1054 }, { "epoch": 0.05591922190125354, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45005300.0, "logits/rejected": -20103012.0, "logps/chosen": -507.76873779296875, "logps/rejected": -125.55278778076172, "loss": 0.4521, "rewards/chosen": -0.20032179355621338, "rewards/margins": 0.41067683696746826, "rewards/rejected": -0.6109986305236816, "step": 1055 }, { "epoch": 0.05597222590305568, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11900629.333333334, "logits/rejected": -32634809.6, "logps/chosen": -274.33258056640625, "logps/rejected": -283.08134765625, "loss": 0.3388, "rewards/chosen": 0.29105937480926514, "rewards/margins": 1.3047425985336303, "rewards/rejected": -1.0136832237243651, "step": 1056 }, { "epoch": 0.056025229904857816, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64743648.0, "logits/rejected": -24491756.8, "logps/chosen": -536.3173014322916, "logps/rejected": -262.4408935546875, "loss": 0.3549, "rewards/chosen": -0.04648233950138092, "rewards/margins": 1.022085890173912, "rewards/rejected": -1.068568229675293, "step": 1057 }, { "epoch": 0.056078233906659954, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24050270.0, "logits/rejected": -8560993.0, "logps/chosen": -295.62628173828125, "logps/rejected": -428.806396484375, "loss": 0.3676, "rewards/chosen": -0.13220930099487305, "rewards/margins": 1.337679386138916, "rewards/rejected": -1.469888687133789, "step": 1058 }, { "epoch": 0.05613123790846209, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30882700.8, "logits/rejected": 9653594.666666666, "logps/chosen": -138.263623046875, "logps/rejected": -65.50535583496094, "loss": 0.496, "rewards/chosen": -0.2757869720458984, "rewards/margins": 0.2267199595769247, "rewards/rejected": -0.5025069316228231, "step": 1059 }, { "epoch": 0.05618424191026423, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23708297.6, "logits/rejected": -17714514.666666668, "logps/chosen": -171.09605712890624, "logps/rejected": -232.7689005533854, "loss": 0.3839, "rewards/chosen": 0.33734307289123533, "rewards/margins": 1.0741816520690919, "rewards/rejected": -0.7368385791778564, "step": 1060 }, { "epoch": 0.05623724591206636, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75199045.33333333, "logits/rejected": -26066638.4, "logps/chosen": -238.66035970052084, "logps/rejected": -497.030078125, "loss": 0.2953, "rewards/chosen": 0.270745853583018, "rewards/margins": 1.805899719397227, "rewards/rejected": -1.535153865814209, "step": 1061 }, { "epoch": 0.056290249913868495, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24421254.4, "logits/rejected": -21090728.0, "logps/chosen": -279.041259765625, "logps/rejected": -301.714111328125, "loss": 0.4337, "rewards/chosen": -0.05600254535675049, "rewards/margins": 0.7952250560124715, "rewards/rejected": -0.851227601369222, "step": 1062 }, { "epoch": 0.05634325391567063, "grad_norm": 66.0, "kl": 0.67828369140625, "learning_rate": 5e-07, "logits/chosen": -15046556.0, "logits/rejected": -13920704.0, "logps/chosen": -912.6729125976562, "logps/rejected": -279.0514729817708, "loss": 0.3083, "rewards/chosen": 0.4873666763305664, "rewards/margins": 1.6479403972625732, "rewards/rejected": -1.1605737209320068, "step": 1063 }, { "epoch": 0.05639625791747277, "grad_norm": 70.5, "kl": 0.3311920166015625, "learning_rate": 5e-07, "logits/chosen": -18628482.666666668, "logits/rejected": -29667340.8, "logps/chosen": -897.8290201822916, "logps/rejected": -415.51875, "loss": 0.2542, "rewards/chosen": 0.5834311246871948, "rewards/margins": 2.2608649015426634, "rewards/rejected": -1.6774337768554688, "step": 1064 }, { "epoch": 0.05644926191927491, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32202018.666666668, "logits/rejected": -3003849.2, "logps/chosen": -271.8293863932292, "logps/rejected": -249.026171875, "loss": 0.3292, "rewards/chosen": 0.14628875255584717, "rewards/margins": 1.4143208265304565, "rewards/rejected": -1.2680320739746094, "step": 1065 }, { "epoch": 0.056502265921077044, "grad_norm": 75.0, "kl": 0.013952255249023438, "learning_rate": 5e-07, "logits/chosen": -43432176.0, "logits/rejected": -9163854.0, "logps/chosen": -552.504638671875, "logps/rejected": -140.34085083007812, "loss": 0.3846, "rewards/chosen": 0.12159968167543411, "rewards/margins": 1.0118872001767159, "rewards/rejected": -0.8902875185012817, "step": 1066 }, { "epoch": 0.056555269922879174, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26866149.333333332, "logits/rejected": -39888956.8, "logps/chosen": -239.408447265625, "logps/rejected": -424.1087890625, "loss": 0.3203, "rewards/chosen": 0.016241957743962605, "rewards/margins": 1.452228667338689, "rewards/rejected": -1.4359867095947265, "step": 1067 }, { "epoch": 0.05660827392468131, "grad_norm": 62.75, "kl": 0.10304450988769531, "learning_rate": 5e-07, "logits/chosen": -50462864.0, "logits/rejected": -15549718.0, "logps/chosen": -273.0341389973958, "logps/rejected": -351.0425720214844, "loss": 0.4358, "rewards/chosen": 0.03588638703028361, "rewards/margins": 1.5686483184496562, "rewards/rejected": -1.5327619314193726, "step": 1068 }, { "epoch": 0.05666127792648345, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27688460.0, "logits/rejected": -18292880.0, "logps/chosen": -579.7762451171875, "logps/rejected": -441.2900085449219, "loss": 0.3942, "rewards/chosen": 0.006771087646484375, "rewards/margins": 1.2982503175735474, "rewards/rejected": -1.291479229927063, "step": 1069 }, { "epoch": 0.056714281928285586, "grad_norm": 57.0, "kl": 0.7397670745849609, "learning_rate": 5e-07, "logits/chosen": -27723498.666666668, "logits/rejected": -26407766.4, "logps/chosen": -311.77260335286456, "logps/rejected": -283.8695556640625, "loss": 0.3479, "rewards/chosen": 0.26247737805048627, "rewards/margins": 1.196212653319041, "rewards/rejected": -0.9337352752685547, "step": 1070 }, { "epoch": 0.05676728593008772, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5022336.0, "logits/rejected": -19663004.0, "logps/chosen": -253.7400390625, "logps/rejected": -225.3915812174479, "loss": 0.4318, "rewards/chosen": -0.20876712799072267, "rewards/margins": 0.9983444690704346, "rewards/rejected": -1.2071115970611572, "step": 1071 }, { "epoch": 0.05682028993188986, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10327716.0, "logits/rejected": -19086130.0, "logps/chosen": -238.12838745117188, "logps/rejected": -322.6331787109375, "loss": 0.3461, "rewards/chosen": -0.004317663609981537, "rewards/margins": 1.5714356675744057, "rewards/rejected": -1.5757533311843872, "step": 1072 }, { "epoch": 0.056873293933692, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15308268.8, "logits/rejected": -37943813.333333336, "logps/chosen": -650.907958984375, "logps/rejected": -306.5187581380208, "loss": 0.3534, "rewards/chosen": 0.42443222999572755, "rewards/margins": 1.5171713352203369, "rewards/rejected": -1.0927391052246094, "step": 1073 }, { "epoch": 0.05692629793549413, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15197362.0, "logits/rejected": -37075904.0, "logps/chosen": -298.43389892578125, "logps/rejected": -452.32550048828125, "loss": 0.3391, "rewards/chosen": 0.132402241230011, "rewards/margins": 1.6349664330482483, "rewards/rejected": -1.5025641918182373, "step": 1074 }, { "epoch": 0.056979301937296264, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24625732.0, "logits/rejected": -5904003.5, "logps/chosen": -200.76181030273438, "logps/rejected": -175.26583862304688, "loss": 0.4005, "rewards/chosen": 0.0020858794450759888, "rewards/margins": 0.9089093953371048, "rewards/rejected": -0.9068235158920288, "step": 1075 }, { "epoch": 0.0570323059390984, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37460172.8, "logits/rejected": 11605661.333333334, "logps/chosen": -568.890234375, "logps/rejected": -306.59035237630206, "loss": 0.4013, "rewards/chosen": 0.03728027939796448, "rewards/margins": 1.2799290398756664, "rewards/rejected": -1.242648760477702, "step": 1076 }, { "epoch": 0.05708530994090054, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1420963.6666666667, "logits/rejected": -26017286.4, "logps/chosen": -122.48402913411458, "logps/rejected": -228.77646484375, "loss": 0.3308, "rewards/chosen": 0.45232729117075604, "rewards/margins": 1.3574342171351115, "rewards/rejected": -0.9051069259643555, "step": 1077 }, { "epoch": 0.057138313942702676, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15195096.0, "logits/rejected": -43384922.666666664, "logps/chosen": -159.624072265625, "logps/rejected": -354.6218668619792, "loss": 0.3349, "rewards/chosen": 0.2168339967727661, "rewards/margins": 2.0028911511103313, "rewards/rejected": -1.7860571543375652, "step": 1078 }, { "epoch": 0.05719131794450481, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21030360.0, "logits/rejected": -22093768.0, "logps/chosen": -281.5182189941406, "logps/rejected": -243.02427673339844, "loss": 0.4203, "rewards/chosen": -0.16689586639404297, "rewards/margins": 0.7658950686454773, "rewards/rejected": -0.9327909350395203, "step": 1079 }, { "epoch": 0.05724432194630694, "grad_norm": 55.25, "kl": 0.9736175537109375, "learning_rate": 5e-07, "logits/chosen": -9866256.0, "logits/rejected": 27113132.0, "logps/chosen": -357.1266276041667, "logps/rejected": -201.65663146972656, "loss": 0.4069, "rewards/chosen": 0.3771125078201294, "rewards/margins": 0.8807910680770874, "rewards/rejected": -0.503678560256958, "step": 1080 }, { "epoch": 0.05729732594810908, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46744348.8, "logits/rejected": -24320138.666666668, "logps/chosen": -478.65498046875, "logps/rejected": -252.7053426106771, "loss": 0.3903, "rewards/chosen": 0.08964798450469971, "rewards/margins": 1.2580786307652791, "rewards/rejected": -1.1684306462605794, "step": 1081 }, { "epoch": 0.05735032994991122, "grad_norm": 99.0, "kl": 0.949920654296875, "learning_rate": 5e-07, "logits/chosen": -8266057.714285715, "logits/rejected": -259600864.0, "logps/chosen": -560.5091029575893, "logps/rejected": -234.8244171142578, "loss": 0.4648, "rewards/chosen": 0.16266722338540213, "rewards/margins": 0.9708443965230669, "rewards/rejected": -0.8081771731376648, "step": 1082 }, { "epoch": 0.057403333951713355, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64071584.0, "logits/rejected": -31830997.333333332, "logps/chosen": -311.07470703125, "logps/rejected": -205.63509114583334, "loss": 0.3796, "rewards/chosen": -0.23895111680030823, "rewards/margins": 0.5676076511542002, "rewards/rejected": -0.8065587679545084, "step": 1083 }, { "epoch": 0.05745633795351549, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73919896.0, "logits/rejected": -23002370.666666668, "logps/chosen": -305.9755554199219, "logps/rejected": -373.1898600260417, "loss": 0.2698, "rewards/chosen": 0.2461029291152954, "rewards/margins": 1.985255519549052, "rewards/rejected": -1.7391525904337566, "step": 1084 }, { "epoch": 0.05750934195531763, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27017161.6, "logits/rejected": -28854378.666666668, "logps/chosen": -554.105029296875, "logps/rejected": -186.97281901041666, "loss": 0.4338, "rewards/chosen": -0.007039797306060791, "rewards/margins": 0.7516313274701437, "rewards/rejected": -0.7586711247762045, "step": 1085 }, { "epoch": 0.05756234595711976, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42562112.0, "logits/rejected": -15823539.0, "logps/chosen": -471.20452880859375, "logps/rejected": -167.39529418945312, "loss": 0.427, "rewards/chosen": -0.01469431258738041, "rewards/margins": 0.7200045790523291, "rewards/rejected": -0.7346988916397095, "step": 1086 }, { "epoch": 0.057615349958921896, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42877272.0, "logits/rejected": -22039852.0, "logps/chosen": -178.45404052734375, "logps/rejected": -284.6724853515625, "loss": 0.3826, "rewards/chosen": 0.06275424361228943, "rewards/margins": 1.2381531894207, "rewards/rejected": -1.1753989458084106, "step": 1087 }, { "epoch": 0.057668353960724034, "grad_norm": 72.0, "kl": 1.189535140991211, "learning_rate": 5e-07, "logits/chosen": -22376652.0, "logits/rejected": -5199881.0, "logps/chosen": -847.066650390625, "logps/rejected": -167.186767578125, "loss": 0.3067, "rewards/chosen": 0.561994194984436, "rewards/margins": 1.7675131559371948, "rewards/rejected": -1.2055189609527588, "step": 1088 }, { "epoch": 0.05772135796252617, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62329267.2, "logits/rejected": -39291578.666666664, "logps/chosen": -425.79404296875, "logps/rejected": -283.6832275390625, "loss": 0.3707, "rewards/chosen": -0.0022705078125, "rewards/margins": 1.837953758239746, "rewards/rejected": -1.840224266052246, "step": 1089 }, { "epoch": 0.05777436196432831, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20723868.0, "logits/rejected": 7913618.4, "logps/chosen": -309.4622802734375, "logps/rejected": -381.7580078125, "loss": 0.321, "rewards/chosen": -0.12242406606674194, "rewards/margins": 1.4753580689430237, "rewards/rejected": -1.5977821350097656, "step": 1090 }, { "epoch": 0.057827365966130445, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15057927.0, "logits/rejected": -39508368.0, "logps/chosen": -350.0087585449219, "logps/rejected": -438.4833577473958, "loss": 0.2241, "rewards/chosen": 0.6413971185684204, "rewards/margins": 2.258115728696187, "rewards/rejected": -1.6167186101277669, "step": 1091 }, { "epoch": 0.05788036996793258, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16808828.8, "logits/rejected": -21441610.666666668, "logps/chosen": -534.723828125, "logps/rejected": -143.4496053059896, "loss": 0.4594, "rewards/chosen": -0.1323535203933716, "rewards/margins": 0.5551282008488974, "rewards/rejected": -0.6874817212422689, "step": 1092 }, { "epoch": 0.05793337396973471, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31146828.8, "logits/rejected": -29882037.333333332, "logps/chosen": -373.37978515625, "logps/rejected": -211.3196004231771, "loss": 0.4124, "rewards/chosen": 0.007247316837310791, "rewards/margins": 1.0230586012204488, "rewards/rejected": -1.015811284383138, "step": 1093 }, { "epoch": 0.05798637797153685, "grad_norm": 63.25, "kl": 0.39591503143310547, "learning_rate": 5e-07, "logits/chosen": -25921164.8, "logits/rejected": -22236620.0, "logps/chosen": -313.3516845703125, "logps/rejected": -132.07247924804688, "loss": 0.3941, "rewards/chosen": 0.3913766384124756, "rewards/margins": 1.0495914459228515, "rewards/rejected": -0.658214807510376, "step": 1094 }, { "epoch": 0.05803938197333899, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4916735.0, "logits/rejected": -22559241.14285714, "logps/chosen": -473.9837341308594, "logps/rejected": -302.24051339285717, "loss": 0.3309, "rewards/chosen": -0.26425477862358093, "rewards/margins": 0.6870059243270329, "rewards/rejected": -0.9512607029506138, "step": 1095 }, { "epoch": 0.058092385975141124, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23015050.666666668, "logits/rejected": -929541.2, "logps/chosen": -382.6152750651042, "logps/rejected": -248.7317626953125, "loss": 0.3497, "rewards/chosen": 0.3617693583170573, "rewards/margins": 1.1657319704691569, "rewards/rejected": -0.8039626121520996, "step": 1096 }, { "epoch": 0.05814538997694326, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83196245.33333333, "logits/rejected": -12693984.0, "logps/chosen": -368.064453125, "logps/rejected": -252.87490234375, "loss": 0.327, "rewards/chosen": 0.2374643882115682, "rewards/margins": 1.2997552474339802, "rewards/rejected": -1.062290859222412, "step": 1097 }, { "epoch": 0.0581983939787454, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40332496.0, "logits/rejected": -22452336.0, "logps/chosen": -228.1597900390625, "logps/rejected": -377.5530598958333, "loss": 0.3454, "rewards/chosen": 0.168712842464447, "rewards/margins": 2.0662151058514913, "rewards/rejected": -1.8975022633870442, "step": 1098 }, { "epoch": 0.05825139798054753, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26411232.0, "logits/rejected": -39490224.0, "logps/chosen": -472.1126403808594, "logps/rejected": -402.022216796875, "loss": 0.2967, "rewards/chosen": 0.5605279207229614, "rewards/margins": 2.1067906618118286, "rewards/rejected": -1.5462627410888672, "step": 1099 }, { "epoch": 0.058304401982349666, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13988376.0, "logits/rejected": -18424408.0, "logps/chosen": -299.30255126953125, "logps/rejected": -551.8790893554688, "loss": 0.3604, "rewards/chosen": -0.02573489397764206, "rewards/margins": 1.4902598932385445, "rewards/rejected": -1.5159947872161865, "step": 1100 }, { "epoch": 0.0583574059841518, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84030986.66666667, "logits/rejected": -10677398.4, "logps/chosen": -278.6317545572917, "logps/rejected": -392.77783203125, "loss": 0.382, "rewards/chosen": 0.050578951835632324, "rewards/margins": 0.8923519849777222, "rewards/rejected": -0.8417730331420898, "step": 1101 }, { "epoch": 0.05841040998595394, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13296267.2, "logits/rejected": -48195978.666666664, "logps/chosen": -168.77076416015626, "logps/rejected": -951.9212239583334, "loss": 0.3887, "rewards/chosen": -0.3518739461898804, "rewards/margins": 2.458119797706604, "rewards/rejected": -2.8099937438964844, "step": 1102 }, { "epoch": 0.05846341398775608, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2988229.6, "logits/rejected": 1310260.0, "logps/chosen": -200.5851318359375, "logps/rejected": -154.14872233072916, "loss": 0.3909, "rewards/chosen": 0.1615664005279541, "rewards/margins": 1.143762731552124, "rewards/rejected": -0.9821963310241699, "step": 1103 }, { "epoch": 0.058516417989558214, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 143978.375, "logits/rejected": -23234360.0, "logps/chosen": -215.41932678222656, "logps/rejected": -245.08514404296875, "loss": 0.2433, "rewards/chosen": 0.17356261610984802, "rewards/margins": 1.8872832556565602, "rewards/rejected": -1.7137206395467122, "step": 1104 }, { "epoch": 0.058569421991360344, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21548541.333333332, "logits/rejected": -42577408.0, "logps/chosen": -359.5940348307292, "logps/rejected": -421.05731201171875, "loss": 0.4287, "rewards/chosen": 0.03611301134030024, "rewards/margins": 1.2640839690963428, "rewards/rejected": -1.2279709577560425, "step": 1105 }, { "epoch": 0.05862242599316248, "grad_norm": 58.25, "kl": 0.623046875, "learning_rate": 5e-07, "logits/chosen": -35535286.85714286, "logits/rejected": -58415164.0, "logps/chosen": -275.8709019252232, "logps/rejected": -24.330387115478516, "loss": 0.4813, "rewards/chosen": 0.13209021091461182, "rewards/margins": 0.30944083631038666, "rewards/rejected": -0.17735062539577484, "step": 1106 }, { "epoch": 0.05867542999496462, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3661130.0, "logits/rejected": -33814997.333333336, "logps/chosen": -114.64013671875, "logps/rejected": -545.8810221354166, "loss": 0.2536, "rewards/chosen": -0.4239450693130493, "rewards/margins": 1.5088790655136108, "rewards/rejected": -1.9328241348266602, "step": 1107 }, { "epoch": 0.058728433996766756, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12193352.0, "logits/rejected": -22744308.0, "logps/chosen": -207.19649251302084, "logps/rejected": -135.86012268066406, "loss": 0.5062, "rewards/chosen": -0.22126724322636923, "rewards/margins": 0.3440846006075541, "rewards/rejected": -0.5653518438339233, "step": 1108 }, { "epoch": 0.05878143799856889, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7679363.2, "logits/rejected": -21752314.666666668, "logps/chosen": -280.36982421875, "logps/rejected": -487.8819986979167, "loss": 0.3663, "rewards/chosen": 0.10706490278244019, "rewards/margins": 1.8997748891512554, "rewards/rejected": -1.7927099863688152, "step": 1109 }, { "epoch": 0.05883444200037103, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18945786.0, "logits/rejected": -20408336.0, "logps/chosen": -209.00396728515625, "logps/rejected": -454.3458251953125, "loss": 0.3482, "rewards/chosen": -0.0791749432682991, "rewards/margins": 1.5702484175562859, "rewards/rejected": -1.649423360824585, "step": 1110 }, { "epoch": 0.05888744600217317, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14794288.0, "logits/rejected": -29747193.6, "logps/chosen": -394.1700846354167, "logps/rejected": -402.1923828125, "loss": 0.3135, "rewards/chosen": 0.16851832469304404, "rewards/margins": 1.9116108616193135, "rewards/rejected": -1.7430925369262695, "step": 1111 }, { "epoch": 0.0589404500039753, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22482732.0, "logits/rejected": -50376624.0, "logps/chosen": -247.11624145507812, "logps/rejected": -731.3576049804688, "loss": 0.299, "rewards/chosen": 0.16943077743053436, "rewards/margins": 2.3549914807081223, "rewards/rejected": -2.185560703277588, "step": 1112 }, { "epoch": 0.058993454005777435, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49101845.333333336, "logits/rejected": -49414681.6, "logps/chosen": -404.0557047526042, "logps/rejected": -482.848974609375, "loss": 0.3325, "rewards/chosen": 0.00461578369140625, "rewards/margins": 1.4066472053527832, "rewards/rejected": -1.402031421661377, "step": 1113 }, { "epoch": 0.05904645800757957, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7232034.666666667, "logits/rejected": -50512067.2, "logps/chosen": -70.52462768554688, "logps/rejected": -374.115625, "loss": 0.3361, "rewards/chosen": 0.03329251209894816, "rewards/margins": 2.146877602736155, "rewards/rejected": -2.113585090637207, "step": 1114 }, { "epoch": 0.05909946200938171, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49685824.0, "logits/rejected": -36620608.0, "logps/chosen": -352.97979736328125, "logps/rejected": -127.16853332519531, "loss": 0.394, "rewards/chosen": 0.07074026763439178, "rewards/margins": 1.0041826218366623, "rewards/rejected": -0.9334423542022705, "step": 1115 }, { "epoch": 0.059152466011183846, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55727744.0, "logits/rejected": -491281.75, "logps/chosen": -272.9103597005208, "logps/rejected": -212.9708251953125, "loss": 0.3871, "rewards/chosen": -0.20157472292582193, "rewards/margins": 0.7947277863820394, "rewards/rejected": -0.9963025093078614, "step": 1116 }, { "epoch": 0.05920547001298598, "grad_norm": 92.5, "kl": 0.44913482666015625, "learning_rate": 5e-07, "logits/chosen": 27827160.0, "logits/rejected": -14016682.666666666, "logps/chosen": -1194.3653564453125, "logps/rejected": -259.11517333984375, "loss": 0.3394, "rewards/chosen": 0.5327696204185486, "rewards/margins": 1.270537793636322, "rewards/rejected": -0.7377681732177734, "step": 1117 }, { "epoch": 0.059258474014788114, "grad_norm": 66.5, "kl": 0.5005683898925781, "learning_rate": 5e-07, "logits/chosen": -13963785.333333334, "logits/rejected": -4887194.4, "logps/chosen": -603.908203125, "logps/rejected": -104.226708984375, "loss": 0.3363, "rewards/chosen": 0.3419698079427083, "rewards/margins": 1.3375413258870443, "rewards/rejected": -0.995571517944336, "step": 1118 }, { "epoch": 0.05931147801659025, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45338416.0, "logits/rejected": -30958048.0, "logps/chosen": -274.52099609375, "logps/rejected": -359.3620198567708, "loss": 0.3012, "rewards/chosen": 0.2098245769739151, "rewards/margins": 1.367095485329628, "rewards/rejected": -1.157270908355713, "step": 1119 }, { "epoch": 0.05936448201839239, "grad_norm": 94.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49854533.333333336, "logits/rejected": -138895216.0, "logps/chosen": -487.268798828125, "logps/rejected": -1822.5899658203125, "loss": 0.4825, "rewards/chosen": -0.25622057914733887, "rewards/margins": 0.9631756544113159, "rewards/rejected": -1.2193962335586548, "step": 1120 }, { "epoch": 0.059417486020194525, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90020288.0, "logits/rejected": -47585962.666666664, "logps/chosen": -399.82684326171875, "logps/rejected": -564.1112874348959, "loss": 0.2241, "rewards/chosen": 0.22499695420265198, "rewards/margins": 2.048739979664485, "rewards/rejected": -1.8237430254618328, "step": 1121 }, { "epoch": 0.05947049002199666, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -113966880.0, "logits/rejected": -19363976.0, "logps/chosen": -439.9554036458333, "logps/rejected": -276.0392333984375, "loss": 0.3551, "rewards/chosen": 0.025498447318871815, "rewards/margins": 1.0656311606367428, "rewards/rejected": -1.040132713317871, "step": 1122 }, { "epoch": 0.0595234940237988, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14061342.4, "logits/rejected": -30371738.666666668, "logps/chosen": -212.42158203125, "logps/rejected": -440.2696126302083, "loss": 0.4178, "rewards/chosen": -0.12174124717712402, "rewards/margins": 1.4215674877166748, "rewards/rejected": -1.5433087348937988, "step": 1123 }, { "epoch": 0.05957649802560093, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21423190.0, "logits/rejected": -8818965.0, "logps/chosen": -225.1970977783203, "logps/rejected": -245.2805938720703, "loss": 0.3882, "rewards/chosen": -0.07886219024658203, "rewards/margins": 1.4433317184448242, "rewards/rejected": -1.5221939086914062, "step": 1124 }, { "epoch": 0.05962950202740307, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3018609.3333333335, "logits/rejected": -1878380.8, "logps/chosen": -200.7713419596354, "logps/rejected": -198.10408935546874, "loss": 0.3751, "rewards/chosen": 0.22701225678126016, "rewards/margins": 0.9819760839144388, "rewards/rejected": -0.7549638271331787, "step": 1125 }, { "epoch": 0.059682506029205204, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18080008.0, "logits/rejected": -19262622.0, "logps/chosen": -191.8553466796875, "logps/rejected": -243.36785888671875, "loss": 0.39, "rewards/chosen": -0.057491499930620193, "rewards/margins": 0.9706088714301586, "rewards/rejected": -1.0281003713607788, "step": 1126 }, { "epoch": 0.05973551003100734, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12533661.0, "logits/rejected": -57671424.0, "logps/chosen": -170.42784118652344, "logps/rejected": -229.35447692871094, "loss": 0.3749, "rewards/chosen": 0.1118195578455925, "rewards/margins": 1.1817985102534294, "rewards/rejected": -1.069978952407837, "step": 1127 }, { "epoch": 0.05978851403280948, "grad_norm": 75.5, "kl": 1.0030803680419922, "learning_rate": 5e-07, "logits/chosen": 1701568.0, "logits/rejected": -8034951.5, "logps/chosen": -549.8992919921875, "logps/rejected": -290.7729797363281, "loss": 0.3406, "rewards/chosen": 0.4292663335800171, "rewards/margins": 1.4315996170043945, "rewards/rejected": -1.0023332834243774, "step": 1128 }, { "epoch": 0.059841518034611615, "grad_norm": 63.0, "kl": 0.10327529907226562, "learning_rate": 5e-07, "logits/chosen": -143358336.0, "logits/rejected": -39004944.0, "logps/chosen": -467.0880126953125, "logps/rejected": -437.8001708984375, "loss": 0.357, "rewards/chosen": -0.13958054780960083, "rewards/margins": 1.5672051310539246, "rewards/rejected": -1.7067856788635254, "step": 1129 }, { "epoch": 0.05989452203641375, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39609794.666666664, "logits/rejected": -3683998.5, "logps/chosen": -370.0332845052083, "logps/rejected": -176.08958435058594, "loss": 0.4246, "rewards/chosen": 0.08035647869110107, "rewards/margins": 1.136399269104004, "rewards/rejected": -1.0560427904129028, "step": 1130 }, { "epoch": 0.05994752603821588, "grad_norm": 61.5, "kl": 0.7655410766601562, "learning_rate": 5e-07, "logits/chosen": -24511108.0, "logits/rejected": -37840680.0, "logps/chosen": -351.0914306640625, "logps/rejected": -383.4716796875, "loss": 0.3418, "rewards/chosen": 0.0667797178030014, "rewards/margins": 2.215690240263939, "rewards/rejected": -2.1489105224609375, "step": 1131 }, { "epoch": 0.06000053004001802, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23562012.0, "logps/rejected": -323.9725341796875, "loss": 0.2015, "rewards/rejected": -1.497011661529541, "step": 1132 }, { "epoch": 0.06005353404182016, "grad_norm": 88.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17362846.4, "logits/rejected": -40332458.666666664, "logps/chosen": -700.411767578125, "logps/rejected": -137.72272745768228, "loss": 0.4324, "rewards/chosen": 0.15434842109680175, "rewards/margins": 0.6534822861353556, "rewards/rejected": -0.4991338650385539, "step": 1133 }, { "epoch": 0.060106538043622294, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28829322.0, "logits/rejected": -19810676.0, "logps/chosen": -238.98312377929688, "logps/rejected": -411.69207763671875, "loss": 0.3378, "rewards/chosen": -0.03466359153389931, "rewards/margins": 2.242117967456579, "rewards/rejected": -2.2767815589904785, "step": 1134 }, { "epoch": 0.06015954204542443, "grad_norm": 93.5, "kl": 2.1586952209472656, "learning_rate": 5e-07, "logits/chosen": -55701192.0, "logits/rejected": -8213762.5, "logps/chosen": -1274.6904296875, "logps/rejected": -155.07350158691406, "loss": 0.3158, "rewards/chosen": 0.5336257815361023, "rewards/margins": 1.9418088793754578, "rewards/rejected": -1.4081830978393555, "step": 1135 }, { "epoch": 0.06021254604722657, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 607665.25, "logits/rejected": -36124428.0, "logps/chosen": -75.74774932861328, "logps/rejected": -298.81103515625, "loss": 0.4345, "rewards/chosen": -0.25156164169311523, "rewards/margins": 0.597914457321167, "rewards/rejected": -0.8494760990142822, "step": 1136 }, { "epoch": 0.0602655500490287, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68773296.0, "logits/rejected": -21749626.0, "logps/chosen": -421.2328796386719, "logps/rejected": -291.8821716308594, "loss": 0.3932, "rewards/chosen": -0.02490997314453125, "rewards/margins": 0.964520275592804, "rewards/rejected": -0.9894302487373352, "step": 1137 }, { "epoch": 0.060318554050830836, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34382905.6, "logits/rejected": -39254469.333333336, "logps/chosen": -397.2389404296875, "logps/rejected": -535.611328125, "loss": 0.3825, "rewards/chosen": -0.09308013916015626, "rewards/margins": 2.247017923990885, "rewards/rejected": -2.3400980631510415, "step": 1138 }, { "epoch": 0.06037155805263297, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66860772.0, "logits/rejected": -44329512.0, "logps/chosen": -183.9888916015625, "logps/rejected": -287.8008728027344, "loss": 0.4221, "rewards/chosen": -0.1074254959821701, "rewards/margins": 0.8052763491868973, "rewards/rejected": -0.9127018451690674, "step": 1139 }, { "epoch": 0.06042456205443511, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20218156.0, "logits/rejected": -32697717.333333332, "logps/chosen": -370.8482666015625, "logps/rejected": -255.65045166015625, "loss": 0.3502, "rewards/chosen": -0.16981124877929688, "rewards/margins": 0.7679661909739176, "rewards/rejected": -0.9377774397532145, "step": 1140 }, { "epoch": 0.06047756605623725, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66962560.0, "logits/rejected": -34324512.0, "logps/chosen": -513.5082397460938, "logps/rejected": -280.25189208984375, "loss": 0.3957, "rewards/chosen": -0.18873482942581177, "rewards/margins": 1.0773507952690125, "rewards/rejected": -1.2660856246948242, "step": 1141 }, { "epoch": 0.060530570058039385, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12782880.8, "logits/rejected": -42282245.333333336, "logps/chosen": -160.27470703125, "logps/rejected": -352.2860107421875, "loss": 0.3796, "rewards/chosen": -0.0190952330827713, "rewards/margins": 1.6344030986229579, "rewards/rejected": -1.6534983317057292, "step": 1142 }, { "epoch": 0.060583574059841515, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58705008.0, "logits/rejected": -3056672.0, "logps/chosen": -606.9014078776041, "logps/rejected": -321.5008056640625, "loss": 0.3176, "rewards/chosen": 0.2179468870162964, "rewards/margins": 1.4194802045822144, "rewards/rejected": -1.201533317565918, "step": 1143 }, { "epoch": 0.06063657806164365, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13224814.0, "logits/rejected": -15841683.0, "logps/chosen": -244.72723388671875, "logps/rejected": -188.48406982421875, "loss": 0.3785, "rewards/chosen": 0.16525974869728088, "rewards/margins": 1.0742258131504059, "rewards/rejected": -0.908966064453125, "step": 1144 }, { "epoch": 0.06068958206344579, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59856731.428571425, "logits/rejected": -48688900.0, "logps/chosen": -352.11265345982144, "logps/rejected": -526.6947021484375, "loss": 0.4423, "rewards/chosen": 0.04825875588825771, "rewards/margins": 2.135670397962843, "rewards/rejected": -2.087411642074585, "step": 1145 }, { "epoch": 0.060742586065247926, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20542122.666666668, "logits/rejected": -74768320.0, "logps/chosen": -348.4477132161458, "logps/rejected": -541.7273559570312, "loss": 0.4182, "rewards/chosen": 0.003586381673812866, "rewards/margins": 1.5885015428066254, "rewards/rejected": -1.5849151611328125, "step": 1146 }, { "epoch": 0.06079559006705006, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29112532.0, "logits/rejected": -20511844.0, "logps/chosen": -712.052490234375, "logps/rejected": -378.196044921875, "loss": 0.2935, "rewards/chosen": 0.6360830068588257, "rewards/margins": 2.02936851978302, "rewards/rejected": -1.3932855129241943, "step": 1147 }, { "epoch": 0.0608485940688522, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50602992.0, "logits/rejected": -38368824.0, "logps/chosen": -394.4964904785156, "logps/rejected": -433.99249267578125, "loss": 0.3275, "rewards/chosen": -0.015321731567382812, "rewards/margins": 1.77567720413208, "rewards/rejected": -1.790998935699463, "step": 1148 }, { "epoch": 0.06090159807065434, "grad_norm": 63.75, "kl": 1.12408447265625, "learning_rate": 5e-07, "logits/chosen": -49220124.0, "logits/rejected": -16000376.0, "logps/chosen": -653.9144287109375, "logps/rejected": -221.09544372558594, "loss": 0.368, "rewards/chosen": 0.346771240234375, "rewards/margins": 1.4323012828826904, "rewards/rejected": -1.0855300426483154, "step": 1149 }, { "epoch": 0.06095460207245647, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10954154.666666666, "logits/rejected": -24646664.0, "logps/chosen": -317.73480224609375, "logps/rejected": -478.1333984375, "loss": 0.298, "rewards/chosen": 0.13006744782129923, "rewards/margins": 1.805549434820811, "rewards/rejected": -1.6754819869995117, "step": 1150 }, { "epoch": 0.061007606074258605, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -142486320.0, "logits/rejected": -61167850.666666664, "logps/chosen": -537.6593017578125, "logps/rejected": -459.9899495442708, "loss": 0.2858, "rewards/chosen": 0.22214815020561218, "rewards/margins": 2.0007207095623016, "rewards/rejected": -1.7785725593566895, "step": 1151 }, { "epoch": 0.06106061007606074, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47662800.0, "logits/rejected": 21630388.0, "logps/chosen": -519.5670166015625, "logps/rejected": -216.30734252929688, "loss": 0.35, "rewards/chosen": 0.1307426393032074, "rewards/margins": 1.3788319528102875, "rewards/rejected": -1.24808931350708, "step": 1152 }, { "epoch": 0.06111361407786288, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14631798.857142856, "logits/rejected": 177269.71875, "logps/chosen": -355.6220005580357, "logps/rejected": -72.01113891601562, "loss": 0.4611, "rewards/chosen": 0.04927706718444824, "rewards/margins": 0.9809497594833374, "rewards/rejected": -0.9316726922988892, "step": 1153 }, { "epoch": 0.06116661807966502, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38212774.4, "logits/rejected": -24082058.666666668, "logps/chosen": -230.9006591796875, "logps/rejected": -112.41780598958333, "loss": 0.428, "rewards/chosen": 0.06971221566200256, "rewards/margins": 0.7601016898949942, "rewards/rejected": -0.6903894742329916, "step": 1154 }, { "epoch": 0.061219622081467154, "grad_norm": 47.25, "kl": 0.22985458374023438, "learning_rate": 5e-07, "logits/chosen": -34250704.0, "logits/rejected": -5494885.2, "logps/chosen": -236.287109375, "logps/rejected": -144.2398681640625, "loss": 0.3708, "rewards/chosen": 0.0686148206392924, "rewards/margins": 0.9730320493380228, "rewards/rejected": -0.9044172286987304, "step": 1155 }, { "epoch": 0.061272626083269284, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22707685.333333332, "logits/rejected": -36687964.0, "logps/chosen": -222.79915364583334, "logps/rejected": -313.69970703125, "loss": 0.4155, "rewards/chosen": 0.09989865620930989, "rewards/margins": 1.344424565633138, "rewards/rejected": -1.2445259094238281, "step": 1156 }, { "epoch": 0.06132563008507142, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13618209.333333334, "logits/rejected": -24112434.0, "logps/chosen": -240.9488525390625, "logps/rejected": -101.17732238769531, "loss": 0.4342, "rewards/chosen": 0.19643878936767578, "rewards/margins": 0.6991720199584961, "rewards/rejected": -0.5027332305908203, "step": 1157 }, { "epoch": 0.06137863408687356, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16439912.0, "logits/rejected": -25340636.8, "logps/chosen": -413.807373046875, "logps/rejected": -253.9345947265625, "loss": 0.3342, "rewards/chosen": 0.1412998835245768, "rewards/margins": 1.328646723429362, "rewards/rejected": -1.187346839904785, "step": 1158 }, { "epoch": 0.061431638088675695, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13519894.0, "logits/rejected": 86670728.0, "logps/chosen": -493.04779052734375, "logps/rejected": -318.8182373046875, "loss": 0.3598, "rewards/chosen": 0.0020591765642166138, "rewards/margins": 1.5403173714876175, "rewards/rejected": -1.5382581949234009, "step": 1159 }, { "epoch": 0.06148464209047783, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39486104.0, "logits/rejected": -47098384.0, "logps/chosen": -152.50070190429688, "logps/rejected": -346.7188720703125, "loss": 0.4113, "rewards/chosen": -0.2710195779800415, "rewards/margins": 0.8796979188919067, "rewards/rejected": -1.1507174968719482, "step": 1160 }, { "epoch": 0.06153764609227997, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9246575.0, "logits/rejected": -22028993.333333332, "logps/chosen": -284.1475830078125, "logps/rejected": -195.2785847981771, "loss": 0.3204, "rewards/chosen": -0.26286011934280396, "rewards/margins": 0.9699425101280212, "rewards/rejected": -1.2328026294708252, "step": 1161 }, { "epoch": 0.0615906500940821, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72068432.0, "logits/rejected": -38605580.8, "logps/chosen": -600.2098388671875, "logps/rejected": -446.769921875, "loss": 0.2471, "rewards/chosen": 0.27641602357228595, "rewards/margins": 2.4424427111943565, "rewards/rejected": -2.1660266876220704, "step": 1162 }, { "epoch": 0.06164365409588424, "grad_norm": 76.0, "kl": 0.28002166748046875, "learning_rate": 5e-07, "logits/chosen": -53561628.8, "logits/rejected": -30016880.0, "logps/chosen": -550.789794921875, "logps/rejected": -181.2864786783854, "loss": 0.3808, "rewards/chosen": 0.3991473436355591, "rewards/margins": 1.140547267595927, "rewards/rejected": -0.7413999239603678, "step": 1163 }, { "epoch": 0.061696658097686374, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24914144.0, "logits/rejected": 20222006.0, "logps/chosen": -118.64042663574219, "logps/rejected": -100.47139739990234, "loss": 0.3904, "rewards/chosen": 0.022913552820682526, "rewards/margins": 0.9613253846764565, "rewards/rejected": -0.9384118318557739, "step": 1164 }, { "epoch": 0.06174966209948851, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46446154.666666664, "logits/rejected": -987647.0, "logps/chosen": -427.2162272135417, "logps/rejected": -360.8291320800781, "loss": 0.3619, "rewards/chosen": 0.3164141575495402, "rewards/margins": 1.8926539818445842, "rewards/rejected": -1.576239824295044, "step": 1165 }, { "epoch": 0.06180266610129065, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5798057.6, "logits/rejected": -15329185.333333334, "logps/chosen": -171.050830078125, "logps/rejected": -166.60010782877603, "loss": 0.4117, "rewards/chosen": 0.11616538763046265, "rewards/margins": 0.9355488499005636, "rewards/rejected": -0.819383462270101, "step": 1166 }, { "epoch": 0.061855670103092786, "grad_norm": 51.25, "kl": 0.28166961669921875, "learning_rate": 5e-07, "logits/chosen": -44582835.2, "logits/rejected": -50336256.0, "logps/chosen": -222.8728515625, "logps/rejected": -231.5697224934896, "loss": 0.3707, "rewards/chosen": 0.18997181653976442, "rewards/margins": 1.6440067807833354, "rewards/rejected": -1.454034964243571, "step": 1167 }, { "epoch": 0.06190867410489492, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14132090.666666666, "logits/rejected": -9007978.4, "logps/chosen": -316.8863118489583, "logps/rejected": -201.68043212890626, "loss": 0.3599, "rewards/chosen": -0.05631117026011149, "rewards/margins": 1.0179131110509236, "rewards/rejected": -1.0742242813110352, "step": 1168 }, { "epoch": 0.06196167810669705, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13455268.0, "logits/rejected": -7475732.8, "logps/chosen": -285.50262451171875, "logps/rejected": -82.42420043945313, "loss": 0.4177, "rewards/chosen": -0.10551885763804118, "rewards/margins": 0.5412694056828817, "rewards/rejected": -0.6467882633209229, "step": 1169 }, { "epoch": 0.06201468210849919, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 154461568.0, "logits/rejected": -62895628.8, "logps/chosen": -398.3551839192708, "logps/rejected": -266.112744140625, "loss": 0.3341, "rewards/chosen": -0.05697288612524668, "rewards/margins": 1.222807024916013, "rewards/rejected": -1.2797799110412598, "step": 1170 }, { "epoch": 0.06206768611030133, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83897610.66666667, "logits/rejected": -10206383.2, "logps/chosen": -548.2430419921875, "logps/rejected": -279.35390625, "loss": 0.319, "rewards/chosen": 0.08485921223958333, "rewards/margins": 1.3934146245320638, "rewards/rejected": -1.3085554122924805, "step": 1171 }, { "epoch": 0.062120690112103465, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14158505.333333334, "logits/rejected": -63600588.8, "logps/chosen": -132.62812296549478, "logps/rejected": -343.11962890625, "loss": 0.2678, "rewards/chosen": 0.325458288192749, "rewards/margins": 1.9367591381072997, "rewards/rejected": -1.6113008499145507, "step": 1172 }, { "epoch": 0.0621736941139056, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4148523.3333333335, "logits/rejected": -17698110.4, "logps/chosen": -48.86971537272135, "logps/rejected": -213.053271484375, "loss": 0.3367, "rewards/chosen": 0.22792251904805502, "rewards/margins": 1.2264487107594808, "rewards/rejected": -0.9985261917114258, "step": 1173 }, { "epoch": 0.06222669811570774, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8876042.666666666, "logits/rejected": -4542643.5, "logps/chosen": -302.21287027994794, "logps/rejected": -168.52603149414062, "loss": 0.4411, "rewards/chosen": -0.0005076080560684204, "rewards/margins": 1.111768826842308, "rewards/rejected": -1.1122764348983765, "step": 1174 }, { "epoch": 0.06227970211750987, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40449664.0, "logits/rejected": -19165546.666666668, "logps/chosen": -412.9613037109375, "logps/rejected": -398.8535970052083, "loss": 0.2361, "rewards/chosen": 0.24545976519584656, "rewards/margins": 2.197360883156459, "rewards/rejected": -1.951901117960612, "step": 1175 }, { "epoch": 0.062332706119312006, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53867188.0, "logits/rejected": 2988652.5714285714, "logps/chosen": -194.8800048828125, "logps/rejected": -326.9547642299107, "loss": 0.2757, "rewards/chosen": 0.39948272705078125, "rewards/margins": 1.7134246826171875, "rewards/rejected": -1.3139419555664062, "step": 1176 }, { "epoch": 0.062385710121114143, "grad_norm": 76.0, "kl": 0.024730682373046875, "learning_rate": 5e-07, "logits/chosen": 262901.5833333333, "logits/rejected": 10087683.0, "logps/chosen": -342.1581624348958, "logps/rejected": -244.38943481445312, "loss": 0.4242, "rewards/chosen": 0.20984782775243124, "rewards/margins": 0.8396879633267721, "rewards/rejected": -0.6298401355743408, "step": 1177 }, { "epoch": 0.06243871412291628, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62963460.0, "logits/rejected": -21488986.0, "logps/chosen": -730.123779296875, "logps/rejected": -253.13250732421875, "loss": 0.3628, "rewards/chosen": 0.1687929779291153, "rewards/margins": 1.3971405178308487, "rewards/rejected": -1.2283475399017334, "step": 1178 }, { "epoch": 0.06249171812471842, "grad_norm": 69.5, "kl": 0.52008056640625, "learning_rate": 5e-07, "logits/chosen": -87735200.0, "logits/rejected": -43607093.333333336, "logps/chosen": -1390.5830078125, "logps/rejected": -232.17626953125, "loss": 0.3304, "rewards/chosen": 0.7631500363349915, "rewards/margins": 1.6128012935320535, "rewards/rejected": -0.8496512571970621, "step": 1179 }, { "epoch": 0.06254472212652055, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 131270954.66666667, "logits/rejected": -14660988.8, "logps/chosen": -505.4666748046875, "logps/rejected": -355.00478515625, "loss": 0.3381, "rewards/chosen": 0.11435547471046448, "rewards/margins": 1.3398382246494294, "rewards/rejected": -1.2254827499389649, "step": 1180 }, { "epoch": 0.06259772612832269, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38405938.666666664, "logits/rejected": -7943744.0, "logps/chosen": -396.8387451171875, "logps/rejected": -172.89968872070312, "loss": 0.4161, "rewards/chosen": 0.10121689240137736, "rewards/margins": 1.2710914413134258, "rewards/rejected": -1.1698745489120483, "step": 1181 }, { "epoch": 0.06265073013012483, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9868210.666666666, "logits/rejected": 41938624.0, "logps/chosen": -382.7541097005208, "logps/rejected": -444.34873046875, "loss": 0.4012, "rewards/chosen": -1.0823291142781575, "rewards/margins": 0.5600933710734051, "rewards/rejected": -1.6424224853515625, "step": 1182 }, { "epoch": 0.06270373413192697, "grad_norm": 61.25, "kl": 0.862030029296875, "learning_rate": 5e-07, "logits/chosen": -31081113.14285714, "logits/rejected": -11552235.0, "logps/chosen": -271.1141357421875, "logps/rejected": -127.76346588134766, "loss": 0.5146, "rewards/chosen": -0.008106069905417306, "rewards/margins": 0.2545427211693355, "rewards/rejected": -0.2626487910747528, "step": 1183 }, { "epoch": 0.06275673813372909, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22701792.0, "logits/rejected": 16640817.0, "logps/chosen": -343.4409993489583, "logps/rejected": -138.445556640625, "loss": 0.4583, "rewards/chosen": 0.0033267935117085776, "rewards/margins": 0.6856731375058492, "rewards/rejected": -0.6823463439941406, "step": 1184 }, { "epoch": 0.06280974213553123, "grad_norm": 60.5, "kl": 0.23751068115234375, "learning_rate": 5e-07, "logits/chosen": -68685080.0, "logits/rejected": -27686734.0, "logps/chosen": -353.3921813964844, "logps/rejected": -170.167236328125, "loss": 0.3696, "rewards/chosen": 0.20179004967212677, "rewards/margins": 1.2055456191301346, "rewards/rejected": -1.0037555694580078, "step": 1185 }, { "epoch": 0.06286274613733336, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28802114.666666668, "logits/rejected": -41219481.6, "logps/chosen": -368.2364908854167, "logps/rejected": -373.27607421875, "loss": 0.3024, "rewards/chosen": -0.06149037182331085, "rewards/margins": 1.6502178877592086, "rewards/rejected": -1.7117082595825195, "step": 1186 }, { "epoch": 0.0629157501391355, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44329302.4, "logits/rejected": -1315416.6666666667, "logps/chosen": -331.97548828125, "logps/rejected": -558.740234375, "loss": 0.3949, "rewards/chosen": -0.2717933416366577, "rewards/margins": 1.8219865401585897, "rewards/rejected": -2.0937798817952475, "step": 1187 }, { "epoch": 0.06296875414093764, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13246104.0, "logits/rejected": 19060742.0, "logps/chosen": -179.107421875, "logps/rejected": -214.7181396484375, "loss": 0.4619, "rewards/chosen": 0.08040444056193034, "rewards/margins": 0.4731210271517436, "rewards/rejected": -0.39271658658981323, "step": 1188 }, { "epoch": 0.06302175814273978, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45836554.666666664, "logits/rejected": -27789926.4, "logps/chosen": -356.0089518229167, "logps/rejected": -195.81507568359376, "loss": 0.3512, "rewards/chosen": 0.11431783437728882, "rewards/margins": 1.2054643988609315, "rewards/rejected": -1.0911465644836427, "step": 1189 }, { "epoch": 0.06307476214454191, "grad_norm": 77.5, "kl": 2.0171966552734375, "learning_rate": 5e-07, "logits/chosen": -8940752.666666666, "logits/rejected": -45233936.0, "logps/chosen": -423.7974446614583, "logps/rejected": -308.84234619140625, "loss": 0.3979, "rewards/chosen": 0.4141850471496582, "rewards/margins": 1.9065953493118286, "rewards/rejected": -1.4924103021621704, "step": 1190 }, { "epoch": 0.06312776614634405, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4899329.0, "logits/rejected": -10206418.0, "logps/chosen": -261.9329833984375, "logps/rejected": -274.113037109375, "loss": 0.3608, "rewards/chosen": -0.04491925239562988, "rewards/margins": 1.422876000404358, "rewards/rejected": -1.4677952527999878, "step": 1191 }, { "epoch": 0.06318077014814619, "grad_norm": 61.0, "kl": 0.09383964538574219, "learning_rate": 5e-07, "logits/chosen": -41330424.0, "logits/rejected": -29693858.0, "logps/chosen": -254.1605682373047, "logps/rejected": -290.129638671875, "loss": 0.335, "rewards/chosen": 0.22101697325706482, "rewards/margins": 1.5182803571224213, "rewards/rejected": -1.2972633838653564, "step": 1192 }, { "epoch": 0.06323377414994832, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12872405.333333334, "logits/rejected": -56732872.0, "logps/chosen": -207.73590087890625, "logps/rejected": -291.1496276855469, "loss": 0.4126, "rewards/chosen": 0.12217124303181966, "rewards/margins": 1.3916698296864827, "rewards/rejected": -1.269498586654663, "step": 1193 }, { "epoch": 0.06328677815175046, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11676195.2, "logits/rejected": -25216626.666666668, "logps/chosen": -189.335400390625, "logps/rejected": -653.0882975260416, "loss": 0.3732, "rewards/chosen": -0.1173516035079956, "rewards/margins": 2.087826339403788, "rewards/rejected": -2.2051779429117837, "step": 1194 }, { "epoch": 0.0633397821535526, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2250518.25, "logits/rejected": -47414000.0, "logps/chosen": -248.58843994140625, "logps/rejected": -342.503173828125, "loss": 0.3459, "rewards/chosen": 0.0876302719116211, "rewards/margins": 1.455836296081543, "rewards/rejected": -1.3682060241699219, "step": 1195 }, { "epoch": 0.06339278615535474, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1711894.875, "logits/rejected": -3087733.714285714, "logps/chosen": -330.08258056640625, "logps/rejected": -190.78410993303572, "loss": 0.3712, "rewards/chosen": -0.41696473956108093, "rewards/margins": 0.2658570791993823, "rewards/rejected": -0.6828218187604632, "step": 1196 }, { "epoch": 0.06344579015715686, "grad_norm": 54.75, "kl": 0.23719024658203125, "learning_rate": 5e-07, "logits/chosen": -51407524.0, "logits/rejected": -26562950.0, "logps/chosen": -262.2796630859375, "logps/rejected": -382.5147399902344, "loss": 0.3474, "rewards/chosen": 0.33478957414627075, "rewards/margins": 1.6403520703315735, "rewards/rejected": -1.3055624961853027, "step": 1197 }, { "epoch": 0.063498794158959, "grad_norm": 58.75, "kl": 0.01277923583984375, "learning_rate": 5e-07, "logits/chosen": -21124179.2, "logits/rejected": -32095613.333333332, "logps/chosen": -726.48818359375, "logps/rejected": -271.97357177734375, "loss": 0.3179, "rewards/chosen": 0.6002252101898193, "rewards/margins": 2.01690951983134, "rewards/rejected": -1.4166843096415203, "step": 1198 }, { "epoch": 0.06355179816076113, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42277142.4, "logits/rejected": -34909421.333333336, "logps/chosen": -678.290869140625, "logps/rejected": -682.2640380859375, "loss": 0.2865, "rewards/chosen": 0.47238922119140625, "rewards/margins": 3.0822178522745767, "rewards/rejected": -2.6098286310831704, "step": 1199 }, { "epoch": 0.06360480216256327, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79789256.0, "logits/rejected": -29976130.666666668, "logps/chosen": -252.55050659179688, "logps/rejected": -148.47855631510416, "loss": 0.3167, "rewards/chosen": -0.047303102910518646, "rewards/margins": 1.0796275312701862, "rewards/rejected": -1.1269306341807048, "step": 1200 }, { "epoch": 0.06365780616436541, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4952457.0, "logits/rejected": -41431145.6, "logps/chosen": -147.7275390625, "logps/rejected": -367.325439453125, "loss": 0.3451, "rewards/chosen": -0.2236084540685018, "rewards/margins": 1.1461605469385783, "rewards/rejected": -1.36976900100708, "step": 1201 }, { "epoch": 0.06371081016616754, "grad_norm": 70.0, "kl": 0.47789764404296875, "learning_rate": 5e-07, "logits/chosen": -65694922.666666664, "logits/rejected": -41543812.0, "logps/chosen": -525.9415690104166, "logps/rejected": -171.0749969482422, "loss": 0.4216, "rewards/chosen": 0.16022319595019022, "rewards/margins": 1.0507372121016185, "rewards/rejected": -0.8905140161514282, "step": 1202 }, { "epoch": 0.06376381416796968, "grad_norm": 60.5, "kl": 0.16161346435546875, "learning_rate": 5e-07, "logits/chosen": -34762656.0, "logits/rejected": -28401616.0, "logps/chosen": -516.3601684570312, "logps/rejected": -274.0645751953125, "loss": 0.3899, "rewards/chosen": 0.17500686645507812, "rewards/margins": 1.091360330581665, "rewards/rejected": -0.9163534641265869, "step": 1203 }, { "epoch": 0.06381681816977182, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52702688.0, "logits/rejected": -34468684.8, "logps/chosen": -240.70491536458334, "logps/rejected": -300.774755859375, "loss": 0.3207, "rewards/chosen": 0.04617818693319956, "rewards/margins": 1.4440959344307582, "rewards/rejected": -1.3979177474975586, "step": 1204 }, { "epoch": 0.06386982217157396, "grad_norm": 53.5, "kl": 0.0863800048828125, "learning_rate": 5e-07, "logits/chosen": -35290992.0, "logits/rejected": -20754955.2, "logps/chosen": -392.9314778645833, "logps/rejected": -293.3906494140625, "loss": 0.3941, "rewards/chosen": -0.14503987630208334, "rewards/margins": 0.7341357866923014, "rewards/rejected": -0.8791756629943848, "step": 1205 }, { "epoch": 0.0639228261733761, "grad_norm": 81.5, "kl": 0.07635688781738281, "learning_rate": 5e-07, "logits/chosen": -21887019.2, "logits/rejected": -20661398.666666668, "logps/chosen": -646.837158203125, "logps/rejected": -316.21388753255206, "loss": 0.402, "rewards/chosen": 0.09606765508651734, "rewards/margins": 1.082686173915863, "rewards/rejected": -0.9866185188293457, "step": 1206 }, { "epoch": 0.06397583017517823, "grad_norm": 75.0, "kl": 0.3938636779785156, "learning_rate": 5e-07, "logits/chosen": -28793648.0, "logits/rejected": -2477245.5, "logps/chosen": -481.1345621744792, "logps/rejected": -160.50399780273438, "loss": 0.4223, "rewards/chosen": 0.20135962963104248, "rewards/margins": 1.0867397785186768, "rewards/rejected": -0.8853801488876343, "step": 1207 }, { "epoch": 0.06402883417698037, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15697000.0, "logits/rejected": 7716611.333333333, "logps/chosen": -61.379661560058594, "logps/rejected": -382.6353352864583, "loss": 0.2822, "rewards/chosen": -0.13033056259155273, "rewards/margins": 1.584032376607259, "rewards/rejected": -1.7143629391988118, "step": 1208 }, { "epoch": 0.0640818381787825, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26503846.4, "logits/rejected": -76687989.33333333, "logps/chosen": -200.23795166015626, "logps/rejected": -480.517822265625, "loss": 0.4028, "rewards/chosen": -0.00652519017457962, "rewards/margins": 1.3240539317329725, "rewards/rejected": -1.330579121907552, "step": 1209 }, { "epoch": 0.06413484218058463, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26100998.4, "logits/rejected": -31434794.666666668, "logps/chosen": -388.2751708984375, "logps/rejected": -410.3728841145833, "loss": 0.3485, "rewards/chosen": 0.19725487232208253, "rewards/margins": 2.1294880469640094, "rewards/rejected": -1.932233174641927, "step": 1210 }, { "epoch": 0.06418784618238677, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34560428.0, "logits/rejected": -13983510.0, "logps/chosen": -366.0400695800781, "logps/rejected": -224.62034606933594, "loss": 0.4028, "rewards/chosen": -0.028699595481157303, "rewards/margins": 0.9456442408263683, "rewards/rejected": -0.9743438363075256, "step": 1211 }, { "epoch": 0.0642408501841889, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18488266.0, "logits/rejected": -17418849.333333332, "logps/chosen": -370.7762451171875, "logps/rejected": -328.4329833984375, "loss": 0.2747, "rewards/chosen": -0.1105392575263977, "rewards/margins": 1.6012905637423198, "rewards/rejected": -1.7118298212687175, "step": 1212 }, { "epoch": 0.06429385418599104, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19811766.0, "logits/rejected": -6404357.0, "logps/chosen": -296.2402038574219, "logps/rejected": -64.0255126953125, "loss": 0.4144, "rewards/chosen": 0.013283926993608475, "rewards/margins": 0.7218688912689686, "rewards/rejected": -0.7085849642753601, "step": 1213 }, { "epoch": 0.06434685818779318, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7218995.5, "logits/rejected": -7497556.5, "logps/chosen": -156.48728942871094, "logps/rejected": -323.0675964355469, "loss": 0.4002, "rewards/chosen": -0.21172672510147095, "rewards/margins": 1.0373699069023132, "rewards/rejected": -1.2490966320037842, "step": 1214 }, { "epoch": 0.06439986218959531, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54732844.0, "logits/rejected": -5763463.0, "logps/chosen": -227.04156494140625, "logps/rejected": -95.45101165771484, "loss": 0.3959, "rewards/chosen": 0.10962266474962234, "rewards/margins": 0.9038379862904549, "rewards/rejected": -0.7942153215408325, "step": 1215 }, { "epoch": 0.06445286619139745, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54572058.666666664, "logits/rejected": -32785497.6, "logps/chosen": -526.7942708333334, "logps/rejected": -246.9365478515625, "loss": 0.3198, "rewards/chosen": 0.05565389494101206, "rewards/margins": 1.4213820825020473, "rewards/rejected": -1.3657281875610352, "step": 1216 }, { "epoch": 0.06450587019319959, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13607314.0, "logits/rejected": -32194962.285714287, "logps/chosen": -43.693519592285156, "logps/rejected": -296.1704799107143, "loss": 0.2637, "rewards/chosen": 0.12768784165382385, "rewards/margins": 1.5067109848771776, "rewards/rejected": -1.3790231432233537, "step": 1217 }, { "epoch": 0.06455887419500173, "grad_norm": 88.0, "kl": 0.3296928405761719, "learning_rate": 5e-07, "logits/chosen": -47485126.4, "logits/rejected": -24439914.666666668, "logps/chosen": -743.535595703125, "logps/rejected": -532.3583577473959, "loss": 0.3584, "rewards/chosen": 0.15452942848205567, "rewards/margins": 1.8997759977976483, "rewards/rejected": -1.7452465693155925, "step": 1218 }, { "epoch": 0.06461187819680386, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45257769.6, "logits/rejected": -9349073.333333334, "logps/chosen": -269.986767578125, "logps/rejected": -336.6580810546875, "loss": 0.4002, "rewards/chosen": 0.1372656226158142, "rewards/margins": 1.111710011959076, "rewards/rejected": -0.9744443893432617, "step": 1219 }, { "epoch": 0.064664882198606, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40630962.666666664, "logits/rejected": -15063508.0, "logps/chosen": -424.7552083333333, "logps/rejected": -90.23345947265625, "loss": 0.4076, "rewards/chosen": 0.1085486610730489, "rewards/margins": 1.6297900875409443, "rewards/rejected": -1.5212414264678955, "step": 1220 }, { "epoch": 0.06471788620040814, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12616890.666666666, "logits/rejected": -23860779.2, "logps/chosen": -250.16971842447916, "logps/rejected": -209.5225341796875, "loss": 0.3619, "rewards/chosen": 0.1388652821381887, "rewards/margins": 1.0056165715058645, "rewards/rejected": -0.8667512893676758, "step": 1221 }, { "epoch": 0.06477089020221026, "grad_norm": 51.0, "kl": 0.0020122528076171875, "learning_rate": 5e-07, "logits/chosen": -42043176.0, "logits/rejected": -17054794.0, "logps/chosen": -287.6903991699219, "logps/rejected": -120.60997009277344, "loss": 0.4245, "rewards/chosen": -0.028334617614746094, "rewards/margins": 0.6359148621559143, "rewards/rejected": -0.6642494797706604, "step": 1222 }, { "epoch": 0.0648238942040124, "grad_norm": 43.5, "kl": 0.0051059722900390625, "learning_rate": 5e-07, "logits/chosen": -20376813.333333332, "logits/rejected": -8387145.6, "logps/chosen": -195.42508951822916, "logps/rejected": -139.71240234375, "loss": 0.3456, "rewards/chosen": 0.08470357457796733, "rewards/margins": 1.1655822137991587, "rewards/rejected": -1.0808786392211913, "step": 1223 }, { "epoch": 0.06487689820581453, "grad_norm": 64.0, "kl": 0.2136077880859375, "learning_rate": 5e-07, "logits/chosen": -66632140.0, "logits/rejected": -18157242.0, "logps/chosen": -341.53985595703125, "logps/rejected": -269.9142761230469, "loss": 0.3787, "rewards/chosen": 0.19000034034252167, "rewards/margins": 1.1511539667844772, "rewards/rejected": -0.9611536264419556, "step": 1224 }, { "epoch": 0.06492990220761667, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 39383328.0, "logps/chosen": -328.84967041015625, "loss": 0.4764, "rewards/chosen": 0.09562008082866669, "step": 1225 }, { "epoch": 0.06498290620941881, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27781872.0, "logits/rejected": -36021734.4, "logps/chosen": -336.03407796223956, "logps/rejected": -377.7521484375, "loss": 0.3199, "rewards/chosen": 0.09274444977442424, "rewards/margins": 1.395926288763682, "rewards/rejected": -1.3031818389892578, "step": 1226 }, { "epoch": 0.06503591021122095, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2148430.0, "logits/rejected": 18045030.0, "logps/chosen": -307.7086181640625, "logps/rejected": -177.41171264648438, "loss": 0.4137, "rewards/chosen": -0.030572131276130676, "rewards/margins": 0.7363508194684982, "rewards/rejected": -0.7669229507446289, "step": 1227 }, { "epoch": 0.06508891421302308, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36729280.0, "logits/rejected": -36076922.666666664, "logps/chosen": -153.73104858398438, "logps/rejected": -259.20363362630206, "loss": 0.3361, "rewards/chosen": 0.04170875623822212, "rewards/margins": 1.1743478464583557, "rewards/rejected": -1.1326390902201335, "step": 1228 }, { "epoch": 0.06514191821482522, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26038408.0, "logits/rejected": -22893402.0, "logps/chosen": -211.87255859375, "logps/rejected": -130.4517822265625, "loss": 0.3679, "rewards/chosen": 0.08026465773582458, "rewards/margins": 1.2011272609233856, "rewards/rejected": -1.120862603187561, "step": 1229 }, { "epoch": 0.06519492221662736, "grad_norm": 89.5, "kl": 2.2064208984375, "learning_rate": 5e-07, "logits/chosen": -27132124.0, "logits/rejected": -7363795.0, "logps/chosen": -760.3038330078125, "logps/rejected": -153.56756591796875, "loss": 0.3962, "rewards/chosen": 0.3437977433204651, "rewards/margins": 1.408915936946869, "rewards/rejected": -1.0651181936264038, "step": 1230 }, { "epoch": 0.0652479262184295, "grad_norm": 68.0, "kl": 1.295074462890625, "learning_rate": 5e-07, "logits/chosen": -59943808.0, "logits/rejected": -16464584.0, "logps/chosen": -360.4847412109375, "logps/rejected": -301.6457926432292, "loss": 0.3836, "rewards/chosen": 0.1205450415611267, "rewards/margins": 1.9158885995546977, "rewards/rejected": -1.795343557993571, "step": 1231 }, { "epoch": 0.06530093022023163, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21127860.8, "logits/rejected": -34073528.0, "logps/chosen": -247.57421875, "logps/rejected": -416.1099853515625, "loss": 0.3385, "rewards/chosen": 0.19485108852386473, "rewards/margins": 2.060979692141215, "rewards/rejected": -1.8661286036173503, "step": 1232 }, { "epoch": 0.06535393422203377, "grad_norm": 46.0, "kl": 0.3719358444213867, "learning_rate": 5e-07, "logits/chosen": -40598422.85714286, "logits/rejected": -161836320.0, "logps/chosen": -208.96194893973214, "logps/rejected": -159.94760131835938, "loss": 0.4729, "rewards/chosen": 0.015394117150987898, "rewards/margins": 1.179157521043505, "rewards/rejected": -1.163763403892517, "step": 1233 }, { "epoch": 0.0654069382238359, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6688906.0, "logits/rejected": -18119576.0, "logps/chosen": -101.16030883789062, "logps/rejected": -320.008056640625, "loss": 0.3109, "rewards/chosen": 0.20068232218424478, "rewards/margins": 1.6235914866129557, "rewards/rejected": -1.4229091644287108, "step": 1234 }, { "epoch": 0.06545994222563803, "grad_norm": 76.0, "kl": 0.43727874755859375, "learning_rate": 5e-07, "logits/chosen": -40797608.0, "logits/rejected": -26118604.0, "logps/chosen": -436.6482340494792, "logps/rejected": -95.35296630859375, "loss": 0.3765, "rewards/chosen": 0.35525206724802655, "rewards/margins": 1.6129039923350017, "rewards/rejected": -1.257651925086975, "step": 1235 }, { "epoch": 0.06551294622744017, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -797314.8333333334, "logits/rejected": -12994456.8, "logps/chosen": -202.91532389322916, "logps/rejected": -223.8885498046875, "loss": 0.3426, "rewards/chosen": 0.18405330181121826, "rewards/margins": 1.424780821800232, "rewards/rejected": -1.2407275199890138, "step": 1236 }, { "epoch": 0.0655659502292423, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57534570.666666664, "logits/rejected": -28857139.2, "logps/chosen": -542.237060546875, "logps/rejected": -312.35625, "loss": 0.2793, "rewards/chosen": 0.35934754212697345, "rewards/margins": 1.7630215724309284, "rewards/rejected": -1.403674030303955, "step": 1237 }, { "epoch": 0.06561895423104444, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25385576.0, "logits/rejected": -54682620.0, "logps/chosen": -185.76045735677084, "logps/rejected": -444.9115905761719, "loss": 0.4088, "rewards/chosen": 0.05870495239893595, "rewards/margins": 1.6097395221392314, "rewards/rejected": -1.5510345697402954, "step": 1238 }, { "epoch": 0.06567195823284658, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10579780.0, "logits/rejected": -71049280.0, "logps/chosen": -621.7133178710938, "logps/rejected": -488.665771484375, "loss": 0.3244, "rewards/chosen": 0.1377224624156952, "rewards/margins": 1.69060680270195, "rewards/rejected": -1.5528843402862549, "step": 1239 }, { "epoch": 0.06572496223464872, "grad_norm": 56.25, "kl": 0.5022392272949219, "learning_rate": 5e-07, "logits/chosen": 43756556.0, "logits/rejected": -27729888.0, "logps/chosen": -364.60284423828125, "logps/rejected": -204.92042541503906, "loss": 0.4054, "rewards/chosen": -0.3743302524089813, "rewards/margins": 0.9051013290882111, "rewards/rejected": -1.2794315814971924, "step": 1240 }, { "epoch": 0.06577796623645085, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19665805.714285713, "logits/rejected": -7683841.0, "logps/chosen": -304.73849051339283, "logps/rejected": -430.96319580078125, "loss": 0.4854, "rewards/chosen": -0.04970702954701015, "rewards/margins": 0.8271972792489188, "rewards/rejected": -0.876904308795929, "step": 1241 }, { "epoch": 0.06583097023825299, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12815612.8, "logits/rejected": -53019781.333333336, "logps/chosen": -216.105810546875, "logps/rejected": -336.10882568359375, "loss": 0.4333, "rewards/chosen": -0.3527242183685303, "rewards/margins": 1.117999728520711, "rewards/rejected": -1.4707239468892415, "step": 1242 }, { "epoch": 0.06588397424005513, "grad_norm": 71.0, "kl": 0.0180511474609375, "learning_rate": 5e-07, "logits/chosen": -41269941.333333336, "logits/rejected": -10062725.6, "logps/chosen": -559.6426595052084, "logps/rejected": -236.69287109375, "loss": 0.4077, "rewards/chosen": -0.37338892618815106, "rewards/margins": 0.5481258710225423, "rewards/rejected": -0.9215147972106934, "step": 1243 }, { "epoch": 0.06593697824185726, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37294697.6, "logits/rejected": -1111968.0, "logps/chosen": -340.171728515625, "logps/rejected": -573.3576253255209, "loss": 0.3883, "rewards/chosen": -0.02851501703262329, "rewards/margins": 1.449051543076833, "rewards/rejected": -1.4775665601094563, "step": 1244 }, { "epoch": 0.0659899822436594, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62497126.4, "logits/rejected": 61892245.333333336, "logps/chosen": -420.652880859375, "logps/rejected": -521.6964925130209, "loss": 0.4677, "rewards/chosen": -0.38697266578674316, "rewards/margins": 0.9690333207448323, "rewards/rejected": -1.3560059865315754, "step": 1245 }, { "epoch": 0.06604298624546154, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39258104.0, "logits/rejected": -13439984.0, "logps/chosen": -212.20481872558594, "logps/rejected": -338.65032958984375, "loss": 0.3735, "rewards/chosen": -0.19207324087619781, "rewards/margins": 1.4422589391469955, "rewards/rejected": -1.6343321800231934, "step": 1246 }, { "epoch": 0.06609599024726368, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18073762.0, "logits/rejected": -6586329.0, "logps/chosen": -355.452880859375, "logps/rejected": -262.1337585449219, "loss": 0.3879, "rewards/chosen": 0.10180055350065231, "rewards/margins": 1.0464777871966362, "rewards/rejected": -0.9446772336959839, "step": 1247 }, { "epoch": 0.0661489942490658, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61686005.333333336, "logits/rejected": -35106339.2, "logps/chosen": -361.0360921223958, "logps/rejected": -272.807568359375, "loss": 0.2841, "rewards/chosen": 0.12238057454427083, "rewards/margins": 1.7086686452229818, "rewards/rejected": -1.586288070678711, "step": 1248 }, { "epoch": 0.06620199825086794, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10407768.0, "logits/rejected": -42846768.0, "logps/chosen": -153.0926513671875, "logps/rejected": -363.2381591796875, "loss": 0.3725, "rewards/chosen": -0.17493800818920135, "rewards/margins": 1.3853084594011307, "rewards/rejected": -1.560246467590332, "step": 1249 }, { "epoch": 0.06625500225267007, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3989753.0, "logits/rejected": -25866433.6, "logps/chosen": -122.43203735351562, "logps/rejected": -226.1787109375, "loss": 0.3455, "rewards/chosen": 0.2472987174987793, "rewards/margins": 1.2916603088378906, "rewards/rejected": -1.0443615913391113, "step": 1250 }, { "epoch": 0.06630800625447221, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6727916.0, "logits/rejected": -23770026.0, "logps/chosen": -168.6720733642578, "logps/rejected": -150.7919921875, "loss": 0.4348, "rewards/chosen": -0.1951509565114975, "rewards/margins": 0.6494661718606949, "rewards/rejected": -0.8446171283721924, "step": 1251 }, { "epoch": 0.06636101025627435, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1567147.0, "logits/rejected": -20406915.2, "logps/chosen": -109.62003580729167, "logps/rejected": -185.8995361328125, "loss": 0.3823, "rewards/chosen": -0.14637795090675354, "rewards/margins": 0.8036942422389984, "rewards/rejected": -0.950072193145752, "step": 1252 }, { "epoch": 0.06641401425807648, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36008208.0, "logits/rejected": -54600808.0, "logps/chosen": -219.79556274414062, "logps/rejected": -798.3987426757812, "loss": 0.298, "rewards/chosen": 0.04482211917638779, "rewards/margins": 2.222451113164425, "rewards/rejected": -2.177628993988037, "step": 1253 }, { "epoch": 0.06646701825987862, "grad_norm": 92.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26939868.8, "logits/rejected": 37033408.0, "logps/chosen": -283.788330078125, "logps/rejected": -456.8387858072917, "loss": 0.409, "rewards/chosen": -0.23180160522460938, "rewards/margins": 1.5227741241455077, "rewards/rejected": -1.7545757293701172, "step": 1254 }, { "epoch": 0.06652002226168076, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7553893.6, "logits/rejected": 3450898.3333333335, "logps/chosen": -101.11812744140624, "logps/rejected": -71.49792989095052, "loss": 0.4673, "rewards/chosen": -0.1576349377632141, "rewards/margins": 0.4755506873130798, "rewards/rejected": -0.633185625076294, "step": 1255 }, { "epoch": 0.0665730262634829, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22345412.0, "logits/rejected": -26254972.0, "logps/chosen": -327.928466796875, "logps/rejected": -378.56060791015625, "loss": 0.3394, "rewards/chosen": 0.05232276767492294, "rewards/margins": 2.04509749263525, "rewards/rejected": -1.9927747249603271, "step": 1256 }, { "epoch": 0.06662603026528503, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11137649.142857144, "logits/rejected": 5037755.5, "logps/chosen": -217.27130998883928, "logps/rejected": -356.296875, "loss": 0.4543, "rewards/chosen": 0.04657963344029018, "rewards/margins": 1.3310095412390572, "rewards/rejected": -1.284429907798767, "step": 1257 }, { "epoch": 0.06667903426708717, "grad_norm": 49.5, "kl": 0.21785831451416016, "learning_rate": 5e-07, "logits/chosen": -11597236.0, "logits/rejected": -19777730.0, "logps/chosen": -144.53494262695312, "logps/rejected": -153.55935668945312, "loss": 0.4321, "rewards/chosen": -0.3041988015174866, "rewards/margins": 0.6193341612815857, "rewards/rejected": -0.9235329627990723, "step": 1258 }, { "epoch": 0.06673203826888931, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40180348.8, "logits/rejected": -59164784.0, "logps/chosen": -651.49716796875, "logps/rejected": -508.6753743489583, "loss": 0.3317, "rewards/chosen": 0.2691948890686035, "rewards/margins": 2.0920414606730144, "rewards/rejected": -1.8228465716044109, "step": 1259 }, { "epoch": 0.06678504227069143, "grad_norm": 60.25, "kl": 0.1283721923828125, "learning_rate": 5e-07, "logits/chosen": -14653090.0, "logits/rejected": -3912506.0, "logps/chosen": -330.30511474609375, "logps/rejected": -260.90228271484375, "loss": 0.4541, "rewards/chosen": -0.2376188337802887, "rewards/margins": 0.43187764286994934, "rewards/rejected": -0.669496476650238, "step": 1260 }, { "epoch": 0.06683804627249357, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18872420.0, "logits/rejected": -25556198.0, "logps/chosen": -333.836669921875, "logps/rejected": -195.7034149169922, "loss": 0.3395, "rewards/chosen": 0.08333931118249893, "rewards/margins": 1.5175395980477333, "rewards/rejected": -1.4342002868652344, "step": 1261 }, { "epoch": 0.0668910502742957, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40119602.666666664, "logits/rejected": -7782971.0, "logps/chosen": -340.3063557942708, "logps/rejected": -197.61691284179688, "loss": 0.421, "rewards/chosen": 0.1992403268814087, "rewards/margins": 0.9450182914733887, "rewards/rejected": -0.74577796459198, "step": 1262 }, { "epoch": 0.06694405427609784, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37803173.333333336, "logits/rejected": -20812617.6, "logps/chosen": -540.2578125, "logps/rejected": -242.843359375, "loss": 0.3216, "rewards/chosen": 0.24216308196385702, "rewards/margins": 1.4467535932858784, "rewards/rejected": -1.2045905113220214, "step": 1263 }, { "epoch": 0.06699705827789998, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29310106.666666668, "logits/rejected": -17903297.6, "logps/chosen": -239.3310546875, "logps/rejected": -215.5171875, "loss": 0.323, "rewards/chosen": 0.2855173746744792, "rewards/margins": 1.4451282183329266, "rewards/rejected": -1.1596108436584474, "step": 1264 }, { "epoch": 0.06705006227970212, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33060880.0, "logits/rejected": -1620301.0, "logps/chosen": -181.61785888671875, "logps/rejected": -117.9804458618164, "loss": 0.3969, "rewards/chosen": -0.14817753434181213, "rewards/margins": 0.9507932364940643, "rewards/rejected": -1.0989707708358765, "step": 1265 }, { "epoch": 0.06710306628150425, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2118280.8, "logits/rejected": -51318256.0, "logps/chosen": -404.5715087890625, "logps/rejected": -723.0646158854166, "loss": 0.3775, "rewards/chosen": -0.09677520990371705, "rewards/margins": 2.141739054520925, "rewards/rejected": -2.238514264424642, "step": 1266 }, { "epoch": 0.06715607028330639, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6147624.4, "logits/rejected": -56368160.0, "logps/chosen": -274.3715576171875, "logps/rejected": -377.1992594401042, "loss": 0.3872, "rewards/chosen": -0.07062653303146363, "rewards/margins": 1.7200082580248515, "rewards/rejected": -1.7906347910563152, "step": 1267 }, { "epoch": 0.06720907428510853, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30988848.0, "logits/rejected": -16221850.666666666, "logps/chosen": -546.83466796875, "logps/rejected": -220.1557820638021, "loss": 0.3645, "rewards/chosen": 0.05228393077850342, "rewards/margins": 1.7397372007369996, "rewards/rejected": -1.687453269958496, "step": 1268 }, { "epoch": 0.06726207828691066, "grad_norm": 47.25, "kl": 0.05652046203613281, "learning_rate": 5e-07, "logits/chosen": 2341989.0, "logits/rejected": -87128096.0, "logps/chosen": -133.16578674316406, "logps/rejected": -421.25, "loss": 0.3448, "rewards/chosen": 0.15526843070983887, "rewards/margins": 1.4337447881698608, "rewards/rejected": -1.278476357460022, "step": 1269 }, { "epoch": 0.0673150822887128, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8241597.0, "logits/rejected": -39585928.0, "logps/chosen": -435.1309509277344, "logps/rejected": -244.7665812174479, "loss": 0.2668, "rewards/chosen": 0.16052724421024323, "rewards/margins": 1.5853430579106014, "rewards/rejected": -1.4248158137003581, "step": 1270 }, { "epoch": 0.06736808629051494, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51854936.0, "logits/rejected": -28165040.0, "logps/chosen": -375.5619201660156, "logps/rejected": -455.02862548828125, "loss": 0.3562, "rewards/chosen": -0.3567143678665161, "rewards/margins": 1.7447329759597778, "rewards/rejected": -2.101447343826294, "step": 1271 }, { "epoch": 0.06742109029231708, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55787269.333333336, "logits/rejected": -3679904.0, "logps/chosen": -214.86419677734375, "logps/rejected": -78.22017211914063, "loss": 0.4536, "rewards/chosen": -0.47047853469848633, "rewards/margins": 0.13902711868286133, "rewards/rejected": -0.6095056533813477, "step": 1272 }, { "epoch": 0.0674740942941192, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14796979.2, "logits/rejected": -51691776.0, "logps/chosen": -287.110986328125, "logps/rejected": -340.3780924479167, "loss": 0.3733, "rewards/chosen": -0.042767184972763064, "rewards/margins": 1.8796383758385975, "rewards/rejected": -1.9224055608113606, "step": 1273 }, { "epoch": 0.06752709829592134, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32957606.0, "logits/rejected": -12064805.714285715, "logps/chosen": -246.92562866210938, "logps/rejected": -187.35757882254464, "loss": 0.3416, "rewards/chosen": -0.14181672036647797, "rewards/margins": 0.707696197288377, "rewards/rejected": -0.8495129176548549, "step": 1274 }, { "epoch": 0.06758010229772347, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27784818.0, "logits/rejected": -20613508.0, "logps/chosen": -174.07293701171875, "logps/rejected": -153.11709594726562, "loss": 0.3639, "rewards/chosen": 0.19781893491744995, "rewards/margins": 1.3203343749046326, "rewards/rejected": -1.1225154399871826, "step": 1275 }, { "epoch": 0.06763310629952561, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7710986.0, "logits/rejected": -29995432.0, "logps/chosen": -120.48831939697266, "logps/rejected": -547.5694580078125, "loss": 0.3111, "rewards/chosen": 0.33356183767318726, "rewards/margins": 2.605343520641327, "rewards/rejected": -2.2717816829681396, "step": 1276 }, { "epoch": 0.06768611030132775, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64609552.0, "logits/rejected": -29594822.0, "logps/chosen": -395.34478759765625, "logps/rejected": -457.54949951171875, "loss": 0.291, "rewards/chosen": 0.23581963777542114, "rewards/margins": 2.135961949825287, "rewards/rejected": -1.9001423120498657, "step": 1277 }, { "epoch": 0.06773911430312989, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37222292.0, "logits/rejected": -18495030.0, "logps/chosen": -252.6370391845703, "logps/rejected": -213.74713134765625, "loss": 0.4181, "rewards/chosen": -0.03644237667322159, "rewards/margins": 0.709295891225338, "rewards/rejected": -0.7457382678985596, "step": 1278 }, { "epoch": 0.06779211830493202, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1412658.0, "logits/rejected": -22390288.0, "logps/chosen": -230.50482177734375, "logps/rejected": -270.283349609375, "loss": 0.3517, "rewards/chosen": 0.020891825358072918, "rewards/margins": 1.0771968523661297, "rewards/rejected": -1.0563050270080567, "step": 1279 }, { "epoch": 0.06784512230673416, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10444283.0, "logits/rejected": -36701552.0, "logps/chosen": -242.77085876464844, "logps/rejected": -572.35205078125, "loss": 0.3116, "rewards/chosen": 0.07932205498218536, "rewards/margins": 2.0606498271226883, "rewards/rejected": -1.981327772140503, "step": 1280 }, { "epoch": 0.0678981263085363, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8606.5, "logits/rejected": -14220793.6, "logps/chosen": -140.38480631510416, "logps/rejected": -283.5306640625, "loss": 0.3473, "rewards/chosen": -0.1311077872912089, "rewards/margins": 1.15624391635259, "rewards/rejected": -1.287351703643799, "step": 1281 }, { "epoch": 0.06795113031033843, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17743032.0, "logits/rejected": -9516007.0, "logps/chosen": -228.94183349609375, "logps/rejected": -308.5597839355469, "loss": 0.3755, "rewards/chosen": -0.3786768913269043, "rewards/margins": 1.275465965270996, "rewards/rejected": -1.6541428565979004, "step": 1282 }, { "epoch": 0.06800413431214057, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7278812.666666667, "logits/rejected": -76341094.4, "logps/chosen": -111.16309611002605, "logps/rejected": -369.762158203125, "loss": 0.366, "rewards/chosen": -0.16783583164215088, "rewards/margins": 0.9118953466415405, "rewards/rejected": -1.0797311782836914, "step": 1283 }, { "epoch": 0.06805713831394271, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10454561.0, "logits/rejected": -9689834.0, "logps/chosen": -123.51238250732422, "logps/rejected": -474.38262939453125, "loss": 0.3338, "rewards/chosen": 0.028717372566461563, "rewards/margins": 2.201325986534357, "rewards/rejected": -2.1726086139678955, "step": 1284 }, { "epoch": 0.06811014231574485, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13422432.0, "logits/rejected": -11109440.666666666, "logps/chosen": -249.24755859375, "logps/rejected": -211.61356608072916, "loss": 0.3971, "rewards/chosen": 0.1143419623374939, "rewards/margins": 1.1598923087120057, "rewards/rejected": -1.0455503463745117, "step": 1285 }, { "epoch": 0.06816314631754697, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9786163.333333334, "logits/rejected": -28751848.0, "logps/chosen": -303.99428304036456, "logps/rejected": -276.4831237792969, "loss": 0.3996, "rewards/chosen": 0.09967458248138428, "rewards/margins": 1.899819254875183, "rewards/rejected": -1.8001446723937988, "step": 1286 }, { "epoch": 0.0682161503193491, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38191040.0, "logits/rejected": -12586910.0, "logps/chosen": -206.21885681152344, "logps/rejected": -338.66363525390625, "loss": 0.3742, "rewards/chosen": -0.05957365036010742, "rewards/margins": 1.2011985778808594, "rewards/rejected": -1.2607722282409668, "step": 1287 }, { "epoch": 0.06826915432115124, "grad_norm": 54.75, "kl": 0.15743255615234375, "learning_rate": 5e-07, "logits/chosen": -29178550.0, "logits/rejected": -79165240.0, "logps/chosen": -367.2449645996094, "logps/rejected": -220.46078491210938, "loss": 0.359, "rewards/chosen": 0.1505664885044098, "rewards/margins": 1.2378126680850983, "rewards/rejected": -1.0872461795806885, "step": 1288 }, { "epoch": 0.06832215832295338, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23705556.8, "logits/rejected": -25902682.666666668, "logps/chosen": -83.76889038085938, "logps/rejected": -184.986572265625, "loss": 0.4068, "rewards/chosen": 0.10365833044052124, "rewards/margins": 1.0573797742525737, "rewards/rejected": -0.9537214438120524, "step": 1289 }, { "epoch": 0.06837516232475552, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71843584.0, "logits/rejected": -19704712.0, "logps/chosen": -326.7239685058594, "logps/rejected": -321.70648193359375, "loss": 0.382, "rewards/chosen": -0.07952285557985306, "rewards/margins": 1.8242368623614311, "rewards/rejected": -1.9037597179412842, "step": 1290 }, { "epoch": 0.06842816632655765, "grad_norm": 52.0, "kl": 0.10443878173828125, "learning_rate": 5e-07, "logits/chosen": -23739962.666666668, "logits/rejected": -24402816.0, "logps/chosen": -398.6328938802083, "logps/rejected": -374.5934326171875, "loss": 0.2966, "rewards/chosen": 0.2193237543106079, "rewards/margins": 1.976817536354065, "rewards/rejected": -1.757493782043457, "step": 1291 }, { "epoch": 0.06848117032835979, "grad_norm": 54.75, "kl": 0.032989501953125, "learning_rate": 5e-07, "logits/chosen": -81899481.6, "logits/rejected": -6690376.0, "logps/chosen": -318.0192138671875, "logps/rejected": -80.55077107747395, "loss": 0.4702, "rewards/chosen": 0.011535267531871795, "rewards/margins": 0.3392625853419304, "rewards/rejected": -0.3277273178100586, "step": 1292 }, { "epoch": 0.06853417433016193, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32483386.0, "logits/rejected": -7892819.5, "logps/chosen": -337.4515686035156, "logps/rejected": -112.50125122070312, "loss": 0.4221, "rewards/chosen": -0.028787702322006226, "rewards/margins": 0.7282072007656097, "rewards/rejected": -0.756994903087616, "step": 1293 }, { "epoch": 0.06858717833196407, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14388234.0, "logits/rejected": 9727440.0, "logps/chosen": -72.22710418701172, "logps/rejected": -243.8872833251953, "loss": 0.3761, "rewards/chosen": 0.21589255332946777, "rewards/margins": 1.1838761568069458, "rewards/rejected": -0.967983603477478, "step": 1294 }, { "epoch": 0.0686401823337662, "grad_norm": 53.5, "kl": 0.22267913818359375, "learning_rate": 5e-07, "logits/chosen": -16073904.0, "logits/rejected": 7418582.0, "logps/chosen": -267.86151123046875, "logps/rejected": -286.9417724609375, "loss": 0.3698, "rewards/chosen": 0.19188469648361206, "rewards/margins": 1.1161488890647888, "rewards/rejected": -0.9242641925811768, "step": 1295 }, { "epoch": 0.06869318633556834, "grad_norm": 71.5, "kl": 0.6556777954101562, "learning_rate": 5e-07, "logits/chosen": -35033936.0, "logits/rejected": 12890.875, "logps/chosen": -450.3779296875, "logps/rejected": -83.65880584716797, "loss": 0.4272, "rewards/chosen": 0.0940802792708079, "rewards/margins": 1.1930605868498485, "rewards/rejected": -1.0989803075790405, "step": 1296 }, { "epoch": 0.06874619033737048, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3301365.3333333335, "logits/rejected": -21645598.4, "logps/chosen": -357.1083984375, "logps/rejected": -351.547509765625, "loss": 0.3257, "rewards/chosen": -0.10592702031135559, "rewards/margins": 1.6268425643444062, "rewards/rejected": -1.7327695846557618, "step": 1297 }, { "epoch": 0.0687991943391726, "grad_norm": 61.5, "kl": 0.09914016723632812, "learning_rate": 5e-07, "logits/chosen": -34115776.0, "logits/rejected": -2008108.0, "logps/chosen": -397.91241455078125, "logps/rejected": -440.92608642578125, "loss": 0.3147, "rewards/chosen": 0.2803594172000885, "rewards/margins": 1.7377786934375763, "rewards/rejected": -1.4574192762374878, "step": 1298 }, { "epoch": 0.06885219834097474, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65992684.8, "logits/rejected": -13737842.666666666, "logps/chosen": -230.872509765625, "logps/rejected": -461.368896484375, "loss": 0.3718, "rewards/chosen": -0.02874908447265625, "rewards/margins": 1.7625698089599608, "rewards/rejected": -1.7913188934326172, "step": 1299 }, { "epoch": 0.06890520234277688, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13920331.2, "logits/rejected": -31940437.333333332, "logps/chosen": -642.1982421875, "logps/rejected": -228.74381510416666, "loss": 0.345, "rewards/chosen": 0.5550648689270019, "rewards/margins": 1.5258124192555744, "rewards/rejected": -0.9707475503285726, "step": 1300 }, { "epoch": 0.06895820634457901, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 40045370.666666664, "logits/rejected": -23556470.4, "logps/chosen": -411.1829427083333, "logps/rejected": -305.95068359375, "loss": 0.355, "rewards/chosen": -0.15364354848861694, "rewards/margins": 1.2978973984718323, "rewards/rejected": -1.4515409469604492, "step": 1301 }, { "epoch": 0.06901121034638115, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42053594.666666664, "logits/rejected": 1723790.0, "logps/chosen": -377.3709716796875, "logps/rejected": -312.6380310058594, "loss": 0.3954, "rewards/chosen": 0.18430805206298828, "rewards/margins": 1.4582544565200806, "rewards/rejected": -1.2739464044570923, "step": 1302 }, { "epoch": 0.06906421434818329, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 52014784.0, "logits/rejected": -12391448.8, "logps/chosen": -281.8742268880208, "logps/rejected": -199.02889404296874, "loss": 0.3391, "rewards/chosen": 0.08222554624080658, "rewards/margins": 1.2100291579961777, "rewards/rejected": -1.1278036117553711, "step": 1303 }, { "epoch": 0.06911721834998542, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54272744.0, "logits/rejected": -31051920.0, "logps/chosen": -516.2413330078125, "logps/rejected": -268.364501953125, "loss": 0.3431, "rewards/chosen": 0.008558657020330429, "rewards/margins": 1.7026103995740414, "rewards/rejected": -1.694051742553711, "step": 1304 }, { "epoch": 0.06917022235178756, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27274282.666666668, "logits/rejected": -18672336.0, "logps/chosen": -454.6581217447917, "logps/rejected": -138.200439453125, "loss": 0.3769, "rewards/chosen": 0.2481085459391276, "rewards/margins": 0.9559584299723307, "rewards/rejected": -0.7078498840332031, "step": 1305 }, { "epoch": 0.0692232263535897, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16630282.0, "logits/rejected": -16524340.0, "logps/chosen": -233.88743591308594, "logps/rejected": -352.302001953125, "loss": 0.386, "rewards/chosen": -0.27572470903396606, "rewards/margins": 1.412373960018158, "rewards/rejected": -1.688098669052124, "step": 1306 }, { "epoch": 0.06927623035539184, "grad_norm": 49.0, "kl": 0.12815475463867188, "learning_rate": 5e-07, "logits/chosen": 11754114.0, "logits/rejected": -13928609.0, "logps/chosen": -119.17948150634766, "logps/rejected": -218.35467529296875, "loss": 0.3891, "rewards/chosen": 0.0008271150290966034, "rewards/margins": 1.0918592624366283, "rewards/rejected": -1.0910321474075317, "step": 1307 }, { "epoch": 0.06932923435719397, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1159500.5, "logits/rejected": -25659694.0, "logps/chosen": -388.3917236328125, "logps/rejected": -336.82464599609375, "loss": 0.3372, "rewards/chosen": 0.13167190551757812, "rewards/margins": 1.9242870807647705, "rewards/rejected": -1.7926151752471924, "step": 1308 }, { "epoch": 0.06938223835899611, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13751350.0, "logits/rejected": -43612635.428571425, "logps/chosen": -80.06913757324219, "logps/rejected": -241.33349609375, "loss": 0.2956, "rewards/chosen": -0.08087845146656036, "rewards/margins": 0.9888529202767782, "rewards/rejected": -1.0697313717433385, "step": 1309 }, { "epoch": 0.06943524236079825, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9725226.666666666, "logits/rejected": -40930768.0, "logps/chosen": -237.31144205729166, "logps/rejected": -968.9268188476562, "loss": 0.3987, "rewards/chosen": 0.0016623834768931072, "rewards/margins": 2.8808022836844125, "rewards/rejected": -2.8791399002075195, "step": 1310 }, { "epoch": 0.06948824636260037, "grad_norm": 68.5, "kl": 0.3231391906738281, "learning_rate": 5e-07, "logits/chosen": -21762800.0, "logits/rejected": -66725304.0, "logps/chosen": -323.02657645089283, "logps/rejected": -516.7864379882812, "loss": 0.4297, "rewards/chosen": 0.12702463354383195, "rewards/margins": 2.4627425500324795, "rewards/rejected": -2.3357179164886475, "step": 1311 }, { "epoch": 0.06954125036440251, "grad_norm": 66.5, "kl": 0.42889404296875, "learning_rate": 5e-07, "logits/chosen": -27140690.666666668, "logits/rejected": -35339840.0, "logps/chosen": -294.2232666015625, "logps/rejected": -436.5753173828125, "loss": 0.372, "rewards/chosen": 0.2660580476125081, "rewards/margins": 2.181512196858724, "rewards/rejected": -1.9154541492462158, "step": 1312 }, { "epoch": 0.06959425436620464, "grad_norm": 63.5, "kl": 0.13791179656982422, "learning_rate": 5e-07, "logits/chosen": -23224634.666666668, "logits/rejected": -63248776.0, "logps/chosen": -390.1525472005208, "logps/rejected": -618.9137573242188, "loss": 0.3213, "rewards/chosen": 0.47994788487752277, "rewards/margins": 2.4853769143422446, "rewards/rejected": -2.0054290294647217, "step": 1313 }, { "epoch": 0.06964725836800678, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1968252.6666666667, "logits/rejected": -11663578.4, "logps/chosen": -208.30340576171875, "logps/rejected": -229.962255859375, "loss": 0.402, "rewards/chosen": 0.03541188935438792, "rewards/margins": 0.7247602055470148, "rewards/rejected": -0.6893483161926269, "step": 1314 }, { "epoch": 0.06970026236980892, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34745336.0, "logits/rejected": -21297722.666666668, "logps/chosen": -443.949951171875, "logps/rejected": -233.97857666015625, "loss": 0.3147, "rewards/chosen": -0.286917120218277, "rewards/margins": 1.0154917339483898, "rewards/rejected": -1.3024088541666667, "step": 1315 }, { "epoch": 0.06975326637161106, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3112776.0, "logits/rejected": -22799656.0, "logps/chosen": -301.8673502604167, "logps/rejected": -87.55125732421875, "loss": 0.3893, "rewards/chosen": 0.1470314065615336, "rewards/margins": 0.8694441358248393, "rewards/rejected": -0.7224127292633057, "step": 1316 }, { "epoch": 0.06980627037341319, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37242928.0, "logits/rejected": -8701728.0, "logps/chosen": -523.4742838541666, "logps/rejected": -159.24346923828125, "loss": 0.3442, "rewards/chosen": -0.13605042298634848, "rewards/margins": 1.2078641812006634, "rewards/rejected": -1.3439146041870118, "step": 1317 }, { "epoch": 0.06985927437521533, "grad_norm": 71.5, "kl": 1.1335563659667969, "learning_rate": 5e-07, "logits/chosen": -75909224.0, "logits/rejected": -4801235.0, "logps/chosen": -917.0936889648438, "logps/rejected": -350.76470947265625, "loss": 0.2717, "rewards/chosen": 0.611926257610321, "rewards/margins": 2.577777087688446, "rewards/rejected": -1.965850830078125, "step": 1318 }, { "epoch": 0.06991227837701747, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20546848.0, "logits/rejected": -13628504.0, "logps/chosen": -206.1517578125, "logps/rejected": -301.5359700520833, "loss": 0.4253, "rewards/chosen": -0.07049064636230469, "rewards/margins": 0.9308971087137858, "rewards/rejected": -1.0013877550760906, "step": 1319 }, { "epoch": 0.0699652823788196, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40800808.0, "logits/rejected": -48348580.0, "logps/chosen": -278.27593994140625, "logps/rejected": -257.31658935546875, "loss": 0.3644, "rewards/chosen": -0.043558888137340546, "rewards/margins": 1.4496411755681038, "rewards/rejected": -1.4932000637054443, "step": 1320 }, { "epoch": 0.07001828638062174, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27319178.666666668, "logits/rejected": 23443372.8, "logps/chosen": -214.89263916015625, "logps/rejected": -397.586474609375, "loss": 0.3192, "rewards/chosen": 0.19603323936462402, "rewards/margins": 1.564275598526001, "rewards/rejected": -1.368242359161377, "step": 1321 }, { "epoch": 0.07007129038242388, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29744636.8, "logits/rejected": -49701440.0, "logps/chosen": -363.2083740234375, "logps/rejected": -268.4847412109375, "loss": 0.3689, "rewards/chosen": 0.2697943925857544, "rewards/margins": 1.434983499844869, "rewards/rejected": -1.1651891072591145, "step": 1322 }, { "epoch": 0.07012429438422602, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31652952.0, "logits/rejected": -44190468.0, "logps/chosen": -383.9044494628906, "logps/rejected": -439.10052490234375, "loss": 0.3336, "rewards/chosen": 0.18874120712280273, "rewards/margins": 1.790126919746399, "rewards/rejected": -1.6013857126235962, "step": 1323 }, { "epoch": 0.07017729838602814, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5070632.0, "logits/rejected": -19675500.0, "logps/chosen": -240.51414489746094, "logps/rejected": -355.737060546875, "loss": 0.2661, "rewards/chosen": -0.04590683430433273, "rewards/margins": 1.7160417512059212, "rewards/rejected": -1.761948585510254, "step": 1324 }, { "epoch": 0.07023030238783028, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39461144.0, "logits/rejected": -6493678.0, "logps/chosen": -386.8591613769531, "logps/rejected": -245.28195190429688, "loss": 0.3854, "rewards/chosen": 0.06223946809768677, "rewards/margins": 1.1375542283058167, "rewards/rejected": -1.0753147602081299, "step": 1325 }, { "epoch": 0.07028330638963241, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35584248.0, "logits/rejected": -14900778.0, "logps/chosen": -657.4095865885416, "logps/rejected": -124.51011657714844, "loss": 0.4194, "rewards/chosen": 0.25328630208969116, "rewards/margins": 0.8456515669822693, "rewards/rejected": -0.5923652648925781, "step": 1326 }, { "epoch": 0.07033631039143455, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22247276.0, "logits/rejected": -28171494.4, "logps/chosen": -408.8734944661458, "logps/rejected": -286.07626953125, "loss": 0.3318, "rewards/chosen": -0.049383545915285744, "rewards/margins": 1.2448531140883763, "rewards/rejected": -1.294236660003662, "step": 1327 }, { "epoch": 0.07038931439323669, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14581776.0, "logits/rejected": -16745126.666666666, "logps/chosen": -156.3007568359375, "logps/rejected": -335.01418050130206, "loss": 0.4814, "rewards/chosen": -0.23009734153747557, "rewards/margins": 0.4452472845713297, "rewards/rejected": -0.6753446261088053, "step": 1328 }, { "epoch": 0.07044231839503882, "grad_norm": 61.0, "kl": 0.33284759521484375, "learning_rate": 5e-07, "logits/chosen": -32093277.333333332, "logits/rejected": -42290246.4, "logps/chosen": -405.1710205078125, "logps/rejected": -220.37265625, "loss": 0.3707, "rewards/chosen": -0.1539586385091146, "rewards/margins": 1.1582471529642742, "rewards/rejected": -1.3122057914733887, "step": 1329 }, { "epoch": 0.07049532239684096, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10100956.0, "logits/rejected": -31972530.666666668, "logps/chosen": -85.30015563964844, "logps/rejected": -221.26910400390625, "loss": 0.4072, "rewards/chosen": -0.10044307261705399, "rewards/margins": 0.4525144621729851, "rewards/rejected": -0.5529575347900391, "step": 1330 }, { "epoch": 0.0705483263986431, "grad_norm": 53.5, "kl": 0.19758987426757812, "learning_rate": 5e-07, "logits/chosen": -55240420.0, "logits/rejected": -15583026.0, "logps/chosen": -243.31092834472656, "logps/rejected": -240.8131103515625, "loss": 0.3042, "rewards/chosen": 0.35564810037612915, "rewards/margins": 1.8477984070777893, "rewards/rejected": -1.4921503067016602, "step": 1331 }, { "epoch": 0.07060133040044524, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22349357.333333332, "logits/rejected": -20915875.2, "logps/chosen": -216.45161946614584, "logps/rejected": -364.690185546875, "loss": 0.3176, "rewards/chosen": -0.10828610261281331, "rewards/margins": 1.530902663866679, "rewards/rejected": -1.6391887664794922, "step": 1332 }, { "epoch": 0.07065433440224737, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40296613.333333336, "logits/rejected": -17462043.2, "logps/chosen": -462.0391845703125, "logps/rejected": -172.9806884765625, "loss": 0.3509, "rewards/chosen": 0.04366860787073771, "rewards/margins": 1.0902011434237162, "rewards/rejected": -1.0465325355529784, "step": 1333 }, { "epoch": 0.07070733840404951, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38409560.0, "logits/rejected": -7579545.0, "logps/chosen": -436.5362955729167, "logps/rejected": -279.96734619140625, "loss": 0.3997, "rewards/chosen": 0.30504417419433594, "rewards/margins": 1.276197910308838, "rewards/rejected": -0.971153736114502, "step": 1334 }, { "epoch": 0.07076034240585165, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1521744.3333333333, "logits/rejected": 8766161.0, "logps/chosen": -117.89876302083333, "logps/rejected": -182.76295471191406, "loss": 0.4489, "rewards/chosen": 0.121859610080719, "rewards/margins": 0.5875574946403503, "rewards/rejected": -0.46569788455963135, "step": 1335 }, { "epoch": 0.07081334640765377, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47680341.333333336, "logits/rejected": -28296224.0, "logps/chosen": -576.3170979817709, "logps/rejected": -261.5906494140625, "loss": 0.3298, "rewards/chosen": 0.20058085521062216, "rewards/margins": 1.4372453411420185, "rewards/rejected": -1.2366644859313964, "step": 1336 }, { "epoch": 0.07086635040945591, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51743733.333333336, "logits/rejected": -10629530.4, "logps/chosen": -381.3264973958333, "logps/rejected": -336.1861572265625, "loss": 0.3099, "rewards/chosen": 0.2185204823811849, "rewards/margins": 1.530925210316976, "rewards/rejected": -1.312404727935791, "step": 1337 }, { "epoch": 0.07091935441125805, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 133234592.0, "logits/rejected": -14122988.0, "logps/chosen": -454.72637939453125, "logps/rejected": -336.90334065755206, "loss": 0.2266, "rewards/chosen": -0.09366302192211151, "rewards/margins": 2.0972509731849036, "rewards/rejected": -2.190913995107015, "step": 1338 }, { "epoch": 0.07097235841306018, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53767834.666666664, "logits/rejected": -57170982.4, "logps/chosen": -440.1416015625, "logps/rejected": -442.558740234375, "loss": 0.3062, "rewards/chosen": -0.08980762958526611, "rewards/margins": 1.8472281217575073, "rewards/rejected": -1.9370357513427734, "step": 1339 }, { "epoch": 0.07102536241486232, "grad_norm": 77.5, "kl": 1.6277179718017578, "learning_rate": 5e-07, "logits/chosen": -26129446.85714286, "logits/rejected": -74492128.0, "logps/chosen": -486.53041294642856, "logps/rejected": -631.6063232421875, "loss": 0.4087, "rewards/chosen": 0.39004250935145784, "rewards/margins": 2.4177524702889577, "rewards/rejected": -2.0277099609375, "step": 1340 }, { "epoch": 0.07107836641666446, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20199932.0, "logits/rejected": 2997788.0, "logps/chosen": -210.9715779622396, "logps/rejected": -176.703173828125, "loss": 0.3482, "rewards/chosen": -0.1314126451810201, "rewards/margins": 1.0820457975069682, "rewards/rejected": -1.2134584426879882, "step": 1341 }, { "epoch": 0.0711313704184666, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14946225.333333334, "logits/rejected": -41829747.2, "logps/chosen": -232.0726114908854, "logps/rejected": -473.19462890625, "loss": 0.2917, "rewards/chosen": -0.1929672360420227, "rewards/margins": 1.7146337389945985, "rewards/rejected": -1.9076009750366212, "step": 1342 }, { "epoch": 0.07118437442026873, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21457420.8, "logits/rejected": -48263237.333333336, "logps/chosen": -61.58878173828125, "logps/rejected": -255.41451009114584, "loss": 0.3966, "rewards/chosen": 0.03917209506034851, "rewards/margins": 1.204788448413213, "rewards/rejected": -1.1656163533528645, "step": 1343 }, { "epoch": 0.07123737842207087, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60259928.0, "logits/rejected": -34620660.571428575, "logps/chosen": -382.33050537109375, "logps/rejected": -464.978515625, "loss": 0.1992, "rewards/chosen": -0.38843995332717896, "rewards/margins": 1.5523537652833121, "rewards/rejected": -1.940793718610491, "step": 1344 }, { "epoch": 0.071290382423873, "grad_norm": 50.75, "kl": 0.5417327880859375, "learning_rate": 5e-07, "logits/chosen": -36619801.6, "logits/rejected": -44123992.0, "logps/chosen": -355.4455810546875, "logps/rejected": -341.7367757161458, "loss": 0.3388, "rewards/chosen": 0.2754473924636841, "rewards/margins": 2.2006573915481566, "rewards/rejected": -1.9252099990844727, "step": 1345 }, { "epoch": 0.07134338642567514, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61204998.4, "logits/rejected": -71179056.0, "logps/chosen": -191.0286865234375, "logps/rejected": -634.3099365234375, "loss": 0.3954, "rewards/chosen": -0.3360142707824707, "rewards/margins": 2.1800033251444497, "rewards/rejected": -2.5160175959269204, "step": 1346 }, { "epoch": 0.07139639042747728, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37552250.666666664, "logits/rejected": 6006206.0, "logps/chosen": -277.4652913411458, "logps/rejected": -89.05980682373047, "loss": 0.4431, "rewards/chosen": 0.2368822693824768, "rewards/margins": 0.4738280922174454, "rewards/rejected": -0.23694582283496857, "step": 1347 }, { "epoch": 0.07144939442927942, "grad_norm": 67.0, "kl": 0.0172576904296875, "learning_rate": 5e-07, "logits/chosen": 9395196.0, "logits/rejected": -424919.3125, "logps/chosen": -278.08058675130206, "logps/rejected": -134.8612518310547, "loss": 0.4142, "rewards/chosen": 0.241012970606486, "rewards/margins": 0.981780211130778, "rewards/rejected": -0.740767240524292, "step": 1348 }, { "epoch": 0.07150239843108154, "grad_norm": 68.0, "kl": 0.5666694641113281, "learning_rate": 5e-07, "logits/chosen": -20261780.0, "logits/rejected": -34049308.0, "logps/chosen": -785.2999877929688, "logps/rejected": -300.59912109375, "loss": 0.357, "rewards/chosen": 0.43345561623573303, "rewards/margins": 1.373269110918045, "rewards/rejected": -0.939813494682312, "step": 1349 }, { "epoch": 0.07155540243288368, "grad_norm": 52.5, "kl": 0.4181556701660156, "learning_rate": 5e-07, "logits/chosen": -55075320.0, "logits/rejected": -27147998.0, "logps/chosen": -301.9873352050781, "logps/rejected": -633.9800415039062, "loss": 0.3412, "rewards/chosen": 0.13166466355323792, "rewards/margins": 2.0328284800052643, "rewards/rejected": -1.9011638164520264, "step": 1350 }, { "epoch": 0.07160840643468581, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34824053.333333336, "logits/rejected": -42104832.0, "logps/chosen": -241.40116373697916, "logps/rejected": -329.128466796875, "loss": 0.2922, "rewards/chosen": 0.26826198895772296, "rewards/margins": 1.7172439416249592, "rewards/rejected": -1.4489819526672363, "step": 1351 }, { "epoch": 0.07166141043648795, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35299830.4, "logits/rejected": -11213024.0, "logps/chosen": -300.140673828125, "logps/rejected": -399.7249348958333, "loss": 0.4023, "rewards/chosen": -0.019645354151725768, "rewards/margins": 1.8604750821987788, "rewards/rejected": -1.8801204363505046, "step": 1352 }, { "epoch": 0.07171441443829009, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6097715.333333333, "logits/rejected": -4021137.6, "logps/chosen": -210.38525390625, "logps/rejected": -120.6335205078125, "loss": 0.4078, "rewards/chosen": 0.07625263929367065, "rewards/margins": 0.6472969889640808, "rewards/rejected": -0.5710443496704102, "step": 1353 }, { "epoch": 0.07176741844009223, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 423187.0, "logits/rejected": -38881958.4, "logps/chosen": -88.52809651692708, "logps/rejected": -265.4136474609375, "loss": 0.3434, "rewards/chosen": -0.15377254287401834, "rewards/margins": 1.246901418765386, "rewards/rejected": -1.4006739616394044, "step": 1354 }, { "epoch": 0.07182042244189436, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11633420.8, "logits/rejected": -6763422.666666667, "logps/chosen": -479.201806640625, "logps/rejected": -446.3260498046875, "loss": 0.4108, "rewards/chosen": -0.02615966796875, "rewards/margins": 1.0695099194844564, "rewards/rejected": -1.0956695874532063, "step": 1355 }, { "epoch": 0.0718734264436965, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31959484.0, "logits/rejected": -15732185.0, "logps/chosen": -284.00146484375, "logps/rejected": -340.06085205078125, "loss": 0.3229, "rewards/chosen": -0.08215923607349396, "rewards/margins": 2.0432830303907394, "rewards/rejected": -2.1254422664642334, "step": 1356 }, { "epoch": 0.07192643044549864, "grad_norm": 58.25, "kl": 1.0233154296875, "learning_rate": 5e-07, "logits/chosen": -26402720.0, "logits/rejected": -10088982.0, "logps/chosen": -621.745361328125, "logps/rejected": -195.25018310546875, "loss": 0.3273, "rewards/chosen": 0.4227561056613922, "rewards/margins": 1.6125278770923615, "rewards/rejected": -1.1897717714309692, "step": 1357 }, { "epoch": 0.07197943444730077, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18861928.0, "logits/rejected": -20541981.333333332, "logps/chosen": -112.16316986083984, "logps/rejected": -197.76155598958334, "loss": 0.3565, "rewards/chosen": 0.044112011790275574, "rewards/margins": 0.9712748179833094, "rewards/rejected": -0.9271628061930338, "step": 1358 }, { "epoch": 0.07203243844910291, "grad_norm": 65.5, "kl": 1.2010116577148438, "learning_rate": 5e-07, "logits/chosen": 12782552.8, "logits/rejected": -18356888.0, "logps/chosen": -279.970703125, "logps/rejected": -231.5642293294271, "loss": 0.4081, "rewards/chosen": 0.30531837940216067, "rewards/margins": 1.14768594900767, "rewards/rejected": -0.8423675696055094, "step": 1359 }, { "epoch": 0.07208544245090505, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20848878.0, "logits/rejected": 7419623.0, "logps/chosen": -241.96688842773438, "logps/rejected": -244.34963989257812, "loss": 0.409, "rewards/chosen": -0.2745288908481598, "rewards/margins": 0.961153119802475, "rewards/rejected": -1.2356820106506348, "step": 1360 }, { "epoch": 0.07213844645270719, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65195864.0, "logits/rejected": -11389026.285714285, "logps/chosen": -645.2569580078125, "logps/rejected": -247.12974330357142, "loss": 0.3121, "rewards/chosen": 0.8822388052940369, "rewards/margins": 1.79709278685706, "rewards/rejected": -0.9148539815630231, "step": 1361 }, { "epoch": 0.07219145045450931, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7745273.0, "logits/rejected": -33530098.285714287, "logps/chosen": -44.006019592285156, "logps/rejected": -347.0818568638393, "loss": 0.2877, "rewards/chosen": 0.4304542541503906, "rewards/margins": 1.6357735225132533, "rewards/rejected": -1.2053192683628626, "step": 1362 }, { "epoch": 0.07224445445631145, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14345000.0, "logits/rejected": -19253892.8, "logps/chosen": -62.783223470052086, "logps/rejected": -582.684521484375, "loss": 0.286, "rewards/chosen": -0.287319819132487, "rewards/margins": 2.075393613179525, "rewards/rejected": -2.362713432312012, "step": 1363 }, { "epoch": 0.07229745845811358, "grad_norm": 59.25, "kl": 0.0409088134765625, "learning_rate": 5e-07, "logits/chosen": -12919066.0, "logits/rejected": 21702912.0, "logps/chosen": -227.74429321289062, "logps/rejected": -477.14959716796875, "loss": 0.3398, "rewards/chosen": 0.1104377806186676, "rewards/margins": 1.5631023943424225, "rewards/rejected": -1.4526646137237549, "step": 1364 }, { "epoch": 0.07235046245991572, "grad_norm": 73.0, "kl": 0.19956207275390625, "learning_rate": 5e-07, "logits/chosen": -48257212.8, "logits/rejected": -31164880.0, "logps/chosen": -462.52880859375, "logps/rejected": -455.7065022786458, "loss": 0.3746, "rewards/chosen": 0.19346131086349488, "rewards/margins": 1.8846676071484882, "rewards/rejected": -1.6912062962849934, "step": 1365 }, { "epoch": 0.07240346646171786, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36153872.0, "logits/rejected": -5852999.2, "logps/chosen": -242.8331095377604, "logps/rejected": -138.397314453125, "loss": 0.4282, "rewards/chosen": -0.21864763895670572, "rewards/margins": 0.4133158047993978, "rewards/rejected": -0.6319634437561035, "step": 1366 }, { "epoch": 0.07245647046352, "grad_norm": 61.5, "kl": 0.4644660949707031, "learning_rate": 5e-07, "logits/chosen": -39947648.0, "logits/rejected": 19596325.333333332, "logps/chosen": -278.430859375, "logps/rejected": -343.0845947265625, "loss": 0.4212, "rewards/chosen": 0.07665314674377441, "rewards/margins": 1.0706508795420329, "rewards/rejected": -0.9939977327982584, "step": 1367 }, { "epoch": 0.07250947446532213, "grad_norm": 134.0, "kl": 0.5841026306152344, "learning_rate": 5e-07, "logits/chosen": -22729856.0, "logits/rejected": -50270104.0, "logps/chosen": -250.00885881696428, "logps/rejected": -594.904541015625, "loss": 0.4746, "rewards/chosen": -0.10289241586412702, "rewards/margins": 3.0288886853626797, "rewards/rejected": -3.1317811012268066, "step": 1368 }, { "epoch": 0.07256247846712427, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29279662.0, "logits/rejected": -32557798.0, "logps/chosen": -421.4819641113281, "logps/rejected": -227.62734985351562, "loss": 0.4635, "rewards/chosen": -0.315449982881546, "rewards/margins": 0.3012807071208954, "rewards/rejected": -0.6167306900024414, "step": 1369 }, { "epoch": 0.0726154824689264, "grad_norm": 51.0, "kl": 0.4987449645996094, "learning_rate": 5e-07, "logits/chosen": -11798691.0, "logits/rejected": -22692060.0, "logps/chosen": -191.061767578125, "logps/rejected": -387.20916748046875, "loss": 0.2901, "rewards/chosen": 0.3437235951423645, "rewards/margins": 2.3102641701698303, "rewards/rejected": -1.9665405750274658, "step": 1370 }, { "epoch": 0.07266848647072854, "grad_norm": 66.0, "kl": 0.36293792724609375, "learning_rate": 5e-07, "logits/chosen": -46368908.8, "logits/rejected": -26030592.0, "logps/chosen": -459.954931640625, "logps/rejected": -321.67677815755206, "loss": 0.4055, "rewards/chosen": 0.07285080552101135, "rewards/margins": 1.2813813110192616, "rewards/rejected": -1.2085305054982503, "step": 1371 }, { "epoch": 0.07272149047253068, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11783916.0, "logits/rejected": -33669472.0, "logps/chosen": -155.81666564941406, "logps/rejected": -290.57122802734375, "loss": 0.3597, "rewards/chosen": 0.19953016936779022, "rewards/margins": 1.3698218315839767, "rewards/rejected": -1.1702916622161865, "step": 1372 }, { "epoch": 0.07277449447433282, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9028260.8, "logits/rejected": -9295698.0, "logps/chosen": -195.368115234375, "logps/rejected": -150.6817626953125, "loss": 0.4158, "rewards/chosen": 0.0626289427280426, "rewards/margins": 0.9242480178674063, "rewards/rejected": -0.8616190751393636, "step": 1373 }, { "epoch": 0.07282749847613494, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93689648.0, "logits/rejected": -767205.125, "logps/chosen": -322.3277893066406, "logps/rejected": -165.1678466796875, "loss": 0.3828, "rewards/chosen": 0.1709280014038086, "rewards/margins": 0.9911086559295654, "rewards/rejected": -0.8201806545257568, "step": 1374 }, { "epoch": 0.07288050247793708, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44552236.0, "logits/rejected": -23744638.0, "logps/chosen": -343.1134033203125, "logps/rejected": -343.16729736328125, "loss": 0.3776, "rewards/chosen": 0.13671454787254333, "rewards/margins": 1.054976910352707, "rewards/rejected": -0.9182623624801636, "step": 1375 }, { "epoch": 0.07293350647973922, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -160947648.0, "logits/rejected": -12770440.0, "logps/chosen": -415.554931640625, "logps/rejected": -246.62955729166666, "loss": 0.3285, "rewards/chosen": 0.3253433108329773, "rewards/margins": 1.3404757380485535, "rewards/rejected": -1.0151324272155762, "step": 1376 }, { "epoch": 0.07298651048154135, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43846102.4, "logits/rejected": -44123469.333333336, "logps/chosen": -270.172802734375, "logps/rejected": -222.13934326171875, "loss": 0.3747, "rewards/chosen": 0.13800674676895142, "rewards/margins": 1.4080552856127422, "rewards/rejected": -1.2700485388437908, "step": 1377 }, { "epoch": 0.07303951448334349, "grad_norm": 63.5, "kl": 1.1789016723632812, "learning_rate": 5e-07, "logits/chosen": -87510393.6, "logits/rejected": -5819684.666666667, "logps/chosen": -542.7482421875, "logps/rejected": -149.8122762044271, "loss": 0.3972, "rewards/chosen": 0.29455742835998533, "rewards/margins": 1.5402657349904376, "rewards/rejected": -1.2457083066304524, "step": 1378 }, { "epoch": 0.07309251848514563, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12867548.0, "logits/rejected": -26975240.0, "logps/chosen": -213.62429809570312, "logps/rejected": -285.1156921386719, "loss": 0.3658, "rewards/chosen": 0.04918556660413742, "rewards/margins": 1.2687484547495842, "rewards/rejected": -1.2195628881454468, "step": 1379 }, { "epoch": 0.07314552248694776, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -112317680.0, "logits/rejected": -27773707.42857143, "logps/chosen": -373.8260803222656, "logps/rejected": -171.89794921875, "loss": 0.3757, "rewards/chosen": -0.5995880365371704, "rewards/margins": 0.12549783502306255, "rewards/rejected": -0.725085871560233, "step": 1380 }, { "epoch": 0.0731985264887499, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54803130.666666664, "logits/rejected": -36555324.0, "logps/chosen": -327.7985026041667, "logps/rejected": -264.1063232421875, "loss": 0.3807, "rewards/chosen": 0.23395133018493652, "rewards/margins": 1.681165099143982, "rewards/rejected": -1.4472137689590454, "step": 1381 }, { "epoch": 0.07325153049055204, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55391528.0, "logits/rejected": -63385232.0, "logps/chosen": -286.8551940917969, "logps/rejected": -275.434814453125, "loss": 0.3421, "rewards/chosen": 0.15329284965991974, "rewards/margins": 1.5210072547197342, "rewards/rejected": -1.3677144050598145, "step": 1382 }, { "epoch": 0.07330453449235418, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51021733.333333336, "logits/rejected": -22856232.0, "logps/chosen": -370.81982421875, "logps/rejected": -248.32022094726562, "loss": 0.4716, "rewards/chosen": -0.06553624073664348, "rewards/margins": 0.6778372724850973, "rewards/rejected": -0.7433735132217407, "step": 1383 }, { "epoch": 0.07335753849415631, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95164115.2, "logits/rejected": -49796885.333333336, "logps/chosen": -429.77119140625, "logps/rejected": -518.8670247395834, "loss": 0.3193, "rewards/chosen": 0.35657534599304197, "rewards/margins": 2.305509265263875, "rewards/rejected": -1.9489339192708333, "step": 1384 }, { "epoch": 0.07341054249595845, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21839552.0, "logits/rejected": -80955128.0, "logps/chosen": -316.8691101074219, "logps/rejected": -225.43186950683594, "loss": 0.3779, "rewards/chosen": -0.008210567757487297, "rewards/margins": 1.1093172747641802, "rewards/rejected": -1.1175278425216675, "step": 1385 }, { "epoch": 0.07346354649776059, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 74057408.0, "logits/rejected": -29938876.0, "logps/chosen": -305.4822184244792, "logps/rejected": -112.75807189941406, "loss": 0.4436, "rewards/chosen": 0.006257948776086171, "rewards/margins": 0.9559985970457395, "rewards/rejected": -0.9497406482696533, "step": 1386 }, { "epoch": 0.07351655049956271, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38905524.0, "logits/rejected": -12629070.666666666, "logps/chosen": -345.26080322265625, "logps/rejected": -321.13427734375, "loss": 0.2866, "rewards/chosen": 0.02378845028579235, "rewards/margins": 1.464190322284897, "rewards/rejected": -1.4404018719991047, "step": 1387 }, { "epoch": 0.07356955450136485, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23210720.0, "logits/rejected": -42096296.0, "logps/chosen": -336.6979675292969, "logps/rejected": -299.07672119140625, "loss": 0.3562, "rewards/chosen": 0.10061530768871307, "rewards/margins": 1.6003143042325974, "rewards/rejected": -1.4996989965438843, "step": 1388 }, { "epoch": 0.07362255850316698, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11557436.0, "logits/rejected": -13150721.6, "logps/chosen": -222.74971516927084, "logps/rejected": -279.2689453125, "loss": 0.3712, "rewards/chosen": 0.09881210327148438, "rewards/margins": 0.9790139198303223, "rewards/rejected": -0.8802018165588379, "step": 1389 }, { "epoch": 0.07367556250496912, "grad_norm": 77.5, "kl": 0.08460426330566406, "learning_rate": 5e-07, "logits/chosen": -45816976.0, "logits/rejected": -73721664.0, "logps/chosen": -400.257568359375, "logps/rejected": -566.1201782226562, "loss": 0.4348, "rewards/chosen": -0.10282257199287415, "rewards/margins": 1.6638996303081512, "rewards/rejected": -1.7667222023010254, "step": 1390 }, { "epoch": 0.07372856650677126, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103673384.0, "logits/rejected": -57109613.71428572, "logps/chosen": -415.0525817871094, "logps/rejected": -495.44454520089283, "loss": 0.2079, "rewards/chosen": 0.404946893453598, "rewards/margins": 2.15052974649838, "rewards/rejected": -1.7455828530447823, "step": 1391 }, { "epoch": 0.0737815705085734, "grad_norm": 49.5, "kl": 0.3798236846923828, "learning_rate": 5e-07, "logits/chosen": -42216384.0, "logits/rejected": -46958512.0, "logps/chosen": -198.15166015625, "logps/rejected": -555.794189453125, "loss": 0.3535, "rewards/chosen": 0.03578231632709503, "rewards/margins": 2.3167780031760534, "rewards/rejected": -2.2809956868489585, "step": 1392 }, { "epoch": 0.07383457451037553, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46678064.0, "logits/rejected": -11168736.0, "logps/chosen": -377.8534342447917, "logps/rejected": -204.149072265625, "loss": 0.3597, "rewards/chosen": 0.10513611634572347, "rewards/margins": 1.0474400599797566, "rewards/rejected": -0.9423039436340332, "step": 1393 }, { "epoch": 0.07388757851217767, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28400732.8, "logits/rejected": -15699596.0, "logps/chosen": -98.604443359375, "logps/rejected": -282.47943115234375, "loss": 0.3925, "rewards/chosen": -0.1150357484817505, "rewards/margins": 1.607263986269633, "rewards/rejected": -1.7222997347513835, "step": 1394 }, { "epoch": 0.07394058251397981, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -974142.375, "logits/rejected": -43739061.333333336, "logps/chosen": -245.4093017578125, "logps/rejected": -376.8792724609375, "loss": 0.3519, "rewards/chosen": 0.18983496725559235, "rewards/margins": 1.2230485826730728, "rewards/rejected": -1.0332136154174805, "step": 1395 }, { "epoch": 0.07399358651578195, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36073368.0, "logits/rejected": -69627696.0, "logps/chosen": -216.31845092773438, "logps/rejected": -253.12094116210938, "loss": 0.364, "rewards/chosen": 0.26829075813293457, "rewards/margins": 1.2063199281692505, "rewards/rejected": -0.9380291700363159, "step": 1396 }, { "epoch": 0.07404659051758408, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39399877.333333336, "logits/rejected": -26649131.2, "logps/chosen": -355.5885009765625, "logps/rejected": -335.4361328125, "loss": 0.3321, "rewards/chosen": 0.02725626031557719, "rewards/margins": 1.2755454639593762, "rewards/rejected": -1.248289203643799, "step": 1397 }, { "epoch": 0.07409959451938622, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40352147.2, "logits/rejected": -32690880.0, "logps/chosen": -155.88138427734376, "logps/rejected": -182.36865234375, "loss": 0.4241, "rewards/chosen": 0.13078960180282592, "rewards/margins": 0.765159293015798, "rewards/rejected": -0.634369691212972, "step": 1398 }, { "epoch": 0.07415259852118836, "grad_norm": 77.5, "kl": 0.2656211853027344, "learning_rate": 5e-07, "logits/chosen": -42907413.333333336, "logits/rejected": -37255276.0, "logps/chosen": -433.9236246744792, "logps/rejected": -283.58258056640625, "loss": 0.4444, "rewards/chosen": -0.10843376318613689, "rewards/margins": 1.2343275149663289, "rewards/rejected": -1.3427612781524658, "step": 1399 }, { "epoch": 0.07420560252299048, "grad_norm": 88.0, "kl": 0.7655925750732422, "learning_rate": 5e-07, "logits/chosen": -93938954.66666667, "logits/rejected": -9935368.0, "logps/chosen": -1550.7130533854167, "logps/rejected": -176.2328369140625, "loss": 0.2659, "rewards/chosen": 0.8214548428853353, "rewards/margins": 2.046886094411214, "rewards/rejected": -1.225431251525879, "step": 1400 }, { "epoch": 0.07425860652479262, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37901864.0, "logits/rejected": -24514860.0, "logps/chosen": -244.82000732421875, "logps/rejected": -211.52439880371094, "loss": 0.3815, "rewards/chosen": 0.044046975672245026, "rewards/margins": 1.0556924119591713, "rewards/rejected": -1.0116454362869263, "step": 1401 }, { "epoch": 0.07431161052659475, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21355040.0, "logits/rejected": -24647569.6, "logps/chosen": -174.02461751302084, "logps/rejected": -412.1091796875, "loss": 0.2775, "rewards/chosen": 0.3702923854192098, "rewards/margins": 1.9106125911076863, "rewards/rejected": -1.5403202056884766, "step": 1402 }, { "epoch": 0.07436461452839689, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53550310.4, "logits/rejected": -61735008.0, "logps/chosen": -479.47060546875, "logps/rejected": -684.587890625, "loss": 0.3341, "rewards/chosen": 0.19447969198226928, "rewards/margins": 2.164818513393402, "rewards/rejected": -1.9703388214111328, "step": 1403 }, { "epoch": 0.07441761853019903, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40656876.0, "logits/rejected": -15386798.0, "logps/chosen": -475.5862731933594, "logps/rejected": -197.4607391357422, "loss": 0.3851, "rewards/chosen": -0.1186954528093338, "rewards/margins": 1.064754530787468, "rewards/rejected": -1.1834499835968018, "step": 1404 }, { "epoch": 0.07447062253200117, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36857033.6, "logits/rejected": -45155056.0, "logps/chosen": -855.080859375, "logps/rejected": -342.7946370442708, "loss": 0.294, "rewards/chosen": 0.40709333419799804, "rewards/margins": 2.9847906112670897, "rewards/rejected": -2.577697277069092, "step": 1405 }, { "epoch": 0.0745236265338033, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45756048.0, "logits/rejected": -30658361.6, "logps/chosen": -306.3465983072917, "logps/rejected": -412.2779296875, "loss": 0.3497, "rewards/chosen": -0.3591264883677165, "rewards/margins": 1.876259978612264, "rewards/rejected": -2.2353864669799806, "step": 1406 }, { "epoch": 0.07457663053560544, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23337005.333333332, "logits/rejected": -36706424.0, "logps/chosen": -90.09834798177083, "logps/rejected": -240.69204711914062, "loss": 0.433, "rewards/chosen": -0.045662502447764076, "rewards/margins": 1.3988934954007466, "rewards/rejected": -1.4445559978485107, "step": 1407 }, { "epoch": 0.07462963453740758, "grad_norm": 54.0, "kl": 0.7051620483398438, "learning_rate": 5e-07, "logits/chosen": -47200272.0, "logits/rejected": -18830802.666666668, "logps/chosen": -529.8812866210938, "logps/rejected": -223.22635904947916, "loss": 0.3216, "rewards/chosen": 0.47723543643951416, "rewards/margins": 1.515514651934306, "rewards/rejected": -1.0382792154947917, "step": 1408 }, { "epoch": 0.07468263853920971, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59305936.0, "logits/rejected": -14437622.0, "logps/chosen": -453.9273376464844, "logps/rejected": -454.5091247558594, "loss": 0.3835, "rewards/chosen": -0.2605857849121094, "rewards/margins": 1.2578299045562744, "rewards/rejected": -1.5184156894683838, "step": 1409 }, { "epoch": 0.07473564254101185, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22347173.333333332, "logits/rejected": -55180600.0, "logps/chosen": -281.6022542317708, "logps/rejected": -317.7020263671875, "loss": 0.4372, "rewards/chosen": -0.14506861567497253, "rewards/margins": 1.6729977428913116, "rewards/rejected": -1.8180663585662842, "step": 1410 }, { "epoch": 0.07478864654281399, "grad_norm": 61.5, "kl": 1.0107650756835938, "learning_rate": 5e-07, "logits/chosen": -37587472.0, "logits/rejected": -56700100.0, "logps/chosen": -432.886474609375, "logps/rejected": -186.0970458984375, "loss": 0.3643, "rewards/chosen": 0.2632026672363281, "rewards/margins": 1.2214359045028687, "rewards/rejected": -0.9582332372665405, "step": 1411 }, { "epoch": 0.07484165054461611, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42300109.333333336, "logits/rejected": -27008755.2, "logps/chosen": -429.0423990885417, "logps/rejected": -341.09853515625, "loss": 0.2802, "rewards/chosen": 0.4374524752298991, "rewards/margins": 1.8235239664713543, "rewards/rejected": -1.3860714912414551, "step": 1412 }, { "epoch": 0.07489465454641825, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26733314.285714287, "logits/rejected": -8746977.0, "logps/chosen": -214.41629464285714, "logps/rejected": -74.43075561523438, "loss": 0.4466, "rewards/chosen": 0.09285306079047066, "rewards/margins": 1.2891135130609785, "rewards/rejected": -1.1962604522705078, "step": 1413 }, { "epoch": 0.07494765854822039, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6325237.6, "logits/rejected": -17603600.0, "logps/chosen": -93.53731079101563, "logps/rejected": -240.0202840169271, "loss": 0.3537, "rewards/chosen": 0.11862038373947144, "rewards/margins": 2.115711518128713, "rewards/rejected": -1.9970911343892415, "step": 1414 }, { "epoch": 0.07500066255002252, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6299090.4, "logits/rejected": -34544432.0, "logps/chosen": -365.329443359375, "logps/rejected": -497.8909912109375, "loss": 0.4071, "rewards/chosen": -0.0588458240032196, "rewards/margins": 1.410176767905553, "rewards/rejected": -1.4690225919087727, "step": 1415 }, { "epoch": 0.07505366655182466, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55515008.0, "logits/rejected": 2080234.8, "logps/chosen": -200.94793701171875, "logps/rejected": -521.00634765625, "loss": 0.3166, "rewards/chosen": -0.15604809919993082, "rewards/margins": 1.4591221888860066, "rewards/rejected": -1.6151702880859375, "step": 1416 }, { "epoch": 0.0751066705536268, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27441126.0, "logits/rejected": -20098226.666666668, "logps/chosen": -353.9501953125, "logps/rejected": -225.51131184895834, "loss": 0.3534, "rewards/chosen": 0.03103943169116974, "rewards/margins": 0.8938723752895991, "rewards/rejected": -0.8628329435984293, "step": 1417 }, { "epoch": 0.07515967455542893, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7790457.6, "logits/rejected": -17335428.0, "logps/chosen": -227.465087890625, "logps/rejected": -126.5165303548177, "loss": 0.4243, "rewards/chosen": -0.016836968064308167, "rewards/margins": 0.9552297522624335, "rewards/rejected": -0.9720667203267416, "step": 1418 }, { "epoch": 0.07521267855723107, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14093756.8, "logits/rejected": -65516981.333333336, "logps/chosen": -289.9348876953125, "logps/rejected": -358.6671956380208, "loss": 0.4037, "rewards/chosen": -0.04912933111190796, "rewards/margins": 1.3555933594703675, "rewards/rejected": -1.4047226905822754, "step": 1419 }, { "epoch": 0.07526568255903321, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38476003.2, "logits/rejected": -27068698.666666668, "logps/chosen": -322.8253662109375, "logps/rejected": -249.27022298177084, "loss": 0.364, "rewards/chosen": 0.3386551380157471, "rewards/margins": 1.5840716997782391, "rewards/rejected": -1.245416561762492, "step": 1420 }, { "epoch": 0.07531868656083535, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5244604.0, "logits/rejected": -50299852.8, "logps/chosen": -175.90372721354166, "logps/rejected": -423.516259765625, "loss": 0.3318, "rewards/chosen": -0.10285135110219319, "rewards/margins": 1.2242955605189005, "rewards/rejected": -1.3271469116210937, "step": 1421 }, { "epoch": 0.07537169056263748, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67240480.0, "logits/rejected": -16394668.8, "logps/chosen": -434.432373046875, "logps/rejected": -225.6517333984375, "loss": 0.3574, "rewards/chosen": 0.3044823408126831, "rewards/margins": 1.2018193006515503, "rewards/rejected": -0.8973369598388672, "step": 1422 }, { "epoch": 0.07542469456443962, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18051643.42857143, "logits/rejected": 2809927.0, "logps/chosen": -155.150390625, "logps/rejected": -36.71686553955078, "loss": 0.5112, "rewards/chosen": -0.19899346147264754, "rewards/margins": 0.8537278005055019, "rewards/rejected": -1.0527212619781494, "step": 1423 }, { "epoch": 0.07547769856624176, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30349747.2, "logits/rejected": -6755495.333333333, "logps/chosen": -279.703466796875, "logps/rejected": -181.9958699544271, "loss": 0.4232, "rewards/chosen": 0.194583523273468, "rewards/margins": 0.715610651175181, "rewards/rejected": -0.521027127901713, "step": 1424 }, { "epoch": 0.07553070256804388, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7546668.5, "logits/rejected": -20148714.0, "logps/chosen": -207.77359008789062, "logps/rejected": -258.3262939453125, "loss": 0.3935, "rewards/chosen": 0.045522548258304596, "rewards/margins": 0.9935393705964088, "rewards/rejected": -0.9480168223381042, "step": 1425 }, { "epoch": 0.07558370656984602, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8952106.0, "logits/rejected": -16707822.666666666, "logps/chosen": -262.415771484375, "logps/rejected": -183.93216959635416, "loss": 0.3407, "rewards/chosen": 0.5258692502975464, "rewards/margins": 1.2464441061019897, "rewards/rejected": -0.7205748558044434, "step": 1426 }, { "epoch": 0.07563671057164816, "grad_norm": 48.75, "kl": 0.03661346435546875, "learning_rate": 5e-07, "logits/chosen": -31198877.333333332, "logits/rejected": -14912608.0, "logps/chosen": -262.4367268880208, "logps/rejected": -202.5276336669922, "loss": 0.4023, "rewards/chosen": 0.21088045835494995, "rewards/margins": 1.4212365746498108, "rewards/rejected": -1.2103561162948608, "step": 1427 }, { "epoch": 0.07568971457345029, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13572641.0, "logits/rejected": -37148522.666666664, "logps/chosen": -203.84605407714844, "logps/rejected": -305.8000081380208, "loss": 0.3101, "rewards/chosen": 0.3585205078125, "rewards/margins": 1.3804561297098796, "rewards/rejected": -1.0219356218973796, "step": 1428 }, { "epoch": 0.07574271857525243, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13017637.333333334, "logits/rejected": -48307340.8, "logps/chosen": -237.010498046875, "logps/rejected": -361.173583984375, "loss": 0.2933, "rewards/chosen": 0.322917381922404, "rewards/margins": 1.778042523066203, "rewards/rejected": -1.4551251411437989, "step": 1429 }, { "epoch": 0.07579572257705457, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7703730.0, "logits/rejected": -38878544.0, "logps/chosen": -59.278526306152344, "logps/rejected": -338.1411946614583, "loss": 0.239, "rewards/chosen": 0.2193916290998459, "rewards/margins": 1.9515293091535568, "rewards/rejected": -1.732137680053711, "step": 1430 }, { "epoch": 0.0758487265788567, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28500704.0, "logits/rejected": -15038092.0, "logps/chosen": -282.0927429199219, "logps/rejected": -227.92303466796875, "loss": 0.3941, "rewards/chosen": -0.2888075113296509, "rewards/margins": 1.1973332166671753, "rewards/rejected": -1.4861407279968262, "step": 1431 }, { "epoch": 0.07590173058065884, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28669344.0, "logits/rejected": -21620050.0, "logps/chosen": -200.79452514648438, "logps/rejected": -679.2413940429688, "loss": 0.3467, "rewards/chosen": 0.17276111245155334, "rewards/margins": 2.0926217138767242, "rewards/rejected": -1.919860601425171, "step": 1432 }, { "epoch": 0.07595473458246098, "grad_norm": 60.0, "kl": 0.15429306030273438, "learning_rate": 5e-07, "logits/chosen": -46482304.0, "logits/rejected": -24753754.666666668, "logps/chosen": -338.6305419921875, "logps/rejected": -152.07855224609375, "loss": 0.4318, "rewards/chosen": -0.10750885009765625, "rewards/margins": 1.0404471238454183, "rewards/rejected": -1.1479559739430745, "step": 1433 }, { "epoch": 0.07600773858426312, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5173916.5, "logits/rejected": -18562760.0, "logps/chosen": -267.50262451171875, "logps/rejected": -147.98733520507812, "loss": 0.352, "rewards/chosen": 0.2816336154937744, "rewards/margins": 1.3173032999038696, "rewards/rejected": -1.0356696844100952, "step": 1434 }, { "epoch": 0.07606074258606525, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27432973.333333332, "logits/rejected": -55790484.0, "logps/chosen": -240.12483723958334, "logps/rejected": -294.6312255859375, "loss": 0.4448, "rewards/chosen": -0.006220688422520955, "rewards/margins": 1.4236282209555309, "rewards/rejected": -1.4298489093780518, "step": 1435 }, { "epoch": 0.07611374658786739, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33727088.0, "logits/rejected": -52241988.0, "logps/chosen": -257.50848388671875, "logps/rejected": -309.43499755859375, "loss": 0.3745, "rewards/chosen": 0.17310714721679688, "rewards/margins": 1.1149022579193115, "rewards/rejected": -0.9417951107025146, "step": 1436 }, { "epoch": 0.07616675058966953, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1612830.75, "logits/rejected": -18988814.666666668, "logps/chosen": -203.76744079589844, "logps/rejected": -274.1459147135417, "loss": 0.2905, "rewards/chosen": 0.12747114896774292, "rewards/margins": 1.7326504588127136, "rewards/rejected": -1.6051793098449707, "step": 1437 }, { "epoch": 0.07621975459147165, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -127097994.66666667, "logits/rejected": -21373792.0, "logps/chosen": -361.7992350260417, "logps/rejected": -377.6794189453125, "loss": 0.2548, "rewards/chosen": 0.3875495990117391, "rewards/margins": 2.035019691785177, "rewards/rejected": -1.6474700927734376, "step": 1438 }, { "epoch": 0.07627275859327379, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16955344.0, "logits/rejected": -17160130.666666668, "logps/chosen": -290.1844970703125, "logps/rejected": -115.70131429036458, "loss": 0.3918, "rewards/chosen": 0.29460678100585935, "rewards/margins": 1.0444995721181234, "rewards/rejected": -0.749892791112264, "step": 1439 }, { "epoch": 0.07632576259507592, "grad_norm": 58.5, "kl": 0.2489166259765625, "learning_rate": 5e-07, "logits/chosen": -32361886.0, "logits/rejected": -54306232.0, "logps/chosen": -318.5247802734375, "logps/rejected": -698.121826171875, "loss": 0.2931, "rewards/chosen": 0.19871646165847778, "rewards/margins": 2.456667959690094, "rewards/rejected": -2.257951498031616, "step": 1440 }, { "epoch": 0.07637876659687806, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48197672.0, "logits/rejected": -28457010.666666668, "logps/chosen": -451.96636962890625, "logps/rejected": -229.6479695638021, "loss": 0.2907, "rewards/chosen": 0.02580413967370987, "rewards/margins": 1.3819128051400185, "rewards/rejected": -1.3561086654663086, "step": 1441 }, { "epoch": 0.0764317705986802, "grad_norm": 61.25, "kl": 0.6211013793945312, "learning_rate": 5e-07, "logits/chosen": -58949286.4, "logits/rejected": -25222048.0, "logps/chosen": -337.7691650390625, "logps/rejected": -398.9319254557292, "loss": 0.3639, "rewards/chosen": 0.33654968738555907, "rewards/margins": 1.658773668607076, "rewards/rejected": -1.3222239812215169, "step": 1442 }, { "epoch": 0.07648477460048234, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30877020.0, "logits/rejected": -55663541.333333336, "logps/chosen": -369.25177001953125, "logps/rejected": -332.5263264973958, "loss": 0.2464, "rewards/chosen": 0.16880150139331818, "rewards/margins": 1.9923504541317623, "rewards/rejected": -1.823548952738444, "step": 1443 }, { "epoch": 0.07653777860228447, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46714341.333333336, "logits/rejected": -85317779.2, "logps/chosen": -161.1270955403646, "logps/rejected": -596.99580078125, "loss": 0.2455, "rewards/chosen": 0.3244070808092753, "rewards/margins": 2.7249748984972633, "rewards/rejected": -2.400567817687988, "step": 1444 }, { "epoch": 0.07659078260408661, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5082485.6, "logits/rejected": -8326922.666666667, "logps/chosen": -137.813037109375, "logps/rejected": -187.74637858072916, "loss": 0.3732, "rewards/chosen": 0.030935174226760863, "rewards/margins": 1.6181458294391633, "rewards/rejected": -1.5872106552124023, "step": 1445 }, { "epoch": 0.07664378660588875, "grad_norm": 61.0, "kl": 0.01584625244140625, "learning_rate": 5e-07, "logits/chosen": -51118496.0, "logits/rejected": -4843836.0, "logps/chosen": -697.4822998046875, "logps/rejected": -381.287353515625, "loss": 0.3145, "rewards/chosen": 0.48557665944099426, "rewards/margins": 1.8874683678150177, "rewards/rejected": -1.4018917083740234, "step": 1446 }, { "epoch": 0.07669679060769088, "grad_norm": 47.0, "kl": 0.001430511474609375, "learning_rate": 5e-07, "logits/chosen": -42997276.0, "logits/rejected": -32213578.0, "logps/chosen": -312.4885559082031, "logps/rejected": -306.8868408203125, "loss": 0.3211, "rewards/chosen": 0.41122323274612427, "rewards/margins": 2.0121914744377136, "rewards/rejected": -1.6009682416915894, "step": 1447 }, { "epoch": 0.07674979460949302, "grad_norm": 61.75, "kl": 0.1343231201171875, "learning_rate": 5e-07, "logits/chosen": -83210408.0, "logits/rejected": -5287790.0, "logps/chosen": -371.60516357421875, "logps/rejected": -144.94406127929688, "loss": 0.3758, "rewards/chosen": 0.11082153022289276, "rewards/margins": 1.1715640276670456, "rewards/rejected": -1.0607424974441528, "step": 1448 }, { "epoch": 0.07680279861129516, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83015616.0, "logits/rejected": -61387466.666666664, "logps/chosen": -333.2694091796875, "logps/rejected": -277.5797119140625, "loss": 0.2809, "rewards/chosen": 0.033738717436790466, "rewards/margins": 1.5511947721242905, "rewards/rejected": -1.5174560546875, "step": 1449 }, { "epoch": 0.07685580261309728, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51940320.0, "logits/rejected": -6618486.0, "logps/chosen": -466.5857340494792, "logps/rejected": -301.24176025390625, "loss": 0.3498, "rewards/chosen": 0.4348948796590169, "rewards/margins": 2.1361704667409263, "rewards/rejected": -1.7012755870819092, "step": 1450 }, { "epoch": 0.07690880661489942, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2667944.4, "logits/rejected": -26553226.666666668, "logps/chosen": -143.2069091796875, "logps/rejected": -488.1368815104167, "loss": 0.3687, "rewards/chosen": 0.12460452318191528, "rewards/margins": 1.5994005004564922, "rewards/rejected": -1.474795977274577, "step": 1451 }, { "epoch": 0.07696181061670156, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20297957.333333332, "logits/rejected": -34210956.8, "logps/chosen": -196.45357259114584, "logps/rejected": -303.73876953125, "loss": 0.3532, "rewards/chosen": -0.03856086482604345, "rewards/margins": 1.0979450250665348, "rewards/rejected": -1.1365058898925782, "step": 1452 }, { "epoch": 0.0770148146185037, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2780803.5, "logits/rejected": -47552949.333333336, "logps/chosen": -37.976898193359375, "logps/rejected": -332.21360270182294, "loss": 0.3041, "rewards/chosen": -0.4333420991897583, "rewards/margins": 0.961267352104187, "rewards/rejected": -1.3946094512939453, "step": 1453 }, { "epoch": 0.07706781862030583, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24947308.0, "logits/rejected": -3884578.5, "logps/chosen": -229.5204315185547, "logps/rejected": -209.6848602294922, "loss": 0.3978, "rewards/chosen": -0.08630065619945526, "rewards/margins": 1.0299890786409378, "rewards/rejected": -1.116289734840393, "step": 1454 }, { "epoch": 0.07712082262210797, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3606564.6666666665, "logits/rejected": -22397027.2, "logps/chosen": -165.1344197591146, "logps/rejected": -395.020751953125, "loss": 0.2935, "rewards/chosen": -0.14339929819107056, "rewards/margins": 1.7788779616355896, "rewards/rejected": -1.9222772598266602, "step": 1455 }, { "epoch": 0.0771738266239101, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 715223.8125, "logits/rejected": -7270200.0, "logps/chosen": -54.09185791015625, "logps/rejected": -149.4591064453125, "loss": 0.3296, "rewards/chosen": -0.23875732719898224, "rewards/margins": 0.8856172213951747, "rewards/rejected": -1.124374548594157, "step": 1456 }, { "epoch": 0.07722683062571224, "grad_norm": 47.0, "kl": 0.1559734344482422, "learning_rate": 5e-07, "logits/chosen": -16122069.0, "logits/rejected": -35338840.0, "logps/chosen": -170.62562561035156, "logps/rejected": -448.65478515625, "loss": 0.3382, "rewards/chosen": 0.04778657108545303, "rewards/margins": 1.6104255691170692, "rewards/rejected": -1.5626389980316162, "step": 1457 }, { "epoch": 0.07727983462751438, "grad_norm": 57.0, "kl": 0.1690521240234375, "learning_rate": 5e-07, "logits/chosen": -1346395.4285714286, "logits/rejected": -64414224.0, "logps/chosen": -223.80950055803572, "logps/rejected": -398.2519226074219, "loss": 0.4232, "rewards/chosen": 0.1528514793940953, "rewards/margins": 2.1750713757106235, "rewards/rejected": -2.0222198963165283, "step": 1458 }, { "epoch": 0.07733283862931652, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29358360.0, "logits/rejected": -8455684.0, "logps/chosen": -212.32083129882812, "logps/rejected": -186.25320434570312, "loss": 0.4387, "rewards/chosen": -0.1097206175327301, "rewards/margins": 0.6736684143543243, "rewards/rejected": -0.7833890318870544, "step": 1459 }, { "epoch": 0.07738584263111865, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19523586.666666668, "logits/rejected": -8013985.6, "logps/chosen": -277.6046956380208, "logps/rejected": -179.2090087890625, "loss": 0.3467, "rewards/chosen": 0.1213872234026591, "rewards/margins": 1.1546016017595928, "rewards/rejected": -1.0332143783569336, "step": 1460 }, { "epoch": 0.07743884663292079, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30417406.0, "logits/rejected": -14836609.0, "logps/chosen": -206.05862426757812, "logps/rejected": -171.00755310058594, "loss": 0.4088, "rewards/chosen": -0.08662158250808716, "rewards/margins": 0.8064345717430115, "rewards/rejected": -0.8930561542510986, "step": 1461 }, { "epoch": 0.07749185063472293, "grad_norm": 38.5, "kl": 0.19389677047729492, "learning_rate": 5e-07, "logits/chosen": -4631770.0, "logits/rejected": -7843413.333333333, "logps/chosen": -163.5777099609375, "logps/rejected": -211.7065633138021, "loss": 0.3775, "rewards/chosen": 0.18205811977386474, "rewards/margins": 1.5008495887120563, "rewards/rejected": -1.3187914689381917, "step": 1462 }, { "epoch": 0.07754485463652505, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15752660.57142857, "logits/rejected": -49187704.0, "logps/chosen": -328.4892578125, "logps/rejected": -574.6776733398438, "loss": 0.4204, "rewards/chosen": 0.09459184748785836, "rewards/margins": 3.6929621611322676, "rewards/rejected": -3.598370313644409, "step": 1463 }, { "epoch": 0.07759785863832719, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30075394.666666668, "logits/rejected": -13714798.4, "logps/chosen": -192.01810709635416, "logps/rejected": -348.99599609375, "loss": 0.2949, "rewards/chosen": 0.25512848297754925, "rewards/margins": 1.8335580865542094, "rewards/rejected": -1.57842960357666, "step": 1464 }, { "epoch": 0.07765086264012933, "grad_norm": 49.25, "kl": 0.08334922790527344, "learning_rate": 5e-07, "logits/chosen": -26286451.2, "logits/rejected": 33833720.0, "logps/chosen": -119.184375, "logps/rejected": -565.3170572916666, "loss": 0.3737, "rewards/chosen": -0.060970115661621097, "rewards/margins": 1.845438512166341, "rewards/rejected": -1.9064086278279622, "step": 1465 }, { "epoch": 0.07770386664193146, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3400779.5, "logits/rejected": -14300957.714285715, "logps/chosen": -61.05524444580078, "logps/rejected": -400.70040457589283, "loss": 0.2042, "rewards/chosen": -0.220428466796875, "rewards/margins": 1.6073875427246094, "rewards/rejected": -1.8278160095214844, "step": 1466 }, { "epoch": 0.0777568706437336, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30524770.0, "logits/rejected": -30945204.0, "logps/chosen": -457.0281982421875, "logps/rejected": -234.9890899658203, "loss": 0.3658, "rewards/chosen": -0.05964374542236328, "rewards/margins": 1.3084410429000854, "rewards/rejected": -1.3680847883224487, "step": 1467 }, { "epoch": 0.07780987464553574, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31763218.0, "logits/rejected": -7575350.5, "logps/chosen": -346.53271484375, "logps/rejected": -197.80162048339844, "loss": 0.3474, "rewards/chosen": 0.10280227661132812, "rewards/margins": 1.5891942977905273, "rewards/rejected": -1.4863920211791992, "step": 1468 }, { "epoch": 0.07786287864733787, "grad_norm": 79.0, "kl": 0.48163604736328125, "learning_rate": 5e-07, "logits/chosen": -50520966.4, "logits/rejected": -13527305.333333334, "logps/chosen": -784.8322265625, "logps/rejected": -307.75091552734375, "loss": 0.3704, "rewards/chosen": 0.24279022216796875, "rewards/margins": 1.7189955711364746, "rewards/rejected": -1.4762053489685059, "step": 1469 }, { "epoch": 0.07791588264914001, "grad_norm": 56.5, "kl": 1.3743743896484375, "learning_rate": 5e-07, "logits/chosen": -13340587.0, "logits/rejected": -5409035.5, "logps/chosen": -388.2838134765625, "logps/rejected": -188.44064331054688, "loss": 0.3883, "rewards/chosen": 0.28443241119384766, "rewards/margins": 1.3295466899871826, "rewards/rejected": -1.045114278793335, "step": 1470 }, { "epoch": 0.07796888665094215, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33264546.666666668, "logits/rejected": -68555059.2, "logps/chosen": -426.6678059895833, "logps/rejected": -278.256640625, "loss": 0.336, "rewards/chosen": 0.14616801341374716, "rewards/margins": 1.2890107830365498, "rewards/rejected": -1.1428427696228027, "step": 1471 }, { "epoch": 0.07802189065274429, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34091901.333333336, "logits/rejected": -9226105.6, "logps/chosen": -118.57871500651042, "logps/rejected": -408.935302734375, "loss": 0.3473, "rewards/chosen": -0.30242005983988446, "rewards/margins": 1.6307693322499592, "rewards/rejected": -1.9331893920898438, "step": 1472 }, { "epoch": 0.07807489465454642, "grad_norm": 55.5, "kl": 0.34517669677734375, "learning_rate": 5e-07, "logits/chosen": -49253492.0, "logits/rejected": -33889664.0, "logps/chosen": -311.1181335449219, "logps/rejected": -515.3275146484375, "loss": 0.35, "rewards/chosen": -0.09599561989307404, "rewards/margins": 1.63661627471447, "rewards/rejected": -1.732611894607544, "step": 1473 }, { "epoch": 0.07812789865634856, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24136656.0, "logits/rejected": -24177537.6, "logps/chosen": -231.9532267252604, "logps/rejected": -215.9359619140625, "loss": 0.273, "rewards/chosen": 0.25302175680796307, "rewards/margins": 1.8469455162684123, "rewards/rejected": -1.5939237594604492, "step": 1474 }, { "epoch": 0.0781809026581507, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19710502.4, "logits/rejected": -36457605.333333336, "logps/chosen": -294.799609375, "logps/rejected": -294.91943359375, "loss": 0.3573, "rewards/chosen": 0.28438620567321776, "rewards/margins": 1.5765485445658367, "rewards/rejected": -1.292162338892619, "step": 1475 }, { "epoch": 0.07823390665995282, "grad_norm": 81.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23402245.333333332, "logits/rejected": -50543984.0, "logps/chosen": -414.7926839192708, "logps/rejected": -221.0297088623047, "loss": 0.4419, "rewards/chosen": -0.10000215967496236, "rewards/margins": 1.4928414920965831, "rewards/rejected": -1.5928436517715454, "step": 1476 }, { "epoch": 0.07828691066175496, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15722496.0, "logits/rejected": -25620930.0, "logps/chosen": -390.5736083984375, "logps/rejected": -93.82290649414062, "loss": 0.3926, "rewards/chosen": 0.01715412177145481, "rewards/margins": 1.0581999067217112, "rewards/rejected": -1.0410457849502563, "step": 1477 }, { "epoch": 0.0783399146635571, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4865718.5, "logits/rejected": -38612616.0, "logps/chosen": -163.13235473632812, "logps/rejected": -397.1629943847656, "loss": 0.3518, "rewards/chosen": 0.2117893397808075, "rewards/margins": 2.0693832337856293, "rewards/rejected": -1.8575938940048218, "step": 1478 }, { "epoch": 0.07839291866535923, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4622347.6, "logits/rejected": -31147498.666666668, "logps/chosen": -178.5265380859375, "logps/rejected": -390.8165690104167, "loss": 0.3416, "rewards/chosen": 0.10053186416625977, "rewards/margins": 2.125047492980957, "rewards/rejected": -2.0245156288146973, "step": 1479 }, { "epoch": 0.07844592266716137, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41363800.0, "logits/rejected": -20507292.8, "logps/chosen": -317.16811116536456, "logps/rejected": -176.36165771484374, "loss": 0.3457, "rewards/chosen": 0.02743638555208842, "rewards/margins": 1.1704335550467173, "rewards/rejected": -1.1429971694946288, "step": 1480 }, { "epoch": 0.0784989266689635, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15940273.6, "logits/rejected": -24401024.0, "logps/chosen": -197.80596923828125, "logps/rejected": -191.4515584309896, "loss": 0.4678, "rewards/chosen": -0.17849122285842894, "rewards/margins": 0.5026659687360129, "rewards/rejected": -0.6811571915944418, "step": 1481 }, { "epoch": 0.07855193067076564, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29409214.0, "logits/rejected": -21920608.0, "logps/chosen": -281.49053955078125, "logps/rejected": -211.69879150390625, "loss": 0.367, "rewards/chosen": 0.25410062074661255, "rewards/margins": 1.1752610802650452, "rewards/rejected": -0.9211604595184326, "step": 1482 }, { "epoch": 0.07860493467256778, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58853952.0, "logits/rejected": -25863808.0, "logps/chosen": -351.8440348307292, "logps/rejected": -143.197119140625, "loss": 0.3444, "rewards/chosen": 0.372333288192749, "rewards/margins": 1.2421278476715087, "rewards/rejected": -0.8697945594787597, "step": 1483 }, { "epoch": 0.07865793867436992, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52752144.0, "logits/rejected": -21094772.0, "logps/chosen": -465.7738342285156, "logps/rejected": -218.599609375, "loss": 0.3054, "rewards/chosen": 0.1732589602470398, "rewards/margins": 1.3311652541160583, "rewards/rejected": -1.1579062938690186, "step": 1484 }, { "epoch": 0.07871094267617205, "grad_norm": 55.25, "kl": 0.24628829956054688, "learning_rate": 5e-07, "logits/chosen": -25001698.666666668, "logits/rejected": -21409276.0, "logps/chosen": -452.4143880208333, "logps/rejected": -194.3971405029297, "loss": 0.3342, "rewards/chosen": 0.4661645491917928, "rewards/margins": 2.389554818471273, "rewards/rejected": -1.92339026927948, "step": 1485 }, { "epoch": 0.07876394667797419, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2948124.0, "logits/rejected": -24602246.4, "logps/chosen": -40.07647196451823, "logps/rejected": -235.03603515625, "loss": 0.3464, "rewards/chosen": -0.04297746221224467, "rewards/margins": 1.1986017843087513, "rewards/rejected": -1.241579246520996, "step": 1486 }, { "epoch": 0.07881695067977633, "grad_norm": 72.5, "kl": 1.0258216857910156, "learning_rate": 5e-07, "logits/chosen": -36050740.0, "logps/chosen": -351.6792297363281, "loss": 0.527, "rewards/chosen": -0.00904989242553711, "step": 1487 }, { "epoch": 0.07886995468157845, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10527729.0, "logits/rejected": -16820018.285714287, "logps/chosen": -84.84969329833984, "logps/rejected": -223.88882882254464, "loss": 0.33, "rewards/chosen": -0.15264587104320526, "rewards/margins": 0.8039829965148654, "rewards/rejected": -0.9566288675580706, "step": 1488 }, { "epoch": 0.07892295868338059, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29785254.0, "logits/rejected": 7449249.0, "logps/chosen": -330.06085205078125, "logps/rejected": -194.2536163330078, "loss": 0.3487, "rewards/chosen": 0.21817374229431152, "rewards/margins": 1.3894304037094116, "rewards/rejected": -1.1712566614151, "step": 1489 }, { "epoch": 0.07897596268518273, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65540144.0, "logits/rejected": -24422452.0, "logps/chosen": -400.23016357421875, "logps/rejected": -226.07395935058594, "loss": 0.3673, "rewards/chosen": 0.025678250938653946, "rewards/margins": 1.345709417015314, "rewards/rejected": -1.3200311660766602, "step": 1490 }, { "epoch": 0.07902896668698486, "grad_norm": 70.0, "kl": 0.3621196746826172, "learning_rate": 5e-07, "logits/chosen": -34642325.333333336, "logits/rejected": -7513026.5, "logps/chosen": -312.6100260416667, "logps/rejected": -337.44256591796875, "loss": 0.4352, "rewards/chosen": -0.057332451144854225, "rewards/margins": 1.5819605042537053, "rewards/rejected": -1.6392929553985596, "step": 1491 }, { "epoch": 0.079081970688787, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 33632106.666666664, "logits/rejected": -58892166.4, "logps/chosen": -135.715087890625, "logps/rejected": -452.0974609375, "loss": 0.3272, "rewards/chosen": -0.03971118976672491, "rewards/margins": 1.4857149119178454, "rewards/rejected": -1.5254261016845703, "step": 1492 }, { "epoch": 0.07913497469058914, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45636480.0, "logits/rejected": -56513946.666666664, "logps/chosen": -374.71807861328125, "logps/rejected": -296.2262776692708, "loss": 0.3334, "rewards/chosen": -0.010536186397075653, "rewards/margins": 1.0156660949190457, "rewards/rejected": -1.0262022813161213, "step": 1493 }, { "epoch": 0.07918797869239128, "grad_norm": 53.25, "kl": 0.1515655517578125, "learning_rate": 5e-07, "logits/chosen": -3388006.0, "logits/rejected": -32809242.666666668, "logps/chosen": -247.885546875, "logps/rejected": -138.18709309895834, "loss": 0.4279, "rewards/chosen": 0.17822227478027344, "rewards/margins": 0.7574230194091797, "rewards/rejected": -0.5792007446289062, "step": 1494 }, { "epoch": 0.07924098269419341, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24320768.0, "logits/rejected": -11080648.0, "logps/chosen": -382.0382995605469, "logps/rejected": -250.33580017089844, "loss": 0.3337, "rewards/chosen": 0.12677417695522308, "rewards/margins": 1.6772833913564682, "rewards/rejected": -1.5505092144012451, "step": 1495 }, { "epoch": 0.07929398669599555, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42243096.0, "logits/rejected": -30523196.0, "logps/chosen": -417.9402669270833, "logps/rejected": -389.75244140625, "loss": 0.4094, "rewards/chosen": 0.12021242578824361, "rewards/margins": 1.350374569495519, "rewards/rejected": -1.2301621437072754, "step": 1496 }, { "epoch": 0.07934699069779769, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76843176.0, "logits/rejected": -33565572.0, "logps/chosen": -304.80682373046875, "logps/rejected": -220.43478393554688, "loss": 0.3951, "rewards/chosen": -0.1268250048160553, "rewards/margins": 1.2271208465099335, "rewards/rejected": -1.3539458513259888, "step": 1497 }, { "epoch": 0.07939999469959982, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32189052.8, "logits/rejected": -25870296.0, "logps/chosen": -542.473779296875, "logps/rejected": -270.78847249348956, "loss": 0.426, "rewards/chosen": -0.2578349351882935, "rewards/margins": 1.4127100070317584, "rewards/rejected": -1.670544942220052, "step": 1498 }, { "epoch": 0.07945299870140196, "grad_norm": 74.0, "kl": 0.10534286499023438, "learning_rate": 5e-07, "logits/chosen": -37859184.0, "logits/rejected": -17969433.333333332, "logps/chosen": -398.85078125, "logps/rejected": -184.5630086263021, "loss": 0.4313, "rewards/chosen": -0.07803502082824706, "rewards/margins": 0.8751465638478597, "rewards/rejected": -0.9531815846761068, "step": 1499 }, { "epoch": 0.0795060027032041, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49093816.0, "logits/rejected": -29463582.0, "logps/chosen": -357.1445007324219, "logps/rejected": -121.44242858886719, "loss": 0.3532, "rewards/chosen": 0.04974756017327309, "rewards/margins": 1.3922971226274967, "rewards/rejected": -1.3425495624542236, "step": 1500 }, { "epoch": 0.07955900670500622, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -102708432.0, "logits/rejected": -38196272.0, "logps/chosen": -382.71478271484375, "logps/rejected": -360.9541015625, "loss": 0.3482, "rewards/chosen": 0.2568935453891754, "rewards/margins": 1.5116406977176666, "rewards/rejected": -1.2547471523284912, "step": 1501 }, { "epoch": 0.07961201070680836, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11823434.0, "logits/rejected": -22807064.0, "logps/chosen": -287.1029052734375, "logps/rejected": -450.53955078125, "loss": 0.2598, "rewards/chosen": 0.44123637676239014, "rewards/margins": 2.417789101600647, "rewards/rejected": -1.9765527248382568, "step": 1502 }, { "epoch": 0.0796650147086105, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4947036.8, "logits/rejected": -3669923.3333333335, "logps/chosen": -369.0297607421875, "logps/rejected": -424.6123046875, "loss": 0.3594, "rewards/chosen": 0.15172417163848878, "rewards/margins": 1.887711517016093, "rewards/rejected": -1.7359873453776042, "step": 1503 }, { "epoch": 0.07971801871041263, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71438090.66666667, "logits/rejected": 3514337.75, "logps/chosen": -282.08774820963544, "logps/rejected": -47.13148498535156, "loss": 0.5057, "rewards/chosen": -0.14839007457097372, "rewards/margins": 0.18813050786654154, "rewards/rejected": -0.33652058243751526, "step": 1504 }, { "epoch": 0.07977102271221477, "grad_norm": 69.0, "kl": 0.2983217239379883, "learning_rate": 5e-07, "logits/chosen": -21825956.0, "logits/rejected": -23298884.0, "logps/chosen": -566.517578125, "logps/rejected": -373.5247802734375, "loss": 0.3259, "rewards/chosen": 0.5771989425023397, "rewards/margins": 2.2450279792149863, "rewards/rejected": -1.6678290367126465, "step": 1505 }, { "epoch": 0.07982402671401691, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1201773.5833333333, "logits/rejected": -33217612.8, "logps/chosen": -263.52126057942706, "logps/rejected": -147.72703857421874, "loss": 0.3712, "rewards/chosen": 0.006580543393890063, "rewards/margins": 0.9095869063089291, "rewards/rejected": -0.9030063629150391, "step": 1506 }, { "epoch": 0.07987703071581904, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18229792.0, "logits/rejected": -24484740.0, "logps/chosen": -357.08441162109375, "logps/rejected": -230.77920532226562, "loss": 0.3031, "rewards/chosen": 0.37300243973731995, "rewards/margins": 1.8426012098789215, "rewards/rejected": -1.4695987701416016, "step": 1507 }, { "epoch": 0.07993003471762118, "grad_norm": 44.0, "kl": 0.08100128173828125, "learning_rate": 5e-07, "logits/chosen": -18467910.0, "logits/rejected": -6676611.5, "logps/chosen": -155.4207305908203, "logps/rejected": -148.10739135742188, "loss": 0.4233, "rewards/chosen": -0.025652695447206497, "rewards/margins": 0.6753397695720196, "rewards/rejected": -0.7009924650192261, "step": 1508 }, { "epoch": 0.07998303871942332, "grad_norm": 50.25, "kl": 0.07138442993164062, "learning_rate": 5e-07, "logits/chosen": -5669691.0, "logits/rejected": -37182400.0, "logps/chosen": -173.7967071533203, "logps/rejected": -501.97943115234375, "loss": 0.3165, "rewards/chosen": 0.1623116135597229, "rewards/margins": 1.8239814639091492, "rewards/rejected": -1.6616698503494263, "step": 1509 }, { "epoch": 0.08003604272122546, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73708272.0, "logits/rejected": -29050704.0, "logps/chosen": -138.640869140625, "logps/rejected": -277.1941441127232, "loss": 0.2997, "rewards/chosen": -0.3500427305698395, "rewards/margins": 0.8272825181484222, "rewards/rejected": -1.1773252487182617, "step": 1510 }, { "epoch": 0.0800890467230276, "grad_norm": 59.25, "kl": 0.3232269287109375, "learning_rate": 5e-07, "logits/chosen": -27675028.57142857, "logits/rejected": 460114880.0, "logps/chosen": -199.39263044084822, "logps/rejected": -1285.7076416015625, "loss": 0.4301, "rewards/chosen": 0.124123181615557, "rewards/margins": 2.454018201146807, "rewards/rejected": -2.32989501953125, "step": 1511 }, { "epoch": 0.08014205072482973, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81744896.0, "logits/rejected": -25696776.0, "logps/chosen": -176.46115112304688, "logps/rejected": -335.0772705078125, "loss": 0.2312, "rewards/chosen": 0.26965588331222534, "rewards/margins": 2.088629424571991, "rewards/rejected": -1.8189735412597656, "step": 1512 }, { "epoch": 0.08019505472663187, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24029248.0, "logits/rejected": -45650760.0, "logps/chosen": -208.85769653320312, "logps/rejected": -533.8970947265625, "loss": 0.3113, "rewards/chosen": 0.3585294485092163, "rewards/margins": 1.8278234004974365, "rewards/rejected": -1.4692939519882202, "step": 1513 }, { "epoch": 0.08024805872843399, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34299177.6, "logits/rejected": -7342798.0, "logps/chosen": -364.45869140625, "logps/rejected": -258.32851155598956, "loss": 0.3334, "rewards/chosen": 0.28277053833007815, "rewards/margins": 1.888377571105957, "rewards/rejected": -1.605607032775879, "step": 1514 }, { "epoch": 0.08030106273023613, "grad_norm": 56.25, "kl": 0.40775108337402344, "learning_rate": 5e-07, "logits/chosen": -27651728.0, "logits/rejected": -37813668.0, "logps/chosen": -340.9286193847656, "logps/rejected": -379.89727783203125, "loss": 0.3151, "rewards/chosen": 0.47801297903060913, "rewards/margins": 2.035554349422455, "rewards/rejected": -1.5575413703918457, "step": 1515 }, { "epoch": 0.08035406673203827, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35821864.0, "logits/rejected": -33225586.666666668, "logps/chosen": -614.7721557617188, "logps/rejected": -450.5638427734375, "loss": 0.2655, "rewards/chosen": 0.037334442138671875, "rewards/margins": 1.8888007799784343, "rewards/rejected": -1.8514663378397624, "step": 1516 }, { "epoch": 0.0804070707338404, "grad_norm": 62.5, "kl": 0.1710052490234375, "learning_rate": 5e-07, "logits/chosen": -65834438.4, "logits/rejected": -5415253.0, "logps/chosen": -348.401513671875, "logps/rejected": -133.3470255533854, "loss": 0.396, "rewards/chosen": 0.1130096435546875, "rewards/margins": 1.1341659545898437, "rewards/rejected": -1.0211563110351562, "step": 1517 }, { "epoch": 0.08046007473564254, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10118950.0, "logits/rejected": -42526384.0, "logps/chosen": -519.4570922851562, "logps/rejected": -454.45672607421875, "loss": 0.3548, "rewards/chosen": 0.24237027764320374, "rewards/margins": 1.3249503672122955, "rewards/rejected": -1.0825800895690918, "step": 1518 }, { "epoch": 0.08051307873744468, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 772553.4285714285, "logits/rejected": 4502671.5, "logps/chosen": -140.50773402622767, "logps/rejected": -300.4299621582031, "loss": 0.4087, "rewards/chosen": 0.20842242240905762, "rewards/margins": 2.14984393119812, "rewards/rejected": -1.9414215087890625, "step": 1519 }, { "epoch": 0.08056608273924681, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2442449.0, "logits/rejected": -42737088.0, "logps/chosen": -223.4734649658203, "logps/rejected": -648.8485717773438, "loss": 0.2712, "rewards/chosen": 0.17151106894016266, "rewards/margins": 2.657114401459694, "rewards/rejected": -2.4856033325195312, "step": 1520 }, { "epoch": 0.08061908674104895, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62617728.0, "logits/rejected": -40287984.0, "logps/chosen": -343.816259765625, "logps/rejected": -576.2490234375, "loss": 0.3859, "rewards/chosen": 0.056102752685546875, "rewards/margins": 1.644213040669759, "rewards/rejected": -1.5881102879842122, "step": 1521 }, { "epoch": 0.08067209074285109, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12637900.0, "logits/rejected": -34575304.0, "logps/chosen": -119.19202423095703, "logps/rejected": -877.8009643554688, "loss": 0.3199, "rewards/chosen": -0.2867472767829895, "rewards/margins": 3.10746830701828, "rewards/rejected": -3.3942155838012695, "step": 1522 }, { "epoch": 0.08072509474465323, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7646136.5, "logits/rejected": -30066754.0, "logps/chosen": -539.560302734375, "logps/rejected": -325.66827392578125, "loss": 0.3076, "rewards/chosen": 0.3636695146560669, "rewards/margins": 1.828098177909851, "rewards/rejected": -1.4644286632537842, "step": 1523 }, { "epoch": 0.08077809874645536, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19228988.0, "logits/rejected": 1044153.875, "logps/chosen": -276.3094482421875, "logps/rejected": -334.8063659667969, "loss": 0.3409, "rewards/chosen": 0.21878451108932495, "rewards/margins": 1.550954282283783, "rewards/rejected": -1.332169771194458, "step": 1524 }, { "epoch": 0.0808311027482575, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27017332.0, "logits/rejected": 1034371.75, "logps/chosen": -177.90902709960938, "logps/rejected": -303.5303955078125, "loss": 0.4089, "rewards/chosen": -0.067413330078125, "rewards/margins": 0.8533157110214233, "rewards/rejected": -0.9207290410995483, "step": 1525 }, { "epoch": 0.08088410675005962, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32367686.4, "logits/rejected": -35428613.333333336, "logps/chosen": -507.215625, "logps/rejected": -314.5662027994792, "loss": 0.3534, "rewards/chosen": 0.2696964740753174, "rewards/margins": 1.6413268248240154, "rewards/rejected": -1.371630350748698, "step": 1526 }, { "epoch": 0.08093711075186176, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52950037.333333336, "logits/rejected": -25601761.6, "logps/chosen": -480.0838216145833, "logps/rejected": -265.542626953125, "loss": 0.3426, "rewards/chosen": 0.04703572392463684, "rewards/margins": 1.383470755815506, "rewards/rejected": -1.336435031890869, "step": 1527 }, { "epoch": 0.0809901147536639, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20966036.0, "logits/rejected": 7296923.0, "logps/chosen": -237.41824340820312, "logps/rejected": -377.2080078125, "loss": 0.4141, "rewards/chosen": -0.0542246513068676, "rewards/margins": 0.8222411461174488, "rewards/rejected": -0.8764657974243164, "step": 1528 }, { "epoch": 0.08104311875546603, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1439831.3333333333, "logits/rejected": 1329232.0, "logps/chosen": -91.42074584960938, "logps/rejected": -113.1525390625, "loss": 0.3897, "rewards/chosen": -0.132568359375, "rewards/margins": 0.7456090927124024, "rewards/rejected": -0.8781774520874024, "step": 1529 }, { "epoch": 0.08109612275726817, "grad_norm": 55.25, "kl": 0.7068748474121094, "learning_rate": 5e-07, "logits/chosen": -37731234.666666664, "logits/rejected": -25125342.4, "logps/chosen": -232.15144856770834, "logps/rejected": -321.6389404296875, "loss": 0.3004, "rewards/chosen": 0.22508615255355835, "rewards/margins": 1.7359443068504334, "rewards/rejected": -1.510858154296875, "step": 1530 }, { "epoch": 0.08114912675907031, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44427237.333333336, "logits/rejected": -41959564.0, "logps/chosen": -531.2120768229166, "logps/rejected": -350.04534912109375, "loss": 0.371, "rewards/chosen": 0.20563743511835733, "rewards/margins": 2.1641855041186013, "rewards/rejected": -1.9585480690002441, "step": 1531 }, { "epoch": 0.08120213076087245, "grad_norm": 69.5, "kl": 0.49840545654296875, "learning_rate": 5e-07, "logits/chosen": -41751680.0, "logits/rejected": -38285060.0, "logps/chosen": -540.7782592773438, "logps/rejected": -212.6458282470703, "loss": 0.3961, "rewards/chosen": 0.23141151666641235, "rewards/margins": 1.0071981549263, "rewards/rejected": -0.7757866382598877, "step": 1532 }, { "epoch": 0.08125513476267458, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9488287.0, "logits/rejected": 2517720.5, "logps/chosen": -243.283447265625, "logps/rejected": -215.3172607421875, "loss": 0.3471, "rewards/chosen": 0.286845862865448, "rewards/margins": 1.3743296265602112, "rewards/rejected": -1.0874837636947632, "step": 1533 }, { "epoch": 0.08130813876447672, "grad_norm": 61.25, "kl": 1.5320587158203125, "learning_rate": 5e-07, "logits/chosen": -33573872.0, "logits/rejected": -37061272.0, "logps/chosen": -300.3344319661458, "logps/rejected": -894.8262329101562, "loss": 0.419, "rewards/chosen": 0.08771624167760213, "rewards/margins": 2.333685060342153, "rewards/rejected": -2.245968818664551, "step": 1534 }, { "epoch": 0.08136114276627886, "grad_norm": 72.0, "kl": 0.5652790069580078, "learning_rate": 5e-07, "logits/chosen": -20797196.0, "logits/rejected": -36095074.666666664, "logps/chosen": -1184.9295654296875, "logps/rejected": -184.86299641927084, "loss": 0.2544, "rewards/chosen": 0.7485595941543579, "rewards/margins": 2.052959720293681, "rewards/rejected": -1.304400126139323, "step": 1535 }, { "epoch": 0.081414146768081, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6327573.333333333, "logits/rejected": -31211888.0, "logps/chosen": -183.8544921875, "logps/rejected": -360.6451110839844, "loss": 0.449, "rewards/chosen": -0.1366362969080607, "rewards/margins": 1.3919992049535115, "rewards/rejected": -1.5286355018615723, "step": 1536 }, { "epoch": 0.08146715076988313, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43977644.8, "logits/rejected": -33574280.0, "logps/chosen": -258.5870361328125, "logps/rejected": -383.1446940104167, "loss": 0.409, "rewards/chosen": -0.04711887538433075, "rewards/margins": 1.3132347534100215, "rewards/rejected": -1.3603536287943523, "step": 1537 }, { "epoch": 0.08152015477168527, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24026210.0, "logits/rejected": -23469910.0, "logps/chosen": -295.6172790527344, "logps/rejected": -169.06651306152344, "loss": 0.2995, "rewards/chosen": 0.1963103860616684, "rewards/margins": 2.2853020280599594, "rewards/rejected": -2.088991641998291, "step": 1538 }, { "epoch": 0.08157315877348739, "grad_norm": 98.5, "kl": 0.5499954223632812, "learning_rate": 5e-07, "logits/chosen": -26005780.57142857, "logits/rejected": -19487492.0, "logps/chosen": -618.64501953125, "logps/rejected": -179.01458740234375, "loss": 0.457, "rewards/chosen": 0.09812138761792864, "rewards/margins": 1.3973401614597865, "rewards/rejected": -1.299218773841858, "step": 1539 }, { "epoch": 0.08162616277528953, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15609565.333333334, "logits/rejected": -17518667.2, "logps/chosen": -338.878662109375, "logps/rejected": -390.567578125, "loss": 0.3033, "rewards/chosen": -0.036148580412069954, "rewards/margins": 1.8630720769365627, "rewards/rejected": -1.8992206573486328, "step": 1540 }, { "epoch": 0.08167916677709167, "grad_norm": 59.5, "kl": 0.08787727355957031, "learning_rate": 5e-07, "logits/chosen": -2493658.4, "logits/rejected": -19559093.333333332, "logps/chosen": -344.26474609375, "logps/rejected": -499.61279296875, "loss": 0.3806, "rewards/chosen": -0.1498876094818115, "rewards/margins": 2.1586797555287682, "rewards/rejected": -2.3085673650105796, "step": 1541 }, { "epoch": 0.0817321707788938, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16161690.0, "logits/rejected": 6412903.0, "logps/chosen": -295.4502258300781, "logps/rejected": -219.27525329589844, "loss": 0.3421, "rewards/chosen": 0.2553417682647705, "rewards/margins": 1.4051035642623901, "rewards/rejected": -1.1497617959976196, "step": 1542 }, { "epoch": 0.08178517478069594, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24756512.0, "logits/rejected": -50196314.666666664, "logps/chosen": -281.91689453125, "logps/rejected": -218.99910481770834, "loss": 0.4492, "rewards/chosen": -0.22151975631713866, "rewards/margins": 0.8741040547688801, "rewards/rejected": -1.0956238110860188, "step": 1543 }, { "epoch": 0.08183817878249808, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11775845.333333334, "logits/rejected": -30767566.0, "logps/chosen": -172.5135498046875, "logps/rejected": -270.0863037109375, "loss": 0.5079, "rewards/chosen": -0.4636911948521932, "rewards/margins": 0.8564355770746868, "rewards/rejected": -1.3201267719268799, "step": 1544 }, { "epoch": 0.08189118278430021, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41910972.0, "logits/rejected": -69352440.0, "logps/chosen": -338.0000915527344, "logps/rejected": -377.9376220703125, "loss": 0.3305, "rewards/chosen": 0.4002952575683594, "rewards/margins": 1.4890300035476685, "rewards/rejected": -1.088734745979309, "step": 1545 }, { "epoch": 0.08194418678610235, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5299813.333333333, "logits/rejected": -39129411.2, "logps/chosen": -110.04689534505208, "logps/rejected": -546.83916015625, "loss": 0.2977, "rewards/chosen": 0.2047581672668457, "rewards/margins": 2.2020453453063964, "rewards/rejected": -1.9972871780395507, "step": 1546 }, { "epoch": 0.08199719078790449, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32419058.0, "logits/rejected": -38408828.0, "logps/chosen": -215.09295654296875, "logps/rejected": -527.91748046875, "loss": 0.3147, "rewards/chosen": 0.017984725534915924, "rewards/margins": 2.0502844825387, "rewards/rejected": -2.032299757003784, "step": 1547 }, { "epoch": 0.08205019478970663, "grad_norm": 79.5, "kl": 0.4052734375, "learning_rate": 5e-07, "logits/chosen": 38353488.0, "logits/rejected": -27104435.2, "logps/chosen": -608.4665120442709, "logps/rejected": -256.6169189453125, "loss": 0.3069, "rewards/chosen": 0.4252858559290568, "rewards/margins": 1.7339583794275921, "rewards/rejected": -1.3086725234985352, "step": 1548 }, { "epoch": 0.08210319879150876, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36411616.0, "logits/rejected": -10396246.0, "logps/chosen": -228.8456573486328, "logps/rejected": -219.91517639160156, "loss": 0.4159, "rewards/chosen": -0.06104031205177307, "rewards/margins": 0.9671308696269989, "rewards/rejected": -1.028171181678772, "step": 1549 }, { "epoch": 0.0821562027933109, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8198736.0, "logits/rejected": -18084128.0, "logps/chosen": -128.47664388020834, "logps/rejected": -265.379345703125, "loss": 0.4001, "rewards/chosen": -0.04617831607659658, "rewards/margins": 0.7196864376465479, "rewards/rejected": -0.7658647537231446, "step": 1550 }, { "epoch": 0.08220920679511304, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23434958.0, "logits/rejected": -44717188.571428575, "logps/chosen": -362.39874267578125, "logps/rejected": -218.86980329241072, "loss": 0.2723, "rewards/chosen": -0.3434814512729645, "rewards/margins": 0.9576055194650377, "rewards/rejected": -1.3010869707380022, "step": 1551 }, { "epoch": 0.08226221079691516, "grad_norm": 78.5, "kl": 1.2323989868164062, "learning_rate": 5e-07, "logits/chosen": -59614704.0, "logits/rejected": -41888240.0, "logps/chosen": -875.4089965820312, "logps/rejected": -323.42718505859375, "loss": 0.3521, "rewards/chosen": 0.3310321867465973, "rewards/margins": 1.8007340729236603, "rewards/rejected": -1.469701886177063, "step": 1552 }, { "epoch": 0.0823152147987173, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15078469.0, "logits/rejected": -20412485.333333332, "logps/chosen": -271.2019958496094, "logps/rejected": -327.2464599609375, "loss": 0.2238, "rewards/chosen": -0.12985610961914062, "rewards/margins": 2.094072182973226, "rewards/rejected": -2.2239282925923667, "step": 1553 }, { "epoch": 0.08236821880051944, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50362336.0, "logits/rejected": -36644453.333333336, "logps/chosen": -342.01220703125, "logps/rejected": -371.4241943359375, "loss": 0.3889, "rewards/chosen": -0.126898193359375, "rewards/margins": 1.6453697840372723, "rewards/rejected": -1.7722679773966472, "step": 1554 }, { "epoch": 0.08242122280232157, "grad_norm": 70.0, "kl": 1.3624725341796875, "learning_rate": 5e-07, "logits/chosen": -77213984.0, "logits/rejected": -58619272.0, "logps/chosen": -355.9989827473958, "logps/rejected": -415.565673828125, "loss": 0.4038, "rewards/chosen": 0.24403993288675943, "rewards/margins": 1.9341339270273845, "rewards/rejected": -1.690093994140625, "step": 1555 }, { "epoch": 0.08247422680412371, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33933080.0, "logits/rejected": -25522598.0, "logps/chosen": -256.36968994140625, "logps/rejected": -390.6104736328125, "loss": 0.338, "rewards/chosen": -0.06369401514530182, "rewards/margins": 1.6360124200582504, "rewards/rejected": -1.6997064352035522, "step": 1556 }, { "epoch": 0.08252723080592585, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4558424.5, "logits/rejected": -17751832.0, "logps/chosen": -298.6987609863281, "logps/rejected": -154.2773895263672, "loss": 0.3764, "rewards/chosen": 0.4679010510444641, "rewards/margins": 1.0826486945152283, "rewards/rejected": -0.6147476434707642, "step": 1557 }, { "epoch": 0.08258023480772798, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24375990.85714286, "logits/rejected": -11919288.0, "logps/chosen": -266.2727573939732, "logps/rejected": -116.71761322021484, "loss": 0.4711, "rewards/chosen": 0.12841696398598806, "rewards/margins": 0.22120642768485205, "rewards/rejected": -0.09278946369886398, "step": 1558 }, { "epoch": 0.08263323880953012, "grad_norm": 62.5, "kl": 0.9888496398925781, "learning_rate": 5e-07, "logits/chosen": -22752030.4, "logits/rejected": 3126256.6666666665, "logps/chosen": -494.52041015625, "logps/rejected": -84.84879048665364, "loss": 0.4023, "rewards/chosen": 0.07200950384140015, "rewards/margins": 1.154586096604665, "rewards/rejected": -1.082576592763265, "step": 1559 }, { "epoch": 0.08268624281133226, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28977088.0, "logits/rejected": -8642475.2, "logps/chosen": -255.467529296875, "logps/rejected": -141.13052978515626, "loss": 0.3186, "rewards/chosen": 0.17979806661605835, "rewards/margins": 1.5437143683433532, "rewards/rejected": -1.3639163017272948, "step": 1560 }, { "epoch": 0.0827392468131344, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2425574.0, "logits/rejected": -45314725.333333336, "logps/chosen": -167.47561645507812, "logps/rejected": -550.9939778645834, "loss": 0.2098, "rewards/chosen": 0.39708414673805237, "rewards/margins": 2.277565071980159, "rewards/rejected": -1.8804809252421062, "step": 1561 }, { "epoch": 0.08279225081493653, "grad_norm": 80.0, "kl": 1.1340246200561523, "learning_rate": 5e-07, "logits/chosen": -51449364.0, "logits/rejected": -28124576.0, "logps/chosen": -900.617919921875, "logps/rejected": -307.3149719238281, "loss": 0.3493, "rewards/chosen": 0.4982188940048218, "rewards/margins": 1.4445813298225403, "rewards/rejected": -0.9463624358177185, "step": 1562 }, { "epoch": 0.08284525481673867, "grad_norm": 62.75, "kl": 0.5714340209960938, "learning_rate": 5e-07, "logits/chosen": -32112373.333333332, "logits/rejected": -58333192.0, "logps/chosen": -374.9127604166667, "logps/rejected": -279.96722412109375, "loss": 0.4004, "rewards/chosen": 0.12667871514956155, "rewards/margins": 2.2545635799566903, "rewards/rejected": -2.127884864807129, "step": 1563 }, { "epoch": 0.0828982588185408, "grad_norm": 72.5, "kl": 0.00261688232421875, "learning_rate": 5e-07, "logits/chosen": -68124144.0, "logits/rejected": -5670424.0, "logps/chosen": -499.1884765625, "logps/rejected": -337.62054443359375, "loss": 0.3796, "rewards/chosen": 0.04684295505285263, "rewards/margins": 1.730351708829403, "rewards/rejected": -1.6835087537765503, "step": 1564 }, { "epoch": 0.08295126282034293, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21724177.6, "logits/rejected": -36389706.666666664, "logps/chosen": -285.251513671875, "logps/rejected": -411.0628255208333, "loss": 0.3351, "rewards/chosen": 0.41330685615539553, "rewards/margins": 1.7371299902598065, "rewards/rejected": -1.3238231341044109, "step": 1565 }, { "epoch": 0.08300426682214507, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17196003.2, "logits/rejected": -55573632.0, "logps/chosen": -188.7457763671875, "logps/rejected": -328.620849609375, "loss": 0.401, "rewards/chosen": -0.12326035499572754, "rewards/margins": 1.483516200383504, "rewards/rejected": -1.6067765553792317, "step": 1566 }, { "epoch": 0.0830572708239472, "grad_norm": 72.5, "kl": 0.5113754272460938, "learning_rate": 5e-07, "logits/chosen": 5870035.2, "logits/rejected": -41095333.333333336, "logps/chosen": -471.87578125, "logps/rejected": -331.05645751953125, "loss": 0.4055, "rewards/chosen": -0.06970741748809814, "rewards/margins": 1.4556288480758668, "rewards/rejected": -1.5253362655639648, "step": 1567 }, { "epoch": 0.08311027482574934, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47507200.0, "logits/rejected": -16925728.0, "logps/chosen": -423.631591796875, "logps/rejected": -237.93002319335938, "loss": 0.2841, "rewards/chosen": 0.5645660758018494, "rewards/margins": 2.0829063057899475, "rewards/rejected": -1.5183402299880981, "step": 1568 }, { "epoch": 0.08316327882755148, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1885247.125, "logits/rejected": -1072021.8333333333, "logps/chosen": -258.330810546875, "logps/rejected": -229.61612955729166, "loss": 0.3822, "rewards/chosen": 0.10118646919727325, "rewards/margins": 0.7990673730770746, "rewards/rejected": -0.6978809038798014, "step": 1569 }, { "epoch": 0.08321628282935362, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26432485.333333332, "logits/rejected": -107749312.0, "logps/chosen": -242.17891438802084, "logps/rejected": -958.7825317382812, "loss": 0.4099, "rewards/chosen": -0.0315882017215093, "rewards/margins": 2.096256131927172, "rewards/rejected": -2.1278443336486816, "step": 1570 }, { "epoch": 0.08326928683115575, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1345854.6, "logits/rejected": -34582170.666666664, "logps/chosen": -241.6064208984375, "logps/rejected": -349.71484375, "loss": 0.3889, "rewards/chosen": -0.019131791591644288, "rewards/margins": 1.659143316745758, "rewards/rejected": -1.6782751083374023, "step": 1571 }, { "epoch": 0.08332229083295789, "grad_norm": 53.25, "kl": 0.11207962036132812, "learning_rate": 5e-07, "logits/chosen": -26624384.0, "logits/rejected": -7790088.5, "logps/chosen": -224.92706298828125, "logps/rejected": -202.38803100585938, "loss": 0.3478, "rewards/chosen": 0.19227710366249084, "rewards/margins": 1.5298503935337067, "rewards/rejected": -1.3375732898712158, "step": 1572 }, { "epoch": 0.08337529483476003, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16493835.2, "logits/rejected": 11438556.0, "logps/chosen": -288.481396484375, "logps/rejected": -186.09586588541666, "loss": 0.3519, "rewards/chosen": 0.21566174030303956, "rewards/margins": 1.698404844601949, "rewards/rejected": -1.4827431042989094, "step": 1573 }, { "epoch": 0.08342829883656216, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61360224.0, "logits/rejected": -6255596.0, "logps/chosen": -430.721923828125, "logps/rejected": -344.1118977864583, "loss": 0.326, "rewards/chosen": 0.0782928317785263, "rewards/margins": 1.2697976281245549, "rewards/rejected": -1.1915047963460286, "step": 1574 }, { "epoch": 0.0834813028383643, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12869188.0, "logits/rejected": 32644634.0, "logps/chosen": -135.147705078125, "logps/rejected": -497.8915710449219, "loss": 0.3587, "rewards/chosen": 0.18107232451438904, "rewards/margins": 1.477023333311081, "rewards/rejected": -1.295951008796692, "step": 1575 }, { "epoch": 0.08353430684016644, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23233648.0, "logits/rejected": 43920164.0, "logps/chosen": -307.0154724121094, "logps/rejected": -403.15777587890625, "loss": 0.3419, "rewards/chosen": -0.1605861783027649, "rewards/margins": 2.2127320170402527, "rewards/rejected": -2.3733181953430176, "step": 1576 }, { "epoch": 0.08358731084196856, "grad_norm": 58.75, "kl": 0.10237693786621094, "learning_rate": 5e-07, "logits/chosen": -59362128.0, "logits/rejected": -37452156.8, "logps/chosen": -276.0145670572917, "logps/rejected": -402.5879150390625, "loss": 0.3446, "rewards/chosen": -0.1615478495756785, "rewards/margins": 1.2468417187531788, "rewards/rejected": -1.4083895683288574, "step": 1577 }, { "epoch": 0.0836403148437707, "grad_norm": 79.0, "kl": 1.1669292449951172, "learning_rate": 5e-07, "logits/chosen": -14464673.333333334, "logits/rejected": -24019348.0, "logps/chosen": -462.7668050130208, "logps/rejected": -261.4963684082031, "loss": 0.419, "rewards/chosen": 0.1415428320566813, "rewards/margins": 1.6979880730311077, "rewards/rejected": -1.5564452409744263, "step": 1578 }, { "epoch": 0.08369331884557284, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34402364.0, "logits/rejected": -18314874.0, "logps/chosen": -417.4954833984375, "logps/rejected": -151.73129272460938, "loss": 0.3224, "rewards/chosen": 0.14399223029613495, "rewards/margins": 1.7094600647687912, "rewards/rejected": -1.5654678344726562, "step": 1579 }, { "epoch": 0.08374632284737497, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61907110.4, "logits/rejected": -30735413.333333332, "logps/chosen": -289.70947265625, "logps/rejected": -451.7318522135417, "loss": 0.4283, "rewards/chosen": -0.2848461866378784, "rewards/margins": 1.9024622837702432, "rewards/rejected": -2.1873084704081216, "step": 1580 }, { "epoch": 0.08379932684917711, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10932195.2, "logits/rejected": -7053112.666666667, "logps/chosen": -166.78856201171874, "logps/rejected": -167.8122762044271, "loss": 0.4147, "rewards/chosen": 0.020294761657714842, "rewards/margins": 0.9826345602671305, "rewards/rejected": -0.9623397986094157, "step": 1581 }, { "epoch": 0.08385233085097925, "grad_norm": 69.5, "kl": 0.3137474060058594, "learning_rate": 5e-07, "logits/chosen": -36186176.0, "logits/rejected": -24702082.666666668, "logps/chosen": -321.066943359375, "logps/rejected": -394.0938720703125, "loss": 0.4323, "rewards/chosen": -0.17346421480178834, "rewards/margins": 1.2079424103101093, "rewards/rejected": -1.3814066251118977, "step": 1582 }, { "epoch": 0.08390533485278139, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26175202.0, "logits/rejected": -3809325.25, "logps/chosen": -295.27484130859375, "logps/rejected": -729.18994140625, "loss": 0.3031, "rewards/chosen": 0.23275423049926758, "rewards/margins": 2.423064947128296, "rewards/rejected": -2.1903107166290283, "step": 1583 }, { "epoch": 0.08395833885458352, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10562427.0, "logits/rejected": 106407616.0, "logps/chosen": -199.42665100097656, "logps/rejected": -499.35711669921875, "loss": 0.4495, "rewards/chosen": -0.33139440417289734, "rewards/margins": 0.7574868500232697, "rewards/rejected": -1.088881254196167, "step": 1584 }, { "epoch": 0.08401134285638566, "grad_norm": 54.5, "kl": 0.28311634063720703, "learning_rate": 5e-07, "logits/chosen": 18961008.0, "logits/rejected": -14830324.0, "logps/chosen": -639.8443603515625, "logps/rejected": -284.97926839192706, "loss": 0.235, "rewards/chosen": 0.7478485107421875, "rewards/margins": 2.5021036465962725, "rewards/rejected": -1.7542551358540852, "step": 1585 }, { "epoch": 0.0840643468581878, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25131104.0, "logits/rejected": -54534694.4, "logps/chosen": -503.6414794921875, "logps/rejected": -374.994970703125, "loss": 0.2978, "rewards/chosen": 0.22632445891698202, "rewards/margins": 1.672104259332021, "rewards/rejected": -1.445779800415039, "step": 1586 }, { "epoch": 0.08411735085998993, "grad_norm": 66.5, "kl": 0.12886619567871094, "learning_rate": 5e-07, "logits/chosen": -44968310.4, "logits/rejected": -64343168.0, "logps/chosen": -387.117578125, "logps/rejected": -163.6676025390625, "loss": 0.4474, "rewards/chosen": -0.07368698120117187, "rewards/margins": 0.66964643796285, "rewards/rejected": -0.7433334191640218, "step": 1587 }, { "epoch": 0.08417035486179207, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 350306.0, "logits/rejected": -73952309.33333333, "logps/chosen": -489.9134521484375, "logps/rejected": -369.0517171223958, "loss": 0.2327, "rewards/chosen": 0.5384494662284851, "rewards/margins": 2.6190964579582214, "rewards/rejected": -2.0806469917297363, "step": 1588 }, { "epoch": 0.08422335886359421, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5988729.333333333, "logits/rejected": 7713344.0, "logps/chosen": -239.2070515950521, "logps/rejected": -191.7283935546875, "loss": 0.3268, "rewards/chosen": 0.1834965944290161, "rewards/margins": 1.322679352760315, "rewards/rejected": -1.1391827583312988, "step": 1589 }, { "epoch": 0.08427636286539633, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17093769.6, "logits/rejected": -14943924.0, "logps/chosen": -249.7849609375, "logps/rejected": -127.71405029296875, "loss": 0.3796, "rewards/chosen": 0.13937522172927858, "rewards/margins": 1.3311216155687968, "rewards/rejected": -1.1917463938395183, "step": 1590 }, { "epoch": 0.08432936686719847, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17593658.0, "logits/rejected": 104473930.66666667, "logps/chosen": -138.1791229248047, "logps/rejected": -312.3818766276042, "loss": 0.2705, "rewards/chosen": 0.36437568068504333, "rewards/margins": 1.706539084513982, "rewards/rejected": -1.3421634038289387, "step": 1591 }, { "epoch": 0.0843823708690006, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22073874.666666668, "logits/rejected": -23542460.8, "logps/chosen": -45.70721435546875, "logps/rejected": -335.664013671875, "loss": 0.3155, "rewards/chosen": -0.039102556804815926, "rewards/margins": 1.5130168889959652, "rewards/rejected": -1.5521194458007812, "step": 1592 }, { "epoch": 0.08443537487080274, "grad_norm": 72.0, "kl": 0.5834083557128906, "learning_rate": 5e-07, "logits/chosen": -20720294.4, "logits/rejected": -27484690.666666668, "logps/chosen": -743.8814453125, "logps/rejected": -289.8492431640625, "loss": 0.312, "rewards/chosen": 0.6646016120910645, "rewards/margins": 2.1442444801330565, "rewards/rejected": -1.4796428680419922, "step": 1593 }, { "epoch": 0.08448837887260488, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53471465.6, "logits/rejected": -24985248.0, "logps/chosen": -271.306298828125, "logps/rejected": -172.69978841145834, "loss": 0.4524, "rewards/chosen": -0.037626650929450986, "rewards/margins": 0.5614329308271409, "rewards/rejected": -0.5990595817565918, "step": 1594 }, { "epoch": 0.08454138287440702, "grad_norm": 47.25, "kl": 0.89508056640625, "learning_rate": 5e-07, "logits/chosen": -44686032.0, "logits/rejected": -25552494.0, "logps/chosen": -183.55691528320312, "logps/rejected": -453.737060546875, "loss": 0.351, "rewards/chosen": -0.0531105101108551, "rewards/margins": 1.9697960317134857, "rewards/rejected": -2.022906541824341, "step": 1595 }, { "epoch": 0.08459438687620915, "grad_norm": 50.5, "kl": 0.2760047912597656, "learning_rate": 5e-07, "logits/chosen": -2966463.0, "logits/rejected": -28090934.0, "logps/chosen": -288.8702087402344, "logps/rejected": -435.49737548828125, "loss": 0.2961, "rewards/chosen": 0.41010046005249023, "rewards/margins": 2.0957882404327393, "rewards/rejected": -1.685687780380249, "step": 1596 }, { "epoch": 0.08464739087801129, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38610909.333333336, "logits/rejected": -36038886.4, "logps/chosen": -472.717529296875, "logps/rejected": -250.53125, "loss": 0.3224, "rewards/chosen": 0.2015706499417623, "rewards/margins": 1.4467336138089497, "rewards/rejected": -1.2451629638671875, "step": 1597 }, { "epoch": 0.08470039487981343, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38714540.0, "logits/rejected": -29079256.0, "logps/chosen": -206.12901306152344, "logps/rejected": -424.8887939453125, "loss": 0.32, "rewards/chosen": 0.2672765851020813, "rewards/margins": 1.8901422619819641, "rewards/rejected": -1.6228656768798828, "step": 1598 }, { "epoch": 0.08475339888161557, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25742084.0, "logits/rejected": -43529560.0, "logps/chosen": -263.7891845703125, "logps/rejected": -298.99224853515625, "loss": 0.3703, "rewards/chosen": -0.08858991414308548, "rewards/margins": 1.3567874357104301, "rewards/rejected": -1.4453773498535156, "step": 1599 }, { "epoch": 0.0848064028834177, "grad_norm": 70.0, "kl": 0.9523086547851562, "learning_rate": 5e-07, "logits/chosen": -40287117.333333336, "logits/rejected": -22214152.0, "logps/chosen": -559.0266520182291, "logps/rejected": -94.13180541992188, "loss": 0.4201, "rewards/chosen": 0.2203097144762675, "rewards/margins": 1.2284453908602397, "rewards/rejected": -1.0081356763839722, "step": 1600 }, { "epoch": 0.08485940688521984, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59153688.0, "logits/rejected": -16920941.333333332, "logps/chosen": -629.910888671875, "logps/rejected": -182.29744466145834, "loss": 0.3507, "rewards/chosen": 0.19842529296875, "rewards/margins": 1.1140480041503906, "rewards/rejected": -0.9156227111816406, "step": 1601 }, { "epoch": 0.08491241088702196, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22804.666666666668, "logits/rejected": -7706866.4, "logps/chosen": -92.35270182291667, "logps/rejected": -264.7051513671875, "loss": 0.3604, "rewards/chosen": -0.11000670989354451, "rewards/margins": 1.1541134874025982, "rewards/rejected": -1.2641201972961427, "step": 1602 }, { "epoch": 0.0849654148888241, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8417523.333333334, "logits/rejected": -18647491.2, "logps/chosen": -274.2468668619792, "logps/rejected": -181.9402587890625, "loss": 0.305, "rewards/chosen": 0.3098047971725464, "rewards/margins": 1.56443030834198, "rewards/rejected": -1.2546255111694335, "step": 1603 }, { "epoch": 0.08501841889062624, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33695080.0, "logits/rejected": 5134520.0, "logps/chosen": -314.2306213378906, "logps/rejected": -198.15420532226562, "loss": 0.3548, "rewards/chosen": 0.23083487153053284, "rewards/margins": 1.3181945383548737, "rewards/rejected": -1.0873596668243408, "step": 1604 }, { "epoch": 0.08507142289242837, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9997670.4, "logits/rejected": -26474752.0, "logps/chosen": -301.2250244140625, "logps/rejected": -474.5353597005208, "loss": 0.3532, "rewards/chosen": 0.100303316116333, "rewards/margins": 2.019767745335897, "rewards/rejected": -1.9194644292195637, "step": 1605 }, { "epoch": 0.08512442689423051, "grad_norm": 60.5, "kl": 0.11760330200195312, "learning_rate": 5e-07, "logits/chosen": -25005368.0, "logits/rejected": -45109100.0, "logps/chosen": -423.2271728515625, "logps/rejected": -228.6431121826172, "loss": 0.3333, "rewards/chosen": 0.3002592623233795, "rewards/margins": 1.7119598686695099, "rewards/rejected": -1.4117006063461304, "step": 1606 }, { "epoch": 0.08517743089603265, "grad_norm": 59.25, "kl": 0.9114980697631836, "learning_rate": 5e-07, "logits/chosen": -12447471.2, "logits/rejected": -8690526.0, "logps/chosen": -369.5691650390625, "logps/rejected": -273.69118245442706, "loss": 0.4135, "rewards/chosen": 0.12380971908569335, "rewards/margins": 1.2757406870524088, "rewards/rejected": -1.1519309679667156, "step": 1607 }, { "epoch": 0.08523043489783479, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1843540.0, "logits/rejected": -92447824.0, "logps/chosen": -503.74114990234375, "logps/rejected": -606.697998046875, "loss": 0.3352, "rewards/chosen": -0.215007022023201, "rewards/margins": 2.173298165202141, "rewards/rejected": -2.388305187225342, "step": 1608 }, { "epoch": 0.08528343889963692, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29886661.333333332, "logits/rejected": -52089328.0, "logps/chosen": -423.6763509114583, "logps/rejected": -319.93304443359375, "loss": 0.4159, "rewards/chosen": 0.14209556579589844, "rewards/margins": 1.2143619060516357, "rewards/rejected": -1.0722663402557373, "step": 1609 }, { "epoch": 0.08533644290143906, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71284096.0, "logits/rejected": -26748256.0, "logps/chosen": -236.15025329589844, "logps/rejected": -210.66473388671875, "loss": 0.372, "rewards/chosen": 0.03625956177711487, "rewards/margins": 1.293895274400711, "rewards/rejected": -1.2576357126235962, "step": 1610 }, { "epoch": 0.0853894469032412, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1136365.0, "logits/rejected": -3724736.0, "logps/chosen": -158.62122802734376, "logps/rejected": -385.902587890625, "loss": 0.4197, "rewards/chosen": -0.04476081132888794, "rewards/margins": 1.022732388973236, "rewards/rejected": -1.067493200302124, "step": 1611 }, { "epoch": 0.08544245090504334, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24205872.0, "logits/rejected": -25424819.2, "logps/chosen": -312.4291585286458, "logps/rejected": -229.9253173828125, "loss": 0.2972, "rewards/chosen": 0.33803101380666095, "rewards/margins": 1.6560020526250203, "rewards/rejected": -1.3179710388183594, "step": 1612 }, { "epoch": 0.08549545490684547, "grad_norm": 62.25, "kl": 0.5924606323242188, "learning_rate": 5e-07, "logits/chosen": -28068067.2, "logits/rejected": -21389748.0, "logps/chosen": -299.295751953125, "logps/rejected": -224.69134521484375, "loss": 0.3547, "rewards/chosen": 0.35458343029022216, "rewards/margins": 1.7384135007858277, "rewards/rejected": -1.3838300704956055, "step": 1613 }, { "epoch": 0.08554845890864761, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1450024.5, "logits/rejected": -42664864.0, "logps/chosen": -194.25576782226562, "logps/rejected": -448.75469970703125, "loss": 0.3586, "rewards/chosen": -0.10861469060182571, "rewards/margins": 1.4394642040133476, "rewards/rejected": -1.5480788946151733, "step": 1614 }, { "epoch": 0.08560146291044973, "grad_norm": 52.5, "kl": 1.2468433380126953, "learning_rate": 5e-07, "logits/chosen": -3145920.8571428573, "logits/rejected": -5600766.5, "logps/chosen": -269.51321847098217, "logps/rejected": -325.62664794921875, "loss": 0.41, "rewards/chosen": 0.5125332219260079, "rewards/margins": 1.0764187318938119, "rewards/rejected": -0.563885509967804, "step": 1615 }, { "epoch": 0.08565446691225187, "grad_norm": 62.75, "kl": 0.4246025085449219, "learning_rate": 5e-07, "logits/chosen": -63484490.666666664, "logits/rejected": -53637836.8, "logps/chosen": -720.952392578125, "logps/rejected": -526.517529296875, "loss": 0.2983, "rewards/chosen": 0.10007119178771973, "rewards/margins": 1.9636114597320558, "rewards/rejected": -1.863540267944336, "step": 1616 }, { "epoch": 0.08570747091405401, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24984710.0, "logits/rejected": -17938769.333333332, "logps/chosen": -181.34788513183594, "logps/rejected": -338.7002766927083, "loss": 0.2739, "rewards/chosen": 0.05900688096880913, "rewards/margins": 1.7431969953080018, "rewards/rejected": -1.6841901143391926, "step": 1617 }, { "epoch": 0.08576047491585614, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70835280.0, "logits/rejected": -29534763.42857143, "logps/chosen": -483.88604736328125, "logps/rejected": -230.13352748325892, "loss": 0.3241, "rewards/chosen": 0.17492370307445526, "rewards/margins": 1.0902793854475021, "rewards/rejected": -0.9153556823730469, "step": 1618 }, { "epoch": 0.08581347891765828, "grad_norm": 53.5, "kl": 0.19219207763671875, "learning_rate": 5e-07, "logits/chosen": -1350190.6666666667, "logits/rejected": -61466592.0, "logps/chosen": -383.6529947916667, "logps/rejected": -311.6369873046875, "loss": 0.3319, "rewards/chosen": 0.20798643430074057, "rewards/margins": 1.3758502801259358, "rewards/rejected": -1.1678638458251953, "step": 1619 }, { "epoch": 0.08586648291946042, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7790365.333333333, "logits/rejected": -37719206.4, "logps/chosen": -60.11329650878906, "logps/rejected": -261.3402099609375, "loss": 0.3106, "rewards/chosen": 0.3720939556757609, "rewards/margins": 1.56231898466746, "rewards/rejected": -1.1902250289916991, "step": 1620 }, { "epoch": 0.08591948692126256, "grad_norm": 98.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 43774476.8, "logits/rejected": -56088496.0, "logps/chosen": -436.812109375, "logps/rejected": -444.4974365234375, "loss": 0.4227, "rewards/chosen": -0.20990328788757323, "rewards/margins": 1.2390403270721435, "rewards/rejected": -1.4489436149597168, "step": 1621 }, { "epoch": 0.08597249092306469, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23594410.666666668, "logits/rejected": -38769318.4, "logps/chosen": -435.7576904296875, "logps/rejected": -362.4013427734375, "loss": 0.3272, "rewards/chosen": 0.5332376956939697, "rewards/margins": 1.4137281894683837, "rewards/rejected": -0.880490493774414, "step": 1622 }, { "epoch": 0.08602549492486683, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66017996.8, "logits/rejected": -12349098.666666666, "logps/chosen": -372.9576171875, "logps/rejected": -97.4680887858073, "loss": 0.3886, "rewards/chosen": 0.3857278347015381, "rewards/margins": 0.9773788770039877, "rewards/rejected": -0.5916510423024496, "step": 1623 }, { "epoch": 0.08607849892666897, "grad_norm": 57.75, "kl": 0.38355064392089844, "learning_rate": 5e-07, "logits/chosen": -31676259.2, "logits/rejected": -24202944.0, "logps/chosen": -367.012109375, "logps/rejected": -191.90706380208334, "loss": 0.3771, "rewards/chosen": 0.24333927631378174, "rewards/margins": 1.4977785507837933, "rewards/rejected": -1.2544392744700115, "step": 1624 }, { "epoch": 0.0861315029284711, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33338550.0, "logits/rejected": -55245356.0, "logps/chosen": -335.00872802734375, "logps/rejected": -283.36090087890625, "loss": 0.3835, "rewards/chosen": -0.1559045910835266, "rewards/margins": 1.0791321396827698, "rewards/rejected": -1.2350367307662964, "step": 1625 }, { "epoch": 0.08618450693027324, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58415896.0, "logits/rejected": -10847259.333333334, "logps/chosen": -364.68634033203125, "logps/rejected": -145.10539754231772, "loss": 0.3817, "rewards/chosen": -0.6058456897735596, "rewards/margins": 0.418155590693156, "rewards/rejected": -1.0240012804667156, "step": 1626 }, { "epoch": 0.08623751093207538, "grad_norm": 65.0, "kl": 0.6515312194824219, "learning_rate": 5e-07, "logits/chosen": -26126853.333333332, "logits/rejected": -27329321.6, "logps/chosen": -716.9190266927084, "logps/rejected": -311.537353515625, "loss": 0.3397, "rewards/chosen": 0.1963273286819458, "rewards/margins": 1.4326130151748657, "rewards/rejected": -1.23628568649292, "step": 1627 }, { "epoch": 0.0862905149338775, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46220576.0, "logits/rejected": -22721588.0, "logps/chosen": -385.4778137207031, "logps/rejected": -133.36151123046875, "loss": 0.388, "rewards/chosen": -0.11005458235740662, "rewards/margins": 1.0370269119739532, "rewards/rejected": -1.1470814943313599, "step": 1628 }, { "epoch": 0.08634351893567964, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -317365.875, "logits/rejected": -15552670.0, "logps/chosen": -1208.569580078125, "logps/rejected": -179.00830078125, "loss": 0.3979, "rewards/chosen": -0.37271732091903687, "rewards/margins": 0.9325096011161804, "rewards/rejected": -1.3052269220352173, "step": 1629 }, { "epoch": 0.08639652293748178, "grad_norm": 64.5, "kl": 0.008363723754882812, "learning_rate": 5e-07, "logits/chosen": -6169664.666666667, "logits/rejected": -28825084.0, "logps/chosen": -289.4004720052083, "logps/rejected": -396.66082763671875, "loss": 0.3484, "rewards/chosen": 0.3803393840789795, "rewards/margins": 2.2904014587402344, "rewards/rejected": -1.9100620746612549, "step": 1630 }, { "epoch": 0.08644952693928391, "grad_norm": 70.5, "kl": 0.5367622375488281, "learning_rate": 5e-07, "logits/chosen": -69046544.0, "logits/rejected": -70836256.0, "logps/chosen": -479.8860677083333, "logps/rejected": -254.0372314453125, "loss": 0.3912, "rewards/chosen": 0.17799301942189535, "rewards/margins": 2.0633380015691123, "rewards/rejected": -1.8853449821472168, "step": 1631 }, { "epoch": 0.08650253094108605, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22430712.0, "logits/rejected": -12084977.0, "logps/chosen": -199.9685262044271, "logps/rejected": -201.0689697265625, "loss": 0.4305, "rewards/chosen": 0.059125691652297974, "rewards/margins": 1.0592203438282013, "rewards/rejected": -1.0000946521759033, "step": 1632 }, { "epoch": 0.08655553494288819, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19831760.0, "logits/rejected": -16016468.8, "logps/chosen": -410.1295166015625, "logps/rejected": -320.603759765625, "loss": 0.3084, "rewards/chosen": -0.17970174551010132, "rewards/margins": 1.6225943207740783, "rewards/rejected": -1.8022960662841796, "step": 1633 }, { "epoch": 0.08660853894469032, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34506277.333333336, "logits/rejected": 19888324.8, "logps/chosen": -413.1578369140625, "logps/rejected": -581.50556640625, "loss": 0.301, "rewards/chosen": -0.032869468132654824, "rewards/margins": 2.4320782323678336, "rewards/rejected": -2.4649477005004883, "step": 1634 }, { "epoch": 0.08666154294649246, "grad_norm": 52.0, "kl": 0.15178298950195312, "learning_rate": 5e-07, "logits/chosen": -21173750.0, "logits/rejected": -36715752.0, "logps/chosen": -323.95281982421875, "logps/rejected": -332.53985595703125, "loss": 0.296, "rewards/chosen": 0.26149874925613403, "rewards/margins": 2.108795702457428, "rewards/rejected": -1.847296953201294, "step": 1635 }, { "epoch": 0.0867145469482946, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41553530.666666664, "logits/rejected": -52284339.2, "logps/chosen": -338.2940266927083, "logps/rejected": -435.9794921875, "loss": 0.2592, "rewards/chosen": -0.16097610195477804, "rewards/margins": 2.6385392407576242, "rewards/rejected": -2.7995153427124024, "step": 1636 }, { "epoch": 0.08676755095009674, "grad_norm": 60.25, "kl": 0.59600830078125, "learning_rate": 5e-07, "logits/chosen": -43356485.333333336, "logits/rejected": -17096019.2, "logps/chosen": -505.9624837239583, "logps/rejected": -351.807568359375, "loss": 0.3502, "rewards/chosen": 0.09119568268458049, "rewards/margins": 1.2447919885317484, "rewards/rejected": -1.1535963058471679, "step": 1637 }, { "epoch": 0.08682055495189887, "grad_norm": 61.75, "kl": 0.14076995849609375, "learning_rate": 5e-07, "logits/chosen": 38715270.4, "logits/rejected": -68576218.66666667, "logps/chosen": -414.229296875, "logps/rejected": -413.079345703125, "loss": 0.3544, "rewards/chosen": 0.08052652478218078, "rewards/margins": 2.056726310650508, "rewards/rejected": -1.976199785868327, "step": 1638 }, { "epoch": 0.08687355895370101, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29187506.0, "logits/rejected": 525288.25, "logps/chosen": -488.8035888671875, "logps/rejected": -320.50335693359375, "loss": 0.3204, "rewards/chosen": 0.3962387144565582, "rewards/margins": 1.6734580099582672, "rewards/rejected": -1.277219295501709, "step": 1639 }, { "epoch": 0.08692656295550313, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27286016.0, "logits/rejected": -20444125.333333332, "logps/chosen": -317.973095703125, "logps/rejected": -242.98421223958334, "loss": 0.347, "rewards/chosen": 0.40834474563598633, "rewards/margins": 1.5550274848937988, "rewards/rejected": -1.1466827392578125, "step": 1640 }, { "epoch": 0.08697956695730527, "grad_norm": 74.5, "kl": 1.1143035888671875, "learning_rate": 5e-07, "logits/chosen": -38849625.6, "logits/rejected": -21499928.0, "logps/chosen": -531.9845703125, "logps/rejected": -249.1875203450521, "loss": 0.4125, "rewards/chosen": 0.24796814918518068, "rewards/margins": 1.144753646850586, "rewards/rejected": -0.8967854976654053, "step": 1641 }, { "epoch": 0.08703257095910741, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32018105.6, "logits/rejected": -18729168.0, "logps/chosen": -617.93212890625, "logps/rejected": -322.24025472005206, "loss": 0.3395, "rewards/chosen": 0.2399169921875, "rewards/margins": 1.976160717010498, "rewards/rejected": -1.736243724822998, "step": 1642 }, { "epoch": 0.08708557496090955, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25994488.0, "logits/rejected": -28310576.0, "logps/chosen": -428.1328938802083, "logps/rejected": -152.150634765625, "loss": 0.3911, "rewards/chosen": 0.3162689407666524, "rewards/margins": 1.2200435598691304, "rewards/rejected": -0.903774619102478, "step": 1643 }, { "epoch": 0.08713857896271168, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4392899.0, "logits/rejected": -9947688.8, "logps/chosen": -130.30699666341147, "logps/rejected": -387.898876953125, "loss": 0.3415, "rewards/chosen": -0.0975293517112732, "rewards/margins": 1.2142869591712953, "rewards/rejected": -1.3118163108825684, "step": 1644 }, { "epoch": 0.08719158296451382, "grad_norm": 60.0, "kl": 0.6952438354492188, "learning_rate": 5e-07, "logits/chosen": -25765496.0, "logits/rejected": -20633920.0, "logps/chosen": -623.019775390625, "logps/rejected": -240.83407592773438, "loss": 0.2808, "rewards/chosen": 0.5140869617462158, "rewards/margins": 2.250002145767212, "rewards/rejected": -1.735915184020996, "step": 1645 }, { "epoch": 0.08724458696631596, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20983328.0, "logits/rejected": -18086657.333333332, "logps/chosen": -285.97454833984375, "logps/rejected": -404.5594889322917, "loss": 0.2471, "rewards/chosen": 0.4925277829170227, "rewards/margins": 1.9460825721422832, "rewards/rejected": -1.4535547892252605, "step": 1646 }, { "epoch": 0.0872975909681181, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19159018.666666668, "logits/rejected": -2463192.0, "logps/chosen": -656.0649820963541, "logps/rejected": -246.6472412109375, "loss": 0.3095, "rewards/chosen": 0.2990560134251912, "rewards/margins": 1.5035147269566853, "rewards/rejected": -1.2044587135314941, "step": 1647 }, { "epoch": 0.08735059496992023, "grad_norm": 68.5, "kl": 0.163238525390625, "learning_rate": 5e-07, "logits/chosen": 73122316.8, "logits/rejected": -15014036.0, "logps/chosen": -339.59052734375, "logps/rejected": -125.76267496744792, "loss": 0.4116, "rewards/chosen": 0.22554855346679686, "rewards/margins": 0.9332128842671712, "rewards/rejected": -0.7076643308003744, "step": 1648 }, { "epoch": 0.08740359897172237, "grad_norm": 42.25, "kl": 0.11883544921875, "learning_rate": 5e-07, "logits/chosen": -46163948.0, "logits/rejected": -54598864.0, "logps/chosen": -329.290771484375, "logps/rejected": -568.97119140625, "loss": 0.322, "rewards/chosen": 0.22577553987503052, "rewards/margins": 2.4985631108283997, "rewards/rejected": -2.272787570953369, "step": 1649 }, { "epoch": 0.0874566029735245, "grad_norm": 73.0, "kl": 0.6596460342407227, "learning_rate": 5e-07, "logits/chosen": -13059705.333333334, "logits/rejected": -19595330.0, "logps/chosen": -364.040771484375, "logps/rejected": -280.50115966796875, "loss": 0.4238, "rewards/chosen": 0.4001307487487793, "rewards/margins": 0.7253676354885101, "rewards/rejected": -0.32523688673973083, "step": 1650 }, { "epoch": 0.08750960697532664, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 780946.0, "logits/rejected": -17848704.0, "logps/chosen": -281.9460856119792, "logps/rejected": -435.000244140625, "loss": 0.4417, "rewards/chosen": -0.005953567723433177, "rewards/margins": 1.027342659731706, "rewards/rejected": -1.0332962274551392, "step": 1651 }, { "epoch": 0.08756261097712878, "grad_norm": 64.5, "kl": 0.1331806182861328, "learning_rate": 5e-07, "logits/chosen": -50475776.0, "logits/rejected": -31653364.0, "logps/chosen": -393.83868408203125, "logps/rejected": -318.8677673339844, "loss": 0.3069, "rewards/chosen": 0.37683621048927307, "rewards/margins": 2.1134590208530426, "rewards/rejected": -1.7366228103637695, "step": 1652 }, { "epoch": 0.0876156149789309, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45375984.0, "logits/rejected": -24422534.85714286, "logps/chosen": -332.5481262207031, "logps/rejected": -293.14913504464283, "loss": 0.2025, "rewards/chosen": 0.24402160942554474, "rewards/margins": 2.0720714053937366, "rewards/rejected": -1.828049795968192, "step": 1653 }, { "epoch": 0.08766861898073304, "grad_norm": 89.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53832677.333333336, "logits/rejected": 36503120.0, "logps/chosen": -236.70723470052084, "logps/rejected": -189.2877197265625, "loss": 0.375, "rewards/chosen": 0.28063758214314777, "rewards/margins": 1.00328262646993, "rewards/rejected": -0.7226450443267822, "step": 1654 }, { "epoch": 0.08772162298253518, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52435120.0, "logits/rejected": 2453404.8333333335, "logps/chosen": -477.810546875, "logps/rejected": -116.30747477213542, "loss": 0.4275, "rewards/chosen": 0.08816269636154175, "rewards/margins": 0.7401185393333435, "rewards/rejected": -0.6519558429718018, "step": 1655 }, { "epoch": 0.08777462698433731, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15744452.0, "logits/rejected": -25709276.0, "logps/chosen": -160.11361694335938, "logps/rejected": -220.69989013671875, "loss": 0.3299, "rewards/chosen": 0.09459391236305237, "rewards/margins": 1.6449619829654694, "rewards/rejected": -1.550368070602417, "step": 1656 }, { "epoch": 0.08782763098613945, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15435849.0, "logits/rejected": -9790658.0, "logps/chosen": -395.10015869140625, "logps/rejected": -219.52018229166666, "loss": 0.3157, "rewards/chosen": 0.382559210062027, "rewards/margins": 1.3225034773349762, "rewards/rejected": -0.9399442672729492, "step": 1657 }, { "epoch": 0.08788063498794159, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30902492.0, "logits/rejected": -37153552.0, "logps/chosen": -388.0698547363281, "logps/rejected": -543.9616088867188, "loss": 0.2616, "rewards/chosen": 0.16309280693531036, "rewards/margins": 3.065410330891609, "rewards/rejected": -2.902317523956299, "step": 1658 }, { "epoch": 0.08793363898974373, "grad_norm": 50.25, "kl": 0.3159828186035156, "learning_rate": 5e-07, "logits/chosen": -15571894.4, "logits/rejected": -3538522.6666666665, "logps/chosen": -287.4860595703125, "logps/rejected": -369.8428955078125, "loss": 0.4091, "rewards/chosen": -0.07217719554901122, "rewards/margins": 1.4744676510492962, "rewards/rejected": -1.5466448465983074, "step": 1659 }, { "epoch": 0.08798664299154586, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28280520.0, "logits/rejected": -26358365.333333332, "logps/chosen": -101.91341400146484, "logps/rejected": -422.7779947916667, "loss": 0.2531, "rewards/chosen": 0.04440803453326225, "rewards/margins": 1.7735400510330994, "rewards/rejected": -1.7291320164998372, "step": 1660 }, { "epoch": 0.088039646993348, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64623210.666666664, "logits/rejected": -13419962.4, "logps/chosen": -236.23701985677084, "logps/rejected": -277.684033203125, "loss": 0.3425, "rewards/chosen": -0.2488581339518229, "rewards/margins": 1.2637242635091146, "rewards/rejected": -1.5125823974609376, "step": 1661 }, { "epoch": 0.08809265099515014, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20513502.666666668, "logits/rejected": -9660041.6, "logps/chosen": -289.9200439453125, "logps/rejected": -138.0837890625, "loss": 0.3304, "rewards/chosen": -0.07940966387589772, "rewards/margins": 1.2369904826084774, "rewards/rejected": -1.316400146484375, "step": 1662 }, { "epoch": 0.08814565499695227, "grad_norm": 65.5, "kl": 0.27994537353515625, "learning_rate": 5e-07, "logits/chosen": 1145923.2, "logits/rejected": -30050018.666666668, "logps/chosen": -375.874267578125, "logps/rejected": -254.5341796875, "loss": 0.3885, "rewards/chosen": 0.04358566403388977, "rewards/margins": 1.4785433272520703, "rewards/rejected": -1.4349576632181804, "step": 1663 }, { "epoch": 0.08819865899875441, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 46653024.0, "logits/rejected": -28944404.0, "logps/chosen": -269.952392578125, "logps/rejected": -178.45449829101562, "loss": 0.5146, "rewards/chosen": -0.37001657485961914, "rewards/margins": 0.561142086982727, "rewards/rejected": -0.9311586618423462, "step": 1664 }, { "epoch": 0.08825166300055655, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14010169.333333334, "logits/rejected": -16371954.0, "logps/chosen": -343.91943359375, "logps/rejected": -183.63003540039062, "loss": 0.3549, "rewards/chosen": 0.3337666988372803, "rewards/margins": 2.0878255367279053, "rewards/rejected": -1.754058837890625, "step": 1665 }, { "epoch": 0.08830466700235867, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28679112.0, "logits/rejected": -68070448.0, "logps/chosen": -505.6568603515625, "logps/rejected": -225.52286783854166, "loss": 0.4004, "rewards/chosen": -0.19753265380859375, "rewards/margins": 0.4244225025177002, "rewards/rejected": -0.621955156326294, "step": 1666 }, { "epoch": 0.08835767100416081, "grad_norm": 54.0, "kl": 0.13252639770507812, "learning_rate": 5e-07, "logits/chosen": -27999334.4, "logits/rejected": -27015114.666666668, "logps/chosen": -758.331005859375, "logps/rejected": -407.7259114583333, "loss": 0.3301, "rewards/chosen": 0.5983915328979492, "rewards/margins": 2.4806067148844404, "rewards/rejected": -1.882215181986491, "step": 1667 }, { "epoch": 0.08841067500596295, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87805893.33333333, "logits/rejected": -27810470.4, "logps/chosen": -300.0775146484375, "logps/rejected": -340.509423828125, "loss": 0.2974, "rewards/chosen": 0.14863687753677368, "rewards/margins": 1.821043646335602, "rewards/rejected": -1.6724067687988282, "step": 1668 }, { "epoch": 0.08846367900776508, "grad_norm": 59.0, "kl": 0.05139923095703125, "learning_rate": 5e-07, "logits/chosen": -13901665.0, "logits/rejected": 19312648.0, "logps/chosen": -431.638916015625, "logps/rejected": -255.45916748046875, "loss": 0.3372, "rewards/chosen": 0.2276287078857422, "rewards/margins": 1.5409196615219116, "rewards/rejected": -1.3132909536361694, "step": 1669 }, { "epoch": 0.08851668300956722, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46632508.8, "logits/rejected": -30264090.666666668, "logps/chosen": -402.440185546875, "logps/rejected": -346.3909505208333, "loss": 0.3709, "rewards/chosen": -0.16348892450332642, "rewards/margins": 2.4034749070803323, "rewards/rejected": -2.5669638315836587, "step": 1670 }, { "epoch": 0.08856968701136936, "grad_norm": 86.0, "kl": 0.038249969482421875, "learning_rate": 5e-07, "logits/chosen": -23127426.666666668, "logits/rejected": 67430048.0, "logps/chosen": -745.800048828125, "logps/rejected": -243.30128479003906, "loss": 0.4128, "rewards/chosen": 0.3457612593968709, "rewards/margins": 0.8047522207101185, "rewards/rejected": -0.4589909613132477, "step": 1671 }, { "epoch": 0.0886226910131715, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18926360.0, "logits/rejected": -44556080.0, "logps/chosen": -131.38402303059897, "logps/rejected": -517.933056640625, "loss": 0.2799, "rewards/chosen": -0.022313882907231648, "rewards/margins": 2.074216268459956, "rewards/rejected": -2.0965301513671877, "step": 1672 }, { "epoch": 0.08867569501497363, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14982237.333333334, "logits/rejected": -486762.0, "logps/chosen": -262.28765869140625, "logps/rejected": -278.6033203125, "loss": 0.366, "rewards/chosen": -0.1423090696334839, "rewards/margins": 1.0365888833999635, "rewards/rejected": -1.1788979530334474, "step": 1673 }, { "epoch": 0.08872869901677577, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29117517.333333332, "logits/rejected": -44683878.4, "logps/chosen": -302.04709879557294, "logps/rejected": -462.130908203125, "loss": 0.2609, "rewards/chosen": 0.1773752768834432, "rewards/margins": 2.2842691977818808, "rewards/rejected": -2.1068939208984374, "step": 1674 }, { "epoch": 0.0887817030185779, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50726448.0, "logits/rejected": -60757440.0, "logps/chosen": -239.4547119140625, "logps/rejected": -359.9642578125, "loss": 0.3431, "rewards/chosen": -0.0005884766578674316, "rewards/margins": 1.3218470931053161, "rewards/rejected": -1.3224355697631835, "step": 1675 }, { "epoch": 0.08883470702038004, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42199760.0, "logits/rejected": -40125990.4, "logps/chosen": -365.802001953125, "logps/rejected": -301.351220703125, "loss": 0.3212, "rewards/chosen": 0.3828663428624471, "rewards/margins": 1.500369127591451, "rewards/rejected": -1.117502784729004, "step": 1676 }, { "epoch": 0.08888771102218218, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54380800.0, "logits/rejected": -40981068.0, "logps/chosen": -402.234375, "logps/rejected": -266.9542236328125, "loss": 0.362, "rewards/chosen": 0.24293097853660583, "rewards/margins": 1.2442816197872162, "rewards/rejected": -1.0013506412506104, "step": 1677 }, { "epoch": 0.0889407150239843, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12846125.333333334, "logits/rejected": -21894870.0, "logps/chosen": -269.7917073567708, "logps/rejected": -134.6622314453125, "loss": 0.4547, "rewards/chosen": -0.06313718358675639, "rewards/margins": 0.9542021652062734, "rewards/rejected": -1.0173393487930298, "step": 1678 }, { "epoch": 0.08899371902578644, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46743161.6, "logits/rejected": -83419653.33333333, "logps/chosen": -365.95322265625, "logps/rejected": -258.9203287760417, "loss": 0.3545, "rewards/chosen": 0.10596771240234375, "rewards/margins": 2.016858927408854, "rewards/rejected": -1.9108912150065105, "step": 1679 }, { "epoch": 0.08904672302758858, "grad_norm": 74.0, "kl": 0.01438140869140625, "learning_rate": 5e-07, "logits/chosen": -21933595.42857143, "logits/rejected": 800672.8125, "logps/chosen": -253.50324358258928, "logps/rejected": -140.95840454101562, "loss": 0.4774, "rewards/chosen": -0.06735949431146894, "rewards/margins": 1.3193310030869074, "rewards/rejected": -1.3866904973983765, "step": 1680 }, { "epoch": 0.08909972702939072, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22494250.666666668, "logits/rejected": -13829718.4, "logps/chosen": -268.38319905598956, "logps/rejected": -297.304150390625, "loss": 0.3354, "rewards/chosen": 0.11743622024854024, "rewards/margins": 1.7552209873994191, "rewards/rejected": -1.637784767150879, "step": 1681 }, { "epoch": 0.08915273103119285, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22104908.8, "logits/rejected": -26960154.666666668, "logps/chosen": -138.54788818359376, "logps/rejected": -222.98299153645834, "loss": 0.4381, "rewards/chosen": -0.2386491298675537, "rewards/margins": 0.8791563510894775, "rewards/rejected": -1.1178054809570312, "step": 1682 }, { "epoch": 0.08920573503299499, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24760652.8, "logits/rejected": -54523136.0, "logps/chosen": -222.4774658203125, "logps/rejected": -172.93343098958334, "loss": 0.5034, "rewards/chosen": -0.3238878011703491, "rewards/margins": 0.21206063429514566, "rewards/rejected": -0.5359484354654948, "step": 1683 }, { "epoch": 0.08925873903479713, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32200680.0, "logits/rejected": -37143144.0, "logps/chosen": -205.95565795898438, "logps/rejected": -195.88107299804688, "loss": 0.3937, "rewards/chosen": 0.19924011826515198, "rewards/margins": 0.9092248976230621, "rewards/rejected": -0.7099847793579102, "step": 1684 }, { "epoch": 0.08931174303659926, "grad_norm": 55.5, "kl": 0.16553115844726562, "learning_rate": 5e-07, "logits/chosen": -19944188.8, "logits/rejected": -14723674.666666666, "logps/chosen": -305.238623046875, "logps/rejected": -230.19108072916666, "loss": 0.3543, "rewards/chosen": 0.4641775131225586, "rewards/margins": 1.439661725362142, "rewards/rejected": -0.9754842122395834, "step": 1685 }, { "epoch": 0.0893647470384014, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1851151.375, "logits/rejected": -44293536.0, "logps/chosen": -338.4107666015625, "logps/rejected": -268.265625, "loss": 0.2738, "rewards/chosen": -0.06913328170776367, "rewards/margins": 1.444229284922282, "rewards/rejected": -1.5133625666300456, "step": 1686 }, { "epoch": 0.08941775104020354, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20833568.0, "logits/rejected": -56689290.666666664, "logps/chosen": -264.007275390625, "logps/rejected": -349.1203206380208, "loss": 0.4144, "rewards/chosen": -0.15619728565216065, "rewards/margins": 1.2803515831629435, "rewards/rejected": -1.4365488688151042, "step": 1687 }, { "epoch": 0.08947075504200568, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7846810.0, "logits/rejected": -4704694.0, "logps/chosen": -294.1322326660156, "logps/rejected": -138.91305541992188, "loss": 0.3806, "rewards/chosen": -0.10139942169189453, "rewards/margins": 1.1045061349868774, "rewards/rejected": -1.205905556678772, "step": 1688 }, { "epoch": 0.08952375904380781, "grad_norm": 71.0, "kl": 0.2504615783691406, "learning_rate": 5e-07, "logits/chosen": -47656022.4, "logits/rejected": 3135192.0, "logps/chosen": -488.137158203125, "logps/rejected": -269.975341796875, "loss": 0.2871, "rewards/chosen": 0.5756102085113526, "rewards/margins": 2.6627785841623943, "rewards/rejected": -2.0871683756510415, "step": 1689 }, { "epoch": 0.08957676304560995, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38228842.666666664, "logits/rejected": -23809145.6, "logps/chosen": -311.2182210286458, "logps/rejected": -256.3761474609375, "loss": 0.3248, "rewards/chosen": -0.08519300818443298, "rewards/margins": 1.399879413843155, "rewards/rejected": -1.485072422027588, "step": 1690 }, { "epoch": 0.08962976704741207, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25501540.0, "logits/rejected": -11792428.0, "logps/chosen": -149.30511474609375, "logps/rejected": -163.32772827148438, "loss": 0.409, "rewards/chosen": 0.019484519958496094, "rewards/margins": 0.7937345504760742, "rewards/rejected": -0.7742500305175781, "step": 1691 }, { "epoch": 0.08968277104921421, "grad_norm": 67.0, "kl": 0.43294525146484375, "learning_rate": 5e-07, "logits/chosen": -18464137.6, "logits/rejected": -30767405.333333332, "logps/chosen": -535.4275390625, "logps/rejected": -357.0411783854167, "loss": 0.2975, "rewards/chosen": 0.7158166885375976, "rewards/margins": 2.233320109049479, "rewards/rejected": -1.5175034205118816, "step": 1692 }, { "epoch": 0.08973577505101635, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29791072.0, "logits/rejected": 12962819.0, "logps/chosen": -391.3080240885417, "logps/rejected": -214.86724853515625, "loss": 0.4178, "rewards/chosen": 0.1255466341972351, "rewards/margins": 1.1667587161064148, "rewards/rejected": -1.0412120819091797, "step": 1693 }, { "epoch": 0.08978877905281848, "grad_norm": 155.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58910490.666666664, "logits/rejected": -53660985.6, "logps/chosen": -424.3516438802083, "logps/rejected": -189.36624755859376, "loss": 0.2854, "rewards/chosen": 0.2643292744954427, "rewards/margins": 1.9825158437093098, "rewards/rejected": -1.7181865692138671, "step": 1694 }, { "epoch": 0.08984178305462062, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2418378.8333333335, "logits/rejected": -39937081.6, "logps/chosen": -133.2166748046875, "logps/rejected": -373.71220703125, "loss": 0.3406, "rewards/chosen": 0.24324798583984375, "rewards/margins": 1.5778679847717285, "rewards/rejected": -1.3346199989318848, "step": 1695 }, { "epoch": 0.08989478705642276, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63570060.0, "logits/rejected": 6898780.666666667, "logps/chosen": -206.85609436035156, "logps/rejected": -304.07834879557294, "loss": 0.2902, "rewards/chosen": -0.30948030948638916, "rewards/margins": 1.1798240741093953, "rewards/rejected": -1.4893043835957844, "step": 1696 }, { "epoch": 0.0899477910582249, "grad_norm": 55.25, "kl": 0.24433135986328125, "learning_rate": 5e-07, "logits/chosen": 8766236.0, "logits/rejected": -22988336.0, "logps/chosen": -323.49359130859375, "logps/rejected": -357.134765625, "loss": 0.3422, "rewards/chosen": 0.17755575478076935, "rewards/margins": 1.6818834096193314, "rewards/rejected": -1.504327654838562, "step": 1697 }, { "epoch": 0.09000079506002703, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38537458.666666664, "logits/rejected": -49376048.0, "logps/chosen": -366.76806640625, "logps/rejected": -313.69708251953125, "loss": 0.3927, "rewards/chosen": 0.21220576763153076, "rewards/margins": 1.5527087450027466, "rewards/rejected": -1.3405029773712158, "step": 1698 }, { "epoch": 0.09005379906182917, "grad_norm": 95.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32892780.8, "logits/rejected": -4633780.666666667, "logps/chosen": -661.0962890625, "logps/rejected": -90.36789957682292, "loss": 0.4118, "rewards/chosen": -0.0495880126953125, "rewards/margins": 1.2952550570170085, "rewards/rejected": -1.344843069712321, "step": 1699 }, { "epoch": 0.09010680306363131, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29788364.8, "logits/rejected": 15938618.666666666, "logps/chosen": -241.1885986328125, "logps/rejected": -276.48537190755206, "loss": 0.3939, "rewards/chosen": -0.057762032747268675, "rewards/margins": 1.4060026347637176, "rewards/rejected": -1.4637646675109863, "step": 1700 }, { "epoch": 0.09015980706543344, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1051960.375, "logits/rejected": -22402176.0, "logps/chosen": -31.54668426513672, "logps/rejected": -409.208251953125, "loss": 0.2268, "rewards/chosen": 0.9236981868743896, "rewards/margins": 2.4826784928639727, "rewards/rejected": -1.5589803059895833, "step": 1701 }, { "epoch": 0.09021281106723558, "grad_norm": 69.5, "kl": 0.5488128662109375, "learning_rate": 5e-07, "logits/chosen": -23036817.6, "logits/rejected": -13700136.0, "logps/chosen": -487.2556640625, "logps/rejected": -203.00201416015625, "loss": 0.3521, "rewards/chosen": 0.2671952724456787, "rewards/margins": 1.9579943497975667, "rewards/rejected": -1.690799077351888, "step": 1702 }, { "epoch": 0.09026581506903772, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16301434.666666666, "logits/rejected": -4833944.0, "logps/chosen": -345.279541015625, "logps/rejected": -138.41729736328125, "loss": 0.4084, "rewards/chosen": 0.16704084475835165, "rewards/margins": 1.2443548639615376, "rewards/rejected": -1.077314019203186, "step": 1703 }, { "epoch": 0.09031881907083984, "grad_norm": 62.5, "kl": 0.0120697021484375, "learning_rate": 5e-07, "logits/chosen": -50914261.333333336, "logits/rejected": 4004130.25, "logps/chosen": -474.9666341145833, "logps/rejected": -305.4700622558594, "loss": 0.4739, "rewards/chosen": -0.3777014414469401, "rewards/margins": 1.4336576064427693, "rewards/rejected": -1.8113590478897095, "step": 1704 }, { "epoch": 0.09037182307264198, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48948016.0, "logits/rejected": -49832396.0, "logps/chosen": -175.92520141601562, "logps/rejected": -384.7658996582031, "loss": 0.3848, "rewards/chosen": -0.147308349609375, "rewards/margins": 1.5956027507781982, "rewards/rejected": -1.7429111003875732, "step": 1705 }, { "epoch": 0.09042482707444412, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57359640.0, "logits/rejected": -13237764.0, "logps/chosen": -862.1851806640625, "logps/rejected": -416.5729166666667, "loss": 0.2829, "rewards/chosen": -0.09988250583410263, "rewards/margins": 1.6535275156299274, "rewards/rejected": -1.75341002146403, "step": 1706 }, { "epoch": 0.09047783107624625, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7709552.0, "logits/rejected": -68570280.0, "logps/chosen": -679.0202026367188, "logps/rejected": -278.0691223144531, "loss": 0.262, "rewards/chosen": 0.4836570918560028, "rewards/margins": 2.5159649550914764, "rewards/rejected": -2.0323078632354736, "step": 1707 }, { "epoch": 0.09053083507804839, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30792560.0, "logits/rejected": -53424704.0, "logps/chosen": -372.80206298828125, "logps/rejected": -449.1983119419643, "loss": 0.1597, "rewards/chosen": -0.1655120849609375, "rewards/margins": 2.0623039518083846, "rewards/rejected": -2.227816036769322, "step": 1708 }, { "epoch": 0.09058383907985053, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67454730.66666667, "logits/rejected": 6377221.6, "logps/chosen": -375.5600992838542, "logps/rejected": -258.8365234375, "loss": 0.397, "rewards/chosen": -0.33455808957417804, "rewards/margins": 0.6453411261240642, "rewards/rejected": -0.9798992156982422, "step": 1709 }, { "epoch": 0.09063684308165267, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -97360480.0, "logits/rejected": -20806342.85714286, "logps/chosen": -489.7130126953125, "logps/rejected": -197.38190569196428, "loss": 0.2454, "rewards/chosen": 0.13063354790210724, "rewards/margins": 1.4906902887991496, "rewards/rejected": -1.3600567408970423, "step": 1710 }, { "epoch": 0.0906898470834548, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54696426.666666664, "logits/rejected": 13447952.0, "logps/chosen": -323.39208984375, "logps/rejected": -411.48740234375, "loss": 0.3142, "rewards/chosen": -0.05814005434513092, "rewards/margins": 1.6219100326299667, "rewards/rejected": -1.6800500869750976, "step": 1711 }, { "epoch": 0.09074285108525694, "grad_norm": 52.0, "kl": 0.6485652923583984, "learning_rate": 5e-07, "logits/chosen": -17868521.333333332, "logits/rejected": -13549204.0, "logps/chosen": -219.84505208333334, "logps/rejected": -267.724365234375, "loss": 0.4419, "rewards/chosen": 0.061546514431635536, "rewards/margins": 1.1628031233946483, "rewards/rejected": -1.1012566089630127, "step": 1712 }, { "epoch": 0.09079585508705908, "grad_norm": 48.25, "kl": 0.20481586456298828, "learning_rate": 5e-07, "logits/chosen": -16534202.0, "logits/rejected": -621492.125, "logps/chosen": -311.842041015625, "logps/rejected": -290.8031921386719, "loss": 0.3692, "rewards/chosen": 0.09728802740573883, "rewards/margins": 1.2740634828805923, "rewards/rejected": -1.1767754554748535, "step": 1713 }, { "epoch": 0.09084885908886121, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3659817.0, "logits/rejected": -34046392.0, "logps/chosen": -42.75046920776367, "logps/rejected": -294.50689697265625, "loss": 0.269, "rewards/chosen": 0.26350706815719604, "rewards/margins": 1.8390870292981465, "rewards/rejected": -1.5755799611409504, "step": 1714 }, { "epoch": 0.09090186309066335, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17912478.0, "logits/rejected": -11477386.0, "logps/chosen": -305.5788269042969, "logps/rejected": -183.1956787109375, "loss": 0.3599, "rewards/chosen": 0.29851168394088745, "rewards/margins": 1.2239976525306702, "rewards/rejected": -0.9254859685897827, "step": 1715 }, { "epoch": 0.09095486709246547, "grad_norm": 57.5, "kl": 0.2292327880859375, "learning_rate": 5e-07, "logits/chosen": -22851654.4, "logits/rejected": -14431352.0, "logps/chosen": -356.6897216796875, "logps/rejected": -266.69106038411456, "loss": 0.2814, "rewards/chosen": 0.48432273864746095, "rewards/margins": 2.730442555745443, "rewards/rejected": -2.246119817097982, "step": 1716 }, { "epoch": 0.09100787109426761, "grad_norm": 51.25, "kl": 0.1774454116821289, "learning_rate": 5e-07, "logits/chosen": -22866930.0, "logits/rejected": -5962704.0, "logps/chosen": -346.99078369140625, "logps/rejected": -336.8804931640625, "loss": 0.3716, "rewards/chosen": 0.16529475152492523, "rewards/margins": 1.4527724832296371, "rewards/rejected": -1.287477731704712, "step": 1717 }, { "epoch": 0.09106087509606975, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16375177.333333334, "logits/rejected": 7508243.5, "logps/chosen": -107.57666015625, "logps/rejected": -454.2059020996094, "loss": 0.3734, "rewards/chosen": 0.3484923839569092, "rewards/margins": 1.4909499883651733, "rewards/rejected": -1.1424576044082642, "step": 1718 }, { "epoch": 0.09111387909787189, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28130600.0, "logits/rejected": -4824840.666666667, "logps/chosen": -292.0907897949219, "logps/rejected": -129.5875447591146, "loss": 0.3975, "rewards/chosen": -0.009728813543915749, "rewards/margins": 0.6128280318031708, "rewards/rejected": -0.6225568453470866, "step": 1719 }, { "epoch": 0.09116688309967402, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27146754.0, "logits/rejected": -27785725.333333332, "logps/chosen": -346.1495666503906, "logps/rejected": -438.5891927083333, "loss": 0.2399, "rewards/chosen": -0.11206245422363281, "rewards/margins": 1.8577707608540852, "rewards/rejected": -1.969833215077718, "step": 1720 }, { "epoch": 0.09121988710147616, "grad_norm": 54.0, "kl": 0.5280036926269531, "learning_rate": 5e-07, "logits/chosen": -9076267.2, "logits/rejected": -11942840.0, "logps/chosen": -320.7325927734375, "logps/rejected": -85.46622721354167, "loss": 0.3936, "rewards/chosen": 0.16065155267715453, "rewards/margins": 1.4645466287930806, "rewards/rejected": -1.303895076115926, "step": 1721 }, { "epoch": 0.0912728911032783, "grad_norm": 58.75, "kl": 1.2179222106933594, "learning_rate": 5e-07, "logits/chosen": -40233096.0, "logits/rejected": -27073638.0, "logps/chosen": -490.40008544921875, "logps/rejected": -432.04241943359375, "loss": 0.2638, "rewards/chosen": 0.5669841766357422, "rewards/margins": 2.701807975769043, "rewards/rejected": -2.134823799133301, "step": 1722 }, { "epoch": 0.09132589510508043, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21621724.8, "logits/rejected": -22759880.0, "logps/chosen": -230.1266357421875, "logps/rejected": -353.1044108072917, "loss": 0.4129, "rewards/chosen": -0.159471595287323, "rewards/margins": 1.7094685991605123, "rewards/rejected": -1.8689401944478352, "step": 1723 }, { "epoch": 0.09137889910688257, "grad_norm": 46.25, "kl": 0.3949565887451172, "learning_rate": 5e-07, "logits/chosen": -2902089.0, "logits/rejected": -22510428.0, "logps/chosen": -215.78890991210938, "logps/rejected": -199.60403442382812, "loss": 0.3322, "rewards/chosen": 0.12268877774477005, "rewards/margins": 1.575181968510151, "rewards/rejected": -1.4524931907653809, "step": 1724 }, { "epoch": 0.09143190310868471, "grad_norm": 66.5, "kl": 1.3933029174804688, "learning_rate": 5e-07, "logits/chosen": -8736215.0, "logits/rejected": -26073912.0, "logps/chosen": -607.0252685546875, "logps/rejected": -268.7960205078125, "loss": 0.279, "rewards/chosen": 0.5190727114677429, "rewards/margins": 2.6353623270988464, "rewards/rejected": -2.1162896156311035, "step": 1725 }, { "epoch": 0.09148490711048685, "grad_norm": 76.0, "kl": 2.0760765075683594, "learning_rate": 5e-07, "logits/chosen": -89384888.0, "logits/rejected": -39152472.0, "logps/chosen": -707.2083740234375, "logps/rejected": -344.23419189453125, "loss": 0.3995, "rewards/chosen": 0.4453068971633911, "rewards/margins": 1.3482656478881836, "rewards/rejected": -0.9029587507247925, "step": 1726 }, { "epoch": 0.09153791111228898, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27767328.0, "logits/rejected": 9186654.4, "logps/chosen": -303.7428792317708, "logps/rejected": -343.46162109375, "loss": 0.3474, "rewards/chosen": 0.07284107307593028, "rewards/margins": 1.277609063188235, "rewards/rejected": -1.2047679901123047, "step": 1727 }, { "epoch": 0.09159091511409112, "grad_norm": 55.0, "kl": 0.031162261962890625, "learning_rate": 5e-07, "logits/chosen": -37510118.4, "logits/rejected": -68981536.0, "logps/chosen": -276.383837890625, "logps/rejected": -710.10986328125, "loss": 0.3029, "rewards/chosen": 0.35475113391876223, "rewards/margins": 2.9216254790623983, "rewards/rejected": -2.566874345143636, "step": 1728 }, { "epoch": 0.09164391911589324, "grad_norm": 57.5, "kl": 0.5767250061035156, "learning_rate": 5e-07, "logits/chosen": -32166280.0, "logits/rejected": -26734606.4, "logps/chosen": -526.0749918619791, "logps/rejected": -276.404541015625, "loss": 0.3049, "rewards/chosen": 0.28568994998931885, "rewards/margins": 1.790082287788391, "rewards/rejected": -1.5043923377990722, "step": 1729 }, { "epoch": 0.09169692311769538, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29663602.666666668, "logits/rejected": 7442728.5, "logps/chosen": -190.74296061197916, "logps/rejected": -269.4112854003906, "loss": 0.4168, "rewards/chosen": 0.02883955587943395, "rewards/margins": 1.494638887544473, "rewards/rejected": -1.465799331665039, "step": 1730 }, { "epoch": 0.09174992711949752, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59457888.0, "logits/rejected": -19067974.0, "logps/chosen": -315.22833251953125, "logps/rejected": -576.2647705078125, "loss": 0.3277, "rewards/chosen": -0.17156802117824554, "rewards/margins": 2.244662180542946, "rewards/rejected": -2.4162302017211914, "step": 1731 }, { "epoch": 0.09180293112129966, "grad_norm": 68.0, "kl": 0.4663543701171875, "learning_rate": 5e-07, "logits/chosen": -27697461.333333332, "logits/rejected": -1781380.5, "logps/chosen": -323.3605550130208, "logps/rejected": -258.9765930175781, "loss": 0.389, "rewards/chosen": 0.3110546072324117, "rewards/margins": 1.5097896059354146, "rewards/rejected": -1.198734998703003, "step": 1732 }, { "epoch": 0.09185593512310179, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20363827.2, "logits/rejected": -40230760.0, "logps/chosen": -252.280419921875, "logps/rejected": -148.84162394205728, "loss": 0.4627, "rewards/chosen": -0.254133939743042, "rewards/margins": 0.6307212511698405, "rewards/rejected": -0.8848551909128824, "step": 1733 }, { "epoch": 0.09190893912490393, "grad_norm": 49.25, "kl": 0.029575347900390625, "learning_rate": 5e-07, "logits/chosen": -34026276.0, "logits/rejected": -27437628.0, "logps/chosen": -266.3955078125, "logps/rejected": -240.04443359375, "loss": 0.3552, "rewards/chosen": 0.21761122345924377, "rewards/margins": 1.4186038076877594, "rewards/rejected": -1.2009925842285156, "step": 1734 }, { "epoch": 0.09196194312670607, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14149944.0, "logits/rejected": 9404671.0, "logps/chosen": -528.2386474609375, "logps/rejected": -212.34873962402344, "loss": 0.3841, "rewards/chosen": 0.13291092216968536, "rewards/margins": 1.108351781964302, "rewards/rejected": -0.9754408597946167, "step": 1735 }, { "epoch": 0.0920149471285082, "grad_norm": 60.25, "kl": 0.1365966796875, "learning_rate": 5e-07, "logits/chosen": -8733823.0, "logits/rejected": -81557008.0, "logps/chosen": -274.73968505859375, "logps/rejected": -352.59783935546875, "loss": 0.3364, "rewards/chosen": 0.2652125358581543, "rewards/margins": 1.7783153057098389, "rewards/rejected": -1.5131027698516846, "step": 1736 }, { "epoch": 0.09206795113031034, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30753961.14285714, "logits/rejected": 6061356.0, "logps/chosen": -228.00357491629464, "logps/rejected": -42.83647918701172, "loss": 0.4313, "rewards/chosen": 0.25955772399902344, "rewards/margins": 0.709746927022934, "rewards/rejected": -0.4501892030239105, "step": 1737 }, { "epoch": 0.09212095513211248, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2408460.25, "logits/rejected": -30419520.0, "logps/chosen": -140.8974151611328, "logps/rejected": -293.8636474609375, "loss": 0.3367, "rewards/chosen": -0.18142108619213104, "rewards/margins": 0.9155351668596268, "rewards/rejected": -1.0969562530517578, "step": 1738 }, { "epoch": 0.09217395913391462, "grad_norm": 46.75, "kl": 0.2825899124145508, "learning_rate": 5e-07, "logits/chosen": -10957174.666666666, "logits/rejected": -100002600.0, "logps/chosen": -194.6962890625, "logps/rejected": -237.87112426757812, "loss": 0.4607, "rewards/chosen": 0.1560471455256144, "rewards/margins": 0.506340096394221, "rewards/rejected": -0.35029295086860657, "step": 1739 }, { "epoch": 0.09222696313571675, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78196048.0, "logits/rejected": -35355013.333333336, "logps/chosen": -228.32675170898438, "logps/rejected": -440.3240559895833, "loss": 0.2893, "rewards/chosen": -0.18902817368507385, "rewards/margins": 1.428097019592921, "rewards/rejected": -1.6171251932779949, "step": 1740 }, { "epoch": 0.09227996713751889, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22302930.285714287, "logits/rejected": -35658272.0, "logps/chosen": -143.91162981305803, "logps/rejected": -799.6852416992188, "loss": 0.4672, "rewards/chosen": -0.13118457794189453, "rewards/margins": 4.493736267089844, "rewards/rejected": -4.624920845031738, "step": 1741 }, { "epoch": 0.09233297113932101, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36917482.666666664, "logits/rejected": -40281491.2, "logps/chosen": -328.01361083984375, "logps/rejected": -258.481787109375, "loss": 0.387, "rewards/chosen": -0.373492956161499, "rewards/margins": 0.7248252391815186, "rewards/rejected": -1.0983181953430177, "step": 1742 }, { "epoch": 0.09238597514112315, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55389960.0, "logits/rejected": -34180016.0, "logps/chosen": -237.81130981445312, "logps/rejected": -270.97442626953125, "loss": 0.3818, "rewards/chosen": -0.08099803328514099, "rewards/margins": 1.2288468182086945, "rewards/rejected": -1.3098448514938354, "step": 1743 }, { "epoch": 0.09243897914292529, "grad_norm": 59.25, "kl": 0.11463737487792969, "learning_rate": 5e-07, "logits/chosen": 21190185.6, "logits/rejected": -8596639.333333334, "logps/chosen": -286.27119140625, "logps/rejected": -128.91558837890625, "loss": 0.3374, "rewards/chosen": 0.3869919300079346, "rewards/margins": 1.772857141494751, "rewards/rejected": -1.3858652114868164, "step": 1744 }, { "epoch": 0.09249198314472742, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40517356.8, "logits/rejected": -37820978.666666664, "logps/chosen": -249.5654296875, "logps/rejected": -1052.09716796875, "loss": 0.3314, "rewards/chosen": 0.3274040699005127, "rewards/margins": 2.9521116097768147, "rewards/rejected": -2.6247075398763022, "step": 1745 }, { "epoch": 0.09254498714652956, "grad_norm": 63.5, "kl": 0.7967205047607422, "learning_rate": 5e-07, "logits/chosen": 5407016.666666667, "logits/rejected": -35487244.8, "logps/chosen": -401.0726318359375, "logps/rejected": -572.621484375, "loss": 0.2551, "rewards/chosen": 0.5290846029917399, "rewards/margins": 2.5453354040781657, "rewards/rejected": -2.016250801086426, "step": 1746 }, { "epoch": 0.0925979911483317, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28869552.0, "logits/rejected": -14690639.0, "logps/chosen": -168.70489501953125, "logps/rejected": -334.2040710449219, "loss": 0.3777, "rewards/chosen": -0.07619701325893402, "rewards/margins": 1.3152287155389786, "rewards/rejected": -1.3914257287979126, "step": 1747 }, { "epoch": 0.09265099515013384, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18505688.0, "logits/rejected": -23610456.0, "logps/chosen": -143.322900390625, "logps/rejected": -185.06363932291666, "loss": 0.3566, "rewards/chosen": 0.19295568466186525, "rewards/margins": 1.6509237607320149, "rewards/rejected": -1.4579680760701497, "step": 1748 }, { "epoch": 0.09270399915193597, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24612192.0, "logits/rejected": -40353260.0, "logps/chosen": -212.75723266601562, "logps/rejected": -279.9610900878906, "loss": 0.401, "rewards/chosen": 0.022500425577163696, "rewards/margins": 0.9289543330669403, "rewards/rejected": -0.9064539074897766, "step": 1749 }, { "epoch": 0.09275700315373811, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1972450.0, "logits/rejected": -26961542.4, "logps/chosen": -127.37692260742188, "logps/rejected": -278.27275390625, "loss": 0.3467, "rewards/chosen": -0.29059791564941406, "rewards/margins": 1.270319652557373, "rewards/rejected": -1.560917568206787, "step": 1750 }, { "epoch": 0.09281000715554025, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23789237.333333332, "logits/rejected": -34472550.4, "logps/chosen": -454.8662109375, "logps/rejected": -276.11171875, "loss": 0.2957, "rewards/chosen": 0.15721474091211954, "rewards/margins": 1.7214597741762798, "rewards/rejected": -1.5642450332641602, "step": 1751 }, { "epoch": 0.09286301115734238, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14521822.0, "logits/rejected": -20102772.0, "logps/chosen": -272.4482421875, "logps/rejected": -458.71649169921875, "loss": 0.3189, "rewards/chosen": 0.1268566995859146, "rewards/margins": 2.217650070786476, "rewards/rejected": -2.0907933712005615, "step": 1752 }, { "epoch": 0.09291601515914452, "grad_norm": 52.0, "kl": 0.4346122741699219, "learning_rate": 5e-07, "logits/chosen": -14445065.6, "logits/rejected": -30089832.0, "logps/chosen": -150.0982421875, "logps/rejected": -447.473388671875, "loss": 0.4009, "rewards/chosen": -0.012974134087562561, "rewards/margins": 1.4764933278163273, "rewards/rejected": -1.48946746190389, "step": 1753 }, { "epoch": 0.09296901916094664, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9500658.666666666, "logits/rejected": -85694003.2, "logps/chosen": -332.600830078125, "logps/rejected": -529.300244140625, "loss": 0.3306, "rewards/chosen": -0.08660888671875, "rewards/margins": 1.645907211303711, "rewards/rejected": -1.732516098022461, "step": 1754 }, { "epoch": 0.09302202316274878, "grad_norm": 52.0, "kl": 0.09007930755615234, "learning_rate": 5e-07, "logits/chosen": -36218428.8, "logits/rejected": -32913290.666666668, "logps/chosen": -304.5431640625, "logps/rejected": -402.541015625, "loss": 0.4002, "rewards/chosen": -0.16664248704910278, "rewards/margins": 1.4614229400952656, "rewards/rejected": -1.6280654271443684, "step": 1755 }, { "epoch": 0.09307502716455092, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5828595.0, "logits/rejected": -4067857.0, "logps/chosen": -25.116024017333984, "logps/rejected": -79.48728942871094, "loss": 0.2538, "rewards/chosen": 0.44032469391822815, "rewards/margins": 2.015119145313899, "rewards/rejected": -1.5747944513956706, "step": 1756 }, { "epoch": 0.09312803116635306, "grad_norm": 49.75, "kl": 0.12200546264648438, "learning_rate": 5e-07, "logits/chosen": -52027744.0, "logits/rejected": -26119576.0, "logps/chosen": -351.8741455078125, "logps/rejected": -331.1344970703125, "loss": 0.2431, "rewards/chosen": 0.6163427035013834, "rewards/margins": 2.6659934679667154, "rewards/rejected": -2.049650764465332, "step": 1757 }, { "epoch": 0.0931810351681552, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19158545.333333332, "logits/rejected": -40893720.0, "logps/chosen": -285.6405436197917, "logps/rejected": -140.53086853027344, "loss": 0.4584, "rewards/chosen": 0.001161704460779826, "rewards/margins": 0.6900422672430674, "rewards/rejected": -0.6888805627822876, "step": 1758 }, { "epoch": 0.09323403916995733, "grad_norm": 55.75, "kl": 0.08209609985351562, "learning_rate": 5e-07, "logits/chosen": -26996602.666666668, "logits/rejected": -63769128.0, "logps/chosen": -216.76224772135416, "logps/rejected": -267.2847900390625, "loss": 0.4327, "rewards/chosen": 0.1503356695175171, "rewards/margins": 0.9318256378173828, "rewards/rejected": -0.7814899682998657, "step": 1759 }, { "epoch": 0.09328704317175947, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62547160.0, "logits/rejected": -51400458.666666664, "logps/chosen": -907.444091796875, "logps/rejected": -418.9253743489583, "loss": 0.2512, "rewards/chosen": 0.6347503662109375, "rewards/margins": 2.023442268371582, "rewards/rejected": -1.3886919021606445, "step": 1760 }, { "epoch": 0.0933400471735616, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38513133.333333336, "logits/rejected": -15500396.8, "logps/chosen": -319.3058268229167, "logps/rejected": -141.4447998046875, "loss": 0.3374, "rewards/chosen": 0.08381499846776326, "rewards/margins": 1.2455320318539937, "rewards/rejected": -1.1617170333862306, "step": 1761 }, { "epoch": 0.09339305117536374, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28954994.0, "logits/rejected": -8816409.0, "logps/chosen": -539.9924926757812, "logps/rejected": -288.5747375488281, "loss": 0.3342, "rewards/chosen": 0.16616517305374146, "rewards/margins": 1.6705676913261414, "rewards/rejected": -1.5044025182724, "step": 1762 }, { "epoch": 0.09344605517716588, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1667578.0, "logits/rejected": -15532117.333333334, "logps/chosen": -216.78701782226562, "logps/rejected": -534.7324625651041, "loss": 0.2613, "rewards/chosen": 0.05936717987060547, "rewards/margins": 1.8882144292195637, "rewards/rejected": -1.8288472493489583, "step": 1763 }, { "epoch": 0.09349905917896802, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15428228.57142857, "logits/rejected": 6989628.5, "logps/chosen": -322.41385323660717, "logps/rejected": -191.02706909179688, "loss": 0.429, "rewards/chosen": 0.20175187928336008, "rewards/margins": 1.140692344733647, "rewards/rejected": -0.9389404654502869, "step": 1764 }, { "epoch": 0.09355206318077015, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36550437.333333336, "logits/rejected": -12549841.6, "logps/chosen": -284.41461181640625, "logps/rejected": -280.1782958984375, "loss": 0.327, "rewards/chosen": -0.30779369672139484, "rewards/margins": 1.3259957472483317, "rewards/rejected": -1.6337894439697265, "step": 1765 }, { "epoch": 0.09360506718257229, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18005046.85714286, "logits/rejected": -12332334.0, "logps/chosen": -230.20244489397322, "logps/rejected": -247.29232788085938, "loss": 0.4097, "rewards/chosen": 0.27228890146527973, "rewards/margins": 1.445207578795297, "rewards/rejected": -1.172918677330017, "step": 1766 }, { "epoch": 0.09365807118437441, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33123442.0, "logits/rejected": -3900974.0, "logps/chosen": -395.0350341796875, "logps/rejected": -57.931495666503906, "loss": 0.4824, "rewards/chosen": -0.3836052119731903, "rewards/margins": 0.16659584641456604, "rewards/rejected": -0.5502010583877563, "step": 1767 }, { "epoch": 0.09371107518617655, "grad_norm": 65.5, "kl": 0.8511810302734375, "learning_rate": 5e-07, "logits/chosen": -101932288.0, "logits/rejected": -41444064.0, "logps/chosen": -397.63701171875, "logps/rejected": -298.16050211588544, "loss": 0.3653, "rewards/chosen": 0.30005722045898436, "rewards/margins": 1.8514675776163738, "rewards/rejected": -1.5514103571573894, "step": 1768 }, { "epoch": 0.09376407918797869, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39416493.333333336, "logits/rejected": 5826052.0, "logps/chosen": -398.3076578776042, "logps/rejected": -228.2236785888672, "loss": 0.4434, "rewards/chosen": 0.05023409922917684, "rewards/margins": 0.8751547535260519, "rewards/rejected": -0.824920654296875, "step": 1769 }, { "epoch": 0.09381708318978083, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38240435.2, "logits/rejected": -44524208.0, "logps/chosen": -308.381494140625, "logps/rejected": -327.892333984375, "loss": 0.3315, "rewards/chosen": 0.3949504137039185, "rewards/margins": 2.1539133628209433, "rewards/rejected": -1.7589629491170247, "step": 1770 }, { "epoch": 0.09387008719158296, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31113756.0, "logits/rejected": -30485517.333333332, "logps/chosen": -719.6261596679688, "logps/rejected": -318.3597819010417, "loss": 0.3307, "rewards/chosen": 0.05681724101305008, "rewards/margins": 1.0810192699233692, "rewards/rejected": -1.024202028910319, "step": 1771 }, { "epoch": 0.0939230911933851, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -206824618.66666666, "logits/rejected": -33242137.6, "logps/chosen": -256.75587972005206, "logps/rejected": -437.9892578125, "loss": 0.3069, "rewards/chosen": 0.2182546059290568, "rewards/margins": 1.6058458725611369, "rewards/rejected": -1.38759126663208, "step": 1772 }, { "epoch": 0.09397609519518724, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65299580.0, "logits/rejected": -2276679.5, "logps/chosen": -413.6064147949219, "logps/rejected": -247.43661499023438, "loss": 0.3029, "rewards/chosen": 0.477533757686615, "rewards/margins": 1.770530641078949, "rewards/rejected": -1.292996883392334, "step": 1773 }, { "epoch": 0.09402909919698937, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2992750.5, "logits/rejected": -51938808.0, "logps/chosen": -152.5586395263672, "logps/rejected": -416.2208251953125, "loss": 0.3298, "rewards/chosen": -0.17539793252944946, "rewards/margins": 2.0137757658958435, "rewards/rejected": -2.189173698425293, "step": 1774 }, { "epoch": 0.09408210319879151, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 849516.1875, "logits/rejected": -11395320.0, "logps/chosen": -208.47314453125, "logps/rejected": -65.73689270019531, "loss": 0.3843, "rewards/chosen": 0.22941619157791138, "rewards/margins": 0.970294177532196, "rewards/rejected": -0.7408779859542847, "step": 1775 }, { "epoch": 0.09413510720059365, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44518339.2, "logits/rejected": -23474826.666666668, "logps/chosen": -392.9046142578125, "logps/rejected": -297.4837239583333, "loss": 0.3852, "rewards/chosen": 0.050633227825164794, "rewards/margins": 1.5109211166699728, "rewards/rejected": -1.4602878888448079, "step": 1776 }, { "epoch": 0.09418811120239579, "grad_norm": 45.75, "kl": 0.04194450378417969, "learning_rate": 5e-07, "logits/chosen": 765063.2, "logits/rejected": -40476970.666666664, "logps/chosen": -146.37353515625, "logps/rejected": -200.4962361653646, "loss": 0.4284, "rewards/chosen": -0.12351386547088623, "rewards/margins": 0.9780125538508098, "rewards/rejected": -1.101526419321696, "step": 1777 }, { "epoch": 0.09424111520419792, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1006480.0, "logits/rejected": -69835148.8, "logps/chosen": -84.79918416341145, "logps/rejected": -526.663232421875, "loss": 0.1943, "rewards/chosen": 0.7729655901590983, "rewards/margins": 2.955650488535563, "rewards/rejected": -2.182684898376465, "step": 1778 }, { "epoch": 0.09429411920600006, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48885856.0, "logits/rejected": -13353096.0, "logps/chosen": -329.32147216796875, "logps/rejected": -386.1357828776042, "loss": 0.195, "rewards/chosen": 0.3747367858886719, "rewards/margins": 2.4479859670003257, "rewards/rejected": -2.073249181111654, "step": 1779 }, { "epoch": 0.09434712320780218, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7781402.5, "logits/rejected": -28376934.0, "logps/chosen": -267.5843200683594, "logps/rejected": -436.2216491699219, "loss": 0.316, "rewards/chosen": -0.060719311237335205, "rewards/margins": 2.0453423857688904, "rewards/rejected": -2.1060616970062256, "step": 1780 }, { "epoch": 0.09440012720960432, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 879566.6666666666, "logits/rejected": -55584588.8, "logps/chosen": -72.51761372884114, "logps/rejected": -294.90537109375, "loss": 0.3233, "rewards/chosen": 0.04857929050922394, "rewards/margins": 1.7192637234926225, "rewards/rejected": -1.6706844329833985, "step": 1781 }, { "epoch": 0.09445313121140646, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14921704.0, "logits/rejected": -23156293.333333332, "logps/chosen": -445.387158203125, "logps/rejected": -351.1198323567708, "loss": 0.3383, "rewards/chosen": 0.258624267578125, "rewards/margins": 2.0142730394999186, "rewards/rejected": -1.7556487719217937, "step": 1782 }, { "epoch": 0.0945061352132086, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4123386.5, "logits/rejected": -7680522.0, "logps/chosen": -199.48655700683594, "logps/rejected": -210.86561584472656, "loss": 0.38, "rewards/chosen": 0.007740497589111328, "rewards/margins": 1.2390918731689453, "rewards/rejected": -1.231351375579834, "step": 1783 }, { "epoch": 0.09455913921501073, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63519940.0, "logits/rejected": -4651250.0, "logps/chosen": -620.3851318359375, "logps/rejected": -257.5225830078125, "loss": 0.3648, "rewards/chosen": 0.12523499131202698, "rewards/margins": 1.2268202602863312, "rewards/rejected": -1.1015852689743042, "step": 1784 }, { "epoch": 0.09461214321681287, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -126513312.0, "logits/rejected": -6423512.0, "logps/chosen": -242.03240966796875, "logps/rejected": -115.48093668619792, "loss": 0.3544, "rewards/chosen": -0.22665806114673615, "rewards/margins": 0.7681281218926111, "rewards/rejected": -0.9947861830393473, "step": 1785 }, { "epoch": 0.094665147218615, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31300573.333333332, "logits/rejected": -19656524.8, "logps/chosen": -521.6702067057291, "logps/rejected": -348.46923828125, "loss": 0.2746, "rewards/chosen": 0.5925680001576742, "rewards/margins": 1.9804702599843345, "rewards/rejected": -1.3879022598266602, "step": 1786 }, { "epoch": 0.09471815122041714, "grad_norm": 50.5, "kl": 0.09848976135253906, "learning_rate": 5e-07, "logits/chosen": -60852275.2, "logits/rejected": 26528722.666666668, "logps/chosen": -168.4760986328125, "logps/rejected": -146.74051920572916, "loss": 0.3637, "rewards/chosen": 0.2247614622116089, "rewards/margins": 1.458125885327657, "rewards/rejected": -1.233364423116048, "step": 1787 }, { "epoch": 0.09477115522221928, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65843816.0, "logits/rejected": -37855498.666666664, "logps/chosen": -88.245849609375, "logps/rejected": -673.655029296875, "loss": 0.2399, "rewards/chosen": -0.3914247453212738, "rewards/margins": 2.225936005512873, "rewards/rejected": -2.617360750834147, "step": 1788 }, { "epoch": 0.09482415922402142, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11264675.0, "logits/rejected": -11390157.333333334, "logps/chosen": -19.46575164794922, "logps/rejected": -163.86137898763022, "loss": 0.3655, "rewards/chosen": 0.17645645141601562, "rewards/margins": 0.9347279071807861, "rewards/rejected": -0.7582714557647705, "step": 1789 }, { "epoch": 0.09487716322582355, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14267952.0, "logits/rejected": -16324412.8, "logps/chosen": -276.1621907552083, "logps/rejected": -306.1931640625, "loss": 0.2731, "rewards/chosen": 0.428011417388916, "rewards/margins": 1.9156167030334472, "rewards/rejected": -1.4876052856445312, "step": 1790 }, { "epoch": 0.09493016722762569, "grad_norm": 80.5, "kl": 0.00966644287109375, "learning_rate": 5e-07, "logits/chosen": -21822570.666666668, "logits/rejected": -38443672.0, "logps/chosen": -409.2869873046875, "logps/rejected": -283.794677734375, "loss": 0.3872, "rewards/chosen": 0.06175944209098816, "rewards/margins": 2.331619471311569, "rewards/rejected": -2.269860029220581, "step": 1791 }, { "epoch": 0.09498317122942782, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41335949.333333336, "logits/rejected": -35486358.4, "logps/chosen": -218.1956787109375, "logps/rejected": -415.651220703125, "loss": 0.3211, "rewards/chosen": 0.19570489724477133, "rewards/margins": 1.5373647292455037, "rewards/rejected": -1.3416598320007325, "step": 1792 }, { "epoch": 0.09503617523122995, "grad_norm": 75.0, "kl": 2.38348388671875, "learning_rate": 5e-07, "logits/chosen": -39878643.2, "logits/rejected": -12833226.666666666, "logps/chosen": -771.98505859375, "logps/rejected": -193.68355305989584, "loss": 0.372, "rewards/chosen": 0.7068252563476562, "rewards/margins": 1.9517911275227864, "rewards/rejected": -1.2449658711751301, "step": 1793 }, { "epoch": 0.09508917923303209, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48691248.0, "logits/rejected": -26539173.333333332, "logps/chosen": -301.336962890625, "logps/rejected": -354.6471354166667, "loss": 0.4132, "rewards/chosen": -0.12361345291137696, "rewards/margins": 1.1897979418436686, "rewards/rejected": -1.3134113947550456, "step": 1794 }, { "epoch": 0.09514218323483423, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7126467.5, "logits/rejected": -24498346.0, "logps/chosen": -266.37109375, "logps/rejected": -363.9233703613281, "loss": 0.2926, "rewards/chosen": 0.08460532128810883, "rewards/margins": 2.2820044606924057, "rewards/rejected": -2.197399139404297, "step": 1795 }, { "epoch": 0.09519518723663636, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35474284.8, "logits/rejected": -5254722.0, "logps/chosen": -173.74814453125, "logps/rejected": -46.39995320638021, "loss": 0.4738, "rewards/chosen": -0.22153358459472655, "rewards/margins": 0.5593293825785319, "rewards/rejected": -0.7808629671732584, "step": 1796 }, { "epoch": 0.0952481912384385, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36662124.8, "logits/rejected": -24490450.666666668, "logps/chosen": -240.3205078125, "logps/rejected": -188.9697265625, "loss": 0.3017, "rewards/chosen": 0.3044334888458252, "rewards/margins": 2.707642380396525, "rewards/rejected": -2.4032088915506997, "step": 1797 }, { "epoch": 0.09530119524024064, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26574232.0, "logits/rejected": 135979225.6, "logps/chosen": -354.3741861979167, "logps/rejected": -603.2861328125, "loss": 0.2715, "rewards/chosen": 0.14582368731498718, "rewards/margins": 2.2595899760723115, "rewards/rejected": -2.1137662887573243, "step": 1798 }, { "epoch": 0.09535419924204278, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39152992.0, "logits/rejected": -23810150.0, "logps/chosen": -387.19122314453125, "logps/rejected": -173.29214477539062, "loss": 0.3689, "rewards/chosen": 0.10468579083681107, "rewards/margins": 1.3705191686749458, "rewards/rejected": -1.2658333778381348, "step": 1799 }, { "epoch": 0.09540720324384491, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22986387.2, "logits/rejected": -29072346.666666668, "logps/chosen": -322.2827392578125, "logps/rejected": -354.8944905598958, "loss": 0.3716, "rewards/chosen": 0.04569992423057556, "rewards/margins": 1.9588658114274342, "rewards/rejected": -1.9131658871968586, "step": 1800 }, { "epoch": 0.09546020724564705, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3406246.0, "logits/rejected": -28056861.333333332, "logps/chosen": -197.29054260253906, "logps/rejected": -387.379150390625, "loss": 0.3108, "rewards/chosen": -1.2425094842910767, "rewards/margins": 0.8273536761601767, "rewards/rejected": -2.0698631604512534, "step": 1801 }, { "epoch": 0.09551321124744919, "grad_norm": 74.0, "kl": 0.36873435974121094, "learning_rate": 5e-07, "logits/chosen": -22857362.666666668, "logits/rejected": 139001584.0, "logps/chosen": -348.0298665364583, "logps/rejected": -323.52178955078125, "loss": 0.3782, "rewards/chosen": 0.2339396278063456, "rewards/margins": 1.820862869421641, "rewards/rejected": -1.5869232416152954, "step": 1802 }, { "epoch": 0.09556621524925132, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -19599362.0, "logps/rejected": -253.36407470703125, "loss": 0.205, "rewards/rejected": -1.4637506008148193, "step": 1803 }, { "epoch": 0.09561921925105346, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42493590.4, "logits/rejected": -24361712.0, "logps/chosen": -284.629931640625, "logps/rejected": -404.4534098307292, "loss": 0.3759, "rewards/chosen": 0.13195592164993286, "rewards/margins": 2.004123826821645, "rewards/rejected": -1.8721679051717122, "step": 1804 }, { "epoch": 0.09567222325285558, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23401834.0, "logits/rejected": -27534038.0, "logps/chosen": -360.47418212890625, "logps/rejected": -308.3596496582031, "loss": 0.3192, "rewards/chosen": 0.31260591745376587, "rewards/margins": 1.7756127715110779, "rewards/rejected": -1.463006854057312, "step": 1805 }, { "epoch": 0.09572522725465772, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38793976.0, "logits/rejected": -22405716.0, "logps/chosen": -158.40606689453125, "logps/rejected": -192.0609130859375, "loss": 0.4683, "rewards/chosen": -0.2844066023826599, "rewards/margins": 0.33498579263687134, "rewards/rejected": -0.6193923950195312, "step": 1806 }, { "epoch": 0.09577823125645986, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20408474.666666668, "logits/rejected": -11518206.4, "logps/chosen": -299.6135660807292, "logps/rejected": -427.58154296875, "loss": 0.3404, "rewards/chosen": 0.0021187464396158853, "rewards/margins": 1.2333844820658366, "rewards/rejected": -1.2312657356262207, "step": 1807 }, { "epoch": 0.095831235258262, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25894008.0, "logits/rejected": -12081794.0, "logps/chosen": -266.5399169921875, "logps/rejected": -230.28863525390625, "loss": 0.3882, "rewards/chosen": -0.12087079137563705, "rewards/margins": 1.0720166340470314, "rewards/rejected": -1.1928874254226685, "step": 1808 }, { "epoch": 0.09588423926006413, "grad_norm": 81.0, "kl": 0.9506187438964844, "learning_rate": 5e-07, "logits/chosen": -27310137.6, "logits/rejected": -3582428.0, "logps/chosen": -671.00751953125, "logps/rejected": -225.74613444010416, "loss": 0.3424, "rewards/chosen": 0.4024847984313965, "rewards/margins": 1.7805974324544271, "rewards/rejected": -1.3781126340230305, "step": 1809 }, { "epoch": 0.09593724326186627, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20670790.666666668, "logits/rejected": -534354.8, "logps/chosen": -437.326416015625, "logps/rejected": -446.5384765625, "loss": 0.2888, "rewards/chosen": 0.2868865927060445, "rewards/margins": 1.8456409414609272, "rewards/rejected": -1.5587543487548827, "step": 1810 }, { "epoch": 0.09599024726366841, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6658329.5, "logits/rejected": -15587670.0, "logps/chosen": -190.56588745117188, "logps/rejected": -251.47344970703125, "loss": 0.3342, "rewards/chosen": 0.3403133451938629, "rewards/margins": 1.6696323454380035, "rewards/rejected": -1.3293190002441406, "step": 1811 }, { "epoch": 0.09604325126547054, "grad_norm": 52.25, "kl": 0.4370384216308594, "learning_rate": 5e-07, "logits/chosen": -27312208.0, "logits/rejected": -53685664.0, "logps/chosen": -237.70821126302084, "logps/rejected": -354.409423828125, "loss": 0.4387, "rewards/chosen": -0.048419187466303505, "rewards/margins": 1.9090328713258107, "rewards/rejected": -1.9574520587921143, "step": 1812 }, { "epoch": 0.09609625526727268, "grad_norm": 57.0, "kl": 0.14892196655273438, "learning_rate": 5e-07, "logits/chosen": -33748864.0, "logits/rejected": -206781.16666666666, "logps/chosen": -354.301025390625, "logps/rejected": -94.4461669921875, "loss": 0.3886, "rewards/chosen": 0.180985426902771, "rewards/margins": 1.2895596583684286, "rewards/rejected": -1.1085742314656575, "step": 1813 }, { "epoch": 0.09614925926907482, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26425874.666666668, "logits/rejected": -28501540.0, "logps/chosen": -195.1270548502604, "logps/rejected": -292.886962890625, "loss": 0.4497, "rewards/chosen": -0.0924207071463267, "rewards/margins": 1.241037239631017, "rewards/rejected": -1.3334579467773438, "step": 1814 }, { "epoch": 0.09620226327087696, "grad_norm": 60.5, "kl": 0.5299892425537109, "learning_rate": 5e-07, "logits/chosen": -47582772.0, "logits/rejected": -17973630.666666668, "logps/chosen": -454.905517578125, "logps/rejected": -291.317138671875, "loss": 0.3128, "rewards/chosen": 0.14046631753444672, "rewards/margins": 1.2871442884206772, "rewards/rejected": -1.1466779708862305, "step": 1815 }, { "epoch": 0.0962552672726791, "grad_norm": 60.25, "kl": 0.4219398498535156, "learning_rate": 5e-07, "logits/chosen": -45222628.0, "logits/rejected": -34542832.0, "logps/chosen": -516.0055541992188, "logps/rejected": -252.21356201171875, "loss": 0.315, "rewards/chosen": 0.3246699273586273, "rewards/margins": 1.8191426694393158, "rewards/rejected": -1.4944727420806885, "step": 1816 }, { "epoch": 0.09630827127448123, "grad_norm": 57.25, "kl": 0.6965484619140625, "learning_rate": 5e-07, "logits/chosen": -50978693.333333336, "logits/rejected": -55509520.0, "logps/chosen": -309.0506184895833, "logps/rejected": -332.658447265625, "loss": 0.429, "rewards/chosen": 0.18856608867645264, "rewards/margins": 1.178229033946991, "rewards/rejected": -0.9896629452705383, "step": 1817 }, { "epoch": 0.09636127527628335, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12132784.0, "logits/rejected": -5774438.666666667, "logps/chosen": -430.46795654296875, "logps/rejected": -475.4931640625, "loss": 0.3254, "rewards/chosen": 0.17916107177734375, "rewards/margins": 1.3776051203409831, "rewards/rejected": -1.1984440485636394, "step": 1818 }, { "epoch": 0.09641427927808549, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17788558.4, "logits/rejected": -26310032.0, "logps/chosen": -166.5486572265625, "logps/rejected": -234.857177734375, "loss": 0.3753, "rewards/chosen": 0.10470428466796874, "rewards/margins": 1.441021982828776, "rewards/rejected": -1.3363176981608074, "step": 1819 }, { "epoch": 0.09646728327988763, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32463720.0, "logits/rejected": -16947164.8, "logps/chosen": -391.1982421875, "logps/rejected": -265.072021484375, "loss": 0.2889, "rewards/chosen": 0.048755904038747154, "rewards/margins": 1.7614956577618917, "rewards/rejected": -1.7127397537231446, "step": 1820 }, { "epoch": 0.09652028728168977, "grad_norm": 69.0, "kl": 0.2743415832519531, "learning_rate": 5e-07, "logits/chosen": -24552473.6, "logits/rejected": -9133126.666666666, "logps/chosen": -571.736572265625, "logps/rejected": -360.9181722005208, "loss": 0.32, "rewards/chosen": 0.3908190965652466, "rewards/margins": 2.215117001533508, "rewards/rejected": -1.8242979049682617, "step": 1821 }, { "epoch": 0.0965732912834919, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52729365.333333336, "logits/rejected": -7683788.8, "logps/chosen": -387.7763264973958, "logps/rejected": -170.9668701171875, "loss": 0.3583, "rewards/chosen": 0.13013001283009848, "rewards/margins": 1.1347128947575886, "rewards/rejected": -1.0045828819274902, "step": 1822 }, { "epoch": 0.09662629528529404, "grad_norm": 45.75, "kl": 0.017363548278808594, "learning_rate": 5e-07, "logits/chosen": -33077780.0, "logits/rejected": -18494428.0, "logps/chosen": -321.7297058105469, "logps/rejected": -229.78955078125, "loss": 0.3565, "rewards/chosen": 0.05261430889368057, "rewards/margins": 1.4194570556282997, "rewards/rejected": -1.3668427467346191, "step": 1823 }, { "epoch": 0.09667929928709618, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27181453.333333332, "logits/rejected": -19185667.2, "logps/chosen": -197.6317342122396, "logps/rejected": -301.361181640625, "loss": 0.3137, "rewards/chosen": 0.0409109095732371, "rewards/margins": 1.6862543086210888, "rewards/rejected": -1.6453433990478517, "step": 1824 }, { "epoch": 0.09673230328889831, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43076242.28571428, "logits/rejected": -46459384.0, "logps/chosen": -212.04460797991072, "logps/rejected": -633.45751953125, "loss": 0.5073, "rewards/chosen": -0.24488813536507742, "rewards/margins": 1.5368624244417464, "rewards/rejected": -1.7817505598068237, "step": 1825 }, { "epoch": 0.09678530729070045, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50436148.0, "logits/rejected": -64264736.0, "logps/chosen": -207.11190795898438, "logps/rejected": -330.3109436035156, "loss": 0.3117, "rewards/chosen": 0.18863371014595032, "rewards/margins": 1.876994401216507, "rewards/rejected": -1.6883606910705566, "step": 1826 }, { "epoch": 0.09683831129250259, "grad_norm": 50.5, "kl": 0.1566162109375, "learning_rate": 5e-07, "logits/chosen": -8082759.333333333, "logits/rejected": -26739592.0, "logps/chosen": -132.4420166015625, "logps/rejected": -228.04550170898438, "loss": 0.4723, "rewards/chosen": 0.03468023240566254, "rewards/margins": 0.41729359328746796, "rewards/rejected": -0.3826133608818054, "step": 1827 }, { "epoch": 0.09689131529430473, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43572336.0, "logits/rejected": 3689778.0, "logps/chosen": -311.107421875, "logps/rejected": -356.7065734863281, "loss": 0.4629, "rewards/chosen": 0.044063438971837364, "rewards/margins": 0.5185125966866811, "rewards/rejected": -0.47444915771484375, "step": 1828 }, { "epoch": 0.09694431929610686, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24955234.0, "logits/rejected": -1887022.0, "logps/chosen": -216.62066650390625, "logps/rejected": -316.4983317057292, "loss": 0.2822, "rewards/chosen": 0.18594685196876526, "rewards/margins": 1.7733326653639476, "rewards/rejected": -1.5873858133951824, "step": 1829 }, { "epoch": 0.09699732329790899, "grad_norm": 64.0, "kl": 0.9329681396484375, "learning_rate": 5e-07, "logits/chosen": 10521911.0, "logits/rejected": -15626876.0, "logps/chosen": -380.4114990234375, "logps/rejected": -294.20684814453125, "loss": 0.391, "rewards/chosen": 0.0036636358126997948, "rewards/margins": 1.4084160095080733, "rewards/rejected": -1.4047523736953735, "step": 1830 }, { "epoch": 0.09705032729971112, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25073404.0, "logits/rejected": -8524721.0, "logps/chosen": -247.36843872070312, "logps/rejected": -331.0155944824219, "loss": 0.3281, "rewards/chosen": 0.20192794501781464, "rewards/margins": 1.7397788017988205, "rewards/rejected": -1.5378508567810059, "step": 1831 }, { "epoch": 0.09710333130151326, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36488796.0, "logits/rejected": -13835698.666666666, "logps/chosen": -411.14593505859375, "logps/rejected": -123.54393513997395, "loss": 0.3953, "rewards/chosen": 0.11747513711452484, "rewards/margins": 0.6621835877497991, "rewards/rejected": -0.5447084506352743, "step": 1832 }, { "epoch": 0.0971563353033154, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59187072.0, "logits/rejected": -54610712.0, "logps/chosen": -368.1055908203125, "logps/rejected": -238.4110870361328, "loss": 0.3002, "rewards/chosen": 0.4923728108406067, "rewards/margins": 1.7976225018501282, "rewards/rejected": -1.3052496910095215, "step": 1833 }, { "epoch": 0.09720933930511753, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14816674.666666666, "logits/rejected": -42666828.8, "logps/chosen": -309.52142333984375, "logps/rejected": -438.1517578125, "loss": 0.2446, "rewards/chosen": 0.3860234022140503, "rewards/margins": 2.4364100217819216, "rewards/rejected": -2.0503866195678713, "step": 1834 }, { "epoch": 0.09726234330691967, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31753817.6, "logits/rejected": -52877141.333333336, "logps/chosen": -168.7649169921875, "logps/rejected": -371.5827229817708, "loss": 0.4122, "rewards/chosen": -0.1302700161933899, "rewards/margins": 1.3329375465710958, "rewards/rejected": -1.4632075627644856, "step": 1835 }, { "epoch": 0.09731534730872181, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4055340.0, "logits/rejected": -9454368.0, "logps/chosen": -97.76070149739583, "logps/rejected": -325.0563720703125, "loss": 0.3132, "rewards/chosen": 0.05339177946249644, "rewards/margins": 1.3977439294258753, "rewards/rejected": -1.3443521499633788, "step": 1836 }, { "epoch": 0.09736835131052395, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23996522.0, "logits/rejected": -30656106.0, "logps/chosen": -444.5774841308594, "logps/rejected": -313.79461669921875, "loss": 0.3803, "rewards/chosen": 0.04850253835320473, "rewards/margins": 1.3863017298281193, "rewards/rejected": -1.3377991914749146, "step": 1837 }, { "epoch": 0.09742135531232608, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33004083.2, "logits/rejected": -27741925.333333332, "logps/chosen": -373.2138671875, "logps/rejected": -230.43416341145834, "loss": 0.3256, "rewards/chosen": 0.3647735595703125, "rewards/margins": 2.0014293670654295, "rewards/rejected": -1.6366558074951172, "step": 1838 }, { "epoch": 0.09747435931412822, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26180165.333333332, "logits/rejected": -33197548.8, "logps/chosen": -303.41623942057294, "logps/rejected": -307.098388671875, "loss": 0.2825, "rewards/chosen": 0.3349456787109375, "rewards/margins": 2.0567413330078126, "rewards/rejected": -1.721795654296875, "step": 1839 }, { "epoch": 0.09752736331593036, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -367517.4375, "logits/rejected": -55138725.333333336, "logps/chosen": -40.601226806640625, "logps/rejected": -284.0026041666667, "loss": 0.2859, "rewards/chosen": 0.33612126111984253, "rewards/margins": 1.8087619344393413, "rewards/rejected": -1.4726406733194988, "step": 1840 }, { "epoch": 0.0975803673177325, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10648601.0, "logits/rejected": -54999096.0, "logps/chosen": -91.12305450439453, "logps/rejected": -450.9554748535156, "loss": 0.3153, "rewards/chosen": 0.10989561676979065, "rewards/margins": 1.9130175411701202, "rewards/rejected": -1.8031219244003296, "step": 1841 }, { "epoch": 0.09763337131953463, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8056794.0, "logits/rejected": -30497782.4, "logps/chosen": -46.487630208333336, "logps/rejected": -537.779541015625, "loss": 0.2988, "rewards/chosen": 0.06452668209870656, "rewards/margins": 2.2534749319156013, "rewards/rejected": -2.1889482498168946, "step": 1842 }, { "epoch": 0.09768637532133675, "grad_norm": 61.75, "kl": 0.06356048583984375, "learning_rate": 5e-07, "logits/chosen": -29300928.0, "logits/rejected": -3314892.5, "logps/chosen": -317.9920349121094, "logps/rejected": -155.45639038085938, "loss": 0.2761, "rewards/chosen": 0.36062973737716675, "rewards/margins": 2.2325320839881897, "rewards/rejected": -1.871902346611023, "step": 1843 }, { "epoch": 0.09773937932313889, "grad_norm": 56.75, "kl": 0.055988311767578125, "learning_rate": 5e-07, "logits/chosen": -17627963.42857143, "logits/rejected": 6825272.0, "logps/chosen": -269.0955113002232, "logps/rejected": -144.977783203125, "loss": 0.4549, "rewards/chosen": 0.17590504033224924, "rewards/margins": 0.4800768111433302, "rewards/rejected": -0.30417177081108093, "step": 1844 }, { "epoch": 0.09779238332494103, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41898076.0, "logits/rejected": -21094164.0, "logps/chosen": -525.213134765625, "logps/rejected": -243.4742431640625, "loss": 0.323, "rewards/chosen": 0.1092529371380806, "rewards/margins": 1.1779147858421009, "rewards/rejected": -1.0686618487040203, "step": 1845 }, { "epoch": 0.09784538732674317, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56980240.0, "logits/rejected": -27853528.0, "logps/chosen": -263.6279296875, "logps/rejected": -337.57415771484375, "loss": 0.4016, "rewards/chosen": -0.1928638517856598, "rewards/margins": 1.0711300075054169, "rewards/rejected": -1.2639938592910767, "step": 1846 }, { "epoch": 0.0978983913285453, "grad_norm": 57.5, "kl": 0.0623016357421875, "learning_rate": 5e-07, "logits/chosen": -20783955.2, "logits/rejected": -6986044.666666667, "logps/chosen": -361.6943359375, "logps/rejected": -151.78270467122397, "loss": 0.3876, "rewards/chosen": 0.2948911190032959, "rewards/margins": 1.0733222961425781, "rewards/rejected": -0.7784311771392822, "step": 1847 }, { "epoch": 0.09795139533034744, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 902210.5, "logits/rejected": -38399658.666666664, "logps/chosen": -555.505419921875, "logps/rejected": -620.21728515625, "loss": 0.3193, "rewards/chosen": 0.3535665512084961, "rewards/margins": 2.9578346252441405, "rewards/rejected": -2.6042680740356445, "step": 1848 }, { "epoch": 0.09800439933214958, "grad_norm": 38.25, "kl": 0.07400798797607422, "learning_rate": 5e-07, "logits/chosen": -14125096.0, "logits/rejected": -36406632.0, "logps/chosen": -112.25241088867188, "logps/rejected": -217.06742350260416, "loss": 0.2425, "rewards/chosen": 0.55303955078125, "rewards/margins": 2.101444085439046, "rewards/rejected": -1.5484045346577961, "step": 1849 }, { "epoch": 0.09805740333395171, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59603624.0, "logits/rejected": -15858358.666666666, "logps/chosen": -126.79105377197266, "logps/rejected": -195.127197265625, "loss": 0.3776, "rewards/chosen": -0.33520087599754333, "rewards/margins": 0.6013580063978831, "rewards/rejected": -0.9365588823954264, "step": 1850 }, { "epoch": 0.09811040733575385, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96571509.33333333, "logits/rejected": -34431340.8, "logps/chosen": -404.3662109375, "logps/rejected": -330.935546875, "loss": 0.3292, "rewards/chosen": 0.23877461751302084, "rewards/margins": 1.3141264279683431, "rewards/rejected": -1.0753518104553224, "step": 1851 }, { "epoch": 0.09816341133755599, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28039076.0, "logits/rejected": -22228097.333333332, "logps/chosen": -267.8756408691406, "logps/rejected": -277.2371419270833, "loss": 0.255, "rewards/chosen": 0.6256813406944275, "rewards/margins": 1.90107657512029, "rewards/rejected": -1.2753952344258626, "step": 1852 }, { "epoch": 0.09821641533935813, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8267471.5, "logits/rejected": 2322295.25, "logps/chosen": -281.88726806640625, "logps/rejected": -68.76688385009766, "loss": 0.3496, "rewards/chosen": 0.32261162996292114, "rewards/margins": 1.4714091420173645, "rewards/rejected": -1.1487975120544434, "step": 1853 }, { "epoch": 0.09826941934116026, "grad_norm": 57.5, "kl": 0.16112899780273438, "learning_rate": 5e-07, "logits/chosen": -43892904.0, "logits/rejected": -27262242.0, "logps/chosen": -305.3468017578125, "logps/rejected": -402.0244140625, "loss": 0.366, "rewards/chosen": 0.2580534517765045, "rewards/margins": 1.463000327348709, "rewards/rejected": -1.2049468755722046, "step": 1854 }, { "epoch": 0.09832242334296239, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36643882.666666664, "logits/rejected": -29639299.2, "logps/chosen": -410.453369140625, "logps/rejected": -402.22607421875, "loss": 0.2744, "rewards/chosen": 0.045833329359690346, "rewards/margins": 1.954577950636546, "rewards/rejected": -1.9087446212768555, "step": 1855 }, { "epoch": 0.09837542734476452, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1800200.5, "logits/rejected": -20641284.0, "logps/chosen": -365.2862854003906, "logps/rejected": -344.6412658691406, "loss": 0.3548, "rewards/chosen": 0.08803138136863708, "rewards/margins": 1.457518070936203, "rewards/rejected": -1.369486689567566, "step": 1856 }, { "epoch": 0.09842843134656666, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19818392.0, "logits/rejected": -37616496.0, "logps/chosen": -310.9869079589844, "logps/rejected": -362.53533935546875, "loss": 0.3594, "rewards/chosen": 0.02439136803150177, "rewards/margins": 1.32548888027668, "rewards/rejected": -1.3010975122451782, "step": 1857 }, { "epoch": 0.0984814353483688, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2788889.6666666665, "logits/rejected": -2960258.0, "logps/chosen": -150.7707722981771, "logps/rejected": -328.550341796875, "loss": 0.301, "rewards/chosen": 0.497463862101237, "rewards/margins": 1.6514674822489421, "rewards/rejected": -1.154003620147705, "step": 1858 }, { "epoch": 0.09853443935017094, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43883764.0, "logits/rejected": -68321752.0, "logps/chosen": -336.1698913574219, "logps/rejected": -256.70452880859375, "loss": 0.3075, "rewards/chosen": 0.2637840211391449, "rewards/margins": 1.8553495109081268, "rewards/rejected": -1.591565489768982, "step": 1859 }, { "epoch": 0.09858744335197307, "grad_norm": 59.75, "kl": 1.5986480712890625, "learning_rate": 5e-07, "logits/chosen": -27395848.0, "logits/rejected": -8991120.0, "logps/chosen": -689.81591796875, "logps/rejected": -270.0349426269531, "loss": 0.3641, "rewards/chosen": 0.6807785034179688, "rewards/margins": 1.587204098701477, "rewards/rejected": -0.9064255952835083, "step": 1860 }, { "epoch": 0.09864044735377521, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36134480.0, "logits/rejected": -47674148.0, "logps/chosen": -325.161376953125, "logps/rejected": -443.1854248046875, "loss": 0.3142, "rewards/chosen": 0.31360846757888794, "rewards/margins": 2.5630868077278137, "rewards/rejected": -2.249478340148926, "step": 1861 }, { "epoch": 0.09869345135557735, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -97905776.0, "logits/rejected": -54866624.0, "logps/chosen": -160.88613891601562, "logps/rejected": -402.0858561197917, "loss": 0.3073, "rewards/chosen": -0.03536376357078552, "rewards/margins": 1.2881967922051747, "rewards/rejected": -1.3235605557759602, "step": 1862 }, { "epoch": 0.09874645535737948, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9609536.0, "logits/rejected": 13870839.0, "logps/chosen": -242.7608642578125, "logps/rejected": -778.082275390625, "loss": 0.4569, "rewards/chosen": -0.023391557591302053, "rewards/margins": 2.021115230662482, "rewards/rejected": -2.044506788253784, "step": 1863 }, { "epoch": 0.09879945935918162, "grad_norm": 54.75, "kl": 0.5548973083496094, "learning_rate": 5e-07, "logits/chosen": -27954384.0, "logits/rejected": -8489106.0, "logps/chosen": -281.45107421875, "logps/rejected": -115.05062866210938, "loss": 0.4195, "rewards/chosen": 0.15170867443084718, "rewards/margins": 0.9661623557408652, "rewards/rejected": -0.8144536813100179, "step": 1864 }, { "epoch": 0.09885246336098376, "grad_norm": 100.5, "kl": 0.5186576843261719, "learning_rate": 5e-07, "logits/chosen": 2504795.0, "logps/chosen": -624.63330078125, "loss": 0.5361, "rewards/chosen": -0.10532741993665695, "step": 1865 }, { "epoch": 0.0989054673627859, "grad_norm": 72.5, "kl": 0.3843193054199219, "learning_rate": 5e-07, "logits/chosen": -33234901.333333332, "logits/rejected": -3969390.5, "logps/chosen": -595.2066243489584, "logps/rejected": -260.58331298828125, "loss": 0.4571, "rewards/chosen": -0.0006240904331207275, "rewards/margins": 1.1948231160640717, "rewards/rejected": -1.1954472064971924, "step": 1866 }, { "epoch": 0.09895847136458803, "grad_norm": 53.5, "kl": 0.404296875, "learning_rate": 5e-07, "logits/chosen": -22953144.0, "logits/rejected": -9347436.0, "logps/chosen": -261.19049072265625, "logps/rejected": -222.02951049804688, "loss": 0.3518, "rewards/chosen": 0.02786560356616974, "rewards/margins": 1.5919160395860672, "rewards/rejected": -1.5640504360198975, "step": 1867 }, { "epoch": 0.09901147536639016, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25184169.6, "logits/rejected": 2986456.0, "logps/chosen": -184.42314453125, "logps/rejected": -284.07753499348956, "loss": 0.3503, "rewards/chosen": 0.33358809947967527, "rewards/margins": 1.5412271579106647, "rewards/rejected": -1.2076390584309895, "step": 1868 }, { "epoch": 0.09906447936819229, "grad_norm": 68.0, "kl": 0.6646614074707031, "learning_rate": 5e-07, "logits/chosen": -35986616.0, "logits/rejected": -33649880.0, "logps/chosen": -711.5909423828125, "logps/rejected": -412.96185302734375, "loss": 0.3399, "rewards/chosen": 0.4037249684333801, "rewards/margins": 1.6611025929450989, "rewards/rejected": -1.2573776245117188, "step": 1869 }, { "epoch": 0.09911748336999443, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74509786.66666667, "logits/rejected": -39034969.6, "logps/chosen": -394.1070963541667, "logps/rejected": -478.747412109375, "loss": 0.29, "rewards/chosen": -0.16210630536079407, "rewards/margins": 1.7524000346660613, "rewards/rejected": -1.9145063400268554, "step": 1870 }, { "epoch": 0.09917048737179657, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7014926.0, "logits/rejected": -21167762.666666668, "logps/chosen": -385.091064453125, "logps/rejected": -196.2498779296875, "loss": 0.2964, "rewards/chosen": 0.20244140923023224, "rewards/margins": 1.3574998726447423, "rewards/rejected": -1.15505846341451, "step": 1871 }, { "epoch": 0.0992234913735987, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25713908.0, "logits/rejected": 3763251.5, "logps/chosen": -107.21153259277344, "logps/rejected": -131.5408172607422, "loss": 0.4435, "rewards/chosen": -0.12274245917797089, "rewards/margins": 0.5216135531663895, "rewards/rejected": -0.6443560123443604, "step": 1872 }, { "epoch": 0.09927649537540084, "grad_norm": 63.25, "kl": 0.03334808349609375, "learning_rate": 5e-07, "logits/chosen": -126333109.33333333, "logits/rejected": -34742816.0, "logps/chosen": -582.1846923828125, "logps/rejected": -605.037646484375, "loss": 0.2663, "rewards/chosen": -0.014205930133660635, "rewards/margins": 2.113384440044562, "rewards/rejected": -2.127590370178223, "step": 1873 }, { "epoch": 0.09932949937720298, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32231568.0, "logits/rejected": -22265819.2, "logps/chosen": -150.2671915690104, "logps/rejected": -268.2063720703125, "loss": 0.3804, "rewards/chosen": -0.3814224402109782, "rewards/margins": 1.0113178094228108, "rewards/rejected": -1.392740249633789, "step": 1874 }, { "epoch": 0.09938250337900512, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2715444.0, "logits/rejected": -30034710.0, "logps/chosen": -201.57553100585938, "logps/rejected": -414.52972412109375, "loss": 0.3797, "rewards/chosen": -0.32022398710250854, "rewards/margins": 1.5029457211494446, "rewards/rejected": -1.8231697082519531, "step": 1875 }, { "epoch": 0.09943550738080725, "grad_norm": 53.25, "kl": 0.1479787826538086, "learning_rate": 5e-07, "logits/chosen": -1136189.25, "logits/rejected": -26132624.0, "logps/chosen": -327.21014404296875, "logps/rejected": -285.94732666015625, "loss": 0.3143, "rewards/chosen": 0.18655076622962952, "rewards/margins": 1.9041980803012848, "rewards/rejected": -1.7176473140716553, "step": 1876 }, { "epoch": 0.09948851138260939, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31469414.0, "logits/rejected": -25016234.666666668, "logps/chosen": -384.6078796386719, "logps/rejected": -313.2579345703125, "loss": 0.1666, "rewards/chosen": 0.49092361330986023, "rewards/margins": 2.8941409289836884, "rewards/rejected": -2.403217315673828, "step": 1877 }, { "epoch": 0.09954151538441153, "grad_norm": 46.75, "kl": 0.602025032043457, "learning_rate": 5e-07, "logits/chosen": 776299.6, "logits/rejected": -7874726.666666667, "logps/chosen": -146.955322265625, "logps/rejected": -259.3101806640625, "loss": 0.3776, "rewards/chosen": 0.25724790096282957, "rewards/margins": 1.5365190267562867, "rewards/rejected": -1.279271125793457, "step": 1878 }, { "epoch": 0.09959451938621366, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14258964.0, "logits/rejected": -20757083.42857143, "logps/chosen": -726.9466552734375, "logps/rejected": -338.0709751674107, "loss": 0.1962, "rewards/chosen": 1.132989525794983, "rewards/margins": 2.7545914479664395, "rewards/rejected": -1.6216019221714564, "step": 1879 }, { "epoch": 0.0996475233880158, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18017466.666666668, "logits/rejected": -36450163.2, "logps/chosen": -185.173095703125, "logps/rejected": -267.21044921875, "loss": 0.2745, "rewards/chosen": 0.9078301588694254, "rewards/margins": 2.3008782545725506, "rewards/rejected": -1.393048095703125, "step": 1880 }, { "epoch": 0.09970052738981793, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6472099.6, "logits/rejected": -2643460.0, "logps/chosen": -158.793603515625, "logps/rejected": -77.16710917154948, "loss": 0.4332, "rewards/chosen": -0.21113080978393556, "rewards/margins": 1.145848560333252, "rewards/rejected": -1.3569793701171875, "step": 1881 }, { "epoch": 0.09975353139162006, "grad_norm": 79.5, "kl": 0.5248565673828125, "learning_rate": 5e-07, "logits/chosen": -322135.3333333333, "logits/rejected": -47221004.0, "logps/chosen": -579.9866536458334, "logps/rejected": -356.3021545410156, "loss": 0.367, "rewards/chosen": 0.34399274984995526, "rewards/margins": 2.2702424923578897, "rewards/rejected": -1.9262497425079346, "step": 1882 }, { "epoch": 0.0998065353934222, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14930053.333333334, "logits/rejected": -24466332.0, "logps/chosen": -329.6440022786458, "logps/rejected": -450.3997802734375, "loss": 0.3863, "rewards/chosen": 0.037353520592053734, "rewards/margins": 2.6060059120257697, "rewards/rejected": -2.568652391433716, "step": 1883 }, { "epoch": 0.09985953939522434, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36606336.0, "logits/rejected": -20885897.333333332, "logps/chosen": -371.049462890625, "logps/rejected": -382.8144124348958, "loss": 0.3212, "rewards/chosen": 0.3882054328918457, "rewards/margins": 1.9749896367390951, "rewards/rejected": -1.5867842038472493, "step": 1884 }, { "epoch": 0.09991254339702647, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8175981.0, "logits/rejected": -17127392.0, "logps/chosen": -340.9349670410156, "logps/rejected": -300.7798549107143, "loss": 0.257, "rewards/chosen": 1.0392884016036987, "rewards/margins": 2.2537468671798706, "rewards/rejected": -1.2144584655761719, "step": 1885 }, { "epoch": 0.09996554739882861, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3473824.5, "logits/rejected": -123386.66666666667, "logps/chosen": -45.916534423828125, "logps/rejected": -490.8657633463542, "loss": 0.288, "rewards/chosen": -0.5829284191131592, "rewards/margins": 1.4026281833648682, "rewards/rejected": -1.9855566024780273, "step": 1886 }, { "epoch": 0.10001855140063075, "grad_norm": 64.5, "kl": 0.037240028381347656, "learning_rate": 5e-07, "logits/chosen": -11470200.0, "logits/rejected": 7200601.0, "logps/chosen": -208.44976806640625, "logps/rejected": -316.7996520996094, "loss": 0.3106, "rewards/chosen": 0.5124102830886841, "rewards/margins": 1.6817190647125244, "rewards/rejected": -1.1693087816238403, "step": 1887 }, { "epoch": 0.10007155540243289, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36674976.0, "logits/rejected": -26836640.0, "logps/chosen": -320.28411865234375, "logps/rejected": -215.57261657714844, "loss": 0.3276, "rewards/chosen": 0.08369049429893494, "rewards/margins": 1.7922477424144745, "rewards/rejected": -1.7085572481155396, "step": 1888 }, { "epoch": 0.10012455940423502, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -586862.0, "logits/rejected": -9238713.333333334, "logps/chosen": -193.75316162109374, "logps/rejected": -112.240966796875, "loss": 0.3501, "rewards/chosen": 0.23003125190734863, "rewards/margins": 1.7595144112904866, "rewards/rejected": -1.529483159383138, "step": 1889 }, { "epoch": 0.10017756340603716, "grad_norm": 51.25, "kl": 0.311370849609375, "learning_rate": 5e-07, "logits/chosen": -44082483.2, "logits/rejected": -28054309.333333332, "logps/chosen": -313.1482421875, "logps/rejected": -137.21160888671875, "loss": 0.3734, "rewards/chosen": 0.20648910999298095, "rewards/margins": 1.491697573661804, "rewards/rejected": -1.2852084636688232, "step": 1890 }, { "epoch": 0.1002305674078393, "grad_norm": 67.0, "kl": 0.09452056884765625, "learning_rate": 5e-07, "logits/chosen": -23275222.4, "logits/rejected": -23744026.666666668, "logps/chosen": -485.8603515625, "logps/rejected": -180.5552978515625, "loss": 0.4655, "rewards/chosen": -0.11748584508895873, "rewards/margins": 0.47483523289362584, "rewards/rejected": -0.5923210779825846, "step": 1891 }, { "epoch": 0.10028357140964143, "grad_norm": 60.75, "kl": 0.2652397155761719, "learning_rate": 5e-07, "logits/chosen": -41992501.333333336, "logits/rejected": -71183600.0, "logps/chosen": -267.4083658854167, "logps/rejected": -472.726318359375, "loss": 0.3926, "rewards/chosen": 0.16173623005549112, "rewards/margins": 1.7948005000750225, "rewards/rejected": -1.6330642700195312, "step": 1892 }, { "epoch": 0.10033657541144356, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25981862.0, "logits/rejected": -2879419.5, "logps/chosen": -265.14337158203125, "logps/rejected": -179.40533447265625, "loss": 0.325, "rewards/chosen": 0.15659476816654205, "rewards/margins": 1.6624817997217178, "rewards/rejected": -1.5058870315551758, "step": 1893 }, { "epoch": 0.1003895794132457, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9558744.0, "logits/rejected": -24968192.0, "logps/chosen": -26.153881072998047, "logps/rejected": -350.2828776041667, "loss": 0.285, "rewards/chosen": 0.1768559366464615, "rewards/margins": 1.7430310795704524, "rewards/rejected": -1.566175142923991, "step": 1894 }, { "epoch": 0.10044258341504783, "grad_norm": 51.75, "kl": 0.3288874626159668, "learning_rate": 5e-07, "logits/chosen": -18193512.0, "logits/rejected": -6078069.5, "logps/chosen": -270.7982482910156, "logps/rejected": -437.531494140625, "loss": 0.29, "rewards/chosen": 0.1356283575296402, "rewards/margins": 2.367578163743019, "rewards/rejected": -2.231949806213379, "step": 1895 }, { "epoch": 0.10049558741684997, "grad_norm": 54.75, "kl": 0.428680419921875, "learning_rate": 5e-07, "logits/chosen": -9711915.0, "logits/rejected": -3545105.75, "logps/chosen": -353.86199951171875, "logps/rejected": -110.65685272216797, "loss": 0.3223, "rewards/chosen": 0.32217711210250854, "rewards/margins": 1.766399085521698, "rewards/rejected": -1.4442219734191895, "step": 1896 }, { "epoch": 0.1005485914186521, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22532608.0, "logits/rejected": -12836109.333333334, "logps/chosen": -493.356494140625, "logps/rejected": -218.1143798828125, "loss": 0.3111, "rewards/chosen": 0.7134284973144531, "rewards/margins": 2.0091018676757812, "rewards/rejected": -1.2956733703613281, "step": 1897 }, { "epoch": 0.10060159542045424, "grad_norm": 77.0, "kl": 0.17912673950195312, "learning_rate": 5e-07, "logits/chosen": -42167725.71428572, "logits/rejected": -36227668.0, "logps/chosen": -330.44308035714283, "logps/rejected": -669.7131958007812, "loss": 0.3927, "rewards/chosen": 0.3157073089054653, "rewards/margins": 2.353128058569772, "rewards/rejected": -2.0374207496643066, "step": 1898 }, { "epoch": 0.10065459942225638, "grad_norm": 77.0, "kl": 0.4868583679199219, "learning_rate": 5e-07, "logits/chosen": -10027228.0, "logits/rejected": 12971457.6, "logps/chosen": -557.2664388020834, "logps/rejected": -265.8680419921875, "loss": 0.3186, "rewards/chosen": 0.27173564831415814, "rewards/margins": 1.6405877868334453, "rewards/rejected": -1.368852138519287, "step": 1899 }, { "epoch": 0.10070760342405852, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30014931.2, "logits/rejected": -51167552.0, "logps/chosen": -304.1877685546875, "logps/rejected": -217.300537109375, "loss": 0.3338, "rewards/chosen": 0.21348586082458496, "rewards/margins": 2.230284833908081, "rewards/rejected": -2.016798973083496, "step": 1900 }, { "epoch": 0.10076060742586065, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42957920.0, "logits/rejected": -12214676.0, "logps/chosen": -255.37261962890625, "logps/rejected": -383.6285400390625, "loss": 0.3423, "rewards/chosen": 0.0931975394487381, "rewards/margins": 1.5229621678590775, "rewards/rejected": -1.4297646284103394, "step": 1901 }, { "epoch": 0.10081361142766279, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16223740.0, "logits/rejected": 3888560.0, "logps/chosen": -129.91751098632812, "logps/rejected": -276.489306640625, "loss": 0.3204, "rewards/chosen": 0.049163629611333214, "rewards/margins": 1.477099420626958, "rewards/rejected": -1.427935791015625, "step": 1902 }, { "epoch": 0.10086661542946493, "grad_norm": 57.0, "kl": 1.083456039428711, "learning_rate": 5e-07, "logits/chosen": -35821107.2, "logits/rejected": -19388221.333333332, "logps/chosen": -268.2388671875, "logps/rejected": -244.4383544921875, "loss": 0.389, "rewards/chosen": 0.2771937370300293, "rewards/margins": 1.5152325948079426, "rewards/rejected": -1.2380388577779133, "step": 1903 }, { "epoch": 0.10091961943126707, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22328992.0, "logits/rejected": -20683699.2, "logps/chosen": -374.0758463541667, "logps/rejected": -270.524853515625, "loss": 0.3452, "rewards/chosen": 0.3059885899225871, "rewards/margins": 1.4782137791315715, "rewards/rejected": -1.1722251892089843, "step": 1904 }, { "epoch": 0.1009726234330692, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30237925.333333332, "logits/rejected": -14973648.0, "logps/chosen": -468.8494873046875, "logps/rejected": -473.7287109375, "loss": 0.3225, "rewards/chosen": 0.10641747713088989, "rewards/margins": 1.6196333050727845, "rewards/rejected": -1.5132158279418946, "step": 1905 }, { "epoch": 0.10102562743487133, "grad_norm": 57.25, "kl": 3.15594482421875, "learning_rate": 5e-07, "logits/chosen": -50409996.8, "logits/rejected": -51117200.0, "logps/chosen": -408.99375, "logps/rejected": -684.7327473958334, "loss": 0.3664, "rewards/chosen": 0.26721930503845215, "rewards/margins": 3.6271445751190186, "rewards/rejected": -3.3599252700805664, "step": 1906 }, { "epoch": 0.10107863143667346, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -113676362.66666667, "logits/rejected": -33635321.6, "logps/chosen": -280.1451416015625, "logps/rejected": -384.7798095703125, "loss": 0.3052, "rewards/chosen": -0.2575986981391907, "rewards/margins": 1.5577733874320985, "rewards/rejected": -1.8153720855712892, "step": 1907 }, { "epoch": 0.1011316354384756, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9004433.0, "logits/rejected": -10176980.0, "logps/chosen": -167.9784698486328, "logps/rejected": -267.4157409667969, "loss": 0.3077, "rewards/chosen": 0.2562919557094574, "rewards/margins": 1.8152982890605927, "rewards/rejected": -1.5590063333511353, "step": 1908 }, { "epoch": 0.10118463944027774, "grad_norm": 66.0, "kl": 0.6322288513183594, "learning_rate": 5e-07, "logits/chosen": -31476920.0, "logits/rejected": -53856544.0, "logps/chosen": -403.4774169921875, "logps/rejected": -574.846435546875, "loss": 0.4096, "rewards/chosen": 0.090664009253184, "rewards/margins": 1.8994500438372295, "rewards/rejected": -1.8087860345840454, "step": 1909 }, { "epoch": 0.10123764344207987, "grad_norm": 50.5, "kl": 0.28302764892578125, "learning_rate": 5e-07, "logits/chosen": -25484115.2, "logits/rejected": -4367381.0, "logps/chosen": -293.14697265625, "logps/rejected": -312.7688802083333, "loss": 0.3471, "rewards/chosen": 0.2476630449295044, "rewards/margins": 1.7941829601923625, "rewards/rejected": -1.5465199152628581, "step": 1910 }, { "epoch": 0.10129064744388201, "grad_norm": 67.0, "kl": 1.6672744750976562, "learning_rate": 5e-07, "logits/chosen": -25560340.0, "logits/rejected": -19273484.0, "logps/chosen": -494.8712158203125, "logps/rejected": -167.3594207763672, "loss": 0.359, "rewards/chosen": 0.3905262053012848, "rewards/margins": 1.6411764919757843, "rewards/rejected": -1.2506502866744995, "step": 1911 }, { "epoch": 0.10134365144568415, "grad_norm": 51.0, "kl": 0.2396392822265625, "learning_rate": 5e-07, "logits/chosen": -15634597.333333334, "logits/rejected": -7408952.8, "logps/chosen": -269.54201253255206, "logps/rejected": -126.8843994140625, "loss": 0.4007, "rewards/chosen": -0.08857536315917969, "rewards/margins": 0.7005049228668213, "rewards/rejected": -0.789080286026001, "step": 1912 }, { "epoch": 0.10139665544748629, "grad_norm": 58.25, "kl": 0.4518775939941406, "learning_rate": 5e-07, "logits/chosen": -26917988.57142857, "logits/rejected": -4668949.0, "logps/chosen": -329.60787527901783, "logps/rejected": -339.44915771484375, "loss": 0.4725, "rewards/chosen": 0.014087183134896415, "rewards/margins": 1.4201174804142542, "rewards/rejected": -1.406030297279358, "step": 1913 }, { "epoch": 0.10144965944928842, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22856896.0, "logits/rejected": -53578200.0, "logps/chosen": -255.33743286132812, "logps/rejected": -476.8962097167969, "loss": 0.2663, "rewards/chosen": 0.48713815212249756, "rewards/margins": 2.4333568811416626, "rewards/rejected": -1.946218729019165, "step": 1914 }, { "epoch": 0.10150266345109056, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33689460.0, "logits/rejected": -52378544.0, "logps/chosen": -252.12960815429688, "logps/rejected": -428.4476725260417, "loss": 0.2121, "rewards/chosen": 0.47403907775878906, "rewards/margins": 2.76813538869222, "rewards/rejected": -2.294096310933431, "step": 1915 }, { "epoch": 0.1015556674528927, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34482956.8, "logits/rejected": -41575717.333333336, "logps/chosen": -163.1872314453125, "logps/rejected": -440.9923502604167, "loss": 0.3319, "rewards/chosen": 0.07672637701034546, "rewards/margins": 2.692300577958425, "rewards/rejected": -2.6155742009480796, "step": 1916 }, { "epoch": 0.10160867145469483, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1583315.9, "logits/rejected": -50301232.0, "logps/chosen": -200.9180908203125, "logps/rejected": -294.3160400390625, "loss": 0.3562, "rewards/chosen": 0.33987979888916015, "rewards/margins": 1.4369735399882, "rewards/rejected": -1.0970937410990398, "step": 1917 }, { "epoch": 0.10166167545649697, "grad_norm": 62.0, "kl": 0.5825538635253906, "learning_rate": 5e-07, "logits/chosen": -14113035.2, "logits/rejected": -7345656.0, "logps/chosen": -257.4928466796875, "logps/rejected": -184.22265625, "loss": 0.401, "rewards/chosen": 0.20343568325042724, "rewards/margins": 1.2150545358657836, "rewards/rejected": -1.0116188526153564, "step": 1918 }, { "epoch": 0.1017146794582991, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -133348760.0, "logits/rejected": -34687501.333333336, "logps/chosen": -347.982177734375, "logps/rejected": -374.864013671875, "loss": 0.2203, "rewards/chosen": 0.40658265352249146, "rewards/margins": 2.523706098397573, "rewards/rejected": -2.1171234448750815, "step": 1919 }, { "epoch": 0.10176768346010123, "grad_norm": 48.25, "kl": 0.9813222885131836, "learning_rate": 5e-07, "logits/chosen": 16735633.0, "logits/rejected": 475650.1666666667, "logps/chosen": -241.40701293945312, "logps/rejected": -81.11264038085938, "loss": 0.3765, "rewards/chosen": -0.06402625888586044, "rewards/margins": 0.6787332867582639, "rewards/rejected": -0.7427595456441244, "step": 1920 }, { "epoch": 0.10182068746190337, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33918500.571428575, "logits/rejected": -3593364.5, "logps/chosen": -486.1581333705357, "logps/rejected": -58.917572021484375, "loss": 0.4889, "rewards/chosen": 0.05454150268009731, "rewards/margins": 0.04389430076948234, "rewards/rejected": 0.010647201910614967, "step": 1921 }, { "epoch": 0.1018736914637055, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12735004.0, "logits/rejected": -42586554.666666664, "logps/chosen": -427.2045593261719, "logps/rejected": -391.88427734375, "loss": 0.315, "rewards/chosen": -0.37430036067962646, "rewards/margins": 1.079047958056132, "rewards/rejected": -1.4533483187357585, "step": 1922 }, { "epoch": 0.10192669546550764, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5759623.5, "logits/rejected": -20523676.0, "logps/chosen": -297.9627380371094, "logps/rejected": -111.39859008789062, "loss": 0.3852, "rewards/chosen": -0.04390771687030792, "rewards/margins": 1.0995218008756638, "rewards/rejected": -1.1434295177459717, "step": 1923 }, { "epoch": 0.10197969946730978, "grad_norm": 55.0, "kl": 0.30698204040527344, "learning_rate": 5e-07, "logits/chosen": -16992416.0, "logits/rejected": -7478925.0, "logps/chosen": -286.5658772786458, "logps/rejected": -52.02149963378906, "loss": 0.4024, "rewards/chosen": 0.24592177073160806, "rewards/margins": 1.321877161661784, "rewards/rejected": -1.0759553909301758, "step": 1924 }, { "epoch": 0.10203270346911192, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17500830.0, "logits/rejected": -6687333.0, "logps/chosen": -237.34228515625, "logps/rejected": -232.78550720214844, "loss": 0.3253, "rewards/chosen": 0.4343113601207733, "rewards/margins": 1.730559080839157, "rewards/rejected": -1.2962477207183838, "step": 1925 }, { "epoch": 0.10208570747091406, "grad_norm": 53.0, "kl": 0.8802032470703125, "learning_rate": 5e-07, "logits/chosen": -82177113.6, "logits/rejected": -23235997.333333332, "logps/chosen": -272.297265625, "logps/rejected": -232.90999348958334, "loss": 0.3769, "rewards/chosen": 0.19061675071716308, "rewards/margins": 1.7464744091033935, "rewards/rejected": -1.5558576583862305, "step": 1926 }, { "epoch": 0.10213871147271619, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27710460.0, "logits/rejected": -26751034.0, "logps/chosen": -261.961181640625, "logps/rejected": -325.1553955078125, "loss": 0.3216, "rewards/chosen": -0.031073950231075287, "rewards/margins": 2.378909446299076, "rewards/rejected": -2.4099833965301514, "step": 1927 }, { "epoch": 0.10219171547451833, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -159503968.0, "logits/rejected": -51127610.666666664, "logps/chosen": -542.6283569335938, "logps/rejected": -381.8610026041667, "loss": 0.2731, "rewards/chosen": -0.43766021728515625, "rewards/margins": 1.3671706517537434, "rewards/rejected": -1.8048308690388997, "step": 1928 }, { "epoch": 0.10224471947632047, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27160037.333333332, "logits/rejected": -34760518.4, "logps/chosen": -297.3349202473958, "logps/rejected": -442.587841796875, "loss": 0.2841, "rewards/chosen": 0.12182090679804485, "rewards/margins": 2.15723960796992, "rewards/rejected": -2.035418701171875, "step": 1929 }, { "epoch": 0.1022977234781226, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65940346.666666664, "logits/rejected": -2252953.2, "logps/chosen": -365.2857666015625, "logps/rejected": -236.4838623046875, "loss": 0.3482, "rewards/chosen": -0.16521072387695312, "rewards/margins": 1.2545405387878419, "rewards/rejected": -1.419751262664795, "step": 1930 }, { "epoch": 0.10235072747992473, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18257698.666666668, "logits/rejected": -20582001.6, "logps/chosen": -83.36869303385417, "logps/rejected": -146.289697265625, "loss": 0.3842, "rewards/chosen": -2.7974446614583332e-05, "rewards/margins": 0.822946802775065, "rewards/rejected": -0.8229747772216797, "step": 1931 }, { "epoch": 0.10240373148172686, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4404551.0, "logits/rejected": -32677394.285714287, "logps/chosen": -37.47413635253906, "logps/rejected": -272.93838936941967, "loss": 0.2425, "rewards/chosen": 1.1606944799423218, "rewards/margins": 2.492600117410932, "rewards/rejected": -1.3319056374686105, "step": 1932 }, { "epoch": 0.102456735483529, "grad_norm": 71.0, "kl": 1.8240127563476562, "learning_rate": 5e-07, "logits/chosen": -30343720.0, "logits/rejected": -7608163.5, "logps/chosen": -522.3286743164062, "logps/rejected": -206.76559448242188, "loss": 0.4032, "rewards/chosen": 0.41224512457847595, "rewards/margins": 1.2665279805660248, "rewards/rejected": -0.8542828559875488, "step": 1933 }, { "epoch": 0.10250973948533114, "grad_norm": 53.75, "kl": 0.21927261352539062, "learning_rate": 5e-07, "logits/chosen": -50225320.0, "logits/rejected": -30462896.0, "logps/chosen": -280.94635009765625, "logps/rejected": -337.3820495605469, "loss": 0.4, "rewards/chosen": -0.13726167380809784, "rewards/margins": 1.0909151583909988, "rewards/rejected": -1.2281768321990967, "step": 1934 }, { "epoch": 0.10256274348713328, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34174728.0, "logits/rejected": -31008480.0, "logps/chosen": -250.5577596028646, "logps/rejected": -419.7357421875, "loss": 0.2752, "rewards/chosen": -0.18098348379135132, "rewards/margins": 2.0418997406959534, "rewards/rejected": -2.2228832244873047, "step": 1935 }, { "epoch": 0.10261574748893541, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8694895.0, "logits/rejected": -15181557.333333334, "logps/chosen": -182.55751037597656, "logps/rejected": -215.69661458333334, "loss": 0.305, "rewards/chosen": -0.013144686818122864, "rewards/margins": 1.3543880432844162, "rewards/rejected": -1.367532730102539, "step": 1936 }, { "epoch": 0.10266875149073755, "grad_norm": 74.5, "kl": 1.1415863037109375, "learning_rate": 5e-07, "logits/chosen": -57341781.333333336, "logits/rejected": 25404948.0, "logps/chosen": -607.4486490885416, "logps/rejected": -497.5523681640625, "loss": 0.3336, "rewards/chosen": 0.5583692391713461, "rewards/margins": 2.5757857163747153, "rewards/rejected": -2.017416477203369, "step": 1937 }, { "epoch": 0.10272175549253969, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2285123.75, "logits/rejected": 434975456.0, "logps/chosen": -204.1608428955078, "logps/rejected": -252.7532958984375, "loss": 0.3861, "rewards/chosen": 0.23295150697231293, "rewards/margins": 1.2282671183347702, "rewards/rejected": -0.9953156113624573, "step": 1938 }, { "epoch": 0.10277475949434182, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21277912.0, "logits/rejected": -11798370.4, "logps/chosen": -294.4322102864583, "logps/rejected": -313.24033203125, "loss": 0.3392, "rewards/chosen": 0.0987283190091451, "rewards/margins": 1.2431528528531393, "rewards/rejected": -1.1444245338439942, "step": 1939 }, { "epoch": 0.10282776349614396, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20093744.0, "logits/rejected": -25077180.0, "logps/chosen": -290.18792724609375, "logps/rejected": -384.4915771484375, "loss": 0.3049, "rewards/chosen": 0.35435378551483154, "rewards/margins": 1.7781455516815186, "rewards/rejected": -1.423791766166687, "step": 1940 }, { "epoch": 0.1028807674979461, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30444120.0, "logits/rejected": -20183484.0, "logps/chosen": -458.3526611328125, "logps/rejected": -305.2381286621094, "loss": 0.3293, "rewards/chosen": 0.09728895127773285, "rewards/margins": 2.1862870305776596, "rewards/rejected": -2.0889980792999268, "step": 1941 }, { "epoch": 0.10293377149974824, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17908449.6, "logits/rejected": -3974017.3333333335, "logps/chosen": -247.78798828125, "logps/rejected": -723.3340657552084, "loss": 0.3088, "rewards/chosen": 0.3911550998687744, "rewards/margins": 3.0072420597076417, "rewards/rejected": -2.616086959838867, "step": 1942 }, { "epoch": 0.10298677550155037, "grad_norm": 52.25, "kl": 0.5194244384765625, "learning_rate": 5e-07, "logits/chosen": -50547768.0, "logits/rejected": -28377172.0, "logps/chosen": -405.0252990722656, "logps/rejected": -268.9909973144531, "loss": 0.3295, "rewards/chosen": 0.2875639498233795, "rewards/margins": 1.863192230463028, "rewards/rejected": -1.5756282806396484, "step": 1943 }, { "epoch": 0.1030397795033525, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2440658.25, "logits/rejected": -31556293.333333332, "logps/chosen": -283.2103576660156, "logps/rejected": -195.9791056315104, "loss": 0.2827, "rewards/chosen": 0.2556452751159668, "rewards/margins": 1.6339354515075684, "rewards/rejected": -1.3782901763916016, "step": 1944 }, { "epoch": 0.10309278350515463, "grad_norm": 56.25, "kl": 0.49730682373046875, "learning_rate": 5e-07, "logits/chosen": -27676629.333333332, "logits/rejected": 15555292.8, "logps/chosen": -402.9079182942708, "logps/rejected": -106.64815673828124, "loss": 0.3942, "rewards/chosen": 0.3065897623697917, "rewards/margins": 0.8322437445322672, "rewards/rejected": -0.5256539821624756, "step": 1945 }, { "epoch": 0.10314578750695677, "grad_norm": 72.0, "kl": 0.7301254272460938, "learning_rate": 5e-07, "logits/chosen": -37743664.0, "logits/rejected": -6918417.5, "logps/chosen": -492.5325404575893, "logps/rejected": -54.49012756347656, "loss": 0.449, "rewards/chosen": 0.20073003428322927, "rewards/margins": 1.272753426006862, "rewards/rejected": -1.0720233917236328, "step": 1946 }, { "epoch": 0.10319879150875891, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10203667.0, "logits/rejected": -20930858.666666668, "logps/chosen": -163.17567443847656, "logps/rejected": -374.8618570963542, "loss": 0.2231, "rewards/chosen": 0.3891373872756958, "rewards/margins": 2.254833658536275, "rewards/rejected": -1.8656962712605794, "step": 1947 }, { "epoch": 0.10325179551056105, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38635268.0, "logits/rejected": -28702604.0, "logps/chosen": -256.5457458496094, "logps/rejected": -588.4710693359375, "loss": 0.3048, "rewards/chosen": -0.0651189386844635, "rewards/margins": 2.356330007314682, "rewards/rejected": -2.4214489459991455, "step": 1948 }, { "epoch": 0.10330479951236318, "grad_norm": 62.0, "kl": 0.5316314697265625, "learning_rate": 5e-07, "logits/chosen": -49185110.4, "logits/rejected": -9236254.666666666, "logps/chosen": -412.463134765625, "logps/rejected": -120.56090291341145, "loss": 0.3517, "rewards/chosen": 0.29947996139526367, "rewards/margins": 1.871465841929118, "rewards/rejected": -1.5719858805338542, "step": 1949 }, { "epoch": 0.10335780351416532, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11959840.0, "logits/rejected": -23438394.0, "logps/chosen": -191.5904998779297, "logps/rejected": -319.951416015625, "loss": 0.3636, "rewards/chosen": 0.2055114358663559, "rewards/margins": 1.2625708431005478, "rewards/rejected": -1.057059407234192, "step": 1950 }, { "epoch": 0.10341080751596746, "grad_norm": 157.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21969824.0, "logits/rejected": -28084140.0, "logps/chosen": -194.01499720982142, "logps/rejected": -590.977294921875, "loss": 0.4458, "rewards/chosen": -0.022817722388676236, "rewards/margins": 3.331991800240108, "rewards/rejected": -3.354809522628784, "step": 1951 }, { "epoch": 0.1034638115177696, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19398096.0, "logits/rejected": -22357570.666666668, "logps/chosen": -150.44432067871094, "logps/rejected": -375.1659342447917, "loss": 0.2843, "rewards/chosen": 0.011073682457208633, "rewards/margins": 1.5150938965380192, "rewards/rejected": -1.5040202140808105, "step": 1952 }, { "epoch": 0.10351681551957173, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36511536.0, "logits/rejected": -65147845.333333336, "logps/chosen": -305.5763916015625, "logps/rejected": -487.9147135416667, "loss": 0.3245, "rewards/chosen": 0.20304527282714843, "rewards/margins": 2.511384868621826, "rewards/rejected": -2.3083395957946777, "step": 1953 }, { "epoch": 0.10356981952137387, "grad_norm": 51.25, "kl": 0.7462806701660156, "learning_rate": 5e-07, "logits/chosen": -36641609.6, "logits/rejected": -32482154.666666668, "logps/chosen": -276.22158203125, "logps/rejected": -297.4273274739583, "loss": 0.3174, "rewards/chosen": 0.4615070343017578, "rewards/margins": 2.3626830101013185, "rewards/rejected": -1.9011759757995605, "step": 1954 }, { "epoch": 0.103622823523176, "grad_norm": 52.5, "kl": 0.034852027893066406, "learning_rate": 5e-07, "logits/chosen": 26616296.0, "logits/rejected": -31309672.0, "logps/chosen": -197.90750122070312, "logps/rejected": -443.61907958984375, "loss": 0.3639, "rewards/chosen": 0.019327916204929352, "rewards/margins": 1.403215803205967, "rewards/rejected": -1.3838878870010376, "step": 1955 }, { "epoch": 0.10367582752497814, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39367789.333333336, "logits/rejected": -19816336.0, "logps/chosen": -100.75935872395833, "logps/rejected": -284.079248046875, "loss": 0.3026, "rewards/chosen": 0.02020702262719472, "rewards/margins": 1.745259474714597, "rewards/rejected": -1.7250524520874024, "step": 1956 }, { "epoch": 0.10372883152678027, "grad_norm": 58.0, "kl": 0.049935340881347656, "learning_rate": 5e-07, "logits/chosen": -24363376.0, "logits/rejected": -17746128.0, "logps/chosen": -328.5555419921875, "logps/rejected": -178.92901611328125, "loss": 0.3612, "rewards/chosen": 0.35241812467575073, "rewards/margins": 1.189553439617157, "rewards/rejected": -0.8371353149414062, "step": 1957 }, { "epoch": 0.1037818355285824, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11846968.0, "logits/rejected": -6740864.0, "logps/chosen": -311.569287109375, "logps/rejected": -121.51668294270833, "loss": 0.3452, "rewards/chosen": 0.13266205787658691, "rewards/margins": 2.0724402268727617, "rewards/rejected": -1.939778168996175, "step": 1958 }, { "epoch": 0.10383483953038454, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45945536.0, "logits/rejected": -8451855.0, "logps/chosen": -320.7881774902344, "logps/rejected": -113.78990936279297, "loss": 0.3514, "rewards/chosen": 0.17085589468479156, "rewards/margins": 1.4071575552225113, "rewards/rejected": -1.2363016605377197, "step": 1959 }, { "epoch": 0.10388784353218668, "grad_norm": 61.0, "kl": 0.09132766723632812, "learning_rate": 5e-07, "logits/chosen": -25591769.6, "logits/rejected": -12717001.333333334, "logps/chosen": -378.9421142578125, "logps/rejected": -322.6038818359375, "loss": 0.2947, "rewards/chosen": 0.5277490139007568, "rewards/margins": 2.518429390589396, "rewards/rejected": -1.9906803766886394, "step": 1960 }, { "epoch": 0.10394084753398881, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14301606.857142856, "logits/rejected": -73454160.0, "logps/chosen": -442.2725306919643, "logps/rejected": -391.5897216796875, "loss": 0.4323, "rewards/chosen": 0.14295826639447892, "rewards/margins": 1.6390642949513026, "rewards/rejected": -1.4961060285568237, "step": 1961 }, { "epoch": 0.10399385153579095, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40991216.0, "logits/rejected": -25087104.0, "logps/chosen": -202.278564453125, "logps/rejected": -188.0781046549479, "loss": 0.3437, "rewards/chosen": -0.1849406361579895, "rewards/margins": 0.8430648843447368, "rewards/rejected": -1.0280055205027263, "step": 1962 }, { "epoch": 0.10404685553759309, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16893452.0, "logits/rejected": -26790680.0, "logps/chosen": -148.61862182617188, "logps/rejected": -256.1817321777344, "loss": 0.3756, "rewards/chosen": -0.22664442658424377, "rewards/margins": 1.3923707902431488, "rewards/rejected": -1.6190152168273926, "step": 1963 }, { "epoch": 0.10409985953939523, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22881660.8, "logits/rejected": 6803417.333333333, "logps/chosen": -345.1842041015625, "logps/rejected": -342.8349202473958, "loss": 0.328, "rewards/chosen": 0.3896191120147705, "rewards/margins": 2.017473141352336, "rewards/rejected": -1.6278540293375652, "step": 1964 }, { "epoch": 0.10415286354119736, "grad_norm": 46.25, "kl": 0.69970703125, "learning_rate": 5e-07, "logits/chosen": -16716896.0, "logits/rejected": -34264604.8, "logps/chosen": -230.84285481770834, "logps/rejected": -248.0786865234375, "loss": 0.3364, "rewards/chosen": -0.27526867389678955, "rewards/margins": 1.3926655530929566, "rewards/rejected": -1.6679342269897461, "step": 1965 }, { "epoch": 0.1042058675429995, "grad_norm": 64.0, "kl": 0.3002490997314453, "learning_rate": 5e-07, "logits/chosen": -48119972.571428575, "logits/rejected": -64851264.0, "logps/chosen": -273.78553989955356, "logps/rejected": -233.90673828125, "loss": 0.4378, "rewards/chosen": 0.26205488613673616, "rewards/margins": 0.8538273828370231, "rewards/rejected": -0.5917724967002869, "step": 1966 }, { "epoch": 0.10425887154480164, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58531833.6, "logits/rejected": -32035978.666666668, "logps/chosen": -365.664599609375, "logps/rejected": -295.3442789713542, "loss": 0.4496, "rewards/chosen": 0.20540709495544435, "rewards/margins": 0.4335013548533122, "rewards/rejected": -0.22809425989786783, "step": 1967 }, { "epoch": 0.10431187554660377, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35066896.0, "logits/rejected": -28761578.666666668, "logps/chosen": -545.7614135742188, "logps/rejected": -309.1330973307292, "loss": 0.3111, "rewards/chosen": 0.47215574979782104, "rewards/margins": 1.453725834687551, "rewards/rejected": -0.9815700848897299, "step": 1968 }, { "epoch": 0.1043648795484059, "grad_norm": 49.0, "kl": 0.8464756011962891, "learning_rate": 5e-07, "logits/chosen": -21884661.333333332, "logits/rejected": -24965090.0, "logps/chosen": -235.1502685546875, "logps/rejected": -322.84283447265625, "loss": 0.3672, "rewards/chosen": 0.3391154607137044, "rewards/margins": 2.810465653737386, "rewards/rejected": -2.4713501930236816, "step": 1969 }, { "epoch": 0.10441788355020803, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21597940.0, "logits/rejected": -38272944.0, "logps/chosen": -86.82745361328125, "logps/rejected": -241.9068359375, "loss": 0.2597, "rewards/chosen": 0.578136126200358, "rewards/margins": 2.1757940928141277, "rewards/rejected": -1.5976579666137696, "step": 1970 }, { "epoch": 0.10447088755201017, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52767596.0, "logits/rejected": -10400904.0, "logps/chosen": -254.71534729003906, "logps/rejected": -282.58062744140625, "loss": 0.3193, "rewards/chosen": 0.18368330597877502, "rewards/margins": 1.9080304205417633, "rewards/rejected": -1.7243471145629883, "step": 1971 }, { "epoch": 0.10452389155381231, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21723016.0, "logits/rejected": -36714104.0, "logps/chosen": -430.5999348958333, "logps/rejected": -583.4983520507812, "loss": 0.3909, "rewards/chosen": 0.2043129007021586, "rewards/margins": 1.6391113797823589, "rewards/rejected": -1.4347984790802002, "step": 1972 }, { "epoch": 0.10457689555561445, "grad_norm": 69.0, "kl": 0.03691577911376953, "learning_rate": 5e-07, "logits/chosen": 3135535.5, "logits/rejected": -20936633.14285714, "logps/chosen": -26.308368682861328, "logps/rejected": -223.50765555245536, "loss": 0.2463, "rewards/chosen": 0.6001285910606384, "rewards/margins": 1.928768983909062, "rewards/rejected": -1.3286403928484236, "step": 1973 }, { "epoch": 0.10462989955741658, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47908076.8, "logits/rejected": -12971210.666666666, "logps/chosen": -261.63505859375, "logps/rejected": -309.7884521484375, "loss": 0.3832, "rewards/chosen": -0.03453307449817657, "rewards/margins": 1.5287540247042972, "rewards/rejected": -1.5632870992024739, "step": 1974 }, { "epoch": 0.10468290355921872, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20080137.333333332, "logits/rejected": -35961452.8, "logps/chosen": -363.080810546875, "logps/rejected": -427.97119140625, "loss": 0.2676, "rewards/chosen": 0.33365877469380695, "rewards/margins": 2.1398156960805257, "rewards/rejected": -1.8061569213867188, "step": 1975 }, { "epoch": 0.10473590756102086, "grad_norm": 55.5, "kl": 0.10835075378417969, "learning_rate": 5e-07, "logits/chosen": -31883002.0, "logits/rejected": -9636163.0, "logps/chosen": -450.43878173828125, "logps/rejected": -145.5540008544922, "loss": 0.3624, "rewards/chosen": 0.44001227617263794, "rewards/margins": 1.214207947254181, "rewards/rejected": -0.774195671081543, "step": 1976 }, { "epoch": 0.104788911562823, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17463842.0, "logits/rejected": -24120109.333333332, "logps/chosen": -216.13722229003906, "logps/rejected": -323.11024983723956, "loss": 0.2642, "rewards/chosen": 0.48566168546676636, "rewards/margins": 1.9085707863171895, "rewards/rejected": -1.422909100850423, "step": 1977 }, { "epoch": 0.10484191556462513, "grad_norm": 59.0, "kl": 0.037911415100097656, "learning_rate": 5e-07, "logits/chosen": -30859717.333333332, "logits/rejected": 539823.25, "logps/chosen": -336.12986246744794, "logps/rejected": -48.73450469970703, "loss": 0.4814, "rewards/chosen": -0.03771412372589111, "rewards/margins": 0.3570252060890198, "rewards/rejected": -0.3947393298149109, "step": 1978 }, { "epoch": 0.10489491956642727, "grad_norm": 55.0, "kl": 0.19480514526367188, "learning_rate": 5e-07, "logits/chosen": -26406738.666666668, "logits/rejected": -5208954.0, "logps/chosen": -342.244873046875, "logps/rejected": -106.72207641601562, "loss": 0.4122, "rewards/chosen": 0.05953490734100342, "rewards/margins": 1.6031193733215332, "rewards/rejected": -1.5435844659805298, "step": 1979 }, { "epoch": 0.1049479235682294, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29313870.0, "logits/rejected": -19715152.0, "logps/chosen": -361.8865966796875, "logps/rejected": -331.2179260253906, "loss": 0.3115, "rewards/chosen": 0.3148130476474762, "rewards/margins": 1.70757856965065, "rewards/rejected": -1.3927655220031738, "step": 1980 }, { "epoch": 0.10500092757003154, "grad_norm": 79.5, "kl": 0.7109718322753906, "learning_rate": 5e-07, "logits/chosen": -64088214.85714286, "logits/rejected": 2755128.0, "logps/chosen": -398.49741908482144, "logps/rejected": -9.653610229492188, "loss": 0.5089, "rewards/chosen": 0.03461467794009617, "rewards/margins": 0.00936271755823067, "rewards/rejected": 0.0252519603818655, "step": 1981 }, { "epoch": 0.10505393157183367, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54048883.2, "logits/rejected": -40639152.0, "logps/chosen": -411.5212890625, "logps/rejected": -293.4038899739583, "loss": 0.3149, "rewards/chosen": 0.5367462158203125, "rewards/margins": 1.862312348683675, "rewards/rejected": -1.3255661328633626, "step": 1982 }, { "epoch": 0.1051069355736358, "grad_norm": 51.25, "kl": 0.27668285369873047, "learning_rate": 5e-07, "logits/chosen": -56907584.0, "logits/rejected": 136918.25, "logps/chosen": -381.60589599609375, "logps/rejected": -168.0109405517578, "loss": 0.3536, "rewards/chosen": 0.17787933349609375, "rewards/margins": 1.3559991121292114, "rewards/rejected": -1.1781197786331177, "step": 1983 }, { "epoch": 0.10515993957543794, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31433746.0, "logits/rejected": 19920010.666666668, "logps/chosen": -454.2640686035156, "logps/rejected": -285.851806640625, "loss": 0.3261, "rewards/chosen": -0.28820961713790894, "rewards/margins": 1.0315681099891663, "rewards/rejected": -1.3197777271270752, "step": 1984 }, { "epoch": 0.10521294357724008, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16620846.666666666, "logits/rejected": -12755537.6, "logps/chosen": -238.7360636393229, "logps/rejected": -231.696826171875, "loss": 0.3871, "rewards/chosen": 0.024745052059491474, "rewards/margins": 0.8054425885279973, "rewards/rejected": -0.7806975364685058, "step": 1985 }, { "epoch": 0.10526594757904222, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 67285.25, "logits/rejected": -27954797.714285713, "logps/chosen": -245.46998596191406, "logps/rejected": -300.9037388392857, "loss": 0.2507, "rewards/chosen": 0.03031463734805584, "rewards/margins": 1.5614785341812032, "rewards/rejected": -1.5311638968331474, "step": 1986 }, { "epoch": 0.10531895158084435, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47923226.666666664, "logits/rejected": -18110147.2, "logps/chosen": -249.361572265625, "logps/rejected": -148.52451171875, "loss": 0.3669, "rewards/chosen": 0.020528793334960938, "rewards/margins": 1.0525211334228515, "rewards/rejected": -1.0319923400878905, "step": 1987 }, { "epoch": 0.10537195558264649, "grad_norm": 61.0, "kl": 0.3105888366699219, "learning_rate": 5e-07, "logits/chosen": -43346128.0, "logits/rejected": 11637054.666666666, "logps/chosen": -237.6746826171875, "logps/rejected": -445.2840983072917, "loss": 0.4076, "rewards/chosen": -0.052576905488967894, "rewards/margins": 1.221196287870407, "rewards/rejected": -1.273773193359375, "step": 1988 }, { "epoch": 0.10542495958444863, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11125834.0, "logits/rejected": -45894472.0, "logps/chosen": -193.96852111816406, "logps/rejected": -235.42010498046875, "loss": 0.3949, "rewards/chosen": -0.10342250019311905, "rewards/margins": 1.0676743015646935, "rewards/rejected": -1.1710968017578125, "step": 1989 }, { "epoch": 0.10547796358625076, "grad_norm": 54.5, "kl": 0.8251857757568359, "learning_rate": 5e-07, "logits/chosen": -25291644.0, "logits/rejected": 522997.5, "logps/chosen": -280.7915344238281, "logps/rejected": -107.94007110595703, "loss": 0.3853, "rewards/chosen": 0.3360404074192047, "rewards/margins": 0.9801453649997711, "rewards/rejected": -0.6441049575805664, "step": 1990 }, { "epoch": 0.1055309675880529, "grad_norm": 52.25, "kl": 0.7579765319824219, "learning_rate": 5e-07, "logits/chosen": -40818316.0, "logits/rejected": 1642592.0, "logps/chosen": -179.1298828125, "logps/rejected": -272.5846252441406, "loss": 0.3792, "rewards/chosen": -0.09234599769115448, "rewards/margins": 1.1945967227220535, "rewards/rejected": -1.286942720413208, "step": 1991 }, { "epoch": 0.10558397158985504, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23974832.0, "logits/rejected": -66905720.0, "logps/chosen": -183.5943806966146, "logps/rejected": -154.72799682617188, "loss": 0.423, "rewards/chosen": -0.06412477294603984, "rewards/margins": 1.4969679613908131, "rewards/rejected": -1.561092734336853, "step": 1992 }, { "epoch": 0.10563697559165718, "grad_norm": 52.5, "kl": 0.16771316528320312, "learning_rate": 5e-07, "logits/chosen": -2513575.0, "logits/rejected": -49787872.0, "logps/chosen": -240.38134765625, "logps/rejected": -362.2339680989583, "loss": 0.3014, "rewards/chosen": 0.5031259536743165, "rewards/margins": 2.187156327565511, "rewards/rejected": -1.6840303738911946, "step": 1993 }, { "epoch": 0.10568997959345931, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49834680.0, "logits/rejected": 10837116.0, "logps/chosen": -211.28041076660156, "logps/rejected": -274.41522216796875, "loss": 0.3673, "rewards/chosen": -0.09569653868675232, "rewards/margins": 1.365950495004654, "rewards/rejected": -1.4616470336914062, "step": 1994 }, { "epoch": 0.10574298359526144, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43131640.0, "logits/rejected": -37213872.0, "logps/chosen": -184.45314025878906, "logps/rejected": -366.6198323567708, "loss": 0.3059, "rewards/chosen": -0.033231355249881744, "rewards/margins": 1.1810407464702923, "rewards/rejected": -1.214272101720174, "step": 1995 }, { "epoch": 0.10579598759706357, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1638700.0, "logits/rejected": -1712680.0, "logps/chosen": -134.41960652669272, "logps/rejected": -239.471728515625, "loss": 0.2789, "rewards/chosen": 0.5300365289052328, "rewards/margins": 1.810578521092733, "rewards/rejected": -1.2805419921875, "step": 1996 }, { "epoch": 0.10584899159886571, "grad_norm": 65.0, "kl": 0.4351959228515625, "learning_rate": 5e-07, "logits/chosen": -54352640.0, "logits/rejected": -46505856.0, "logps/chosen": -616.856103515625, "logps/rejected": -360.4622395833333, "loss": 0.3384, "rewards/chosen": 0.4652684211730957, "rewards/margins": 2.1200799624125164, "rewards/rejected": -1.6548115412394206, "step": 1997 }, { "epoch": 0.10590199560066785, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10563709.333333334, "logits/rejected": -28429460.0, "logps/chosen": -84.63817850748698, "logps/rejected": -739.5034790039062, "loss": 0.3675, "rewards/chosen": 0.13694939017295837, "rewards/margins": 2.7265146672725677, "rewards/rejected": -2.5895652770996094, "step": 1998 }, { "epoch": 0.10595499960246998, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20007569.6, "logits/rejected": -130935168.0, "logps/chosen": -344.6678466796875, "logps/rejected": -669.4351806640625, "loss": 0.3244, "rewards/chosen": 0.13124458789825438, "rewards/margins": 2.8870461066563924, "rewards/rejected": -2.755801518758138, "step": 1999 }, { "epoch": 0.10600800360427212, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43383756.0, "logits/rejected": -16806720.0, "logps/chosen": -369.2165832519531, "logps/rejected": -241.7907257080078, "loss": 0.3464, "rewards/chosen": 0.24207726120948792, "rewards/margins": 1.548717886209488, "rewards/rejected": -1.306640625, "step": 2000 }, { "epoch": 0.10606100760607426, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12682998.0, "logits/rejected": -14124507.0, "logps/chosen": -99.33283996582031, "logps/rejected": -167.12017822265625, "loss": 0.3715, "rewards/chosen": 0.4020129442214966, "rewards/margins": 1.0787988901138306, "rewards/rejected": -0.676785945892334, "step": 2001 }, { "epoch": 0.1061140116078764, "grad_norm": 72.5, "kl": 0.3799896240234375, "learning_rate": 5e-07, "logits/chosen": -10353361.0, "logits/rejected": -19125620.0, "logps/chosen": -456.247802734375, "logps/rejected": -233.86764526367188, "loss": 0.2863, "rewards/chosen": 0.6418129205703735, "rewards/margins": 2.035075068473816, "rewards/rejected": -1.3932621479034424, "step": 2002 }, { "epoch": 0.10616701560967853, "grad_norm": 70.5, "kl": 0.44001007080078125, "learning_rate": 5e-07, "logits/chosen": -42610208.0, "logits/rejected": -6987884.0, "logps/chosen": -632.4588623046875, "logps/rejected": -481.3495788574219, "loss": 0.3412, "rewards/chosen": 0.3809623718261719, "rewards/margins": 1.8343905210494995, "rewards/rejected": -1.4534281492233276, "step": 2003 }, { "epoch": 0.10622001961148067, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27566352.0, "logits/rejected": -46058304.0, "logps/chosen": -191.06744384765625, "logps/rejected": -343.672607421875, "loss": 0.2887, "rewards/chosen": 0.05817928910255432, "rewards/margins": 1.5263457993666332, "rewards/rejected": -1.4681665102640789, "step": 2004 }, { "epoch": 0.10627302361328281, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25667044.8, "logits/rejected": -33470565.333333332, "logps/chosen": -211.256591796875, "logps/rejected": -544.2440592447916, "loss": 0.3741, "rewards/chosen": -0.20457167625427247, "rewards/margins": 3.327957518895467, "rewards/rejected": -3.5325291951497397, "step": 2005 }, { "epoch": 0.10632602761508494, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27209142.0, "logits/rejected": -47026692.0, "logps/chosen": -305.1683044433594, "logps/rejected": -524.6376953125, "loss": 0.236, "rewards/chosen": 0.5557651519775391, "rewards/margins": 3.020296812057495, "rewards/rejected": -2.464531660079956, "step": 2006 }, { "epoch": 0.10637903161688707, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8628926.666666666, "logits/rejected": -26474992.0, "logps/chosen": -182.5042724609375, "logps/rejected": -266.5624755859375, "loss": 0.328, "rewards/chosen": -0.3188622792561849, "rewards/margins": 1.3915362040201824, "rewards/rejected": -1.7103984832763672, "step": 2007 }, { "epoch": 0.1064320356186892, "grad_norm": 54.5, "kl": 0.023326873779296875, "learning_rate": 5e-07, "logits/chosen": -20760290.666666668, "logits/rejected": -17612168.0, "logps/chosen": -177.93583170572916, "logps/rejected": -139.255859375, "loss": 0.3636, "rewards/chosen": 0.05192458629608154, "rewards/margins": 1.0141520261764527, "rewards/rejected": -0.9622274398803711, "step": 2008 }, { "epoch": 0.10648503962049134, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9672086.666666666, "logits/rejected": -21538950.4, "logps/chosen": -222.7884521484375, "logps/rejected": -490.90498046875, "loss": 0.2791, "rewards/chosen": 0.09534912308057149, "rewards/margins": 1.8945802708466848, "rewards/rejected": -1.7992311477661134, "step": 2009 }, { "epoch": 0.10653804362229348, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22553570.666666668, "logits/rejected": -50795960.0, "logps/chosen": -287.53729248046875, "logps/rejected": -417.516357421875, "loss": 0.3791, "rewards/chosen": 0.15424365798632303, "rewards/margins": 2.2849001387755075, "rewards/rejected": -2.1306564807891846, "step": 2010 }, { "epoch": 0.10659104762409562, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65046132.0, "logits/rejected": -10378040.0, "logps/chosen": -403.1285400390625, "logps/rejected": -282.4960414341518, "loss": 0.3583, "rewards/chosen": 0.37744140625, "rewards/margins": 1.0257393973214286, "rewards/rejected": -0.6482979910714286, "step": 2011 }, { "epoch": 0.10664405162589775, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11070140.0, "logits/rejected": -31016802.666666668, "logps/chosen": -172.30406188964844, "logps/rejected": -329.68568929036456, "loss": 0.2154, "rewards/chosen": 0.5574424862861633, "rewards/margins": 2.283178945382436, "rewards/rejected": -1.7257364590962727, "step": 2012 }, { "epoch": 0.10669705562769989, "grad_norm": 53.25, "kl": 0.5029220581054688, "learning_rate": 5e-07, "logits/chosen": -23678858.666666668, "logits/rejected": -12774193.0, "logps/chosen": -261.678955078125, "logps/rejected": -94.59842681884766, "loss": 0.4652, "rewards/chosen": -0.03726510206858317, "rewards/margins": 1.0161593755086262, "rewards/rejected": -1.0534244775772095, "step": 2013 }, { "epoch": 0.10675005962950203, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42444242.666666664, "logits/rejected": -40390304.0, "logps/chosen": -91.14060465494792, "logps/rejected": -437.86212158203125, "loss": 0.4151, "rewards/chosen": 0.022163107991218567, "rewards/margins": 1.559011533856392, "rewards/rejected": -1.5368484258651733, "step": 2014 }, { "epoch": 0.10680306363130417, "grad_norm": 52.75, "kl": 0.19144630432128906, "learning_rate": 5e-07, "logits/chosen": -37244356.0, "logits/rejected": -11474240.0, "logps/chosen": -297.24749755859375, "logps/rejected": -259.86883544921875, "loss": 0.2999, "rewards/chosen": 0.4603075087070465, "rewards/margins": 1.9180858433246613, "rewards/rejected": -1.4577783346176147, "step": 2015 }, { "epoch": 0.1068560676331063, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16685356.8, "logits/rejected": -14205278.666666666, "logps/chosen": -493.13994140625, "logps/rejected": -243.00370279947916, "loss": 0.437, "rewards/chosen": 0.036945194005966187, "rewards/margins": 0.9506697754065195, "rewards/rejected": -0.9137245814005533, "step": 2016 }, { "epoch": 0.10690907163490844, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9016519.0, "logits/rejected": -48405560.0, "logps/chosen": -232.11569213867188, "logps/rejected": -315.0914306640625, "loss": 0.3218, "rewards/chosen": 0.31785839796066284, "rewards/margins": 1.837492048740387, "rewards/rejected": -1.5196336507797241, "step": 2017 }, { "epoch": 0.10696207563671058, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9149978.0, "logits/rejected": -27175337.14285714, "logps/chosen": -31.059438705444336, "logps/rejected": -246.50399344308036, "loss": 0.2541, "rewards/chosen": 0.05039329454302788, "rewards/margins": 1.4039522436048304, "rewards/rejected": -1.3535589490618025, "step": 2018 }, { "epoch": 0.10701507963851271, "grad_norm": 68.5, "kl": 0.3373584747314453, "learning_rate": 5e-07, "logits/chosen": -15245977.6, "logits/rejected": -28971445.333333332, "logps/chosen": -329.8895751953125, "logps/rejected": -254.5194295247396, "loss": 0.416, "rewards/chosen": 0.030645525455474852, "rewards/margins": 1.0456287026405335, "rewards/rejected": -1.0149831771850586, "step": 2019 }, { "epoch": 0.10706808364031484, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32397986.0, "logits/rejected": -14523162.666666666, "logps/chosen": -255.4464569091797, "logps/rejected": -175.0217488606771, "loss": 0.3734, "rewards/chosen": -0.401048868894577, "rewards/margins": 0.4680939018726349, "rewards/rejected": -0.8691427707672119, "step": 2020 }, { "epoch": 0.10712108764211697, "grad_norm": 47.25, "kl": 0.2137432098388672, "learning_rate": 5e-07, "logits/chosen": -40472083.2, "logits/rejected": -5730986.666666667, "logps/chosen": -307.456689453125, "logps/rejected": -159.29338582356772, "loss": 0.3553, "rewards/chosen": 0.14167803525924683, "rewards/margins": 1.830237090587616, "rewards/rejected": -1.6885590553283691, "step": 2021 }, { "epoch": 0.10717409164391911, "grad_norm": 52.5, "kl": 0.204925537109375, "learning_rate": 5e-07, "logits/chosen": -36391773.333333336, "logits/rejected": -19132192.0, "logps/chosen": -192.869873046875, "logps/rejected": -221.87060546875, "loss": 0.3592, "rewards/chosen": 0.030262380838394165, "rewards/margins": 1.015132623910904, "rewards/rejected": -0.9848702430725098, "step": 2022 }, { "epoch": 0.10722709564572125, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43146067.2, "logits/rejected": -74326800.0, "logps/chosen": -476.466357421875, "logps/rejected": -283.60487874348956, "loss": 0.3597, "rewards/chosen": 0.27710235118865967, "rewards/margins": 1.4679359197616577, "rewards/rejected": -1.190833568572998, "step": 2023 }, { "epoch": 0.10728009964752339, "grad_norm": 63.0, "kl": 0.4894981384277344, "learning_rate": 5e-07, "logits/chosen": -23886486.4, "logits/rejected": -37419861.333333336, "logps/chosen": -409.014990234375, "logps/rejected": -435.7210286458333, "loss": 0.2904, "rewards/chosen": 0.45818185806274414, "rewards/margins": 2.8656363487243652, "rewards/rejected": -2.407454490661621, "step": 2024 }, { "epoch": 0.10733310364932552, "grad_norm": 58.25, "kl": 0.4750823974609375, "learning_rate": 5e-07, "logits/chosen": -12784625.6, "logits/rejected": -18088992.0, "logps/chosen": -233.250830078125, "logps/rejected": -370.078857421875, "loss": 0.3775, "rewards/chosen": 0.14542075395584106, "rewards/margins": 1.5190077265103656, "rewards/rejected": -1.3735869725545247, "step": 2025 }, { "epoch": 0.10738610765112766, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18067130.0, "logits/rejected": -17645496.0, "logps/chosen": -99.76921081542969, "logps/rejected": -212.93194580078125, "loss": 0.2962, "rewards/chosen": 0.23485717177391052, "rewards/margins": 1.4920946856339772, "rewards/rejected": -1.2572375138600667, "step": 2026 }, { "epoch": 0.1074391116529298, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4572023.2, "logits/rejected": 11870453.333333334, "logps/chosen": -212.73916015625, "logps/rejected": -301.24167887369794, "loss": 0.3678, "rewards/chosen": 0.19187141656875611, "rewards/margins": 1.5210489551226298, "rewards/rejected": -1.3291775385538738, "step": 2027 }, { "epoch": 0.10749211565473193, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34419704.0, "logits/rejected": -10910329.6, "logps/chosen": -445.4339192708333, "logps/rejected": -603.925, "loss": 0.2544, "rewards/chosen": 0.29014766216278076, "rewards/margins": 2.14887855052948, "rewards/rejected": -1.8587308883666993, "step": 2028 }, { "epoch": 0.10754511965653407, "grad_norm": 44.75, "kl": 0.2562522888183594, "learning_rate": 5e-07, "logits/chosen": -1359790.1666666667, "logits/rejected": -10654614.4, "logps/chosen": -158.54632568359375, "logps/rejected": -100.80247802734375, "loss": 0.3656, "rewards/chosen": 0.1346557637055715, "rewards/margins": 1.1122303028901417, "rewards/rejected": -0.9775745391845703, "step": 2029 }, { "epoch": 0.10759812365833621, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62029928.0, "logits/rejected": -1756112.6666666667, "logps/chosen": -537.9368896484375, "logps/rejected": -277.4330647786458, "loss": 0.3354, "rewards/chosen": 0.3330276608467102, "rewards/margins": 1.2043618957201638, "rewards/rejected": -0.8713342348734537, "step": 2030 }, { "epoch": 0.10765112766013835, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19540906.0, "logits/rejected": -25091304.0, "logps/chosen": -288.8680419921875, "logps/rejected": -203.1826171875, "loss": 0.3905, "rewards/chosen": -0.2878515422344208, "rewards/margins": 1.25001659989357, "rewards/rejected": -1.5378681421279907, "step": 2031 }, { "epoch": 0.10770413166194048, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1507706.25, "logits/rejected": -19202315.42857143, "logps/chosen": -33.86305618286133, "logps/rejected": -216.19515555245536, "loss": 0.2256, "rewards/chosen": 0.15599556267261505, "rewards/margins": 2.0081978418997357, "rewards/rejected": -1.8522022792271204, "step": 2032 }, { "epoch": 0.1077571356637426, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32685173.333333332, "logits/rejected": -22967670.4, "logps/chosen": -316.97064208984375, "logps/rejected": -449.2572265625, "loss": 0.3283, "rewards/chosen": -0.31770894924799603, "rewards/margins": 1.2249375542004903, "rewards/rejected": -1.5426465034484864, "step": 2033 }, { "epoch": 0.10781013966554474, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8891077.0, "logits/rejected": 4078052.0, "logps/chosen": -339.5985412597656, "logps/rejected": -181.56234741210938, "loss": 0.3811, "rewards/chosen": -0.06063079833984375, "rewards/margins": 1.0103532075881958, "rewards/rejected": -1.0709840059280396, "step": 2034 }, { "epoch": 0.10786314366734688, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56681292.8, "logits/rejected": -27546245.333333332, "logps/chosen": -361.80390625, "logps/rejected": -216.72273763020834, "loss": 0.3965, "rewards/chosen": 0.10939680337905884, "rewards/margins": 1.438852894306183, "rewards/rejected": -1.329456090927124, "step": 2035 }, { "epoch": 0.10791614766914902, "grad_norm": 59.0, "kl": 0.12318038940429688, "learning_rate": 5e-07, "logits/chosen": -46792762.666666664, "logits/rejected": -3652917.0, "logps/chosen": -362.3044840494792, "logps/rejected": -187.7936248779297, "loss": 0.4116, "rewards/chosen": 0.1193468968073527, "rewards/margins": 1.669095555941264, "rewards/rejected": -1.5497486591339111, "step": 2036 }, { "epoch": 0.10796915167095116, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43861920.0, "logits/rejected": -29883378.285714287, "logps/chosen": -1168.621826171875, "logps/rejected": -278.73685128348217, "loss": 0.2063, "rewards/chosen": 1.46527099609375, "rewards/margins": 2.875984328133719, "rewards/rejected": -1.4107133320399694, "step": 2037 }, { "epoch": 0.10802215567275329, "grad_norm": 56.25, "kl": 0.16376876831054688, "learning_rate": 5e-07, "logits/chosen": -6344349.0, "logits/rejected": -69604440.0, "logps/chosen": -313.3403015136719, "logps/rejected": -347.8231201171875, "loss": 0.3023, "rewards/chosen": 0.4988643527030945, "rewards/margins": 2.104653537273407, "rewards/rejected": -1.6057891845703125, "step": 2038 }, { "epoch": 0.10807515967455543, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53895808.0, "logits/rejected": -35140000.0, "logps/chosen": -581.5145670572916, "logps/rejected": -357.7935791015625, "loss": 0.3762, "rewards/chosen": -0.5496268272399902, "rewards/margins": 0.8151543617248536, "rewards/rejected": -1.3647811889648438, "step": 2039 }, { "epoch": 0.10812816367635757, "grad_norm": 53.25, "kl": 0.3197298049926758, "learning_rate": 5e-07, "logits/chosen": -26166256.0, "logits/rejected": -63370092.0, "logps/chosen": -204.81109619140625, "logps/rejected": -446.15533447265625, "loss": 0.3057, "rewards/chosen": 0.3276112675666809, "rewards/margins": 1.94216650724411, "rewards/rejected": -1.6145552396774292, "step": 2040 }, { "epoch": 0.1081811676781597, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31090468.0, "logits/rejected": 8394226.0, "logps/chosen": -195.37451171875, "logps/rejected": -227.7006378173828, "loss": 0.3522, "rewards/chosen": 0.3752167820930481, "rewards/margins": 1.3273306488990784, "rewards/rejected": -0.9521138668060303, "step": 2041 }, { "epoch": 0.10823417167996184, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24885508.0, "logits/rejected": -5666209.0, "logps/chosen": -179.84921264648438, "logps/rejected": -518.5712890625, "loss": 0.3207, "rewards/chosen": 0.22552800178527832, "rewards/margins": 1.8629624843597412, "rewards/rejected": -1.637434482574463, "step": 2042 }, { "epoch": 0.10828717568176398, "grad_norm": 81.0, "kl": 0.3309974670410156, "learning_rate": 5e-07, "logits/chosen": -64665952.0, "logits/rejected": -43998240.0, "logps/chosen": -421.3219807942708, "logps/rejected": -351.7988586425781, "loss": 0.4544, "rewards/chosen": 0.05676977833112081, "rewards/margins": 0.8139048119386038, "rewards/rejected": -0.7571350336074829, "step": 2043 }, { "epoch": 0.10834017968356612, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40339564.8, "logits/rejected": -17736760.0, "logps/chosen": -521.94482421875, "logps/rejected": -263.994873046875, "loss": 0.3785, "rewards/chosen": 0.2684506416320801, "rewards/margins": 1.3273982048034667, "rewards/rejected": -1.0589475631713867, "step": 2044 }, { "epoch": 0.10839318368536824, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7325614.4, "logits/rejected": -27862560.0, "logps/chosen": -123.31143798828126, "logps/rejected": -269.00990804036456, "loss": 0.4036, "rewards/chosen": -0.2515327215194702, "rewards/margins": 1.5931520382563273, "rewards/rejected": -1.8446847597757976, "step": 2045 }, { "epoch": 0.10844618768717038, "grad_norm": 64.0, "kl": 0.19979476928710938, "learning_rate": 5e-07, "logits/chosen": -59968053.333333336, "logits/rejected": -19828420.0, "logps/chosen": -390.0958658854167, "logps/rejected": -545.0108642578125, "loss": 0.3586, "rewards/chosen": 0.2060066262880961, "rewards/margins": 3.310895542303721, "rewards/rejected": -3.104888916015625, "step": 2046 }, { "epoch": 0.10849919168897251, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18509389.333333332, "logits/rejected": -31998112.0, "logps/chosen": -288.5312093098958, "logps/rejected": -521.11357421875, "loss": 0.2333, "rewards/chosen": 0.40115880966186523, "rewards/margins": 2.838552379608154, "rewards/rejected": -2.437393569946289, "step": 2047 }, { "epoch": 0.10855219569077465, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13676099.0, "logits/rejected": -17890452.0, "logps/chosen": -155.4196319580078, "logps/rejected": -372.82391357421875, "loss": 0.3416, "rewards/chosen": 0.03870755434036255, "rewards/margins": 1.595862090587616, "rewards/rejected": -1.5571545362472534, "step": 2048 }, { "epoch": 0.10860519969257679, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69790837.33333333, "logits/rejected": -40235635.2, "logps/chosen": -256.0989583333333, "logps/rejected": -467.124951171875, "loss": 0.2743, "rewards/chosen": 0.14021402597427368, "rewards/margins": 2.0316086411476135, "rewards/rejected": -1.8913946151733398, "step": 2049 }, { "epoch": 0.10865820369437892, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22606638.4, "logits/rejected": -19153854.666666668, "logps/chosen": -264.7482421875, "logps/rejected": -269.2620035807292, "loss": 0.3957, "rewards/chosen": 0.09035762548446655, "rewards/margins": 1.190186822414398, "rewards/rejected": -1.0998291969299316, "step": 2050 }, { "epoch": 0.10871120769618106, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31650832.0, "logits/rejected": -6341188.0, "logps/chosen": -332.3050537109375, "logps/rejected": -172.14656575520834, "loss": 0.4319, "rewards/chosen": -0.19584168195724488, "rewards/margins": 1.2141732652982076, "rewards/rejected": -1.4100149472554524, "step": 2051 }, { "epoch": 0.1087642116979832, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6529000.0, "logits/rejected": -15637564.0, "logps/chosen": -124.85523986816406, "logps/rejected": -243.13653564453125, "loss": 0.3955, "rewards/chosen": -0.09098415076732635, "rewards/margins": 0.9362668544054031, "rewards/rejected": -1.0272510051727295, "step": 2052 }, { "epoch": 0.10881721569978534, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4192435.6666666665, "logits/rejected": -30547481.6, "logps/chosen": -59.45159403483073, "logps/rejected": -461.700341796875, "loss": 0.3715, "rewards/chosen": -0.6082009474436442, "rewards/margins": 0.8946372826894126, "rewards/rejected": -1.5028382301330567, "step": 2053 }, { "epoch": 0.10887021970158747, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41832106.666666664, "logits/rejected": -12060573.6, "logps/chosen": -566.8291829427084, "logps/rejected": -296.0508056640625, "loss": 0.2569, "rewards/chosen": 0.5948695341746012, "rewards/margins": 2.106534210840861, "rewards/rejected": -1.5116646766662598, "step": 2054 }, { "epoch": 0.10892322370338961, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17142200.0, "logits/rejected": -14262243.0, "logps/chosen": -141.2661895751953, "logps/rejected": -150.77902221679688, "loss": 0.3331, "rewards/chosen": 0.20507365465164185, "rewards/margins": 1.5432459712028503, "rewards/rejected": -1.3381723165512085, "step": 2055 }, { "epoch": 0.10897622770519175, "grad_norm": 54.25, "kl": 0.051364898681640625, "learning_rate": 5e-07, "logits/chosen": -28070909.333333332, "logits/rejected": -53151000.0, "logps/chosen": -328.6811930338542, "logps/rejected": -261.34368896484375, "loss": 0.4129, "rewards/chosen": -0.10366706053415935, "rewards/margins": 2.234289367993673, "rewards/rejected": -2.337956428527832, "step": 2056 }, { "epoch": 0.10902923170699388, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48581008.0, "logits/rejected": -62328916.0, "logps/chosen": -242.3163045247396, "logps/rejected": -449.66900634765625, "loss": 0.4105, "rewards/chosen": 0.06602297226587932, "rewards/margins": 1.529898504416148, "rewards/rejected": -1.4638755321502686, "step": 2057 }, { "epoch": 0.10908223570879601, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5103168.5, "logits/rejected": -4729653.714285715, "logps/chosen": -79.03736114501953, "logps/rejected": -266.7182094029018, "loss": 0.2781, "rewards/chosen": 0.05999755859375, "rewards/margins": 1.2429512568882533, "rewards/rejected": -1.1829536982945033, "step": 2058 }, { "epoch": 0.10913523971059814, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24629616.0, "logits/rejected": -45405480.0, "logps/chosen": -122.88395690917969, "logps/rejected": -433.73236083984375, "loss": 0.3148, "rewards/chosen": 0.18934670090675354, "rewards/margins": 1.9582069218158722, "rewards/rejected": -1.7688602209091187, "step": 2059 }, { "epoch": 0.10918824371240028, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3170349.6666666665, "logits/rejected": -1918221.6, "logps/chosen": -153.92998250325522, "logps/rejected": -89.84376831054688, "loss": 0.3085, "rewards/chosen": 0.23961450656255087, "rewards/margins": 1.6264103134473165, "rewards/rejected": -1.3867958068847657, "step": 2060 }, { "epoch": 0.10924124771420242, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1801145.3333333333, "logits/rejected": -50888384.0, "logps/chosen": -162.8277384440104, "logps/rejected": -201.73917236328126, "loss": 0.3086, "rewards/chosen": 0.5109560489654541, "rewards/margins": 1.578841257095337, "rewards/rejected": -1.0678852081298829, "step": 2061 }, { "epoch": 0.10929425171600456, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22555344.0, "logits/rejected": -45143973.333333336, "logps/chosen": -354.301025390625, "logps/rejected": -421.1146647135417, "loss": 0.2716, "rewards/chosen": 0.013611221686005592, "rewards/margins": 1.8128150943666697, "rewards/rejected": -1.799203872680664, "step": 2062 }, { "epoch": 0.1093472557178067, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1240940.1, "logits/rejected": -30683488.0, "logps/chosen": -386.53095703125, "logps/rejected": -405.4302164713542, "loss": 0.3735, "rewards/chosen": 0.07147949934005737, "rewards/margins": 1.5581496755282085, "rewards/rejected": -1.4866701761881511, "step": 2063 }, { "epoch": 0.10940025971960883, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31078520.0, "logits/rejected": -18166166.4, "logps/chosen": -342.6656901041667, "logps/rejected": -260.8669189453125, "loss": 0.3107, "rewards/chosen": 0.2527809143066406, "rewards/margins": 1.5626278877258302, "rewards/rejected": -1.3098469734191895, "step": 2064 }, { "epoch": 0.10945326372141097, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15022421.333333334, "logits/rejected": -30039376.0, "logps/chosen": -297.1570231119792, "logps/rejected": -538.793603515625, "loss": 0.2357, "rewards/chosen": 0.37027029196421307, "rewards/margins": 2.494866315523783, "rewards/rejected": -2.1245960235595702, "step": 2065 }, { "epoch": 0.1095062677232131, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64300656.0, "logits/rejected": -12179968.0, "logps/chosen": -431.43133544921875, "logps/rejected": -269.268798828125, "loss": 0.3522, "rewards/chosen": 0.10480032116174698, "rewards/margins": 1.429932214319706, "rewards/rejected": -1.325131893157959, "step": 2066 }, { "epoch": 0.10955927172501524, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66956832.0, "logits/rejected": -57532732.0, "logps/chosen": -541.2432250976562, "logps/rejected": -500.4464416503906, "loss": 0.2928, "rewards/chosen": 0.009292113594710827, "rewards/margins": 2.8895938275381923, "rewards/rejected": -2.8803017139434814, "step": 2067 }, { "epoch": 0.10961227572681738, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63480420.0, "logits/rejected": -14788364.0, "logps/chosen": -444.0916442871094, "logps/rejected": -207.5603485107422, "loss": 0.3027, "rewards/chosen": 0.4435691833496094, "rewards/margins": 1.9041917324066162, "rewards/rejected": -1.4606225490570068, "step": 2068 }, { "epoch": 0.10966527972861952, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33433688.0, "logits/rejected": -19391352.0, "logps/chosen": -460.0362854003906, "logps/rejected": -413.5418701171875, "loss": 0.2926, "rewards/chosen": 0.21254807710647583, "rewards/margins": 2.41661673784256, "rewards/rejected": -2.204068660736084, "step": 2069 }, { "epoch": 0.10971828373042165, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82413896.0, "logits/rejected": -42536458.666666664, "logps/chosen": -269.87371826171875, "logps/rejected": -347.9474283854167, "loss": 0.3068, "rewards/chosen": -0.2611702084541321, "rewards/margins": 1.041190246740977, "rewards/rejected": -1.3023604551951091, "step": 2070 }, { "epoch": 0.10977128773222378, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33357644.8, "logits/rejected": -30134085.333333332, "logps/chosen": -223.67041015625, "logps/rejected": -118.44528198242188, "loss": 0.4783, "rewards/chosen": -0.1689221143722534, "rewards/margins": 0.3676938931147257, "rewards/rejected": -0.5366160074869791, "step": 2071 }, { "epoch": 0.10982429173402591, "grad_norm": 45.25, "kl": 0.6306695938110352, "learning_rate": 5e-07, "logits/chosen": -25243818.666666668, "logits/rejected": -3504876.0, "logps/chosen": -184.72049967447916, "logps/rejected": -279.107666015625, "loss": 0.4626, "rewards/chosen": -0.221457839012146, "rewards/margins": 1.8046211004257202, "rewards/rejected": -2.026078939437866, "step": 2072 }, { "epoch": 0.10987729573582805, "grad_norm": 56.0, "kl": 0.7693061828613281, "learning_rate": 5e-07, "logits/chosen": -9872343.333333334, "logits/rejected": -30729088.0, "logps/chosen": -663.664794921875, "logps/rejected": -91.729931640625, "loss": 0.3522, "rewards/chosen": 0.5664851268132528, "rewards/margins": 1.2539059718449912, "rewards/rejected": -0.6874208450317383, "step": 2073 }, { "epoch": 0.10993029973763019, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31821386.666666668, "logits/rejected": -16520334.4, "logps/chosen": -175.5901082356771, "logps/rejected": -263.299267578125, "loss": 0.323, "rewards/chosen": -0.20877609650293985, "rewards/margins": 1.2872750242551168, "rewards/rejected": -1.4960511207580567, "step": 2074 }, { "epoch": 0.10998330373943233, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50024776.0, "logits/rejected": -49414584.0, "logps/chosen": -313.64483642578125, "logps/rejected": -635.2391357421875, "loss": 0.2734, "rewards/chosen": 0.28290098905563354, "rewards/margins": 2.623753249645233, "rewards/rejected": -2.3408522605895996, "step": 2075 }, { "epoch": 0.11003630774123446, "grad_norm": 66.5, "kl": 0.547454833984375, "learning_rate": 5e-07, "logits/chosen": -84385136.0, "logits/rejected": -35531764.0, "logps/chosen": -699.2532958984375, "logps/rejected": -399.985107421875, "loss": 0.3236, "rewards/chosen": 0.19561979174613953, "rewards/margins": 1.9965698421001434, "rewards/rejected": -1.800950050354004, "step": 2076 }, { "epoch": 0.1100893117430366, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12554313.333333334, "logits/rejected": -26328723.2, "logps/chosen": -187.9582316080729, "logps/rejected": -294.526708984375, "loss": 0.3846, "rewards/chosen": -0.11625925699869792, "rewards/margins": 0.8991372426350911, "rewards/rejected": -1.015396499633789, "step": 2077 }, { "epoch": 0.11014231574483874, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10573531.333333334, "logits/rejected": -32398528.0, "logps/chosen": -275.2423909505208, "logps/rejected": -480.0451171875, "loss": 0.3287, "rewards/chosen": 0.050451661149660744, "rewards/margins": 1.4461438188950222, "rewards/rejected": -1.3956921577453614, "step": 2078 }, { "epoch": 0.11019531974664087, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34539092.0, "logits/rejected": -12146530.0, "logps/chosen": -173.17379760742188, "logps/rejected": -175.93374633789062, "loss": 0.3718, "rewards/chosen": 0.16217827796936035, "rewards/margins": 1.181829810142517, "rewards/rejected": -1.0196515321731567, "step": 2079 }, { "epoch": 0.11024832374844301, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26593973.333333332, "logits/rejected": 24323436.0, "logps/chosen": -233.8062947591146, "logps/rejected": -310.6376953125, "loss": 0.3595, "rewards/chosen": 0.29293161630630493, "rewards/margins": 2.5189478993415833, "rewards/rejected": -2.2260162830352783, "step": 2080 }, { "epoch": 0.11030132775024515, "grad_norm": 76.0, "kl": 0.4036979675292969, "learning_rate": 5e-07, "logits/chosen": -29757676.0, "logits/rejected": -3172784.0, "logps/chosen": -293.6062316894531, "logps/rejected": -399.40277099609375, "loss": 0.3286, "rewards/chosen": 0.01976761966943741, "rewards/margins": 1.979465700685978, "rewards/rejected": -1.9596980810165405, "step": 2081 }, { "epoch": 0.11035433175204729, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18474210.666666668, "logits/rejected": -9771436.0, "logps/chosen": -232.112548828125, "logps/rejected": -309.355224609375, "loss": 0.4126, "rewards/chosen": -0.0739964097738266, "rewards/margins": 2.7878624349832535, "rewards/rejected": -2.86185884475708, "step": 2082 }, { "epoch": 0.11040733575384941, "grad_norm": 65.0, "kl": 0.7132329940795898, "learning_rate": 5e-07, "logits/chosen": -21473101.714285713, "logits/rejected": 6588679.0, "logps/chosen": -466.179931640625, "logps/rejected": -10.747617721557617, "loss": 0.4335, "rewards/chosen": 0.4140044961656843, "rewards/margins": 0.31545369965689524, "rewards/rejected": 0.09855079650878906, "step": 2083 }, { "epoch": 0.11046033975565155, "grad_norm": 80.0, "kl": 0.49140167236328125, "learning_rate": 5e-07, "logits/chosen": -3345467.6666666665, "logits/rejected": 35254096.0, "logps/chosen": -401.8396809895833, "logps/rejected": -574.5672607421875, "loss": 0.4923, "rewards/chosen": -0.19975878794987997, "rewards/margins": 0.7805497447649637, "rewards/rejected": -0.9803085327148438, "step": 2084 }, { "epoch": 0.11051334375745368, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43177024.0, "logits/rejected": -10999400.0, "logps/chosen": -502.2985026041667, "logps/rejected": -770.009765625, "loss": 0.3979, "rewards/chosen": -0.08117265502611797, "rewards/margins": 3.3807566662629447, "rewards/rejected": -3.4619293212890625, "step": 2085 }, { "epoch": 0.11056634775925582, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58143624.0, "logits/rejected": -10575748.0, "logps/chosen": -572.213134765625, "logps/rejected": -237.48347981770834, "loss": 0.2124, "rewards/chosen": 0.5502716302871704, "rewards/margins": 2.3154178857803345, "rewards/rejected": -1.765146255493164, "step": 2086 }, { "epoch": 0.11061935176105796, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25395300.8, "logits/rejected": -18768588.0, "logps/chosen": -131.41055908203126, "logps/rejected": -164.56376139322916, "loss": 0.4552, "rewards/chosen": -0.16140143871307372, "rewards/margins": 0.671305807431539, "rewards/rejected": -0.8327072461446127, "step": 2087 }, { "epoch": 0.1106723557628601, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14197169.333333334, "logits/rejected": -10451807.0, "logps/chosen": -213.0828857421875, "logps/rejected": -261.7042236328125, "loss": 0.4027, "rewards/chosen": 0.04093740383783976, "rewards/margins": 1.9480137626330059, "rewards/rejected": -1.907076358795166, "step": 2088 }, { "epoch": 0.11072535976466223, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": 8850974.0, "logps/rejected": -304.4156494140625, "loss": 0.131, "rewards/rejected": -1.9814996719360352, "step": 2089 }, { "epoch": 0.11077836376646437, "grad_norm": 64.5, "kl": 0.18500137329101562, "learning_rate": 5e-07, "logits/chosen": -24562899.2, "logits/rejected": -19570866.666666668, "logps/chosen": -338.5441162109375, "logps/rejected": -220.0238037109375, "loss": 0.3778, "rewards/chosen": 0.09805488586425781, "rewards/margins": 1.487056573232015, "rewards/rejected": -1.389001687367757, "step": 2090 }, { "epoch": 0.1108313677682665, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39538596.0, "logits/rejected": -29015227.42857143, "logps/chosen": -867.5399169921875, "logps/rejected": -354.9825962611607, "loss": 0.1782, "rewards/chosen": 1.7964661121368408, "rewards/margins": 3.4820639405931746, "rewards/rejected": -1.6855978284563338, "step": 2091 }, { "epoch": 0.11088437177006864, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6968530.5, "logits/rejected": -10216876.0, "logps/chosen": -240.8273468017578, "logps/rejected": -311.4447021484375, "loss": 0.2988, "rewards/chosen": 0.4149215817451477, "rewards/margins": 2.1405444741249084, "rewards/rejected": -1.7256228923797607, "step": 2092 }, { "epoch": 0.11093737577187078, "grad_norm": 56.0, "kl": 0.9945526123046875, "learning_rate": 5e-07, "logits/chosen": -32477420.8, "logits/rejected": -36483557.333333336, "logps/chosen": -367.903271484375, "logps/rejected": -324.2755126953125, "loss": 0.3763, "rewards/chosen": 0.28964478969573976, "rewards/margins": 1.9066720724105835, "rewards/rejected": -1.6170272827148438, "step": 2093 }, { "epoch": 0.11099037977367292, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34986904.0, "logits/rejected": 5233417.0, "logps/chosen": -185.59974670410156, "logps/rejected": -379.4660339355469, "loss": 0.3844, "rewards/chosen": -0.2820814847946167, "rewards/margins": 1.2560113668441772, "rewards/rejected": -1.538092851638794, "step": 2094 }, { "epoch": 0.11104338377547505, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33121026.666666668, "logits/rejected": -50323376.0, "logps/chosen": -542.5864664713541, "logps/rejected": -358.971435546875, "loss": 0.3676, "rewards/chosen": 0.35893527666727704, "rewards/margins": 2.2334866921106973, "rewards/rejected": -1.8745514154434204, "step": 2095 }, { "epoch": 0.11109638777727718, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16367168.0, "logits/rejected": 2076498.6666666667, "logps/chosen": -392.75634765625, "logps/rejected": -152.43574015299478, "loss": 0.4021, "rewards/chosen": 0.212851881980896, "rewards/margins": 1.004742980003357, "rewards/rejected": -0.7918910980224609, "step": 2096 }, { "epoch": 0.11114939177907932, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34935165.333333336, "logits/rejected": 5153352.5, "logps/chosen": -263.891357421875, "logps/rejected": -41.55881881713867, "loss": 0.4542, "rewards/chosen": 0.07162157694498698, "rewards/margins": 0.5815377632776896, "rewards/rejected": -0.5099161863327026, "step": 2097 }, { "epoch": 0.11120239578088145, "grad_norm": 53.0, "kl": 0.0952911376953125, "learning_rate": 5e-07, "logits/chosen": -34420684.8, "logits/rejected": -2149912.0, "logps/chosen": -272.5915771484375, "logps/rejected": -282.5821533203125, "loss": 0.3754, "rewards/chosen": 0.128787624835968, "rewards/margins": 1.5026546597480774, "rewards/rejected": -1.3738670349121094, "step": 2098 }, { "epoch": 0.11125539978268359, "grad_norm": 59.25, "kl": 1.8514728546142578, "learning_rate": 5e-07, "logits/chosen": -7662492.666666667, "logits/rejected": -48635814.4, "logps/chosen": -546.8630777994791, "logps/rejected": -412.157861328125, "loss": 0.3525, "rewards/chosen": 0.2955212990442912, "rewards/margins": 1.7506591240564984, "rewards/rejected": -1.4551378250122071, "step": 2099 }, { "epoch": 0.11130840378448573, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5663128.0, "logits/rejected": -51821640.0, "logps/chosen": -62.37835184733073, "logps/rejected": -549.2789306640625, "loss": 0.3561, "rewards/chosen": 0.24473631381988525, "rewards/margins": 2.441240429878235, "rewards/rejected": -2.1965041160583496, "step": 2100 }, { "epoch": 0.11136140778628786, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93077952.0, "logits/rejected": -10802680.666666666, "logps/chosen": -363.00537109375, "logps/rejected": -231.50152587890625, "loss": 0.2727, "rewards/chosen": 0.11742096394300461, "rewards/margins": 1.546540232996146, "rewards/rejected": -1.4291192690531414, "step": 2101 }, { "epoch": 0.11141441178809, "grad_norm": 54.5, "kl": 0.8042984008789062, "learning_rate": 5e-07, "logits/chosen": 2975440.8, "logits/rejected": -49129029.333333336, "logps/chosen": -331.90068359375, "logps/rejected": -417.4943033854167, "loss": 0.3077, "rewards/chosen": 0.4023625373840332, "rewards/margins": 3.1066987037658693, "rewards/rejected": -2.704336166381836, "step": 2102 }, { "epoch": 0.11146741578989214, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20364730.0, "logits/rejected": -18087917.333333332, "logps/chosen": -353.7983703613281, "logps/rejected": -276.5870361328125, "loss": 0.2575, "rewards/chosen": 0.39922791719436646, "rewards/margins": 1.897297998269399, "rewards/rejected": -1.4980700810750325, "step": 2103 }, { "epoch": 0.11152041979169428, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17769020.0, "logits/rejected": -72219840.0, "logps/chosen": -295.8125305175781, "logps/rejected": -547.2498168945312, "loss": 0.301, "rewards/chosen": 0.0011337324976921082, "rewards/margins": 2.3901117369532585, "rewards/rejected": -2.3889780044555664, "step": 2104 }, { "epoch": 0.11157342379349641, "grad_norm": 65.5, "kl": 0.6562118530273438, "learning_rate": 5e-07, "logits/chosen": -37521011.2, "logits/rejected": 25632048.0, "logps/chosen": -358.45869140625, "logps/rejected": -694.8815104166666, "loss": 0.3974, "rewards/chosen": 0.038117837905883786, "rewards/margins": 2.181717348098755, "rewards/rejected": -2.143599510192871, "step": 2105 }, { "epoch": 0.11162642779529855, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4329456.5, "logits/rejected": -26280320.0, "logps/chosen": -361.6457824707031, "logps/rejected": -292.41139439174106, "loss": 0.2104, "rewards/chosen": -0.11629944294691086, "rewards/margins": 1.76882040394204, "rewards/rejected": -1.885119846888951, "step": 2106 }, { "epoch": 0.11167943179710069, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1138892.8, "logits/rejected": 8745127.333333334, "logps/chosen": -265.786865234375, "logps/rejected": -157.7937215169271, "loss": 0.3478, "rewards/chosen": 0.40305438041687014, "rewards/margins": 1.5803123156229657, "rewards/rejected": -1.1772579352060955, "step": 2107 }, { "epoch": 0.11173243579890282, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39266760.0, "logits/rejected": -29242460.0, "logps/chosen": -357.406005859375, "logps/rejected": -361.30511474609375, "loss": 0.3318, "rewards/chosen": 0.1862739622592926, "rewards/margins": 1.9228688776493073, "rewards/rejected": -1.7365949153900146, "step": 2108 }, { "epoch": 0.11178543980070495, "grad_norm": 46.75, "kl": 1.9419021606445312, "learning_rate": 5e-07, "logits/chosen": -56015456.0, "logits/rejected": -31753756.0, "logps/chosen": -458.65087890625, "logps/rejected": -294.48699951171875, "loss": 0.3018, "rewards/chosen": 0.6946086883544922, "rewards/margins": 3.1646599769592285, "rewards/rejected": -2.4700512886047363, "step": 2109 }, { "epoch": 0.11183844380250708, "grad_norm": 62.25, "kl": 0.5419387817382812, "learning_rate": 5e-07, "logits/chosen": -24720014.0, "logits/rejected": -42228960.0, "logps/chosen": -583.0645141601562, "logps/rejected": -334.9259033203125, "loss": 0.3425, "rewards/chosen": 0.0925087109208107, "rewards/margins": 1.696055807173252, "rewards/rejected": -1.6035470962524414, "step": 2110 }, { "epoch": 0.11189144780430922, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54132773.333333336, "logits/rejected": -55173892.0, "logps/chosen": -291.4623616536458, "logps/rejected": -80.30502319335938, "loss": 0.4124, "rewards/chosen": 0.17756932973861694, "rewards/margins": 1.1844106316566467, "rewards/rejected": -1.0068413019180298, "step": 2111 }, { "epoch": 0.11194445180611136, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46094736.0, "logits/rejected": -41239804.0, "logps/chosen": -419.7464599609375, "logps/rejected": -403.8607177734375, "loss": 0.3256, "rewards/chosen": 0.0173034630715847, "rewards/margins": 2.116833683103323, "rewards/rejected": -2.0995302200317383, "step": 2112 }, { "epoch": 0.1119974558079135, "grad_norm": 61.5, "kl": 0.27988624572753906, "learning_rate": 5e-07, "logits/chosen": -7356310.5, "logits/rejected": 22194685.333333332, "logps/chosen": -244.6064453125, "logps/rejected": -370.5764567057292, "loss": 0.3166, "rewards/chosen": -0.02003173902630806, "rewards/margins": 1.3881427757441998, "rewards/rejected": -1.4081745147705078, "step": 2113 }, { "epoch": 0.11205045980971563, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23048888.0, "logits/rejected": 2861709.25, "logps/chosen": -423.1903991699219, "logps/rejected": -38.348411560058594, "loss": 0.4147, "rewards/chosen": 0.16929331421852112, "rewards/margins": 0.7650000154972076, "rewards/rejected": -0.5957067012786865, "step": 2114 }, { "epoch": 0.11210346381151777, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5258640.0, "logits/rejected": -14472026.285714285, "logps/chosen": -3.316600799560547, "logps/rejected": -317.27378627232144, "loss": 0.2527, "rewards/chosen": 0.023917580023407936, "rewards/margins": 1.3640370781400375, "rewards/rejected": -1.3401194981166296, "step": 2115 }, { "epoch": 0.11215646781331991, "grad_norm": 77.0, "kl": 3.113433837890625, "learning_rate": 5e-07, "logits/chosen": -48013820.0, "logits/rejected": -21646190.0, "logps/chosen": -776.2586669921875, "logps/rejected": -405.43585205078125, "loss": 0.2283, "rewards/chosen": 1.1254093647003174, "rewards/margins": 3.939783811569214, "rewards/rejected": -2.8143744468688965, "step": 2116 }, { "epoch": 0.11220947181512204, "grad_norm": 52.0, "kl": 0.40633583068847656, "learning_rate": 5e-07, "logits/chosen": -2385318.8571428573, "logits/rejected": 7543640.0, "logps/chosen": -154.60027204241072, "logps/rejected": -79.48957061767578, "loss": 0.4874, "rewards/chosen": 0.05314739687102182, "rewards/margins": 0.3376421843256269, "rewards/rejected": -0.2844947874546051, "step": 2117 }, { "epoch": 0.11226247581692418, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43067200.0, "logits/rejected": -26868793.6, "logps/chosen": -372.37646484375, "logps/rejected": -288.3161865234375, "loss": 0.2811, "rewards/chosen": 0.09022065003712972, "rewards/margins": 1.8671196063359579, "rewards/rejected": -1.7768989562988282, "step": 2118 }, { "epoch": 0.11231547981872632, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39852854.4, "logits/rejected": -30146218.666666668, "logps/chosen": -259.96591796875, "logps/rejected": -117.79196166992188, "loss": 0.389, "rewards/chosen": 0.12224388122558594, "rewards/margins": 1.2392131487528484, "rewards/rejected": -1.1169692675272624, "step": 2119 }, { "epoch": 0.11236848382052846, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51387568.0, "logits/rejected": -70735888.0, "logps/chosen": -279.46990966796875, "logps/rejected": -355.59765625, "loss": 0.3029, "rewards/chosen": 0.3129642605781555, "rewards/margins": 1.872378408908844, "rewards/rejected": -1.5594141483306885, "step": 2120 }, { "epoch": 0.11242148782233058, "grad_norm": 59.0, "kl": 0.1831531524658203, "learning_rate": 5e-07, "logits/chosen": -44756773.333333336, "logits/rejected": -38113848.0, "logps/chosen": -315.1240234375, "logps/rejected": -245.49658203125, "loss": 0.3466, "rewards/chosen": 0.42492441336313885, "rewards/margins": 1.9239564339319866, "rewards/rejected": -1.4990320205688477, "step": 2121 }, { "epoch": 0.11247449182413272, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109695416.0, "logits/rejected": -21234710.0, "logps/chosen": -165.87161254882812, "logps/rejected": -121.21200561523438, "loss": 0.4021, "rewards/chosen": -0.12609979510307312, "rewards/margins": 0.8852684795856476, "rewards/rejected": -1.0113682746887207, "step": 2122 }, { "epoch": 0.11252749582593485, "grad_norm": 47.5, "kl": 0.08020782470703125, "learning_rate": 5e-07, "logits/chosen": -58454988.0, "logits/rejected": -49463072.0, "logps/chosen": -262.9939880371094, "logps/rejected": -283.9772644042969, "loss": 0.2898, "rewards/chosen": 0.2897363305091858, "rewards/margins": 2.2417109608650208, "rewards/rejected": -1.951974630355835, "step": 2123 }, { "epoch": 0.11258049982773699, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44639140.0, "logits/rejected": -19776784.0, "logps/chosen": -315.8543701171875, "logps/rejected": -243.03594970703125, "loss": 0.4042, "rewards/chosen": -0.2674906551837921, "rewards/margins": 0.9741015136241913, "rewards/rejected": -1.2415921688079834, "step": 2124 }, { "epoch": 0.11263350382953913, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9502856.0, "logits/rejected": -26130309.333333332, "logps/chosen": -354.70306396484375, "logps/rejected": -289.85178629557294, "loss": 0.2969, "rewards/chosen": -0.45683175325393677, "rewards/margins": 1.122721532980601, "rewards/rejected": -1.5795532862345378, "step": 2125 }, { "epoch": 0.11268650783134126, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68935609.6, "logits/rejected": -28550773.333333332, "logps/chosen": -457.64365234375, "logps/rejected": -263.57786051432294, "loss": 0.4128, "rewards/chosen": -0.2244598388671875, "rewards/margins": 1.4882257461547852, "rewards/rejected": -1.7126855850219727, "step": 2126 }, { "epoch": 0.1127395118331434, "grad_norm": 50.25, "kl": 0.3489723205566406, "learning_rate": 5e-07, "logits/chosen": -27831144.0, "logits/rejected": -18796736.0, "logps/chosen": -194.47428385416666, "logps/rejected": -241.23886108398438, "loss": 0.4134, "rewards/chosen": 0.21655712525049844, "rewards/margins": 1.2603292266527812, "rewards/rejected": -1.0437721014022827, "step": 2127 }, { "epoch": 0.11279251583494554, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75109872.0, "logits/rejected": 8954883.0, "logps/chosen": -359.9644775390625, "logps/rejected": -131.0675506591797, "loss": 0.3891, "rewards/chosen": 0.15939027070999146, "rewards/margins": 0.9613439440727234, "rewards/rejected": -0.8019536733627319, "step": 2128 }, { "epoch": 0.11284551983674768, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58430324.0, "logits/rejected": -32624885.333333332, "logps/chosen": -283.5910339355469, "logps/rejected": -361.6805419921875, "loss": 0.244, "rewards/chosen": -0.14854183793067932, "rewards/margins": 1.7608433663845062, "rewards/rejected": -1.9093852043151855, "step": 2129 }, { "epoch": 0.11289852383854981, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10805795.0, "logits/rejected": -17627409.333333332, "logps/chosen": -208.2036590576172, "logps/rejected": -177.5899658203125, "loss": 0.3059, "rewards/chosen": 0.2734169363975525, "rewards/margins": 1.335472842057546, "rewards/rejected": -1.0620559056599934, "step": 2130 }, { "epoch": 0.11295152784035195, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30467824.0, "logits/rejected": -31617830.4, "logps/chosen": -454.6562906901042, "logps/rejected": -400.6500244140625, "loss": 0.2211, "rewards/chosen": 0.7011322180430094, "rewards/margins": 2.6149999777475994, "rewards/rejected": -1.9138677597045899, "step": 2131 }, { "epoch": 0.11300453184215409, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 67573344.0, "logits/rejected": -45519208.0, "logps/chosen": -528.3192749023438, "logps/rejected": -291.8500671386719, "loss": 0.3232, "rewards/chosen": -0.18136872351169586, "rewards/margins": 2.1659391969442368, "rewards/rejected": -2.3473079204559326, "step": 2132 }, { "epoch": 0.11305753584395623, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4286231.5, "logits/rejected": -6550484.666666667, "logps/chosen": -54.279571533203125, "logps/rejected": -212.81929524739584, "loss": 0.314, "rewards/chosen": -0.3145696520805359, "rewards/margins": 1.0070622563362122, "rewards/rejected": -1.321631908416748, "step": 2133 }, { "epoch": 0.11311053984575835, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22103001.6, "logits/rejected": 26438762.666666668, "logps/chosen": -214.36083984375, "logps/rejected": -226.56396484375, "loss": 0.3953, "rewards/chosen": 0.1797877073287964, "rewards/margins": 1.3130516131718952, "rewards/rejected": -1.1332639058430989, "step": 2134 }, { "epoch": 0.11316354384756049, "grad_norm": 90.0, "kl": 0.357513427734375, "learning_rate": 5e-07, "logits/chosen": -16850745.6, "logits/rejected": -23593173.333333332, "logps/chosen": -731.072802734375, "logps/rejected": -118.09493001302083, "loss": 0.3261, "rewards/chosen": 0.37734408378601075, "rewards/margins": 2.019292147954305, "rewards/rejected": -1.6419480641682942, "step": 2135 }, { "epoch": 0.11321654784936262, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -212889.83333333334, "logits/rejected": -31169123.2, "logps/chosen": -251.3637898763021, "logps/rejected": -196.673876953125, "loss": 0.3041, "rewards/chosen": 0.0678845743338267, "rewards/margins": 1.5801312784353894, "rewards/rejected": -1.5122467041015626, "step": 2136 }, { "epoch": 0.11326955185116476, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29555298.0, "logits/rejected": -33373738.0, "logps/chosen": -307.668701171875, "logps/rejected": -408.1946105957031, "loss": 0.3213, "rewards/chosen": 0.1839151829481125, "rewards/margins": 1.8661369532346725, "rewards/rejected": -1.68222177028656, "step": 2137 }, { "epoch": 0.1133225558529669, "grad_norm": 48.75, "kl": 0.12506103515625, "learning_rate": 5e-07, "logits/chosen": 1171162.3333333333, "logits/rejected": -4400286.4, "logps/chosen": -291.779541015625, "logps/rejected": -330.7486083984375, "loss": 0.3222, "rewards/chosen": -0.03240686655044556, "rewards/margins": 1.5767699599266052, "rewards/rejected": -1.6091768264770507, "step": 2138 }, { "epoch": 0.11337555985476903, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15599865.333333334, "logits/rejected": -26923033.6, "logps/chosen": -182.5854695638021, "logps/rejected": -422.9921875, "loss": 0.3301, "rewards/chosen": -0.09056701262791951, "rewards/margins": 2.155092624823252, "rewards/rejected": -2.2456596374511717, "step": 2139 }, { "epoch": 0.11342856385657117, "grad_norm": 62.0, "kl": 0.39415454864501953, "learning_rate": 5e-07, "logits/chosen": -41766502.4, "logits/rejected": -37732365.333333336, "logps/chosen": -303.0748046875, "logps/rejected": -256.4403483072917, "loss": 0.351, "rewards/chosen": 0.10958541631698608, "rewards/margins": 2.0957956035931904, "rewards/rejected": -1.9862101872762044, "step": 2140 }, { "epoch": 0.11348156785837331, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42018712.0, "logits/rejected": -32144809.6, "logps/chosen": -173.0543212890625, "logps/rejected": -275.1603271484375, "loss": 0.3386, "rewards/chosen": -0.24552257855733237, "rewards/margins": 1.15450545946757, "rewards/rejected": -1.4000280380249024, "step": 2141 }, { "epoch": 0.11353457186017545, "grad_norm": 48.25, "kl": 0.03699684143066406, "learning_rate": 5e-07, "logits/chosen": -2513444.4, "logits/rejected": 2106936.0, "logps/chosen": -235.0206298828125, "logps/rejected": -26.696217854817707, "loss": 0.4906, "rewards/chosen": -0.04612182378768921, "rewards/margins": 0.12207823197046914, "rewards/rejected": -0.16820005575815836, "step": 2142 }, { "epoch": 0.11358757586197758, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4454710.4, "logits/rejected": -33155746.666666668, "logps/chosen": -179.10384521484374, "logps/rejected": -357.3085530598958, "loss": 0.4338, "rewards/chosen": -0.3291667938232422, "rewards/margins": 1.4325130144755045, "rewards/rejected": -1.7616798082987468, "step": 2143 }, { "epoch": 0.11364057986377972, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1153068.0, "logits/rejected": -27325128.0, "logps/chosen": -178.12742614746094, "logps/rejected": -252.59391276041666, "loss": 0.3094, "rewards/chosen": 0.23221321403980255, "rewards/margins": 1.438634033004443, "rewards/rejected": -1.2064208189646404, "step": 2144 }, { "epoch": 0.11369358386558186, "grad_norm": 93.0, "kl": 0.30391502380371094, "learning_rate": 5e-07, "logits/chosen": -29073312.0, "logits/rejected": 10399354.0, "logps/chosen": -512.5576782226562, "logps/rejected": -229.27749633789062, "loss": 0.29, "rewards/chosen": 0.3376716673374176, "rewards/margins": 2.21747425198555, "rewards/rejected": -1.8798025846481323, "step": 2145 }, { "epoch": 0.113746587867384, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21083778.666666668, "logits/rejected": -36268320.0, "logps/chosen": -129.3101603190104, "logps/rejected": -236.322705078125, "loss": 0.3273, "rewards/chosen": 0.2701652447382609, "rewards/margins": 1.3584129254023234, "rewards/rejected": -1.0882476806640624, "step": 2146 }, { "epoch": 0.11379959186918612, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23239028.0, "logps/rejected": -327.1964111328125, "loss": 0.2151, "rewards/rejected": -1.3707350492477417, "step": 2147 }, { "epoch": 0.11385259587098825, "grad_norm": 86.0, "kl": 0.7216076850891113, "learning_rate": 5e-07, "logits/chosen": 12496162.0, "logits/rejected": -26481250.0, "logps/chosen": -679.167236328125, "logps/rejected": -198.29843139648438, "loss": 0.3969, "rewards/chosen": 0.18498095870018005, "rewards/margins": 1.0676407516002655, "rewards/rejected": -0.8826597929000854, "step": 2148 }, { "epoch": 0.11390559987279039, "grad_norm": 79.0, "kl": 1.2493324279785156, "learning_rate": 5e-07, "logits/chosen": -36873564.8, "logits/rejected": -21903677.333333332, "logps/chosen": -426.751220703125, "logps/rejected": -423.8277587890625, "loss": 0.3785, "rewards/chosen": 0.2707183837890625, "rewards/margins": 1.8051569938659668, "rewards/rejected": -1.5344386100769043, "step": 2149 }, { "epoch": 0.11395860387459253, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39388308.0, "logits/rejected": -24611644.0, "logps/chosen": -435.03485107421875, "logps/rejected": -239.48541259765625, "loss": 0.2987, "rewards/chosen": 0.20509567856788635, "rewards/margins": 2.214351326227188, "rewards/rejected": -2.0092556476593018, "step": 2150 }, { "epoch": 0.11401160787639467, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42999042.666666664, "logits/rejected": -8196850.4, "logps/chosen": -476.4972737630208, "logps/rejected": -285.8887939453125, "loss": 0.3511, "rewards/chosen": -0.13512026270230612, "rewards/margins": 1.3125466684500378, "rewards/rejected": -1.4476669311523438, "step": 2151 }, { "epoch": 0.1140646118781968, "grad_norm": 50.0, "kl": 0.030000686645507812, "learning_rate": 5e-07, "logits/chosen": -30803852.8, "logits/rejected": -7563286.0, "logps/chosen": -239.52060546875, "logps/rejected": -155.57999674479166, "loss": 0.3883, "rewards/chosen": -0.11251709461212159, "rewards/margins": 1.6512660423914591, "rewards/rejected": -1.7637831370035808, "step": 2152 }, { "epoch": 0.11411761587999894, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18600336.0, "logits/rejected": 62712224.0, "logps/chosen": -202.54466247558594, "logps/rejected": -466.273681640625, "loss": 0.2966, "rewards/chosen": 0.1595853865146637, "rewards/margins": 2.533799558877945, "rewards/rejected": -2.3742141723632812, "step": 2153 }, { "epoch": 0.11417061988180108, "grad_norm": 55.25, "kl": 0.782470703125, "learning_rate": 5e-07, "logits/chosen": -31120890.666666668, "logits/rejected": -51713667.2, "logps/chosen": -525.5187174479166, "logps/rejected": -421.211474609375, "loss": 0.2218, "rewards/chosen": 0.6785064538319906, "rewards/margins": 2.564551909764608, "rewards/rejected": -1.8860454559326172, "step": 2154 }, { "epoch": 0.11422362388360321, "grad_norm": 37.25, "kl": 0.3826322555541992, "learning_rate": 5e-07, "logits/chosen": -9701278.666666666, "logits/rejected": -9682506.4, "logps/chosen": -82.539306640625, "logps/rejected": -372.6729736328125, "loss": 0.2627, "rewards/chosen": 0.34003714720408124, "rewards/margins": 2.2082864681879677, "rewards/rejected": -1.8682493209838866, "step": 2155 }, { "epoch": 0.11427662788540535, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1257066.25, "logits/rejected": -36287852.0, "logps/chosen": -93.48070526123047, "logps/rejected": -383.6089782714844, "loss": 0.3527, "rewards/chosen": -0.35197335481643677, "rewards/margins": 1.9851483702659607, "rewards/rejected": -2.3371217250823975, "step": 2156 }, { "epoch": 0.11432963188720749, "grad_norm": 62.25, "kl": 0.15601539611816406, "learning_rate": 5e-07, "logits/chosen": -48313680.0, "logits/rejected": 169629584.0, "logps/chosen": -283.44228108723956, "logps/rejected": -431.2143249511719, "loss": 0.4092, "rewards/chosen": 0.08867381016413371, "rewards/margins": 1.576456884543101, "rewards/rejected": -1.4877830743789673, "step": 2157 }, { "epoch": 0.11438263588900963, "grad_norm": 54.0, "kl": 0.36057281494140625, "learning_rate": 5e-07, "logits/chosen": -27463138.0, "logits/rejected": -57360368.0, "logps/chosen": -546.9794311523438, "logps/rejected": -422.37890625, "loss": 0.3051, "rewards/chosen": 0.36777040362358093, "rewards/margins": 2.1554894149303436, "rewards/rejected": -1.7877190113067627, "step": 2158 }, { "epoch": 0.11443563989081175, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21800662.4, "logits/rejected": -16542128.0, "logps/chosen": -314.8025390625, "logps/rejected": -163.10880533854166, "loss": 0.4403, "rewards/chosen": 0.03602355718612671, "rewards/margins": 0.6603911916414896, "rewards/rejected": -0.6243676344553629, "step": 2159 }, { "epoch": 0.11448864389261389, "grad_norm": 53.0, "kl": 0.39090633392333984, "learning_rate": 5e-07, "logits/chosen": -29573504.0, "logits/rejected": -22746398.4, "logps/chosen": -310.99338785807294, "logps/rejected": -134.35775146484374, "loss": 0.3279, "rewards/chosen": -0.044290671745936074, "rewards/margins": 1.3326984067757923, "rewards/rejected": -1.3769890785217285, "step": 2160 }, { "epoch": 0.11454164789441602, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35419924.0, "logits/rejected": -23024772.0, "logps/chosen": -450.0642395019531, "logps/rejected": -406.17962646484375, "loss": 0.3224, "rewards/chosen": 0.1173490434885025, "rewards/margins": 2.2181786447763443, "rewards/rejected": -2.100829601287842, "step": 2161 }, { "epoch": 0.11459465189621816, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18271238.0, "logits/rejected": -27133734.0, "logps/chosen": -338.03057861328125, "logps/rejected": -252.97299194335938, "loss": 0.2856, "rewards/chosen": 0.6558660864830017, "rewards/margins": 2.147463023662567, "rewards/rejected": -1.4915969371795654, "step": 2162 }, { "epoch": 0.1146476558980203, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2093289.875, "logits/rejected": -56544987.428571425, "logps/chosen": -114.65607452392578, "logps/rejected": -548.0152762276786, "loss": 0.1533, "rewards/chosen": 0.4554184079170227, "rewards/margins": 3.034633321421487, "rewards/rejected": -2.5792149135044644, "step": 2163 }, { "epoch": 0.11470065989982244, "grad_norm": 68.5, "kl": 1.7938003540039062, "learning_rate": 5e-07, "logits/chosen": -10966922.666666666, "logits/rejected": -24238740.0, "logps/chosen": -410.7505289713542, "logps/rejected": -169.49546813964844, "loss": 0.3591, "rewards/chosen": 0.4977556864420573, "rewards/margins": 2.939100901285807, "rewards/rejected": -2.44134521484375, "step": 2164 }, { "epoch": 0.11475366390162457, "grad_norm": 62.25, "kl": 0.1113433837890625, "learning_rate": 5e-07, "logits/chosen": -44937561.6, "logits/rejected": 10864926.666666666, "logps/chosen": -327.788232421875, "logps/rejected": -269.03289794921875, "loss": 0.4095, "rewards/chosen": -0.15095525979995728, "rewards/margins": 1.2569256822268169, "rewards/rejected": -1.4078809420267742, "step": 2165 }, { "epoch": 0.11480666790342671, "grad_norm": 47.75, "kl": 0.6143131256103516, "learning_rate": 5e-07, "logits/chosen": -51733424.0, "logits/rejected": -7766158.0, "logps/chosen": -352.06494140625, "logps/rejected": -168.84822591145834, "loss": 0.2698, "rewards/chosen": 0.5999897122383118, "rewards/margins": 1.9224133292833965, "rewards/rejected": -1.3224236170450847, "step": 2166 }, { "epoch": 0.11485967190522885, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28100576.0, "logits/rejected": -7878163.0, "logps/chosen": -232.03125, "logps/rejected": -328.2828674316406, "loss": 0.3779, "rewards/chosen": -0.10387039184570312, "rewards/margins": 1.2620121240615845, "rewards/rejected": -1.3658825159072876, "step": 2167 }, { "epoch": 0.11491267590703098, "grad_norm": 68.0, "kl": 1.2649116516113281, "learning_rate": 5e-07, "logits/chosen": -24597261.333333332, "logits/rejected": -27734566.4, "logps/chosen": -659.2923583984375, "logps/rejected": -258.054443359375, "loss": 0.3178, "rewards/chosen": 0.3795059124628703, "rewards/margins": 1.7949166218439738, "rewards/rejected": -1.4154107093811035, "step": 2168 }, { "epoch": 0.11496567990883312, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25616451.2, "logits/rejected": -24055210.666666668, "logps/chosen": -236.619091796875, "logps/rejected": -485.8426513671875, "loss": 0.398, "rewards/chosen": -0.02019016444683075, "rewards/margins": 1.83600814640522, "rewards/rejected": -1.8561983108520508, "step": 2169 }, { "epoch": 0.11501868391063526, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25605768.0, "logits/rejected": -22340788.0, "logps/chosen": -249.78666178385416, "logps/rejected": -320.1408386230469, "loss": 0.4412, "rewards/chosen": -0.16451629996299744, "rewards/margins": 1.7980639040470123, "rewards/rejected": -1.9625802040100098, "step": 2170 }, { "epoch": 0.1150716879124374, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4678454.0, "logits/rejected": -48920536.0, "logps/chosen": -216.3145751953125, "logps/rejected": -331.6514892578125, "loss": 0.3591, "rewards/chosen": -0.052072346210479736, "rewards/margins": 1.5941296219825745, "rewards/rejected": -1.6462019681930542, "step": 2171 }, { "epoch": 0.11512469191423952, "grad_norm": 43.25, "kl": 0.4455604553222656, "learning_rate": 5e-07, "logits/chosen": -20443086.4, "logits/rejected": -72164533.33333333, "logps/chosen": -209.7928466796875, "logps/rejected": -387.8960367838542, "loss": 0.36, "rewards/chosen": 0.12842826843261718, "rewards/margins": 2.1829863866170247, "rewards/rejected": -2.0545581181844077, "step": 2172 }, { "epoch": 0.11517769591604166, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21271675.2, "logits/rejected": -43158768.0, "logps/chosen": -166.61552734375, "logps/rejected": -596.0321044921875, "loss": 0.3881, "rewards/chosen": -0.3982505798339844, "rewards/margins": 2.872407913208008, "rewards/rejected": -3.270658493041992, "step": 2173 }, { "epoch": 0.11523069991784379, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5912095.5, "logits/rejected": 101612944.0, "logps/chosen": -298.24114990234375, "logps/rejected": -411.6243896484375, "loss": 0.3404, "rewards/chosen": -0.07581005245447159, "rewards/margins": 2.2609888538718224, "rewards/rejected": -2.336798906326294, "step": 2174 }, { "epoch": 0.11528370391964593, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43174512.0, "logits/rejected": -18144114.666666668, "logps/chosen": -232.6894073486328, "logps/rejected": -222.52982584635416, "loss": 0.3199, "rewards/chosen": 0.1119152158498764, "rewards/margins": 1.1641082217295964, "rewards/rejected": -1.05219300587972, "step": 2175 }, { "epoch": 0.11533670792144807, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46916224.0, "logits/rejected": 12906664.0, "logps/chosen": -310.9541829427083, "logps/rejected": -85.27970886230469, "loss": 0.4263, "rewards/chosen": 0.1799235741297404, "rewards/margins": 1.1970471541086833, "rewards/rejected": -1.0171235799789429, "step": 2176 }, { "epoch": 0.1153897119232502, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39724309.333333336, "logits/rejected": -13906451.2, "logps/chosen": -378.4477945963542, "logps/rejected": -351.328564453125, "loss": 0.3199, "rewards/chosen": 0.32320253054300946, "rewards/margins": 1.5571090857187908, "rewards/rejected": -1.2339065551757813, "step": 2177 }, { "epoch": 0.11544271592505234, "grad_norm": 53.0, "kl": 0.7742671966552734, "learning_rate": 5e-07, "logits/chosen": -34451722.666666664, "logits/rejected": 4099277.25, "logps/chosen": -287.1731770833333, "logps/rejected": -297.3805847167969, "loss": 0.4549, "rewards/chosen": -0.000749293714761734, "rewards/margins": 1.1726836524903774, "rewards/rejected": -1.1734329462051392, "step": 2178 }, { "epoch": 0.11549571992685448, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10194854.0, "logits/rejected": -21742260.0, "logps/chosen": -214.9927215576172, "logps/rejected": -284.1207580566406, "loss": 0.3454, "rewards/chosen": 0.3328467607498169, "rewards/margins": 1.4814172983169556, "rewards/rejected": -1.1485705375671387, "step": 2179 }, { "epoch": 0.11554872392865662, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -108514056.0, "logits/rejected": -63707674.666666664, "logps/chosen": -1191.4476318359375, "logps/rejected": -444.0867106119792, "loss": 0.2115, "rewards/chosen": 0.7379257678985596, "rewards/margins": 2.507131179173787, "rewards/rejected": -1.7692054112752278, "step": 2180 }, { "epoch": 0.11560172793045875, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26168044.0, "logits/rejected": -15203003.42857143, "logps/chosen": -1205.4954833984375, "logps/rejected": -303.60867745535717, "loss": 0.1419, "rewards/chosen": 1.378625512123108, "rewards/margins": 3.5388671840940202, "rewards/rejected": -2.1602416719709123, "step": 2181 }, { "epoch": 0.11565473193226089, "grad_norm": 84.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31605340.0, "logps/chosen": -366.87115478515625, "loss": 0.5147, "rewards/chosen": -0.058298975229263306, "step": 2182 }, { "epoch": 0.11570773593406303, "grad_norm": 45.5, "kl": 0.7095584869384766, "learning_rate": 5e-07, "logits/chosen": -26824132.0, "logits/rejected": -41829984.0, "logps/chosen": -305.8624572753906, "logps/rejected": -264.0165100097656, "loss": 0.3629, "rewards/chosen": 0.21061143279075623, "rewards/margins": 1.5585051476955414, "rewards/rejected": -1.3478937149047852, "step": 2183 }, { "epoch": 0.11576073993586516, "grad_norm": 55.75, "kl": 1.119333267211914, "learning_rate": 5e-07, "logits/chosen": -30823244.8, "logits/rejected": -47098042.666666664, "logps/chosen": -270.2273193359375, "logps/rejected": -332.7985026041667, "loss": 0.3762, "rewards/chosen": 0.2399775743484497, "rewards/margins": 1.9428736925125123, "rewards/rejected": -1.7028961181640625, "step": 2184 }, { "epoch": 0.11581374393766729, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27471952.0, "logits/rejected": -17208238.0, "logps/chosen": -307.4026794433594, "logps/rejected": -342.347412109375, "loss": 0.3538, "rewards/chosen": 0.22982820868492126, "rewards/margins": 1.4668632447719574, "rewards/rejected": -1.2370350360870361, "step": 2185 }, { "epoch": 0.11586674793946942, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 54518016.0, "logits/rejected": -14445299.2, "logps/chosen": -347.6704915364583, "logps/rejected": -260.72392578125, "loss": 0.3464, "rewards/chosen": -0.1536946694056193, "rewards/margins": 1.1148480971654255, "rewards/rejected": -1.268542766571045, "step": 2186 }, { "epoch": 0.11591975194127156, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41552441.6, "logits/rejected": -28437957.333333332, "logps/chosen": -339.1978515625, "logps/rejected": -501.896728515625, "loss": 0.3215, "rewards/chosen": 0.14647431373596193, "rewards/margins": 3.084019136428833, "rewards/rejected": -2.937544822692871, "step": 2187 }, { "epoch": 0.1159727559430737, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40726760.0, "logits/rejected": 32976102.4, "logps/chosen": -234.6722412109375, "logps/rejected": -425.813916015625, "loss": 0.3349, "rewards/chosen": -0.21039853493372598, "rewards/margins": 1.2478070696194965, "rewards/rejected": -1.4582056045532226, "step": 2188 }, { "epoch": 0.11602575994487584, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14703893.333333334, "logits/rejected": -55470876.0, "logps/chosen": -427.0730387369792, "logps/rejected": -265.4089050292969, "loss": 0.4062, "rewards/chosen": 0.11657316486040752, "rewards/margins": 1.4690386752287548, "rewards/rejected": -1.3524655103683472, "step": 2189 }, { "epoch": 0.11607876394667797, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15693120.0, "logits/rejected": -19486092.8, "logps/chosen": -197.3851114908854, "logps/rejected": -172.17103271484376, "loss": 0.2893, "rewards/chosen": 0.24430044492085776, "rewards/margins": 1.7625043710072834, "rewards/rejected": -1.5182039260864257, "step": 2190 }, { "epoch": 0.11613176794848011, "grad_norm": 69.0, "kl": 1.1674537658691406, "learning_rate": 5e-07, "logits/chosen": -45928200.0, "logps/chosen": -397.62548828125, "loss": 0.5014, "rewards/chosen": 0.116289421916008, "step": 2191 }, { "epoch": 0.11618477195028225, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24196986.666666668, "logits/rejected": -10346928.0, "logps/chosen": -249.3912353515625, "logps/rejected": -342.26226806640625, "loss": 0.4061, "rewards/chosen": -0.003919094800949097, "rewards/margins": 1.9740319550037384, "rewards/rejected": -1.9779510498046875, "step": 2192 }, { "epoch": 0.11623777595208439, "grad_norm": 49.25, "kl": 0.7782764434814453, "learning_rate": 5e-07, "logits/chosen": -24982075.2, "logits/rejected": -19910748.0, "logps/chosen": -440.969384765625, "logps/rejected": -380.2378743489583, "loss": 0.3033, "rewards/chosen": 0.6389500617980957, "rewards/margins": 2.433956336975098, "rewards/rejected": -1.795006275177002, "step": 2193 }, { "epoch": 0.11629077995388652, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3324805.3333333335, "logits/rejected": -31158872.0, "logps/chosen": -155.76716105143228, "logps/rejected": -445.01861572265625, "loss": 0.3976, "rewards/chosen": 0.15839005510012308, "rewards/margins": 1.9898369411627452, "rewards/rejected": -1.831446886062622, "step": 2194 }, { "epoch": 0.11634378395568866, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19025529.333333332, "logits/rejected": -36948220.8, "logps/chosen": -259.2745361328125, "logps/rejected": -373.3744140625, "loss": 0.3268, "rewards/chosen": 0.22343279918034872, "rewards/margins": 1.355936022599538, "rewards/rejected": -1.1325032234191894, "step": 2195 }, { "epoch": 0.1163967879574908, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31835988.0, "logits/rejected": -39836264.0, "logps/chosen": -270.387939453125, "logps/rejected": -288.0501403808594, "loss": 0.3434, "rewards/chosen": 0.26761895418167114, "rewards/margins": 1.4183524250984192, "rewards/rejected": -1.150733470916748, "step": 2196 }, { "epoch": 0.11644979195929292, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24986262.0, "logits/rejected": 13850575.0, "logps/chosen": -101.86045837402344, "logps/rejected": -382.2229919433594, "loss": 0.348, "rewards/chosen": 0.18258437514305115, "rewards/margins": 1.4037972390651703, "rewards/rejected": -1.2212128639221191, "step": 2197 }, { "epoch": 0.11650279596109506, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2082284.0, "logits/rejected": -30006371.2, "logps/chosen": -165.50234985351562, "logps/rejected": -195.53114013671876, "loss": 0.2831, "rewards/chosen": 0.2766307791074117, "rewards/margins": 1.788453193505605, "rewards/rejected": -1.5118224143981933, "step": 2198 }, { "epoch": 0.1165557999628972, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58907910.4, "logits/rejected": -13907473.333333334, "logps/chosen": -186.3093017578125, "logps/rejected": -499.2636311848958, "loss": 0.3413, "rewards/chosen": 0.1288149833679199, "rewards/margins": 2.5612322171529134, "rewards/rejected": -2.4324172337849936, "step": 2199 }, { "epoch": 0.11660880396469933, "grad_norm": 60.25, "kl": 1.546299934387207, "learning_rate": 5e-07, "logits/chosen": -2474390.0, "logits/rejected": -60920581.333333336, "logps/chosen": -492.5498046875, "logps/rejected": -501.1376139322917, "loss": 0.3597, "rewards/chosen": 0.39267444610595703, "rewards/margins": 2.4093213081359863, "rewards/rejected": -2.0166468620300293, "step": 2200 }, { "epoch": 0.11666180796650147, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32184786.666666668, "logits/rejected": -33507654.4, "logps/chosen": -256.75242106119794, "logps/rejected": -771.3427734375, "loss": 0.2612, "rewards/chosen": -0.113107164700826, "rewards/margins": 2.246046201388041, "rewards/rejected": -2.3591533660888673, "step": 2201 }, { "epoch": 0.1167148119683036, "grad_norm": 51.25, "kl": 0.41172027587890625, "learning_rate": 5e-07, "logits/chosen": -31462546.666666668, "logits/rejected": -25982604.0, "logps/chosen": -176.03743489583334, "logps/rejected": -112.08636474609375, "loss": 0.4162, "rewards/chosen": 0.07096245884895325, "rewards/margins": 1.603488951921463, "rewards/rejected": -1.5325264930725098, "step": 2202 }, { "epoch": 0.11676781597010574, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39224304.0, "logits/rejected": -24985568.0, "logps/chosen": -240.5547332763672, "logps/rejected": -335.33050537109375, "loss": 0.3201, "rewards/chosen": 0.22509261965751648, "rewards/margins": 1.8975398242473602, "rewards/rejected": -1.6724472045898438, "step": 2203 }, { "epoch": 0.11682081997190788, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20427834.666666668, "logits/rejected": 17742206.4, "logps/chosen": -127.11531575520833, "logps/rejected": -274.954541015625, "loss": 0.3285, "rewards/chosen": -0.38670190175374347, "rewards/margins": 1.3976677576700847, "rewards/rejected": -1.784369659423828, "step": 2204 }, { "epoch": 0.11687382397371002, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23181048.0, "logits/rejected": -59975296.0, "logps/chosen": -326.96142578125, "logps/rejected": -830.0731811523438, "loss": 0.3207, "rewards/chosen": -0.18110442161560059, "rewards/margins": 3.2543952465057373, "rewards/rejected": -3.435499668121338, "step": 2205 }, { "epoch": 0.11692682797551215, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84750560.0, "logits/rejected": -16657862.0, "logps/chosen": -316.80133056640625, "logps/rejected": -324.5943908691406, "loss": 0.369, "rewards/chosen": -0.13395051658153534, "rewards/margins": 1.3695413619279861, "rewards/rejected": -1.5034918785095215, "step": 2206 }, { "epoch": 0.11697983197731429, "grad_norm": 52.25, "kl": 0.012412071228027344, "learning_rate": 5e-07, "logits/chosen": -6981659.5, "logits/rejected": -23615210.0, "logps/chosen": -261.6114196777344, "logps/rejected": -464.5232238769531, "loss": 0.3206, "rewards/chosen": 0.08938093483448029, "rewards/margins": 1.908063843846321, "rewards/rejected": -1.8186829090118408, "step": 2207 }, { "epoch": 0.11703283597911643, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42951260.0, "logits/rejected": -21413070.0, "logps/chosen": -306.3656005859375, "logps/rejected": -243.54075622558594, "loss": 0.3514, "rewards/chosen": -0.13058358430862427, "rewards/margins": 1.4842889904975891, "rewards/rejected": -1.6148725748062134, "step": 2208 }, { "epoch": 0.11708583998091857, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8687691.333333334, "logits/rejected": -26907526.4, "logps/chosen": -312.6223958333333, "logps/rejected": -332.402001953125, "loss": 0.3325, "rewards/chosen": 0.0026193062464396157, "rewards/margins": 1.5153489510218303, "rewards/rejected": -1.5127296447753906, "step": 2209 }, { "epoch": 0.11713884398272069, "grad_norm": 58.25, "kl": 0.5062904357910156, "learning_rate": 5e-07, "logits/chosen": -37939579.428571425, "logits/rejected": -25364762.0, "logps/chosen": -334.68300083705356, "logps/rejected": -386.8201904296875, "loss": 0.3753, "rewards/chosen": 0.4077085086277553, "rewards/margins": 2.5739255973270962, "rewards/rejected": -2.166217088699341, "step": 2210 }, { "epoch": 0.11719184798452283, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21309435.2, "logits/rejected": -16763746.666666666, "logps/chosen": -275.6232421875, "logps/rejected": -114.06784057617188, "loss": 0.3687, "rewards/chosen": 0.38154337406158445, "rewards/margins": 1.2985438267389933, "rewards/rejected": -0.9170004526774088, "step": 2211 }, { "epoch": 0.11724485198632496, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5829189.5, "logits/rejected": -25739940.0, "logps/chosen": -167.51666259765625, "logps/rejected": -505.673095703125, "loss": 0.2441, "rewards/chosen": 0.5591744184494019, "rewards/margins": 3.2314456701278687, "rewards/rejected": -2.672271251678467, "step": 2212 }, { "epoch": 0.1172978559881271, "grad_norm": 71.5, "kl": 0.7186203002929688, "learning_rate": 5e-07, "logits/chosen": -33141981.333333332, "logits/rejected": -4229078.0, "logps/chosen": -380.7858072916667, "logps/rejected": -259.0660400390625, "loss": 0.4875, "rewards/chosen": -0.08943939208984375, "rewards/margins": 0.7085979580879211, "rewards/rejected": -0.7980373501777649, "step": 2213 }, { "epoch": 0.11735085998992924, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58189160.0, "logits/rejected": -23510554.0, "logps/chosen": -315.1042785644531, "logps/rejected": -340.0762939453125, "loss": 0.3748, "rewards/chosen": -0.3036685883998871, "rewards/margins": 1.273112565279007, "rewards/rejected": -1.576781153678894, "step": 2214 }, { "epoch": 0.11740386399173137, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19872232.0, "logits/rejected": -11312519.0, "logps/chosen": -240.13417053222656, "logps/rejected": -189.3060760498047, "loss": 0.3515, "rewards/chosen": 0.06487318873405457, "rewards/margins": 1.3845066726207733, "rewards/rejected": -1.3196334838867188, "step": 2215 }, { "epoch": 0.11745686799353351, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17551160.0, "logits/rejected": -42483576.0, "logps/chosen": -381.9342956542969, "logps/rejected": -383.3439636230469, "loss": 0.3289, "rewards/chosen": 0.08650971204042435, "rewards/margins": 1.6967009380459785, "rewards/rejected": -1.6101912260055542, "step": 2216 }, { "epoch": 0.11750987199533565, "grad_norm": 63.75, "kl": 1.6164817810058594, "learning_rate": 5e-07, "logits/chosen": -52431368.0, "logits/rejected": -84829610.66666667, "logps/chosen": -429.5448303222656, "logps/rejected": -439.9808756510417, "loss": 0.2792, "rewards/chosen": 0.923687756061554, "rewards/margins": 2.451404392719269, "rewards/rejected": -1.5277166366577148, "step": 2217 }, { "epoch": 0.11756287599713779, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17502546.666666668, "logits/rejected": -22435552.0, "logps/chosen": -316.19875081380206, "logps/rejected": -424.909130859375, "loss": 0.2643, "rewards/chosen": 0.002412541459004084, "rewards/margins": 2.237885411332051, "rewards/rejected": -2.235472869873047, "step": 2218 }, { "epoch": 0.11761587999893992, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50262339.2, "logits/rejected": -41275045.333333336, "logps/chosen": -284.136083984375, "logps/rejected": -258.6833089192708, "loss": 0.3961, "rewards/chosen": 0.1280747413635254, "rewards/margins": 1.2095240910847982, "rewards/rejected": -1.0814493497212727, "step": 2219 }, { "epoch": 0.11766888400074206, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39648336.0, "logits/rejected": -9697692.0, "logps/chosen": -248.0036417643229, "logps/rejected": -290.04852294921875, "loss": 0.3846, "rewards/chosen": 0.23387749989827475, "rewards/margins": 1.7556105454762776, "rewards/rejected": -1.521733045578003, "step": 2220 }, { "epoch": 0.1177218880025442, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28694938.666666668, "logits/rejected": -20763803.2, "logps/chosen": -691.3159993489584, "logps/rejected": -192.3049072265625, "loss": 0.3288, "rewards/chosen": 0.23026021321614584, "rewards/margins": 1.3315477689107258, "rewards/rejected": -1.10128755569458, "step": 2221 }, { "epoch": 0.11777489200434633, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39700115.2, "logits/rejected": -17969952.0, "logps/chosen": -240.1075927734375, "logps/rejected": -157.77730305989584, "loss": 0.4031, "rewards/chosen": 0.06620949506759644, "rewards/margins": 1.1444456775983174, "rewards/rejected": -1.078236182530721, "step": 2222 }, { "epoch": 0.11782789600614846, "grad_norm": 64.5, "kl": 0.0077152252197265625, "learning_rate": 5e-07, "logits/chosen": -23963602.0, "logits/rejected": -18179530.0, "logps/chosen": -346.7083740234375, "logps/rejected": -267.5830078125, "loss": 0.3547, "rewards/chosen": 0.2741350531578064, "rewards/margins": 1.2984514832496643, "rewards/rejected": -1.024316430091858, "step": 2223 }, { "epoch": 0.1178809000079506, "grad_norm": 60.25, "kl": 0.8189773559570312, "learning_rate": 5e-07, "logits/chosen": -37580752.0, "logits/rejected": -24480282.0, "logps/chosen": -396.299560546875, "logps/rejected": -233.74560546875, "loss": 0.3291, "rewards/chosen": 0.19477462768554688, "rewards/margins": 2.0467441082000732, "rewards/rejected": -1.8519694805145264, "step": 2224 }, { "epoch": 0.11793390400975273, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27346948.0, "logits/rejected": -25289284.0, "logps/chosen": -276.5189208984375, "logps/rejected": -497.04888916015625, "loss": 0.4398, "rewards/chosen": -0.2942934036254883, "rewards/margins": 0.8344696760177612, "rewards/rejected": -1.1287630796432495, "step": 2225 }, { "epoch": 0.11798690801155487, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20077692.0, "logits/rejected": -34303348.0, "logps/chosen": -386.8904724121094, "logps/rejected": -504.6402893066406, "loss": 0.3255, "rewards/chosen": 0.1378742754459381, "rewards/margins": 1.8003703653812408, "rewards/rejected": -1.6624960899353027, "step": 2226 }, { "epoch": 0.118039912013357, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47328949.333333336, "logits/rejected": -33614992.0, "logps/chosen": -305.4290771484375, "logps/rejected": -307.1296081542969, "loss": 0.3735, "rewards/chosen": 0.14465028047561646, "rewards/margins": 2.5162444710731506, "rewards/rejected": -2.371594190597534, "step": 2227 }, { "epoch": 0.11809291601515914, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7652511.0, "logits/rejected": -46048954.666666664, "logps/chosen": -152.63412475585938, "logps/rejected": -337.0096028645833, "loss": 0.3306, "rewards/chosen": -0.312704473733902, "rewards/margins": 1.0514021813869476, "rewards/rejected": -1.3641066551208496, "step": 2228 }, { "epoch": 0.11814592001696128, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9397195.2, "logits/rejected": -28282602.666666668, "logps/chosen": -58.33941650390625, "logps/rejected": -362.0277506510417, "loss": 0.413, "rewards/chosen": -0.16074413061141968, "rewards/margins": 1.7262635827064514, "rewards/rejected": -1.887007713317871, "step": 2229 }, { "epoch": 0.11819892401876342, "grad_norm": 58.25, "kl": 0.26575374603271484, "learning_rate": 5e-07, "logits/chosen": -26980624.0, "logits/rejected": -33711328.0, "logps/chosen": -223.382568359375, "logps/rejected": -212.294384765625, "loss": 0.3591, "rewards/chosen": -0.008585219581921896, "rewards/margins": 1.1216664741436642, "rewards/rejected": -1.130251693725586, "step": 2230 }, { "epoch": 0.11825192802056556, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10539996.8, "logits/rejected": -3227710.0, "logps/chosen": -246.8352294921875, "logps/rejected": -178.82405598958334, "loss": 0.4201, "rewards/chosen": -0.17946556806564332, "rewards/margins": 1.1441266973813373, "rewards/rejected": -1.3235922654469807, "step": 2231 }, { "epoch": 0.11830493202236769, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4057360.0, "logits/rejected": -11640121.333333334, "logps/chosen": -43.179542541503906, "logps/rejected": -177.3531290690104, "loss": 0.316, "rewards/chosen": -0.28603678941726685, "rewards/margins": 1.0254712303479512, "rewards/rejected": -1.311508019765218, "step": 2232 }, { "epoch": 0.11835793602416983, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8124177.0, "logits/rejected": -38331472.0, "logps/chosen": -147.6197509765625, "logps/rejected": -542.69189453125, "loss": 0.3722, "rewards/chosen": 0.09866829216480255, "rewards/margins": 1.1617732793092728, "rewards/rejected": -1.0631049871444702, "step": 2233 }, { "epoch": 0.11841094002597197, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6286990.4, "logits/rejected": -44159301.333333336, "logps/chosen": -341.68798828125, "logps/rejected": -342.7090250651042, "loss": 0.2847, "rewards/chosen": 0.43627634048461916, "rewards/margins": 3.0288022994995116, "rewards/rejected": -2.5925259590148926, "step": 2234 }, { "epoch": 0.11846394402777409, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22254633.6, "logits/rejected": 75608624.0, "logps/chosen": -414.053125, "logps/rejected": -265.213134765625, "loss": 0.4257, "rewards/chosen": 0.064532470703125, "rewards/margins": 0.8326266924540201, "rewards/rejected": -0.7680942217508951, "step": 2235 }, { "epoch": 0.11851694802957623, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14062982.0, "logits/rejected": -25166264.0, "logps/chosen": -900.9434814453125, "logps/rejected": -408.2062072753906, "loss": 0.2705, "rewards/chosen": 0.42224445939064026, "rewards/margins": 2.264750510454178, "rewards/rejected": -1.8425060510635376, "step": 2236 }, { "epoch": 0.11856995203137836, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7620139.2, "logits/rejected": -27634794.666666668, "logps/chosen": -143.52989501953124, "logps/rejected": -214.11710611979166, "loss": 0.4121, "rewards/chosen": -0.2776391267776489, "rewards/margins": 1.4135937134424847, "rewards/rejected": -1.6912328402201335, "step": 2237 }, { "epoch": 0.1186229560331805, "grad_norm": 52.0, "kl": 0.0048084259033203125, "learning_rate": 5e-07, "logits/chosen": -21598651.2, "logits/rejected": -16846940.0, "logps/chosen": -213.59892578125, "logps/rejected": -297.388671875, "loss": 0.369, "rewards/chosen": 0.25746726989746094, "rewards/margins": 1.3956948121388753, "rewards/rejected": -1.1382275422414143, "step": 2238 }, { "epoch": 0.11867596003498264, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10248901.0, "logits/rejected": -28582794.666666668, "logps/chosen": -110.2389907836914, "logps/rejected": -314.96681722005206, "loss": 0.2528, "rewards/chosen": 0.46419450640678406, "rewards/margins": 1.9246184527873993, "rewards/rejected": -1.4604239463806152, "step": 2239 }, { "epoch": 0.11872896403678478, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10552526.666666666, "logits/rejected": -12533278.4, "logps/chosen": -207.69816080729166, "logps/rejected": -190.39271240234376, "loss": 0.3923, "rewards/chosen": -0.048714260260264076, "rewards/margins": 0.8391201933224997, "rewards/rejected": -0.8878344535827637, "step": 2240 }, { "epoch": 0.11878196803858691, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7321648.0, "logits/rejected": -32882516.0, "logps/chosen": -253.81948852539062, "logps/rejected": -357.2317810058594, "loss": 0.3408, "rewards/chosen": -0.05341930314898491, "rewards/margins": 1.5876776464283466, "rewards/rejected": -1.6410969495773315, "step": 2241 }, { "epoch": 0.11883497204038905, "grad_norm": 89.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20609704.0, "logits/rejected": -18932078.0, "logps/chosen": -593.362060546875, "logps/rejected": -316.42779541015625, "loss": 0.3837, "rewards/chosen": -0.020397573709487915, "rewards/margins": 1.1462403237819672, "rewards/rejected": -1.166637897491455, "step": 2242 }, { "epoch": 0.11888797604219119, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21304526.4, "logits/rejected": -10916247.333333334, "logps/chosen": -346.014892578125, "logps/rejected": -183.86273193359375, "loss": 0.3627, "rewards/chosen": -0.04016808271408081, "rewards/margins": 2.065509839852651, "rewards/rejected": -2.105677922566732, "step": 2243 }, { "epoch": 0.11894098004399332, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68678360.0, "logits/rejected": -48582704.0, "logps/chosen": -350.41497802734375, "logps/rejected": -331.27886962890625, "loss": 0.2783, "rewards/chosen": 0.12151946872472763, "rewards/margins": 2.6022836193442345, "rewards/rejected": -2.480764150619507, "step": 2244 }, { "epoch": 0.11899398404579546, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35509304.0, "logits/rejected": -17310538.0, "logps/chosen": -300.6600646972656, "logps/rejected": -452.0465087890625, "loss": 0.3499, "rewards/chosen": -0.10786361992359161, "rewards/margins": 1.7081630676984787, "rewards/rejected": -1.8160266876220703, "step": 2245 }, { "epoch": 0.1190469880475976, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33254246.0, "logits/rejected": -7259277.5, "logps/chosen": -361.2100830078125, "logps/rejected": -182.2571258544922, "loss": 0.3146, "rewards/chosen": 0.1262575089931488, "rewards/margins": 1.8564800918102264, "rewards/rejected": -1.7302225828170776, "step": 2246 }, { "epoch": 0.11909999204939974, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26965005.333333332, "logits/rejected": -83604403.2, "logps/chosen": -79.0020243326823, "logps/rejected": -571.66171875, "loss": 0.2241, "rewards/chosen": 0.3621853192647298, "rewards/margins": 3.0635820706685384, "rewards/rejected": -2.7013967514038084, "step": 2247 }, { "epoch": 0.11915299605120186, "grad_norm": 73.0, "kl": 1.3931427001953125, "learning_rate": 5e-07, "logits/chosen": 3805472.0, "logps/chosen": -267.3908996582031, "loss": 0.5402, "rewards/chosen": -0.03273904323577881, "step": 2248 }, { "epoch": 0.119206000053004, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23867174.4, "logits/rejected": -26134597.333333332, "logps/chosen": -185.153369140625, "logps/rejected": -278.10296630859375, "loss": 0.3905, "rewards/chosen": -0.09567944407463073, "rewards/margins": 1.660790600379308, "rewards/rejected": -1.7564700444539387, "step": 2249 }, { "epoch": 0.11925900405480613, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9751524.0, "logits/rejected": -36775248.0, "logps/chosen": -124.0374043782552, "logps/rejected": -588.51328125, "loss": 0.3018, "rewards/chosen": 0.35849467913309735, "rewards/margins": 3.074352757136027, "rewards/rejected": -2.7158580780029298, "step": 2250 }, { "epoch": 0.11931200805660827, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3661898.5, "logits/rejected": -113342432.0, "logps/chosen": -126.6757583618164, "logps/rejected": -219.42030334472656, "loss": 0.335, "rewards/chosen": -0.10642695426940918, "rewards/margins": 2.3866281509399414, "rewards/rejected": -2.4930551052093506, "step": 2251 }, { "epoch": 0.11936501205841041, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69144997.33333333, "logits/rejected": -37809088.0, "logps/chosen": -334.1144612630208, "logps/rejected": -558.61806640625, "loss": 0.2749, "rewards/chosen": 0.002887984116872152, "rewards/margins": 2.0599102695782983, "rewards/rejected": -2.057022285461426, "step": 2252 }, { "epoch": 0.11941801606021255, "grad_norm": 71.0, "kl": 0.02686309814453125, "learning_rate": 5e-07, "logits/chosen": -60567562.666666664, "logits/rejected": -40235939.2, "logps/chosen": -953.1126302083334, "logps/rejected": -299.6365478515625, "loss": 0.2883, "rewards/chosen": 0.5922465324401855, "rewards/margins": 1.8266973495483398, "rewards/rejected": -1.2344508171081543, "step": 2253 }, { "epoch": 0.11947102006201468, "grad_norm": 50.0, "kl": 0.22027587890625, "learning_rate": 5e-07, "logits/chosen": -14321077.0, "logits/rejected": -34678400.0, "logps/chosen": -345.81573486328125, "logps/rejected": -271.3025817871094, "loss": 0.2934, "rewards/chosen": 0.3702794909477234, "rewards/margins": 2.243284523487091, "rewards/rejected": -1.8730050325393677, "step": 2254 }, { "epoch": 0.11952402406381682, "grad_norm": 75.0, "kl": 1.9829254150390625, "learning_rate": 5e-07, "logits/chosen": -30877757.333333332, "logits/rejected": -51855332.0, "logps/chosen": -313.1099446614583, "logps/rejected": -709.4869995117188, "loss": 0.353, "rewards/chosen": 0.5464672247568766, "rewards/margins": 2.56304136912028, "rewards/rejected": -2.0165741443634033, "step": 2255 }, { "epoch": 0.11957702806561896, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65534220.8, "logits/rejected": 2109036.0, "logps/chosen": -537.408837890625, "logps/rejected": -222.4460652669271, "loss": 0.3194, "rewards/chosen": 0.7129827976226807, "rewards/margins": 1.7078600565592448, "rewards/rejected": -0.9948772589365641, "step": 2256 }, { "epoch": 0.1196300320674211, "grad_norm": 57.5, "kl": 1.2420177459716797, "learning_rate": 5e-07, "logits/chosen": -27108140.8, "logits/rejected": 18877826.666666668, "logps/chosen": -421.30888671875, "logps/rejected": -69.68830871582031, "loss": 0.3502, "rewards/chosen": 0.41696724891662595, "rewards/margins": 1.5516297976175943, "rewards/rejected": -1.1346625487009685, "step": 2257 }, { "epoch": 0.11968303606922323, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 56368536.0, "logits/rejected": -66592496.0, "logps/chosen": -527.4641723632812, "logps/rejected": -425.094970703125, "loss": 0.3136, "rewards/chosen": -0.46626895666122437, "rewards/margins": 1.1922159790992737, "rewards/rejected": -1.658484935760498, "step": 2258 }, { "epoch": 0.11973604007102537, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35227996.0, "logits/rejected": -16735986.666666666, "logps/chosen": -386.4642639160156, "logps/rejected": -234.4444783528646, "loss": 0.3155, "rewards/chosen": -0.4515235126018524, "rewards/margins": 1.1074393093585968, "rewards/rejected": -1.5589628219604492, "step": 2259 }, { "epoch": 0.1197890440728275, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26727973.333333332, "logits/rejected": -12981710.4, "logps/chosen": -304.3896484375, "logps/rejected": -88.27354736328125, "loss": 0.3145, "rewards/chosen": 0.22677308320999146, "rewards/margins": 1.531067955493927, "rewards/rejected": -1.3042948722839356, "step": 2260 }, { "epoch": 0.11984204807462963, "grad_norm": 43.75, "kl": 0.7346458435058594, "learning_rate": 5e-07, "logits/chosen": -46642378.666666664, "logits/rejected": -34174576.0, "logps/chosen": -113.08260091145833, "logps/rejected": -342.93975830078125, "loss": 0.4388, "rewards/chosen": 0.08939469854036967, "rewards/margins": 1.6111049751440685, "rewards/rejected": -1.5217102766036987, "step": 2261 }, { "epoch": 0.11989505207643177, "grad_norm": 58.0, "kl": 0.4432792663574219, "learning_rate": 5e-07, "logits/chosen": -31528360.0, "logits/rejected": -4637572.5, "logps/chosen": -297.40370686848956, "logps/rejected": -68.02993774414062, "loss": 0.3934, "rewards/chosen": 0.30701422691345215, "rewards/margins": 1.3499228954315186, "rewards/rejected": -1.0429086685180664, "step": 2262 }, { "epoch": 0.1199480560782339, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11264540.0, "logits/rejected": -22021451.2, "logps/chosen": -424.1874593098958, "logps/rejected": -199.3076904296875, "loss": 0.308, "rewards/chosen": 0.301955242951711, "rewards/margins": 1.6260899742444355, "rewards/rejected": -1.3241347312927245, "step": 2263 }, { "epoch": 0.12000106008003604, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16640924.0, "logits/rejected": -57937740.8, "logps/chosen": -144.22022501627603, "logps/rejected": -360.274072265625, "loss": 0.2899, "rewards/chosen": -0.28173738718032837, "rewards/margins": 1.747913110256195, "rewards/rejected": -2.0296504974365233, "step": 2264 }, { "epoch": 0.12005406408183818, "grad_norm": 55.25, "kl": 0.5728740692138672, "learning_rate": 5e-07, "logits/chosen": -22020378.666666668, "logits/rejected": -27916323.2, "logps/chosen": -260.2528889973958, "logps/rejected": -414.999560546875, "loss": 0.2713, "rewards/chosen": -0.021464539070924122, "rewards/margins": 1.976428412894408, "rewards/rejected": -1.9978929519653321, "step": 2265 }, { "epoch": 0.12010706808364031, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19171894.666666668, "logits/rejected": -8821222.4, "logps/chosen": -517.0196126302084, "logps/rejected": -259.930419921875, "loss": 0.3151, "rewards/chosen": -0.07572275896867116, "rewards/margins": 1.4890523264805477, "rewards/rejected": -1.5647750854492188, "step": 2266 }, { "epoch": 0.12016007208544245, "grad_norm": 58.75, "kl": 0.4880790710449219, "learning_rate": 5e-07, "logits/chosen": -30445988.0, "logits/rejected": -7288917.5, "logps/chosen": -478.90728759765625, "logps/rejected": -228.58523559570312, "loss": 0.3933, "rewards/chosen": -0.10279279947280884, "rewards/margins": 1.0808103680610657, "rewards/rejected": -1.1836031675338745, "step": 2267 }, { "epoch": 0.12021307608724459, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8136567.333333333, "logits/rejected": -22315636.8, "logps/chosen": -266.01841227213544, "logps/rejected": -372.8719970703125, "loss": 0.2789, "rewards/chosen": 0.17867914835611978, "rewards/margins": 2.109824625651042, "rewards/rejected": -1.931145477294922, "step": 2268 }, { "epoch": 0.12026608008904673, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10684693.0, "logits/rejected": -56185930.666666664, "logps/chosen": -354.50396728515625, "logps/rejected": -423.5357259114583, "loss": 0.2973, "rewards/chosen": -0.28734704852104187, "rewards/margins": 1.4054064253966014, "rewards/rejected": -1.6927534739176433, "step": 2269 }, { "epoch": 0.12031908409084886, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18579416.0, "logits/rejected": -1715629.6, "logps/chosen": -216.71038818359375, "logps/rejected": -137.999267578125, "loss": 0.3844, "rewards/chosen": -0.19479223092397055, "rewards/margins": 0.819987146059672, "rewards/rejected": -1.0147793769836426, "step": 2270 }, { "epoch": 0.120372088092651, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37575764.0, "logits/rejected": -11360204.0, "logps/chosen": -164.4819793701172, "logps/rejected": -280.16131591796875, "loss": 0.3169, "rewards/chosen": -0.12720651924610138, "rewards/margins": 1.0599806755781174, "rewards/rejected": -1.1871871948242188, "step": 2271 }, { "epoch": 0.12042509209445314, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73637184.0, "logits/rejected": -55396933.333333336, "logps/chosen": -271.021435546875, "logps/rejected": -129.5887654622396, "loss": 0.3941, "rewards/chosen": 0.12316681146621704, "rewards/margins": 1.181697936852773, "rewards/rejected": -1.058531125386556, "step": 2272 }, { "epoch": 0.12047809609625526, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17656240.0, "logits/rejected": -44528496.0, "logps/chosen": -209.72283063616072, "logps/rejected": -873.3050537109375, "loss": 0.4293, "rewards/chosen": 0.0671557869229998, "rewards/margins": 3.077037521771022, "rewards/rejected": -3.0098817348480225, "step": 2273 }, { "epoch": 0.1205311000980574, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13726516.0, "logits/rejected": -31897597.333333332, "logps/chosen": -250.83523559570312, "logps/rejected": -429.9763590494792, "loss": 0.1933, "rewards/chosen": 0.4956817626953125, "rewards/margins": 2.6904872258504233, "rewards/rejected": -2.194805463155111, "step": 2274 }, { "epoch": 0.12058410409985953, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54930400.0, "logits/rejected": -33819720.0, "logps/chosen": -588.1762084960938, "logps/rejected": -272.91949462890625, "loss": 0.2399, "rewards/chosen": 0.8735226392745972, "rewards/margins": 2.7697123289108276, "rewards/rejected": -1.8961896896362305, "step": 2275 }, { "epoch": 0.12063710810166167, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16216776.0, "logits/rejected": -32221987.2, "logps/chosen": -196.4679158528646, "logps/rejected": -242.141748046875, "loss": 0.2997, "rewards/chosen": -0.23161975542704263, "rewards/margins": 1.6704014619191487, "rewards/rejected": -1.9020212173461915, "step": 2276 }, { "epoch": 0.12069011210346381, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11340904.0, "logits/rejected": -31349961.6, "logps/chosen": -591.9377034505209, "logps/rejected": -604.615576171875, "loss": 0.276, "rewards/chosen": 0.07163544495900472, "rewards/margins": 2.1248863299687706, "rewards/rejected": -2.0532508850097657, "step": 2277 }, { "epoch": 0.12074311610526595, "grad_norm": 61.75, "kl": 0.1692943572998047, "learning_rate": 5e-07, "logits/chosen": -48766858.666666664, "logits/rejected": -5513150.0, "logps/chosen": -650.0275472005209, "logps/rejected": -412.09892578125, "loss": 0.229, "rewards/chosen": 0.6750259399414062, "rewards/margins": 2.5098220825195314, "rewards/rejected": -1.834796142578125, "step": 2278 }, { "epoch": 0.12079612010706808, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34315952.0, "logits/rejected": -35478816.0, "logps/chosen": -269.7736511230469, "logps/rejected": -390.79742431640625, "loss": 0.3445, "rewards/chosen": -0.03537559136748314, "rewards/margins": 1.7976919449865818, "rewards/rejected": -1.833067536354065, "step": 2279 }, { "epoch": 0.12084912410887022, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -22412928.0, "logps/rejected": -408.5278015136719, "loss": 0.1267, "rewards/rejected": -2.140550136566162, "step": 2280 }, { "epoch": 0.12090212811067236, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4266532.0, "logits/rejected": -27935634.666666668, "logps/chosen": -261.18988037109375, "logps/rejected": -221.4680379231771, "loss": 0.2974, "rewards/chosen": 0.7162002921104431, "rewards/margins": 1.675648788611094, "rewards/rejected": -0.959448496500651, "step": 2281 }, { "epoch": 0.1209551321124745, "grad_norm": 56.5, "kl": 0.8023548126220703, "learning_rate": 5e-07, "logits/chosen": -22744984.0, "logits/rejected": -17591436.0, "logps/chosen": -317.209228515625, "logps/rejected": -168.88815307617188, "loss": 0.3393, "rewards/chosen": 0.31909680366516113, "rewards/margins": 1.709108829498291, "rewards/rejected": -1.3900120258331299, "step": 2282 }, { "epoch": 0.12100813611427663, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65292885.333333336, "logits/rejected": -89730272.0, "logps/chosen": -415.9756673177083, "logps/rejected": -572.9783935546875, "loss": 0.343, "rewards/chosen": 0.32053864002227783, "rewards/margins": 2.756811738014221, "rewards/rejected": -2.4362730979919434, "step": 2283 }, { "epoch": 0.12106114011607877, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11580796.0, "logits/rejected": -21098054.0, "logps/chosen": -214.71798706054688, "logps/rejected": -650.1163940429688, "loss": 0.3227, "rewards/chosen": -0.006149861961603165, "rewards/margins": 2.398978855460882, "rewards/rejected": -2.4051287174224854, "step": 2284 }, { "epoch": 0.1211141441178809, "grad_norm": 50.25, "kl": 0.5179176330566406, "learning_rate": 5e-07, "logits/chosen": -37332412.8, "logits/rejected": -48095424.0, "logps/chosen": -276.9980224609375, "logps/rejected": -587.7196044921875, "loss": 0.3259, "rewards/chosen": 0.22561445236206054, "rewards/margins": 2.5890880266825356, "rewards/rejected": -2.363473574320475, "step": 2285 }, { "epoch": 0.12116714811968303, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22925013.333333332, "logits/rejected": -32751161.6, "logps/chosen": -548.3307291666666, "logps/rejected": -489.8537109375, "loss": 0.2621, "rewards/chosen": 0.26079710324605304, "rewards/margins": 2.11780260403951, "rewards/rejected": -1.857005500793457, "step": 2286 }, { "epoch": 0.12122015212148517, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54815528.0, "logits/rejected": -44073664.0, "logps/chosen": -121.63935089111328, "logps/rejected": -414.41312081473217, "loss": 0.1682, "rewards/chosen": 0.09574661403894424, "rewards/margins": 2.4966263377240727, "rewards/rejected": -2.4008797236851285, "step": 2287 }, { "epoch": 0.1212731561232873, "grad_norm": 145.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 128287648.0, "logits/rejected": 47894920.0, "logps/chosen": -505.3677571614583, "logps/rejected": -564.4469604492188, "loss": 0.4107, "rewards/chosen": 0.10757811864217122, "rewards/margins": 1.411269982655843, "rewards/rejected": -1.3036918640136719, "step": 2288 }, { "epoch": 0.12132616012508944, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -491003.5, "logits/rejected": -15447421.0, "logps/chosen": -197.59722900390625, "logps/rejected": -249.9200897216797, "loss": 0.3756, "rewards/chosen": -0.23888476192951202, "rewards/margins": 1.4898350685834885, "rewards/rejected": -1.7287198305130005, "step": 2289 }, { "epoch": 0.12137916412689158, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57713684.0, "logits/rejected": 1076468.0, "logps/chosen": -337.5347900390625, "logps/rejected": -181.05746459960938, "loss": 0.4558, "rewards/chosen": -0.29866352677345276, "rewards/margins": 0.38682249188423157, "rewards/rejected": -0.6854860186576843, "step": 2290 }, { "epoch": 0.12143216812869372, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6466882.0, "logits/rejected": -11904960.0, "logps/chosen": -420.77532958984375, "logps/rejected": -232.578369140625, "loss": 0.2869, "rewards/chosen": 0.496542364358902, "rewards/margins": 2.0611410836378736, "rewards/rejected": -1.5645987192789714, "step": 2291 }, { "epoch": 0.12148517213049585, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13233076.0, "logits/rejected": -9814142.0, "logps/chosen": -273.50982666015625, "logps/rejected": -378.4342041015625, "loss": 0.3948, "rewards/chosen": -0.3297920525074005, "rewards/margins": 1.1580471694469452, "rewards/rejected": -1.4878392219543457, "step": 2292 }, { "epoch": 0.12153817613229799, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 608384.75, "logits/rejected": -48770293.333333336, "logps/chosen": -61.322391510009766, "logps/rejected": -433.7762858072917, "loss": 0.2308, "rewards/chosen": 0.5467491149902344, "rewards/margins": 2.1301732063293457, "rewards/rejected": -1.5834240913391113, "step": 2293 }, { "epoch": 0.12159118013410013, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20588952.0, "logits/rejected": -24774574.0, "logps/chosen": -293.29888916015625, "logps/rejected": -313.01373291015625, "loss": 0.3104, "rewards/chosen": 0.1320180892944336, "rewards/margins": 1.9375627040863037, "rewards/rejected": -1.8055446147918701, "step": 2294 }, { "epoch": 0.12164418413590226, "grad_norm": 41.75, "kl": 0.25478363037109375, "learning_rate": 5e-07, "logits/chosen": 6155078.0, "logits/rejected": -27850900.0, "logps/chosen": -105.4329833984375, "logps/rejected": -177.46397399902344, "loss": 0.359, "rewards/chosen": 0.20296603441238403, "rewards/margins": 1.3426207900047302, "rewards/rejected": -1.1396547555923462, "step": 2295 }, { "epoch": 0.1216971881377044, "grad_norm": 47.0, "kl": 0.0868387222290039, "learning_rate": 5e-07, "logits/chosen": -26272514.666666668, "logits/rejected": -38845529.6, "logps/chosen": -443.0840657552083, "logps/rejected": -311.5897705078125, "loss": 0.2748, "rewards/chosen": 0.3162252902984619, "rewards/margins": 1.9483736515045167, "rewards/rejected": -1.6321483612060548, "step": 2296 }, { "epoch": 0.12175019213950654, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4100391.5, "logits/rejected": -12636425.142857144, "logps/chosen": -22.969661712646484, "logps/rejected": -230.92647879464286, "loss": 0.2923, "rewards/chosen": -0.11162471771240234, "rewards/margins": 1.116952351161412, "rewards/rejected": -1.2285770688738142, "step": 2297 }, { "epoch": 0.12180319614130868, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48950540.8, "logits/rejected": -23007312.0, "logps/chosen": -224.5311767578125, "logps/rejected": -359.478759765625, "loss": 0.4742, "rewards/chosen": -0.352698016166687, "rewards/margins": 1.0772754271825153, "rewards/rejected": -1.4299734433492024, "step": 2298 }, { "epoch": 0.1218562001431108, "grad_norm": 58.25, "kl": 0.2792816162109375, "learning_rate": 5e-07, "logits/chosen": -14460347.2, "logits/rejected": -41267493.333333336, "logps/chosen": -324.7498046875, "logps/rejected": -562.4879557291666, "loss": 0.3044, "rewards/chosen": 0.6860971450805664, "rewards/margins": 1.9875763257344563, "rewards/rejected": -1.30147918065389, "step": 2299 }, { "epoch": 0.12190920414491294, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27082732.8, "logits/rejected": -42045597.333333336, "logps/chosen": -428.9984375, "logps/rejected": -225.30806477864584, "loss": 0.3288, "rewards/chosen": 0.5357214927673339, "rewards/margins": 1.6038896560668945, "rewards/rejected": -1.0681681632995605, "step": 2300 }, { "epoch": 0.12196220814671507, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42787091.2, "logits/rejected": -43709594.666666664, "logps/chosen": -535.54453125, "logps/rejected": -276.3285725911458, "loss": 0.3263, "rewards/chosen": 0.3594472408294678, "rewards/margins": 2.1086277167002363, "rewards/rejected": -1.7491804758707683, "step": 2301 }, { "epoch": 0.12201521214851721, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39814852.0, "logits/rejected": -14274864.0, "logps/chosen": -100.10195922851562, "logps/rejected": -133.25953674316406, "loss": 0.3652, "rewards/chosen": -0.029265224933624268, "rewards/margins": 1.2421227097511292, "rewards/rejected": -1.2713879346847534, "step": 2302 }, { "epoch": 0.12206821615031935, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75796581.33333333, "logits/rejected": -22687380.8, "logps/chosen": -432.4726155598958, "logps/rejected": -300.14794921875, "loss": 0.2537, "rewards/chosen": 0.6356954177220663, "rewards/margins": 2.27379318078359, "rewards/rejected": -1.6380977630615234, "step": 2303 }, { "epoch": 0.12212122015212148, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 548179.5, "logits/rejected": -12023128.0, "logps/chosen": -197.12136840820312, "logps/rejected": -239.19307454427084, "loss": 0.2554, "rewards/chosen": 0.5363502502441406, "rewards/margins": 2.0122173627217608, "rewards/rejected": -1.4758671124776204, "step": 2304 }, { "epoch": 0.12217422415392362, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43951282.666666664, "logits/rejected": -18101356.0, "logps/chosen": -306.27789306640625, "logps/rejected": -446.65191650390625, "loss": 0.4479, "rewards/chosen": -0.10804368058840434, "rewards/margins": 1.1328025956948597, "rewards/rejected": -1.2408462762832642, "step": 2305 }, { "epoch": 0.12222722815572576, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19402818.666666668, "logits/rejected": -29450714.0, "logps/chosen": -232.4069620768229, "logps/rejected": -410.7959289550781, "loss": 0.4736, "rewards/chosen": -0.2196866273880005, "rewards/margins": 0.9875613451004028, "rewards/rejected": -1.2072479724884033, "step": 2306 }, { "epoch": 0.1222802321575279, "grad_norm": 51.5, "kl": 0.3316192626953125, "learning_rate": 5e-07, "logits/chosen": -25660070.4, "logits/rejected": -13378910.666666666, "logps/chosen": -287.6258544921875, "logps/rejected": -187.8787841796875, "loss": 0.4171, "rewards/chosen": 0.14582617282867433, "rewards/margins": 0.8669541597366333, "rewards/rejected": -0.721127986907959, "step": 2307 }, { "epoch": 0.12233323615933003, "grad_norm": 65.5, "kl": 0.3200531005859375, "learning_rate": 5e-07, "logits/chosen": -37038224.0, "logits/rejected": 6156434.0, "logps/chosen": -320.71728515625, "logps/rejected": -364.3468017578125, "loss": 0.3138, "rewards/chosen": 0.39652007818222046, "rewards/margins": 1.8579046130180359, "rewards/rejected": -1.4613845348358154, "step": 2308 }, { "epoch": 0.12238624016113217, "grad_norm": 59.25, "kl": 0.3289642333984375, "learning_rate": 5e-07, "logits/chosen": -21149430.0, "logits/rejected": -8660678.0, "logps/chosen": -397.6939697265625, "logps/rejected": -317.3873291015625, "loss": 0.3615, "rewards/chosen": 0.07929305732250214, "rewards/margins": 1.3000410050153732, "rewards/rejected": -1.220747947692871, "step": 2309 }, { "epoch": 0.12243924416293431, "grad_norm": 56.25, "kl": 0.01828765869140625, "learning_rate": 5e-07, "logits/chosen": -86524842.66666667, "logits/rejected": -34791772.8, "logps/chosen": -539.4883626302084, "logps/rejected": -373.8236572265625, "loss": 0.2884, "rewards/chosen": -0.08616740504900615, "rewards/margins": 1.8166951437791188, "rewards/rejected": -1.902862548828125, "step": 2310 }, { "epoch": 0.12249224816473643, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20373012.8, "logits/rejected": -39456000.0, "logps/chosen": -260.025732421875, "logps/rejected": -428.524169921875, "loss": 0.3717, "rewards/chosen": -0.007154548168182373, "rewards/margins": 2.438104863961538, "rewards/rejected": -2.44525941212972, "step": 2311 }, { "epoch": 0.12254525216653857, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35619244.0, "logits/rejected": -3050967.25, "logps/chosen": -165.76181030273438, "logps/rejected": -152.0517578125, "loss": 0.4051, "rewards/chosen": -0.3386770486831665, "rewards/margins": 1.0140327215194702, "rewards/rejected": -1.3527097702026367, "step": 2312 }, { "epoch": 0.1225982561683407, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16047976.0, "logits/rejected": -69641792.0, "logps/chosen": -310.2543640136719, "logps/rejected": -411.69921875, "loss": 0.2947, "rewards/chosen": -0.04653492569923401, "rewards/margins": 2.4927137792110443, "rewards/rejected": -2.5392487049102783, "step": 2313 }, { "epoch": 0.12265126017014284, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45165032.0, "logits/rejected": -19604154.0, "logps/chosen": -260.35028076171875, "logps/rejected": -179.93577575683594, "loss": 0.2998, "rewards/chosen": 0.08586005866527557, "rewards/margins": 2.1723745316267014, "rewards/rejected": -2.086514472961426, "step": 2314 }, { "epoch": 0.12270426417194498, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -472246.4, "logits/rejected": -52355370.666666664, "logps/chosen": -306.6633544921875, "logps/rejected": -340.2294108072917, "loss": 0.3424, "rewards/chosen": 0.18809239864349364, "rewards/margins": 1.9364134550094605, "rewards/rejected": -1.7483210563659668, "step": 2315 }, { "epoch": 0.12275726817374712, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5092626.0, "logits/rejected": -29251692.8, "logps/chosen": -257.8312581380208, "logps/rejected": -584.9046875, "loss": 0.2841, "rewards/chosen": 0.18761964639027914, "rewards/margins": 2.1346453269322714, "rewards/rejected": -1.947025680541992, "step": 2316 }, { "epoch": 0.12281027217554925, "grad_norm": 64.5, "kl": 1.7090377807617188, "learning_rate": 5e-07, "logits/chosen": -1597648.0, "logits/rejected": -24526590.0, "logps/chosen": -315.85205078125, "logps/rejected": -79.74467468261719, "loss": 0.4056, "rewards/chosen": 0.5777058998743693, "rewards/margins": 1.0920129219690957, "rewards/rejected": -0.5143070220947266, "step": 2317 }, { "epoch": 0.12286327617735139, "grad_norm": 85.0, "kl": 0.6595869064331055, "learning_rate": 5e-07, "logits/chosen": -32048832.0, "logits/rejected": -17017227.2, "logps/chosen": -1009.2613118489584, "logps/rejected": -153.96759033203125, "loss": 0.329, "rewards/chosen": 0.2640209396680196, "rewards/margins": 1.5591360290845235, "rewards/rejected": -1.295115089416504, "step": 2318 }, { "epoch": 0.12291628017915353, "grad_norm": 54.5, "kl": 0.3231658935546875, "learning_rate": 5e-07, "logits/chosen": -52765237.333333336, "logits/rejected": -36631260.0, "logps/chosen": -182.49466959635416, "logps/rejected": -424.2308044433594, "loss": 0.3787, "rewards/chosen": 0.3094126383463542, "rewards/margins": 1.7088412443796794, "rewards/rejected": -1.3994286060333252, "step": 2319 }, { "epoch": 0.12296928418095567, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33005273.6, "logits/rejected": 3558605.3333333335, "logps/chosen": -417.966552734375, "logps/rejected": -50.495269775390625, "loss": 0.4657, "rewards/chosen": -0.11033985614776612, "rewards/margins": 0.4627933740615845, "rewards/rejected": -0.5731332302093506, "step": 2320 }, { "epoch": 0.1230222881827578, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1646214.375, "logits/rejected": 1751331.0, "logps/chosen": -236.0782928466797, "logps/rejected": -436.5931701660156, "loss": 0.4141, "rewards/chosen": -0.3109760284423828, "rewards/margins": 1.3879116773605347, "rewards/rejected": -1.6988877058029175, "step": 2321 }, { "epoch": 0.12307529218455994, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1204793.3333333333, "logits/rejected": -19953248.0, "logps/chosen": -172.7974853515625, "logps/rejected": -169.71402587890626, "loss": 0.3852, "rewards/chosen": -0.2390876809755961, "rewards/margins": 0.763490386803945, "rewards/rejected": -1.002578067779541, "step": 2322 }, { "epoch": 0.12312829618636208, "grad_norm": 57.5, "kl": 0.2939882278442383, "learning_rate": 5e-07, "logits/chosen": 61166566.4, "logits/rejected": -36864426.666666664, "logps/chosen": -172.0736572265625, "logps/rejected": -465.8402099609375, "loss": 0.3635, "rewards/chosen": 0.11879128217697144, "rewards/margins": 2.0771767497062683, "rewards/rejected": -1.9583854675292969, "step": 2323 }, { "epoch": 0.1231813001881642, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15856381.333333334, "logits/rejected": -33765212.8, "logps/chosen": -164.3414306640625, "logps/rejected": -427.50439453125, "loss": 0.2743, "rewards/chosen": 0.20563813050587973, "rewards/margins": 2.290937050183614, "rewards/rejected": -2.0852989196777343, "step": 2324 }, { "epoch": 0.12323430418996634, "grad_norm": 50.0, "kl": 0.212738037109375, "learning_rate": 5e-07, "logits/chosen": -24452324.57142857, "logits/rejected": -31834096.0, "logps/chosen": -198.97872488839286, "logps/rejected": -483.5240478515625, "loss": 0.4188, "rewards/chosen": 0.14262967450278147, "rewards/margins": 3.000869563647679, "rewards/rejected": -2.8582398891448975, "step": 2325 }, { "epoch": 0.12328730819176847, "grad_norm": 44.5, "kl": 0.7815208435058594, "learning_rate": 5e-07, "logits/chosen": -48794544.0, "logits/rejected": 26319017.6, "logps/chosen": -397.46728515625, "logps/rejected": -357.96259765625, "loss": 0.2727, "rewards/chosen": 0.3399098714192708, "rewards/margins": 2.6762551625569664, "rewards/rejected": -2.3363452911376954, "step": 2326 }, { "epoch": 0.12334031219357061, "grad_norm": 50.0, "kl": 0.5276021957397461, "learning_rate": 5e-07, "logits/chosen": -11861837.333333334, "logits/rejected": -37041904.0, "logps/chosen": -430.1911214192708, "logps/rejected": -150.9896240234375, "loss": 0.3603, "rewards/chosen": 0.6302827994028727, "rewards/margins": 1.4169172445933023, "rewards/rejected": -0.7866344451904297, "step": 2327 }, { "epoch": 0.12339331619537275, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1213405.0, "logits/rejected": -14713021.333333334, "logps/chosen": -148.125048828125, "logps/rejected": -412.1336263020833, "loss": 0.3268, "rewards/chosen": 0.18657021522521972, "rewards/margins": 2.4171424388885496, "rewards/rejected": -2.23057222366333, "step": 2328 }, { "epoch": 0.12344632019717489, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23831870.0, "logits/rejected": -60957120.0, "logps/chosen": -365.0721740722656, "logps/rejected": -648.16748046875, "loss": 0.3085, "rewards/chosen": 0.1444316804409027, "rewards/margins": 2.031877487897873, "rewards/rejected": -1.8874458074569702, "step": 2329 }, { "epoch": 0.12349932419897702, "grad_norm": 57.0, "kl": 0.15937423706054688, "learning_rate": 5e-07, "logits/chosen": -21223889.333333332, "logits/rejected": -16636598.0, "logps/chosen": -363.9260660807292, "logps/rejected": -209.368896484375, "loss": 0.4068, "rewards/chosen": 0.11143181721369426, "rewards/margins": 1.7822716037432353, "rewards/rejected": -1.670839786529541, "step": 2330 }, { "epoch": 0.12355232820077916, "grad_norm": 55.75, "kl": 0.10374832153320312, "learning_rate": 5e-07, "logits/chosen": -24721228.0, "logits/rejected": -35470792.0, "logps/chosen": -247.42507934570312, "logps/rejected": -275.9425354003906, "loss": 0.3805, "rewards/chosen": -0.18345382809638977, "rewards/margins": 1.2925531566143036, "rewards/rejected": -1.4760069847106934, "step": 2331 }, { "epoch": 0.1236053322025813, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34942714.666666664, "logits/rejected": -6121629.0, "logps/chosen": -162.38410441080728, "logps/rejected": -138.79061889648438, "loss": 0.3535, "rewards/chosen": 0.3249611457188924, "rewards/margins": 2.1528844436009726, "rewards/rejected": -1.82792329788208, "step": 2332 }, { "epoch": 0.12365833620438343, "grad_norm": 59.0, "kl": 0.3000640869140625, "learning_rate": 5e-07, "logits/chosen": -54006554.666666664, "logits/rejected": -2106793.0, "logps/chosen": -383.1800130208333, "logps/rejected": -179.19601440429688, "loss": 0.3983, "rewards/chosen": 0.1831717093785604, "rewards/margins": 1.611217459042867, "rewards/rejected": -1.4280457496643066, "step": 2333 }, { "epoch": 0.12371134020618557, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8810809.333333334, "logits/rejected": -56668441.6, "logps/chosen": -324.6085611979167, "logps/rejected": -338.3690185546875, "loss": 0.3047, "rewards/chosen": 0.009330620368321737, "rewards/margins": 1.6550696035226184, "rewards/rejected": -1.6457389831542968, "step": 2334 }, { "epoch": 0.12376434420798771, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 171994176.0, "logits/rejected": -55916112.0, "logps/chosen": -743.0938720703125, "logps/rejected": -247.98065185546875, "loss": 0.391, "rewards/chosen": 0.1119782030582428, "rewards/margins": 0.9361756145954132, "rewards/rejected": -0.8241974115371704, "step": 2335 }, { "epoch": 0.12381734820978985, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29557054.0, "logits/rejected": -20839152.0, "logps/chosen": -196.64260864257812, "logps/rejected": -159.07070922851562, "loss": 0.3054, "rewards/chosen": 0.04227094352245331, "rewards/margins": 1.2152833590904872, "rewards/rejected": -1.173012415568034, "step": 2336 }, { "epoch": 0.12387035221159197, "grad_norm": 93.5, "kl": 1.6374187469482422, "learning_rate": 5e-07, "logits/chosen": -32729321.6, "logits/rejected": -7019066.0, "logps/chosen": -940.87333984375, "logps/rejected": -486.7329508463542, "loss": 0.3322, "rewards/chosen": 0.5721844673156739, "rewards/margins": 2.509230359395345, "rewards/rejected": -1.9370458920796711, "step": 2337 }, { "epoch": 0.1239233562133941, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17923588.0, "logits/rejected": -69973811.2, "logps/chosen": -160.59154256184897, "logps/rejected": -324.014453125, "loss": 0.3352, "rewards/chosen": -0.29077096780141193, "rewards/margins": 1.3911017815272013, "rewards/rejected": -1.6818727493286132, "step": 2338 }, { "epoch": 0.12397636021519624, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17937257.6, "logits/rejected": -37596317.333333336, "logps/chosen": -170.7013427734375, "logps/rejected": -255.9741007486979, "loss": 0.4303, "rewards/chosen": -0.25978360176086424, "rewards/margins": 1.2234358628590902, "rewards/rejected": -1.4832194646199544, "step": 2339 }, { "epoch": 0.12402936421699838, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62771957.333333336, "logits/rejected": -30388352.0, "logps/chosen": -897.1551106770834, "logps/rejected": -430.273876953125, "loss": 0.2632, "rewards/chosen": 0.4327860673268636, "rewards/margins": 2.251485904057821, "rewards/rejected": -1.818699836730957, "step": 2340 }, { "epoch": 0.12408236821880052, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27218812.0, "logits/rejected": -22833466.666666668, "logps/chosen": -343.6478271484375, "logps/rejected": -317.4915364583333, "loss": 0.2676, "rewards/chosen": 0.19134369492530823, "rewards/margins": 1.8708137571811676, "rewards/rejected": -1.6794700622558594, "step": 2341 }, { "epoch": 0.12413537222060265, "grad_norm": 50.75, "kl": 1.1597652435302734, "learning_rate": 5e-07, "logits/chosen": -1611524.4, "logits/rejected": -31500928.0, "logps/chosen": -173.59256591796876, "logps/rejected": -240.7862548828125, "loss": 0.4274, "rewards/chosen": 0.16719506978988646, "rewards/margins": 1.250590233008067, "rewards/rejected": -1.0833951632181804, "step": 2342 }, { "epoch": 0.12418837622240479, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1061900.5, "logits/rejected": -11769813.333333334, "logps/chosen": -206.51303100585938, "logps/rejected": -238.62103271484375, "loss": 0.2888, "rewards/chosen": -0.03290557861328125, "rewards/margins": 1.5834801991780598, "rewards/rejected": -1.616385777791341, "step": 2343 }, { "epoch": 0.12424138022420693, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1643018.875, "logits/rejected": -69688722.28571428, "logps/chosen": -89.86203002929688, "logps/rejected": -391.9244907924107, "loss": 0.3021, "rewards/chosen": -0.593518853187561, "rewards/margins": 0.7343214069093977, "rewards/rejected": -1.3278402600969588, "step": 2344 }, { "epoch": 0.12429438422600907, "grad_norm": 72.0, "kl": 0.4088153839111328, "learning_rate": 5e-07, "logits/chosen": 173826029.7142857, "logits/rejected": 325013312.0, "logps/chosen": -261.3436279296875, "logps/rejected": -786.0045776367188, "loss": 0.4322, "rewards/chosen": 0.19231425012860978, "rewards/margins": 1.5416978427342005, "rewards/rejected": -1.3493835926055908, "step": 2345 }, { "epoch": 0.1243473882278112, "grad_norm": 61.5, "kl": 0.5596351623535156, "learning_rate": 5e-07, "logits/chosen": -15005550.4, "logits/rejected": 35426298.666666664, "logps/chosen": -528.503125, "logps/rejected": -416.626708984375, "loss": 0.3127, "rewards/chosen": 0.42522339820861815, "rewards/margins": 2.2629735151926678, "rewards/rejected": -1.8377501169840496, "step": 2346 }, { "epoch": 0.12440039222961334, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72039992.0, "logits/rejected": -12876801.142857144, "logps/chosen": -319.9629821777344, "logps/rejected": -200.02054268973214, "loss": 0.2804, "rewards/chosen": 0.17767028510570526, "rewards/margins": 1.377973226564271, "rewards/rejected": -1.2003029414585658, "step": 2347 }, { "epoch": 0.12445339623141548, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29822636.8, "logits/rejected": -49746309.333333336, "logps/chosen": -320.440185546875, "logps/rejected": -383.6850179036458, "loss": 0.3194, "rewards/chosen": 0.43908987045288084, "rewards/margins": 2.155254618326823, "rewards/rejected": -1.7161647478739421, "step": 2348 }, { "epoch": 0.1245064002332176, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48635002.666666664, "logits/rejected": -27806483.2, "logps/chosen": -695.76708984375, "logps/rejected": -466.71328125, "loss": 0.2555, "rewards/chosen": 0.46115930875142414, "rewards/margins": 2.3169795831044517, "rewards/rejected": -1.8558202743530274, "step": 2349 }, { "epoch": 0.12455940423501974, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4564603.5, "logits/rejected": -36140205.333333336, "logps/chosen": -227.4861297607422, "logps/rejected": -363.7114664713542, "loss": 0.244, "rewards/chosen": 0.2637466490268707, "rewards/margins": 1.873680184284846, "rewards/rejected": -1.6099335352579753, "step": 2350 }, { "epoch": 0.12461240823682188, "grad_norm": 83.5, "kl": 0.9087677001953125, "learning_rate": 5e-07, "logits/chosen": -7923005.333333333, "logits/rejected": -13665564.8, "logps/chosen": -963.9205729166666, "logps/rejected": -171.1512939453125, "loss": 0.2988, "rewards/chosen": 0.5996063152949015, "rewards/margins": 1.850832454363505, "rewards/rejected": -1.2512261390686035, "step": 2351 }, { "epoch": 0.12466541223862401, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20653523.2, "logits/rejected": -21550016.0, "logps/chosen": -275.5810302734375, "logps/rejected": -300.692626953125, "loss": 0.3717, "rewards/chosen": -0.07848236560821534, "rewards/margins": 1.8938992261886596, "rewards/rejected": -1.972381591796875, "step": 2352 }, { "epoch": 0.12471841624042615, "grad_norm": 47.0, "kl": 0.6819953918457031, "learning_rate": 5e-07, "logits/chosen": -24372300.0, "logits/rejected": -37855416.0, "logps/chosen": -252.6510009765625, "logps/rejected": -298.9461975097656, "loss": 0.3289, "rewards/chosen": -0.07206017524003983, "rewards/margins": 1.740676335990429, "rewards/rejected": -1.8127365112304688, "step": 2353 }, { "epoch": 0.12477142024222829, "grad_norm": 68.0, "kl": 0.6079483032226562, "learning_rate": 5e-07, "logits/chosen": -27855884.0, "logits/rejected": -12580484.0, "logps/chosen": -409.13763427734375, "logps/rejected": -136.1147003173828, "loss": 0.3715, "rewards/chosen": 0.2701871693134308, "rewards/margins": 1.2317568361759186, "rewards/rejected": -0.9615696668624878, "step": 2354 }, { "epoch": 0.12482442424403042, "grad_norm": 63.0, "kl": 1.0095710754394531, "learning_rate": 5e-07, "logits/chosen": -30877410.285714287, "logits/rejected": -43457112.0, "logps/chosen": -291.4084995814732, "logps/rejected": -597.9645385742188, "loss": 0.4575, "rewards/chosen": 0.046747377940586636, "rewards/margins": 3.217200211116246, "rewards/rejected": -3.170452833175659, "step": 2355 }, { "epoch": 0.12487742824583256, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43441634.666666664, "logits/rejected": -21098460.0, "logps/chosen": -282.93007405598956, "logps/rejected": -477.51837158203125, "loss": 0.3339, "rewards/chosen": 0.3857279618581136, "rewards/margins": 2.6094547112782798, "rewards/rejected": -2.223726749420166, "step": 2356 }, { "epoch": 0.1249304322476347, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9159216.0, "logits/rejected": -21282635.2, "logps/chosen": -277.0309651692708, "logps/rejected": -289.2028564453125, "loss": 0.3052, "rewards/chosen": 0.3597373962402344, "rewards/margins": 1.784543228149414, "rewards/rejected": -1.4248058319091796, "step": 2357 }, { "epoch": 0.12498343624943684, "grad_norm": 81.5, "kl": 0.4587535858154297, "learning_rate": 5e-07, "logits/chosen": -33467978.666666668, "logits/rejected": 1816849.8, "logps/chosen": -1038.5553385416667, "logps/rejected": -561.153466796875, "loss": 0.2137, "rewards/chosen": 1.0480672518412273, "rewards/margins": 3.066566626230876, "rewards/rejected": -2.0184993743896484, "step": 2358 }, { "epoch": 0.12503644025123897, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8086779.333333333, "logits/rejected": -1896654.4, "logps/chosen": -138.814208984375, "logps/rejected": -420.211865234375, "loss": 0.2725, "rewards/chosen": 0.4590478738149007, "rewards/margins": 2.238825782140096, "rewards/rejected": -1.7797779083251952, "step": 2359 }, { "epoch": 0.1250894442530411, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7092669.333333333, "logits/rejected": -26293217.6, "logps/chosen": -102.73189290364583, "logps/rejected": -379.102294921875, "loss": 0.2674, "rewards/chosen": -0.027067323525746662, "rewards/margins": 2.2515440503756206, "rewards/rejected": -2.2786113739013674, "step": 2360 }, { "epoch": 0.12514244825484325, "grad_norm": 61.25, "kl": 1.2845573425292969, "learning_rate": 5e-07, "logits/chosen": -38843460.0, "logits/rejected": 12600956.0, "logps/chosen": -934.7711791992188, "logps/rejected": -141.24267578125, "loss": 0.3009, "rewards/chosen": 1.4083008766174316, "rewards/margins": 2.142203172047933, "rewards/rejected": -0.7339022954305013, "step": 2361 }, { "epoch": 0.12519545225664538, "grad_norm": 70.0, "kl": 0.488616943359375, "learning_rate": 5e-07, "logits/chosen": -22546362.666666668, "logits/rejected": -27195688.0, "logps/chosen": -461.9613037109375, "logps/rejected": -274.5308837890625, "loss": 0.4217, "rewards/chosen": -0.05927899976571401, "rewards/margins": 2.222811664144198, "rewards/rejected": -2.282090663909912, "step": 2362 }, { "epoch": 0.12524845625844752, "grad_norm": 55.25, "kl": 0.2944221496582031, "learning_rate": 5e-07, "logits/chosen": -18410859.2, "logits/rejected": -14114396.0, "logps/chosen": -368.7915771484375, "logps/rejected": -106.07720947265625, "loss": 0.4146, "rewards/chosen": 0.12506062984466554, "rewards/margins": 0.9413312991460165, "rewards/rejected": -0.816270669301351, "step": 2363 }, { "epoch": 0.12530146026024966, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22454003.2, "logits/rejected": -121250517.33333333, "logps/chosen": -377.3681884765625, "logps/rejected": -461.6128336588542, "loss": 0.3567, "rewards/chosen": 0.06249387264251709, "rewards/margins": 2.349362921714783, "rewards/rejected": -2.2868690490722656, "step": 2364 }, { "epoch": 0.1253544642620518, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70836736.0, "logits/rejected": -98552184.0, "logps/chosen": -446.7176513671875, "logps/rejected": -136.3056640625, "loss": 0.3437, "rewards/chosen": -0.0531616248190403, "rewards/margins": 1.7008784972131252, "rewards/rejected": -1.7540401220321655, "step": 2365 }, { "epoch": 0.12540746826385393, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23719002.0, "logps/rejected": -357.69305419921875, "loss": 0.1938, "rewards/rejected": -1.758760929107666, "step": 2366 }, { "epoch": 0.12546047226565607, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32609045.333333332, "logits/rejected": -7202912.0, "logps/chosen": -641.4360758463541, "logps/rejected": -717.35302734375, "loss": 0.3182, "rewards/chosen": 0.5021183093388876, "rewards/margins": 5.063169678052266, "rewards/rejected": -4.561051368713379, "step": 2367 }, { "epoch": 0.12551347626745818, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1009812.25, "logits/rejected": -32225122.666666668, "logps/chosen": -212.4542236328125, "logps/rejected": -310.7446695963542, "loss": 0.1864, "rewards/chosen": 0.6387048959732056, "rewards/margins": 2.674423336982727, "rewards/rejected": -2.0357184410095215, "step": 2368 }, { "epoch": 0.12556648026926032, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15386761.0, "logits/rejected": -29154770.0, "logps/chosen": -113.68238830566406, "logps/rejected": -297.68218994140625, "loss": 0.356, "rewards/chosen": -0.2573660910129547, "rewards/margins": 1.6205171048641205, "rewards/rejected": -1.8778831958770752, "step": 2369 }, { "epoch": 0.12561948427106245, "grad_norm": 45.0, "kl": 0.7526817321777344, "learning_rate": 5e-07, "logits/chosen": -20031962.0, "logits/rejected": -19652384.0, "logps/chosen": -296.907470703125, "logps/rejected": -277.30224609375, "loss": 0.3329, "rewards/chosen": 0.13850097358226776, "rewards/margins": 1.7874103039503098, "rewards/rejected": -1.648909330368042, "step": 2370 }, { "epoch": 0.1256724882728646, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39280340.0, "logits/rejected": -1709385.875, "logps/chosen": -461.2757873535156, "logps/rejected": -269.2577819824219, "loss": 0.3504, "rewards/chosen": 0.3780372738838196, "rewards/margins": 1.3857046961784363, "rewards/rejected": -1.0076674222946167, "step": 2371 }, { "epoch": 0.12572549227466673, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 380520.75, "logits/rejected": -19408496.0, "logps/chosen": -45.05204391479492, "logps/rejected": -109.74601745605469, "loss": 0.3854, "rewards/chosen": 0.12808857858181, "rewards/margins": 0.9718183130025864, "rewards/rejected": -0.8437297344207764, "step": 2372 }, { "epoch": 0.12577849627646887, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15525899.2, "logits/rejected": -7479497.333333333, "logps/chosen": -210.52138671875, "logps/rejected": -92.87166341145833, "loss": 0.3983, "rewards/chosen": 0.07429336309432984, "rewards/margins": 1.2526556531588238, "rewards/rejected": -1.178362290064494, "step": 2373 }, { "epoch": 0.125831500278271, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31701452.8, "logits/rejected": -54779802.666666664, "logps/chosen": -477.40107421875, "logps/rejected": -723.0485026041666, "loss": 0.3849, "rewards/chosen": -0.09920007586479188, "rewards/margins": 1.9418319205443062, "rewards/rejected": -2.041031996409098, "step": 2374 }, { "epoch": 0.12588450428007314, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16192383.0, "logits/rejected": -22286021.333333332, "logps/chosen": -118.33674621582031, "logps/rejected": -305.1219075520833, "loss": 0.2681, "rewards/chosen": -0.1899127960205078, "rewards/margins": 1.5954987208048503, "rewards/rejected": -1.7854115168253581, "step": 2375 }, { "epoch": 0.12593750828187528, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34532611.2, "logits/rejected": -44102906.666666664, "logps/chosen": -450.98955078125, "logps/rejected": -462.0602620442708, "loss": 0.3129, "rewards/chosen": 0.2649302244186401, "rewards/margins": 2.6666423241297403, "rewards/rejected": -2.4017120997111, "step": 2376 }, { "epoch": 0.1259905122836774, "grad_norm": 44.25, "kl": 0.39864349365234375, "learning_rate": 5e-07, "logits/chosen": -44549432.0, "logits/rejected": -29360308.0, "logps/chosen": -315.73095703125, "logps/rejected": -285.58111572265625, "loss": 0.3043, "rewards/chosen": 0.4311492443084717, "rewards/margins": 2.27114737033844, "rewards/rejected": -1.8399981260299683, "step": 2377 }, { "epoch": 0.12604351628547955, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17936768.0, "logits/rejected": -32491188.0, "logps/chosen": -361.5271911621094, "logps/rejected": -342.5279541015625, "loss": 0.2914, "rewards/chosen": 0.302538126707077, "rewards/margins": 2.1423830687999725, "rewards/rejected": -1.8398449420928955, "step": 2378 }, { "epoch": 0.1260965202872817, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12850447.0, "logits/rejected": -39013138.666666664, "logps/chosen": -74.54147338867188, "logps/rejected": -260.38999430338544, "loss": 0.2837, "rewards/chosen": -0.2439059317111969, "rewards/margins": 1.411574771006902, "rewards/rejected": -1.6554807027180989, "step": 2379 }, { "epoch": 0.12614952428908383, "grad_norm": 68.0, "kl": 1.0778045654296875, "learning_rate": 5e-07, "logits/chosen": -101802995.2, "logits/rejected": -40672448.0, "logps/chosen": -570.31650390625, "logps/rejected": -344.1640625, "loss": 0.3107, "rewards/chosen": 0.4794015407562256, "rewards/margins": 2.3975313027699787, "rewards/rejected": -1.9181297620137532, "step": 2380 }, { "epoch": 0.12620252829088596, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39022684.0, "logits/rejected": -6473484.5, "logps/chosen": -352.2530212402344, "logps/rejected": -142.8905029296875, "loss": 0.3157, "rewards/chosen": 0.29666242003440857, "rewards/margins": 1.7398553788661957, "rewards/rejected": -1.443192958831787, "step": 2381 }, { "epoch": 0.1262555322926881, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4584497.333333333, "logits/rejected": -22533652.8, "logps/chosen": -204.67814127604166, "logps/rejected": -352.8724609375, "loss": 0.289, "rewards/chosen": -0.20842591921488443, "rewards/margins": 1.760558303197225, "rewards/rejected": -1.9689842224121095, "step": 2382 }, { "epoch": 0.12630853629449024, "grad_norm": 76.0, "kl": 2.164562225341797, "learning_rate": 5e-07, "logits/chosen": -47084069.333333336, "logits/rejected": -13319762.0, "logps/chosen": -560.119140625, "logps/rejected": -262.29254150390625, "loss": 0.4367, "rewards/chosen": 0.0014287779728571575, "rewards/margins": 1.621954138080279, "rewards/rejected": -1.6205253601074219, "step": 2383 }, { "epoch": 0.12636154029629237, "grad_norm": 44.75, "kl": 0.16459369659423828, "learning_rate": 5e-07, "logits/chosen": -6380622.8, "logits/rejected": -36172210.666666664, "logps/chosen": -137.09354248046876, "logps/rejected": -231.81734212239584, "loss": 0.3847, "rewards/chosen": 0.2384875774383545, "rewards/margins": 1.3355724175771078, "rewards/rejected": -1.0970848401387532, "step": 2384 }, { "epoch": 0.1264145442980945, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17420073.6, "logits/rejected": -47073269.333333336, "logps/chosen": -364.371337890625, "logps/rejected": -424.19580078125, "loss": 0.3468, "rewards/chosen": 0.06906830668449401, "rewards/margins": 2.4517851213614144, "rewards/rejected": -2.3827168146769204, "step": 2385 }, { "epoch": 0.12646754829989665, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18858118.4, "logits/rejected": -31994016.0, "logps/chosen": -316.33515625, "logps/rejected": -262.2713216145833, "loss": 0.3319, "rewards/chosen": 0.14959716796875, "rewards/margins": 2.316697438557943, "rewards/rejected": -2.167100270589193, "step": 2386 }, { "epoch": 0.12652055230169879, "grad_norm": 66.5, "kl": 0.10806655883789062, "learning_rate": 5e-07, "logits/chosen": -55701203.2, "logits/rejected": -37582362.666666664, "logps/chosen": -448.59150390625, "logps/rejected": -368.942626953125, "loss": 0.3881, "rewards/chosen": 0.010999757796525955, "rewards/margins": 1.452407279362281, "rewards/rejected": -1.4414075215657551, "step": 2387 }, { "epoch": 0.12657355630350092, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3158779.3333333335, "logits/rejected": -70393560.0, "logps/chosen": -203.52583821614584, "logps/rejected": -346.01226806640625, "loss": 0.4177, "rewards/chosen": 0.02084009846051534, "rewards/margins": 1.724909593661626, "rewards/rejected": -1.7040694952011108, "step": 2388 }, { "epoch": 0.12662656030530306, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10761068.0, "logits/rejected": 8745869.142857144, "logps/chosen": -518.2034912109375, "logps/rejected": -157.45071847098214, "loss": 0.258, "rewards/chosen": 1.4551270008087158, "rewards/margins": 2.5512962000710626, "rewards/rejected": -1.0961691992623466, "step": 2389 }, { "epoch": 0.1266795643071052, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43900373.333333336, "logits/rejected": -10390348.0, "logps/chosen": -1057.8177083333333, "logps/rejected": -227.054736328125, "loss": 0.2541, "rewards/chosen": 0.4788740873336792, "rewards/margins": 2.413327670097351, "rewards/rejected": -1.934453582763672, "step": 2390 }, { "epoch": 0.12673256830890733, "grad_norm": 56.0, "kl": 0.2303028106689453, "learning_rate": 5e-07, "logits/chosen": -29080832.0, "logits/rejected": -33383762.666666668, "logps/chosen": -663.786474609375, "logps/rejected": -356.2112630208333, "loss": 0.323, "rewards/chosen": 0.3288642168045044, "rewards/margins": 2.3696447610855103, "rewards/rejected": -2.040780544281006, "step": 2391 }, { "epoch": 0.12678557231070947, "grad_norm": 52.0, "kl": 0.2885913848876953, "learning_rate": 5e-07, "logits/chosen": -846016.5, "logits/rejected": -44431080.0, "logps/chosen": -490.3971862792969, "logps/rejected": -209.6524658203125, "loss": 0.3318, "rewards/chosen": 0.4865154027938843, "rewards/margins": 1.641481637954712, "rewards/rejected": -1.1549662351608276, "step": 2392 }, { "epoch": 0.1268385763125116, "grad_norm": 62.25, "kl": 0.404632568359375, "learning_rate": 5e-07, "logits/chosen": -37300972.8, "logits/rejected": -24488552.0, "logps/chosen": -390.010791015625, "logps/rejected": -341.5461018880208, "loss": 0.3738, "rewards/chosen": 0.23766400814056396, "rewards/margins": 1.7823144038518268, "rewards/rejected": -1.544650395711263, "step": 2393 }, { "epoch": 0.12689158031431372, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13173772.8, "logits/rejected": -32492021.333333332, "logps/chosen": -175.2427001953125, "logps/rejected": -350.8961181640625, "loss": 0.4044, "rewards/chosen": -0.16162185668945311, "rewards/margins": 1.5855080604553222, "rewards/rejected": -1.7471299171447754, "step": 2394 }, { "epoch": 0.12694458431611585, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2429942.6, "logits/rejected": -8571244.0, "logps/chosen": -63.609869384765624, "logps/rejected": -73.62480163574219, "loss": 0.4425, "rewards/chosen": -0.15884075164794922, "rewards/margins": 0.8798182646433513, "rewards/rejected": -1.0386590162913005, "step": 2395 }, { "epoch": 0.126997588317918, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24129507.2, "logits/rejected": -19050346.666666668, "logps/chosen": -258.9853515625, "logps/rejected": -416.861328125, "loss": 0.3453, "rewards/chosen": 0.260773229598999, "rewards/margins": 2.255205837885539, "rewards/rejected": -1.9944326082865398, "step": 2396 }, { "epoch": 0.12705059231972013, "grad_norm": 58.75, "kl": 0.5193290710449219, "learning_rate": 5e-07, "logits/chosen": -40046921.6, "logits/rejected": 14050437.333333334, "logps/chosen": -266.462060546875, "logps/rejected": -583.6161702473959, "loss": 0.3444, "rewards/chosen": 0.1651228427886963, "rewards/margins": 2.6339551766713463, "rewards/rejected": -2.46883233388265, "step": 2397 }, { "epoch": 0.12710359632152227, "grad_norm": 53.75, "kl": 0.8309783935546875, "learning_rate": 5e-07, "logits/chosen": -68390112.0, "logits/rejected": -19437182.0, "logps/chosen": -389.6693115234375, "logps/rejected": -300.01971435546875, "loss": 0.3027, "rewards/chosen": 0.22100047767162323, "rewards/margins": 2.366712376475334, "rewards/rejected": -2.145711898803711, "step": 2398 }, { "epoch": 0.1271566003233244, "grad_norm": 51.25, "kl": 0.12509727478027344, "learning_rate": 5e-07, "logits/chosen": -23470381.333333332, "logits/rejected": -47093267.2, "logps/chosen": -310.43609619140625, "logps/rejected": -374.9427734375, "loss": 0.2673, "rewards/chosen": 0.3103249867757161, "rewards/margins": 2.0179089864095054, "rewards/rejected": -1.7075839996337892, "step": 2399 }, { "epoch": 0.12720960432512654, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4360360.0, "logits/rejected": -24644088.0, "logps/chosen": -179.5498046875, "logps/rejected": -619.6415405273438, "loss": 0.3365, "rewards/chosen": 0.07827726006507874, "rewards/margins": 2.0145073235034943, "rewards/rejected": -1.9362300634384155, "step": 2400 }, { "epoch": 0.12726260832692868, "grad_norm": 57.25, "kl": 0.054271697998046875, "learning_rate": 5e-07, "logits/chosen": -30532822.0, "logits/rejected": -1860476.125, "logps/chosen": -422.6390380859375, "logps/rejected": -261.80377197265625, "loss": 0.3367, "rewards/chosen": 0.31745070219039917, "rewards/margins": 1.7265523076057434, "rewards/rejected": -1.4091016054153442, "step": 2401 }, { "epoch": 0.12731561232873082, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33929264.0, "logits/rejected": -19190540.0, "logps/chosen": -182.69502766927084, "logps/rejected": -230.67381286621094, "loss": 0.3895, "rewards/chosen": 0.3087327678998311, "rewards/margins": 1.5362464388211567, "rewards/rejected": -1.2275136709213257, "step": 2402 }, { "epoch": 0.12736861633053295, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7728688.0, "logits/rejected": -20007740.0, "logps/chosen": -187.22509765625, "logps/rejected": -391.1734619140625, "loss": 0.1935, "rewards/chosen": 0.5037037134170532, "rewards/margins": 2.5690784056981406, "rewards/rejected": -2.0653746922810874, "step": 2403 }, { "epoch": 0.1274216203323351, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10725174.666666666, "logits/rejected": -27991488.0, "logps/chosen": -82.81227620442708, "logps/rejected": -431.730517578125, "loss": 0.2572, "rewards/chosen": 0.035520692666371666, "rewards/margins": 2.5631370027860005, "rewards/rejected": -2.5276163101196287, "step": 2404 }, { "epoch": 0.12747462433413723, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13449160.0, "logits/rejected": -19474134.4, "logps/chosen": -249.81123860677084, "logps/rejected": -265.895361328125, "loss": 0.3738, "rewards/chosen": -0.22402071952819824, "rewards/margins": 0.9593653202056884, "rewards/rejected": -1.1833860397338867, "step": 2405 }, { "epoch": 0.12752762833593936, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46305868.0, "logits/rejected": -5294803.0, "logps/chosen": -191.1586151123047, "logps/rejected": -303.99139404296875, "loss": 0.327, "rewards/chosen": 0.02315397560596466, "rewards/margins": 1.9091045409440994, "rewards/rejected": -1.8859505653381348, "step": 2406 }, { "epoch": 0.1275806323377415, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17851516.0, "logits/rejected": -41049096.0, "logps/chosen": -245.02554321289062, "logps/rejected": -584.1563720703125, "loss": 0.3005, "rewards/chosen": 0.15545988082885742, "rewards/margins": 2.522736072540283, "rewards/rejected": -2.367276191711426, "step": 2407 }, { "epoch": 0.12763363633954364, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6322755.0, "logits/rejected": -92494336.0, "logps/chosen": -280.0634765625, "logps/rejected": -340.3966064453125, "loss": 0.2761, "rewards/chosen": -0.22828903794288635, "rewards/margins": 1.5890414814154308, "rewards/rejected": -1.8173305193583171, "step": 2408 }, { "epoch": 0.12768664034134578, "grad_norm": 67.0, "kl": 0.7166423797607422, "learning_rate": 5e-07, "logits/chosen": -32762816.0, "logits/rejected": -37472757.333333336, "logps/chosen": -383.947021484375, "logps/rejected": -162.4767049153646, "loss": 0.3162, "rewards/chosen": 0.6084437370300293, "rewards/margins": 2.3270513216654463, "rewards/rejected": -1.7186075846354167, "step": 2409 }, { "epoch": 0.1277396443431479, "grad_norm": 48.5, "kl": 1.189117431640625, "learning_rate": 5e-07, "logits/chosen": -35705136.0, "logits/rejected": -27212828.0, "logps/chosen": -260.5268249511719, "logps/rejected": -203.41294860839844, "loss": 0.3582, "rewards/chosen": 0.20160123705863953, "rewards/margins": 1.552503079175949, "rewards/rejected": -1.3509018421173096, "step": 2410 }, { "epoch": 0.12779264834495005, "grad_norm": 59.5, "kl": 0.5612144470214844, "learning_rate": 5e-07, "logits/chosen": -12045165.6, "logits/rejected": 11263269.333333334, "logps/chosen": -314.3268310546875, "logps/rejected": -342.1814778645833, "loss": 0.3479, "rewards/chosen": 0.4065351963043213, "rewards/margins": 1.535770304997762, "rewards/rejected": -1.1292351086934407, "step": 2411 }, { "epoch": 0.1278456523467522, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31492755.2, "logits/rejected": -41264336.0, "logps/chosen": -301.7097900390625, "logps/rejected": -313.9895833333333, "loss": 0.3629, "rewards/chosen": 0.09192794561386108, "rewards/margins": 2.1237587332725525, "rewards/rejected": -2.0318307876586914, "step": 2412 }, { "epoch": 0.12789865634855432, "grad_norm": 50.75, "kl": 0.8872032165527344, "learning_rate": 5e-07, "logits/chosen": 3885933.6, "logits/rejected": -88319381.33333333, "logps/chosen": -210.400390625, "logps/rejected": -396.5112711588542, "loss": 0.3272, "rewards/chosen": 0.23155779838562013, "rewards/margins": 2.82189523379008, "rewards/rejected": -2.5903374354044595, "step": 2413 }, { "epoch": 0.12795166035035646, "grad_norm": 82.5, "kl": 0.6221542358398438, "learning_rate": 5e-07, "logits/chosen": -31344858.666666668, "logits/rejected": -24418176.0, "logps/chosen": -1047.80029296875, "logps/rejected": -228.46524047851562, "loss": 0.3185, "rewards/chosen": 0.6391328573226929, "rewards/margins": 2.6154428720474243, "rewards/rejected": -1.9763100147247314, "step": 2414 }, { "epoch": 0.1280046643521586, "grad_norm": 43.75, "kl": 0.030467987060546875, "learning_rate": 5e-07, "logits/chosen": 1268859.3333333333, "logits/rejected": -21771176.0, "logps/chosen": -284.17881266276044, "logps/rejected": -172.34814453125, "loss": 0.2774, "rewards/chosen": 0.16498337189356485, "rewards/margins": 1.83746261994044, "rewards/rejected": -1.672479248046875, "step": 2415 }, { "epoch": 0.12805766835396074, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33892346.666666664, "logits/rejected": -38470969.6, "logps/chosen": -147.70196533203125, "logps/rejected": -132.02803955078124, "loss": 0.4149, "rewards/chosen": -0.46493581930796307, "rewards/margins": 0.4991757949193319, "rewards/rejected": -0.964111614227295, "step": 2416 }, { "epoch": 0.12811067235576287, "grad_norm": 43.75, "kl": 0.574920654296875, "learning_rate": 5e-07, "logits/chosen": 6347556.666666667, "logits/rejected": -16085121.6, "logps/chosen": -41.85504659016927, "logps/rejected": -172.1195556640625, "loss": 0.2417, "rewards/chosen": 1.4709590276082356, "rewards/margins": 2.489495023091634, "rewards/rejected": -1.0185359954833983, "step": 2417 }, { "epoch": 0.128163676357565, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52694500.0, "logits/rejected": -41040216.0, "logps/chosen": -314.3942565917969, "logps/rejected": -122.16946411132812, "loss": 0.4297, "rewards/chosen": -0.26249372959136963, "rewards/margins": 0.6580044627189636, "rewards/rejected": -0.9204981923103333, "step": 2418 }, { "epoch": 0.12821668035936712, "grad_norm": 56.5, "kl": 1.010498046875, "learning_rate": 5e-07, "logits/chosen": -5289942.0, "logits/rejected": -14211383.0, "logps/chosen": -177.78385416666666, "logps/rejected": -311.451416015625, "loss": 0.4535, "rewards/chosen": 0.10890531539916992, "rewards/margins": 1.1586014032363892, "rewards/rejected": -1.0496960878372192, "step": 2419 }, { "epoch": 0.12826968436116926, "grad_norm": 66.0, "kl": 0.073333740234375, "learning_rate": 5e-07, "logits/chosen": -20767782.666666668, "logits/rejected": -6115820.0, "logps/chosen": -349.4373779296875, "logps/rejected": -99.97064208984375, "loss": 0.451, "rewards/chosen": 0.1693026820818583, "rewards/margins": 0.518049160639445, "rewards/rejected": -0.34874647855758667, "step": 2420 }, { "epoch": 0.1283226883629714, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55505728.0, "logits/rejected": -17790744.0, "logps/chosen": -965.9832763671875, "logps/rejected": -259.41741943359375, "loss": 0.2142, "rewards/chosen": 0.9935028553009033, "rewards/margins": 2.504918336868286, "rewards/rejected": -1.5114154815673828, "step": 2421 }, { "epoch": 0.12837569236477353, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10564127.2, "logits/rejected": -54264485.333333336, "logps/chosen": -188.7900634765625, "logps/rejected": -479.8983154296875, "loss": 0.3063, "rewards/chosen": 0.16635438203811645, "rewards/margins": 3.138762358824412, "rewards/rejected": -2.9724079767862954, "step": 2422 }, { "epoch": 0.12842869636657567, "grad_norm": 58.0, "kl": 0.501032829284668, "learning_rate": 5e-07, "logits/chosen": 13929873.142857144, "logits/rejected": 5585069.0, "logps/chosen": -174.67609514508928, "logps/rejected": -33.35475158691406, "loss": 0.4808, "rewards/chosen": 0.07375907897949219, "rewards/margins": 0.17783012241125107, "rewards/rejected": -0.10407104343175888, "step": 2423 }, { "epoch": 0.1284817003683778, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9741436.0, "logits/rejected": -27545514.666666668, "logps/chosen": -206.7943603515625, "logps/rejected": -706.721435546875, "loss": 0.3462, "rewards/chosen": -0.006759722530841827, "rewards/margins": 2.774872382978598, "rewards/rejected": -2.78163210550944, "step": 2424 }, { "epoch": 0.12853470437017994, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36440636.8, "logits/rejected": -8301002.666666667, "logps/chosen": -324.2007080078125, "logps/rejected": -234.99015299479166, "loss": 0.382, "rewards/chosen": -0.06421138644218445, "rewards/margins": 1.8844529608885447, "rewards/rejected": -1.9486643473307292, "step": 2425 }, { "epoch": 0.12858770837198208, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 38226048.0, "logits/rejected": -14077151.0, "logps/chosen": -595.93115234375, "logps/rejected": -553.205322265625, "loss": 0.2525, "rewards/chosen": 0.5547475814819336, "rewards/margins": 2.5088316202163696, "rewards/rejected": -1.954084038734436, "step": 2426 }, { "epoch": 0.12864071237378422, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79147098.66666667, "logits/rejected": 26359052.8, "logps/chosen": -297.9974365234375, "logps/rejected": -241.298486328125, "loss": 0.3663, "rewards/chosen": -0.1569570004940033, "rewards/margins": 0.9225187838077544, "rewards/rejected": -1.0794757843017577, "step": 2427 }, { "epoch": 0.12869371637558635, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10490080.0, "logits/rejected": -36395168.0, "logps/chosen": -613.488037109375, "logps/rejected": -411.9691162109375, "loss": 0.3162, "rewards/chosen": 0.13550491631031036, "rewards/margins": 2.0524475127458572, "rewards/rejected": -1.9169425964355469, "step": 2428 }, { "epoch": 0.1287467203773885, "grad_norm": 54.5, "kl": 0.36714935302734375, "learning_rate": 5e-07, "logits/chosen": -89512240.0, "logits/rejected": -26561290.0, "logps/chosen": -379.5719299316406, "logps/rejected": -289.6645812988281, "loss": 0.3069, "rewards/chosen": 0.35414183139801025, "rewards/margins": 1.8844753503799438, "rewards/rejected": -1.5303335189819336, "step": 2429 }, { "epoch": 0.12879972437919063, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37136940.0, "logits/rejected": -13075041.333333334, "logps/chosen": -638.005126953125, "logps/rejected": -322.9755452473958, "loss": 0.2307, "rewards/chosen": 0.8457611203193665, "rewards/margins": 2.3680917223294573, "rewards/rejected": -1.522330602010091, "step": 2430 }, { "epoch": 0.12885272838099276, "grad_norm": 58.0, "kl": 2.278994083404541, "learning_rate": 5e-07, "logits/chosen": -70447206.4, "logits/rejected": -9628860.666666666, "logps/chosen": -760.363232421875, "logps/rejected": -202.8751220703125, "loss": 0.333, "rewards/chosen": 0.7122113227844238, "rewards/margins": 2.2127886136372883, "rewards/rejected": -1.5005772908528645, "step": 2431 }, { "epoch": 0.1289057323827949, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31611596.0, "logits/rejected": -17733132.0, "logps/chosen": -941.4722290039062, "logps/rejected": -295.1946614583333, "loss": 0.3111, "rewards/chosen": 0.17068994045257568, "rewards/margins": 1.3245389858881633, "rewards/rejected": -1.1538490454355876, "step": 2432 }, { "epoch": 0.12895873638459704, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29429564.0, "logits/rejected": -23683646.0, "logps/chosen": -231.46139526367188, "logps/rejected": -496.8253173828125, "loss": 0.329, "rewards/chosen": -0.22095242142677307, "rewards/margins": 2.4081660211086273, "rewards/rejected": -2.6291184425354004, "step": 2433 }, { "epoch": 0.12901174038639918, "grad_norm": 64.0, "kl": 1.338531494140625, "learning_rate": 5e-07, "logits/chosen": -31941308.0, "logits/rejected": -60957920.0, "logps/chosen": -572.863525390625, "logps/rejected": -773.087158203125, "loss": 0.2603, "rewards/chosen": 0.6548336148262024, "rewards/margins": 3.589005768299103, "rewards/rejected": -2.9341721534729004, "step": 2434 }, { "epoch": 0.1290647443882013, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38126844.0, "logits/rejected": -77868234.66666667, "logps/chosen": -384.43865966796875, "logps/rejected": -330.40211995442706, "loss": 0.2986, "rewards/chosen": 0.46514129638671875, "rewards/margins": 1.7743455568949382, "rewards/rejected": -1.3092042605082195, "step": 2435 }, { "epoch": 0.12911774839000345, "grad_norm": 60.0, "kl": 0.133148193359375, "learning_rate": 5e-07, "logits/chosen": -45672124.8, "logits/rejected": -4747771.0, "logps/chosen": -281.4720947265625, "logps/rejected": -179.06673177083334, "loss": 0.3913, "rewards/chosen": 0.245485520362854, "rewards/margins": 1.064814305305481, "rewards/rejected": -0.819328784942627, "step": 2436 }, { "epoch": 0.1291707523918056, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35282152.0, "logits/rejected": 5332294.666666667, "logps/chosen": -280.93731689453125, "logps/rejected": -443.1690266927083, "loss": 0.2645, "rewards/chosen": 0.06782683730125427, "rewards/margins": 1.8317517538865407, "rewards/rejected": -1.7639249165852864, "step": 2437 }, { "epoch": 0.12922375639360772, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37497152.0, "logits/rejected": -24459405.333333332, "logps/chosen": -175.13490295410156, "logps/rejected": -360.4888509114583, "loss": 0.2834, "rewards/chosen": -0.16723957657814026, "rewards/margins": 1.5178028047084808, "rewards/rejected": -1.685042381286621, "step": 2438 }, { "epoch": 0.12927676039540986, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10282656.0, "logits/rejected": -38946710.85714286, "logps/chosen": -120.01191711425781, "logps/rejected": -268.694580078125, "loss": 0.2366, "rewards/chosen": -0.07338638603687286, "rewards/margins": 1.5968766318900245, "rewards/rejected": -1.6702630179268974, "step": 2439 }, { "epoch": 0.129329764397212, "grad_norm": 73.0, "kl": 0.6766281127929688, "learning_rate": 5e-07, "logits/chosen": -30536425.14285714, "logits/rejected": 2304446.5, "logps/chosen": -420.8115234375, "logps/rejected": -101.12371826171875, "loss": 0.3728, "rewards/chosen": 0.4504202774592808, "rewards/margins": 2.4977759293147495, "rewards/rejected": -2.0473556518554688, "step": 2440 }, { "epoch": 0.12938276839901414, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29218099.2, "logits/rejected": -28077592.0, "logps/chosen": -388.5877685546875, "logps/rejected": -163.60647583007812, "loss": 0.3103, "rewards/chosen": 0.4051093101501465, "rewards/margins": 2.2990338643391928, "rewards/rejected": -1.8939245541890461, "step": 2441 }, { "epoch": 0.12943577240081627, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30417412.0, "logits/rejected": -23571762.0, "logps/chosen": -282.351318359375, "logps/rejected": -219.0428466796875, "loss": 0.3514, "rewards/chosen": 0.08270654082298279, "rewards/margins": 1.657201498746872, "rewards/rejected": -1.5744949579238892, "step": 2442 }, { "epoch": 0.1294887764026184, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 52928.375, "logits/rejected": -72454742.85714285, "logps/chosen": -30.3302001953125, "logps/rejected": -414.4017857142857, "loss": 0.2556, "rewards/chosen": -0.6487724184989929, "rewards/margins": 1.0789658256939478, "rewards/rejected": -1.7277382441929408, "step": 2443 }, { "epoch": 0.12954178040442052, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22846276.0, "logits/rejected": -15670144.0, "logps/chosen": -167.06573486328125, "logps/rejected": -153.79837036132812, "loss": 0.3146, "rewards/chosen": 0.5433641672134399, "rewards/margins": 1.7085520029067993, "rewards/rejected": -1.1651878356933594, "step": 2444 }, { "epoch": 0.12959478440622266, "grad_norm": 85.5, "kl": 1.5584139823913574, "learning_rate": 5e-07, "logits/chosen": -35808937.6, "logits/rejected": -52238218.666666664, "logps/chosen": -703.27626953125, "logps/rejected": -592.7360026041666, "loss": 0.3075, "rewards/chosen": 0.5198585510253906, "rewards/margins": 2.94859094619751, "rewards/rejected": -2.428732395172119, "step": 2445 }, { "epoch": 0.1296477884080248, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48782944.0, "logits/rejected": -42218634.666666664, "logps/chosen": -317.32060546875, "logps/rejected": -383.3346354166667, "loss": 0.4213, "rewards/chosen": -0.24566307067871093, "rewards/margins": 1.2748369534810384, "rewards/rejected": -1.5205000241597493, "step": 2446 }, { "epoch": 0.12970079240982693, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33866396.0, "logits/rejected": 16413790.666666666, "logps/chosen": -475.4176025390625, "logps/rejected": -255.0777587890625, "loss": 0.309, "rewards/chosen": 0.3603353500366211, "rewards/margins": 1.4051400025685628, "rewards/rejected": -1.0448046525319417, "step": 2447 }, { "epoch": 0.12975379641162907, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40204640.0, "logits/rejected": 849704.3333333334, "logps/chosen": -395.1674560546875, "logps/rejected": -143.59552001953125, "loss": 0.4469, "rewards/chosen": 0.08240005373954773, "rewards/margins": 0.537179599205653, "rewards/rejected": -0.45477954546610516, "step": 2448 }, { "epoch": 0.1298068004134312, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63280953.6, "logits/rejected": -16222794.666666666, "logps/chosen": -419.17529296875, "logps/rejected": -205.12947591145834, "loss": 0.3576, "rewards/chosen": 0.25594871044158934, "rewards/margins": 2.0317203283309935, "rewards/rejected": -1.7757716178894043, "step": 2449 }, { "epoch": 0.12985980441523334, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84697848.0, "logits/rejected": -2855352.25, "logps/chosen": -393.9082946777344, "logps/rejected": -157.1778564453125, "loss": 0.3697, "rewards/chosen": 0.12825241684913635, "rewards/margins": 1.2110050022602081, "rewards/rejected": -1.0827525854110718, "step": 2450 }, { "epoch": 0.12991280841703548, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53164808.0, "logits/rejected": -19714046.0, "logps/chosen": -305.82708740234375, "logps/rejected": -259.3595275878906, "loss": 0.3179, "rewards/chosen": 0.24591206014156342, "rewards/margins": 1.8058451265096664, "rewards/rejected": -1.559933066368103, "step": 2451 }, { "epoch": 0.12996581241883762, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9081104.0, "logits/rejected": -21673945.333333332, "logps/chosen": -40.17876434326172, "logps/rejected": -334.91371663411456, "loss": 0.2987, "rewards/chosen": 0.22715207934379578, "rewards/margins": 1.5539204776287079, "rewards/rejected": -1.326768398284912, "step": 2452 }, { "epoch": 0.13001881642063975, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11943753.0, "logits/rejected": -4216768.5, "logps/chosen": -128.3018798828125, "logps/rejected": -53.91170883178711, "loss": 0.4212, "rewards/chosen": -0.19633452594280243, "rewards/margins": 0.5887801200151443, "rewards/rejected": -0.7851146459579468, "step": 2453 }, { "epoch": 0.1300718204224419, "grad_norm": 57.75, "kl": 0.2597770690917969, "learning_rate": 5e-07, "logits/chosen": -46579040.0, "logits/rejected": -4846656.0, "logps/chosen": -392.1851806640625, "logps/rejected": -356.441943359375, "loss": 0.3612, "rewards/chosen": -0.1493072509765625, "rewards/margins": 1.3385587692260743, "rewards/rejected": -1.4878660202026368, "step": 2454 }, { "epoch": 0.13012482442424403, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24270366.0, "logits/rejected": -226983.33333333334, "logps/chosen": -248.27500915527344, "logps/rejected": -324.1195882161458, "loss": 0.2642, "rewards/chosen": 0.2824213206768036, "rewards/margins": 1.7548008461793263, "rewards/rejected": -1.4723795255025227, "step": 2455 }, { "epoch": 0.13017782842604617, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -102575808.0, "logits/rejected": -68492864.0, "logps/chosen": -170.34611002604166, "logps/rejected": -342.3162109375, "loss": 0.3208, "rewards/chosen": -0.052877614895502724, "rewards/margins": 1.6153158207734426, "rewards/rejected": -1.6681934356689454, "step": 2456 }, { "epoch": 0.1302308324278483, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14972002.666666666, "logits/rejected": -7698755.2, "logps/chosen": -262.46433512369794, "logps/rejected": -206.318359375, "loss": 0.2884, "rewards/chosen": 0.3502919673919678, "rewards/margins": 1.821876859664917, "rewards/rejected": -1.4715848922729493, "step": 2457 }, { "epoch": 0.13028383642965044, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23512706.0, "logits/rejected": -45182714.666666664, "logps/chosen": -175.69178771972656, "logps/rejected": -466.3312581380208, "loss": 0.22, "rewards/chosen": -0.07348175346851349, "rewards/margins": 2.433275505900383, "rewards/rejected": -2.5067572593688965, "step": 2458 }, { "epoch": 0.13033684043145258, "grad_norm": 49.0, "kl": 0.29210662841796875, "learning_rate": 5e-07, "logits/chosen": 20025140.0, "logits/rejected": -6956646.5, "logps/chosen": -236.63507080078125, "logps/rejected": -103.20433044433594, "loss": 0.3654, "rewards/chosen": 0.0712360367178917, "rewards/margins": 1.4915899261832237, "rewards/rejected": -1.420353889465332, "step": 2459 }, { "epoch": 0.13038984443325471, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54595417.6, "logits/rejected": -10425715.333333334, "logps/chosen": -300.722412109375, "logps/rejected": -127.05240885416667, "loss": 0.4117, "rewards/chosen": 0.006524240970611573, "rewards/margins": 1.0356243411699932, "rewards/rejected": -1.0291001001993816, "step": 2460 }, { "epoch": 0.13044284843505685, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14254918.666666666, "logits/rejected": 17395836.0, "logps/chosen": -181.85628255208334, "logps/rejected": -181.8204345703125, "loss": 0.3504, "rewards/chosen": 0.6319035291671753, "rewards/margins": 1.3692407011985779, "rewards/rejected": -0.7373371720314026, "step": 2461 }, { "epoch": 0.130495852436859, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2737808.4, "logits/rejected": -1074004.0, "logps/chosen": -179.17322998046876, "logps/rejected": -248.97686767578125, "loss": 0.393, "rewards/chosen": 0.13482284545898438, "rewards/margins": 1.1440180937449138, "rewards/rejected": -1.0091952482859294, "step": 2462 }, { "epoch": 0.13054885643866113, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14811804.0, "logits/rejected": -30533094.4, "logps/chosen": -60.92766316731771, "logps/rejected": -299.47138671875, "loss": 0.3501, "rewards/chosen": -0.4357159932454427, "rewards/margins": 1.0829123179117839, "rewards/rejected": -1.5186283111572265, "step": 2463 }, { "epoch": 0.13060186044046326, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31538661.333333332, "logits/rejected": -33216484.0, "logps/chosen": -278.4189453125, "logps/rejected": -149.00701904296875, "loss": 0.3898, "rewards/chosen": 0.24470941225687662, "rewards/margins": 1.5051738421122234, "rewards/rejected": -1.2604644298553467, "step": 2464 }, { "epoch": 0.1306548644422654, "grad_norm": 39.5, "kl": 0.22647857666015625, "learning_rate": 5e-07, "logits/chosen": -48538016.0, "logits/rejected": -54518310.4, "logps/chosen": -287.53428141276044, "logps/rejected": -548.981640625, "loss": 0.2368, "rewards/chosen": 0.07605718076229095, "rewards/margins": 2.621137747168541, "rewards/rejected": -2.54508056640625, "step": 2465 }, { "epoch": 0.13070786844406754, "grad_norm": 59.75, "kl": 0.4261913299560547, "learning_rate": 5e-07, "logits/chosen": -78059443.2, "logits/rejected": -16182897.333333334, "logps/chosen": -376.88251953125, "logps/rejected": -485.1256103515625, "loss": 0.401, "rewards/chosen": 0.12837876081466676, "rewards/margins": 1.6661951263745625, "rewards/rejected": -1.5378163655598958, "step": 2466 }, { "epoch": 0.13076087244586967, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75291104.0, "logits/rejected": -34243081.6, "logps/chosen": -320.74428304036456, "logps/rejected": -225.1857421875, "loss": 0.3241, "rewards/chosen": 0.1063079833984375, "rewards/margins": 1.3968762397766112, "rewards/rejected": -1.2905682563781737, "step": 2467 }, { "epoch": 0.1308138764476718, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 42844160.0, "logits/rejected": -6084233.333333333, "logps/chosen": -724.1722412109375, "logps/rejected": -315.65915934244794, "loss": 0.3073, "rewards/chosen": 0.15856018662452698, "rewards/margins": 1.2716651658217113, "rewards/rejected": -1.1131049791971843, "step": 2468 }, { "epoch": 0.13086688044947395, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16868256.0, "logits/rejected": -59672096.0, "logps/chosen": -204.632861328125, "logps/rejected": -381.0218098958333, "loss": 0.347, "rewards/chosen": 0.48478031158447266, "rewards/margins": 1.7015314102172852, "rewards/rejected": -1.2167510986328125, "step": 2469 }, { "epoch": 0.13091988445127606, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37876344.0, "logits/rejected": 530112.0, "logps/chosen": -322.99365234375, "logps/rejected": -282.7383626302083, "loss": 0.3082, "rewards/chosen": -0.1711578369140625, "rewards/margins": 1.3952110608418782, "rewards/rejected": -1.5663688977559407, "step": 2470 }, { "epoch": 0.1309728884530782, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41963352.0, "logits/rejected": -16434661.333333334, "logps/chosen": -380.64984130859375, "logps/rejected": -150.35511271158853, "loss": 0.3962, "rewards/chosen": -0.6495163440704346, "rewards/margins": 0.2108956972757975, "rewards/rejected": -0.8604120413462321, "step": 2471 }, { "epoch": 0.13102589245488033, "grad_norm": 52.25, "kl": 0.20400238037109375, "learning_rate": 5e-07, "logits/chosen": -42516056.0, "logits/rejected": -33785344.0, "logps/chosen": -404.1894836425781, "logps/rejected": -365.9488220214844, "loss": 0.2977, "rewards/chosen": 0.4407951533794403, "rewards/margins": 2.3321370780467987, "rewards/rejected": -1.8913419246673584, "step": 2472 }, { "epoch": 0.13107889645668247, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32227914.666666668, "logits/rejected": 27854614.4, "logps/chosen": -157.46033732096353, "logps/rejected": -359.064453125, "loss": 0.2738, "rewards/chosen": -0.22499390443166098, "rewards/margins": 1.9690820614496867, "rewards/rejected": -2.1940759658813476, "step": 2473 }, { "epoch": 0.1311319004584846, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6382117.5, "logits/rejected": 4578782.666666667, "logps/chosen": -69.93639373779297, "logps/rejected": -233.50480143229166, "loss": 0.3129, "rewards/chosen": -0.5139293670654297, "rewards/margins": 0.887542724609375, "rewards/rejected": -1.4014720916748047, "step": 2474 }, { "epoch": 0.13118490446028674, "grad_norm": 60.75, "kl": 0.185760498046875, "learning_rate": 5e-07, "logits/chosen": -39662042.666666664, "logits/rejected": -31618570.0, "logps/chosen": -271.81386311848956, "logps/rejected": -538.4172973632812, "loss": 0.3863, "rewards/chosen": 0.19708732763926187, "rewards/margins": 1.8626299301783245, "rewards/rejected": -1.6655426025390625, "step": 2475 }, { "epoch": 0.13123790846208888, "grad_norm": 59.0, "kl": 1.8036231994628906, "learning_rate": 5e-07, "logits/chosen": -49717654.4, "logits/rejected": -14397322.666666666, "logps/chosen": -344.798583984375, "logps/rejected": -217.6039021809896, "loss": 0.3517, "rewards/chosen": 0.4730926513671875, "rewards/margins": 2.118189748128255, "rewards/rejected": -1.6450970967610676, "step": 2476 }, { "epoch": 0.13129091246389102, "grad_norm": 57.75, "kl": 0.30783843994140625, "learning_rate": 5e-07, "logits/chosen": -49273340.8, "logits/rejected": 5807272.0, "logps/chosen": -371.724951171875, "logps/rejected": -108.86539713541667, "loss": 0.2906, "rewards/chosen": 0.5945120334625245, "rewards/margins": 2.396657673517863, "rewards/rejected": -1.8021456400553386, "step": 2477 }, { "epoch": 0.13134391646569316, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25720028.0, "logits/rejected": -87113837.71428572, "logps/chosen": -229.22747802734375, "logps/rejected": -421.8221958705357, "loss": 0.2066, "rewards/chosen": -0.633410632610321, "rewards/margins": 1.308636622769492, "rewards/rejected": -1.9420472553798132, "step": 2478 }, { "epoch": 0.1313969204674953, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35345286.4, "logits/rejected": -41006250.666666664, "logps/chosen": -234.225244140625, "logps/rejected": -266.216552734375, "loss": 0.3879, "rewards/chosen": 0.2580064535140991, "rewards/margins": 1.1831706762313843, "rewards/rejected": -0.9251642227172852, "step": 2479 }, { "epoch": 0.13144992446929743, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17584920.0, "logits/rejected": -39641674.666666664, "logps/chosen": -216.01402282714844, "logps/rejected": -213.6331583658854, "loss": 0.3365, "rewards/chosen": -0.07234020531177521, "rewards/margins": 1.202897275487582, "rewards/rejected": -1.2752374807993572, "step": 2480 }, { "epoch": 0.13150292847109957, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48935684.0, "logits/rejected": -43269904.0, "logps/chosen": -428.7620544433594, "logps/rejected": -514.75439453125, "loss": 0.2891, "rewards/chosen": 0.06061439961194992, "rewards/margins": 2.381877712905407, "rewards/rejected": -2.321263313293457, "step": 2481 }, { "epoch": 0.1315559324729017, "grad_norm": 43.5, "kl": 0.2776336669921875, "learning_rate": 5e-07, "logits/chosen": -23805232.0, "logits/rejected": -33503512.0, "logps/chosen": -266.27764892578125, "logps/rejected": -652.8418579101562, "loss": 0.2511, "rewards/chosen": 0.392264723777771, "rewards/margins": 3.425453782081604, "rewards/rejected": -3.033189058303833, "step": 2482 }, { "epoch": 0.13160893647470384, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25740419.2, "logits/rejected": -22208108.0, "logps/chosen": -421.854248046875, "logps/rejected": -455.021240234375, "loss": 0.3506, "rewards/chosen": 0.16894195079803467, "rewards/margins": 2.4920337756474815, "rewards/rejected": -2.3230918248494468, "step": 2483 }, { "epoch": 0.13166194047650598, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10922434.666666666, "logits/rejected": -30994586.0, "logps/chosen": -224.97526041666666, "logps/rejected": -390.91864013671875, "loss": 0.5023, "rewards/chosen": -0.2944290240605672, "rewards/margins": 0.6276932160059612, "rewards/rejected": -0.9221222400665283, "step": 2484 }, { "epoch": 0.13171494447830812, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3505456.75, "logits/rejected": -10006744.0, "logps/chosen": -45.955413818359375, "logps/rejected": -328.4326578776042, "loss": 0.2701, "rewards/chosen": -0.3989149332046509, "rewards/margins": 1.4706329107284546, "rewards/rejected": -1.8695478439331055, "step": 2485 }, { "epoch": 0.13176794848011025, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21433722.0, "logits/rejected": -26942597.333333332, "logps/chosen": -108.19677734375, "logps/rejected": -273.56976318359375, "loss": 0.2888, "rewards/chosen": -0.19501619040966034, "rewards/margins": 1.3646588673194249, "rewards/rejected": -1.5596750577290852, "step": 2486 }, { "epoch": 0.1318209524819124, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14043862.0, "logits/rejected": -25103573.333333332, "logps/chosen": -84.66218566894531, "logps/rejected": -355.3872884114583, "loss": 0.2566, "rewards/chosen": 0.1764465570449829, "rewards/margins": 1.8838529189427693, "rewards/rejected": -1.7074063618977864, "step": 2487 }, { "epoch": 0.13187395648371453, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14423005.0, "logits/rejected": -21093506.666666668, "logps/chosen": -520.6338500976562, "logps/rejected": -205.0489501953125, "loss": 0.3509, "rewards/chosen": -0.1526840180158615, "rewards/margins": 0.8712714066108067, "rewards/rejected": -1.0239554246266682, "step": 2488 }, { "epoch": 0.13192696048551666, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3313302.0, "logits/rejected": -42325238.85714286, "logps/chosen": -0.13953590393066406, "logps/rejected": -282.36948939732144, "loss": 0.2733, "rewards/chosen": -0.0029630661010742188, "rewards/margins": 1.2806881495884486, "rewards/rejected": -1.2836512156895228, "step": 2489 }, { "epoch": 0.1319799644873188, "grad_norm": 47.5, "kl": 0.369232177734375, "learning_rate": 5e-07, "logits/chosen": -21946475.2, "logits/rejected": -1450281.3333333333, "logps/chosen": -290.9186279296875, "logps/rejected": -118.548583984375, "loss": 0.3553, "rewards/chosen": 0.29566006660461425, "rewards/margins": 2.2163881142934163, "rewards/rejected": -1.920728047688802, "step": 2490 }, { "epoch": 0.13203296848912094, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81531818.66666667, "logits/rejected": -5116331.0, "logps/chosen": -356.9019368489583, "logps/rejected": -241.88685607910156, "loss": 0.4568, "rewards/chosen": -0.011723265051841736, "rewards/margins": 0.8081684857606888, "rewards/rejected": -0.8198917508125305, "step": 2491 }, { "epoch": 0.13208597249092308, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -117417846.85714285, "logits/rejected": -8768348.0, "logps/chosen": -362.43798828125, "logps/rejected": -56.00458908081055, "loss": 0.4545, "rewards/chosen": 0.017178663185664585, "rewards/margins": 1.6510353173528398, "rewards/rejected": -1.6338566541671753, "step": 2492 }, { "epoch": 0.1321389764927252, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21562410.0, "logits/rejected": -21902086.0, "logps/chosen": -122.16019439697266, "logps/rejected": -271.44927978515625, "loss": 0.3452, "rewards/chosen": 0.32785913348197937, "rewards/margins": 1.699655145406723, "rewards/rejected": -1.3717960119247437, "step": 2493 }, { "epoch": 0.13219198049452735, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20773001.6, "logits/rejected": -28237717.333333332, "logps/chosen": -484.76806640625, "logps/rejected": -259.841064453125, "loss": 0.4701, "rewards/chosen": -0.46550354957580564, "rewards/margins": 0.876449728012085, "rewards/rejected": -1.3419532775878906, "step": 2494 }, { "epoch": 0.13224498449632946, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -99394352.0, "logits/rejected": -43838996.0, "logps/chosen": -395.7723388671875, "logps/rejected": -328.6009826660156, "loss": 0.3437, "rewards/chosen": -0.2309490442276001, "rewards/margins": 1.744653344154358, "rewards/rejected": -1.975602388381958, "step": 2495 }, { "epoch": 0.1322979884981316, "grad_norm": 53.75, "kl": 0.6995458602905273, "learning_rate": 5e-07, "logits/chosen": -23835418.666666668, "logits/rejected": -75968032.0, "logps/chosen": -321.6019287109375, "logps/rejected": -400.6474914550781, "loss": 0.3811, "rewards/chosen": 0.22821722428003946, "rewards/margins": 2.2072791854540506, "rewards/rejected": -1.9790619611740112, "step": 2496 }, { "epoch": 0.13235099249993373, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 34235154.666666664, "logits/rejected": -20299822.4, "logps/chosen": -220.0809122721354, "logps/rejected": -327.424169921875, "loss": 0.3356, "rewards/chosen": -0.09830919901529948, "rewards/margins": 1.3493455251057942, "rewards/rejected": -1.4476547241210938, "step": 2497 }, { "epoch": 0.13240399650173587, "grad_norm": 205.0, "kl": 1.2474632263183594, "learning_rate": 5e-07, "logits/chosen": -32476581.333333332, "logits/rejected": -40619208.0, "logps/chosen": -407.0077718098958, "logps/rejected": -405.53082275390625, "loss": 0.4042, "rewards/chosen": 0.3089088598887126, "rewards/margins": 2.595296541849772, "rewards/rejected": -2.2863876819610596, "step": 2498 }, { "epoch": 0.132457000503538, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7234322.0, "logits/rejected": -33617728.0, "logps/chosen": -262.2524108886719, "logps/rejected": -393.5022888183594, "loss": 0.3187, "rewards/chosen": 0.13998186588287354, "rewards/margins": 2.102469563484192, "rewards/rejected": -1.9624876976013184, "step": 2499 }, { "epoch": 0.13251000450534015, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12082658.0, "logits/rejected": -6212836.0, "logps/chosen": -311.77655029296875, "logps/rejected": -486.902099609375, "loss": 0.3269, "rewards/chosen": 0.08212623000144958, "rewards/margins": 1.7274719178676605, "rewards/rejected": -1.645345687866211, "step": 2500 }, { "epoch": 0.13256300850714228, "grad_norm": 49.5, "kl": 0.16880035400390625, "learning_rate": 5e-07, "logits/chosen": -34500652.0, "logits/rejected": -28471082.666666668, "logps/chosen": -184.4668731689453, "logps/rejected": -388.6802978515625, "loss": 0.3191, "rewards/chosen": 0.09812775254249573, "rewards/margins": 1.2981760402520497, "rewards/rejected": -1.200048287709554, "step": 2501 }, { "epoch": 0.13261601250894442, "grad_norm": 51.5, "kl": 0.051627159118652344, "learning_rate": 5e-07, "logits/chosen": -19412548.8, "logits/rejected": -31267736.0, "logps/chosen": -284.2791748046875, "logps/rejected": -80.81235758463542, "loss": 0.4165, "rewards/chosen": -0.016185152530670165, "rewards/margins": 1.1821876088778178, "rewards/rejected": -1.198372761408488, "step": 2502 }, { "epoch": 0.13266901651074656, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63506256.0, "logits/rejected": -24750604.8, "logps/chosen": -444.0195719401042, "logps/rejected": -409.62041015625, "loss": 0.2557, "rewards/chosen": -0.05443471670150757, "rewards/margins": 2.2917284607887267, "rewards/rejected": -2.3461631774902343, "step": 2503 }, { "epoch": 0.1327220205125487, "grad_norm": 63.5, "kl": 1.6893367767333984, "learning_rate": 5e-07, "logits/chosen": 4944751.666666667, "logits/rejected": -44766356.0, "logps/chosen": -382.2483723958333, "logps/rejected": -243.58558654785156, "loss": 0.3916, "rewards/chosen": 0.38145554065704346, "rewards/margins": 2.228534698486328, "rewards/rejected": -1.8470791578292847, "step": 2504 }, { "epoch": 0.13277502451435083, "grad_norm": 48.75, "kl": 0.6445846557617188, "learning_rate": 5e-07, "logits/chosen": 3591806.3333333335, "logits/rejected": -33518099.2, "logps/chosen": -356.9197591145833, "logps/rejected": -301.17724609375, "loss": 0.3321, "rewards/chosen": 0.039601137240727745, "rewards/margins": 1.7782524128754933, "rewards/rejected": -1.7386512756347656, "step": 2505 }, { "epoch": 0.13282802851615297, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19062691.2, "logits/rejected": -11489402.666666666, "logps/chosen": -282.3130615234375, "logps/rejected": -304.7276204427083, "loss": 0.4268, "rewards/chosen": -0.22476067543029785, "rewards/margins": 1.1856854915618897, "rewards/rejected": -1.4104461669921875, "step": 2506 }, { "epoch": 0.1328810325179551, "grad_norm": 58.5, "kl": 0.02640533447265625, "learning_rate": 5e-07, "logits/chosen": -43828963.2, "logits/rejected": -27682146.666666668, "logps/chosen": -312.06337890625, "logps/rejected": -312.8484293619792, "loss": 0.3304, "rewards/chosen": 0.45094904899597166, "rewards/margins": 1.888148482640584, "rewards/rejected": -1.4371994336446126, "step": 2507 }, { "epoch": 0.13293403651975724, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35481493.333333336, "logits/rejected": -41173372.8, "logps/chosen": -284.3173828125, "logps/rejected": -396.1153076171875, "loss": 0.2633, "rewards/chosen": 0.3689993619918823, "rewards/margins": 2.040998339653015, "rewards/rejected": -1.6719989776611328, "step": 2508 }, { "epoch": 0.13298704052155938, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54254041.6, "logits/rejected": -24226517.333333332, "logps/chosen": -441.53212890625, "logps/rejected": -495.9295247395833, "loss": 0.3602, "rewards/chosen": -0.1742175817489624, "rewards/margins": 2.665691335995992, "rewards/rejected": -2.8399089177449546, "step": 2509 }, { "epoch": 0.13304004452336152, "grad_norm": 58.25, "kl": 0.16602325439453125, "learning_rate": 5e-07, "logits/chosen": -46162604.8, "logits/rejected": -23926797.333333332, "logps/chosen": -316.14541015625, "logps/rejected": -276.32427978515625, "loss": 0.3582, "rewards/chosen": 0.003456878662109375, "rewards/margins": 1.8565942128499349, "rewards/rejected": -1.8531373341878254, "step": 2510 }, { "epoch": 0.13309304852516365, "grad_norm": 60.5, "kl": 0.056209564208984375, "learning_rate": 5e-07, "logits/chosen": -3922569.0, "logits/rejected": 10570500.0, "logps/chosen": -414.3975830078125, "logps/rejected": -189.9438018798828, "loss": 0.3492, "rewards/chosen": 0.13107222318649292, "rewards/margins": 1.3853622078895569, "rewards/rejected": -1.254289984703064, "step": 2511 }, { "epoch": 0.1331460525269658, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19780088.0, "logits/rejected": 84254944.0, "logps/chosen": -295.737060546875, "logps/rejected": -372.8902587890625, "loss": 0.387, "rewards/chosen": -0.1669330596923828, "rewards/margins": 1.250722885131836, "rewards/rejected": -1.4176559448242188, "step": 2512 }, { "epoch": 0.13319905652876793, "grad_norm": 57.25, "kl": 0.14083099365234375, "learning_rate": 5e-07, "logits/chosen": -30286560.0, "logits/rejected": -18068414.0, "logps/chosen": -440.7714029947917, "logps/rejected": -547.4067993164062, "loss": 0.4069, "rewards/chosen": 0.04644804199536642, "rewards/margins": 2.449339677890142, "rewards/rejected": -2.4028916358947754, "step": 2513 }, { "epoch": 0.13325206053057007, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28177779.2, "logits/rejected": -40138442.666666664, "logps/chosen": -184.128759765625, "logps/rejected": -480.2704671223958, "loss": 0.3611, "rewards/chosen": 0.25083284378051757, "rewards/margins": 1.6976413408915203, "rewards/rejected": -1.4468084971110027, "step": 2514 }, { "epoch": 0.1333050645323722, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55193848.0, "logits/rejected": -18650668.0, "logps/chosen": -252.0836181640625, "logps/rejected": -344.516845703125, "loss": 0.302, "rewards/chosen": 0.123046875, "rewards/margins": 2.054190158843994, "rewards/rejected": -1.9311432838439941, "step": 2515 }, { "epoch": 0.13335806853417434, "grad_norm": 52.0, "kl": 0.3827838897705078, "learning_rate": 5e-07, "logits/chosen": -36857912.0, "logits/rejected": -48410052.0, "logps/chosen": -292.21551513671875, "logps/rejected": -287.322265625, "loss": 0.3409, "rewards/chosen": 0.4047355651855469, "rewards/margins": 1.5859335660934448, "rewards/rejected": -1.181198000907898, "step": 2516 }, { "epoch": 0.13341107253597648, "grad_norm": 101.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17516928.0, "logits/rejected": 683346.25, "logps/chosen": -186.96197509765625, "logps/rejected": -212.71182250976562, "loss": 0.3694, "rewards/chosen": 0.2507917284965515, "rewards/margins": 1.1446852087974548, "rewards/rejected": -0.8938934803009033, "step": 2517 }, { "epoch": 0.13346407653777861, "grad_norm": 58.25, "kl": 0.0704498291015625, "learning_rate": 5e-07, "logits/chosen": -2013249.6, "logits/rejected": -51129312.0, "logps/chosen": -302.929833984375, "logps/rejected": -440.4112141927083, "loss": 0.2916, "rewards/chosen": 0.3850920915603638, "rewards/margins": 2.6363909959793093, "rewards/rejected": -2.2512989044189453, "step": 2518 }, { "epoch": 0.13351708053958075, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45491976.0, "logits/rejected": -23727296.0, "logps/chosen": -127.8497314453125, "logps/rejected": -210.89825439453125, "loss": 0.3522, "rewards/chosen": -0.07439309358596802, "rewards/margins": 1.5137450098991394, "rewards/rejected": -1.5881381034851074, "step": 2519 }, { "epoch": 0.13357008454138286, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46496696.0, "logits/rejected": -25140988.0, "logps/chosen": -468.27496337890625, "logps/rejected": -313.74176025390625, "loss": 0.2775, "rewards/chosen": 0.303213506937027, "rewards/margins": 2.4738315641880035, "rewards/rejected": -2.1706180572509766, "step": 2520 }, { "epoch": 0.133623088543185, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9585450.4, "logits/rejected": -51062464.0, "logps/chosen": -329.44521484375, "logps/rejected": -421.4881998697917, "loss": 0.3464, "rewards/chosen": 0.2650787353515625, "rewards/margins": 2.9839110692342126, "rewards/rejected": -2.71883233388265, "step": 2521 }, { "epoch": 0.13367609254498714, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7119099.333333333, "logits/rejected": -14759304.0, "logps/chosen": -185.5818888346354, "logps/rejected": -289.00244140625, "loss": 0.2968, "rewards/chosen": 0.3447156349817912, "rewards/margins": 1.786953870455424, "rewards/rejected": -1.4422382354736327, "step": 2522 }, { "epoch": 0.13372909654678927, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20392645.333333332, "logits/rejected": -84635468.8, "logps/chosen": -105.59228515625, "logps/rejected": -694.43134765625, "loss": 0.2376, "rewards/chosen": 0.21295547485351562, "rewards/margins": 3.069692611694336, "rewards/rejected": -2.8567371368408203, "step": 2523 }, { "epoch": 0.1337821005485914, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23050612.0, "logits/rejected": -42153908.0, "logps/chosen": -309.6407165527344, "logps/rejected": -513.8111572265625, "loss": 0.3332, "rewards/chosen": -0.1646268665790558, "rewards/margins": 1.79861781001091, "rewards/rejected": -1.9632446765899658, "step": 2524 }, { "epoch": 0.13383510455039355, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12758823.0, "logits/rejected": -11549186.285714285, "logps/chosen": -66.82849884033203, "logps/rejected": -181.95071847098214, "loss": 0.2717, "rewards/chosen": -0.5941917300224304, "rewards/margins": 0.8198180454117912, "rewards/rejected": -1.4140097754342216, "step": 2525 }, { "epoch": 0.13388810855219568, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9882793.6, "logits/rejected": -25109909.333333332, "logps/chosen": -191.0334716796875, "logps/rejected": -411.6227620442708, "loss": 0.3735, "rewards/chosen": -0.11051499843597412, "rewards/margins": 2.1697333256403604, "rewards/rejected": -2.2802483240763345, "step": 2526 }, { "epoch": 0.13394111255399782, "grad_norm": 58.25, "kl": 0.3850994110107422, "learning_rate": 5e-07, "logits/chosen": -531079.7083333334, "logits/rejected": -47455456.0, "logps/chosen": -112.93773396809895, "logps/rejected": -301.6522216796875, "loss": 0.3281, "rewards/chosen": 0.2810507615407308, "rewards/margins": 1.4961078484853108, "rewards/rejected": -1.21505708694458, "step": 2527 }, { "epoch": 0.13399411655579996, "grad_norm": 46.25, "kl": 0.454437255859375, "learning_rate": 5e-07, "logits/chosen": -28177004.8, "logits/rejected": -17457581.333333332, "logps/chosen": -219.8310791015625, "logps/rejected": -358.0650227864583, "loss": 0.3714, "rewards/chosen": 0.09626058340072632, "rewards/margins": 1.9627495169639588, "rewards/rejected": -1.8664889335632324, "step": 2528 }, { "epoch": 0.1340471205576021, "grad_norm": 67.5, "kl": 0.2200927734375, "learning_rate": 5e-07, "logits/chosen": -63386330.666666664, "logits/rejected": -35958361.6, "logps/chosen": -642.0257161458334, "logps/rejected": -536.87646484375, "loss": 0.2555, "rewards/chosen": 0.320819616317749, "rewards/margins": 2.534896230697632, "rewards/rejected": -2.214076614379883, "step": 2529 }, { "epoch": 0.13410012455940423, "grad_norm": 57.75, "kl": 0.1371631622314453, "learning_rate": 5e-07, "logits/chosen": -14512293.333333334, "logits/rejected": -33753592.0, "logps/chosen": -201.06302897135416, "logps/rejected": -419.48370361328125, "loss": 0.4199, "rewards/chosen": -0.0016423662503560383, "rewards/margins": 1.6894146005312602, "rewards/rejected": -1.6910569667816162, "step": 2530 }, { "epoch": 0.13415312856120637, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13868449.6, "logits/rejected": 16564058.666666666, "logps/chosen": -346.3158203125, "logps/rejected": -593.1787923177084, "loss": 0.2959, "rewards/chosen": 0.38397631645202634, "rewards/margins": 2.6726417700449625, "rewards/rejected": -2.288665453592936, "step": 2531 }, { "epoch": 0.1342061325630085, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31358725.333333332, "logits/rejected": -19237696.0, "logps/chosen": -542.8500162760416, "logps/rejected": -483.511181640625, "loss": 0.2046, "rewards/chosen": 0.30095672607421875, "rewards/margins": 3.0301641464233398, "rewards/rejected": -2.729207420349121, "step": 2532 }, { "epoch": 0.13425913656481064, "grad_norm": 45.25, "kl": 0.4317626953125, "learning_rate": 5e-07, "logits/chosen": 123565.6, "logits/rejected": -21792354.666666668, "logps/chosen": -151.685888671875, "logps/rejected": -214.9427490234375, "loss": 0.3988, "rewards/chosen": 0.00340728759765625, "rewards/margins": 1.3642690658569336, "rewards/rejected": -1.3608617782592773, "step": 2533 }, { "epoch": 0.13431214056661278, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1991068.5, "logits/rejected": -15461476.57142857, "logps/chosen": -61.492279052734375, "logps/rejected": -282.71644810267856, "loss": 0.2329, "rewards/chosen": -0.34185829758644104, "rewards/margins": 1.7569309175014496, "rewards/rejected": -2.0987892150878906, "step": 2534 }, { "epoch": 0.13436514456841492, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -117594805.33333333, "logits/rejected": -18035502.4, "logps/chosen": -331.6993408203125, "logps/rejected": -406.4896240234375, "loss": 0.3035, "rewards/chosen": -0.00830384095509847, "rewards/margins": 1.6788421551386516, "rewards/rejected": -1.68714599609375, "step": 2535 }, { "epoch": 0.13441814857021706, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14497024.0, "logits/rejected": -25340808.0, "logps/chosen": -207.43701171875, "logps/rejected": -334.119580078125, "loss": 0.3195, "rewards/chosen": -0.29640114307403564, "rewards/margins": 1.741184878349304, "rewards/rejected": -2.0375860214233397, "step": 2536 }, { "epoch": 0.1344711525720192, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34783526.4, "logits/rejected": -10257228.666666666, "logps/chosen": -412.214111328125, "logps/rejected": -202.23197428385416, "loss": 0.34, "rewards/chosen": 0.18929061889648438, "rewards/margins": 1.9152570724487306, "rewards/rejected": -1.725966453552246, "step": 2537 }, { "epoch": 0.13452415657382133, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12890030.666666666, "logits/rejected": -30105734.4, "logps/chosen": -237.01399739583334, "logps/rejected": -296.9654541015625, "loss": 0.289, "rewards/chosen": 0.33050572872161865, "rewards/margins": 1.7924934148788452, "rewards/rejected": -1.4619876861572265, "step": 2538 }, { "epoch": 0.13457716057562347, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20948035.2, "logits/rejected": -97088405.33333333, "logps/chosen": -324.0146484375, "logps/rejected": -586.9595540364584, "loss": 0.3489, "rewards/chosen": 0.10730105638504028, "rewards/margins": 2.1387068231900535, "rewards/rejected": -2.031405766805013, "step": 2539 }, { "epoch": 0.1346301645774256, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86148928.0, "logits/rejected": -41687354.666666664, "logps/chosen": -311.1051025390625, "logps/rejected": -420.5189208984375, "loss": 0.216, "rewards/chosen": 0.15512849390506744, "rewards/margins": 2.167691245675087, "rewards/rejected": -2.0125627517700195, "step": 2540 }, { "epoch": 0.13468316857922774, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2833447.0, "logits/rejected": -31535112.0, "logps/chosen": -111.21561431884766, "logps/rejected": -113.68765258789062, "loss": 0.3676, "rewards/chosen": 0.06782661378383636, "rewards/margins": 1.3734032958745956, "rewards/rejected": -1.3055766820907593, "step": 2541 }, { "epoch": 0.13473617258102988, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21508296.0, "logits/rejected": -2145045.0, "logps/chosen": -189.7327677408854, "logps/rejected": -539.31103515625, "loss": 0.4079, "rewards/chosen": -0.016590312123298645, "rewards/margins": 1.9614217728376389, "rewards/rejected": -1.9780120849609375, "step": 2542 }, { "epoch": 0.13478917658283202, "grad_norm": 83.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44807066.666666664, "logits/rejected": -42461180.8, "logps/chosen": -1004.3193359375, "logps/rejected": -389.628759765625, "loss": 0.2161, "rewards/chosen": 0.5783490339914957, "rewards/margins": 2.8001443068186442, "rewards/rejected": -2.2217952728271486, "step": 2543 }, { "epoch": 0.13484218058463415, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 382940.6, "logits/rejected": -13985157.333333334, "logps/chosen": -309.8267578125, "logps/rejected": -207.8543701171875, "loss": 0.4259, "rewards/chosen": -0.1791623830795288, "rewards/margins": 1.4300605058670044, "rewards/rejected": -1.6092228889465332, "step": 2544 }, { "epoch": 0.1348951845864363, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34785512.0, "logits/rejected": -19549132.0, "logps/chosen": -149.24636840820312, "logps/rejected": -282.18109130859375, "loss": 0.3255, "rewards/chosen": 0.031169461086392403, "rewards/margins": 2.164560841396451, "rewards/rejected": -2.1333913803100586, "step": 2545 }, { "epoch": 0.1349481885882384, "grad_norm": 61.5, "kl": 0.7009468078613281, "learning_rate": 5e-07, "logits/chosen": -20171820.8, "logits/rejected": -52182101.333333336, "logps/chosen": -356.4061279296875, "logps/rejected": -520.9312744140625, "loss": 0.2941, "rewards/chosen": 0.7310519695281983, "rewards/margins": 2.3239581267038982, "rewards/rejected": -1.5929061571757, "step": 2546 }, { "epoch": 0.13500119259004054, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28482972.0, "logits/rejected": -25260440.0, "logps/chosen": -224.0340576171875, "logps/rejected": -405.83026123046875, "loss": 0.3512, "rewards/chosen": 0.1040092408657074, "rewards/margins": 1.6847158372402191, "rewards/rejected": -1.5807065963745117, "step": 2547 }, { "epoch": 0.13505419659184267, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3222478.6666666665, "logits/rejected": -31811225.6, "logps/chosen": -241.21675618489584, "logps/rejected": -363.231103515625, "loss": 0.2995, "rewards/chosen": 0.14341368277867636, "rewards/margins": 2.022680421670278, "rewards/rejected": -1.8792667388916016, "step": 2548 }, { "epoch": 0.1351072005936448, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22747077.333333332, "logits/rejected": -11048950.0, "logps/chosen": -162.7781982421875, "logps/rejected": -373.8235168457031, "loss": 0.4005, "rewards/chosen": 0.2319784164428711, "rewards/margins": 2.1629446744918823, "rewards/rejected": -1.9309662580490112, "step": 2549 }, { "epoch": 0.13516020459544695, "grad_norm": 61.5, "kl": 0.598968505859375, "learning_rate": 5e-07, "logits/chosen": -26122560.0, "logits/rejected": -17109934.666666668, "logps/chosen": -305.700732421875, "logps/rejected": -201.91288248697916, "loss": 0.3863, "rewards/chosen": 0.30205426216125486, "rewards/margins": 1.2458369890848795, "rewards/rejected": -0.9437827269236246, "step": 2550 }, { "epoch": 0.13521320859724908, "grad_norm": 53.5, "kl": 1.5368022918701172, "learning_rate": 5e-07, "logits/chosen": -38073013.333333336, "logits/rejected": 1767364.4, "logps/chosen": -208.40132649739584, "logps/rejected": -109.05908203125, "loss": 0.4699, "rewards/chosen": -0.052048111955324806, "rewards/margins": 0.4439374436934789, "rewards/rejected": -0.4959855556488037, "step": 2551 }, { "epoch": 0.13526621259905122, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22985342.4, "logits/rejected": -21619856.0, "logps/chosen": -511.895654296875, "logps/rejected": -278.2939046223958, "loss": 0.2962, "rewards/chosen": 0.5675531387329101, "rewards/margins": 2.3805590629577638, "rewards/rejected": -1.8130059242248535, "step": 2552 }, { "epoch": 0.13531921660085336, "grad_norm": 46.5, "kl": 0.35043907165527344, "learning_rate": 5e-07, "logits/chosen": -23737554.666666668, "logits/rejected": 4159206.75, "logps/chosen": -217.07596842447916, "logps/rejected": -303.04534912109375, "loss": 0.3981, "rewards/chosen": 0.1428558131059011, "rewards/margins": 1.720342328151067, "rewards/rejected": -1.577486515045166, "step": 2553 }, { "epoch": 0.1353722206026555, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14220299.2, "logits/rejected": -37476720.0, "logps/chosen": -147.75885009765625, "logps/rejected": -538.8115641276041, "loss": 0.4015, "rewards/chosen": -0.2316453456878662, "rewards/margins": 1.532429297765096, "rewards/rejected": -1.7640746434529622, "step": 2554 }, { "epoch": 0.13542522460445763, "grad_norm": 54.75, "kl": 0.12296009063720703, "learning_rate": 5e-07, "logits/chosen": -16719118.666666666, "logits/rejected": -50899708.0, "logps/chosen": -316.19960530598956, "logps/rejected": -525.4302978515625, "loss": 0.3644, "rewards/chosen": 0.2673158645629883, "rewards/margins": 2.118228316307068, "rewards/rejected": -1.8509124517440796, "step": 2555 }, { "epoch": 0.13547822860625977, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26581876.0, "logits/rejected": -24853180.0, "logps/chosen": -183.49502563476562, "logps/rejected": -403.03131103515625, "loss": 0.3169, "rewards/chosen": 0.15956804156303406, "rewards/margins": 2.0663022696971893, "rewards/rejected": -1.9067342281341553, "step": 2556 }, { "epoch": 0.1355312326080619, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12720610.666666666, "logits/rejected": -65439334.4, "logps/chosen": -149.24163818359375, "logps/rejected": -468.84072265625, "loss": 0.3432, "rewards/chosen": -0.4915485382080078, "rewards/margins": 1.1601442337036132, "rewards/rejected": -1.651692771911621, "step": 2557 }, { "epoch": 0.13558423660986405, "grad_norm": 58.25, "kl": 0.7509078979492188, "learning_rate": 5e-07, "logits/chosen": -32420240.0, "logits/rejected": -43790985.6, "logps/chosen": -417.9488118489583, "logps/rejected": -540.19423828125, "loss": 0.2215, "rewards/chosen": 0.4877156416575114, "rewards/margins": 2.908896176020304, "rewards/rejected": -2.421180534362793, "step": 2558 }, { "epoch": 0.13563724061166618, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17668864.0, "logits/rejected": -29758620.8, "logps/chosen": -390.9440104166667, "logps/rejected": -317.864111328125, "loss": 0.3193, "rewards/chosen": 0.31765441099802655, "rewards/margins": 1.9019006649653118, "rewards/rejected": -1.5842462539672852, "step": 2559 }, { "epoch": 0.13569024461346832, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24044386.0, "logits/rejected": -2710736.0, "logps/chosen": -269.45361328125, "logps/rejected": -175.96609497070312, "loss": 0.3663, "rewards/chosen": 0.25118622183799744, "rewards/margins": 1.1717616021633148, "rewards/rejected": -0.9205753803253174, "step": 2560 }, { "epoch": 0.13574324861527046, "grad_norm": 44.75, "kl": 0.2761268615722656, "learning_rate": 5e-07, "logits/chosen": -24258446.4, "logits/rejected": -34587978.666666664, "logps/chosen": -260.5136962890625, "logps/rejected": -354.0843505859375, "loss": 0.3077, "rewards/chosen": 0.5339737892150879, "rewards/margins": 2.319484488169352, "rewards/rejected": -1.7855106989542644, "step": 2561 }, { "epoch": 0.1357962526170726, "grad_norm": 57.0, "kl": 0.004360198974609375, "learning_rate": 5e-07, "logits/chosen": -37572313.6, "logits/rejected": -10302737.333333334, "logps/chosen": -316.70263671875, "logps/rejected": -169.06895955403647, "loss": 0.397, "rewards/chosen": 0.011496278643608093, "rewards/margins": 1.3419497539599736, "rewards/rejected": -1.3304534753163655, "step": 2562 }, { "epoch": 0.13584925661887473, "grad_norm": 62.75, "kl": 0.7287521362304688, "learning_rate": 5e-07, "logits/chosen": -51861075.2, "logits/rejected": -212488170.66666666, "logps/chosen": -345.6302001953125, "logps/rejected": -343.3706868489583, "loss": 0.3337, "rewards/chosen": 0.3099392890930176, "rewards/margins": 2.1998671531677245, "rewards/rejected": -1.889927864074707, "step": 2563 }, { "epoch": 0.13590226062067687, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20679890.0, "logits/rejected": -24257466.666666668, "logps/chosen": -608.2059936523438, "logps/rejected": -239.1472371419271, "loss": 0.271, "rewards/chosen": 0.7321915030479431, "rewards/margins": 1.9686441620190938, "rewards/rejected": -1.2364526589711506, "step": 2564 }, { "epoch": 0.135955264622479, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45029920.0, "logits/rejected": -28459520.0, "logps/chosen": -276.8458658854167, "logps/rejected": -471.1033203125, "loss": 0.3637, "rewards/chosen": -0.09200223286946614, "rewards/margins": 1.7435852686564128, "rewards/rejected": -1.835587501525879, "step": 2565 }, { "epoch": 0.13600826862428114, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27201148.0, "logits/rejected": -44346709.333333336, "logps/chosen": -491.2255859375, "logps/rejected": -357.2476806640625, "loss": 0.2055, "rewards/chosen": 0.5909317135810852, "rewards/margins": 2.697284201780955, "rewards/rejected": -2.1063524881998696, "step": 2566 }, { "epoch": 0.13606127262608328, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23800312.0, "logits/rejected": -12049782.0, "logps/chosen": -267.2882080078125, "logps/rejected": -257.0543212890625, "loss": 0.3928, "rewards/chosen": 0.16673157612482706, "rewards/margins": 1.640005091826121, "rewards/rejected": -1.473273515701294, "step": 2567 }, { "epoch": 0.13611427662788542, "grad_norm": 42.5, "kl": 0.04798698425292969, "learning_rate": 5e-07, "logits/chosen": -7239302.5, "logits/rejected": -4282245.5, "logps/chosen": -169.18698120117188, "logps/rejected": -85.76949310302734, "loss": 0.4121, "rewards/chosen": 0.17021606862545013, "rewards/margins": 0.7476606220006943, "rewards/rejected": -0.5774445533752441, "step": 2568 }, { "epoch": 0.13616728062968755, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4718912.0, "logits/rejected": -12231686.0, "logps/chosen": -307.29327392578125, "logps/rejected": -174.3244171142578, "loss": 0.3182, "rewards/chosen": 0.25844621658325195, "rewards/margins": 1.7570695877075195, "rewards/rejected": -1.4986233711242676, "step": 2569 }, { "epoch": 0.1362202846314897, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7335272.666666667, "logits/rejected": -32575465.6, "logps/chosen": -261.4337972005208, "logps/rejected": -336.049853515625, "loss": 0.3415, "rewards/chosen": -0.167718768119812, "rewards/margins": 1.1893991708755494, "rewards/rejected": -1.3571179389953614, "step": 2570 }, { "epoch": 0.1362732886332918, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9421426.0, "logits/rejected": -13949975.0, "logps/chosen": -300.42803955078125, "logps/rejected": -135.8936767578125, "loss": 0.3635, "rewards/chosen": 0.06297950446605682, "rewards/margins": 1.239140197634697, "rewards/rejected": -1.1761606931686401, "step": 2571 }, { "epoch": 0.13632629263509394, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3768193.6, "logits/rejected": 17893637.333333332, "logps/chosen": -316.6276123046875, "logps/rejected": -483.9181315104167, "loss": 0.354, "rewards/chosen": 0.36359176635742185, "rewards/margins": 1.7017977396647135, "rewards/rejected": -1.3382059733072917, "step": 2572 }, { "epoch": 0.13637929663689607, "grad_norm": 58.0, "kl": 2.1061248779296875, "learning_rate": 5e-07, "logits/chosen": -26019770.0, "logits/rejected": -35948864.0, "logps/chosen": -676.0340576171875, "logps/rejected": -499.7660217285156, "loss": 0.2106, "rewards/chosen": 0.9746744632720947, "rewards/margins": 3.0782580375671387, "rewards/rejected": -2.103583574295044, "step": 2573 }, { "epoch": 0.1364323006386982, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28454268.0, "logits/rejected": -24244042.0, "logps/chosen": -457.2138366699219, "logps/rejected": -391.416015625, "loss": 0.3039, "rewards/chosen": 0.6519032716751099, "rewards/margins": 2.192656397819519, "rewards/rejected": -1.5407531261444092, "step": 2574 }, { "epoch": 0.13648530464050035, "grad_norm": 66.5, "kl": 0.1121673583984375, "learning_rate": 5e-07, "logits/chosen": -65398602.666666664, "logits/rejected": -4955842.5, "logps/chosen": -413.8936360677083, "logps/rejected": -284.02130126953125, "loss": 0.3437, "rewards/chosen": 0.42638858159383136, "rewards/margins": 2.0058948596318564, "rewards/rejected": -1.579506278038025, "step": 2575 }, { "epoch": 0.13653830864230249, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19916334.0, "logits/rejected": -25737414.0, "logps/chosen": -358.84442138671875, "logps/rejected": -272.2534484863281, "loss": 0.2653, "rewards/chosen": 0.6497915387153625, "rewards/margins": 2.4541892409324646, "rewards/rejected": -1.804397702217102, "step": 2576 }, { "epoch": 0.13659131264410462, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36413353.6, "logits/rejected": -61232154.666666664, "logps/chosen": -407.7458984375, "logps/rejected": -412.9003092447917, "loss": 0.3892, "rewards/chosen": 0.19315645694732667, "rewards/margins": 1.3636658747990926, "rewards/rejected": -1.1705094178517659, "step": 2577 }, { "epoch": 0.13664431664590676, "grad_norm": 48.75, "kl": 0.19823455810546875, "learning_rate": 5e-07, "logits/chosen": -33524092.8, "logits/rejected": -37900957.333333336, "logps/chosen": -244.693115234375, "logps/rejected": -405.6525065104167, "loss": 0.2816, "rewards/chosen": 0.4909278392791748, "rewards/margins": 3.077788654963175, "rewards/rejected": -2.5868608156840005, "step": 2578 }, { "epoch": 0.1366973206477089, "grad_norm": 44.25, "kl": 0.34331321716308594, "learning_rate": 5e-07, "logits/chosen": -15758148.0, "logits/rejected": -23040658.0, "logps/chosen": -96.45878601074219, "logps/rejected": -374.17974853515625, "loss": 0.3698, "rewards/chosen": 0.1886790692806244, "rewards/margins": 1.4190405309200287, "rewards/rejected": -1.2303614616394043, "step": 2579 }, { "epoch": 0.13675032464951103, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47151616.0, "logits/rejected": -35000373.333333336, "logps/chosen": -367.0751953125, "logps/rejected": -390.1593424479167, "loss": 0.2917, "rewards/chosen": 0.026536554098129272, "rewards/margins": 1.5182434022426605, "rewards/rejected": -1.4917068481445312, "step": 2580 }, { "epoch": 0.13680332865131317, "grad_norm": 62.75, "kl": 0.7213115692138672, "learning_rate": 5e-07, "logits/chosen": -60184532.0, "logits/rejected": -8927491.0, "logps/chosen": -566.633544921875, "logps/rejected": -230.35728454589844, "loss": 0.3114, "rewards/chosen": 0.6647037267684937, "rewards/margins": 1.811099648475647, "rewards/rejected": -1.1463959217071533, "step": 2581 }, { "epoch": 0.1368563326531153, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17444733.333333332, "logits/rejected": -21190172.8, "logps/chosen": -319.4464518229167, "logps/rejected": -375.4714599609375, "loss": 0.2598, "rewards/chosen": 0.5062448978424072, "rewards/margins": 2.25548939704895, "rewards/rejected": -1.749244499206543, "step": 2582 }, { "epoch": 0.13690933665491745, "grad_norm": 48.25, "kl": 0.374755859375, "learning_rate": 5e-07, "logits/chosen": -9965548.0, "logits/rejected": -21830710.0, "logps/chosen": -291.5639953613281, "logps/rejected": -307.2397766113281, "loss": 0.316, "rewards/chosen": 0.13506507873535156, "rewards/margins": 2.1429052352905273, "rewards/rejected": -2.007840156555176, "step": 2583 }, { "epoch": 0.13696234065671958, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30546393.6, "logits/rejected": -101786560.0, "logps/chosen": -279.6650390625, "logps/rejected": -369.130859375, "loss": 0.3563, "rewards/chosen": 0.09483887553215027, "rewards/margins": 2.0674121558666227, "rewards/rejected": -1.9725732803344727, "step": 2584 }, { "epoch": 0.13701534465852172, "grad_norm": 169.0, "kl": 0.04409217834472656, "learning_rate": 5e-07, "logits/chosen": -50512341.333333336, "logits/rejected": -16374954.0, "logps/chosen": -421.240966796875, "logps/rejected": -498.39324951171875, "loss": 0.3086, "rewards/chosen": 0.4649211565653483, "rewards/margins": 3.407857815424601, "rewards/rejected": -2.942936658859253, "step": 2585 }, { "epoch": 0.13706834866032386, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55867968.0, "logits/rejected": 17174208.0, "logps/chosen": -376.759765625, "logps/rejected": -276.33447265625, "loss": 0.3295, "rewards/chosen": 0.17637077967325845, "rewards/margins": 1.3384241739908855, "rewards/rejected": -1.162053394317627, "step": 2586 }, { "epoch": 0.137121352662126, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43577384.0, "logits/rejected": -24493444.0, "logps/chosen": -263.4698181152344, "logps/rejected": -520.3921508789062, "loss": 0.332, "rewards/chosen": -0.07723847031593323, "rewards/margins": 2.2295044362545013, "rewards/rejected": -2.3067429065704346, "step": 2587 }, { "epoch": 0.13717435666392813, "grad_norm": 64.0, "kl": 1.0496139526367188, "learning_rate": 5e-07, "logits/chosen": -84161626.66666667, "logits/rejected": -104796048.0, "logps/chosen": -432.2735188802083, "logps/rejected": -462.3516540527344, "loss": 0.3759, "rewards/chosen": 0.4299654960632324, "rewards/margins": 1.8798693418502808, "rewards/rejected": -1.4499038457870483, "step": 2588 }, { "epoch": 0.13722736066573027, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56244837.333333336, "logits/rejected": -28803641.6, "logps/chosen": -538.9659830729166, "logps/rejected": -352.4529052734375, "loss": 0.2848, "rewards/chosen": 0.3929067055384318, "rewards/margins": 1.9026711861292522, "rewards/rejected": -1.5097644805908204, "step": 2589 }, { "epoch": 0.1372803646675324, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10312713.0, "logits/rejected": -30908758.0, "logps/chosen": -198.66268920898438, "logps/rejected": -299.4817810058594, "loss": 0.392, "rewards/chosen": -0.12056426703929901, "rewards/margins": 1.1587813645601273, "rewards/rejected": -1.2793456315994263, "step": 2590 }, { "epoch": 0.13733336866933454, "grad_norm": 88.0, "kl": 0.8496685028076172, "learning_rate": 5e-07, "logits/chosen": -54099433.14285714, "logits/rejected": -289542.21875, "logps/chosen": -345.1144321986607, "logps/rejected": -478.6109619140625, "loss": 0.3527, "rewards/chosen": 0.6736418860299247, "rewards/margins": 1.7612762110573905, "rewards/rejected": -1.0876343250274658, "step": 2591 }, { "epoch": 0.13738637267113668, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76592469.33333333, "logits/rejected": -40771433.6, "logps/chosen": -315.0811360677083, "logps/rejected": -450.13740234375, "loss": 0.2666, "rewards/chosen": 0.373312513033549, "rewards/margins": 2.2571244637171426, "rewards/rejected": -1.8838119506835938, "step": 2592 }, { "epoch": 0.13743937667293882, "grad_norm": 62.25, "kl": 1.3162155151367188, "learning_rate": 5e-07, "logits/chosen": -25394069.333333332, "logits/rejected": -2857607.0, "logps/chosen": -461.9373779296875, "logps/rejected": -92.16581726074219, "loss": 0.3549, "rewards/chosen": 0.32307934761047363, "rewards/margins": 2.6737852096557617, "rewards/rejected": -2.350705862045288, "step": 2593 }, { "epoch": 0.13749238067474095, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32093904.0, "logits/rejected": -63483801.6, "logps/chosen": -347.9794108072917, "logps/rejected": -380.070556640625, "loss": 0.3273, "rewards/chosen": 0.01329079270362854, "rewards/margins": 1.4607734739780427, "rewards/rejected": -1.4474826812744142, "step": 2594 }, { "epoch": 0.1375453846765431, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32393600.0, "logits/rejected": -9051553.333333334, "logps/chosen": -203.559716796875, "logps/rejected": -156.9628702799479, "loss": 0.3752, "rewards/chosen": 0.2232672929763794, "rewards/margins": 1.306600308418274, "rewards/rejected": -1.0833330154418945, "step": 2595 }, { "epoch": 0.1375983886783452, "grad_norm": 53.75, "kl": 0.44168758392333984, "learning_rate": 5e-07, "logits/chosen": -24719386.666666668, "logits/rejected": -3654988.0, "logps/chosen": -312.303955078125, "logps/rejected": -187.15045166015625, "loss": 0.4131, "rewards/chosen": 0.07240527868270874, "rewards/margins": 1.6041128039360046, "rewards/rejected": -1.531707525253296, "step": 2596 }, { "epoch": 0.13765139268014734, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25356280.0, "logits/rejected": -40506557.333333336, "logps/chosen": -213.66827392578125, "logps/rejected": -237.5224812825521, "loss": 0.3375, "rewards/chosen": -0.6493414640426636, "rewards/margins": 0.7624064683914185, "rewards/rejected": -1.411747932434082, "step": 2597 }, { "epoch": 0.13770439668194948, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18368116.0, "logits/rejected": -8172778.5, "logps/chosen": -313.4927978515625, "logps/rejected": -258.8584899902344, "loss": 0.324, "rewards/chosen": 0.1746324896812439, "rewards/margins": 1.9837670922279358, "rewards/rejected": -1.809134602546692, "step": 2598 }, { "epoch": 0.1377574006837516, "grad_norm": 51.25, "kl": 0.207122802734375, "learning_rate": 5e-07, "logits/chosen": -37523040.0, "logits/rejected": -106250453.33333333, "logps/chosen": -409.919921875, "logps/rejected": -396.4894205729167, "loss": 0.2932, "rewards/chosen": 0.4799102783203125, "rewards/margins": 3.0889745394388832, "rewards/rejected": -2.609064261118571, "step": 2599 }, { "epoch": 0.13781040468555375, "grad_norm": 58.5, "kl": 0.5682182312011719, "learning_rate": 5e-07, "logits/chosen": -20916918.4, "logits/rejected": -18851409.333333332, "logps/chosen": -350.9013427734375, "logps/rejected": -575.4497477213541, "loss": 0.3494, "rewards/chosen": 0.5011314392089844, "rewards/margins": 1.8516960779825848, "rewards/rejected": -1.3505646387736003, "step": 2600 }, { "epoch": 0.1378634086873559, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10727676.0, "logits/rejected": -28628578.0, "logps/chosen": -191.7246551513672, "logps/rejected": -266.4484558105469, "loss": 0.3278, "rewards/chosen": 0.0356019027531147, "rewards/margins": 1.6694814451038837, "rewards/rejected": -1.633879542350769, "step": 2601 }, { "epoch": 0.13791641268915802, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13318114.666666666, "logits/rejected": -45395200.0, "logps/chosen": -234.343994140625, "logps/rejected": -531.928369140625, "loss": 0.2447, "rewards/chosen": 0.29873273770014447, "rewards/margins": 2.5059932510058083, "rewards/rejected": -2.207260513305664, "step": 2602 }, { "epoch": 0.13796941669096016, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70513691.42857143, "logits/rejected": -45455600.0, "logps/chosen": -174.14301409040178, "logps/rejected": -86.91468811035156, "loss": 0.5012, "rewards/chosen": -0.011584247861589705, "rewards/margins": 0.10178016551903316, "rewards/rejected": -0.11336441338062286, "step": 2603 }, { "epoch": 0.1380224206927623, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14165648.0, "logits/rejected": -49436772.0, "logps/chosen": -127.94757843017578, "logps/rejected": -408.68316650390625, "loss": 0.3616, "rewards/chosen": -0.34370118379592896, "rewards/margins": 1.496769368648529, "rewards/rejected": -1.840470552444458, "step": 2604 }, { "epoch": 0.13807542469456444, "grad_norm": 48.75, "kl": 1.4604172706604004, "learning_rate": 5e-07, "logits/chosen": -9255597.714285715, "logits/rejected": -52677576.0, "logps/chosen": -135.22820172991072, "logps/rejected": -529.7166748046875, "loss": 0.4614, "rewards/chosen": 0.1289996930531093, "rewards/margins": 2.0444691010883878, "rewards/rejected": -1.9154694080352783, "step": 2605 }, { "epoch": 0.13812842869636657, "grad_norm": 61.25, "kl": 0.7856674194335938, "learning_rate": 5e-07, "logits/chosen": -69900440.0, "logits/rejected": -12392803.0, "logps/chosen": -335.76190185546875, "logps/rejected": -240.8904266357422, "loss": 0.3255, "rewards/chosen": 0.1557365357875824, "rewards/margins": 2.0087283551692963, "rewards/rejected": -1.8529918193817139, "step": 2606 }, { "epoch": 0.1381814326981687, "grad_norm": 49.75, "kl": 1.0560722351074219, "learning_rate": 5e-07, "logits/chosen": -45823616.0, "logits/rejected": -31073642.666666668, "logps/chosen": -299.99912109375, "logps/rejected": -653.3694661458334, "loss": 0.3065, "rewards/chosen": 0.5009312152862548, "rewards/margins": 2.779326868057251, "rewards/rejected": -2.278395652770996, "step": 2607 }, { "epoch": 0.13823443669997085, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42280389.333333336, "logits/rejected": -42840392.0, "logps/chosen": -247.59320068359375, "logps/rejected": -321.3061218261719, "loss": 0.4473, "rewards/chosen": -0.15420458714167276, "rewards/margins": 1.5519981582959492, "rewards/rejected": -1.706202745437622, "step": 2608 }, { "epoch": 0.13828744070177298, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50385194.666666664, "logits/rejected": -43526524.0, "logps/chosen": -353.0414632161458, "logps/rejected": -411.0287780761719, "loss": 0.4564, "rewards/chosen": -0.10282750924428304, "rewards/margins": 1.6575942436854045, "rewards/rejected": -1.7604217529296875, "step": 2609 }, { "epoch": 0.13834044470357512, "grad_norm": 51.25, "kl": 0.34606361389160156, "learning_rate": 5e-07, "logits/chosen": -2985510.5, "logits/rejected": -26312642.0, "logps/chosen": -188.80506896972656, "logps/rejected": -415.157470703125, "loss": 0.3526, "rewards/chosen": 0.19183310866355896, "rewards/margins": 1.5358011424541473, "rewards/rejected": -1.3439680337905884, "step": 2610 }, { "epoch": 0.13839344870537726, "grad_norm": 56.25, "kl": 0.05663299560546875, "learning_rate": 5e-07, "logits/chosen": -38394668.0, "logits/rejected": -37592872.0, "logps/chosen": -290.2059631347656, "logps/rejected": -492.2819519042969, "loss": 0.2961, "rewards/chosen": 0.04784204065799713, "rewards/margins": 2.7783398777246475, "rewards/rejected": -2.7304978370666504, "step": 2611 }, { "epoch": 0.1384464527071794, "grad_norm": 74.5, "kl": 0.022357940673828125, "learning_rate": 5e-07, "logits/chosen": -72336101.33333333, "logits/rejected": -17347424.0, "logps/chosen": -652.9872233072916, "logps/rejected": -200.51998901367188, "loss": 0.3553, "rewards/chosen": 0.4181045691172282, "rewards/margins": 1.8806137243906658, "rewards/rejected": -1.4625091552734375, "step": 2612 }, { "epoch": 0.13849945670898153, "grad_norm": 51.25, "kl": 0.27478790283203125, "learning_rate": 5e-07, "logits/chosen": -33539808.0, "logits/rejected": -29382064.0, "logps/chosen": -206.42886352539062, "logps/rejected": -264.38665771484375, "loss": 0.3337, "rewards/chosen": 0.017168045043945312, "rewards/margins": 1.8694500923156738, "rewards/rejected": -1.8522820472717285, "step": 2613 }, { "epoch": 0.13855246071078367, "grad_norm": 52.0, "kl": 0.07442855834960938, "learning_rate": 5e-07, "logits/chosen": -13824384.0, "logits/rejected": -15828202.666666666, "logps/chosen": -239.5079345703125, "logps/rejected": -229.86893717447916, "loss": 0.3428, "rewards/chosen": 0.37022829055786133, "rewards/margins": 1.9924133618672688, "rewards/rejected": -1.6221850713094075, "step": 2614 }, { "epoch": 0.1386054647125858, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35268250.666666664, "logits/rejected": -47835264.0, "logps/chosen": -234.8627726236979, "logps/rejected": -452.833935546875, "loss": 0.3257, "rewards/chosen": 0.042736053466796875, "rewards/margins": 1.693477249145508, "rewards/rejected": -1.650741195678711, "step": 2615 }, { "epoch": 0.13865846871438794, "grad_norm": 61.5, "kl": 0.9424819946289062, "learning_rate": 5e-07, "logits/chosen": -42699081.6, "logits/rejected": 3258895.6666666665, "logps/chosen": -328.85908203125, "logps/rejected": -527.3718668619791, "loss": 0.4139, "rewards/chosen": 0.05705230236053467, "rewards/margins": 1.5311387459437054, "rewards/rejected": -1.4740864435831706, "step": 2616 }, { "epoch": 0.13871147271619008, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 37495116.0, "logits/rejected": -58174282.666666664, "logps/chosen": -469.810302734375, "logps/rejected": -350.6276041666667, "loss": 0.2698, "rewards/chosen": -0.1326141357421875, "rewards/margins": 1.8774328231811523, "rewards/rejected": -2.01004695892334, "step": 2617 }, { "epoch": 0.13876447671799222, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40867168.0, "logits/rejected": -35481475.2, "logps/chosen": -505.6033528645833, "logps/rejected": -353.74755859375, "loss": 0.2606, "rewards/chosen": 0.6997981071472168, "rewards/margins": 2.2442170143127442, "rewards/rejected": -1.5444189071655274, "step": 2618 }, { "epoch": 0.13881748071979436, "grad_norm": 60.5, "kl": 0.7761459350585938, "learning_rate": 5e-07, "logits/chosen": -48067728.0, "logits/rejected": -22975330.666666668, "logps/chosen": -416.637744140625, "logps/rejected": -182.0928751627604, "loss": 0.3982, "rewards/chosen": 0.20178914070129395, "rewards/margins": 1.036916653315226, "rewards/rejected": -0.8351275126139323, "step": 2619 }, { "epoch": 0.1388704847215965, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22170244.0, "logits/rejected": -10662512.0, "logps/chosen": -172.436767578125, "logps/rejected": -267.1310628255208, "loss": 0.3584, "rewards/chosen": -0.08334656059741974, "rewards/margins": 0.8200780202945074, "rewards/rejected": -0.9034245808919271, "step": 2620 }, { "epoch": 0.13892348872339863, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8420789.333333334, "logits/rejected": -46697395.2, "logps/chosen": -42.94681294759115, "logps/rejected": -388.5824951171875, "loss": 0.2687, "rewards/chosen": -0.08762047688166301, "rewards/margins": 2.259440298875173, "rewards/rejected": -2.347060775756836, "step": 2621 }, { "epoch": 0.13897649272520074, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -251850.2, "logits/rejected": -44514970.666666664, "logps/chosen": -147.3945556640625, "logps/rejected": -399.8695475260417, "loss": 0.3486, "rewards/chosen": -0.02877315878868103, "rewards/margins": 2.3646534184614816, "rewards/rejected": -2.3934265772501626, "step": 2622 }, { "epoch": 0.13902949672700288, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23817448.0, "logits/rejected": -27541242.0, "logps/chosen": -271.2724609375, "logps/rejected": -362.12640380859375, "loss": 0.2586, "rewards/chosen": 0.5149120092391968, "rewards/margins": 2.657730221748352, "rewards/rejected": -2.1428182125091553, "step": 2623 }, { "epoch": 0.13908250072880501, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21557636.0, "logits/rejected": -137939296.0, "logps/chosen": -310.0908610026042, "logps/rejected": -419.4931945800781, "loss": 0.4025, "rewards/chosen": 0.021243383487065632, "rewards/margins": 1.9596422215302784, "rewards/rejected": -1.938398838043213, "step": 2624 }, { "epoch": 0.13913550473060715, "grad_norm": 54.75, "kl": 0.8122072219848633, "learning_rate": 5e-07, "logits/chosen": -31762508.8, "logits/rejected": -14634017.333333334, "logps/chosen": -250.2257568359375, "logps/rejected": -640.5594482421875, "loss": 0.3475, "rewards/chosen": 0.14131137132644653, "rewards/margins": 2.7771966099739074, "rewards/rejected": -2.635885238647461, "step": 2625 }, { "epoch": 0.1391885087324093, "grad_norm": 73.0, "kl": 0.7674503326416016, "learning_rate": 5e-07, "logits/chosen": -19575948.0, "logits/rejected": -11845698.4, "logps/chosen": -771.297607421875, "logps/rejected": -223.1663330078125, "loss": 0.3441, "rewards/chosen": 0.25398675600687665, "rewards/margins": 1.49245711962382, "rewards/rejected": -1.2384703636169434, "step": 2626 }, { "epoch": 0.13924151273421143, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4993386.5, "logits/rejected": -33761413.333333336, "logps/chosen": -328.9530334472656, "logps/rejected": -456.0060221354167, "loss": 0.2681, "rewards/chosen": -0.10730581730604172, "rewards/margins": 1.8059759413202603, "rewards/rejected": -1.913281758626302, "step": 2627 }, { "epoch": 0.13929451673601356, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12110227.0, "logits/rejected": -21338936.0, "logps/chosen": -70.9884033203125, "logps/rejected": -375.92816162109375, "loss": 0.3181, "rewards/chosen": 0.06443218886852264, "rewards/margins": 1.9198987931013107, "rewards/rejected": -1.855466604232788, "step": 2628 }, { "epoch": 0.1393475207378157, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -142944192.0, "logits/rejected": -13485308.8, "logps/chosen": -342.50048828125, "logps/rejected": -211.438427734375, "loss": 0.3155, "rewards/chosen": 0.3661275307337443, "rewards/margins": 1.5617618958155315, "rewards/rejected": -1.195634365081787, "step": 2629 }, { "epoch": 0.13940052473961784, "grad_norm": 78.0, "kl": 1.359283447265625, "learning_rate": 5e-07, "logits/chosen": 3814384.0, "logits/rejected": -38192904.0, "logps/chosen": -389.3350016276042, "logps/rejected": -342.2576904296875, "loss": 0.4272, "rewards/chosen": 0.10566882292429607, "rewards/margins": 2.676824371019999, "rewards/rejected": -2.571155548095703, "step": 2630 }, { "epoch": 0.13945352874141997, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25481340.0, "logits/rejected": 222534.0, "logps/chosen": -305.0205993652344, "logps/rejected": -443.3475646972656, "loss": 0.337, "rewards/chosen": -0.042228713631629944, "rewards/margins": 1.9049739688634872, "rewards/rejected": -1.9472026824951172, "step": 2631 }, { "epoch": 0.1395065327432221, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38695472.0, "logits/rejected": -69568992.0, "logps/chosen": -532.3779907226562, "logps/rejected": -556.3078002929688, "loss": 0.2778, "rewards/chosen": 0.3977649509906769, "rewards/margins": 2.6493652164936066, "rewards/rejected": -2.2516002655029297, "step": 2632 }, { "epoch": 0.13955953674502425, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28935386.666666668, "logits/rejected": -4855894.4, "logps/chosen": -421.8063151041667, "logps/rejected": -351.858837890625, "loss": 0.2708, "rewards/chosen": 0.40251465638478595, "rewards/margins": 2.064738472302755, "rewards/rejected": -1.6622238159179688, "step": 2633 }, { "epoch": 0.13961254074682639, "grad_norm": 101.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32628376.0, "logits/rejected": -29004006.0, "logps/chosen": -488.3248596191406, "logps/rejected": -293.89599609375, "loss": 0.2938, "rewards/chosen": 0.5590934753417969, "rewards/margins": 1.8786876201629639, "rewards/rejected": -1.319594144821167, "step": 2634 }, { "epoch": 0.13966554474862852, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7212776.0, "logits/rejected": -1283677.25, "logps/chosen": -359.70843505859375, "logps/rejected": -174.20484924316406, "loss": 0.4508, "rewards/chosen": -0.12854920327663422, "rewards/margins": 0.537719801068306, "rewards/rejected": -0.6662690043449402, "step": 2635 }, { "epoch": 0.13971854875043066, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20343288.0, "logits/rejected": -62612072.0, "logps/chosen": -445.53839111328125, "logps/rejected": -366.9229431152344, "loss": 0.328, "rewards/chosen": 0.23833274841308594, "rewards/margins": 1.6362462043762207, "rewards/rejected": -1.3979134559631348, "step": 2636 }, { "epoch": 0.1397715527522328, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15375152.0, "logits/rejected": -12975148.0, "logps/chosen": -200.8043975830078, "logps/rejected": -266.6230773925781, "loss": 0.3672, "rewards/chosen": -0.11195985972881317, "rewards/margins": 1.432992771267891, "rewards/rejected": -1.544952630996704, "step": 2637 }, { "epoch": 0.13982455675403493, "grad_norm": 50.5, "kl": 0.5761833190917969, "learning_rate": 5e-07, "logits/chosen": 24952435.2, "logits/rejected": 5041518.0, "logps/chosen": -166.268115234375, "logps/rejected": -150.72628784179688, "loss": 0.3548, "rewards/chosen": 0.31497209072113036, "rewards/margins": 1.5454135020573934, "rewards/rejected": -1.230441411336263, "step": 2638 }, { "epoch": 0.13987756075583707, "grad_norm": 96.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5287427.5, "logits/rejected": -66630432.0, "logps/chosen": -581.214111328125, "logps/rejected": -624.399169921875, "loss": 0.3488, "rewards/chosen": -0.0423925444483757, "rewards/margins": 2.368673510849476, "rewards/rejected": -2.4110660552978516, "step": 2639 }, { "epoch": 0.1399305647576392, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11409069.0, "logits/rejected": -38405328.0, "logps/chosen": -374.8304443359375, "logps/rejected": -406.46759033203125, "loss": 0.3675, "rewards/chosen": -0.0959220677614212, "rewards/margins": 1.6687835901975632, "rewards/rejected": -1.7647056579589844, "step": 2640 }, { "epoch": 0.13998356875944135, "grad_norm": 55.25, "kl": 1.1751937866210938, "learning_rate": 5e-07, "logits/chosen": -87339680.0, "logits/rejected": -14821382.4, "logps/chosen": -395.3509114583333, "logps/rejected": -245.640966796875, "loss": 0.3184, "rewards/chosen": -0.0843928058942159, "rewards/margins": 1.7724030772844952, "rewards/rejected": -1.856795883178711, "step": 2641 }, { "epoch": 0.14003657276124348, "grad_norm": 64.5, "kl": 0.0652618408203125, "learning_rate": 5e-07, "logits/chosen": -30458928.0, "logits/rejected": -13031133.0, "logps/chosen": -357.89385986328125, "logps/rejected": -203.65386962890625, "loss": 0.3844, "rewards/chosen": 0.03674779087305069, "rewards/margins": 1.0435166135430336, "rewards/rejected": -1.006768822669983, "step": 2642 }, { "epoch": 0.14008957676304562, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49288163.2, "logits/rejected": -17732513.333333332, "logps/chosen": -295.12734375, "logps/rejected": -306.7408854166667, "loss": 0.3784, "rewards/chosen": -0.13788559436798095, "rewards/margins": 1.9021973371505738, "rewards/rejected": -2.0400829315185547, "step": 2643 }, { "epoch": 0.14014258076484776, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6348532.666666667, "logits/rejected": -22003756.8, "logps/chosen": -220.8870849609375, "logps/rejected": -260.6330810546875, "loss": 0.3602, "rewards/chosen": -0.6954716046651205, "rewards/margins": 0.8754960695902506, "rewards/rejected": -1.570967674255371, "step": 2644 }, { "epoch": 0.1401955847666499, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16523403.2, "logits/rejected": -5377595.333333333, "logps/chosen": -238.5833251953125, "logps/rejected": -325.3735758463542, "loss": 0.3607, "rewards/chosen": 0.3724850654602051, "rewards/margins": 1.5207335789998373, "rewards/rejected": -1.148248513539632, "step": 2645 }, { "epoch": 0.14024858876845203, "grad_norm": 55.25, "kl": 2.075756072998047, "learning_rate": 5e-07, "logits/chosen": -76011770.66666667, "logits/rejected": -23550400.0, "logps/chosen": -667.0230305989584, "logps/rejected": -357.725927734375, "loss": 0.2873, "rewards/chosen": 0.8820515473683676, "rewards/margins": 2.741122515996297, "rewards/rejected": -1.8590709686279296, "step": 2646 }, { "epoch": 0.14030159277025414, "grad_norm": 54.0, "kl": 0.6822490692138672, "learning_rate": 5e-07, "logits/chosen": -64891251.2, "logits/rejected": -50104890.666666664, "logps/chosen": -321.2705810546875, "logps/rejected": -391.1611735026042, "loss": 0.3419, "rewards/chosen": 0.35399088859558103, "rewards/margins": 2.2356491247812906, "rewards/rejected": -1.8816582361857097, "step": 2647 }, { "epoch": 0.14035459677205628, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15987558.0, "logits/rejected": -39032818.666666664, "logps/chosen": -224.00392150878906, "logps/rejected": -289.8407796223958, "loss": 0.3281, "rewards/chosen": 0.34292182326316833, "rewards/margins": 1.2982905010382333, "rewards/rejected": -0.9553686777750651, "step": 2648 }, { "epoch": 0.14040760077385842, "grad_norm": 39.5, "kl": 0.034140586853027344, "learning_rate": 5e-07, "logits/chosen": -36998810.666666664, "logits/rejected": -18052057.6, "logps/chosen": -214.6822509765625, "logps/rejected": -576.277978515625, "loss": 0.2477, "rewards/chosen": -0.09835415085156758, "rewards/margins": 2.638817403713862, "rewards/rejected": -2.7371715545654296, "step": 2649 }, { "epoch": 0.14046060477566055, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17745216.0, "logits/rejected": -38313660.8, "logps/chosen": -193.55924479166666, "logps/rejected": -337.095166015625, "loss": 0.3322, "rewards/chosen": -0.0134010116259257, "rewards/margins": 1.3661935051282246, "rewards/rejected": -1.3795945167541503, "step": 2650 }, { "epoch": 0.1405136087774627, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40885644.0, "logits/rejected": -16045096.0, "logps/chosen": -284.0386962890625, "logps/rejected": -225.21360778808594, "loss": 0.3436, "rewards/chosen": -0.018358806148171425, "rewards/margins": 1.6578572001308203, "rewards/rejected": -1.6762160062789917, "step": 2651 }, { "epoch": 0.14056661277926483, "grad_norm": 83.0, "kl": 1.487508773803711, "learning_rate": 5e-07, "logits/chosen": -13790618.0, "logps/chosen": -318.8439025878906, "loss": 0.4519, "rewards/chosen": 0.3528605103492737, "step": 2652 }, { "epoch": 0.14061961678106696, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34982752.0, "logits/rejected": -99181568.0, "logps/chosen": -121.41274007161458, "logps/rejected": -550.584765625, "loss": 0.3607, "rewards/chosen": -0.47452545166015625, "rewards/margins": 1.1589797973632812, "rewards/rejected": -1.6335052490234374, "step": 2653 }, { "epoch": 0.1406726207828691, "grad_norm": 31.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17639521.333333332, "logits/rejected": -19170062.4, "logps/chosen": -196.57914225260416, "logps/rejected": -149.057421875, "loss": 0.2767, "rewards/chosen": -0.003698869297901789, "rewards/margins": 2.068386320521434, "rewards/rejected": -2.0720851898193358, "step": 2654 }, { "epoch": 0.14072562478467124, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62976888.0, "logits/rejected": -34490556.0, "logps/chosen": -770.025634765625, "logps/rejected": -446.6401062011719, "loss": 0.2656, "rewards/chosen": 0.6117180585861206, "rewards/margins": 2.695020079612732, "rewards/rejected": -2.0833020210266113, "step": 2655 }, { "epoch": 0.14077862878647338, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -319896.3125, "logits/rejected": -22759680.0, "logps/chosen": -216.2572784423828, "logps/rejected": -433.2777913411458, "loss": 0.2359, "rewards/chosen": -0.23624345660209656, "rewards/margins": 1.7367141544818878, "rewards/rejected": -1.9729576110839844, "step": 2656 }, { "epoch": 0.1408316327882755, "grad_norm": 57.75, "kl": 0.20007896423339844, "learning_rate": 5e-07, "logits/chosen": -36377532.8, "logits/rejected": -8393791.333333334, "logps/chosen": -256.228271484375, "logps/rejected": -155.20465087890625, "loss": 0.4477, "rewards/chosen": -0.10352489948272706, "rewards/margins": 0.737319572766622, "rewards/rejected": -0.840844472249349, "step": 2657 }, { "epoch": 0.14088463679007765, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44201408.0, "logits/rejected": -28089610.0, "logps/chosen": -301.24393136160717, "logps/rejected": -122.4972915649414, "loss": 0.4521, "rewards/chosen": 0.0946899311883109, "rewards/margins": 1.0575576680047172, "rewards/rejected": -0.9628677368164062, "step": 2658 }, { "epoch": 0.1409376407918798, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48635072.0, "logits/rejected": -22099702.0, "logps/chosen": -287.17181396484375, "logps/rejected": -271.340576171875, "loss": 0.3694, "rewards/chosen": -0.17023172974586487, "rewards/margins": 1.3774612843990326, "rewards/rejected": -1.5476930141448975, "step": 2659 }, { "epoch": 0.14099064479368192, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40332324.0, "logits/rejected": -37519884.0, "logps/chosen": -313.05792236328125, "logps/rejected": -409.3131408691406, "loss": 0.3208, "rewards/chosen": 0.17016106843948364, "rewards/margins": 1.8392409682273865, "rewards/rejected": -1.6690798997879028, "step": 2660 }, { "epoch": 0.14104364879548406, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48124362.666666664, "logits/rejected": -18920803.2, "logps/chosen": -306.6064046223958, "logps/rejected": -274.5810302734375, "loss": 0.3014, "rewards/chosen": -0.05611266692479452, "rewards/margins": 1.5170510331789653, "rewards/rejected": -1.5731637001037597, "step": 2661 }, { "epoch": 0.1410966527972862, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29435132.0, "logits/rejected": -9741198.0, "logps/chosen": -448.5484619140625, "logps/rejected": -548.7940673828125, "loss": 0.2982, "rewards/chosen": 0.37656137347221375, "rewards/margins": 2.8713147342205048, "rewards/rejected": -2.494753360748291, "step": 2662 }, { "epoch": 0.14114965679908834, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29045973.333333332, "logits/rejected": 113082150.4, "logps/chosen": -228.77229817708334, "logps/rejected": -207.9052978515625, "loss": 0.3247, "rewards/chosen": 0.1005880335966746, "rewards/margins": 1.3870462397734324, "rewards/rejected": -1.2864582061767578, "step": 2663 }, { "epoch": 0.14120266080089047, "grad_norm": 69.0, "kl": 0.16603851318359375, "learning_rate": 5e-07, "logits/chosen": -76897184.0, "logits/rejected": -8296939.5, "logps/chosen": -647.7779541015625, "logps/rejected": -161.14295959472656, "loss": 0.3412, "rewards/chosen": -0.15575942397117615, "rewards/margins": 2.0897120535373688, "rewards/rejected": -2.245471477508545, "step": 2664 }, { "epoch": 0.1412556648026926, "grad_norm": 47.5, "kl": 1.0141525268554688, "learning_rate": 5e-07, "logits/chosen": -17416748.8, "logits/rejected": -29568805.333333332, "logps/chosen": -202.62257080078126, "logps/rejected": -523.9384765625, "loss": 0.3251, "rewards/chosen": 0.37723190784454347, "rewards/margins": 2.7192055304845177, "rewards/rejected": -2.341973622639974, "step": 2665 }, { "epoch": 0.14130866880449475, "grad_norm": 53.0, "kl": 0.41028690338134766, "learning_rate": 5e-07, "logits/chosen": -27382682.666666668, "logits/rejected": -24312828.0, "logps/chosen": -255.98004150390625, "logps/rejected": -204.16151428222656, "loss": 0.407, "rewards/chosen": -0.020163943370183308, "rewards/margins": 2.618485043446223, "rewards/rejected": -2.6386489868164062, "step": 2666 }, { "epoch": 0.14136167280629688, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35340728.0, "logits/rejected": -25719189.333333332, "logps/chosen": -213.40838623046875, "logps/rejected": -387.1000162760417, "loss": 0.3024, "rewards/chosen": -0.5482690930366516, "rewards/margins": 1.0073221723238628, "rewards/rejected": -1.5555912653605144, "step": 2667 }, { "epoch": 0.14141467680809902, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1156888.75, "logits/rejected": -14634726.857142856, "logps/chosen": -59.52370834350586, "logps/rejected": -274.32631138392856, "loss": 0.2693, "rewards/chosen": -0.21303825080394745, "rewards/margins": 1.2913866617849894, "rewards/rejected": -1.5044249125889368, "step": 2668 }, { "epoch": 0.14146768080990116, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5651612.5, "logits/rejected": -10241069.0, "logps/chosen": -267.5093078613281, "logps/rejected": -200.65396118164062, "loss": 0.3544, "rewards/chosen": 0.2666122019290924, "rewards/margins": 1.406222015619278, "rewards/rejected": -1.1396098136901855, "step": 2669 }, { "epoch": 0.1415206848117033, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80090181.33333333, "logits/rejected": -33289804.8, "logps/chosen": -443.1313883463542, "logps/rejected": -459.340234375, "loss": 0.2535, "rewards/chosen": 0.2841786543528239, "rewards/margins": 2.061269680658976, "rewards/rejected": -1.7770910263061523, "step": 2670 }, { "epoch": 0.14157368881350543, "grad_norm": 51.25, "kl": 0.5759897232055664, "learning_rate": 5e-07, "logits/chosen": -22430912.0, "logits/rejected": -15880114.666666666, "logps/chosen": -280.119091796875, "logps/rejected": -240.2873331705729, "loss": 0.415, "rewards/chosen": -0.15675941705703736, "rewards/margins": 1.4059308409690856, "rewards/rejected": -1.562690258026123, "step": 2671 }, { "epoch": 0.14162669281530754, "grad_norm": 52.5, "kl": 0.6311569213867188, "learning_rate": 5e-07, "logits/chosen": -42690120.0, "logits/rejected": -58063580.0, "logps/chosen": -330.532470703125, "logps/rejected": -368.71856689453125, "loss": 0.3307, "rewards/chosen": 0.4953720271587372, "rewards/margins": 1.673319548368454, "rewards/rejected": -1.1779475212097168, "step": 2672 }, { "epoch": 0.14167969681710968, "grad_norm": 97.5, "kl": 3.89874267578125, "learning_rate": 5e-07, "logits/chosen": -11561339.42857143, "logits/rejected": -7257137.0, "logps/chosen": -760.4949776785714, "logps/rejected": -146.45541381835938, "loss": 0.3756, "rewards/chosen": 0.8392646653311593, "rewards/margins": 2.3830238921301703, "rewards/rejected": -1.5437592267990112, "step": 2673 }, { "epoch": 0.14173270081891182, "grad_norm": 57.5, "kl": 1.390207290649414, "learning_rate": 5e-07, "logits/chosen": -38865445.333333336, "logits/rejected": -38077372.8, "logps/chosen": -1142.6359049479167, "logps/rejected": -317.219482421875, "loss": 0.2195, "rewards/chosen": 0.787872314453125, "rewards/margins": 2.7134334564208986, "rewards/rejected": -1.9255611419677734, "step": 2674 }, { "epoch": 0.14178570482071395, "grad_norm": 62.5, "kl": 0.8536529541015625, "learning_rate": 5e-07, "logits/chosen": -8936342.4, "logits/rejected": -40494352.0, "logps/chosen": -318.986767578125, "logps/rejected": -280.5575764973958, "loss": 0.3296, "rewards/chosen": 0.6711710929870606, "rewards/margins": 1.7534038861592611, "rewards/rejected": -1.0822327931722004, "step": 2675 }, { "epoch": 0.1418387088225161, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9210673.6, "logits/rejected": -18556256.0, "logps/chosen": -430.63212890625, "logps/rejected": -359.1156412760417, "loss": 0.4245, "rewards/chosen": 0.07212952375411988, "rewards/margins": 0.9174787561098734, "rewards/rejected": -0.8453492323557535, "step": 2676 }, { "epoch": 0.14189171282431823, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29550878.0, "logits/rejected": -30300356.0, "logps/chosen": -183.1302032470703, "logps/rejected": -306.6636962890625, "loss": 0.2855, "rewards/chosen": 0.15364141762256622, "rewards/margins": 2.3257474452257156, "rewards/rejected": -2.1721060276031494, "step": 2677 }, { "epoch": 0.14194471682612037, "grad_norm": 68.5, "kl": 1.1580238342285156, "learning_rate": 5e-07, "logits/chosen": -42457592.0, "logits/rejected": -7467386.0, "logps/chosen": -562.2784423828125, "logps/rejected": -103.63380432128906, "loss": 0.382, "rewards/chosen": 0.41754037141799927, "rewards/margins": 1.3646137714385986, "rewards/rejected": -0.9470734000205994, "step": 2678 }, { "epoch": 0.1419977208279225, "grad_norm": 60.0, "kl": 1.9469165802001953, "learning_rate": 5e-07, "logits/chosen": -15500886.666666666, "logits/rejected": -29947128.0, "logps/chosen": -514.5365397135416, "logps/rejected": -386.91802978515625, "loss": 0.378, "rewards/chosen": 0.4957133928934733, "rewards/margins": 2.2870848576227822, "rewards/rejected": -1.791371464729309, "step": 2679 }, { "epoch": 0.14205072482972464, "grad_norm": 53.25, "kl": 0.63751220703125, "learning_rate": 5e-07, "logits/chosen": -32338217.6, "logits/rejected": -50102325.333333336, "logps/chosen": -276.777197265625, "logps/rejected": -300.73480224609375, "loss": 0.3645, "rewards/chosen": 0.10856598615646362, "rewards/margins": 2.229105015595754, "rewards/rejected": -2.1205390294392905, "step": 2680 }, { "epoch": 0.14210372883152678, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52648757.333333336, "logits/rejected": -9369921.6, "logps/chosen": -376.5845540364583, "logps/rejected": -243.351513671875, "loss": 0.3208, "rewards/chosen": -0.4832019805908203, "rewards/margins": 1.837539482116699, "rewards/rejected": -2.3207414627075194, "step": 2681 }, { "epoch": 0.1421567328333289, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9640292.0, "logits/rejected": -43341964.0, "logps/chosen": -357.4453125, "logps/rejected": -289.56573486328125, "loss": 0.2376, "rewards/chosen": 0.817363977432251, "rewards/margins": 2.6621938943862915, "rewards/rejected": -1.8448299169540405, "step": 2682 }, { "epoch": 0.14220973683513105, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2975579.0, "logits/rejected": -20872500.0, "logps/chosen": -449.67486572265625, "logps/rejected": -253.85458374023438, "loss": 0.3006, "rewards/chosen": 0.5348644256591797, "rewards/margins": 1.929251790046692, "rewards/rejected": -1.3943873643875122, "step": 2683 }, { "epoch": 0.1422627408369332, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 33064400.0, "logits/rejected": -15684028.0, "logps/chosen": -236.50860595703125, "logps/rejected": -403.94525146484375, "loss": 0.319, "rewards/chosen": 0.05788564682006836, "rewards/margins": 1.8761157989501953, "rewards/rejected": -1.818230152130127, "step": 2684 }, { "epoch": 0.14231574483873533, "grad_norm": 72.5, "kl": 1.1428451538085938, "learning_rate": 5e-07, "logits/chosen": -15556384.0, "logits/rejected": 4977217.5, "logps/chosen": -521.9840959821429, "logps/rejected": -11.963441848754883, "loss": 0.4042, "rewards/chosen": 0.4698014940534319, "rewards/margins": 0.44138981880886213, "rewards/rejected": 0.02841167524456978, "step": 2685 }, { "epoch": 0.14236874884053746, "grad_norm": 66.0, "kl": 1.2487506866455078, "learning_rate": 5e-07, "logits/chosen": -43788152.0, "logits/rejected": -6419796.5, "logps/chosen": -863.5696411132812, "logps/rejected": -209.72067260742188, "loss": 0.1907, "rewards/chosen": 1.0875306129455566, "rewards/margins": 3.1876630783081055, "rewards/rejected": -2.100132465362549, "step": 2686 }, { "epoch": 0.1424217528423396, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23931796.0, "logits/rejected": -23313660.0, "logps/chosen": -445.9101867675781, "logps/rejected": -177.86459350585938, "loss": 0.3331, "rewards/chosen": 0.1731712371110916, "rewards/margins": 1.5206062346696854, "rewards/rejected": -1.3474349975585938, "step": 2687 }, { "epoch": 0.14247475684414174, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48234453.333333336, "logits/rejected": -38681360.0, "logps/chosen": -436.8438313802083, "logps/rejected": -314.2471435546875, "loss": 0.3598, "rewards/chosen": -0.4209981362024943, "rewards/margins": 1.01505761941274, "rewards/rejected": -1.4360557556152345, "step": 2688 }, { "epoch": 0.14252776084594387, "grad_norm": 47.5, "kl": 0.2866649627685547, "learning_rate": 5e-07, "logits/chosen": -18686070.666666668, "logits/rejected": 3932539.2, "logps/chosen": -371.1309814453125, "logps/rejected": -187.25474853515624, "loss": 0.3547, "rewards/chosen": 0.1618118385473887, "rewards/margins": 1.1361794571081798, "rewards/rejected": -0.974367618560791, "step": 2689 }, { "epoch": 0.142580764847746, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2020724.0, "logits/rejected": -19798932.8, "logps/chosen": -51.43145243326823, "logps/rejected": -528.20791015625, "loss": 0.2755, "rewards/chosen": 0.5268232822418213, "rewards/margins": 2.051965570449829, "rewards/rejected": -1.525142288208008, "step": 2690 }, { "epoch": 0.14263376884954815, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27667518.0, "logits/rejected": -42691194.666666664, "logps/chosen": -187.68325805664062, "logps/rejected": -168.5477294921875, "loss": 0.2803, "rewards/chosen": -0.16220055520534515, "rewards/margins": 1.3151231358448665, "rewards/rejected": -1.4773236910502117, "step": 2691 }, { "epoch": 0.14268677285135029, "grad_norm": 69.5, "kl": 1.6339874267578125, "learning_rate": 5e-07, "logits/chosen": -63338677.333333336, "logits/rejected": -23099750.0, "logps/chosen": -507.4488118489583, "logps/rejected": -287.6334228515625, "loss": 0.4412, "rewards/chosen": 0.15854721268018088, "rewards/margins": 1.8457946081956227, "rewards/rejected": -1.687247395515442, "step": 2692 }, { "epoch": 0.14273977685315242, "grad_norm": 37.0, "kl": 0.5561943054199219, "learning_rate": 5e-07, "logits/chosen": -451558.4, "logits/rejected": -18182989.333333332, "logps/chosen": -101.09578247070313, "logps/rejected": -317.10373942057294, "loss": 0.4957, "rewards/chosen": -0.4610160827636719, "rewards/margins": 0.44839653968811033, "rewards/rejected": -0.9094126224517822, "step": 2693 }, { "epoch": 0.14279278085495456, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65206356.0, "logits/rejected": -30369328.0, "logps/chosen": -380.910400390625, "logps/rejected": -183.3805694580078, "loss": 0.393, "rewards/chosen": -0.22292709350585938, "rewards/margins": 0.983879804611206, "rewards/rejected": -1.2068068981170654, "step": 2694 }, { "epoch": 0.1428457848567567, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3331438.0, "logits/rejected": -18916280.0, "logps/chosen": -203.72186279296875, "logps/rejected": -135.0154266357422, "loss": 0.3637, "rewards/chosen": -0.15015679597854614, "rewards/margins": 1.4873515963554382, "rewards/rejected": -1.6375083923339844, "step": 2695 }, { "epoch": 0.14289878885855883, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31384970.666666668, "logits/rejected": -23596748.8, "logps/chosen": -48.0574951171875, "logps/rejected": -283.565185546875, "loss": 0.3076, "rewards/chosen": -0.18239416678746542, "rewards/margins": 1.6102697889010111, "rewards/rejected": -1.7926639556884765, "step": 2696 }, { "epoch": 0.14295179286036097, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52779664.0, "logits/rejected": -4487023.2, "logps/chosen": -210.7376708984375, "logps/rejected": -214.63994140625, "loss": 0.3127, "rewards/chosen": 0.05899543563524882, "rewards/margins": 1.439418218533198, "rewards/rejected": -1.3804227828979492, "step": 2697 }, { "epoch": 0.14300479686216308, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21087134.666666668, "logits/rejected": -1189754.4, "logps/chosen": -331.3410237630208, "logps/rejected": -374.385791015625, "loss": 0.2875, "rewards/chosen": -0.34393489360809326, "rewards/margins": 1.9758980989456179, "rewards/rejected": -2.319832992553711, "step": 2698 }, { "epoch": 0.14305780086396522, "grad_norm": 52.5, "kl": 0.10612106323242188, "learning_rate": 5e-07, "logits/chosen": -8678358.0, "logits/rejected": -36111732.0, "logps/chosen": -201.02565002441406, "logps/rejected": -254.4510498046875, "loss": 0.3227, "rewards/chosen": 0.46219927072525024, "rewards/margins": 1.5579589009284973, "rewards/rejected": -1.095759630203247, "step": 2699 }, { "epoch": 0.14311080486576735, "grad_norm": 64.5, "kl": 1.4934425354003906, "learning_rate": 5e-07, "logits/chosen": -16952035.2, "logits/rejected": -49663114.666666664, "logps/chosen": -306.4433837890625, "logps/rejected": -387.64404296875, "loss": 0.3983, "rewards/chosen": 0.22306511402130128, "rewards/margins": 1.6540151198705038, "rewards/rejected": -1.4309500058492024, "step": 2700 }, { "epoch": 0.1431638088675695, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17885942.666666668, "logits/rejected": -22931542.4, "logps/chosen": -134.2188517252604, "logps/rejected": -300.152783203125, "loss": 0.3213, "rewards/chosen": -0.13932100931803384, "rewards/margins": 1.365800889333089, "rewards/rejected": -1.505121898651123, "step": 2701 }, { "epoch": 0.14321681286937163, "grad_norm": 48.75, "kl": 1.9596405029296875, "learning_rate": 5e-07, "logits/chosen": -90656216.0, "logits/rejected": -17954958.0, "logps/chosen": -387.5498046875, "logps/rejected": -302.8869934082031, "loss": 0.3507, "rewards/chosen": 0.35947269201278687, "rewards/margins": 2.296430289745331, "rewards/rejected": -1.936957597732544, "step": 2702 }, { "epoch": 0.14326981687117377, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13866757.333333334, "logits/rejected": -14614763.2, "logps/chosen": -287.0542805989583, "logps/rejected": -289.753271484375, "loss": 0.3205, "rewards/chosen": 0.08907897273699443, "rewards/margins": 1.8912977914015452, "rewards/rejected": -1.8022188186645507, "step": 2703 }, { "epoch": 0.1433228208729759, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51976597.333333336, "logits/rejected": -38866144.0, "logps/chosen": -243.33536783854166, "logps/rejected": -268.6007080078125, "loss": 0.3141, "rewards/chosen": 0.026689847310384113, "rewards/margins": 1.9693696339925129, "rewards/rejected": -1.9426797866821288, "step": 2704 }, { "epoch": 0.14337582487477804, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11890656.0, "logits/rejected": -20994216.0, "logps/chosen": -219.6488800048828, "logps/rejected": -361.9195556640625, "loss": 0.3196, "rewards/chosen": 0.27356255054473877, "rewards/margins": 1.800083041191101, "rewards/rejected": -1.5265204906463623, "step": 2705 }, { "epoch": 0.14342882887658018, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33743992.0, "logits/rejected": -2727687.0, "logps/chosen": -343.7325439453125, "logps/rejected": -279.3958740234375, "loss": 0.3281, "rewards/chosen": 0.3069412112236023, "rewards/margins": 1.7312714457511902, "rewards/rejected": -1.424330234527588, "step": 2706 }, { "epoch": 0.14348183287838231, "grad_norm": 45.25, "kl": 0.23846054077148438, "learning_rate": 5e-07, "logits/chosen": -27207308.8, "logits/rejected": 22975568.0, "logps/chosen": -214.84541015625, "logps/rejected": -242.30533854166666, "loss": 0.3756, "rewards/chosen": 0.12009141445159913, "rewards/margins": 1.6025581121444703, "rewards/rejected": -1.482466697692871, "step": 2707 }, { "epoch": 0.14353483688018445, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30945250.0, "logits/rejected": -28841856.0, "logps/chosen": -236.62167358398438, "logps/rejected": -418.34735107421875, "loss": 0.2704, "rewards/chosen": 0.28864404559135437, "rewards/margins": 2.4840087592601776, "rewards/rejected": -2.1953647136688232, "step": 2708 }, { "epoch": 0.1435878408819866, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50161260.0, "logits/rejected": -25048580.0, "logps/chosen": -160.9468231201172, "logps/rejected": -317.4239807128906, "loss": 0.3472, "rewards/chosen": 0.18294677138328552, "rewards/margins": 1.4960915744304657, "rewards/rejected": -1.3131448030471802, "step": 2709 }, { "epoch": 0.14364084488378873, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -34172992.0, "logps/rejected": -245.45352172851562, "loss": 0.2502, "rewards/rejected": -1.2064110040664673, "step": 2710 }, { "epoch": 0.14369384888559086, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3676535.5, "logits/rejected": -13931084.0, "logps/chosen": -197.29318237304688, "logps/rejected": -213.88633728027344, "loss": 0.3679, "rewards/chosen": -0.10525237023830414, "rewards/margins": 1.2752078920602798, "rewards/rejected": -1.380460262298584, "step": 2711 }, { "epoch": 0.143746852887393, "grad_norm": 60.25, "kl": 0.8386039733886719, "learning_rate": 5e-07, "logits/chosen": -60510565.333333336, "logits/rejected": -11321056.0, "logps/chosen": -343.066162109375, "logps/rejected": -209.392626953125, "loss": 0.3146, "rewards/chosen": 0.21202951669692993, "rewards/margins": 1.8552225708961487, "rewards/rejected": -1.6431930541992188, "step": 2712 }, { "epoch": 0.14379985688919514, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49781842.28571428, "logits/rejected": -60226632.0, "logps/chosen": -463.7879115513393, "logps/rejected": -290.830810546875, "loss": 0.4205, "rewards/chosen": 0.1931342056819371, "rewards/margins": 1.6982459000178747, "rewards/rejected": -1.5051116943359375, "step": 2713 }, { "epoch": 0.14385286089099728, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1703757.25, "logits/rejected": -2703384.25, "logps/chosen": -139.39971923828125, "logps/rejected": -160.69920349121094, "loss": 0.4322, "rewards/chosen": -0.1090865507721901, "rewards/margins": 0.6159631833434105, "rewards/rejected": -0.7250497341156006, "step": 2714 }, { "epoch": 0.1439058648927994, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20045520.0, "logits/rejected": -21528044.0, "logps/chosen": -323.1366271972656, "logps/rejected": -433.22235107421875, "loss": 0.2585, "rewards/chosen": 0.6495859026908875, "rewards/margins": 2.6682812571525574, "rewards/rejected": -2.01869535446167, "step": 2715 }, { "epoch": 0.14395886889460155, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21537565.333333332, "logits/rejected": 3936781.5, "logps/chosen": -127.47303263346355, "logps/rejected": -102.65657043457031, "loss": 0.3828, "rewards/chosen": 0.3902723391850789, "rewards/margins": 1.2341667612393696, "rewards/rejected": -0.8438944220542908, "step": 2716 }, { "epoch": 0.1440118728964037, "grad_norm": 60.75, "kl": 0.7622032165527344, "learning_rate": 5e-07, "logits/chosen": -41693958.4, "logits/rejected": -120236917.33333333, "logps/chosen": -347.9060791015625, "logps/rejected": -201.79984537760416, "loss": 0.3296, "rewards/chosen": 0.5734265327453614, "rewards/margins": 2.027305094401042, "rewards/rejected": -1.4538785616556804, "step": 2717 }, { "epoch": 0.14406487689820582, "grad_norm": 53.25, "kl": 0.43303871154785156, "learning_rate": 5e-07, "logits/chosen": -45478204.8, "logits/rejected": -24839717.333333332, "logps/chosen": -158.447802734375, "logps/rejected": -223.7756144205729, "loss": 0.3362, "rewards/chosen": 0.2830382823944092, "rewards/margins": 2.2844703833262123, "rewards/rejected": -2.0014321009318032, "step": 2718 }, { "epoch": 0.14411788090000796, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38728812.0, "logits/rejected": -34635900.0, "logps/chosen": -205.822998046875, "logps/rejected": -365.70745849609375, "loss": 0.4082, "rewards/chosen": -0.31516677141189575, "rewards/margins": 1.1875516772270203, "rewards/rejected": -1.502718448638916, "step": 2719 }, { "epoch": 0.1441708849018101, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35759474.666666664, "logits/rejected": -22130462.4, "logps/chosen": -375.8653564453125, "logps/rejected": -266.5471435546875, "loss": 0.2173, "rewards/chosen": 0.6009857654571533, "rewards/margins": 2.9470951557159424, "rewards/rejected": -2.346109390258789, "step": 2720 }, { "epoch": 0.14422388890361224, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44254368.0, "logits/rejected": -24959378.0, "logps/chosen": -384.4476623535156, "logps/rejected": -272.37554931640625, "loss": 0.3355, "rewards/chosen": -0.09009819477796555, "rewards/margins": 1.6825642064213753, "rewards/rejected": -1.7726624011993408, "step": 2721 }, { "epoch": 0.14427689290541437, "grad_norm": 41.75, "kl": 0.410125732421875, "learning_rate": 5e-07, "logits/chosen": -38926344.0, "logits/rejected": -35489776.0, "logps/chosen": -282.599853515625, "logps/rejected": -518.74658203125, "loss": 0.3152, "rewards/chosen": -0.22737465798854828, "rewards/margins": 2.8086232095956802, "rewards/rejected": -3.0359978675842285, "step": 2722 }, { "epoch": 0.14432989690721648, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90568768.0, "logits/rejected": -19635172.57142857, "logps/chosen": -465.9113464355469, "logps/rejected": -210.38828822544642, "loss": 0.2187, "rewards/chosen": 0.14679871499538422, "rewards/margins": 1.7914942694561822, "rewards/rejected": -1.644695554460798, "step": 2723 }, { "epoch": 0.14438290090901862, "grad_norm": 63.0, "kl": 2.0213375091552734, "learning_rate": 5e-07, "logits/chosen": -3857709.25, "logits/rejected": -32469222.0, "logps/chosen": -673.060302734375, "logps/rejected": -174.30323791503906, "loss": 0.3265, "rewards/chosen": 0.8102750778198242, "rewards/margins": 2.1415865421295166, "rewards/rejected": -1.3313114643096924, "step": 2724 }, { "epoch": 0.14443590491082076, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14608824.0, "logits/rejected": -5811132.0, "logps/chosen": -248.40594482421875, "logps/rejected": -269.7520263671875, "loss": 0.2384, "rewards/chosen": 0.34752921263376874, "rewards/margins": 2.4648986736933387, "rewards/rejected": -2.11736946105957, "step": 2725 }, { "epoch": 0.1444889089126229, "grad_norm": 66.0, "kl": 0.5518951416015625, "learning_rate": 5e-07, "logits/chosen": -33432072.0, "logits/rejected": -12169225.0, "logps/chosen": -644.3900146484375, "logps/rejected": -168.3512725830078, "loss": 0.3401, "rewards/chosen": 0.3727302551269531, "rewards/margins": 1.578238606452942, "rewards/rejected": -1.2055083513259888, "step": 2726 }, { "epoch": 0.14454191291442503, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28698749.333333332, "logits/rejected": -27485190.4, "logps/chosen": -210.32965087890625, "logps/rejected": -286.496484375, "loss": 0.3099, "rewards/chosen": -0.015663782755533855, "rewards/margins": 1.684430440266927, "rewards/rejected": -1.700094223022461, "step": 2727 }, { "epoch": 0.14459491691622717, "grad_norm": 56.25, "kl": 1.0484867095947266, "learning_rate": 5e-07, "logits/chosen": -10307983.0, "logits/rejected": -29402946.0, "logps/chosen": -234.2652130126953, "logps/rejected": -294.3296813964844, "loss": 0.4004, "rewards/chosen": 0.2341713160276413, "rewards/margins": 0.98223577439785, "rewards/rejected": -0.7480644583702087, "step": 2728 }, { "epoch": 0.1446479209180293, "grad_norm": 72.0, "kl": 0.4226799011230469, "learning_rate": 5e-07, "logits/chosen": -9348778.666666666, "logits/rejected": -5664362.0, "logps/chosen": -331.6291097005208, "logps/rejected": -182.09519958496094, "loss": 0.4988, "rewards/chosen": -0.08548243840535481, "rewards/margins": 0.3780197600523631, "rewards/rejected": -0.4635021984577179, "step": 2729 }, { "epoch": 0.14470092491983144, "grad_norm": 50.25, "kl": 0.3491249084472656, "learning_rate": 5e-07, "logits/chosen": -10255480.0, "logits/rejected": 9382983.333333334, "logps/chosen": -241.4712890625, "logps/rejected": -83.81669616699219, "loss": 0.365, "rewards/chosen": 0.20980219841003417, "rewards/margins": 1.643774398167928, "rewards/rejected": -1.4339721997578938, "step": 2730 }, { "epoch": 0.14475392892163358, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11515444.0, "logits/rejected": -29916746.0, "logps/chosen": -522.4481811523438, "logps/rejected": -359.06884765625, "loss": 0.2786, "rewards/chosen": 0.54669189453125, "rewards/margins": 2.3472820520401, "rewards/rejected": -1.80059015750885, "step": 2731 }, { "epoch": 0.14480693292343572, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 812859.25, "logits/rejected": -32140248.0, "logps/chosen": -209.32803344726562, "logps/rejected": -400.7451171875, "loss": 0.2171, "rewards/chosen": 0.9489479064941406, "rewards/margins": 2.4999953905741377, "rewards/rejected": -1.5510474840799968, "step": 2732 }, { "epoch": 0.14485993692523785, "grad_norm": 24.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67312.83333333333, "logits/rejected": -4279914.4, "logps/chosen": -43.60767618815104, "logps/rejected": -156.082666015625, "loss": 0.3377, "rewards/chosen": -0.39134565989176434, "rewards/margins": 1.589126141866048, "rewards/rejected": -1.9804718017578125, "step": 2733 }, { "epoch": 0.14491294092704, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15676886.0, "logits/rejected": -41173122.666666664, "logps/chosen": -463.4549865722656, "logps/rejected": -374.6334635416667, "loss": 0.1881, "rewards/chosen": 0.3939681947231293, "rewards/margins": 2.650078227122625, "rewards/rejected": -2.2561100323994956, "step": 2734 }, { "epoch": 0.14496594492884213, "grad_norm": 61.5, "kl": 2.6734771728515625, "learning_rate": 5e-07, "logits/chosen": -20786940.8, "logits/rejected": -30646168.0, "logps/chosen": -738.597900390625, "logps/rejected": -294.499267578125, "loss": 0.3378, "rewards/chosen": 0.8258233070373535, "rewards/margins": 2.4122910499572754, "rewards/rejected": -1.5864677429199219, "step": 2735 }, { "epoch": 0.14501894893064426, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26742316.0, "logits/rejected": -33383138.666666668, "logps/chosen": -117.38612365722656, "logps/rejected": -334.21608479817706, "loss": 0.2377, "rewards/chosen": 0.1331520974636078, "rewards/margins": 2.0219845076402025, "rewards/rejected": -1.888832410176595, "step": 2736 }, { "epoch": 0.1450719529324464, "grad_norm": 49.5, "kl": 1.1227149963378906, "learning_rate": 5e-07, "logits/chosen": -14743614.666666666, "logits/rejected": -23823907.2, "logps/chosen": -200.5406290690104, "logps/rejected": -146.6803955078125, "loss": 0.3751, "rewards/chosen": 0.13205604751904806, "rewards/margins": 1.2824218769868214, "rewards/rejected": -1.1503658294677734, "step": 2737 }, { "epoch": 0.14512495693424854, "grad_norm": 43.75, "kl": 0.29135608673095703, "learning_rate": 5e-07, "logits/chosen": -27486470.4, "logits/rejected": -42085405.333333336, "logps/chosen": -219.5029296875, "logps/rejected": -398.7637532552083, "loss": 0.3329, "rewards/chosen": 0.04095405340194702, "rewards/margins": 3.0195173621177673, "rewards/rejected": -2.9785633087158203, "step": 2738 }, { "epoch": 0.14517796093605068, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35055296.0, "logits/rejected": -7432925.0, "logps/chosen": -354.77496337890625, "logps/rejected": -164.38796997070312, "loss": 0.3285, "rewards/chosen": 0.24735765159130096, "rewards/margins": 1.875363513827324, "rewards/rejected": -1.628005862236023, "step": 2739 }, { "epoch": 0.1452309649378528, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26926828.8, "logits/rejected": -48723690.666666664, "logps/chosen": -176.63599853515626, "logps/rejected": -319.8985188802083, "loss": 0.3639, "rewards/chosen": 0.0001826554536819458, "rewards/margins": 1.8979122112194697, "rewards/rejected": -1.8977295557657878, "step": 2740 }, { "epoch": 0.14528396893965495, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82463768.0, "logits/rejected": -19604555.42857143, "logps/chosen": -508.4445495605469, "logps/rejected": -250.00233677455358, "loss": 0.214, "rewards/chosen": -0.41790771484375, "rewards/margins": 1.4317332676478796, "rewards/rejected": -1.8496409824916296, "step": 2741 }, { "epoch": 0.1453369729414571, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58619061.333333336, "logits/rejected": -16122496.0, "logps/chosen": -252.8748575846354, "logps/rejected": -187.8857421875, "loss": 0.4039, "rewards/chosen": 0.2100704312324524, "rewards/margins": 1.2912935614585876, "rewards/rejected": -1.0812231302261353, "step": 2742 }, { "epoch": 0.14538997694325922, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25550781.333333332, "logits/rejected": -13112206.4, "logps/chosen": -497.9029541015625, "logps/rejected": -351.1101318359375, "loss": 0.261, "rewards/chosen": 0.5696678956349691, "rewards/margins": 2.2160942872365315, "rewards/rejected": -1.6464263916015625, "step": 2743 }, { "epoch": 0.14544298094506136, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4338248.5, "logits/rejected": -24610920.0, "logps/chosen": -112.32329559326172, "logps/rejected": -345.31854248046875, "loss": 0.3238, "rewards/chosen": -0.1354977786540985, "rewards/margins": 1.9782207310199738, "rewards/rejected": -2.1137185096740723, "step": 2744 }, { "epoch": 0.1454959849468635, "grad_norm": 44.5, "kl": 0.29747676849365234, "learning_rate": 5e-07, "logits/chosen": -18646976.0, "logits/rejected": 24997846.0, "logps/chosen": -147.8457489013672, "logps/rejected": -351.57342529296875, "loss": 0.413, "rewards/chosen": -0.12004732340574265, "rewards/margins": 1.4856205061078072, "rewards/rejected": -1.6056678295135498, "step": 2745 }, { "epoch": 0.14554898894866564, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26271825.6, "logits/rejected": -21323910.666666668, "logps/chosen": -349.967724609375, "logps/rejected": -373.6570231119792, "loss": 0.3709, "rewards/chosen": 0.04568513035774231, "rewards/margins": 1.6715752263863881, "rewards/rejected": -1.6258900960286458, "step": 2746 }, { "epoch": 0.14560199295046777, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25912984.0, "logits/rejected": -11873900.0, "logps/chosen": -272.8236490885417, "logps/rejected": -380.0848083496094, "loss": 0.3905, "rewards/chosen": 0.12272896369298299, "rewards/margins": 2.693315406640371, "rewards/rejected": -2.5705864429473877, "step": 2747 }, { "epoch": 0.14565499695226988, "grad_norm": 43.0, "kl": 0.6015701293945312, "learning_rate": 5e-07, "logits/chosen": -35799852.8, "logits/rejected": -36247957.333333336, "logps/chosen": -205.9848388671875, "logps/rejected": -299.7344156901042, "loss": 0.3739, "rewards/chosen": 0.410890007019043, "rewards/margins": 1.6637093067169189, "rewards/rejected": -1.252819299697876, "step": 2748 }, { "epoch": 0.14570800095407202, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28775152.0, "logits/rejected": -6253787.0, "logps/chosen": -439.9464518229167, "logps/rejected": -203.77951049804688, "loss": 0.3357, "rewards/chosen": 0.43394744396209717, "rewards/margins": 2.247839093208313, "rewards/rejected": -1.8138916492462158, "step": 2749 }, { "epoch": 0.14576100495587416, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36572512.0, "logits/rejected": -8353138.4, "logps/chosen": -160.8401082356771, "logps/rejected": -185.61534423828124, "loss": 0.3823, "rewards/chosen": -0.4109385013580322, "rewards/margins": 0.7867602825164794, "rewards/rejected": -1.1976987838745117, "step": 2750 }, { "epoch": 0.1458140089576763, "grad_norm": 91.0, "kl": 0.04390907287597656, "learning_rate": 5e-07, "logits/chosen": -98731944.0, "logits/rejected": -4857941.0, "logps/chosen": -1105.8834228515625, "logps/rejected": -204.05422973632812, "loss": 0.361, "rewards/chosen": 0.44488754868507385, "rewards/margins": 1.214775711297989, "rewards/rejected": -0.769888162612915, "step": 2751 }, { "epoch": 0.14586701295947843, "grad_norm": 44.5, "kl": 0.0742645263671875, "learning_rate": 5e-07, "logits/chosen": -10268432.8, "logits/rejected": 7536412.666666667, "logps/chosen": -157.09464111328126, "logps/rejected": -259.9794514973958, "loss": 0.4217, "rewards/chosen": -0.16055769920349122, "rewards/margins": 1.2091637134552002, "rewards/rejected": -1.3697214126586914, "step": 2752 }, { "epoch": 0.14592001696128057, "grad_norm": 61.0, "kl": 2.566730499267578, "learning_rate": 5e-07, "logits/chosen": -89054472.0, "logits/rejected": 360629.8125, "logps/chosen": -644.7312622070312, "logps/rejected": -103.05001831054688, "loss": 0.3304, "rewards/chosen": 0.5350766777992249, "rewards/margins": 1.9151033759117126, "rewards/rejected": -1.3800266981124878, "step": 2753 }, { "epoch": 0.1459730209630827, "grad_norm": 56.5, "kl": 1.2234954833984375, "learning_rate": 5e-07, "logits/chosen": -7140913.333333333, "logits/rejected": -15109566.0, "logps/chosen": -363.132080078125, "logps/rejected": -193.2149658203125, "loss": 0.3741, "rewards/chosen": 0.5741715431213379, "rewards/margins": 1.569682002067566, "rewards/rejected": -0.995510458946228, "step": 2754 }, { "epoch": 0.14602602496488484, "grad_norm": 53.0, "kl": 0.8370285034179688, "learning_rate": 5e-07, "logits/chosen": -59727260.0, "logits/rejected": -10242408.0, "logps/chosen": -362.338134765625, "logps/rejected": -114.45960998535156, "loss": 0.3855, "rewards/chosen": 0.12820512056350708, "rewards/margins": 1.1951971650123596, "rewards/rejected": -1.0669920444488525, "step": 2755 }, { "epoch": 0.14607902896668698, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 734105.875, "logits/rejected": -45847712.0, "logps/chosen": -329.29010009765625, "logps/rejected": -372.6590983072917, "loss": 0.2455, "rewards/chosen": 0.3997696042060852, "rewards/margins": 2.1056756774584455, "rewards/rejected": -1.70590607325236, "step": 2756 }, { "epoch": 0.14613203296848912, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17861036.8, "logits/rejected": -13211769.333333334, "logps/chosen": -231.3844970703125, "logps/rejected": -212.7578125, "loss": 0.3736, "rewards/chosen": 0.009489953517913818, "rewards/margins": 1.6499380469322205, "rewards/rejected": -1.6404480934143066, "step": 2757 }, { "epoch": 0.14618503697029125, "grad_norm": 47.75, "kl": 0.017368316650390625, "learning_rate": 5e-07, "logits/chosen": 18591206.0, "logits/rejected": -22877514.0, "logps/chosen": -312.6796569824219, "logps/rejected": -292.5501708984375, "loss": 0.3115, "rewards/chosen": 0.20852360129356384, "rewards/margins": 2.1702912747859955, "rewards/rejected": -1.9617676734924316, "step": 2758 }, { "epoch": 0.1462380409720934, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4915429.666666667, "logits/rejected": -16077566.0, "logps/chosen": -183.65804036458334, "logps/rejected": -111.97186279296875, "loss": 0.4933, "rewards/chosen": -0.2705369790395101, "rewards/margins": 0.5578056971232097, "rewards/rejected": -0.8283426761627197, "step": 2759 }, { "epoch": 0.14629104497389553, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6394142.666666667, "logits/rejected": -7464935.2, "logps/chosen": -397.0218505859375, "logps/rejected": -690.08564453125, "loss": 0.2451, "rewards/chosen": 0.1757779320081075, "rewards/margins": 2.6053487022717796, "rewards/rejected": -2.429570770263672, "step": 2760 }, { "epoch": 0.14634404897569767, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68081913.6, "logits/rejected": -33199144.0, "logps/chosen": -356.57431640625, "logps/rejected": -216.9322509765625, "loss": 0.3757, "rewards/chosen": 0.2239445209503174, "rewards/margins": 1.3942251364390057, "rewards/rejected": -1.1702806154886882, "step": 2761 }, { "epoch": 0.1463970529774998, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53192640.0, "logits/rejected": -7286042.0, "logps/chosen": -213.79753766741072, "logps/rejected": -38.98896026611328, "loss": 0.5006, "rewards/chosen": -0.16559415204184397, "rewards/margins": 1.101064988545009, "rewards/rejected": -1.266659140586853, "step": 2762 }, { "epoch": 0.14645005697930194, "grad_norm": 40.0, "kl": 0.3590831756591797, "learning_rate": 5e-07, "logits/chosen": -22036776.0, "logits/rejected": -36258450.666666664, "logps/chosen": -92.64849243164062, "logps/rejected": -327.0144449869792, "loss": 0.3891, "rewards/chosen": 0.13173489570617675, "rewards/margins": 1.6519524733225506, "rewards/rejected": -1.5202175776163738, "step": 2763 }, { "epoch": 0.14650306098110408, "grad_norm": 62.75, "kl": 0.7460250854492188, "learning_rate": 5e-07, "logits/chosen": -61470848.0, "logits/rejected": -1770071.25, "logps/chosen": -322.798828125, "logps/rejected": -56.44715118408203, "loss": 0.4167, "rewards/chosen": 0.24851008823939733, "rewards/margins": 2.4196295397622243, "rewards/rejected": -2.171119451522827, "step": 2764 }, { "epoch": 0.14655606498290621, "grad_norm": 45.5, "kl": 0.2075800895690918, "learning_rate": 5e-07, "logits/chosen": -31278650.666666668, "logits/rejected": -16924548.8, "logps/chosen": -249.25870768229166, "logps/rejected": -220.1859375, "loss": 0.3091, "rewards/chosen": 0.17414391040802002, "rewards/margins": 1.5831486940383912, "rewards/rejected": -1.4090047836303712, "step": 2765 }, { "epoch": 0.14660906898470835, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79711248.0, "logits/rejected": -12402104.0, "logps/chosen": -509.748046875, "logps/rejected": -205.773046875, "loss": 0.3497, "rewards/chosen": 0.0589473694562912, "rewards/margins": 1.1626872032880784, "rewards/rejected": -1.1037398338317872, "step": 2766 }, { "epoch": 0.1466620729865105, "grad_norm": 50.5, "kl": 0.10141754150390625, "learning_rate": 5e-07, "logits/chosen": -25910022.4, "logits/rejected": -38395674.666666664, "logps/chosen": -314.4021728515625, "logps/rejected": -454.0234375, "loss": 0.3264, "rewards/chosen": 0.21783576011657715, "rewards/margins": 2.309510405858358, "rewards/rejected": -2.0916746457417807, "step": 2767 }, { "epoch": 0.14671507698831263, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42442696.0, "logits/rejected": -40691496.0, "logps/chosen": -366.7991943359375, "logps/rejected": -229.61044311523438, "loss": 0.3394, "rewards/chosen": 0.008723169565200806, "rewards/margins": 1.715576320886612, "rewards/rejected": -1.7068531513214111, "step": 2768 }, { "epoch": 0.14676808099011476, "grad_norm": 41.0, "kl": 0.1080617904663086, "learning_rate": 5e-07, "logits/chosen": -17236240.0, "logits/rejected": -15437229.333333334, "logps/chosen": -56.87718200683594, "logps/rejected": -469.0939534505208, "loss": 0.2278, "rewards/chosen": -0.006201458163559437, "rewards/margins": 2.1008894918486476, "rewards/rejected": -2.107090950012207, "step": 2769 }, { "epoch": 0.1468210849919169, "grad_norm": 71.5, "kl": 0.34412384033203125, "learning_rate": 5e-07, "logits/chosen": 6150491.6, "logits/rejected": -24599866.666666668, "logps/chosen": -605.4794921875, "logps/rejected": -279.3943277994792, "loss": 0.3449, "rewards/chosen": 0.28393471240997314, "rewards/margins": 2.0794626474380493, "rewards/rejected": -1.7955279350280762, "step": 2770 }, { "epoch": 0.14687408899371904, "grad_norm": 47.75, "kl": 0.1022186279296875, "learning_rate": 5e-07, "logits/chosen": -19184738.666666668, "logits/rejected": -7682233.0, "logps/chosen": -170.48672485351562, "logps/rejected": -382.7784118652344, "loss": 0.4438, "rewards/chosen": -0.18283923467000326, "rewards/margins": 1.6833876768747966, "rewards/rejected": -1.8662269115447998, "step": 2771 }, { "epoch": 0.14692709299552117, "grad_norm": 77.0, "kl": 0.7945632934570312, "learning_rate": 5e-07, "logits/chosen": -71315436.8, "logits/rejected": -23118088.0, "logps/chosen": -641.54033203125, "logps/rejected": -364.65625, "loss": 0.3623, "rewards/chosen": 0.1572834610939026, "rewards/margins": 2.2227427124977113, "rewards/rejected": -2.0654592514038086, "step": 2772 }, { "epoch": 0.14698009699732328, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8811132.8, "logits/rejected": -18762725.333333332, "logps/chosen": -315.1512451171875, "logps/rejected": -314.94944254557294, "loss": 0.3231, "rewards/chosen": 0.25253844261169434, "rewards/margins": 2.455116033554077, "rewards/rejected": -2.202577590942383, "step": 2773 }, { "epoch": 0.14703310099912542, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37429123.2, "logits/rejected": -29290562.666666668, "logps/chosen": -260.58779296875, "logps/rejected": -429.2244466145833, "loss": 0.3916, "rewards/chosen": -0.05084221363067627, "rewards/margins": 1.5847168684005737, "rewards/rejected": -1.63555908203125, "step": 2774 }, { "epoch": 0.14708610500092756, "grad_norm": 64.5, "kl": 0.38605499267578125, "learning_rate": 5e-07, "logits/chosen": -94617584.0, "logits/rejected": -22011784.0, "logps/chosen": -296.61151123046875, "logps/rejected": -187.03030395507812, "loss": 0.3635, "rewards/chosen": 0.22683143615722656, "rewards/margins": 1.325563907623291, "rewards/rejected": -1.0987324714660645, "step": 2775 }, { "epoch": 0.1471391090027297, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23568920.0, "logits/rejected": 78694412.8, "logps/chosen": -217.627685546875, "logps/rejected": -439.031591796875, "loss": 0.3303, "rewards/chosen": 0.1541005571683248, "rewards/margins": 1.7095696886380514, "rewards/rejected": -1.5554691314697267, "step": 2776 }, { "epoch": 0.14719211300453183, "grad_norm": 63.75, "kl": 0.1744384765625, "learning_rate": 5e-07, "logits/chosen": -26895593.6, "logits/rejected": -192982976.0, "logps/chosen": -526.83740234375, "logps/rejected": -410.0105387369792, "loss": 0.3056, "rewards/chosen": 0.20459258556365967, "rewards/margins": 3.2771958112716675, "rewards/rejected": -3.072603225708008, "step": 2777 }, { "epoch": 0.14724511700633397, "grad_norm": 46.5, "kl": 0.7086143493652344, "learning_rate": 5e-07, "logits/chosen": -24453520.0, "logits/rejected": -8595645.0, "logps/chosen": -386.95770263671875, "logps/rejected": -152.72433471679688, "loss": 0.3265, "rewards/chosen": 0.5349367260932922, "rewards/margins": 1.8913083672523499, "rewards/rejected": -1.3563716411590576, "step": 2778 }, { "epoch": 0.1472981210081361, "grad_norm": 32.5, "kl": 0.22515392303466797, "learning_rate": 5e-07, "logits/chosen": -10649000.0, "logits/rejected": -19951902.0, "logps/chosen": -168.81423950195312, "logps/rejected": -395.20343017578125, "loss": 0.2354, "rewards/chosen": 0.647786557674408, "rewards/margins": 3.3041345477104187, "rewards/rejected": -2.6563479900360107, "step": 2779 }, { "epoch": 0.14735112500993824, "grad_norm": 54.75, "kl": 1.3829193115234375, "learning_rate": 5e-07, "logits/chosen": -53987872.0, "logits/rejected": -26725212.8, "logps/chosen": -750.8943684895834, "logps/rejected": -303.4378173828125, "loss": 0.2449, "rewards/chosen": 1.2529123624165852, "rewards/margins": 2.7190120061238607, "rewards/rejected": -1.4660996437072753, "step": 2780 }, { "epoch": 0.14740412901174038, "grad_norm": 51.0, "kl": 0.2012958526611328, "learning_rate": 5e-07, "logits/chosen": -20227075.2, "logits/rejected": -37300736.0, "logps/chosen": -226.5618896484375, "logps/rejected": -416.5162353515625, "loss": 0.336, "rewards/chosen": 0.2150350332260132, "rewards/margins": 2.328747661908468, "rewards/rejected": -2.1137126286824546, "step": 2781 }, { "epoch": 0.14745713301354252, "grad_norm": 51.25, "kl": 1.605560302734375, "learning_rate": 5e-07, "logits/chosen": -20009680.0, "logits/rejected": -43399736.0, "logps/chosen": -297.878759765625, "logps/rejected": -545.8140055338541, "loss": 0.3845, "rewards/chosen": 0.10871632099151611, "rewards/margins": 2.3246820688247682, "rewards/rejected": -2.215965747833252, "step": 2782 }, { "epoch": 0.14751013701534466, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30106428.8, "logits/rejected": 1045752.0, "logps/chosen": -232.5181884765625, "logps/rejected": -97.47438557942708, "loss": 0.4402, "rewards/chosen": -0.24546775817871094, "rewards/margins": 0.9391836166381836, "rewards/rejected": -1.1846513748168945, "step": 2783 }, { "epoch": 0.1475631410171468, "grad_norm": 54.0, "kl": 0.5786800384521484, "learning_rate": 5e-07, "logits/chosen": -16652348.8, "logits/rejected": 9671494.666666666, "logps/chosen": -282.70693359375, "logps/rejected": -234.8499755859375, "loss": 0.3154, "rewards/chosen": 0.46423611640930174, "rewards/margins": 1.937632417678833, "rewards/rejected": -1.4733963012695312, "step": 2784 }, { "epoch": 0.14761614501894893, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76174739.2, "logits/rejected": -30623717.333333332, "logps/chosen": -407.4092529296875, "logps/rejected": -239.97261555989584, "loss": 0.3379, "rewards/chosen": 0.5056027412414551, "rewards/margins": 1.7525953292846679, "rewards/rejected": -1.246992588043213, "step": 2785 }, { "epoch": 0.14766914902075107, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18123993.333333332, "logits/rejected": -25786710.4, "logps/chosen": -158.53750610351562, "logps/rejected": -570.38173828125, "loss": 0.293, "rewards/chosen": -0.21945138772328696, "rewards/margins": 1.906154195467631, "rewards/rejected": -2.125605583190918, "step": 2786 }, { "epoch": 0.1477221530225532, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39261928.0, "logits/rejected": -15065219.2, "logps/chosen": -423.01220703125, "logps/rejected": -331.4491943359375, "loss": 0.2222, "rewards/chosen": 0.7291096846262614, "rewards/margins": 2.6438893477121987, "rewards/rejected": -1.9147796630859375, "step": 2787 }, { "epoch": 0.14777515702435534, "grad_norm": 48.25, "kl": 1.280564308166504, "learning_rate": 5e-07, "logits/chosen": -64833312.0, "logits/rejected": -18612304.0, "logps/chosen": -208.55776977539062, "logps/rejected": -131.13632202148438, "loss": 0.2981, "rewards/chosen": 0.5891244411468506, "rewards/margins": 2.078248143196106, "rewards/rejected": -1.4891237020492554, "step": 2788 }, { "epoch": 0.14782816102615748, "grad_norm": 56.25, "kl": 0.08185768127441406, "learning_rate": 5e-07, "logits/chosen": -29925740.8, "logits/rejected": -21969448.0, "logps/chosen": -343.063232421875, "logps/rejected": -265.0839029947917, "loss": 0.3035, "rewards/chosen": 0.5814001083374023, "rewards/margins": 2.1090792338053386, "rewards/rejected": -1.5276791254679363, "step": 2789 }, { "epoch": 0.14788116502795962, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52130010.666666664, "logits/rejected": -48163612.8, "logps/chosen": -268.50083414713544, "logps/rejected": -422.4568359375, "loss": 0.3063, "rewards/chosen": -0.16466662287712097, "rewards/margins": 1.6893519699573516, "rewards/rejected": -1.8540185928344726, "step": 2790 }, { "epoch": 0.14793416902976175, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8253588.0, "logits/rejected": -78823552.0, "logps/chosen": -121.72258758544922, "logps/rejected": -274.1148681640625, "loss": 0.2732, "rewards/chosen": -0.25601959228515625, "rewards/margins": 1.7774527867635093, "rewards/rejected": -2.0334723790486655, "step": 2791 }, { "epoch": 0.1479871730315639, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29729651.2, "logits/rejected": -23017109.333333332, "logps/chosen": -380.362744140625, "logps/rejected": -447.43994140625, "loss": 0.2848, "rewards/chosen": 0.5124248504638672, "rewards/margins": 2.587681198120117, "rewards/rejected": -2.07525634765625, "step": 2792 }, { "epoch": 0.14804017703336603, "grad_norm": 44.0, "kl": 0.7414588928222656, "learning_rate": 5e-07, "logits/chosen": -36498073.6, "logits/rejected": -16078673.333333334, "logps/chosen": -198.078369140625, "logps/rejected": -153.57231648763022, "loss": 0.4595, "rewards/chosen": -0.12616896629333496, "rewards/margins": 0.7928880850474039, "rewards/rejected": -0.9190570513407389, "step": 2793 }, { "epoch": 0.14809318103516816, "grad_norm": 57.25, "kl": 1.6466598510742188, "learning_rate": 5e-07, "logits/chosen": -23594336.0, "logits/rejected": -29369540.0, "logps/chosen": -644.429931640625, "logps/rejected": -319.7847900390625, "loss": 0.3171, "rewards/chosen": 0.3925117552280426, "rewards/margins": 2.411996752023697, "rewards/rejected": -2.0194849967956543, "step": 2794 }, { "epoch": 0.1481461850369703, "grad_norm": 68.0, "kl": 0.41295433044433594, "learning_rate": 5e-07, "logits/chosen": -36886811.428571425, "logits/rejected": -68484720.0, "logps/chosen": -557.7834821428571, "logps/rejected": -706.3284912109375, "loss": 0.3819, "rewards/chosen": 0.376699583871024, "rewards/margins": 4.110122578484671, "rewards/rejected": -3.7334229946136475, "step": 2795 }, { "epoch": 0.14819918903877244, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42011872.0, "logits/rejected": -41218981.333333336, "logps/chosen": -574.230224609375, "logps/rejected": -528.5772298177084, "loss": 0.2054, "rewards/chosen": 0.049588024616241455, "rewards/margins": 2.7447784543037415, "rewards/rejected": -2.6951904296875, "step": 2796 }, { "epoch": 0.14825219304057458, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44438152.0, "logits/rejected": -45914476.8, "logps/chosen": -360.74755859375, "logps/rejected": -362.8096435546875, "loss": 0.2813, "rewards/chosen": -0.026678969462712605, "rewards/margins": 2.062571976582209, "rewards/rejected": -2.089250946044922, "step": 2797 }, { "epoch": 0.1483051970423767, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35450808.0, "logits/rejected": -23044896.0, "logps/chosen": -281.05987548828125, "logps/rejected": -299.22475179036456, "loss": 0.223, "rewards/chosen": 0.05714645981788635, "rewards/margins": 2.202166626850764, "rewards/rejected": -2.1450201670328775, "step": 2798 }, { "epoch": 0.14835820104417882, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55128057.6, "logits/rejected": -13644657.333333334, "logps/chosen": -311.6622802734375, "logps/rejected": -194.35546875, "loss": 0.3505, "rewards/chosen": 0.604305362701416, "rewards/margins": 1.4753105799357096, "rewards/rejected": -0.8710052172342936, "step": 2799 }, { "epoch": 0.14841120504598096, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32262501.333333332, "logits/rejected": -3643201.6, "logps/chosen": -162.65347290039062, "logps/rejected": -208.350634765625, "loss": 0.3743, "rewards/chosen": -0.7023390134175619, "rewards/margins": 0.744642988840739, "rewards/rejected": -1.4469820022583009, "step": 2800 }, { "epoch": 0.1484642090477831, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67334314.66666667, "logits/rejected": -8258828.0, "logps/chosen": -312.1837158203125, "logps/rejected": -249.0156005859375, "loss": 0.3601, "rewards/chosen": -0.36457570393880206, "rewards/margins": 1.1085929552714031, "rewards/rejected": -1.4731686592102051, "step": 2801 }, { "epoch": 0.14851721304958523, "grad_norm": 87.0, "kl": 1.0831794738769531, "learning_rate": 5e-07, "logits/chosen": -8935391.333333334, "logits/rejected": -2902732.5, "logps/chosen": -507.6258544921875, "logps/rejected": -128.11143493652344, "loss": 0.408, "rewards/chosen": 0.199414332707723, "rewards/margins": 1.7891451915105183, "rewards/rejected": -1.5897308588027954, "step": 2802 }, { "epoch": 0.14857021705138737, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40148792.0, "logits/rejected": -11830529.0, "logps/chosen": -308.478271484375, "logps/rejected": -359.16796875, "loss": 0.3851, "rewards/chosen": -0.06461944431066513, "rewards/margins": 1.4367819800972939, "rewards/rejected": -1.501401424407959, "step": 2803 }, { "epoch": 0.1486232210531895, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2643918.3333333335, "logits/rejected": -79118278.4, "logps/chosen": -158.8934326171875, "logps/rejected": -520.8138671875, "loss": 0.2699, "rewards/chosen": 0.18334094683329263, "rewards/margins": 2.02268230120341, "rewards/rejected": -1.8393413543701171, "step": 2804 }, { "epoch": 0.14867622505499165, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55929024.0, "logits/rejected": -20286532.0, "logps/chosen": -332.7139892578125, "logps/rejected": -605.8793334960938, "loss": 0.375, "rewards/chosen": -0.20954829454421997, "rewards/margins": 1.8433837294578552, "rewards/rejected": -2.052932024002075, "step": 2805 }, { "epoch": 0.14872922905679378, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44828040.0, "logits/rejected": 85720906.66666667, "logps/chosen": -443.3399963378906, "logps/rejected": -492.3211669921875, "loss": 0.2755, "rewards/chosen": -0.44106751680374146, "rewards/margins": 1.4803500374158223, "rewards/rejected": -1.9214175542195637, "step": 2806 }, { "epoch": 0.14878223305859592, "grad_norm": 54.5, "kl": 0.2872276306152344, "learning_rate": 5e-07, "logits/chosen": -49847696.0, "logits/rejected": -45497684.0, "logps/chosen": -381.1692199707031, "logps/rejected": -390.4110107421875, "loss": 0.3223, "rewards/chosen": 0.21122398972511292, "rewards/margins": 2.064572721719742, "rewards/rejected": -1.853348731994629, "step": 2807 }, { "epoch": 0.14883523706039806, "grad_norm": 52.5, "kl": 1.9224224090576172, "learning_rate": 5e-07, "logits/chosen": -2337451.0, "logits/rejected": -2192567.6, "logps/chosen": -261.52821858723956, "logps/rejected": -192.940087890625, "loss": 0.2378, "rewards/chosen": 1.351780891418457, "rewards/margins": 2.963395881652832, "rewards/rejected": -1.611614990234375, "step": 2808 }, { "epoch": 0.1488882410622002, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16607708.8, "logits/rejected": -28769642.666666668, "logps/chosen": -291.7891845703125, "logps/rejected": -420.7150065104167, "loss": 0.3238, "rewards/chosen": 0.10986456871032715, "rewards/margins": 2.7604719956715904, "rewards/rejected": -2.650607426961263, "step": 2809 }, { "epoch": 0.14894124506400233, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50291765.333333336, "logits/rejected": -25360022.4, "logps/chosen": -584.5372721354166, "logps/rejected": -327.631982421875, "loss": 0.2357, "rewards/chosen": 0.8495158354441324, "rewards/margins": 2.2698681036631267, "rewards/rejected": -1.4203522682189942, "step": 2810 }, { "epoch": 0.14899424906580447, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41222704.0, "logits/rejected": -2039315.0, "logps/chosen": -237.52163696289062, "logps/rejected": -142.62277221679688, "loss": 0.3601, "rewards/chosen": 0.08607397973537445, "rewards/margins": 1.5014926046133041, "rewards/rejected": -1.4154186248779297, "step": 2811 }, { "epoch": 0.1490472530676066, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30757795.2, "logits/rejected": -43038202.666666664, "logps/chosen": -213.1400634765625, "logps/rejected": -366.9999186197917, "loss": 0.3618, "rewards/chosen": 0.09356367588043213, "rewards/margins": 1.761609673500061, "rewards/rejected": -1.668045997619629, "step": 2812 }, { "epoch": 0.14910025706940874, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1875261.3333333333, "logits/rejected": -18303011.2, "logps/chosen": -259.1169026692708, "logps/rejected": -246.0473388671875, "loss": 0.3002, "rewards/chosen": 0.24153123299280801, "rewards/margins": 1.6678930799166363, "rewards/rejected": -1.4263618469238282, "step": 2813 }, { "epoch": 0.14915326107121088, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48804984.0, "logits/rejected": -7460534.0, "logps/chosen": -345.01568603515625, "logps/rejected": -314.81658935546875, "loss": 0.3456, "rewards/chosen": 0.054228782653808594, "rewards/margins": 1.419604778289795, "rewards/rejected": -1.3653759956359863, "step": 2814 }, { "epoch": 0.14920626507301302, "grad_norm": 41.25, "kl": 0.012910842895507812, "learning_rate": 5e-07, "logits/chosen": -3084731.75, "logits/rejected": -52096373.333333336, "logps/chosen": -267.66815185546875, "logps/rejected": -472.6923421223958, "loss": 0.198, "rewards/chosen": 0.21723672747612, "rewards/margins": 2.579570025205612, "rewards/rejected": -2.362333297729492, "step": 2815 }, { "epoch": 0.14925926907481515, "grad_norm": 53.0, "kl": 0.5339927673339844, "learning_rate": 5e-07, "logits/chosen": -45778044.0, "logits/rejected": -19522258.0, "logps/chosen": -391.935791015625, "logps/rejected": -171.64981079101562, "loss": 0.3744, "rewards/chosen": 0.4335800111293793, "rewards/margins": 1.406210333108902, "rewards/rejected": -0.9726303219795227, "step": 2816 }, { "epoch": 0.1493122730766173, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23532946.0, "logits/rejected": -17118454.0, "logps/chosen": -321.9593200683594, "logps/rejected": -207.39938354492188, "loss": 0.3153, "rewards/chosen": 0.35899507999420166, "rewards/margins": 1.7586065530776978, "rewards/rejected": -1.399611473083496, "step": 2817 }, { "epoch": 0.14936527707841943, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10666790.0, "logits/rejected": -21616354.0, "logps/chosen": -190.29266357421875, "logps/rejected": -288.7058410644531, "loss": 0.3216, "rewards/chosen": 0.06939199566841125, "rewards/margins": 1.9648554623126984, "rewards/rejected": -1.895463466644287, "step": 2818 }, { "epoch": 0.14941828108022157, "grad_norm": 57.25, "kl": 0.11063385009765625, "learning_rate": 5e-07, "logits/chosen": -50264448.0, "logits/rejected": -32233928.0, "logps/chosen": -352.4804992675781, "logps/rejected": -198.89852905273438, "loss": 0.3737, "rewards/chosen": 0.04236069321632385, "rewards/margins": 1.2605116665363312, "rewards/rejected": -1.2181509733200073, "step": 2819 }, { "epoch": 0.1494712850820237, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28809556.57142857, "logits/rejected": -15515278.0, "logps/chosen": -460.5982142857143, "logps/rejected": -273.64056396484375, "loss": 0.3521, "rewards/chosen": 0.495565618787493, "rewards/margins": 2.871559109006609, "rewards/rejected": -2.375993490219116, "step": 2820 }, { "epoch": 0.14952428908382584, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12553713.6, "logits/rejected": -54179994.666666664, "logps/chosen": -136.2718505859375, "logps/rejected": -425.5558268229167, "loss": 0.3516, "rewards/chosen": 0.05937185883522034, "rewards/margins": 2.1224803030490875, "rewards/rejected": -2.063108444213867, "step": 2821 }, { "epoch": 0.14957729308562798, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93582576.0, "logits/rejected": -17371345.14285714, "logps/chosen": -366.529541015625, "logps/rejected": -313.14383370535717, "loss": 0.2439, "rewards/chosen": -0.40002748370170593, "rewards/margins": 1.586474482502256, "rewards/rejected": -1.986501966203962, "step": 2822 }, { "epoch": 0.14963029708743011, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17395283.2, "logits/rejected": -16308256.0, "logps/chosen": -449.1177734375, "logps/rejected": -144.51216634114584, "loss": 0.3463, "rewards/chosen": 0.1645603895187378, "rewards/margins": 1.9961445252100627, "rewards/rejected": -1.831584135691325, "step": 2823 }, { "epoch": 0.14968330108923222, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2442124.0, "logits/rejected": -127328152.0, "logps/chosen": -265.1202087402344, "logps/rejected": -447.1202392578125, "loss": 0.3286, "rewards/chosen": -0.06530246138572693, "rewards/margins": 1.8752135336399078, "rewards/rejected": -1.9405159950256348, "step": 2824 }, { "epoch": 0.14973630509103436, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18478470.666666668, "logits/rejected": -46922115.2, "logps/chosen": -257.26283772786456, "logps/rejected": -319.969189453125, "loss": 0.3114, "rewards/chosen": -0.12589164574941, "rewards/margins": 1.8125646988550823, "rewards/rejected": -1.9384563446044922, "step": 2825 }, { "epoch": 0.1497893090928365, "grad_norm": 50.5, "kl": 1.1764593124389648, "learning_rate": 5e-07, "logits/chosen": -8542380.0, "logits/rejected": -22347776.0, "logps/chosen": -214.96104431152344, "logps/rejected": -276.40802001953125, "loss": 0.3687, "rewards/chosen": 0.17349880933761597, "rewards/margins": 1.583042562007904, "rewards/rejected": -1.409543752670288, "step": 2826 }, { "epoch": 0.14984231309463864, "grad_norm": 51.5, "kl": 0.02388763427734375, "learning_rate": 5e-07, "logits/chosen": -32907988.0, "logits/rejected": -11331809.0, "logps/chosen": -485.5942077636719, "logps/rejected": -320.337646484375, "loss": 0.2523, "rewards/chosen": 0.6802808046340942, "rewards/margins": 3.0568405389785767, "rewards/rejected": -2.3765597343444824, "step": 2827 }, { "epoch": 0.14989531709644077, "grad_norm": 50.5, "kl": 0.6336021423339844, "learning_rate": 5e-07, "logits/chosen": -24420201.6, "logits/rejected": -43525296.0, "logps/chosen": -232.349755859375, "logps/rejected": -420.7777506510417, "loss": 0.355, "rewards/chosen": 0.10324561595916748, "rewards/margins": 2.3828076124191284, "rewards/rejected": -2.279561996459961, "step": 2828 }, { "epoch": 0.1499483210982429, "grad_norm": 61.5, "kl": 0.052913665771484375, "learning_rate": 5e-07, "logits/chosen": -58973830.4, "logits/rejected": -10623884.666666666, "logps/chosen": -419.27890625, "logps/rejected": -228.03938802083334, "loss": 0.4362, "rewards/chosen": -0.02061249315738678, "rewards/margins": 0.7571735233068466, "rewards/rejected": -0.7777860164642334, "step": 2829 }, { "epoch": 0.15000132510004505, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22595016.0, "logits/rejected": -10062626.4, "logps/chosen": -392.9589029947917, "logps/rejected": -498.05546875, "loss": 0.2874, "rewards/chosen": -0.07705485324064891, "rewards/margins": 2.054859094818433, "rewards/rejected": -2.131913948059082, "step": 2830 }, { "epoch": 0.15005432910184718, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20966452.0, "logits/rejected": -16351942.666666666, "logps/chosen": -91.81747436523438, "logps/rejected": -399.11962890625, "loss": 0.19, "rewards/chosen": 0.14289578795433044, "rewards/margins": 2.6350174844264984, "rewards/rejected": -2.492121696472168, "step": 2831 }, { "epoch": 0.15010733310364932, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29442995.2, "logits/rejected": -14161921.333333334, "logps/chosen": -472.27607421875, "logps/rejected": -504.906005859375, "loss": 0.365, "rewards/chosen": 0.33527042865753176, "rewards/margins": 1.7526095946629843, "rewards/rejected": -1.4173391660054524, "step": 2832 }, { "epoch": 0.15016033710545146, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58676448.0, "logits/rejected": -11951216.0, "logps/chosen": -431.08575439453125, "logps/rejected": -191.4679718017578, "loss": 0.318, "rewards/chosen": 0.5127711296081543, "rewards/margins": 1.7773363590240479, "rewards/rejected": -1.2645652294158936, "step": 2833 }, { "epoch": 0.1502133411072536, "grad_norm": 54.5, "kl": 0.3080863952636719, "learning_rate": 5e-07, "logits/chosen": -26768178.666666668, "logits/rejected": -26701619.2, "logps/chosen": -352.7654215494792, "logps/rejected": -396.8055908203125, "loss": 0.3616, "rewards/chosen": 0.4790511926015218, "rewards/margins": 1.3865548928578695, "rewards/rejected": -0.9075037002563476, "step": 2834 }, { "epoch": 0.15026634510905573, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21768064.0, "logits/rejected": -3949941.0, "logps/chosen": -162.32794189453125, "logps/rejected": -486.8929036458333, "loss": 0.3092, "rewards/chosen": -0.45205098390579224, "rewards/margins": 1.8552882075309753, "rewards/rejected": -2.3073391914367676, "step": 2835 }, { "epoch": 0.15031934911085787, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35702453.333333336, "logits/rejected": -14068592.0, "logps/chosen": -307.9748128255208, "logps/rejected": -284.50625, "loss": 0.3699, "rewards/chosen": -0.06611202160517375, "rewards/margins": 0.9563357551892598, "rewards/rejected": -1.0224477767944335, "step": 2836 }, { "epoch": 0.15037235311266, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45497322.666666664, "logits/rejected": -22987734.4, "logps/chosen": -218.70782470703125, "logps/rejected": -622.756494140625, "loss": 0.2588, "rewards/chosen": 0.07428896427154541, "rewards/margins": 2.4197754144668577, "rewards/rejected": -2.3454864501953123, "step": 2837 }, { "epoch": 0.15042535711446214, "grad_norm": 76.0, "kl": 0.8468856811523438, "learning_rate": 5e-07, "logits/chosen": -22378564.0, "logps/chosen": -381.9280090332031, "loss": 0.4412, "rewards/chosen": 0.34039241075515747, "step": 2838 }, { "epoch": 0.15047836111626428, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25984176.0, "logits/rejected": -7629566.5, "logps/chosen": -287.98870849609375, "logps/rejected": -524.8805541992188, "loss": 0.324, "rewards/chosen": -0.011364750564098358, "rewards/margins": 2.7271291688084602, "rewards/rejected": -2.7384939193725586, "step": 2839 }, { "epoch": 0.15053136511806642, "grad_norm": 51.25, "kl": 0.1469440460205078, "learning_rate": 5e-07, "logits/chosen": -19916948.8, "logits/rejected": -6917016.666666667, "logps/chosen": -235.514990234375, "logps/rejected": -109.32157389322917, "loss": 0.3641, "rewards/chosen": 0.33515152931213377, "rewards/margins": 1.4138337930043536, "rewards/rejected": -1.07868226369222, "step": 2840 }, { "epoch": 0.15058436911986856, "grad_norm": 60.5, "kl": 1.1387786865234375, "learning_rate": 5e-07, "logits/chosen": 1584024.0, "logits/rejected": -17968713.6, "logps/chosen": -675.379150390625, "logps/rejected": -385.45263671875, "loss": 0.2531, "rewards/chosen": 0.7474894523620605, "rewards/margins": 2.499780559539795, "rewards/rejected": -1.7522911071777343, "step": 2841 }, { "epoch": 0.1506373731216707, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8675554.4, "logits/rejected": -24341008.0, "logps/chosen": -221.651171875, "logps/rejected": -417.7023518880208, "loss": 0.3935, "rewards/chosen": -0.2028195381164551, "rewards/margins": 2.4041766802469886, "rewards/rejected": -2.606996218363444, "step": 2842 }, { "epoch": 0.15069037712347283, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -224334.5, "logits/rejected": -9875597.714285715, "logps/chosen": -531.8701171875, "logps/rejected": -275.59000069754467, "loss": 0.241, "rewards/chosen": -0.75592041015625, "rewards/margins": 0.9214120592389787, "rewards/rejected": -1.6773324693952287, "step": 2843 }, { "epoch": 0.15074338112527497, "grad_norm": 57.5, "kl": 0.2903757095336914, "learning_rate": 5e-07, "logits/chosen": -25703611.2, "logits/rejected": -7663381.333333333, "logps/chosen": -442.814892578125, "logps/rejected": -178.10636393229166, "loss": 0.3145, "rewards/chosen": 0.5038529872894287, "rewards/margins": 2.451016092300415, "rewards/rejected": -1.9471631050109863, "step": 2844 }, { "epoch": 0.1507963851270771, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26323832.0, "logits/rejected": -40025728.0, "logps/chosen": -233.59286499023438, "logps/rejected": -217.1204376220703, "loss": 0.298, "rewards/chosen": -0.05104656517505646, "rewards/margins": 2.607690379023552, "rewards/rejected": -2.6587369441986084, "step": 2845 }, { "epoch": 0.15084938912887924, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20421510.666666668, "logits/rejected": -74750131.2, "logps/chosen": -103.2260233561198, "logps/rejected": -406.501904296875, "loss": 0.3069, "rewards/chosen": -0.53850785891215, "rewards/margins": 1.7598098595937093, "rewards/rejected": -2.298317718505859, "step": 2846 }, { "epoch": 0.15090239313068138, "grad_norm": 75.5, "kl": 1.401580810546875, "learning_rate": 5e-07, "logits/chosen": -2535449.3333333335, "logits/rejected": -55507824.0, "logps/chosen": -275.9642333984375, "logps/rejected": -540.7221069335938, "loss": 0.3613, "rewards/chosen": 0.3573571840922038, "rewards/margins": 3.01960555712382, "rewards/rejected": -2.662248373031616, "step": 2847 }, { "epoch": 0.15095539713248352, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90065728.0, "logits/rejected": -12209255.2, "logps/chosen": -366.2849934895833, "logps/rejected": -413.365673828125, "loss": 0.2863, "rewards/chosen": 0.17438660065333048, "rewards/margins": 2.6078929940859474, "rewards/rejected": -2.433506393432617, "step": 2848 }, { "epoch": 0.15100840113428562, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66606725.333333336, "logits/rejected": -14589280.0, "logps/chosen": -477.5966389973958, "logps/rejected": -360.2072998046875, "loss": 0.2507, "rewards/chosen": 0.20493469635645548, "rewards/margins": 2.2329559365908302, "rewards/rejected": -2.028021240234375, "step": 2849 }, { "epoch": 0.15106140513608776, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23296826.666666668, "logits/rejected": -48584873.6, "logps/chosen": -755.7511393229166, "logps/rejected": -395.26259765625, "loss": 0.2495, "rewards/chosen": 0.6868128776550293, "rewards/margins": 2.376881504058838, "rewards/rejected": -1.6900686264038085, "step": 2850 }, { "epoch": 0.1511144091378899, "grad_norm": 48.0, "kl": 0.2077178955078125, "learning_rate": 5e-07, "logits/chosen": -21986472.0, "logits/rejected": -14010838.666666666, "logps/chosen": -235.5730712890625, "logps/rejected": -101.39739990234375, "loss": 0.3868, "rewards/chosen": 0.17271603345870973, "rewards/margins": 1.3701119144757588, "rewards/rejected": -1.197395881017049, "step": 2851 }, { "epoch": 0.15116741313969204, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59052981.333333336, "logits/rejected": -26445120.0, "logps/chosen": -359.1699625651042, "logps/rejected": -239.65576171875, "loss": 0.3042, "rewards/chosen": 0.2564062674840291, "rewards/margins": 1.5586862166722615, "rewards/rejected": -1.3022799491882324, "step": 2852 }, { "epoch": 0.15122041714149417, "grad_norm": 55.75, "kl": 0.08982086181640625, "learning_rate": 5e-07, "logits/chosen": -11623763.42857143, "logits/rejected": -59474960.0, "logps/chosen": -268.7379673549107, "logps/rejected": -865.7489013671875, "loss": 0.4439, "rewards/chosen": 0.015726378985813687, "rewards/margins": 2.732090047427586, "rewards/rejected": -2.7163636684417725, "step": 2853 }, { "epoch": 0.1512734211432963, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63390680.0, "logits/rejected": -21221952.0, "logps/chosen": -258.1006164550781, "logps/rejected": -461.13560267857144, "loss": 0.2938, "rewards/chosen": -0.3865066468715668, "rewards/margins": 0.9430112625871385, "rewards/rejected": -1.3295179094587053, "step": 2854 }, { "epoch": 0.15132642514509845, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36762360.0, "logits/rejected": -3550319.2, "logps/chosen": -395.6303304036458, "logps/rejected": -162.3447265625, "loss": 0.2065, "rewards/chosen": 0.8646748860677084, "rewards/margins": 2.6499261220296226, "rewards/rejected": -1.7852512359619142, "step": 2855 }, { "epoch": 0.15137942914690058, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30644168.0, "logits/rejected": -15434650.285714285, "logps/chosen": -371.5612487792969, "logps/rejected": -181.05913434709822, "loss": 0.2125, "rewards/chosen": 0.07297668606042862, "rewards/margins": 1.7834896647504397, "rewards/rejected": -1.710512978690011, "step": 2856 }, { "epoch": 0.15143243314870272, "grad_norm": 62.75, "kl": 0.7761402130126953, "learning_rate": 5e-07, "logits/chosen": -45026777.6, "logits/rejected": 426591.1666666667, "logps/chosen": -371.4812744140625, "logps/rejected": -82.67326354980469, "loss": 0.4138, "rewards/chosen": 0.3676729202270508, "rewards/margins": 0.7153463363647461, "rewards/rejected": -0.3476734161376953, "step": 2857 }, { "epoch": 0.15148543715050486, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5503406.666666667, "logits/rejected": -42968124.8, "logps/chosen": -67.89625040690105, "logps/rejected": -301.19599609375, "loss": 0.3698, "rewards/chosen": -0.5325488249460856, "rewards/margins": 1.0222344239552816, "rewards/rejected": -1.554783248901367, "step": 2858 }, { "epoch": 0.151538441152307, "grad_norm": 52.25, "kl": 0.4889335632324219, "learning_rate": 5e-07, "logits/chosen": -23038563.2, "logits/rejected": -6948050.666666667, "logps/chosen": -433.214990234375, "logps/rejected": -299.13665771484375, "loss": 0.345, "rewards/chosen": 0.4111932277679443, "rewards/margins": 1.75280810991923, "rewards/rejected": -1.3416148821512859, "step": 2859 }, { "epoch": 0.15159144515410913, "grad_norm": 27.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29534410.666666668, "logits/rejected": -13632230.4, "logps/chosen": -35.689361572265625, "logps/rejected": -343.1141845703125, "loss": 0.3123, "rewards/chosen": -0.08937544624010722, "rewards/margins": 1.8748339196046193, "rewards/rejected": -1.9642093658447266, "step": 2860 }, { "epoch": 0.15164444915591127, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29191770.0, "logits/rejected": -3552321.25, "logps/chosen": -250.6019744873047, "logps/rejected": -351.65289306640625, "loss": 0.3017, "rewards/chosen": 0.4256454408168793, "rewards/margins": 2.1234894692897797, "rewards/rejected": -1.6978440284729004, "step": 2861 }, { "epoch": 0.1516974531577134, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18854054.0, "logits/rejected": -19619005.333333332, "logps/chosen": -260.2333984375, "logps/rejected": -247.59639485677084, "loss": 0.2513, "rewards/chosen": 0.7800319790840149, "rewards/margins": 2.2263324856758118, "rewards/rejected": -1.4463005065917969, "step": 2862 }, { "epoch": 0.15175045715951554, "grad_norm": 65.0, "kl": 1.994384765625, "learning_rate": 5e-07, "logits/chosen": -38982548.0, "logits/rejected": -45227488.0, "logps/chosen": -491.3402404785156, "logps/rejected": -379.0477600097656, "loss": 0.3092, "rewards/chosen": 0.6552238464355469, "rewards/margins": 2.4947426319122314, "rewards/rejected": -1.8395187854766846, "step": 2863 }, { "epoch": 0.15180346116131768, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27317332.0, "logits/rejected": -16342497.333333334, "logps/chosen": -218.86288452148438, "logps/rejected": -432.0260009765625, "loss": 0.2619, "rewards/chosen": 0.4814910888671875, "rewards/margins": 2.08981196085612, "rewards/rejected": -1.6083208719889324, "step": 2864 }, { "epoch": 0.15185646516311982, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3724674.0, "logits/rejected": -30492512.0, "logps/chosen": -295.688232421875, "logps/rejected": -222.76693725585938, "loss": 0.3017, "rewards/chosen": 0.20570188760757446, "rewards/margins": 2.1691091656684875, "rewards/rejected": -1.963407278060913, "step": 2865 }, { "epoch": 0.15190946916492196, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34896280.0, "logits/rejected": -40678648.0, "logps/chosen": -344.8487141927083, "logps/rejected": -164.62255859375, "loss": 0.358, "rewards/chosen": 0.37355677286783856, "rewards/margins": 2.0847250620524087, "rewards/rejected": -1.7111682891845703, "step": 2866 }, { "epoch": 0.1519624731667241, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23551098.0, "logits/rejected": -58039146.666666664, "logps/chosen": -410.8531799316406, "logps/rejected": -387.0997721354167, "loss": 0.2297, "rewards/chosen": 0.537860095500946, "rewards/margins": 2.194818039735158, "rewards/rejected": -1.6569579442342122, "step": 2867 }, { "epoch": 0.15201547716852623, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19282374.4, "logits/rejected": -58096197.333333336, "logps/chosen": -619.608740234375, "logps/rejected": -544.3326416015625, "loss": 0.3244, "rewards/chosen": 0.12569549083709716, "rewards/margins": 2.9002306699752807, "rewards/rejected": -2.7745351791381836, "step": 2868 }, { "epoch": 0.15206848117032837, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13196686.0, "logits/rejected": -32752364.0, "logps/chosen": -443.3944396972656, "logps/rejected": -378.0247802734375, "loss": 0.2348, "rewards/chosen": 0.7037268877029419, "rewards/margins": 2.6523245573043823, "rewards/rejected": -1.9485976696014404, "step": 2869 }, { "epoch": 0.1521214851721305, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 26213674.666666668, "logits/rejected": -61003801.6, "logps/chosen": -353.4527180989583, "logps/rejected": -532.010009765625, "loss": 0.2426, "rewards/chosen": 0.33599801858266193, "rewards/margins": 2.828497370084127, "rewards/rejected": -2.492499351501465, "step": 2870 }, { "epoch": 0.15217448917393264, "grad_norm": 61.25, "kl": 0.9401893615722656, "learning_rate": 5e-07, "logits/chosen": -5733704.0, "logits/rejected": -11572148.0, "logps/chosen": -297.4725864955357, "logps/rejected": -90.96638488769531, "loss": 0.4424, "rewards/chosen": 0.23677519389561244, "rewards/margins": 1.1972213728087289, "rewards/rejected": -0.9604461789131165, "step": 2871 }, { "epoch": 0.15222749317573478, "grad_norm": 49.75, "kl": 0.0206451416015625, "learning_rate": 5e-07, "logits/chosen": -47545330.28571428, "logits/rejected": -5911635.5, "logps/chosen": -184.2467041015625, "logps/rejected": -93.54641723632812, "loss": 0.4541, "rewards/chosen": 9.121320077351161e-05, "rewards/margins": 2.3073285999042645, "rewards/rejected": -2.307237386703491, "step": 2872 }, { "epoch": 0.15228049717753692, "grad_norm": 54.5, "kl": 1.9916443824768066, "learning_rate": 5e-07, "logits/chosen": -34709570.666666664, "logits/rejected": -33459926.4, "logps/chosen": -525.4892578125, "logps/rejected": -387.0299560546875, "loss": 0.2476, "rewards/chosen": 0.6220588684082031, "rewards/margins": 2.539190673828125, "rewards/rejected": -1.9171318054199218, "step": 2873 }, { "epoch": 0.15233350117933905, "grad_norm": 59.75, "kl": 0.6098480224609375, "learning_rate": 5e-07, "logits/chosen": -49786197.333333336, "logits/rejected": -3088446.25, "logps/chosen": -350.9649658203125, "logps/rejected": -87.7044677734375, "loss": 0.3903, "rewards/chosen": 0.20516701539357504, "rewards/margins": 2.0263294776280723, "rewards/rejected": -1.821162462234497, "step": 2874 }, { "epoch": 0.15238650518114116, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32161965.333333332, "logits/rejected": -7076389.5, "logps/chosen": -200.13370768229166, "logps/rejected": -236.8933868408203, "loss": 0.4117, "rewards/chosen": 0.029858211676279705, "rewards/margins": 1.6332720319430034, "rewards/rejected": -1.6034138202667236, "step": 2875 }, { "epoch": 0.1524395091829433, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44817836.0, "logits/rejected": -41243784.0, "logps/chosen": -429.3858642578125, "logps/rejected": -400.7102966308594, "loss": 0.3025, "rewards/chosen": 0.33612746000289917, "rewards/margins": 2.0404600501060486, "rewards/rejected": -1.7043325901031494, "step": 2876 }, { "epoch": 0.15249251318474544, "grad_norm": 55.25, "kl": 0.209197998046875, "learning_rate": 5e-07, "logits/chosen": -16291273.333333334, "logits/rejected": -5055178.0, "logps/chosen": -219.52669270833334, "logps/rejected": -58.73298645019531, "loss": 0.4513, "rewards/chosen": 0.0798926701148351, "rewards/margins": 0.7769015779097875, "rewards/rejected": -0.6970089077949524, "step": 2877 }, { "epoch": 0.15254551718654757, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18902032.0, "logits/rejected": -10514429.714285715, "logps/chosen": -502.6586608886719, "logps/rejected": -284.71133858816967, "loss": 0.1873, "rewards/chosen": 0.382650762796402, "rewards/margins": 2.247631187949862, "rewards/rejected": -1.8649804251534599, "step": 2878 }, { "epoch": 0.1525985211883497, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21610404.8, "logits/rejected": -10044272.666666666, "logps/chosen": -181.6251220703125, "logps/rejected": -185.22745768229166, "loss": 0.389, "rewards/chosen": 0.06755566596984863, "rewards/margins": 1.313592831293742, "rewards/rejected": -1.2460371653238933, "step": 2879 }, { "epoch": 0.15265152519015185, "grad_norm": 104.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47908901.333333336, "logits/rejected": -42097504.0, "logps/chosen": -1323.6272786458333, "logps/rejected": -304.4719970703125, "loss": 0.2944, "rewards/chosen": -0.304338018099467, "rewards/margins": 1.6637943824132282, "rewards/rejected": -1.9681324005126952, "step": 2880 }, { "epoch": 0.15270452919195399, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15263173.333333334, "logits/rejected": -22578950.4, "logps/chosen": -81.07510375976562, "logps/rejected": -278.21943359375, "loss": 0.3233, "rewards/chosen": 0.06236889958381653, "rewards/margins": 1.6162720978260041, "rewards/rejected": -1.5539031982421876, "step": 2881 }, { "epoch": 0.15275753319375612, "grad_norm": 57.5, "kl": 1.3082275390625, "learning_rate": 5e-07, "logits/chosen": -20826700.8, "logits/rejected": -34481642.666666664, "logps/chosen": -397.1990234375, "logps/rejected": -97.97357177734375, "loss": 0.3865, "rewards/chosen": 0.24466941356658936, "rewards/margins": 1.8601336240768434, "rewards/rejected": -1.615464210510254, "step": 2882 }, { "epoch": 0.15281053719555826, "grad_norm": 51.25, "kl": 0.30077457427978516, "learning_rate": 5e-07, "logits/chosen": -12029570.666666666, "logits/rejected": -10596860.0, "logps/chosen": -302.0594889322917, "logps/rejected": -256.82196044921875, "loss": 0.3814, "rewards/chosen": 0.20383155345916748, "rewards/margins": 2.2222079038619995, "rewards/rejected": -2.018376350402832, "step": 2883 }, { "epoch": 0.1528635411973604, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34547988.0, "logits/rejected": -1088599.6666666667, "logps/chosen": -211.8839111328125, "logps/rejected": -142.3134765625, "loss": 0.3871, "rewards/chosen": -0.15582428872585297, "rewards/margins": 0.6045369058847427, "rewards/rejected": -0.7603611946105957, "step": 2884 }, { "epoch": 0.15291654519916253, "grad_norm": 71.0, "kl": 0.4719867706298828, "learning_rate": 5e-07, "logits/chosen": -9741060.0, "logits/rejected": 37630598.4, "logps/chosen": -95.96045939127605, "logps/rejected": -590.1943359375, "loss": 0.3219, "rewards/chosen": 0.11341120799382527, "rewards/margins": 1.5870932300885519, "rewards/rejected": -1.4736820220947267, "step": 2885 }, { "epoch": 0.15296954920096467, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3079973.0, "logits/rejected": -28222648.0, "logps/chosen": -162.43524169921875, "logps/rejected": -262.45485432942706, "loss": 0.2237, "rewards/chosen": 0.566175103187561, "rewards/margins": 2.4458037614822388, "rewards/rejected": -1.8796286582946777, "step": 2886 }, { "epoch": 0.1530225532027668, "grad_norm": 50.0, "kl": 0.02466583251953125, "learning_rate": 5e-07, "logits/chosen": -50032810.666666664, "logits/rejected": 15940796.0, "logps/chosen": -327.3751627604167, "logps/rejected": -465.6533203125, "loss": 0.3492, "rewards/chosen": 0.22909061113993326, "rewards/margins": 2.4312636057535806, "rewards/rejected": -2.2021729946136475, "step": 2887 }, { "epoch": 0.15307555720456895, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4916300.4, "logits/rejected": 1194990.6666666667, "logps/chosen": -414.893896484375, "logps/rejected": -726.6344401041666, "loss": 0.3818, "rewards/chosen": -0.24100837707519532, "rewards/margins": 2.197526200612386, "rewards/rejected": -2.4385345776875815, "step": 2888 }, { "epoch": 0.15312856120637108, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69671040.0, "logits/rejected": -19614200.0, "logps/chosen": -598.5658569335938, "logps/rejected": -324.64349365234375, "loss": 0.2159, "rewards/chosen": 0.9075927734375, "rewards/margins": 2.591045379638672, "rewards/rejected": -1.6834526062011719, "step": 2889 }, { "epoch": 0.15318156520817322, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1387691.25, "logits/rejected": 3068333.0, "logps/chosen": -52.910194396972656, "logps/rejected": -418.6033935546875, "loss": 0.3334, "rewards/chosen": -0.31686532497406006, "rewards/margins": 2.520333170890808, "rewards/rejected": -2.837198495864868, "step": 2890 }, { "epoch": 0.15323456920997536, "grad_norm": 60.0, "kl": 2.0911922454833984, "learning_rate": 5e-07, "logits/chosen": -40204848.0, "logits/rejected": -20099762.285714287, "logps/chosen": -1557.5263671875, "logps/rejected": -414.63619559151783, "loss": 0.2443, "rewards/chosen": 2.375244140625, "rewards/margins": 3.6529948370797296, "rewards/rejected": -1.2777506964547294, "step": 2891 }, { "epoch": 0.1532875732117775, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22863236.0, "logits/rejected": 9871248.0, "logps/chosen": -145.09474182128906, "logps/rejected": -349.4265950520833, "loss": 0.2625, "rewards/chosen": 0.06058730185031891, "rewards/margins": 1.6729482561349869, "rewards/rejected": -1.612360954284668, "step": 2892 }, { "epoch": 0.15334057721357963, "grad_norm": 61.0, "kl": 0.4437713623046875, "learning_rate": 5e-07, "logits/chosen": -25167312.0, "logits/rejected": -60826528.0, "logps/chosen": -263.6895345052083, "logps/rejected": -442.041015625, "loss": 0.3998, "rewards/chosen": 0.010009249051411947, "rewards/margins": 2.7301966746648154, "rewards/rejected": -2.7201874256134033, "step": 2893 }, { "epoch": 0.15339358121538177, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33051206.4, "logits/rejected": 4504009.333333333, "logps/chosen": -176.0024169921875, "logps/rejected": -211.67232259114584, "loss": 0.4173, "rewards/chosen": -0.3082849979400635, "rewards/margins": 1.2615349292755127, "rewards/rejected": -1.5698199272155762, "step": 2894 }, { "epoch": 0.1534465852171839, "grad_norm": 54.0, "kl": 1.1969184875488281, "learning_rate": 5e-07, "logits/chosen": -57872986.666666664, "logits/rejected": -37280758.4, "logps/chosen": -402.97900390625, "logps/rejected": -406.156591796875, "loss": 0.2747, "rewards/chosen": 0.34477949142456055, "rewards/margins": 2.668164348602295, "rewards/rejected": -2.3233848571777345, "step": 2895 }, { "epoch": 0.15349958921898604, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12521046.857142856, "logits/rejected": -6554211.0, "logps/chosen": -268.10396902901783, "logps/rejected": -69.18046569824219, "loss": 0.4131, "rewards/chosen": 0.2804323605128697, "rewards/margins": 1.401509302003043, "rewards/rejected": -1.1210769414901733, "step": 2896 }, { "epoch": 0.15355259322078818, "grad_norm": 62.75, "kl": 0.6404838562011719, "learning_rate": 5e-07, "logits/chosen": -19542545.6, "logits/rejected": 36832530.666666664, "logps/chosen": -448.5056640625, "logps/rejected": -496.1734212239583, "loss": 0.3267, "rewards/chosen": 0.22787842750549317, "rewards/margins": 3.0312034447987877, "rewards/rejected": -2.8033250172932944, "step": 2897 }, { "epoch": 0.15360559722259032, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22427229.333333332, "logits/rejected": -44018384.0, "logps/chosen": -176.880615234375, "logps/rejected": -409.790087890625, "loss": 0.2759, "rewards/chosen": 0.19317648808161417, "rewards/margins": 2.060636548201243, "rewards/rejected": -1.867460060119629, "step": 2898 }, { "epoch": 0.15365860122439245, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58931908.0, "logits/rejected": -28185892.0, "logps/chosen": -491.5220947265625, "logps/rejected": -382.59527587890625, "loss": 0.363, "rewards/chosen": -0.1959129124879837, "rewards/margins": 1.5121540278196335, "rewards/rejected": -1.7080669403076172, "step": 2899 }, { "epoch": 0.15371160522619456, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59658256.0, "logits/rejected": -12032258.666666666, "logps/chosen": -581.7568359375, "logps/rejected": -373.2576497395833, "loss": 0.2017, "rewards/chosen": 0.28921812772750854, "rewards/margins": 2.7478922406832376, "rewards/rejected": -2.458674112955729, "step": 2900 }, { "epoch": 0.1537646092279967, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1337489.0, "logits/rejected": -9902700.666666666, "logps/chosen": -79.13408813476562, "logps/rejected": -314.5284830729167, "loss": 0.4519, "rewards/chosen": -0.38588101863861085, "rewards/margins": 0.9157838106155396, "rewards/rejected": -1.3016648292541504, "step": 2901 }, { "epoch": 0.15381761322979884, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9207954.666666666, "logits/rejected": -24010244.8, "logps/chosen": -97.23228963216145, "logps/rejected": -442.88955078125, "loss": 0.3377, "rewards/chosen": -0.13191655278205872, "rewards/margins": 1.2488349616527556, "rewards/rejected": -1.3807515144348144, "step": 2902 }, { "epoch": 0.15387061723160098, "grad_norm": 46.25, "kl": 0.16415023803710938, "learning_rate": 5e-07, "logits/chosen": -51638704.0, "logits/rejected": -9016515.333333334, "logps/chosen": -178.21412353515626, "logps/rejected": -121.39769490559895, "loss": 0.4061, "rewards/chosen": 0.023716819286346436, "rewards/margins": 1.1786523381868999, "rewards/rejected": -1.1549355189005535, "step": 2903 }, { "epoch": 0.1539236212334031, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42363091.2, "logits/rejected": -46826597.333333336, "logps/chosen": -511.0076171875, "logps/rejected": -418.78076171875, "loss": 0.307, "rewards/chosen": 0.3026494026184082, "rewards/margins": 2.4828754107157387, "rewards/rejected": -2.1802260080973306, "step": 2904 }, { "epoch": 0.15397662523520525, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52075689.6, "logits/rejected": -5167695.333333333, "logps/chosen": -750.68642578125, "logps/rejected": -343.0722249348958, "loss": 0.4507, "rewards/chosen": -0.03995726108551025, "rewards/margins": 0.5943970521291098, "rewards/rejected": -0.63435431321462, "step": 2905 }, { "epoch": 0.1540296292370074, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27503420.8, "logits/rejected": 37858048.0, "logps/chosen": -335.6531494140625, "logps/rejected": -355.1038818359375, "loss": 0.3315, "rewards/chosen": 0.42523531913757323, "rewards/margins": 1.7508225917816163, "rewards/rejected": -1.325587272644043, "step": 2906 }, { "epoch": 0.15408263323880952, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12379252.0, "logits/rejected": -28445507.2, "logps/chosen": -135.00770060221353, "logps/rejected": -290.4900390625, "loss": 0.3018, "rewards/chosen": 0.3766220013300578, "rewards/margins": 2.3577735821406045, "rewards/rejected": -1.9811515808105469, "step": 2907 }, { "epoch": 0.15413563724061166, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2044511.0, "logits/rejected": -16909304.0, "logps/chosen": -369.0499267578125, "logps/rejected": -278.08892822265625, "loss": 0.3175, "rewards/chosen": 0.30677226185798645, "rewards/margins": 1.981294184923172, "rewards/rejected": -1.6745219230651855, "step": 2908 }, { "epoch": 0.1541886412424138, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45158261.333333336, "logits/rejected": -3310381.5, "logps/chosen": -171.9637451171875, "logps/rejected": -44.42378234863281, "loss": 0.4654, "rewards/chosen": -0.2056144674619039, "rewards/margins": 1.0883830587069194, "rewards/rejected": -1.2939975261688232, "step": 2909 }, { "epoch": 0.15424164524421594, "grad_norm": 34.0, "kl": 0.10747909545898438, "learning_rate": 5e-07, "logits/chosen": 7198022.666666667, "logits/rejected": -13243682.4, "logps/chosen": -136.2028605143229, "logps/rejected": -258.468505859375, "loss": 0.2073, "rewards/chosen": 0.5880702336629232, "rewards/margins": 2.7238452275594076, "rewards/rejected": -2.1357749938964843, "step": 2910 }, { "epoch": 0.15429464924601807, "grad_norm": 56.5, "kl": 0.12700271606445312, "learning_rate": 5e-07, "logits/chosen": -59297380.0, "logits/rejected": -3035989.5, "logps/chosen": -437.72540283203125, "logps/rejected": -286.5899658203125, "loss": 0.3094, "rewards/chosen": 0.4237356185913086, "rewards/margins": 1.7949577569961548, "rewards/rejected": -1.3712221384048462, "step": 2911 }, { "epoch": 0.1543476532478202, "grad_norm": 45.0, "kl": 0.1851024627685547, "learning_rate": 5e-07, "logits/chosen": -26114668.0, "logits/rejected": -39386548.0, "logps/chosen": -240.12277221679688, "logps/rejected": -177.64236450195312, "loss": 0.3505, "rewards/chosen": -0.1610812246799469, "rewards/margins": 1.596137136220932, "rewards/rejected": -1.757218360900879, "step": 2912 }, { "epoch": 0.15440065724962235, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15565649.333333334, "logits/rejected": -47249097.6, "logps/chosen": -206.0280965169271, "logps/rejected": -462.2599609375, "loss": 0.3191, "rewards/chosen": 0.022497499982515972, "rewards/margins": 1.5866605172554653, "rewards/rejected": -1.5641630172729493, "step": 2913 }, { "epoch": 0.15445366125142448, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7425076.5, "logits/rejected": -17835572.0, "logps/chosen": -412.70355224609375, "logps/rejected": -241.46002197265625, "loss": 0.3259, "rewards/chosen": 0.3584485650062561, "rewards/margins": 1.6591903567314148, "rewards/rejected": -1.3007417917251587, "step": 2914 }, { "epoch": 0.15450666525322662, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61923020.0, "logits/rejected": -12576414.666666666, "logps/chosen": -381.8403625488281, "logps/rejected": -299.49021402994794, "loss": 0.2553, "rewards/chosen": -0.007241060957312584, "rewards/margins": 1.6610026651372511, "rewards/rejected": -1.6682437260945637, "step": 2915 }, { "epoch": 0.15455966925502876, "grad_norm": 47.5, "kl": 0.45096397399902344, "learning_rate": 5e-07, "logits/chosen": -24811928.0, "logits/rejected": -46768032.0, "logps/chosen": -180.08270263671875, "logps/rejected": -174.18429565429688, "loss": 0.2929, "rewards/chosen": 0.28121238946914673, "rewards/margins": 2.137555420398712, "rewards/rejected": -1.8563430309295654, "step": 2916 }, { "epoch": 0.1546126732568309, "grad_norm": 67.0, "kl": 0.760498046875, "learning_rate": 5e-07, "logits/chosen": -62458432.0, "logits/rejected": -42537240.0, "logps/chosen": -405.8550618489583, "logps/rejected": -364.0716247558594, "loss": 0.4171, "rewards/chosen": -0.061029563347498574, "rewards/margins": 2.9605811734994254, "rewards/rejected": -3.021610736846924, "step": 2917 }, { "epoch": 0.15466567725863303, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56411354.666666664, "logits/rejected": -39999328.0, "logps/chosen": -577.3646240234375, "logps/rejected": -376.418994140625, "loss": 0.3371, "rewards/chosen": -0.15898718436559042, "rewards/margins": 1.4598610440889994, "rewards/rejected": -1.6188482284545898, "step": 2918 }, { "epoch": 0.15471868126043517, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1011033.25, "logits/rejected": -40209796.571428575, "logps/chosen": -33.649505615234375, "logps/rejected": -285.27784946986606, "loss": 0.2084, "rewards/chosen": 0.20362626016139984, "rewards/margins": 1.8378222691161292, "rewards/rejected": -1.6341960089547294, "step": 2919 }, { "epoch": 0.1547716852622373, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3654532.0, "logits/rejected": -35324276.0, "logps/chosen": -434.8070882161458, "logps/rejected": -365.35797119140625, "loss": 0.338, "rewards/chosen": 0.36717530091603595, "rewards/margins": 2.9792909224828086, "rewards/rejected": -2.6121156215667725, "step": 2920 }, { "epoch": 0.15482468926403944, "grad_norm": 41.0, "kl": 0.11367416381835938, "learning_rate": 5e-07, "logits/chosen": -9863880.0, "logits/rejected": -21915085.333333332, "logps/chosen": -178.19351806640626, "logps/rejected": -257.6263427734375, "loss": 0.3398, "rewards/chosen": 0.2522265911102295, "rewards/margins": 1.904591957728068, "rewards/rejected": -1.6523653666178386, "step": 2921 }, { "epoch": 0.15487769326584158, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36889780.0, "logits/rejected": -97754720.0, "logps/chosen": -214.97409057617188, "logps/rejected": -668.6934814453125, "loss": 0.2639, "rewards/chosen": 0.07794523239135742, "rewards/margins": 3.2039377689361572, "rewards/rejected": -3.1259925365448, "step": 2922 }, { "epoch": 0.15493069726764372, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4905798.4, "logits/rejected": -27024834.666666668, "logps/chosen": -203.4621337890625, "logps/rejected": -261.8100179036458, "loss": 0.3637, "rewards/chosen": 0.23212883472442628, "rewards/margins": 1.6011011362075807, "rewards/rejected": -1.3689723014831543, "step": 2923 }, { "epoch": 0.15498370126944586, "grad_norm": 64.5, "kl": 0.7173805236816406, "learning_rate": 5e-07, "logits/chosen": -18038082.285714287, "logits/rejected": 78664960.0, "logps/chosen": -276.9375, "logps/rejected": -742.2241821289062, "loss": 0.5107, "rewards/chosen": -0.13675192424229213, "rewards/margins": 1.166135038648333, "rewards/rejected": -1.302886962890625, "step": 2924 }, { "epoch": 0.15503670527124797, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23412978.0, "logits/rejected": -27108876.0, "logps/chosen": -432.8180847167969, "logps/rejected": -190.73788452148438, "loss": 0.3279, "rewards/chosen": 0.2344266027212143, "rewards/margins": 1.673981413245201, "rewards/rejected": -1.4395548105239868, "step": 2925 }, { "epoch": 0.1550897092730501, "grad_norm": 54.25, "kl": 0.8252792358398438, "learning_rate": 5e-07, "logits/chosen": -60243520.0, "logits/rejected": -52789990.4, "logps/chosen": -413.4853515625, "logps/rejected": -176.920068359375, "loss": 0.3018, "rewards/chosen": 0.2504832148551941, "rewards/margins": 1.6153627276420592, "rewards/rejected": -1.3648795127868651, "step": 2926 }, { "epoch": 0.15514271327485224, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45154602.666666664, "logits/rejected": -18806760.0, "logps/chosen": -353.9456380208333, "logps/rejected": -272.974560546875, "loss": 0.3794, "rewards/chosen": -0.4272572994232178, "rewards/margins": 0.8640632152557373, "rewards/rejected": -1.291320514678955, "step": 2927 }, { "epoch": 0.15519571727665438, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24339786.0, "logits/rejected": -19829960.0, "logps/chosen": -190.51779174804688, "logps/rejected": -190.96737670898438, "loss": 0.3669, "rewards/chosen": -0.3445165753364563, "rewards/margins": 1.3776677250862122, "rewards/rejected": -1.7221843004226685, "step": 2928 }, { "epoch": 0.15524872127845651, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13758922.666666666, "logits/rejected": -47621123.2, "logps/chosen": -216.17106119791666, "logps/rejected": -511.7291015625, "loss": 0.2609, "rewards/chosen": 0.18652725219726562, "rewards/margins": 2.8493223190307617, "rewards/rejected": -2.662795066833496, "step": 2929 }, { "epoch": 0.15530172528025865, "grad_norm": 103.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1613053.0, "logits/rejected": -20676854.4, "logps/chosen": -118.72422281901042, "logps/rejected": -337.4349365234375, "loss": 0.2471, "rewards/chosen": 0.5041725238164266, "rewards/margins": 2.330180557568868, "rewards/rejected": -1.8260080337524414, "step": 2930 }, { "epoch": 0.1553547292820608, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1496865.5, "logits/rejected": -21663492.0, "logps/chosen": -349.9276123046875, "logps/rejected": -318.8415120442708, "loss": 0.1966, "rewards/chosen": 0.6423065066337585, "rewards/margins": 2.8328031102816262, "rewards/rejected": -2.1904966036478677, "step": 2931 }, { "epoch": 0.15540773328386293, "grad_norm": 80.0, "kl": 1.3136825561523438, "learning_rate": 5e-07, "logits/chosen": -39544656.0, "logps/chosen": -447.3876953125, "loss": 0.4796, "rewards/chosen": 0.21486932039260864, "step": 2932 }, { "epoch": 0.15546073728566506, "grad_norm": 53.0, "kl": 0.19125747680664062, "learning_rate": 5e-07, "logits/chosen": -60139973.333333336, "logits/rejected": -48714307.2, "logps/chosen": -417.5584309895833, "logps/rejected": -432.260205078125, "loss": 0.2647, "rewards/chosen": 0.29919230937957764, "rewards/margins": 2.2198670148849486, "rewards/rejected": -1.920674705505371, "step": 2933 }, { "epoch": 0.1555137412874672, "grad_norm": 76.0, "kl": 0.6349945068359375, "learning_rate": 5e-07, "logits/chosen": -64114316.8, "logits/rejected": -26538989.333333332, "logps/chosen": -688.161376953125, "logps/rejected": -250.39188639322916, "loss": 0.3475, "rewards/chosen": 0.501332426071167, "rewards/margins": 1.7178251584370932, "rewards/rejected": -1.216492732365926, "step": 2934 }, { "epoch": 0.15556674528926934, "grad_norm": 62.5, "kl": 0.9342498779296875, "learning_rate": 5e-07, "logits/chosen": -40747430.4, "logits/rejected": -10917538.666666666, "logps/chosen": -354.464306640625, "logps/rejected": -344.488525390625, "loss": 0.3685, "rewards/chosen": 0.22083861827850343, "rewards/margins": 1.7871442556381225, "rewards/rejected": -1.5663056373596191, "step": 2935 }, { "epoch": 0.15561974929107147, "grad_norm": 59.5, "kl": 1.1041698455810547, "learning_rate": 5e-07, "logits/chosen": -21179108.0, "logits/rejected": -10105082.0, "logps/chosen": -1858.7255859375, "logps/rejected": -241.2645467122396, "loss": 0.18, "rewards/chosen": 1.9888579845428467, "rewards/margins": 3.407752752304077, "rewards/rejected": -1.4188947677612305, "step": 2936 }, { "epoch": 0.1556727532928736, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31952732.8, "logits/rejected": -24614666.666666668, "logps/chosen": -216.631201171875, "logps/rejected": -252.93499755859375, "loss": 0.3561, "rewards/chosen": 0.3055055379867554, "rewards/margins": 1.5977263689041137, "rewards/rejected": -1.2922208309173584, "step": 2937 }, { "epoch": 0.15572575729467575, "grad_norm": 46.75, "kl": 0.241485595703125, "learning_rate": 5e-07, "logits/chosen": -29478896.0, "logits/rejected": -9572703.0, "logps/chosen": -242.2546183268229, "logps/rejected": -569.4490966796875, "loss": 0.3702, "rewards/chosen": 0.12749621272087097, "rewards/margins": 3.1132459342479706, "rewards/rejected": -2.9857497215270996, "step": 2938 }, { "epoch": 0.15577876129647789, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74254496.0, "logits/rejected": -25521301.333333332, "logps/chosen": -336.08087158203125, "logps/rejected": -364.466796875, "loss": 0.249, "rewards/chosen": 0.26958560943603516, "rewards/margins": 2.0416789054870605, "rewards/rejected": -1.7720932960510254, "step": 2939 }, { "epoch": 0.15583176529828002, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33985600.0, "logits/rejected": -21023272.0, "logps/chosen": -348.35400390625, "logps/rejected": -337.263427734375, "loss": 0.321, "rewards/chosen": -0.05139007791876793, "rewards/margins": 1.6054235436022282, "rewards/rejected": -1.656813621520996, "step": 2940 }, { "epoch": 0.15588476930008216, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19416301.333333332, "logits/rejected": -36053724.8, "logps/chosen": -76.36798095703125, "logps/rejected": -297.14873046875, "loss": 0.3401, "rewards/chosen": -0.32739512125651044, "rewards/margins": 1.412173016866048, "rewards/rejected": -1.7395681381225585, "step": 2941 }, { "epoch": 0.1559377733018843, "grad_norm": 55.5, "kl": 0.511383056640625, "learning_rate": 5e-07, "logits/chosen": -33064144.0, "logits/rejected": -33587664.0, "logps/chosen": -306.4666015625, "logps/rejected": -320.39227294921875, "loss": 0.3054, "rewards/chosen": 0.4193441867828369, "rewards/margins": 3.0179434617360434, "rewards/rejected": -2.5985992749532065, "step": 2942 }, { "epoch": 0.15599077730368643, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14004805.0, "logits/rejected": -15776058.0, "logps/chosen": -201.15110778808594, "logps/rejected": -291.01776123046875, "loss": 0.3144, "rewards/chosen": 0.23309722542762756, "rewards/margins": 2.214177280664444, "rewards/rejected": -1.9810800552368164, "step": 2943 }, { "epoch": 0.15604378130548857, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21704336.0, "logits/rejected": -26212037.333333332, "logps/chosen": -221.51409912109375, "logps/rejected": -422.0143229166667, "loss": 0.2373, "rewards/chosen": 0.17808571457862854, "rewards/margins": 2.5118838051954904, "rewards/rejected": -2.333798090616862, "step": 2944 }, { "epoch": 0.1560967853072907, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59259416.0, "logits/rejected": -35375696.0, "logps/chosen": -364.51959228515625, "logps/rejected": -304.9820963541667, "loss": 0.3185, "rewards/chosen": -0.08924255520105362, "rewards/margins": 1.3305769587556522, "rewards/rejected": -1.4198195139567058, "step": 2945 }, { "epoch": 0.15614978930909285, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34359924.0, "logits/rejected": -4617565.0, "logps/chosen": -209.625732421875, "logps/rejected": -400.12786865234375, "loss": 0.3093, "rewards/chosen": 0.11219597607851028, "rewards/margins": 2.1963453367352486, "rewards/rejected": -2.0841493606567383, "step": 2946 }, { "epoch": 0.15620279331089498, "grad_norm": 56.75, "kl": 0.597841739654541, "learning_rate": 5e-07, "logits/chosen": -4686441.2, "logits/rejected": -21658540.0, "logps/chosen": -227.8918701171875, "logps/rejected": -169.3327840169271, "loss": 0.4032, "rewards/chosen": 0.08463290929794312, "rewards/margins": 1.3241581479708355, "rewards/rejected": -1.2395252386728923, "step": 2947 }, { "epoch": 0.15625579731269712, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 159094.66666666666, "logits/rejected": -33972793.6, "logps/chosen": -305.1760660807292, "logps/rejected": -602.6419921875, "loss": 0.2582, "rewards/chosen": -0.2978408734003703, "rewards/margins": 2.4821964343388876, "rewards/rejected": -2.7800373077392577, "step": 2948 }, { "epoch": 0.15630880131449926, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8516668.0, "logits/rejected": -21387742.4, "logps/chosen": -311.79567464192706, "logps/rejected": -56.559661865234375, "loss": 0.4234, "rewards/chosen": -0.08555946747461955, "rewards/margins": 0.6412490328152974, "rewards/rejected": -0.726808500289917, "step": 2949 }, { "epoch": 0.1563618053163014, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7135817.333333333, "logits/rejected": -14716852.8, "logps/chosen": -367.2776285807292, "logps/rejected": -148.74818115234376, "loss": 0.2586, "rewards/chosen": 0.22667006651560465, "rewards/margins": 2.2001495281855266, "rewards/rejected": -1.9734794616699218, "step": 2950 }, { "epoch": 0.1564148093181035, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7195594.0, "logits/rejected": -14768133.333333334, "logps/chosen": -258.369140625, "logps/rejected": -208.45353190104166, "loss": 0.3029, "rewards/chosen": -0.13194532692432404, "rewards/margins": 1.245614970723788, "rewards/rejected": -1.377560297648112, "step": 2951 }, { "epoch": 0.15646781331990564, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39528616.0, "logits/rejected": -34739378.666666664, "logps/chosen": -485.6549987792969, "logps/rejected": -204.6690673828125, "loss": 0.2641, "rewards/chosen": 0.1262008547782898, "rewards/margins": 1.706531544526418, "rewards/rejected": -1.5803306897481282, "step": 2952 }, { "epoch": 0.15652081732170778, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69428392.0, "logits/rejected": -17906848.0, "logps/chosen": -378.421875, "logps/rejected": -227.20120239257812, "loss": 0.3271, "rewards/chosen": 0.014307968318462372, "rewards/margins": 1.9074582979083061, "rewards/rejected": -1.8931503295898438, "step": 2953 }, { "epoch": 0.15657382132350992, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30848584.0, "logits/rejected": 12677310.0, "logps/chosen": -444.11517333984375, "logps/rejected": -299.9405822753906, "loss": 0.348, "rewards/chosen": 0.1778222620487213, "rewards/margins": 1.7077716886997223, "rewards/rejected": -1.529949426651001, "step": 2954 }, { "epoch": 0.15662682532531205, "grad_norm": 43.5, "kl": 0.06688690185546875, "learning_rate": 5e-07, "logits/chosen": -34070272.0, "logits/rejected": -35655778.666666664, "logps/chosen": -193.20631408691406, "logps/rejected": -299.9093017578125, "loss": 0.2606, "rewards/chosen": 0.19906730949878693, "rewards/margins": 1.7695778558651607, "rewards/rejected": -1.5705105463663738, "step": 2955 }, { "epoch": 0.1566798293271142, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4431316.666666667, "logits/rejected": -18980358.4, "logps/chosen": -23.530540466308594, "logps/rejected": -276.7042236328125, "loss": 0.2991, "rewards/chosen": 0.2614600658416748, "rewards/margins": 1.8500750064849854, "rewards/rejected": -1.5886149406433105, "step": 2956 }, { "epoch": 0.15673283332891633, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59480339.2, "logits/rejected": -43834517.333333336, "logps/chosen": -374.7332763671875, "logps/rejected": -227.06400553385416, "loss": 0.3786, "rewards/chosen": -0.008250012993812561, "rewards/margins": 1.787975216905276, "rewards/rejected": -1.7962252298990886, "step": 2957 }, { "epoch": 0.15678583733071846, "grad_norm": 53.75, "kl": 0.9154872894287109, "learning_rate": 5e-07, "logits/chosen": -27448141.333333332, "logits/rejected": -8875290.0, "logps/chosen": -243.4642333984375, "logps/rejected": -396.7707824707031, "loss": 0.4111, "rewards/chosen": 0.27188026905059814, "rewards/margins": 2.4630943536758423, "rewards/rejected": -2.191214084625244, "step": 2958 }, { "epoch": 0.1568388413325206, "grad_norm": 56.0, "kl": 0.39269256591796875, "learning_rate": 5e-07, "logits/chosen": -16507013.333333334, "logits/rejected": -6786780.0, "logps/chosen": -247.11749267578125, "logps/rejected": -276.4729919433594, "loss": 0.4369, "rewards/chosen": 0.008646488810578981, "rewards/margins": 1.3290376669416826, "rewards/rejected": -1.3203911781311035, "step": 2959 }, { "epoch": 0.15689184533432274, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29704840.0, "logits/rejected": -24990512.0, "logps/chosen": -152.10209147135416, "logps/rejected": -325.2585693359375, "loss": 0.3291, "rewards/chosen": -0.17887471119562784, "rewards/margins": 1.2550310810407002, "rewards/rejected": -1.433905792236328, "step": 2960 }, { "epoch": 0.15694484933612488, "grad_norm": 58.5, "kl": 0.08580970764160156, "learning_rate": 5e-07, "logits/chosen": -8724495.0, "logits/rejected": -3505310.0, "logps/chosen": -312.0575866699219, "logps/rejected": -248.4566650390625, "loss": 0.3499, "rewards/chosen": 0.13598975539207458, "rewards/margins": 1.4431473910808563, "rewards/rejected": -1.3071576356887817, "step": 2961 }, { "epoch": 0.156997853337927, "grad_norm": 56.25, "kl": 1.124359130859375, "learning_rate": 5e-07, "logits/chosen": -20938838.666666668, "logits/rejected": -29451100.0, "logps/chosen": -265.6581624348958, "logps/rejected": -186.9355010986328, "loss": 0.3874, "rewards/chosen": 0.4013410011927287, "rewards/margins": 1.6340238253275554, "rewards/rejected": -1.2326828241348267, "step": 2962 }, { "epoch": 0.15705085733972915, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42672084.0, "logits/rejected": -1798302.5, "logps/chosen": -76.74125671386719, "logps/rejected": -82.51206970214844, "loss": 0.3703, "rewards/chosen": -0.3279518187046051, "rewards/margins": 1.3490183055400848, "rewards/rejected": -1.67697012424469, "step": 2963 }, { "epoch": 0.1571038613415313, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26712762.0, "logits/rejected": -19276720.0, "logps/chosen": -178.46954345703125, "logps/rejected": -262.0224609375, "loss": 0.3905, "rewards/chosen": -0.16138191521167755, "rewards/margins": 1.1145167797803879, "rewards/rejected": -1.2758986949920654, "step": 2964 }, { "epoch": 0.15715686534333342, "grad_norm": 45.0, "kl": 0.04062175750732422, "learning_rate": 5e-07, "logits/chosen": -19130653.333333332, "logits/rejected": -31759577.6, "logps/chosen": -43.94299825032552, "logps/rejected": -303.736279296875, "loss": 0.3064, "rewards/chosen": -0.18526687224706015, "rewards/margins": 1.6840726892153424, "rewards/rejected": -1.8693395614624024, "step": 2965 }, { "epoch": 0.15720986934513556, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14152352.0, "logits/rejected": -750821.6875, "logps/chosen": -236.68839518229166, "logps/rejected": -89.43557739257812, "loss": 0.3717, "rewards/chosen": 0.20210119088490805, "rewards/margins": 2.1062385638554892, "rewards/rejected": -1.904137372970581, "step": 2966 }, { "epoch": 0.1572628733469377, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16679308.0, "logits/rejected": -1971706.5, "logps/chosen": -332.94793701171875, "logps/rejected": -340.2728271484375, "loss": 0.3106, "rewards/chosen": 0.24889519810676575, "rewards/margins": 1.8173175156116486, "rewards/rejected": -1.5684223175048828, "step": 2967 }, { "epoch": 0.15731587734873984, "grad_norm": 61.0, "kl": 1.54937744140625, "learning_rate": 5e-07, "logits/chosen": -54204917.333333336, "logits/rejected": -17603616.0, "logps/chosen": -367.9391276041667, "logps/rejected": -253.65830078125, "loss": 0.2755, "rewards/chosen": 0.4434132973353068, "rewards/margins": 2.3061261574427285, "rewards/rejected": -1.8627128601074219, "step": 2968 }, { "epoch": 0.15736888135054197, "grad_norm": 67.0, "kl": 0.17588043212890625, "learning_rate": 5e-07, "logits/chosen": 951158.4, "logits/rejected": -22752680.0, "logps/chosen": -306.1063232421875, "logps/rejected": -203.6467081705729, "loss": 0.3607, "rewards/chosen": 0.1812812089920044, "rewards/margins": 1.77662988503774, "rewards/rejected": -1.5953486760457356, "step": 2969 }, { "epoch": 0.1574218853523441, "grad_norm": 69.5, "kl": 0.8956985473632812, "learning_rate": 5e-07, "logits/chosen": -6944000.666666667, "logits/rejected": -18756428.0, "logps/chosen": -329.2668863932292, "logps/rejected": -411.2747802734375, "loss": 0.3455, "rewards/chosen": 0.48934078216552734, "rewards/margins": 2.3880605697631836, "rewards/rejected": -1.8987197875976562, "step": 2970 }, { "epoch": 0.15747488935414625, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39337366.4, "logits/rejected": -37630437.333333336, "logps/chosen": -259.2170166015625, "logps/rejected": -477.8934733072917, "loss": 0.342, "rewards/chosen": 0.2913181304931641, "rewards/margins": 1.8159146626790363, "rewards/rejected": -1.5245965321858723, "step": 2971 }, { "epoch": 0.15752789335594838, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27082268.0, "logits/rejected": -17197122.666666668, "logps/chosen": -211.69198608398438, "logps/rejected": -377.922607421875, "loss": 0.216, "rewards/chosen": 0.01591205596923828, "rewards/margins": 2.2590607007344565, "rewards/rejected": -2.2431486447652182, "step": 2972 }, { "epoch": 0.15758089735775052, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 38385442.666666664, "logits/rejected": -7652921.6, "logps/chosen": -430.6976725260417, "logps/rejected": -268.488232421875, "loss": 0.3622, "rewards/chosen": -0.04450251658757528, "rewards/margins": 1.0033225337664287, "rewards/rejected": -1.0478250503540039, "step": 2973 }, { "epoch": 0.15763390135955266, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23778408.0, "logits/rejected": -6714468.5, "logps/chosen": -244.62369791666666, "logps/rejected": -336.1027526855469, "loss": 0.2973, "rewards/chosen": 0.5632921059926351, "rewards/margins": 2.934638579686483, "rewards/rejected": -2.3713464736938477, "step": 2974 }, { "epoch": 0.1576869053613548, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14545653.0, "logits/rejected": -36011748.0, "logps/chosen": -150.66941833496094, "logps/rejected": -406.824951171875, "loss": 0.2691, "rewards/chosen": 0.5257084965705872, "rewards/margins": 2.3467671275138855, "rewards/rejected": -1.8210586309432983, "step": 2975 }, { "epoch": 0.1577399093631569, "grad_norm": 59.25, "kl": 0.1529388427734375, "learning_rate": 5e-07, "logits/chosen": -32787507.2, "logits/rejected": -29984629.333333332, "logps/chosen": -272.767041015625, "logps/rejected": -257.7881673177083, "loss": 0.4724, "rewards/chosen": -0.21393661499023436, "rewards/margins": 0.5379722436269125, "rewards/rejected": -0.7519088586171468, "step": 2976 }, { "epoch": 0.15779291336495904, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48598104.0, "logits/rejected": -64436572.0, "logps/chosen": -248.09530639648438, "logps/rejected": -404.86993408203125, "loss": 0.3391, "rewards/chosen": 0.08390884101390839, "rewards/margins": 1.5739034861326218, "rewards/rejected": -1.4899946451187134, "step": 2977 }, { "epoch": 0.15784591736676118, "grad_norm": 50.25, "kl": 2.160646438598633, "learning_rate": 5e-07, "logits/chosen": -41453228.0, "logits/rejected": -55580100.0, "logps/chosen": -626.6637573242188, "logps/rejected": -368.5779724121094, "loss": 0.2678, "rewards/chosen": 0.5763554573059082, "rewards/margins": 3.380916118621826, "rewards/rejected": -2.804560661315918, "step": 2978 }, { "epoch": 0.15789892136856332, "grad_norm": 52.75, "kl": 0.718719482421875, "learning_rate": 5e-07, "logits/chosen": -16685000.0, "logits/rejected": -7214746.0, "logps/chosen": -276.6096923828125, "logps/rejected": -240.7652587890625, "loss": 0.355, "rewards/chosen": 0.6451748847961426, "rewards/margins": 1.5063756148020426, "rewards/rejected": -0.8612007300059, "step": 2979 }, { "epoch": 0.15795192537036545, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14835154.666666666, "logits/rejected": -42926905.6, "logps/chosen": -255.88338216145834, "logps/rejected": -376.442578125, "loss": 0.2587, "rewards/chosen": 0.413711945215861, "rewards/margins": 2.227178208033244, "rewards/rejected": -1.8134662628173828, "step": 2980 }, { "epoch": 0.1580049293721676, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33726310.4, "logits/rejected": -2215142.1666666665, "logps/chosen": -236.4459228515625, "logps/rejected": -123.88309733072917, "loss": 0.3683, "rewards/chosen": 0.12921713590621947, "rewards/margins": 1.6390596906344097, "rewards/rejected": -1.5098425547281902, "step": 2981 }, { "epoch": 0.15805793337396973, "grad_norm": 52.5, "kl": 0.3923683166503906, "learning_rate": 5e-07, "logits/chosen": -56179910.4, "logits/rejected": 22255469.333333332, "logps/chosen": -227.2453369140625, "logps/rejected": -208.08736165364584, "loss": 0.4073, "rewards/chosen": 0.24318175315856932, "rewards/margins": 1.0472614447275796, "rewards/rejected": -0.8040796915690104, "step": 2982 }, { "epoch": 0.15811093737577187, "grad_norm": 54.75, "kl": 0.2890663146972656, "learning_rate": 5e-07, "logits/chosen": -56430496.0, "logits/rejected": -26345716.8, "logps/chosen": -393.4863688151042, "logps/rejected": -212.39853515625, "loss": 0.2896, "rewards/chosen": 0.32239989439646405, "rewards/margins": 1.7630041042963664, "rewards/rejected": -1.4406042098999023, "step": 2983 }, { "epoch": 0.158163941377574, "grad_norm": 58.5, "kl": 0.5311813354492188, "learning_rate": 5e-07, "logits/chosen": -24273544.0, "logits/rejected": -11656736.0, "logps/chosen": -271.2975769042969, "logps/rejected": -190.5138397216797, "loss": 0.3302, "rewards/chosen": 0.5551565289497375, "rewards/margins": 1.5126685500144958, "rewards/rejected": -0.9575120210647583, "step": 2984 }, { "epoch": 0.15821694537937614, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32084188.0, "logits/rejected": -29696552.0, "logps/chosen": -508.5038757324219, "logps/rejected": -533.844482421875, "loss": 0.3159, "rewards/chosen": 0.17040729522705078, "rewards/margins": 1.934080719947815, "rewards/rejected": -1.7636734247207642, "step": 2985 }, { "epoch": 0.15826994938117828, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65596128.0, "logits/rejected": -48994412.0, "logps/chosen": -264.74652099609375, "logps/rejected": -381.6714172363281, "loss": 0.2693, "rewards/chosen": 0.3753287196159363, "rewards/margins": 2.6879329085350037, "rewards/rejected": -2.3126041889190674, "step": 2986 }, { "epoch": 0.1583229533829804, "grad_norm": 51.75, "kl": 0.3096923828125, "learning_rate": 5e-07, "logits/chosen": -33849688.0, "logits/rejected": -55158512.0, "logps/chosen": -231.6861114501953, "logps/rejected": -216.0392608642578, "loss": 0.3611, "rewards/chosen": 0.03702640160918236, "rewards/margins": 1.359328981488943, "rewards/rejected": -1.3223025798797607, "step": 2987 }, { "epoch": 0.15837595738478255, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20029980.0, "logits/rejected": -17618243.2, "logps/chosen": -216.6662801106771, "logps/rejected": -346.913623046875, "loss": 0.3148, "rewards/chosen": -0.1292289694150289, "rewards/margins": 1.7321468393007915, "rewards/rejected": -1.8613758087158203, "step": 2988 }, { "epoch": 0.1584289613865847, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22280400.0, "logits/rejected": -21480713.6, "logps/chosen": -188.205810546875, "logps/rejected": -317.55009765625, "loss": 0.2792, "rewards/chosen": 0.11525122324625652, "rewards/margins": 1.8290590922037762, "rewards/rejected": -1.7138078689575196, "step": 2989 }, { "epoch": 0.15848196538838683, "grad_norm": 72.0, "kl": 1.9776535034179688, "learning_rate": 5e-07, "logits/chosen": 2172489.0, "logits/rejected": -60113656.0, "logps/chosen": -409.708251953125, "logps/rejected": -444.39404296875, "loss": 0.3563, "rewards/chosen": 0.5516406297683716, "rewards/margins": 2.687503457069397, "rewards/rejected": -2.1358628273010254, "step": 2990 }, { "epoch": 0.15853496939018896, "grad_norm": 51.25, "kl": 0.2489337921142578, "learning_rate": 5e-07, "logits/chosen": -10117623.333333334, "logits/rejected": 642841.875, "logps/chosen": -269.3363037109375, "logps/rejected": -108.60713958740234, "loss": 0.387, "rewards/chosen": 0.416525920232137, "rewards/margins": 1.1802721222241719, "rewards/rejected": -0.7637462019920349, "step": 2991 }, { "epoch": 0.1585879733919911, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1582240.4, "logits/rejected": -96818560.0, "logps/chosen": -408.3532470703125, "logps/rejected": -444.9312744140625, "loss": 0.3575, "rewards/chosen": 0.19212547540664673, "rewards/margins": 1.7659714976946514, "rewards/rejected": -1.5738460222880046, "step": 2992 }, { "epoch": 0.15864097739379324, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1212120.25, "logits/rejected": -8219761.142857143, "logps/chosen": -28.053966522216797, "logps/rejected": -256.75784737723217, "loss": 0.249, "rewards/chosen": -0.43370458483695984, "rewards/margins": 1.091687947511673, "rewards/rejected": -1.5253925323486328, "step": 2993 }, { "epoch": 0.15869398139559537, "grad_norm": 56.0, "kl": 0.45800113677978516, "learning_rate": 5e-07, "logits/chosen": -49801944.0, "logits/rejected": -23802038.0, "logps/chosen": -429.5001220703125, "logps/rejected": -94.5167236328125, "loss": 0.3512, "rewards/chosen": 0.3160311281681061, "rewards/margins": 1.2940811216831207, "rewards/rejected": -0.9780499935150146, "step": 2994 }, { "epoch": 0.1587469853973975, "grad_norm": 61.75, "kl": 0.36400413513183594, "learning_rate": 5e-07, "logits/chosen": -21089300.8, "logits/rejected": -15013806.666666666, "logps/chosen": -380.8814697265625, "logps/rejected": -215.30619303385416, "loss": 0.3578, "rewards/chosen": 0.3067315578460693, "rewards/margins": 1.8146397431691486, "rewards/rejected": -1.5079081853230794, "step": 2995 }, { "epoch": 0.15879998939919965, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23309742.0, "logits/rejected": -19825774.0, "logps/chosen": -350.44903564453125, "logps/rejected": -419.4928283691406, "loss": 0.2262, "rewards/chosen": 0.6928123831748962, "rewards/margins": 2.957777440547943, "rewards/rejected": -2.264965057373047, "step": 2996 }, { "epoch": 0.15885299340100179, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25772202.666666668, "logits/rejected": -25330156.8, "logps/chosen": -322.197509765625, "logps/rejected": -329.936669921875, "loss": 0.3254, "rewards/chosen": -0.6919330755869547, "rewards/margins": 1.420329268773397, "rewards/rejected": -2.1122623443603517, "step": 2997 }, { "epoch": 0.15890599740280392, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -188565616.0, "logits/rejected": -35107968.0, "logps/chosen": -427.32379150390625, "logps/rejected": -412.25439453125, "loss": 0.2289, "rewards/chosen": 0.4207901358604431, "rewards/margins": 2.0723995566368103, "rewards/rejected": -1.6516094207763672, "step": 2998 }, { "epoch": 0.15895900140460606, "grad_norm": 57.0, "kl": 0.6411323547363281, "learning_rate": 5e-07, "logits/chosen": -54807936.0, "logits/rejected": 250803.75, "logps/chosen": -297.0755615234375, "logps/rejected": -134.32923889160156, "loss": 0.4487, "rewards/chosen": 0.017908096313476562, "rewards/margins": 0.5932270288467407, "rewards/rejected": -0.5753189325332642, "step": 2999 }, { "epoch": 0.1590120054064082, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46152009.6, "logits/rejected": -26385341.333333332, "logps/chosen": -213.263525390625, "logps/rejected": -236.82466634114584, "loss": 0.3884, "rewards/chosen": -0.07908798456192016, "rewards/margins": 1.5567753195762635, "rewards/rejected": -1.6358633041381836, "step": 3000 }, { "epoch": 0.1590650094082103, "grad_norm": 63.0, "kl": 0.7471160888671875, "learning_rate": 5e-07, "logits/chosen": -39352061.333333336, "logits/rejected": 1611088.0, "logps/chosen": -320.4691162109375, "logps/rejected": -169.2975311279297, "loss": 0.3957, "rewards/chosen": 0.3775485356648763, "rewards/margins": 1.3892688353856404, "rewards/rejected": -1.0117202997207642, "step": 3001 }, { "epoch": 0.15911801341001244, "grad_norm": 46.5, "kl": 0.5671539306640625, "learning_rate": 5e-07, "logits/chosen": -34890996.0, "logits/rejected": 12895184.0, "logps/chosen": -388.7770690917969, "logps/rejected": -237.30191040039062, "loss": 0.2937, "rewards/chosen": 0.43122655153274536, "rewards/margins": 2.149500548839569, "rewards/rejected": -1.7182739973068237, "step": 3002 }, { "epoch": 0.15917101741181458, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63509992.0, "logits/rejected": -15483019.0, "logps/chosen": -263.56658935546875, "logps/rejected": -301.943359375, "loss": 0.3643, "rewards/chosen": -0.1918092668056488, "rewards/margins": 1.4772873222827911, "rewards/rejected": -1.66909658908844, "step": 3003 }, { "epoch": 0.15922402141361672, "grad_norm": 57.0, "kl": 0.3694572448730469, "learning_rate": 5e-07, "logits/chosen": -52717993.14285714, "logits/rejected": -8582191.0, "logps/chosen": -263.1932373046875, "logps/rejected": -150.90255737304688, "loss": 0.4401, "rewards/chosen": 0.2220125538962228, "rewards/margins": 1.0692995531218392, "rewards/rejected": -0.8472869992256165, "step": 3004 }, { "epoch": 0.15927702541541885, "grad_norm": 57.0, "kl": 0.709259033203125, "learning_rate": 5e-07, "logits/chosen": -19671329.6, "logits/rejected": 20884882.666666668, "logps/chosen": -501.949658203125, "logps/rejected": -282.39329020182294, "loss": 0.2675, "rewards/chosen": 0.6670207500457763, "rewards/margins": 2.752357753117879, "rewards/rejected": -2.085337003072103, "step": 3005 }, { "epoch": 0.159330029417221, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3065271.75, "logits/rejected": -25002742.85714286, "logps/chosen": -281.4970703125, "logps/rejected": -329.9566127232143, "loss": 0.1799, "rewards/chosen": 0.331063836812973, "rewards/margins": 2.321863468204226, "rewards/rejected": -1.9907996313912528, "step": 3006 }, { "epoch": 0.15938303341902313, "grad_norm": 58.75, "kl": 0.18767166137695312, "learning_rate": 5e-07, "logits/chosen": -67473696.0, "logits/rejected": -26101056.0, "logps/chosen": -523.9942626953125, "logps/rejected": -83.20137023925781, "loss": 0.333, "rewards/chosen": 0.35994186997413635, "rewards/margins": 1.5139502584934235, "rewards/rejected": -1.154008388519287, "step": 3007 }, { "epoch": 0.15943603742082527, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38476204.8, "logits/rejected": -54232272.0, "logps/chosen": -295.11865234375, "logps/rejected": -479.7882893880208, "loss": 0.3499, "rewards/chosen": 0.25786728858947755, "rewards/margins": 1.9132704257965087, "rewards/rejected": -1.6554031372070312, "step": 3008 }, { "epoch": 0.1594890414226274, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14685782.0, "logits/rejected": -39870664.0, "logps/chosen": -178.7101287841797, "logps/rejected": -383.58770751953125, "loss": 0.3379, "rewards/chosen": -0.17027321457862854, "rewards/margins": 1.741964191198349, "rewards/rejected": -1.9122374057769775, "step": 3009 }, { "epoch": 0.15954204542442954, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -122055704.0, "logits/rejected": 75255456.0, "logps/chosen": -435.1113586425781, "logps/rejected": -256.7510070800781, "loss": 0.333, "rewards/chosen": 0.16050948202610016, "rewards/margins": 1.7260109335184097, "rewards/rejected": -1.5655014514923096, "step": 3010 }, { "epoch": 0.15959504942623168, "grad_norm": 58.5, "kl": 0.1335124969482422, "learning_rate": 5e-07, "logits/chosen": -39703184.0, "logits/rejected": -4106064.25, "logps/chosen": -331.45758056640625, "logps/rejected": -175.52328491210938, "loss": 0.4106, "rewards/chosen": 0.026605707903703053, "rewards/margins": 1.796293002863725, "rewards/rejected": -1.769687294960022, "step": 3011 }, { "epoch": 0.15964805342803381, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9656626.0, "logits/rejected": -29962302.0, "logps/chosen": -126.9034652709961, "logps/rejected": -175.24676513671875, "loss": 0.2795, "rewards/chosen": 0.46111011505126953, "rewards/margins": 2.3870978355407715, "rewards/rejected": -1.925987720489502, "step": 3012 }, { "epoch": 0.15970105742983595, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28830832.0, "logits/rejected": -13776242.666666666, "logps/chosen": -193.1976806640625, "logps/rejected": -186.3868611653646, "loss": 0.3824, "rewards/chosen": 0.06118217706680298, "rewards/margins": 1.5490077296892804, "rewards/rejected": -1.4878255526224773, "step": 3013 }, { "epoch": 0.1597540614316381, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32670352.0, "logits/rejected": -50077432.0, "logps/chosen": -442.0980529785156, "logps/rejected": -619.2367553710938, "loss": 0.3417, "rewards/chosen": 0.2731019854545593, "rewards/margins": 1.670714557170868, "rewards/rejected": -1.3976125717163086, "step": 3014 }, { "epoch": 0.15980706543344023, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96883392.0, "logits/rejected": -24082902.4, "logps/chosen": -148.94999186197916, "logps/rejected": -331.8140625, "loss": 0.2793, "rewards/chosen": -0.21783868471781412, "rewards/margins": 2.033921798070272, "rewards/rejected": -2.251760482788086, "step": 3015 }, { "epoch": 0.15986006943524236, "grad_norm": 50.25, "kl": 0.3913917541503906, "learning_rate": 5e-07, "logits/chosen": -28540930.666666668, "logits/rejected": -49367832.0, "logps/chosen": -252.59676106770834, "logps/rejected": -381.8912353515625, "loss": 0.3607, "rewards/chosen": 0.32811001936594647, "rewards/margins": 2.171551744143168, "rewards/rejected": -1.8434417247772217, "step": 3016 }, { "epoch": 0.1599130734370445, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65248473.6, "logits/rejected": 3837168.0, "logps/chosen": -319.754296875, "logps/rejected": -648.5345865885416, "loss": 0.3326, "rewards/chosen": 0.08008573055267335, "rewards/margins": 2.5766515493392945, "rewards/rejected": -2.496565818786621, "step": 3017 }, { "epoch": 0.15996607743884664, "grad_norm": 65.0, "kl": 0.8271102905273438, "learning_rate": 5e-07, "logits/chosen": -67186026.66666667, "logits/rejected": 3872357.0, "logps/chosen": -331.1844482421875, "logps/rejected": -68.8490219116211, "loss": 0.4198, "rewards/chosen": 0.025521280864874523, "rewards/margins": 2.056723120311896, "rewards/rejected": -2.0312018394470215, "step": 3018 }, { "epoch": 0.16001908144064877, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45570105.6, "logits/rejected": -49472880.0, "logps/chosen": -336.434912109375, "logps/rejected": -81.17222595214844, "loss": 0.4601, "rewards/chosen": -0.19155296087265014, "rewards/margins": 0.6256492098172506, "rewards/rejected": -0.8172021706899008, "step": 3019 }, { "epoch": 0.1600720854424509, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36087576.0, "logits/rejected": -31571238.0, "logps/chosen": -570.2109375, "logps/rejected": -227.24302673339844, "loss": 0.3995, "rewards/chosen": 0.030799098312854767, "rewards/margins": 1.1148937419056892, "rewards/rejected": -1.0840946435928345, "step": 3020 }, { "epoch": 0.16012508944425305, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21235982.0, "logits/rejected": -25322330.0, "logps/chosen": -325.8787841796875, "logps/rejected": -264.6639099121094, "loss": 0.2716, "rewards/chosen": 0.7265968918800354, "rewards/margins": 2.1782827973365784, "rewards/rejected": -1.451685905456543, "step": 3021 }, { "epoch": 0.1601780934460552, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6715964.0, "logits/rejected": -23745684.0, "logps/chosen": -659.69873046875, "logps/rejected": -406.0458984375, "loss": 0.2699, "rewards/chosen": 0.8807335495948792, "rewards/margins": 2.442836105823517, "rewards/rejected": -1.5621025562286377, "step": 3022 }, { "epoch": 0.16023109744785732, "grad_norm": 57.25, "kl": 0.053325653076171875, "learning_rate": 5e-07, "logits/chosen": -7568322.0, "logits/rejected": 66135052.0, "logps/chosen": -209.4277140299479, "logps/rejected": -313.1034240722656, "loss": 0.442, "rewards/chosen": -0.10256512959798177, "rewards/margins": 1.6745672623316448, "rewards/rejected": -1.7771323919296265, "step": 3023 }, { "epoch": 0.16028410144965946, "grad_norm": 54.75, "kl": 0.1299762725830078, "learning_rate": 5e-07, "logits/chosen": -41696396.0, "logits/rejected": -2897117.75, "logps/chosen": -415.8111877441406, "logps/rejected": -135.72926330566406, "loss": 0.3433, "rewards/chosen": 0.010081395506858826, "rewards/margins": 1.7072589248418808, "rewards/rejected": -1.697177529335022, "step": 3024 }, { "epoch": 0.1603371054514616, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25861304.0, "logits/rejected": -28528220.8, "logps/chosen": -260.10207112630206, "logps/rejected": -418.738916015625, "loss": 0.1844, "rewards/chosen": 0.9694061279296875, "rewards/margins": 3.1801939010620117, "rewards/rejected": -2.210787773132324, "step": 3025 }, { "epoch": 0.16039010945326374, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55438992.0, "logits/rejected": -10346883.0, "logps/chosen": -522.0626831054688, "logps/rejected": -96.5529556274414, "loss": 0.375, "rewards/chosen": 0.04399318993091583, "rewards/margins": 1.1904594451189041, "rewards/rejected": -1.1464662551879883, "step": 3026 }, { "epoch": 0.16044311345506584, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26994734.0, "logits/rejected": -38235104.0, "logps/chosen": -323.2525939941406, "logps/rejected": -290.83628336588544, "loss": 0.2567, "rewards/chosen": -0.26454752683639526, "rewards/margins": 1.6675321062405903, "rewards/rejected": -1.9320796330769856, "step": 3027 }, { "epoch": 0.16049611745686798, "grad_norm": 71.5, "kl": 0.13703346252441406, "learning_rate": 5e-07, "logits/chosen": -56393456.0, "logits/rejected": -2800674.0, "logps/chosen": -610.750244140625, "logps/rejected": -182.44833374023438, "loss": 0.3723, "rewards/chosen": -0.043502792716026306, "rewards/margins": 1.2943899780511856, "rewards/rejected": -1.337892770767212, "step": 3028 }, { "epoch": 0.16054912145867012, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3680376.0, "logits/rejected": -9095612.8, "logps/chosen": -372.652587890625, "logps/rejected": -374.4765380859375, "loss": 0.2789, "rewards/chosen": 0.2683003942171733, "rewards/margins": 1.8475639859835307, "rewards/rejected": -1.5792635917663573, "step": 3029 }, { "epoch": 0.16060212546047226, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34969152.0, "logits/rejected": -21673945.6, "logps/chosen": -179.4641316731771, "logps/rejected": -223.6646484375, "loss": 0.2903, "rewards/chosen": 0.05553722381591797, "rewards/margins": 1.8336568832397462, "rewards/rejected": -1.7781196594238282, "step": 3030 }, { "epoch": 0.1606551294622744, "grad_norm": 76.5, "kl": 0.43782806396484375, "learning_rate": 5e-07, "logits/chosen": -34850848.0, "logits/rejected": -53161112.0, "logps/chosen": -423.2652994791667, "logps/rejected": -463.57733154296875, "loss": 0.3381, "rewards/chosen": 0.4276687701543172, "rewards/margins": 2.640762289365133, "rewards/rejected": -2.2130935192108154, "step": 3031 }, { "epoch": 0.16070813346407653, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16251315.2, "logits/rejected": -25762181.333333332, "logps/chosen": -242.217626953125, "logps/rejected": -532.3186848958334, "loss": 0.3798, "rewards/chosen": 0.11717270612716675, "rewards/margins": 1.8565912127494812, "rewards/rejected": -1.7394185066223145, "step": 3032 }, { "epoch": 0.16076113746587867, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32391536.0, "logits/rejected": -7201575.2, "logps/chosen": -487.611572265625, "logps/rejected": -485.25166015625, "loss": 0.2876, "rewards/chosen": -0.057377129793167114, "rewards/margins": 2.0314736545085905, "rewards/rejected": -2.0888507843017576, "step": 3033 }, { "epoch": 0.1608141414676808, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28522116.0, "logits/rejected": -6057033.5, "logps/chosen": -379.9462890625, "logps/rejected": -107.72430419921875, "loss": 0.3829, "rewards/chosen": 0.3989059329032898, "rewards/margins": 0.9781083464622498, "rewards/rejected": -0.57920241355896, "step": 3034 }, { "epoch": 0.16086714546948294, "grad_norm": 47.25, "kl": 1.3762779235839844, "learning_rate": 5e-07, "logits/chosen": -83432832.0, "logits/rejected": -36144032.0, "logps/chosen": -227.318994140625, "logps/rejected": -278.4729817708333, "loss": 0.4363, "rewards/chosen": -0.1617767333984375, "rewards/margins": 1.7716706275939942, "rewards/rejected": -1.9334473609924316, "step": 3035 }, { "epoch": 0.16092014947128508, "grad_norm": 40.5, "kl": 0.10009765625, "learning_rate": 5e-07, "logits/chosen": -42506508.0, "logits/rejected": -8719385.0, "logps/chosen": -216.85218811035156, "logps/rejected": -309.4487609863281, "loss": 0.3707, "rewards/chosen": 0.09199920296669006, "rewards/margins": 1.3436370193958282, "rewards/rejected": -1.2516378164291382, "step": 3036 }, { "epoch": 0.16097315347308722, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28779746.666666668, "logits/rejected": 15023585.0, "logps/chosen": -209.68255615234375, "logps/rejected": -172.94386291503906, "loss": 0.4978, "rewards/chosen": -0.13413073619206747, "rewards/margins": 0.2818187276522318, "rewards/rejected": -0.4159494638442993, "step": 3037 }, { "epoch": 0.16102615747488935, "grad_norm": 55.25, "kl": 0.58245849609375, "learning_rate": 5e-07, "logits/chosen": -31551920.0, "logits/rejected": -27481488.0, "logps/chosen": -297.76967366536456, "logps/rejected": -372.1422424316406, "loss": 0.5043, "rewards/chosen": -0.3677284320195516, "rewards/margins": 1.7502780357996623, "rewards/rejected": -2.118006467819214, "step": 3038 }, { "epoch": 0.1610791614766915, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9282448.0, "logits/rejected": -33457568.0, "logps/chosen": -215.04305013020834, "logps/rejected": -172.7368896484375, "loss": 0.3133, "rewards/chosen": 0.1240885357062022, "rewards/margins": 1.581789646546046, "rewards/rejected": -1.4577011108398437, "step": 3039 }, { "epoch": 0.16113216547849363, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41175104.0, "logits/rejected": -3153509.3333333335, "logps/chosen": -373.9062255859375, "logps/rejected": -502.6153971354167, "loss": 0.3993, "rewards/chosen": -0.03510406017303467, "rewards/margins": 2.4575838009516398, "rewards/rejected": -2.4926878611246743, "step": 3040 }, { "epoch": 0.16118516948029576, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54444906.666666664, "logits/rejected": -16207070.4, "logps/chosen": -937.2884928385416, "logps/rejected": -325.285400390625, "loss": 0.2599, "rewards/chosen": 0.44798584779103595, "rewards/margins": 2.1397485812505086, "rewards/rejected": -1.6917627334594727, "step": 3041 }, { "epoch": 0.1612381734820979, "grad_norm": 62.75, "kl": 1.3731536865234375, "learning_rate": 5e-07, "logits/chosen": -16905680.0, "logits/rejected": -42590032.0, "logps/chosen": -524.262255859375, "logps/rejected": -477.5695393880208, "loss": 0.3585, "rewards/chosen": 0.26314125061035154, "rewards/margins": 2.953260358174642, "rewards/rejected": -2.6901191075642905, "step": 3042 }, { "epoch": 0.16129117748390004, "grad_norm": 50.5, "kl": 0.7223701477050781, "learning_rate": 5e-07, "logits/chosen": -38024569.6, "logits/rejected": -10738672.0, "logps/chosen": -292.6223388671875, "logps/rejected": -225.68257649739584, "loss": 0.3643, "rewards/chosen": 0.263018274307251, "rewards/margins": 1.7795416355133056, "rewards/rejected": -1.5165233612060547, "step": 3043 }, { "epoch": 0.16134418148570218, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53369019.428571425, "logits/rejected": -51830120.0, "logps/chosen": -234.45704868861608, "logps/rejected": -531.2843017578125, "loss": 0.4196, "rewards/chosen": 0.11252113751002721, "rewards/margins": 3.7053403513772145, "rewards/rejected": -3.5928192138671875, "step": 3044 }, { "epoch": 0.1613971854875043, "grad_norm": 67.5, "kl": 0.08139324188232422, "learning_rate": 5e-07, "logits/chosen": -48103192.0, "logits/rejected": -13876874.0, "logps/chosen": -521.6688232421875, "logps/rejected": -243.15521240234375, "loss": 0.3463, "rewards/chosen": 0.21536701917648315, "rewards/margins": 2.0344647765159607, "rewards/rejected": -1.8190977573394775, "step": 3045 }, { "epoch": 0.16145018948930645, "grad_norm": 55.25, "kl": 0.4672279357910156, "learning_rate": 5e-07, "logits/chosen": -60745226.666666664, "logits/rejected": -70159776.0, "logps/chosen": -597.1680501302084, "logps/rejected": -297.2728271484375, "loss": 0.2634, "rewards/chosen": 0.6684885819753011, "rewards/margins": 2.5087595780690513, "rewards/rejected": -1.84027099609375, "step": 3046 }, { "epoch": 0.1615031934911086, "grad_norm": 55.25, "kl": 0.6313552856445312, "learning_rate": 5e-07, "logits/chosen": -40726496.0, "logits/rejected": -280879.5, "logps/chosen": -563.8970947265625, "logps/rejected": -254.61862182617188, "loss": 0.3137, "rewards/chosen": 0.6593089699745178, "rewards/margins": 1.858819305896759, "rewards/rejected": -1.1995103359222412, "step": 3047 }, { "epoch": 0.16155619749291072, "grad_norm": 65.0, "kl": 0.5147790908813477, "learning_rate": 5e-07, "logits/chosen": -36002502.4, "logits/rejected": -37965581.333333336, "logps/chosen": -360.48896484375, "logps/rejected": -520.19189453125, "loss": 0.4073, "rewards/chosen": -0.35409011840820315, "rewards/margins": 2.7617199579874674, "rewards/rejected": -3.1158100763956704, "step": 3048 }, { "epoch": 0.16160920149471286, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16167266.666666666, "logits/rejected": -15143392.0, "logps/chosen": -233.52823893229166, "logps/rejected": -275.058740234375, "loss": 0.2792, "rewards/chosen": 0.13043753306070963, "rewards/margins": 1.988050142923991, "rewards/rejected": -1.8576126098632812, "step": 3049 }, { "epoch": 0.161662205496515, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3202027.8, "logits/rejected": 75873013.33333333, "logps/chosen": -86.043310546875, "logps/rejected": -130.24252319335938, "loss": 0.3625, "rewards/chosen": 0.10908111333847045, "rewards/margins": 1.7229971925417582, "rewards/rejected": -1.6139160792032878, "step": 3050 }, { "epoch": 0.16171520949831714, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39515637.333333336, "logits/rejected": -22828302.4, "logps/chosen": -177.67854817708334, "logps/rejected": -236.862744140625, "loss": 0.3113, "rewards/chosen": -0.36292115847269696, "rewards/margins": 1.5383647759755452, "rewards/rejected": -1.9012859344482422, "step": 3051 }, { "epoch": 0.16176821350011925, "grad_norm": 40.0, "kl": 0.06113433837890625, "learning_rate": 5e-07, "logits/chosen": -3989783.6666666665, "logits/rejected": -20688104.0, "logps/chosen": -147.33277384440103, "logps/rejected": -798.755224609375, "loss": 0.2526, "rewards/chosen": 0.2678585847218831, "rewards/margins": 3.6667446931203207, "rewards/rejected": -3.3988861083984374, "step": 3052 }, { "epoch": 0.16182121750192138, "grad_norm": 27.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -38369784.0, "logps/rejected": -501.3186950683594, "loss": 0.1313, "rewards/rejected": -2.3492860794067383, "step": 3053 }, { "epoch": 0.16187422150372352, "grad_norm": 63.0, "kl": 0.7783670425415039, "learning_rate": 5e-07, "logits/chosen": -14531792.0, "logits/rejected": -15319938.666666666, "logps/chosen": -382.470654296875, "logps/rejected": -179.56803385416666, "loss": 0.3335, "rewards/chosen": 0.6903430938720703, "rewards/margins": 1.606150499979655, "rewards/rejected": -0.9158074061075846, "step": 3054 }, { "epoch": 0.16192722550552566, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19687376.0, "logits/rejected": -24774058.666666668, "logps/chosen": -311.0394775390625, "logps/rejected": -380.7591145833333, "loss": 0.3406, "rewards/chosen": 0.30789790153503416, "rewards/margins": 1.7988184452056886, "rewards/rejected": -1.4909205436706543, "step": 3055 }, { "epoch": 0.1619802295073278, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2889197.25, "logits/rejected": -8407992.0, "logps/chosen": -424.83758544921875, "logps/rejected": -352.04364013671875, "loss": 0.3213, "rewards/chosen": 0.2622610330581665, "rewards/margins": 1.8541998863220215, "rewards/rejected": -1.591938853263855, "step": 3056 }, { "epoch": 0.16203323350912993, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22694410.666666668, "logits/rejected": -13355554.4, "logps/chosen": -221.2725626627604, "logps/rejected": -170.14512939453124, "loss": 0.3414, "rewards/chosen": 0.20782812436421713, "rewards/margins": 1.2980863412221273, "rewards/rejected": -1.0902582168579102, "step": 3057 }, { "epoch": 0.16208623751093207, "grad_norm": 50.25, "kl": 1.6377992630004883, "learning_rate": 5e-07, "logits/chosen": -29166700.8, "logits/rejected": -48964912.0, "logps/chosen": -285.031396484375, "logps/rejected": -463.9745279947917, "loss": 0.3398, "rewards/chosen": 0.6815608978271485, "rewards/margins": 2.288746738433838, "rewards/rejected": -1.6071858406066895, "step": 3058 }, { "epoch": 0.1621392415127342, "grad_norm": 78.5, "kl": 0.5562839508056641, "learning_rate": 5e-07, "logits/chosen": -8385442.4, "logits/rejected": 1363008.0, "logps/chosen": -709.86435546875, "logps/rejected": -322.10675048828125, "loss": 0.3123, "rewards/chosen": 0.5552459716796875, "rewards/margins": 2.0659719467163087, "rewards/rejected": -1.510725975036621, "step": 3059 }, { "epoch": 0.16219224551453634, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22119260.8, "logits/rejected": -54288746.666666664, "logps/chosen": -330.046728515625, "logps/rejected": -788.92138671875, "loss": 0.3882, "rewards/chosen": 0.06136624217033386, "rewards/margins": 1.9066797912120819, "rewards/rejected": -1.845313549041748, "step": 3060 }, { "epoch": 0.16224524951633848, "grad_norm": 54.0, "kl": 0.49172210693359375, "learning_rate": 5e-07, "logits/chosen": -5801324.0, "logits/rejected": -16628090.666666666, "logps/chosen": -656.9892578125, "logps/rejected": -258.1341959635417, "loss": 0.2211, "rewards/chosen": 0.47335660457611084, "rewards/margins": 2.26601763566335, "rewards/rejected": -1.7926610310872395, "step": 3061 }, { "epoch": 0.16229825351814062, "grad_norm": 58.75, "kl": 0.7972793579101562, "learning_rate": 5e-07, "logits/chosen": -6833135.2, "logits/rejected": -10507507.333333334, "logps/chosen": -266.892041015625, "logps/rejected": -586.4518636067709, "loss": 0.4269, "rewards/chosen": -0.18806557655334472, "rewards/margins": 1.5730687300364177, "rewards/rejected": -1.7611343065897624, "step": 3062 }, { "epoch": 0.16235125751994275, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16252668.0, "logits/rejected": -47576408.0, "logps/chosen": -177.78704833984375, "logps/rejected": -310.8626708984375, "loss": 0.3807, "rewards/chosen": 0.2222724755605062, "rewards/margins": 1.8138793309529622, "rewards/rejected": -1.591606855392456, "step": 3063 }, { "epoch": 0.1624042615217449, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32682438.0, "logits/rejected": -69250296.0, "logps/chosen": -231.95697021484375, "logps/rejected": -351.1561584472656, "loss": 0.3775, "rewards/chosen": -0.08113746345043182, "rewards/margins": 1.3221373111009598, "rewards/rejected": -1.4032747745513916, "step": 3064 }, { "epoch": 0.16245726552354703, "grad_norm": 45.0, "kl": 0.43091583251953125, "learning_rate": 5e-07, "logits/chosen": -65011772.0, "logits/rejected": -25152124.0, "logps/chosen": -376.03802490234375, "logps/rejected": -217.2881317138672, "loss": 0.2986, "rewards/chosen": 0.115496926009655, "rewards/margins": 2.2854834124445915, "rewards/rejected": -2.1699864864349365, "step": 3065 }, { "epoch": 0.16251026952534917, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4135476.6666666665, "logits/rejected": -19612787.2, "logps/chosen": -85.46714274088542, "logps/rejected": -470.23251953125, "loss": 0.2718, "rewards/chosen": 0.0890862246354421, "rewards/margins": 2.0835613985856374, "rewards/rejected": -1.9944751739501954, "step": 3066 }, { "epoch": 0.1625632735271513, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1412953.625, "logits/rejected": -13834407.0, "logps/chosen": -236.37924194335938, "logps/rejected": -163.87496948242188, "loss": 0.3247, "rewards/chosen": 0.18291980028152466, "rewards/margins": 1.6560705304145813, "rewards/rejected": -1.4731507301330566, "step": 3067 }, { "epoch": 0.16261627752895344, "grad_norm": 64.5, "kl": 3.2809314727783203, "learning_rate": 5e-07, "logits/chosen": -32280176.0, "logits/rejected": -61374592.0, "logps/chosen": -471.5594889322917, "logps/rejected": -363.0179748535156, "loss": 0.3943, "rewards/chosen": 0.46953141689300537, "rewards/margins": 2.7961641550064087, "rewards/rejected": -2.3266327381134033, "step": 3068 }, { "epoch": 0.16266928153075558, "grad_norm": 71.5, "kl": 1.0107784271240234, "learning_rate": 5e-07, "logits/chosen": -42763080.0, "logits/rejected": -1169972.25, "logps/chosen": -414.9234313964844, "logps/rejected": -413.8253173828125, "loss": 0.3106, "rewards/chosen": 0.46182823181152344, "rewards/margins": 2.0537891387939453, "rewards/rejected": -1.5919609069824219, "step": 3069 }, { "epoch": 0.16272228553255771, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22024588.0, "logits/rejected": -9866669.0, "logps/chosen": -165.06886291503906, "logps/rejected": -92.68113708496094, "loss": 0.4169, "rewards/chosen": -0.1851566731929779, "rewards/margins": 0.7348610460758209, "rewards/rejected": -0.9200177192687988, "step": 3070 }, { "epoch": 0.16277528953435985, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -116230600.0, "logits/rejected": -10567497.142857144, "logps/chosen": -1015.70263671875, "logps/rejected": -300.42020089285717, "loss": 0.2965, "rewards/chosen": 0.4192565977573395, "rewards/margins": 1.8982489236763544, "rewards/rejected": -1.478992325919015, "step": 3071 }, { "epoch": 0.162828293536162, "grad_norm": 66.5, "kl": 2.264216423034668, "learning_rate": 5e-07, "logits/chosen": -20716641.6, "logits/rejected": -51608442.666666664, "logps/chosen": -486.45576171875, "logps/rejected": -505.630615234375, "loss": 0.3707, "rewards/chosen": 0.49393110275268554, "rewards/margins": 1.9981679598490398, "rewards/rejected": -1.5042368570963542, "step": 3072 }, { "epoch": 0.16288129753796413, "grad_norm": 47.75, "kl": 0.09216785430908203, "learning_rate": 5e-07, "logits/chosen": -16049846.0, "logits/rejected": -25605168.0, "logps/chosen": -404.10650634765625, "logps/rejected": -413.260009765625, "loss": 0.3514, "rewards/chosen": 0.0025583431124687195, "rewards/margins": 1.8694707080721855, "rewards/rejected": -1.8669123649597168, "step": 3073 }, { "epoch": 0.16293430153976626, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9888558.4, "logits/rejected": -36153045.333333336, "logps/chosen": -192.97938232421876, "logps/rejected": -174.329833984375, "loss": 0.3885, "rewards/chosen": 0.05058155059814453, "rewards/margins": 1.2952705383300782, "rewards/rejected": -1.2446889877319336, "step": 3074 }, { "epoch": 0.1629873055415684, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -112238144.0, "logits/rejected": -8748539.2, "logps/chosen": -378.9365641276042, "logps/rejected": -189.103662109375, "loss": 0.2363, "rewards/chosen": 0.448139230410258, "rewards/margins": 2.303726808230082, "rewards/rejected": -1.8555875778198243, "step": 3075 }, { "epoch": 0.16304030954337054, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36365868.0, "logits/rejected": -41201168.0, "logps/chosen": -501.55401611328125, "logps/rejected": -195.32533264160156, "loss": 0.2895, "rewards/chosen": 0.4724838435649872, "rewards/margins": 2.0239801108837128, "rewards/rejected": -1.5514962673187256, "step": 3076 }, { "epoch": 0.16309331354517265, "grad_norm": 58.25, "kl": 0.038112640380859375, "learning_rate": 5e-07, "logits/chosen": -23644096.0, "logits/rejected": -18025094.666666668, "logps/chosen": -206.1924285888672, "logps/rejected": -265.4112141927083, "loss": 0.3466, "rewards/chosen": -0.023263931274414062, "rewards/margins": 0.9590429464975992, "rewards/rejected": -0.9823068777720133, "step": 3077 }, { "epoch": 0.16314631754697478, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72596307.2, "logits/rejected": -76482618.66666667, "logps/chosen": -331.47861328125, "logps/rejected": -415.2008056640625, "loss": 0.3012, "rewards/chosen": 0.4237858772277832, "rewards/margins": 2.5064033190409343, "rewards/rejected": -2.082617441813151, "step": 3078 }, { "epoch": 0.16319932154877692, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28277357.333333332, "logits/rejected": 4462339.0, "logps/chosen": -298.15016682942706, "logps/rejected": -13.136418342590332, "loss": 0.5255, "rewards/chosen": -0.18685479958852133, "rewards/margins": -0.1026115914185842, "rewards/rejected": -0.08424320816993713, "step": 3079 }, { "epoch": 0.16325232555057906, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8631246.0, "logits/rejected": -19932603.2, "logps/chosen": -90.5576883951823, "logps/rejected": -383.0167236328125, "loss": 0.2964, "rewards/chosen": -0.52654496828715, "rewards/margins": 1.83329447110494, "rewards/rejected": -2.35983943939209, "step": 3080 }, { "epoch": 0.1633053295523812, "grad_norm": 61.0, "kl": 0.04438018798828125, "learning_rate": 5e-07, "logits/chosen": -55477525.333333336, "logits/rejected": -35287672.0, "logps/chosen": -438.3164876302083, "logps/rejected": -134.76815795898438, "loss": 0.4323, "rewards/chosen": 0.07186877727508545, "rewards/margins": 1.0700980424880981, "rewards/rejected": -0.9982292652130127, "step": 3081 }, { "epoch": 0.16335833355418333, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6985628.0, "logits/rejected": -46081228.8, "logps/chosen": -1028.8817545572917, "logps/rejected": -382.3248779296875, "loss": 0.1864, "rewards/chosen": 0.9561269283294678, "rewards/margins": 3.140803098678589, "rewards/rejected": -2.184676170349121, "step": 3082 }, { "epoch": 0.16341133755598547, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32119958.0, "logits/rejected": -35295440.0, "logps/chosen": -262.5450134277344, "logps/rejected": -394.8113098144531, "loss": 0.3376, "rewards/chosen": -0.04252453148365021, "rewards/margins": 1.7216333597898483, "rewards/rejected": -1.7641578912734985, "step": 3083 }, { "epoch": 0.1634643415577876, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19660772.0, "logits/rejected": -18961610.666666668, "logps/chosen": -546.1990966796875, "logps/rejected": -401.777587890625, "loss": 0.2429, "rewards/chosen": 0.1780693084001541, "rewards/margins": 1.9362806032101314, "rewards/rejected": -1.7582112948099773, "step": 3084 }, { "epoch": 0.16351734555958974, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4025643.25, "logits/rejected": 4348053.0, "logps/chosen": -390.8397521972656, "logps/rejected": -360.3423767089844, "loss": 0.3783, "rewards/chosen": -0.06889975816011429, "rewards/margins": 1.1827528402209282, "rewards/rejected": -1.2516525983810425, "step": 3085 }, { "epoch": 0.16357034956139188, "grad_norm": 50.25, "kl": 0.0726470947265625, "learning_rate": 5e-07, "logits/chosen": -19931773.333333332, "logits/rejected": -18705650.0, "logps/chosen": -254.38936360677084, "logps/rejected": -648.9035034179688, "loss": 0.3765, "rewards/chosen": 0.23036664724349976, "rewards/margins": 2.512783944606781, "rewards/rejected": -2.2824172973632812, "step": 3086 }, { "epoch": 0.16362335356319402, "grad_norm": 44.5, "kl": 0.20377731323242188, "learning_rate": 5e-07, "logits/chosen": -55552709.333333336, "logits/rejected": -28819952.0, "logps/chosen": -111.9940185546875, "logps/rejected": -504.55517578125, "loss": 0.4872, "rewards/chosen": -0.4109210968017578, "rewards/margins": 1.3773739337921143, "rewards/rejected": -1.788295030593872, "step": 3087 }, { "epoch": 0.16367635756499616, "grad_norm": 59.75, "kl": 0.7637882232666016, "learning_rate": 5e-07, "logits/chosen": -18160385.333333332, "logits/rejected": -45874056.0, "logps/chosen": -353.307373046875, "logps/rejected": -477.38848876953125, "loss": 0.3765, "rewards/chosen": 0.2685752312342326, "rewards/margins": 2.2296607891718545, "rewards/rejected": -1.961085557937622, "step": 3088 }, { "epoch": 0.1637293615667983, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28693269.333333332, "logits/rejected": -7867559.2, "logps/chosen": -303.2113037109375, "logps/rejected": -292.08720703125, "loss": 0.3527, "rewards/chosen": -0.1516071359316508, "rewards/margins": 1.4824167211850483, "rewards/rejected": -1.6340238571166992, "step": 3089 }, { "epoch": 0.16378236556860043, "grad_norm": 62.25, "kl": 1.6110000610351562, "learning_rate": 5e-07, "logits/chosen": -38103648.0, "logits/rejected": -29392464.0, "logps/chosen": -635.5267578125, "logps/rejected": -324.8638102213542, "loss": 0.3803, "rewards/chosen": 0.42546920776367186, "rewards/margins": 1.5934130032857259, "rewards/rejected": -1.167943795522054, "step": 3090 }, { "epoch": 0.16383536957040257, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10177358.0, "logits/rejected": -21227044.8, "logps/chosen": -101.68931070963542, "logps/rejected": -226.868115234375, "loss": 0.3402, "rewards/chosen": -0.1553413470586141, "rewards/margins": 1.4147560040156046, "rewards/rejected": -1.5700973510742187, "step": 3091 }, { "epoch": 0.1638883735722047, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7340016.0, "logits/rejected": -13160244.0, "logps/chosen": -36.554443359375, "logps/rejected": -226.3634033203125, "loss": 0.4019, "rewards/chosen": -0.0700421929359436, "rewards/margins": 1.2725397149721782, "rewards/rejected": -1.3425819079081218, "step": 3092 }, { "epoch": 0.16394137757400684, "grad_norm": 66.5, "kl": 1.1367530822753906, "learning_rate": 5e-07, "logits/chosen": -58849971.2, "logits/rejected": -63641456.0, "logps/chosen": -584.41650390625, "logps/rejected": -560.6255289713541, "loss": 0.3737, "rewards/chosen": 0.07947969436645508, "rewards/margins": 1.81760835647583, "rewards/rejected": -1.738128662109375, "step": 3093 }, { "epoch": 0.16399438157580898, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1841202.5, "logits/rejected": -24394530.666666668, "logps/chosen": -320.51055908203125, "logps/rejected": -298.0397135416667, "loss": 0.2062, "rewards/chosen": 0.5917431116104126, "rewards/margins": 2.705421566963196, "rewards/rejected": -2.113678455352783, "step": 3094 }, { "epoch": 0.16404738557761112, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8646420.0, "logits/rejected": -7131560.666666667, "logps/chosen": -143.2617919921875, "logps/rejected": -157.48125203450522, "loss": 0.4167, "rewards/chosen": 0.06769691705703736, "rewards/margins": 1.180219837029775, "rewards/rejected": -1.1125229199727376, "step": 3095 }, { "epoch": 0.16410038957941325, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14102146.666666666, "logits/rejected": -38807816.0, "logps/chosen": -272.85064697265625, "logps/rejected": -384.8594970703125, "loss": 0.475, "rewards/chosen": -0.48958492279052734, "rewards/margins": 1.835069179534912, "rewards/rejected": -2.3246541023254395, "step": 3096 }, { "epoch": 0.1641533935812154, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5010425.0, "logits/rejected": -47707584.0, "logps/chosen": -137.9228973388672, "logps/rejected": -316.3278401692708, "loss": 0.2659, "rewards/chosen": 0.5174859762191772, "rewards/margins": 1.898037314414978, "rewards/rejected": -1.3805513381958008, "step": 3097 }, { "epoch": 0.16420639758301753, "grad_norm": 63.0, "kl": 0.38982391357421875, "learning_rate": 5e-07, "logits/chosen": 15366246.0, "logits/rejected": -32632128.0, "logps/chosen": -294.784912109375, "logps/rejected": -309.6927490234375, "loss": 0.3233, "rewards/chosen": 0.11836233735084534, "rewards/margins": 2.15842667222023, "rewards/rejected": -2.0400643348693848, "step": 3098 }, { "epoch": 0.16425940158481966, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13838052.0, "logits/rejected": -22763746.0, "logps/chosen": -141.57681274414062, "logps/rejected": -198.12013244628906, "loss": 0.3851, "rewards/chosen": -0.10891599953174591, "rewards/margins": 1.240996167063713, "rewards/rejected": -1.349912166595459, "step": 3099 }, { "epoch": 0.1643124055866218, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -100379744.0, "logits/rejected": -24234339.2, "logps/chosen": -724.7306315104166, "logps/rejected": -382.95068359375, "loss": 0.3226, "rewards/chosen": -0.040263871351877846, "rewards/margins": 1.5964932163556416, "rewards/rejected": -1.6367570877075195, "step": 3100 }, { "epoch": 0.16436540958842394, "grad_norm": 50.5, "kl": 1.2299041748046875, "learning_rate": 5e-07, "logits/chosen": -29054376.0, "logits/rejected": -37120576.0, "logps/chosen": -368.0357259114583, "logps/rejected": -326.62490234375, "loss": 0.3073, "rewards/chosen": 0.4591168959935506, "rewards/margins": 2.056952436765035, "rewards/rejected": -1.5978355407714844, "step": 3101 }, { "epoch": 0.16441841359022608, "grad_norm": 61.75, "kl": 0.5306491851806641, "learning_rate": 5e-07, "logits/chosen": -6721537.333333333, "logits/rejected": 2680848.25, "logps/chosen": -285.46681722005206, "logps/rejected": -103.78840637207031, "loss": 0.3784, "rewards/chosen": 0.5316807428995768, "rewards/margins": 1.3153015772501626, "rewards/rejected": -0.7836208343505859, "step": 3102 }, { "epoch": 0.16447141759202819, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25591720.0, "logits/rejected": -36131436.0, "logps/chosen": -466.8290100097656, "logps/rejected": -220.617431640625, "loss": 0.2973, "rewards/chosen": 0.18696938455104828, "rewards/margins": 2.0958850234746933, "rewards/rejected": -1.908915638923645, "step": 3103 }, { "epoch": 0.16452442159383032, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1362993.0, "logits/rejected": -46816928.0, "logps/chosen": -268.9665934244792, "logps/rejected": -394.428564453125, "loss": 0.1909, "rewards/chosen": 0.6427310307820638, "rewards/margins": 3.0597827275594076, "rewards/rejected": -2.4170516967773437, "step": 3104 }, { "epoch": 0.16457742559563246, "grad_norm": 41.25, "kl": 0.25592803955078125, "learning_rate": 5e-07, "logits/chosen": -24151236.0, "logits/rejected": -22475300.0, "logps/chosen": -210.359619140625, "logps/rejected": -682.430908203125, "loss": 0.2328, "rewards/chosen": 0.49240341782569885, "rewards/margins": 3.9911378920078278, "rewards/rejected": -3.498734474182129, "step": 3105 }, { "epoch": 0.1646304295974346, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23908016.0, "logits/rejected": -62082896.0, "logps/chosen": -193.8170369466146, "logps/rejected": -614.0143432617188, "loss": 0.3632, "rewards/chosen": 0.14363300800323486, "rewards/margins": 2.9730931520462036, "rewards/rejected": -2.8294601440429688, "step": 3106 }, { "epoch": 0.16468343359923673, "grad_norm": 49.0, "kl": 1.4131355285644531, "learning_rate": 5e-07, "logits/chosen": -9844832.0, "logits/rejected": -14251318.4, "logps/chosen": -241.5372111002604, "logps/rejected": -307.6281982421875, "loss": 0.2911, "rewards/chosen": 0.6381696859995524, "rewards/margins": 2.2911617437998455, "rewards/rejected": -1.652992057800293, "step": 3107 }, { "epoch": 0.16473643760103887, "grad_norm": 93.0, "kl": 1.2436752319335938, "learning_rate": 5e-07, "logits/chosen": -113636.66666666667, "logits/rejected": -9826011.0, "logps/chosen": -715.4469401041666, "logps/rejected": -167.32473754882812, "loss": 0.4155, "rewards/chosen": 0.40381304423014325, "rewards/margins": 1.1390982071558635, "rewards/rejected": -0.7352851629257202, "step": 3108 }, { "epoch": 0.164789441602841, "grad_norm": 60.25, "kl": 0.016147613525390625, "learning_rate": 5e-07, "logits/chosen": -64377478.4, "logits/rejected": -41350789.333333336, "logps/chosen": -510.646435546875, "logps/rejected": -379.1805826822917, "loss": 0.299, "rewards/chosen": 0.5173834323883056, "rewards/margins": 2.666104809443156, "rewards/rejected": -2.14872137705485, "step": 3109 }, { "epoch": 0.16484244560464315, "grad_norm": 38.5, "kl": 0.06114006042480469, "learning_rate": 5e-07, "logits/chosen": -24572260.0, "logits/rejected": -25283208.0, "logps/chosen": -210.0196533203125, "logps/rejected": -384.5587565104167, "loss": 0.2665, "rewards/chosen": -0.44722241163253784, "rewards/margins": 1.5891691644986472, "rewards/rejected": -2.036391576131185, "step": 3110 }, { "epoch": 0.16489544960644528, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18774826.666666668, "logits/rejected": -6270724.0, "logps/chosen": -209.23726399739584, "logps/rejected": -217.306591796875, "loss": 0.3453, "rewards/chosen": 0.2285273273785909, "rewards/margins": 1.3432166775067647, "rewards/rejected": -1.1146893501281738, "step": 3111 }, { "epoch": 0.16494845360824742, "grad_norm": 31.0, "kl": 0.047806739807128906, "learning_rate": 5e-07, "logits/chosen": 9295465.333333334, "logits/rejected": -26536115.2, "logps/chosen": -139.89237467447916, "logps/rejected": -322.868212890625, "loss": 0.2054, "rewards/chosen": 0.37569669882456463, "rewards/margins": 2.8149091164271036, "rewards/rejected": -2.439212417602539, "step": 3112 }, { "epoch": 0.16500145761004956, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16001801.6, "logits/rejected": -6018018.0, "logps/chosen": -135.735546875, "logps/rejected": -146.93707275390625, "loss": 0.3826, "rewards/chosen": 0.21522512435913085, "rewards/margins": 1.3110198020935058, "rewards/rejected": -1.095794677734375, "step": 3113 }, { "epoch": 0.1650544616118517, "grad_norm": 62.5, "kl": 1.564809799194336, "learning_rate": 5e-07, "logits/chosen": -8962819.0, "logits/rejected": 9442569.714285715, "logps/chosen": -1970.7535400390625, "logps/rejected": -398.25589425223217, "loss": 0.1695, "rewards/chosen": 1.957788109779358, "rewards/margins": 3.6784724337714056, "rewards/rejected": -1.720684323992048, "step": 3114 }, { "epoch": 0.16510746561365383, "grad_norm": 52.5, "kl": 0.05317497253417969, "learning_rate": 5e-07, "logits/chosen": -17722172.8, "logits/rejected": -78423498.66666667, "logps/chosen": -254.506103515625, "logps/rejected": -450.6278483072917, "loss": 0.3183, "rewards/chosen": 0.33246612548828125, "rewards/margins": 2.336438020070394, "rewards/rejected": -2.003971894582113, "step": 3115 }, { "epoch": 0.16516046961545597, "grad_norm": 51.25, "kl": 0.23769664764404297, "learning_rate": 5e-07, "logits/chosen": -72174730.66666667, "logits/rejected": -13443984.0, "logps/chosen": -414.9409993489583, "logps/rejected": -329.673388671875, "loss": 0.2638, "rewards/chosen": 0.4239894946416219, "rewards/margins": 2.1878936847050987, "rewards/rejected": -1.7639041900634767, "step": 3116 }, { "epoch": 0.1652134736172581, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39436266.666666664, "logits/rejected": -8688588.8, "logps/chosen": -200.7354736328125, "logps/rejected": -233.743115234375, "loss": 0.3882, "rewards/chosen": -0.10587400197982788, "rewards/margins": 0.7220218300819397, "rewards/rejected": -0.8278958320617675, "step": 3117 }, { "epoch": 0.16526647761906024, "grad_norm": 67.0, "kl": 0.17551422119140625, "learning_rate": 5e-07, "logits/chosen": -31469888.0, "logits/rejected": -1833112.0, "logps/chosen": -406.2636962890625, "logps/rejected": -169.89286295572916, "loss": 0.425, "rewards/chosen": 0.10028934478759766, "rewards/margins": 0.896508534749349, "rewards/rejected": -0.7962191899617513, "step": 3118 }, { "epoch": 0.16531948162086238, "grad_norm": 53.25, "kl": 1.0659446716308594, "learning_rate": 5e-07, "logits/chosen": -38219141.333333336, "logits/rejected": -7777304.0, "logps/chosen": -346.2727864583333, "logps/rejected": -300.450439453125, "loss": 0.3073, "rewards/chosen": 0.4051350752512614, "rewards/margins": 1.851004711786906, "rewards/rejected": -1.4458696365356445, "step": 3119 }, { "epoch": 0.16537248562266452, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22016914.666666668, "logits/rejected": -4789714.8, "logps/chosen": -185.95550537109375, "logps/rejected": -99.66529541015625, "loss": 0.331, "rewards/chosen": 0.27943408489227295, "rewards/margins": 1.3637202978134155, "rewards/rejected": -1.0842862129211426, "step": 3120 }, { "epoch": 0.16542548962446665, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33752816.0, "logits/rejected": -29099701.333333332, "logps/chosen": -987.3828735351562, "logps/rejected": -333.85146077473956, "loss": 0.219, "rewards/chosen": 0.23517760634422302, "rewards/margins": 2.116058597962062, "rewards/rejected": -1.8808809916178386, "step": 3121 }, { "epoch": 0.1654784936262688, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32094582.4, "logits/rejected": 20248098.666666668, "logps/chosen": -110.0836181640625, "logps/rejected": -551.496826171875, "loss": 0.4176, "rewards/chosen": -0.21512603759765625, "rewards/margins": 1.384290059407552, "rewards/rejected": -1.5994160970052083, "step": 3122 }, { "epoch": 0.16553149762807093, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21211732.0, "logits/rejected": -13129571.2, "logps/chosen": -108.9949951171875, "logps/rejected": -324.41376953125, "loss": 0.2868, "rewards/chosen": -0.2723369598388672, "rewards/margins": 1.8183204650878908, "rewards/rejected": -2.090657424926758, "step": 3123 }, { "epoch": 0.16558450162987307, "grad_norm": 67.0, "kl": 0.9979362487792969, "learning_rate": 5e-07, "logits/chosen": 15776194.0, "logits/rejected": -3708510.0, "logps/chosen": -485.2191162109375, "logps/rejected": -318.54412841796875, "loss": 0.3531, "rewards/chosen": 0.036704450845718384, "rewards/margins": 1.7796556055545807, "rewards/rejected": -1.7429511547088623, "step": 3124 }, { "epoch": 0.1656375056316752, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47696832.0, "logits/rejected": -14776709.0, "logps/chosen": -216.43798828125, "logps/rejected": -70.2627944946289, "loss": 0.4129, "rewards/chosen": -0.03484554092089335, "rewards/margins": 1.9270416994889576, "rewards/rejected": -1.961887240409851, "step": 3125 }, { "epoch": 0.16569050963347734, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3003296.5, "logits/rejected": -32165053.333333332, "logps/chosen": -40.340824127197266, "logps/rejected": -376.266357421875, "loss": 0.2565, "rewards/chosen": 0.08542472124099731, "rewards/margins": 1.9416980544726055, "rewards/rejected": -1.8562733332316081, "step": 3126 }, { "epoch": 0.16574351363527948, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17615584.0, "logits/rejected": -33405978.0, "logps/chosen": -145.92813110351562, "logps/rejected": -345.8258361816406, "loss": 0.3312, "rewards/chosen": 0.2912721037864685, "rewards/margins": 1.8664564490318298, "rewards/rejected": -1.5751843452453613, "step": 3127 }, { "epoch": 0.1657965176370816, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51560780.8, "logits/rejected": -47344160.0, "logps/chosen": -186.18797607421874, "logps/rejected": -304.96225992838544, "loss": 0.4075, "rewards/chosen": 0.08848022818565368, "rewards/margins": 1.018414968252182, "rewards/rejected": -0.9299347400665283, "step": 3128 }, { "epoch": 0.16584952163888372, "grad_norm": 65.5, "kl": 0.01892852783203125, "learning_rate": 5e-07, "logits/chosen": -6877702.666666667, "logits/rejected": -35454416.0, "logps/chosen": -297.8568929036458, "logps/rejected": -492.5708923339844, "loss": 0.298, "rewards/chosen": 0.6765373547871908, "rewards/margins": 2.358823378880819, "rewards/rejected": -1.682286024093628, "step": 3129 }, { "epoch": 0.16590252564068586, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66015704.0, "logits/rejected": -27885458.0, "logps/chosen": -219.95521545410156, "logps/rejected": -462.7885437011719, "loss": 0.2936, "rewards/chosen": 0.25272437930107117, "rewards/margins": 2.838591307401657, "rewards/rejected": -2.585866928100586, "step": 3130 }, { "epoch": 0.165955529642488, "grad_norm": 63.0, "kl": 1.4102630615234375, "learning_rate": 5e-07, "logits/chosen": -22876594.666666668, "logits/rejected": -1364230.5, "logps/chosen": -260.512939453125, "logps/rejected": -193.53758239746094, "loss": 0.4489, "rewards/chosen": 0.10540592670440674, "rewards/margins": 1.3003346920013428, "rewards/rejected": -1.194928765296936, "step": 3131 }, { "epoch": 0.16600853364429013, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6640166.666666667, "logits/rejected": -14031998.4, "logps/chosen": -181.59456380208334, "logps/rejected": -227.06318359375, "loss": 0.2655, "rewards/chosen": 0.8213239510854086, "rewards/margins": 2.132798941930135, "rewards/rejected": -1.3114749908447265, "step": 3132 }, { "epoch": 0.16606153764609227, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10144954.666666666, "logits/rejected": -29327936.0, "logps/chosen": -252.9794921875, "logps/rejected": -462.5662109375, "loss": 0.2482, "rewards/chosen": 0.3804017702738444, "rewards/margins": 2.348674742380778, "rewards/rejected": -1.9682729721069336, "step": 3133 }, { "epoch": 0.1661145416478944, "grad_norm": 56.25, "kl": 0.24626922607421875, "learning_rate": 5e-07, "logits/chosen": -48866160.0, "logits/rejected": -21611509.333333332, "logps/chosen": -441.62509765625, "logps/rejected": -582.8041585286459, "loss": 0.3171, "rewards/chosen": 0.3993180990219116, "rewards/margins": 2.5232550700505576, "rewards/rejected": -2.123936971028646, "step": 3134 }, { "epoch": 0.16616754564969655, "grad_norm": 55.0, "kl": 0.20700454711914062, "learning_rate": 5e-07, "logits/chosen": -23017955.2, "logits/rejected": 4371376.666666667, "logps/chosen": -304.8349365234375, "logps/rejected": -185.4052734375, "loss": 0.3976, "rewards/chosen": 0.002026677131652832, "rewards/margins": 1.4635411500930786, "rewards/rejected": -1.4615144729614258, "step": 3135 }, { "epoch": 0.16622054965149868, "grad_norm": 53.25, "kl": 0.19145584106445312, "learning_rate": 5e-07, "logits/chosen": -81938888.0, "logits/rejected": -18728136.0, "logps/chosen": -306.02471923828125, "logps/rejected": -532.6387329101562, "loss": 0.3585, "rewards/chosen": 0.019120600074529648, "rewards/margins": 2.5533920787274837, "rewards/rejected": -2.534271478652954, "step": 3136 }, { "epoch": 0.16627355365330082, "grad_norm": 87.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 83165798.4, "logits/rejected": -32202226.666666668, "logps/chosen": -1039.8724609375, "logps/rejected": -349.7389729817708, "loss": 0.3605, "rewards/chosen": 0.31882076263427733, "rewards/margins": 1.6357686996459961, "rewards/rejected": -1.3169479370117188, "step": 3137 }, { "epoch": 0.16632655765510296, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18353988.0, "logits/rejected": -19871912.0, "logps/chosen": -483.2682189941406, "logps/rejected": -202.36721801757812, "loss": 0.3183, "rewards/chosen": 0.6834468841552734, "rewards/margins": 1.6963382959365845, "rewards/rejected": -1.012891411781311, "step": 3138 }, { "epoch": 0.1663795616569051, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32796840.0, "logits/rejected": -11564157.6, "logps/chosen": -190.84468587239584, "logps/rejected": -340.3873779296875, "loss": 0.377, "rewards/chosen": -0.4755118687947591, "rewards/margins": 0.8924045244852703, "rewards/rejected": -1.3679163932800293, "step": 3139 }, { "epoch": 0.16643256565870723, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8389867.2, "logits/rejected": -69827104.0, "logps/chosen": -155.72628173828124, "logps/rejected": -207.2501017252604, "loss": 0.4342, "rewards/chosen": -0.11658310890197754, "rewards/margins": 0.9753426710764568, "rewards/rejected": -1.0919257799784343, "step": 3140 }, { "epoch": 0.16648556966050937, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10483470.4, "logits/rejected": -10766712.666666666, "logps/chosen": -357.9115478515625, "logps/rejected": -284.29071044921875, "loss": 0.3832, "rewards/chosen": 0.03587461709976196, "rewards/margins": 1.7749111771583557, "rewards/rejected": -1.7390365600585938, "step": 3141 }, { "epoch": 0.1665385736623115, "grad_norm": 54.25, "kl": 0.8806877136230469, "learning_rate": 5e-07, "logits/chosen": -97893960.0, "logits/rejected": -9877137.0, "logps/chosen": -421.3993225097656, "logps/rejected": -305.78533935546875, "loss": 0.3016, "rewards/chosen": 0.4760902523994446, "rewards/margins": 2.20540589094162, "rewards/rejected": -1.7293156385421753, "step": 3142 }, { "epoch": 0.16659157766411364, "grad_norm": 39.5, "kl": 0.01042938232421875, "learning_rate": 5e-07, "logits/chosen": -18233861.333333332, "logits/rejected": -30312633.6, "logps/chosen": -161.22977701822916, "logps/rejected": -389.2646728515625, "loss": 0.2828, "rewards/chosen": -0.13570074240366617, "rewards/margins": 2.1301701148351033, "rewards/rejected": -2.2658708572387694, "step": 3143 }, { "epoch": 0.16664458166591578, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20022786.0, "logits/rejected": 4479186.0, "logps/chosen": -319.347412109375, "logps/rejected": -171.2577362060547, "loss": 0.299, "rewards/chosen": 0.5736067891120911, "rewards/margins": 1.8427167534828186, "rewards/rejected": -1.2691099643707275, "step": 3144 }, { "epoch": 0.16669758566771792, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55443258.666666664, "logits/rejected": -12022496.0, "logps/chosen": -312.8170572916667, "logps/rejected": -350.0218994140625, "loss": 0.2911, "rewards/chosen": 0.20955024162928262, "rewards/margins": 2.1425252477327983, "rewards/rejected": -1.9329750061035156, "step": 3145 }, { "epoch": 0.16675058966952006, "grad_norm": 70.0, "kl": 0.2725257873535156, "learning_rate": 5e-07, "logits/chosen": -41909248.0, "logits/rejected": -6257307.5, "logps/chosen": -413.46383231026783, "logps/rejected": -321.9744873046875, "loss": 0.4597, "rewards/chosen": 0.06214466265269688, "rewards/margins": 1.2604112880570548, "rewards/rejected": -1.198266625404358, "step": 3146 }, { "epoch": 0.1668035936713222, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -122633304.0, "logits/rejected": -35916578.666666664, "logps/chosen": -503.1956787109375, "logps/rejected": -413.5637613932292, "loss": 0.2223, "rewards/chosen": 0.0668640285730362, "rewards/margins": 2.0222250769535703, "rewards/rejected": -1.955361048380534, "step": 3147 }, { "epoch": 0.16685659767312433, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13243444.0, "logits/rejected": 36923148.8, "logps/chosen": -212.28824869791666, "logps/rejected": -623.652734375, "loss": 0.2339, "rewards/chosen": 0.2138249675432841, "rewards/margins": 2.4345372478167215, "rewards/rejected": -2.2207122802734376, "step": 3148 }, { "epoch": 0.16690960167492647, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31089869.333333332, "logits/rejected": -23185385.6, "logps/chosen": -702.726318359375, "logps/rejected": -298.671826171875, "loss": 0.3763, "rewards/chosen": -0.264789084593455, "rewards/margins": 1.1878444870313007, "rewards/rejected": -1.4526335716247558, "step": 3149 }, { "epoch": 0.1669626056767286, "grad_norm": 71.0, "kl": 0.94757080078125, "learning_rate": 5e-07, "logits/chosen": -33779404.8, "logits/rejected": 2451926.6666666665, "logps/chosen": -569.21181640625, "logps/rejected": -224.46659342447916, "loss": 0.3224, "rewards/chosen": 0.4510244846343994, "rewards/margins": 2.2478756427764894, "rewards/rejected": -1.7968511581420898, "step": 3150 }, { "epoch": 0.16701560967853074, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46689048.0, "logits/rejected": -18536920.0, "logps/chosen": -714.6805419921875, "logps/rejected": -480.59112548828125, "loss": 0.3055, "rewards/chosen": 0.32744142413139343, "rewards/margins": 1.9998746812343597, "rewards/rejected": -1.6724332571029663, "step": 3151 }, { "epoch": 0.16706861368033288, "grad_norm": 57.25, "kl": 0.10375022888183594, "learning_rate": 5e-07, "logits/chosen": -75229254.4, "logits/rejected": -18928272.0, "logps/chosen": -273.001318359375, "logps/rejected": -482.7588297526042, "loss": 0.3924, "rewards/chosen": -0.08135815262794495, "rewards/margins": 1.6501615464687347, "rewards/rejected": -1.7315196990966797, "step": 3152 }, { "epoch": 0.167121617682135, "grad_norm": 49.5, "kl": 1.004587173461914, "learning_rate": 5e-07, "logits/chosen": -5458464.5, "logits/rejected": -21615064.0, "logps/chosen": -213.00059509277344, "logps/rejected": -148.730712890625, "loss": 0.3574, "rewards/chosen": 0.16211628913879395, "rewards/margins": 1.5134778022766113, "rewards/rejected": -1.3513615131378174, "step": 3153 }, { "epoch": 0.16717462168393712, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35682562.666666664, "logits/rejected": -7682657.6, "logps/chosen": -739.9386393229166, "logps/rejected": -289.0237060546875, "loss": 0.1987, "rewards/chosen": 0.8772408962249756, "rewards/margins": 2.8974772930145263, "rewards/rejected": -2.0202363967895507, "step": 3154 }, { "epoch": 0.16722762568573926, "grad_norm": 58.0, "kl": 1.0736007690429688, "learning_rate": 5e-07, "logits/chosen": -27174899.2, "logits/rejected": -108958773.33333333, "logps/chosen": -522.5021484375, "logps/rejected": -295.9788818359375, "loss": 0.2667, "rewards/chosen": 0.9535253524780274, "rewards/margins": 2.4503984133402508, "rewards/rejected": -1.4968730608622234, "step": 3155 }, { "epoch": 0.1672806296875414, "grad_norm": 60.0, "kl": 0.23139572143554688, "learning_rate": 5e-07, "logits/chosen": -52637264.0, "logits/rejected": -23554881.6, "logps/chosen": -717.7461751302084, "logps/rejected": -327.3806884765625, "loss": 0.2455, "rewards/chosen": 0.8061361312866211, "rewards/margins": 2.433885383605957, "rewards/rejected": -1.627749252319336, "step": 3156 }, { "epoch": 0.16733363368934354, "grad_norm": 39.5, "kl": 0.23076915740966797, "learning_rate": 5e-07, "logits/chosen": -40569180.0, "logits/rejected": -50739952.0, "logps/chosen": -161.43006896972656, "logps/rejected": -579.9557495117188, "loss": 0.2913, "rewards/chosen": 0.05821209400892258, "rewards/margins": 2.621383957564831, "rewards/rejected": -2.563171863555908, "step": 3157 }, { "epoch": 0.16738663769114567, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4782170.8, "logits/rejected": -26945757.333333332, "logps/chosen": -116.29254150390625, "logps/rejected": -372.9951171875, "loss": 0.3707, "rewards/chosen": 0.05955212712287903, "rewards/margins": 1.6013887067635852, "rewards/rejected": -1.5418365796407063, "step": 3158 }, { "epoch": 0.1674396416929478, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103721344.0, "logits/rejected": 22704928.0, "logps/chosen": -386.6321614583333, "logps/rejected": -287.58212890625, "loss": 0.3514, "rewards/chosen": 0.4154093662897746, "rewards/margins": 1.2224897305170694, "rewards/rejected": -0.8070803642272949, "step": 3159 }, { "epoch": 0.16749264569474995, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35085434.666666664, "logits/rejected": -51545852.8, "logps/chosen": -173.9048868815104, "logps/rejected": -415.1634765625, "loss": 0.3476, "rewards/chosen": -0.60748823483785, "rewards/margins": 1.2238306204477944, "rewards/rejected": -1.8313188552856445, "step": 3160 }, { "epoch": 0.16754564969655208, "grad_norm": 59.0, "kl": 0.5818862915039062, "learning_rate": 5e-07, "logits/chosen": -31449408.0, "logits/rejected": -24623747.2, "logps/chosen": -612.5863850911459, "logps/rejected": -485.613671875, "loss": 0.2564, "rewards/chosen": 0.21262512604395548, "rewards/margins": 2.732340625921885, "rewards/rejected": -2.51971549987793, "step": 3161 }, { "epoch": 0.16759865369835422, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30847840.0, "logits/rejected": -45854860.8, "logps/chosen": -186.4559122721354, "logps/rejected": -325.741162109375, "loss": 0.2402, "rewards/chosen": 0.5715209643046061, "rewards/margins": 2.269034163157145, "rewards/rejected": -1.6975131988525392, "step": 3162 }, { "epoch": 0.16765165770015636, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4606161.5, "logits/rejected": -49827893.333333336, "logps/chosen": -210.43490600585938, "logps/rejected": -443.448486328125, "loss": 0.2452, "rewards/chosen": 0.5687792897224426, "rewards/margins": 2.4322538177172346, "rewards/rejected": -1.8634745279947917, "step": 3163 }, { "epoch": 0.1677046617019585, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37304595.2, "logits/rejected": -44214466.666666664, "logps/chosen": -395.177880859375, "logps/rejected": -335.8258056640625, "loss": 0.3998, "rewards/chosen": 0.03966255187988281, "rewards/margins": 1.1561513264973957, "rewards/rejected": -1.116488774617513, "step": 3164 }, { "epoch": 0.16775766570376063, "grad_norm": 65.0, "kl": 0.05536365509033203, "learning_rate": 5e-07, "logits/chosen": -25102925.333333332, "logits/rejected": 147428400.0, "logps/chosen": -226.29207356770834, "logps/rejected": -464.3260498046875, "loss": 0.4681, "rewards/chosen": -0.38586413860321045, "rewards/margins": 1.622216820716858, "rewards/rejected": -2.0080809593200684, "step": 3165 }, { "epoch": 0.16781066970556277, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28483760.0, "logits/rejected": -16659826.0, "logps/chosen": -180.28497314453125, "logps/rejected": -142.2050323486328, "loss": 0.3554, "rewards/chosen": 0.2763547897338867, "rewards/margins": 1.32784104347229, "rewards/rejected": -1.0514862537384033, "step": 3166 }, { "epoch": 0.1678636737073649, "grad_norm": 49.75, "kl": 3.2683095932006836, "learning_rate": 5e-07, "logits/chosen": -52208464.0, "logits/rejected": -55001532.0, "logps/chosen": -495.02020263671875, "logps/rejected": -498.0317687988281, "loss": 0.3132, "rewards/chosen": 0.7140591144561768, "rewards/margins": 2.8017711639404297, "rewards/rejected": -2.087712049484253, "step": 3167 }, { "epoch": 0.16791667770916704, "grad_norm": 44.5, "kl": 0.059234619140625, "learning_rate": 5e-07, "logits/chosen": -12536410.666666666, "logits/rejected": -14712971.0, "logps/chosen": -187.54585774739584, "logps/rejected": -64.26859283447266, "loss": 0.4152, "rewards/chosen": 0.18105697631835938, "rewards/margins": 1.1746403574943542, "rewards/rejected": -0.9935833811759949, "step": 3168 }, { "epoch": 0.16796968171096918, "grad_norm": 36.5, "kl": 0.48586368560791016, "learning_rate": 5e-07, "logits/chosen": -34155568.0, "logits/rejected": -27679222.0, "logps/chosen": -257.04864501953125, "logps/rejected": -240.46054077148438, "loss": 0.3098, "rewards/chosen": 0.43233487010002136, "rewards/margins": 1.9015482366085052, "rewards/rejected": -1.4692133665084839, "step": 3169 }, { "epoch": 0.16802268571277132, "grad_norm": 53.5, "kl": 0.2879905700683594, "learning_rate": 5e-07, "logits/chosen": 2861854.6666666665, "logits/rejected": -18442910.0, "logps/chosen": -187.17144775390625, "logps/rejected": -453.2215270996094, "loss": 0.4949, "rewards/chosen": -0.25037022431691486, "rewards/margins": 0.7916188637415569, "rewards/rejected": -1.0419890880584717, "step": 3170 }, { "epoch": 0.16807568971457346, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2395696.5, "logits/rejected": -22636888.0, "logps/chosen": -15.151981353759766, "logps/rejected": -433.6522623697917, "loss": 0.2356, "rewards/chosen": 0.3457402288913727, "rewards/margins": 2.689019113779068, "rewards/rejected": -2.3432788848876953, "step": 3171 }, { "epoch": 0.1681286937163756, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43689138.666666664, "logits/rejected": -8167574.4, "logps/chosen": -538.0064290364584, "logps/rejected": -334.5132568359375, "loss": 0.3556, "rewards/chosen": -0.16846048831939697, "rewards/margins": 1.0568864583969115, "rewards/rejected": -1.2253469467163085, "step": 3172 }, { "epoch": 0.16818169771817773, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9186765.0, "logits/rejected": -35894176.0, "logps/chosen": -115.77809143066406, "logps/rejected": -509.69683837890625, "loss": 0.3376, "rewards/chosen": -0.2770686745643616, "rewards/margins": 1.8206173777580261, "rewards/rejected": -2.0976860523223877, "step": 3173 }, { "epoch": 0.16823470171997987, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 983048.75, "logits/rejected": -15706608.0, "logps/chosen": -65.90231831868489, "logps/rejected": -186.29034423828125, "loss": 0.3241, "rewards/chosen": -0.528262217839559, "rewards/margins": 1.3205405394236247, "rewards/rejected": -1.8488027572631835, "step": 3174 }, { "epoch": 0.168287705721782, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36465314.666666664, "logits/rejected": -28492153.6, "logps/chosen": -458.0034993489583, "logps/rejected": -264.322216796875, "loss": 0.2337, "rewards/chosen": 0.4416036208470662, "rewards/margins": 2.398490293820699, "rewards/rejected": -1.9568866729736327, "step": 3175 }, { "epoch": 0.16834070972358414, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30226224.0, "logits/rejected": -44226136.0, "logps/chosen": -138.63230895996094, "logps/rejected": -435.9421691894531, "loss": 0.3425, "rewards/chosen": -0.10653306543827057, "rewards/margins": 1.6624097675085068, "rewards/rejected": -1.7689428329467773, "step": 3176 }, { "epoch": 0.16839371372538628, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6313818.0, "logits/rejected": -14484021.714285715, "logps/chosen": -192.91854858398438, "logps/rejected": -288.74012974330356, "loss": 0.2554, "rewards/chosen": -0.13968200981616974, "rewards/margins": 1.5000240568603789, "rewards/rejected": -1.6397060666765486, "step": 3177 }, { "epoch": 0.16844671772718842, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2316585.6666666665, "logits/rejected": -57207379.2, "logps/chosen": -341.7719319661458, "logps/rejected": -247.858154296875, "loss": 0.2449, "rewards/chosen": 0.5732461214065552, "rewards/margins": 2.2582316637039184, "rewards/rejected": -1.6849855422973632, "step": 3178 }, { "epoch": 0.16849972172899053, "grad_norm": 67.0, "kl": 0.46251678466796875, "learning_rate": 5e-07, "logits/chosen": -30367312.0, "logits/rejected": -39192612.0, "logps/chosen": -485.0966796875, "logps/rejected": -354.7856140136719, "loss": 0.273, "rewards/chosen": 0.7026390433311462, "rewards/margins": 2.426708161830902, "rewards/rejected": -1.7240691184997559, "step": 3179 }, { "epoch": 0.16855272573079266, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25923976.0, "logits/rejected": -45887957.333333336, "logps/chosen": -521.3101806640625, "logps/rejected": -362.1144205729167, "loss": 0.2728, "rewards/chosen": 0.464324951171875, "rewards/margins": 1.8208476702372234, "rewards/rejected": -1.3565227190653484, "step": 3180 }, { "epoch": 0.1686057297325948, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12330551.0, "logits/rejected": -17310168.0, "logps/chosen": -133.7676544189453, "logps/rejected": -297.79595947265625, "loss": 0.2706, "rewards/chosen": 0.19168569147586823, "rewards/margins": 2.572213187813759, "rewards/rejected": -2.3805274963378906, "step": 3181 }, { "epoch": 0.16865873373439694, "grad_norm": 55.75, "kl": 0.06421852111816406, "learning_rate": 5e-07, "logits/chosen": -79234073.6, "logits/rejected": -43650066.666666664, "logps/chosen": -260.812548828125, "logps/rejected": -500.5016682942708, "loss": 0.3116, "rewards/chosen": 0.43073453903198244, "rewards/margins": 2.0812183062235516, "rewards/rejected": -1.650483767191569, "step": 3182 }, { "epoch": 0.16871173773619907, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32254068.0, "logits/rejected": 37903244.0, "logps/chosen": -225.97750854492188, "logps/rejected": -183.929443359375, "loss": 0.383, "rewards/chosen": -0.1992063671350479, "rewards/margins": 1.1522376388311386, "rewards/rejected": -1.3514440059661865, "step": 3183 }, { "epoch": 0.1687647417380012, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3469510.0, "logits/rejected": -9321116.0, "logps/chosen": -342.1732177734375, "logps/rejected": -237.57631138392858, "loss": 0.2668, "rewards/chosen": 1.1357147693634033, "rewards/margins": 2.278066737311227, "rewards/rejected": -1.1423519679478236, "step": 3184 }, { "epoch": 0.16881774573980335, "grad_norm": 63.5, "kl": 0.08958625793457031, "learning_rate": 5e-07, "logits/chosen": -30104925.333333332, "logits/rejected": -17620826.0, "logps/chosen": -215.01505533854166, "logps/rejected": -261.30865478515625, "loss": 0.4331, "rewards/chosen": -0.060783972342809044, "rewards/margins": 1.421661506096522, "rewards/rejected": -1.482445478439331, "step": 3185 }, { "epoch": 0.16887074974160549, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5741311.333333333, "logits/rejected": 9372800.0, "logps/chosen": -242.68619791666666, "logps/rejected": -200.7251739501953, "loss": 0.3702, "rewards/chosen": 0.4691602389017741, "rewards/margins": 1.3888776699701946, "rewards/rejected": -0.9197174310684204, "step": 3186 }, { "epoch": 0.16892375374340762, "grad_norm": 72.5, "kl": 0.5320663452148438, "learning_rate": 5e-07, "logits/chosen": -47920424.0, "logits/rejected": -23226637.333333332, "logps/chosen": -847.275390625, "logps/rejected": -331.4789632161458, "loss": 0.2315, "rewards/chosen": 0.8032897710800171, "rewards/margins": 2.5441179672876997, "rewards/rejected": -1.7408281962076824, "step": 3187 }, { "epoch": 0.16897675774520976, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15985005.0, "logits/rejected": -27211724.0, "logps/chosen": -298.69647216796875, "logps/rejected": -413.6636962890625, "loss": 0.2657, "rewards/chosen": 0.2456449568271637, "rewards/margins": 2.6648856699466705, "rewards/rejected": -2.419240713119507, "step": 3188 }, { "epoch": 0.1690297617470119, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9091854.0, "logits/rejected": -18168496.0, "logps/chosen": -124.15326690673828, "logps/rejected": -368.4628499348958, "loss": 0.2962, "rewards/chosen": -0.15717697143554688, "rewards/margins": 1.4263747533162434, "rewards/rejected": -1.5835517247517903, "step": 3189 }, { "epoch": 0.16908276574881403, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101118480.0, "logits/rejected": -23637618.285714287, "logps/chosen": -509.4909362792969, "logps/rejected": -426.2157505580357, "loss": 0.159, "rewards/chosen": 0.6952056884765625, "rewards/margins": 3.0260729108537947, "rewards/rejected": -2.330867222377232, "step": 3190 }, { "epoch": 0.16913576975061617, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13294550.4, "logits/rejected": -61927360.0, "logps/chosen": -273.857958984375, "logps/rejected": -234.26411946614584, "loss": 0.3411, "rewards/chosen": 0.2787044525146484, "rewards/margins": 1.8607194900512696, "rewards/rejected": -1.582015037536621, "step": 3191 }, { "epoch": 0.1691887737524183, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -16414375.0, "logps/rejected": -384.37469482421875, "loss": 0.1518, "rewards/rejected": -2.0647201538085938, "step": 3192 }, { "epoch": 0.16924177775422045, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3100648.5, "logits/rejected": -51930752.0, "logps/chosen": -186.3863067626953, "logps/rejected": -418.2465413411458, "loss": 0.2227, "rewards/chosen": 0.9099162817001343, "rewards/margins": 2.571887056032817, "rewards/rejected": -1.6619707743326824, "step": 3193 }, { "epoch": 0.16929478175602258, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13078119.0, "logits/rejected": -22850125.333333332, "logps/chosen": -46.807373046875, "logps/rejected": -438.674560546875, "loss": 0.235, "rewards/chosen": 0.5171035528182983, "rewards/margins": 2.4830859899520874, "rewards/rejected": -1.965982437133789, "step": 3194 }, { "epoch": 0.16934778575782472, "grad_norm": 50.25, "kl": 0.6953153610229492, "learning_rate": 5e-07, "logits/chosen": -12807736.0, "logits/rejected": -2371412.0, "logps/chosen": -253.25873674665178, "logps/rejected": -79.01042938232422, "loss": 0.3413, "rewards/chosen": 0.6166792597089495, "rewards/margins": 2.6919100965772356, "rewards/rejected": -2.075230836868286, "step": 3195 }, { "epoch": 0.16940078975962686, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38314312.0, "logits/rejected": -28766329.6, "logps/chosen": -398.5187174479167, "logps/rejected": -406.61337890625, "loss": 0.2806, "rewards/chosen": 0.1338459054629008, "rewards/margins": 1.9705257455507914, "rewards/rejected": -1.8366798400878905, "step": 3196 }, { "epoch": 0.169453793761429, "grad_norm": 60.5, "kl": 0.3120136260986328, "learning_rate": 5e-07, "logits/chosen": -30150229.333333332, "logits/rejected": -34653014.4, "logps/chosen": -472.7141927083333, "logps/rejected": -676.496337890625, "loss": 0.2296, "rewards/chosen": 0.13280131419499716, "rewards/margins": 3.248397894700368, "rewards/rejected": -3.115596580505371, "step": 3197 }, { "epoch": 0.16950679776323113, "grad_norm": 398.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54718696.0, "logits/rejected": -353323.25, "logps/chosen": -397.8578796386719, "logps/rejected": -278.96881103515625, "loss": 0.394, "rewards/chosen": 0.039618782699108124, "rewards/margins": 1.0424058958888054, "rewards/rejected": -1.0027871131896973, "step": 3198 }, { "epoch": 0.16955980176503327, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31850749.333333332, "logits/rejected": -26303033.6, "logps/chosen": -172.6226806640625, "logps/rejected": -433.826513671875, "loss": 0.2973, "rewards/chosen": -0.11225395401318868, "rewards/margins": 1.8487010935942332, "rewards/rejected": -1.9609550476074218, "step": 3199 }, { "epoch": 0.1696128057668354, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6681599.0, "logits/rejected": -59178698.666666664, "logps/chosen": -196.7860107421875, "logps/rejected": -487.9807535807292, "loss": 0.1964, "rewards/chosen": 0.6427881717681885, "rewards/margins": 2.841690937678019, "rewards/rejected": -2.1989027659098306, "step": 3200 }, { "epoch": 0.16966580976863754, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15824279.0, "logits/rejected": -15535830.0, "logps/chosen": -651.752685546875, "logps/rejected": -258.454345703125, "loss": 0.3545, "rewards/chosen": 0.2670789957046509, "rewards/margins": 1.708633303642273, "rewards/rejected": -1.441554307937622, "step": 3201 }, { "epoch": 0.16971881377043968, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43220632.0, "logits/rejected": -52377173.333333336, "logps/chosen": -369.0672607421875, "logps/rejected": -391.1243489583333, "loss": 0.2258, "rewards/chosen": 0.8636108636856079, "rewards/margins": 2.540432095527649, "rewards/rejected": -1.676821231842041, "step": 3202 }, { "epoch": 0.16977181777224182, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33380221.333333332, "logits/rejected": -42072918.4, "logps/chosen": -454.5498046875, "logps/rejected": -521.97802734375, "loss": 0.2428, "rewards/chosen": 0.10170897841453552, "rewards/margins": 2.6081302583217623, "rewards/rejected": -2.5064212799072267, "step": 3203 }, { "epoch": 0.16982482177404393, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18427252.0, "logits/rejected": -65427276.0, "logps/chosen": -195.94715881347656, "logps/rejected": -130.27389526367188, "loss": 0.4759, "rewards/chosen": -0.23373891413211823, "rewards/margins": 0.18378598988056183, "rewards/rejected": -0.41752490401268005, "step": 3204 }, { "epoch": 0.16987782577584606, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17412121.333333332, "logits/rejected": -40447792.0, "logps/chosen": -210.01143391927084, "logps/rejected": -449.178564453125, "loss": 0.2707, "rewards/chosen": -0.31423226992289227, "rewards/margins": 2.1916129906972253, "rewards/rejected": -2.5058452606201174, "step": 3205 }, { "epoch": 0.1699308297776482, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46104981.333333336, "logits/rejected": -70695500.8, "logps/chosen": -190.09564208984375, "logps/rejected": -398.9473876953125, "loss": 0.3094, "rewards/chosen": -0.1265586813290914, "rewards/margins": 1.6304359475771586, "rewards/rejected": -1.75699462890625, "step": 3206 }, { "epoch": 0.16998383377945034, "grad_norm": 60.0, "kl": 0.02492666244506836, "learning_rate": 5e-07, "logits/chosen": -36019853.333333336, "logits/rejected": -66779208.0, "logps/chosen": -406.0633138020833, "logps/rejected": -331.0352783203125, "loss": 0.3668, "rewards/chosen": 0.4680517514546712, "rewards/margins": 1.5018957455952961, "rewards/rejected": -1.033843994140625, "step": 3207 }, { "epoch": 0.17003683778125248, "grad_norm": 48.25, "kl": 0.4234466552734375, "learning_rate": 5e-07, "logits/chosen": -11842225.333333334, "logits/rejected": -11174615.2, "logps/chosen": -260.24342854817706, "logps/rejected": -218.009326171875, "loss": 0.347, "rewards/chosen": 0.02621777852376302, "rewards/margins": 1.2743974049886067, "rewards/rejected": -1.2481796264648437, "step": 3208 }, { "epoch": 0.1700898417830546, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12112205.714285715, "logits/rejected": 299101.96875, "logps/chosen": -648.9331752232143, "logps/rejected": -166.46463012695312, "loss": 0.3538, "rewards/chosen": 0.6446266174316406, "rewards/margins": 1.6088172793388367, "rewards/rejected": -0.964190661907196, "step": 3209 }, { "epoch": 0.17014284578485675, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 37742760.0, "logits/rejected": -16140345.0, "logps/chosen": -421.9556884765625, "logps/rejected": -258.9947814941406, "loss": 0.3058, "rewards/chosen": 0.4996486306190491, "rewards/margins": 2.3770728707313538, "rewards/rejected": -1.8774242401123047, "step": 3210 }, { "epoch": 0.1701958497866589, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10771579.2, "logits/rejected": -1313351.1666666667, "logps/chosen": -304.719482421875, "logps/rejected": -265.65407307942706, "loss": 0.3614, "rewards/chosen": 0.07912983894348144, "rewards/margins": 1.8038790861765543, "rewards/rejected": -1.724749247233073, "step": 3211 }, { "epoch": 0.17024885378846102, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59314496.0, "logits/rejected": -58150760.0, "logps/chosen": -437.55999755859375, "logps/rejected": -365.5441589355469, "loss": 0.3171, "rewards/chosen": 0.1894889771938324, "rewards/margins": 1.9350950419902802, "rewards/rejected": -1.7456060647964478, "step": 3212 }, { "epoch": 0.17030185779026316, "grad_norm": 55.25, "kl": 0.37818145751953125, "learning_rate": 5e-07, "logits/chosen": -69401594.66666667, "logits/rejected": -56498675.2, "logps/chosen": -428.4222005208333, "logps/rejected": -410.85654296875, "loss": 0.3019, "rewards/chosen": 0.11840059359868367, "rewards/margins": 2.1056556900342307, "rewards/rejected": -1.9872550964355469, "step": 3213 }, { "epoch": 0.1703548617920653, "grad_norm": 77.5, "kl": 0.07319355010986328, "learning_rate": 5e-07, "logits/chosen": -45051644.8, "logits/rejected": -54166672.0, "logps/chosen": -418.599560546875, "logps/rejected": -525.658203125, "loss": 0.3166, "rewards/chosen": 0.2573948860168457, "rewards/margins": 3.151551024119059, "rewards/rejected": -2.8941561381022134, "step": 3214 }, { "epoch": 0.17040786579386744, "grad_norm": 50.75, "kl": 0.45432281494140625, "learning_rate": 5e-07, "logits/chosen": -42284741.333333336, "logits/rejected": -47467904.0, "logps/chosen": -285.49428304036456, "logps/rejected": -588.751806640625, "loss": 0.2433, "rewards/chosen": 0.17595823605855307, "rewards/margins": 2.6165557702382407, "rewards/rejected": -2.4405975341796875, "step": 3215 }, { "epoch": 0.17046086979566957, "grad_norm": 56.5, "kl": 0.22667694091796875, "learning_rate": 5e-07, "logits/chosen": -3475047.6, "logits/rejected": 4133063.0, "logps/chosen": -204.2530029296875, "logps/rejected": -65.37296040852864, "loss": 0.4175, "rewards/chosen": 0.33982224464416505, "rewards/margins": 0.7566871563593547, "rewards/rejected": -0.41686491171518963, "step": 3216 }, { "epoch": 0.1705138737974717, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29324754.666666668, "logits/rejected": -5251792.4, "logps/chosen": -276.5536295572917, "logps/rejected": -155.44581298828126, "loss": 0.3317, "rewards/chosen": -0.00839182734489441, "rewards/margins": 1.5703129112720489, "rewards/rejected": -1.5787047386169433, "step": 3217 }, { "epoch": 0.17056687779927385, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 36252076.8, "logits/rejected": -68262229.33333333, "logps/chosen": -166.72706298828126, "logps/rejected": -380.6207275390625, "loss": 0.3785, "rewards/chosen": -0.04147751033306122, "rewards/margins": 1.837003083030383, "rewards/rejected": -1.878480593363444, "step": 3218 }, { "epoch": 0.17061988180107598, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41767210.666666664, "logits/rejected": -10042364.8, "logps/chosen": -370.3180745442708, "logps/rejected": -131.10863037109374, "loss": 0.322, "rewards/chosen": 0.18903605143229166, "rewards/margins": 1.4490688006083172, "rewards/rejected": -1.2600327491760255, "step": 3219 }, { "epoch": 0.17067288580287812, "grad_norm": 54.5, "kl": 1.1078624725341797, "learning_rate": 5e-07, "logits/chosen": -25174812.0, "logits/rejected": -8475296.0, "logps/chosen": -543.1353759765625, "logps/rejected": -416.56048583984375, "loss": 0.3128, "rewards/chosen": 0.4568466544151306, "rewards/margins": 2.151919424533844, "rewards/rejected": -1.6950727701187134, "step": 3220 }, { "epoch": 0.17072588980468026, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1222407.75, "logits/rejected": -8041992.0, "logps/chosen": -151.06597900390625, "logps/rejected": -274.889404296875, "loss": 0.234, "rewards/chosen": 0.6454209089279175, "rewards/margins": 2.4030544360478716, "rewards/rejected": -1.7576335271199544, "step": 3221 }, { "epoch": 0.1707788938064824, "grad_norm": 48.0, "kl": 0.7138710021972656, "learning_rate": 5e-07, "logits/chosen": -22864101.333333332, "logits/rejected": -13591841.6, "logps/chosen": -302.91387939453125, "logps/rejected": -479.1072265625, "loss": 0.2588, "rewards/chosen": 0.18509344259897867, "rewards/margins": 2.4956475655237833, "rewards/rejected": -2.3105541229248048, "step": 3222 }, { "epoch": 0.17083189780828453, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48482416.0, "logits/rejected": -36490116.571428575, "logps/chosen": -296.06402587890625, "logps/rejected": -415.3597935267857, "loss": 0.2065, "rewards/chosen": -0.01245727576315403, "rewards/margins": 2.2842457903815165, "rewards/rejected": -2.2967030661446706, "step": 3223 }, { "epoch": 0.17088490181008667, "grad_norm": 48.25, "kl": 0.1835041046142578, "learning_rate": 5e-07, "logits/chosen": -10659596.0, "logits/rejected": -29523481.6, "logps/chosen": -302.93210856119794, "logps/rejected": -318.3409912109375, "loss": 0.2529, "rewards/chosen": 0.3650991916656494, "rewards/margins": 2.356801080703735, "rewards/rejected": -1.991701889038086, "step": 3224 }, { "epoch": 0.1709379058118888, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40768664.0, "logits/rejected": -17706085.333333332, "logps/chosen": -698.9585571289062, "logps/rejected": -206.83089192708334, "loss": 0.2326, "rewards/chosen": 1.2525100708007812, "rewards/margins": 2.4701017538706465, "rewards/rejected": -1.217591683069865, "step": 3225 }, { "epoch": 0.17099090981369094, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29735510.0, "logits/rejected": 2441796.5, "logps/chosen": -439.15814208984375, "logps/rejected": -460.62841796875, "loss": 0.2763, "rewards/chosen": 0.3100036680698395, "rewards/margins": 2.6049253046512604, "rewards/rejected": -2.294921636581421, "step": 3226 }, { "epoch": 0.17104391381549308, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10508976.0, "logits/rejected": -37882954.666666664, "logps/chosen": -181.037353515625, "logps/rejected": -361.7402750651042, "loss": 0.3741, "rewards/chosen": 0.11019797325134277, "rewards/margins": 1.551141055425008, "rewards/rejected": -1.4409430821736653, "step": 3227 }, { "epoch": 0.17109691781729522, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93606816.0, "logits/rejected": -5765369.5, "logps/chosen": -394.4549560546875, "logps/rejected": -133.31520080566406, "loss": 0.3144, "rewards/chosen": 0.5416015386581421, "rewards/margins": 1.7575000524520874, "rewards/rejected": -1.2158985137939453, "step": 3228 }, { "epoch": 0.17114992181909733, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36999814.4, "logits/rejected": -106128800.0, "logps/chosen": -442.24140625, "logps/rejected": -363.4771728515625, "loss": 0.28, "rewards/chosen": 0.8705657958984375, "rewards/margins": 2.043675009409587, "rewards/rejected": -1.1731092135111492, "step": 3229 }, { "epoch": 0.17120292582089947, "grad_norm": 55.75, "kl": 0.1571502685546875, "learning_rate": 5e-07, "logits/chosen": -7706395.0, "logits/rejected": 131432424.0, "logps/chosen": -248.75686645507812, "logps/rejected": -531.1437377929688, "loss": 0.2867, "rewards/chosen": 0.092485710978508, "rewards/margins": 2.5990951508283615, "rewards/rejected": -2.5066094398498535, "step": 3230 }, { "epoch": 0.1712559298227016, "grad_norm": 51.75, "kl": 0.7515773773193359, "learning_rate": 5e-07, "logits/chosen": -42412970.666666664, "logits/rejected": -9484809.6, "logps/chosen": -448.7744547526042, "logps/rejected": -307.3130615234375, "loss": 0.2845, "rewards/chosen": 0.04962386687596639, "rewards/margins": 1.890905757745107, "rewards/rejected": -1.8412818908691406, "step": 3231 }, { "epoch": 0.17130893382450374, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22676304.0, "logits/rejected": -57082533.333333336, "logps/chosen": -112.708447265625, "logps/rejected": -420.0652262369792, "loss": 0.4083, "rewards/chosen": -0.1918583631515503, "rewards/margins": 1.3368845860163372, "rewards/rejected": -1.5287429491678874, "step": 3232 }, { "epoch": 0.17136193782630588, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53106064.0, "logits/rejected": -25517928.0, "logps/chosen": -266.26104736328125, "logps/rejected": -336.681884765625, "loss": 0.3446, "rewards/chosen": -0.40398427844047546, "rewards/margins": 2.0869829952716827, "rewards/rejected": -2.490967273712158, "step": 3233 }, { "epoch": 0.17141494182810801, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 36719.4140625, "logits/rejected": 1928438.75, "logps/chosen": -75.02403259277344, "logps/rejected": -252.82205200195312, "loss": 0.3385, "rewards/chosen": 0.17248377203941345, "rewards/margins": 1.6852760016918182, "rewards/rejected": -1.5127922296524048, "step": 3234 }, { "epoch": 0.17146794582991015, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14849392.0, "logits/rejected": -50644406.4, "logps/chosen": -311.5387369791667, "logps/rejected": -336.39140625, "loss": 0.2296, "rewards/chosen": 0.5566814740498861, "rewards/margins": 2.6408308347066245, "rewards/rejected": -2.0841493606567383, "step": 3235 }, { "epoch": 0.1715209498317123, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30947597.333333332, "logits/rejected": -34021814.4, "logps/chosen": -477.2504475911458, "logps/rejected": -162.4396240234375, "loss": 0.2944, "rewards/chosen": 0.21136474609375, "rewards/margins": 1.6973334312438966, "rewards/rejected": -1.4859686851501466, "step": 3236 }, { "epoch": 0.17157395383351443, "grad_norm": 58.0, "kl": 0.8619804382324219, "learning_rate": 5e-07, "logits/chosen": -14095278.4, "logits/rejected": -16820625.333333332, "logps/chosen": -342.71748046875, "logps/rejected": -80.5103759765625, "loss": 0.3382, "rewards/chosen": 0.574524211883545, "rewards/margins": 1.6271441459655762, "rewards/rejected": -1.0526199340820312, "step": 3237 }, { "epoch": 0.17162695783531656, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30866541.714285713, "logits/rejected": -8408170.0, "logps/chosen": -322.4761439732143, "logps/rejected": -584.1282958984375, "loss": 0.4665, "rewards/chosen": -0.15760023253304617, "rewards/margins": 4.287187269755772, "rewards/rejected": -4.444787502288818, "step": 3238 }, { "epoch": 0.1716799618371187, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44896924.0, "logits/rejected": -24662754.0, "logps/chosen": -400.0185546875, "logps/rejected": -344.64794921875, "loss": 0.2736, "rewards/chosen": 0.18197326362133026, "rewards/margins": 2.655060574412346, "rewards/rejected": -2.4730873107910156, "step": 3239 }, { "epoch": 0.17173296583892084, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19099864.0, "logits/rejected": -41854088.0, "logps/chosen": -386.14849853515625, "logps/rejected": -508.99542236328125, "loss": 0.3128, "rewards/chosen": 0.018782049417495728, "rewards/margins": 2.2803849279880524, "rewards/rejected": -2.2616028785705566, "step": 3240 }, { "epoch": 0.17178596984072297, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53862224.0, "logits/rejected": -50738736.0, "logps/chosen": -316.5951334635417, "logps/rejected": -391.83441162109375, "loss": 0.4313, "rewards/chosen": -0.14230063557624817, "rewards/margins": 1.8912062346935272, "rewards/rejected": -2.0335068702697754, "step": 3241 }, { "epoch": 0.1718389738425251, "grad_norm": 42.75, "kl": 0.36724090576171875, "learning_rate": 5e-07, "logits/chosen": -19773508.8, "logits/rejected": -15801837.333333334, "logps/chosen": -217.2941650390625, "logps/rejected": -486.2820638020833, "loss": 0.3266, "rewards/chosen": 0.27146244049072266, "rewards/margins": 2.5052088101704917, "rewards/rejected": -2.233746369679769, "step": 3242 }, { "epoch": 0.17189197784432725, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4306846.0, "logits/rejected": -37591347.2, "logps/chosen": -148.73486328125, "logps/rejected": -435.717236328125, "loss": 0.2314, "rewards/chosen": 0.5816057523091634, "rewards/margins": 2.8375597318013512, "rewards/rejected": -2.2559539794921877, "step": 3243 }, { "epoch": 0.17194498184612939, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9747424.0, "logits/rejected": -26514674.666666668, "logps/chosen": -109.06337890625, "logps/rejected": -219.6175740559896, "loss": 0.4789, "rewards/chosen": -0.2980775833129883, "rewards/margins": 0.7357292970021565, "rewards/rejected": -1.0338068803151448, "step": 3244 }, { "epoch": 0.17199798584793152, "grad_norm": 67.5, "kl": 1.9654388427734375, "learning_rate": 5e-07, "logits/chosen": -23907520.0, "logits/rejected": 17106624.0, "logps/chosen": -387.8750813802083, "logps/rejected": -404.8403625488281, "loss": 0.3134, "rewards/chosen": 0.7648287614186605, "rewards/margins": 3.374906619389852, "rewards/rejected": -2.6100778579711914, "step": 3245 }, { "epoch": 0.17205098984973366, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21935958.0, "logits/rejected": -20250141.333333332, "logps/chosen": -187.84202575683594, "logps/rejected": -262.74755859375, "loss": 0.2368, "rewards/chosen": 0.002246856689453125, "rewards/margins": 1.9846744537353516, "rewards/rejected": -1.9824275970458984, "step": 3246 }, { "epoch": 0.1721039938515358, "grad_norm": 74.0, "kl": 0.6945877075195312, "learning_rate": 5e-07, "logits/chosen": -66140112.0, "logits/rejected": -53055808.0, "logps/chosen": -226.18194580078125, "logps/rejected": -515.4556274414062, "loss": 0.4232, "rewards/chosen": -0.06531859437624614, "rewards/margins": 3.0992660423119864, "rewards/rejected": -3.1645846366882324, "step": 3247 }, { "epoch": 0.17215699785333793, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15983752.0, "logits/rejected": -131754040.0, "logps/chosen": -313.9930943080357, "logps/rejected": -556.2308349609375, "loss": 0.533, "rewards/chosen": -0.4323878288269043, "rewards/margins": 2.7290868759155273, "rewards/rejected": -3.1614747047424316, "step": 3248 }, { "epoch": 0.17221000185514007, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -125386000.0, "logits/rejected": -24928397.333333332, "logps/chosen": -465.4120788574219, "logps/rejected": -180.70037841796875, "loss": 0.3271, "rewards/chosen": -0.5473541021347046, "rewards/margins": 0.8466488122940063, "rewards/rejected": -1.394002914428711, "step": 3249 }, { "epoch": 0.1722630058569422, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19493841.6, "logits/rejected": -17064641.333333332, "logps/chosen": -407.2023681640625, "logps/rejected": -300.12058512369794, "loss": 0.2653, "rewards/chosen": 0.5521081924438477, "rewards/margins": 2.943578783671061, "rewards/rejected": -2.3914705912272134, "step": 3250 }, { "epoch": 0.17231600985874435, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1438181.125, "logits/rejected": -53164952.0, "logps/chosen": -165.95274353027344, "logps/rejected": -316.7105407714844, "loss": 0.3334, "rewards/chosen": 0.14123135805130005, "rewards/margins": 2.2140219807624817, "rewards/rejected": -2.0727906227111816, "step": 3251 }, { "epoch": 0.17236901386054648, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39483654.4, "logits/rejected": -81914672.0, "logps/chosen": -203.0158447265625, "logps/rejected": -379.7069498697917, "loss": 0.3693, "rewards/chosen": 0.0665168046951294, "rewards/margins": 2.6226063648859657, "rewards/rejected": -2.5560895601908364, "step": 3252 }, { "epoch": 0.17242201786234862, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38504739.2, "logits/rejected": -53233722.666666664, "logps/chosen": -272.0404296875, "logps/rejected": -299.9785563151042, "loss": 0.3408, "rewards/chosen": 0.318580174446106, "rewards/margins": 1.9654999017715453, "rewards/rejected": -1.6469197273254395, "step": 3253 }, { "epoch": 0.17247502186415076, "grad_norm": 53.0, "kl": 0.3961830139160156, "learning_rate": 5e-07, "logits/chosen": -45051832.0, "logits/rejected": -8228673.5, "logps/chosen": -278.35308837890625, "logps/rejected": -156.09193420410156, "loss": 0.3417, "rewards/chosen": 0.40512096881866455, "rewards/margins": 1.7645270824432373, "rewards/rejected": -1.3594061136245728, "step": 3254 }, { "epoch": 0.17252802586595287, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4595508.5, "logits/rejected": -7989261.714285715, "logps/chosen": -75.00921630859375, "logps/rejected": -117.57721819196429, "loss": 0.3094, "rewards/chosen": 0.05366211012005806, "rewards/margins": 1.038649232792003, "rewards/rejected": -0.9849871226719448, "step": 3255 }, { "epoch": 0.172581029867755, "grad_norm": 62.5, "kl": 0.26117515563964844, "learning_rate": 5e-07, "logits/chosen": -42693400.0, "logits/rejected": -14535390.4, "logps/chosen": -446.685302734375, "logps/rejected": -184.0438232421875, "loss": 0.3182, "rewards/chosen": 0.2727157672246297, "rewards/margins": 1.7708152850468952, "rewards/rejected": -1.4980995178222656, "step": 3256 }, { "epoch": 0.17263403386955714, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81673536.0, "logits/rejected": 11317829.333333334, "logps/chosen": -252.2276153564453, "logps/rejected": -425.6065266927083, "loss": 0.2693, "rewards/chosen": -0.2600860595703125, "rewards/margins": 1.4769269625345867, "rewards/rejected": -1.7370130221048992, "step": 3257 }, { "epoch": 0.17268703787135928, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17653542.4, "logits/rejected": -22155405.333333332, "logps/chosen": -197.97645263671876, "logps/rejected": -100.99832153320312, "loss": 0.3102, "rewards/chosen": 0.40999646186828614, "rewards/margins": 2.2534913539886476, "rewards/rejected": -1.8434948921203613, "step": 3258 }, { "epoch": 0.17274004187316142, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39105196.0, "logits/rejected": -34099580.0, "logps/chosen": -317.7812194824219, "logps/rejected": -335.6460876464844, "loss": 0.2908, "rewards/chosen": 0.4759616255760193, "rewards/margins": 2.151380240917206, "rewards/rejected": -1.6754186153411865, "step": 3259 }, { "epoch": 0.17279304587496355, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20526662.666666668, "logits/rejected": -3693038.4, "logps/chosen": -192.9338582356771, "logps/rejected": -477.2060546875, "loss": 0.2067, "rewards/chosen": 0.33056894938151044, "rewards/margins": 2.9582255045572916, "rewards/rejected": -2.6276565551757813, "step": 3260 }, { "epoch": 0.1728460498767657, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36325912.0, "logits/rejected": 12469436.0, "logps/chosen": -176.439453125, "logps/rejected": -218.46246337890625, "loss": 0.3221, "rewards/chosen": -0.4220283627510071, "rewards/margins": 0.8142800529797871, "rewards/rejected": -1.2363084157307942, "step": 3261 }, { "epoch": 0.17289905387856783, "grad_norm": 49.25, "kl": 0.047260284423828125, "learning_rate": 5e-07, "logits/chosen": -13576894.666666666, "logits/rejected": -29105726.0, "logps/chosen": -130.83153279622397, "logps/rejected": -365.2831115722656, "loss": 0.3892, "rewards/chosen": 0.08665043115615845, "rewards/margins": 2.1173580288887024, "rewards/rejected": -2.030707597732544, "step": 3262 }, { "epoch": 0.17295205788036996, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23320226.666666668, "logits/rejected": -27875632.0, "logps/chosen": -136.6072794596354, "logps/rejected": -262.1140869140625, "loss": 0.2729, "rewards/chosen": 0.23425928751627603, "rewards/margins": 1.8551952997843426, "rewards/rejected": -1.6209360122680665, "step": 3263 }, { "epoch": 0.1730050618821721, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25555160.0, "logits/rejected": -13108320.0, "logps/chosen": -156.1213582356771, "logps/rejected": -196.7012451171875, "loss": 0.3666, "rewards/chosen": 0.011861672004063925, "rewards/margins": 1.2774174352486927, "rewards/rejected": -1.2655557632446288, "step": 3264 }, { "epoch": 0.17305806588397424, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58767093.333333336, "logits/rejected": -72466080.0, "logps/chosen": -243.2152303059896, "logps/rejected": -432.610009765625, "loss": 0.312, "rewards/chosen": -0.39844616254170734, "rewards/margins": 1.5506417433420818, "rewards/rejected": -1.949087905883789, "step": 3265 }, { "epoch": 0.17311106988577638, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17234760.0, "logits/rejected": -16731764.0, "logps/chosen": -139.39267578125, "logps/rejected": -182.32088216145834, "loss": 0.3381, "rewards/chosen": 0.11031570434570312, "rewards/margins": 2.180318737030029, "rewards/rejected": -2.070003032684326, "step": 3266 }, { "epoch": 0.1731640738875785, "grad_norm": 55.0, "kl": 2.2078495025634766, "learning_rate": 5e-07, "logits/chosen": -13611713.333333334, "logits/rejected": -35453420.0, "logps/chosen": -238.29510498046875, "logps/rejected": -198.31588745117188, "loss": 0.4664, "rewards/chosen": 0.12213629484176636, "rewards/margins": 0.6609809994697571, "rewards/rejected": -0.5388447046279907, "step": 3267 }, { "epoch": 0.17321707788938065, "grad_norm": 56.0, "kl": 0.147552490234375, "learning_rate": 5e-07, "logits/chosen": -30862154.666666668, "logits/rejected": -21861936.0, "logps/chosen": -386.7294108072917, "logps/rejected": -226.5854949951172, "loss": 0.4071, "rewards/chosen": 0.1368336578210195, "rewards/margins": 1.6747864385445912, "rewards/rejected": -1.5379527807235718, "step": 3268 }, { "epoch": 0.1732700818911828, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49883541.333333336, "logits/rejected": -37785232.0, "logps/chosen": -300.1029866536458, "logps/rejected": -281.031005859375, "loss": 0.3801, "rewards/chosen": 0.11050176620483398, "rewards/margins": 2.3622751235961914, "rewards/rejected": -2.2517733573913574, "step": 3269 }, { "epoch": 0.17332308589298492, "grad_norm": 64.5, "kl": 0.7068328857421875, "learning_rate": 5e-07, "logits/chosen": -21809398.4, "logits/rejected": -25030810.666666668, "logps/chosen": -432.25771484375, "logps/rejected": -453.3608805338542, "loss": 0.3176, "rewards/chosen": 0.263606595993042, "rewards/margins": 3.0081903616587318, "rewards/rejected": -2.74458376566569, "step": 3270 }, { "epoch": 0.17337608989478706, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1017235.0, "logits/rejected": -33468237.333333332, "logps/chosen": -184.70152282714844, "logps/rejected": -435.1354166666667, "loss": 0.2099, "rewards/chosen": 0.3140987753868103, "rewards/margins": 2.2581360538800554, "rewards/rejected": -1.9440372784932454, "step": 3271 }, { "epoch": 0.1734290938965892, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43235413.333333336, "logits/rejected": -21655187.2, "logps/chosen": -516.5865071614584, "logps/rejected": -135.715283203125, "loss": 0.3104, "rewards/chosen": 0.17608439922332764, "rewards/margins": 1.5034371137619018, "rewards/rejected": -1.3273527145385742, "step": 3272 }, { "epoch": 0.17348209789839134, "grad_norm": 53.75, "kl": 0.044525146484375, "learning_rate": 5e-07, "logits/chosen": -92536298.66666667, "logits/rejected": -23737296.0, "logps/chosen": -608.7124837239584, "logps/rejected": -483.4154296875, "loss": 0.251, "rewards/chosen": 0.38587292035420734, "rewards/margins": 2.391664775212606, "rewards/rejected": -2.0057918548583986, "step": 3273 }, { "epoch": 0.17353510190019347, "grad_norm": 39.75, "kl": 0.10899019241333008, "learning_rate": 5e-07, "logits/chosen": -10811811.0, "logits/rejected": -6793226.0, "logps/chosen": -73.94319915771484, "logps/rejected": -248.26043701171875, "loss": 0.2675, "rewards/chosen": 0.2404865324497223, "rewards/margins": 1.7712123294671376, "rewards/rejected": -1.5307257970174153, "step": 3274 }, { "epoch": 0.1735881059019956, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18836897.6, "logits/rejected": -24736442.666666668, "logps/chosen": -241.3966064453125, "logps/rejected": -548.2444661458334, "loss": 0.3804, "rewards/chosen": 0.20327959060668946, "rewards/margins": 1.90071226755778, "rewards/rejected": -1.6974326769510906, "step": 3275 }, { "epoch": 0.17364110990379775, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4883720.0, "logits/rejected": 13051074.285714285, "logps/chosen": -22.479249954223633, "logps/rejected": -510.357421875, "loss": 0.2465, "rewards/chosen": -0.3056648373603821, "rewards/margins": 1.387729959828513, "rewards/rejected": -1.6933947971888952, "step": 3276 }, { "epoch": 0.17369411390559988, "grad_norm": 53.5, "kl": 0.02057647705078125, "learning_rate": 5e-07, "logits/chosen": -52967573.333333336, "logits/rejected": -64596384.0, "logps/chosen": -295.16701253255206, "logps/rejected": -100.75289916992188, "loss": 0.4208, "rewards/chosen": -0.06778082251548767, "rewards/margins": 1.8269483149051666, "rewards/rejected": -1.8947291374206543, "step": 3277 }, { "epoch": 0.17374711790740202, "grad_norm": 57.25, "kl": 0.7556686401367188, "learning_rate": 5e-07, "logits/chosen": -5582898.5, "logits/rejected": 774000.875, "logps/chosen": -333.9580078125, "logps/rejected": -117.47402954101562, "loss": 0.3201, "rewards/chosen": 0.6570242047309875, "rewards/margins": 1.881252110004425, "rewards/rejected": -1.2242279052734375, "step": 3278 }, { "epoch": 0.17380012190920416, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1333498.5, "logits/rejected": -10742656.666666666, "logps/chosen": -38.16636657714844, "logps/rejected": -95.9375, "loss": 0.3144, "rewards/chosen": 0.45726367831230164, "rewards/margins": 1.4671049614747365, "rewards/rejected": -1.0098412831624348, "step": 3279 }, { "epoch": 0.17385312591100627, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5371882.4, "logits/rejected": -28322277.333333332, "logps/chosen": -200.65078125, "logps/rejected": -473.2578125, "loss": 0.4023, "rewards/chosen": -0.33900227546691897, "rewards/margins": 2.5316009680430094, "rewards/rejected": -2.8706032435099282, "step": 3280 }, { "epoch": 0.1739061299128084, "grad_norm": 48.5, "kl": 0.6265945434570312, "learning_rate": 5e-07, "logits/chosen": -26813601.6, "logits/rejected": -27343125.333333332, "logps/chosen": -373.44814453125, "logps/rejected": -445.9795328776042, "loss": 0.3034, "rewards/chosen": 0.4071035385131836, "rewards/margins": 2.7915008862813315, "rewards/rejected": -2.384397347768148, "step": 3281 }, { "epoch": 0.17395913391461054, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16732068.0, "logits/rejected": -40345283.2, "logps/chosen": -194.38203938802084, "logps/rejected": -249.3158203125, "loss": 0.3204, "rewards/chosen": 0.20406997203826904, "rewards/margins": 1.6002979040145875, "rewards/rejected": -1.3962279319763184, "step": 3282 }, { "epoch": 0.17401213791641268, "grad_norm": 45.75, "kl": 0.7977428436279297, "learning_rate": 5e-07, "logits/chosen": -20186110.4, "logits/rejected": -25511578.666666668, "logps/chosen": -174.5548095703125, "logps/rejected": -105.82069905598958, "loss": 0.4198, "rewards/chosen": 0.2527630805969238, "rewards/margins": 1.441158978144328, "rewards/rejected": -1.188395897547404, "step": 3283 }, { "epoch": 0.17406514191821482, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39720475.428571425, "logits/rejected": -14323326.0, "logps/chosen": -307.06563895089283, "logps/rejected": -86.35450744628906, "loss": 0.499, "rewards/chosen": -0.05876257164137704, "rewards/margins": 0.2848897661481585, "rewards/rejected": -0.3436523377895355, "step": 3284 }, { "epoch": 0.17411814592001695, "grad_norm": 63.75, "kl": 0.7523269653320312, "learning_rate": 5e-07, "logits/chosen": -12950731.2, "logits/rejected": -17934821.333333332, "logps/chosen": -464.158984375, "logps/rejected": -143.61648559570312, "loss": 0.2697, "rewards/chosen": 0.93507080078125, "rewards/margins": 2.4908071835835774, "rewards/rejected": -1.5557363828023274, "step": 3285 }, { "epoch": 0.1741711499218191, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6391587.333333333, "logits/rejected": -26140712.0, "logps/chosen": -73.0775146484375, "logps/rejected": -435.494580078125, "loss": 0.308, "rewards/chosen": -0.40153956413269043, "rewards/margins": 1.6018625736236571, "rewards/rejected": -2.0034021377563476, "step": 3286 }, { "epoch": 0.17422415392362123, "grad_norm": 44.0, "kl": 0.1139984130859375, "learning_rate": 5e-07, "logits/chosen": -31741424.0, "logits/rejected": -16202502.0, "logps/chosen": -211.95013427734375, "logps/rejected": -178.3785400390625, "loss": 0.2876, "rewards/chosen": 0.12090854346752167, "rewards/margins": 2.3594546765089035, "rewards/rejected": -2.238546133041382, "step": 3287 }, { "epoch": 0.17427715792542336, "grad_norm": 46.75, "kl": 0.30228424072265625, "learning_rate": 5e-07, "logits/chosen": -35539846.4, "logits/rejected": -27719040.0, "logps/chosen": -392.04208984375, "logps/rejected": -299.51491292317706, "loss": 0.351, "rewards/chosen": 0.43658785820007323, "rewards/margins": 1.8254485607147217, "rewards/rejected": -1.3888607025146484, "step": 3288 }, { "epoch": 0.1743301619272255, "grad_norm": 51.75, "kl": 0.39202880859375, "learning_rate": 5e-07, "logits/chosen": -32859081.6, "logits/rejected": -31054864.0, "logps/chosen": -342.1043212890625, "logps/rejected": -334.4783935546875, "loss": 0.3962, "rewards/chosen": -0.12032676935195923, "rewards/margins": 1.6124074339866639, "rewards/rejected": -1.732734203338623, "step": 3289 }, { "epoch": 0.17438316592902764, "grad_norm": 53.75, "kl": 0.6264972686767578, "learning_rate": 5e-07, "logits/chosen": -36165453.71428572, "logits/rejected": -99462032.0, "logps/chosen": -333.36226981026783, "logps/rejected": -418.8465576171875, "loss": 0.3831, "rewards/chosen": 0.3533509799412319, "rewards/margins": 3.5289522239140103, "rewards/rejected": -3.1756012439727783, "step": 3290 }, { "epoch": 0.17443616993082978, "grad_norm": 67.5, "kl": 0.30547475814819336, "learning_rate": 5e-07, "logits/chosen": 14438710.0, "logits/rejected": 706569.875, "logps/chosen": -436.218017578125, "logps/rejected": -373.3527526855469, "loss": 0.3464, "rewards/chosen": 0.19453535974025726, "rewards/margins": 1.5736633390188217, "rewards/rejected": -1.3791279792785645, "step": 3291 }, { "epoch": 0.1744891739326319, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21268422.666666668, "logits/rejected": -52405248.0, "logps/chosen": -118.65458170572917, "logps/rejected": -452.453271484375, "loss": 0.24, "rewards/chosen": 0.3785494565963745, "rewards/margins": 2.3712002515792845, "rewards/rejected": -1.9926507949829102, "step": 3292 }, { "epoch": 0.17454217793443405, "grad_norm": 55.75, "kl": 0.2080078125, "learning_rate": 5e-07, "logits/chosen": -34816248.0, "logits/rejected": -1959064.75, "logps/chosen": -479.0542907714844, "logps/rejected": -50.102535247802734, "loss": 0.3418, "rewards/chosen": 0.5583213567733765, "rewards/margins": 1.3820594549179077, "rewards/rejected": -0.8237380981445312, "step": 3293 }, { "epoch": 0.1745951819362362, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65402120.0, "logits/rejected": -20394102.0, "logps/chosen": -431.0206604003906, "logps/rejected": -160.68603515625, "loss": 0.3585, "rewards/chosen": 0.13105371594429016, "rewards/margins": 1.2679010927677155, "rewards/rejected": -1.1368473768234253, "step": 3294 }, { "epoch": 0.17464818593803833, "grad_norm": 22.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6617993.0, "logits/rejected": -27432948.57142857, "logps/chosen": -105.67035675048828, "logps/rejected": -367.59608677455356, "loss": 0.1914, "rewards/chosen": -1.443304419517517, "rewards/margins": 1.332509364400591, "rewards/rejected": -2.775813783918108, "step": 3295 }, { "epoch": 0.17470118993984046, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15947790.4, "logits/rejected": -64439776.0, "logps/chosen": -178.2893798828125, "logps/rejected": -600.3123779296875, "loss": 0.3151, "rewards/chosen": 0.25068106651306155, "rewards/margins": 2.590136194229126, "rewards/rejected": -2.3394551277160645, "step": 3296 }, { "epoch": 0.1747541939416426, "grad_norm": 51.75, "kl": 0.2677001953125, "learning_rate": 5e-07, "logits/chosen": -24305043.2, "logits/rejected": -7028021.333333333, "logps/chosen": -200.1559326171875, "logps/rejected": -312.389892578125, "loss": 0.3407, "rewards/chosen": 0.22771978378295898, "rewards/margins": 1.8667599360148113, "rewards/rejected": -1.6390401522318523, "step": 3297 }, { "epoch": 0.17480719794344474, "grad_norm": 46.0, "kl": 0.9171819686889648, "learning_rate": 5e-07, "logits/chosen": -40146204.0, "logits/rejected": -35706352.0, "logps/chosen": -217.6431121826172, "logps/rejected": -316.3396911621094, "loss": 0.329, "rewards/chosen": 0.18048328161239624, "rewards/margins": 1.7290398478507996, "rewards/rejected": -1.5485565662384033, "step": 3298 }, { "epoch": 0.17486020194524687, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19733354.666666668, "logits/rejected": -46814694.4, "logps/chosen": -190.41182454427084, "logps/rejected": -364.050634765625, "loss": 0.2492, "rewards/chosen": 0.022615691026051838, "rewards/margins": 2.4916027744611107, "rewards/rejected": -2.4689870834350587, "step": 3299 }, { "epoch": 0.174913205947049, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28290317.333333332, "logits/rejected": -34467449.6, "logps/chosen": -287.7510579427083, "logps/rejected": -458.694970703125, "loss": 0.2336, "rewards/chosen": 0.3395024538040161, "rewards/margins": 2.507257008552551, "rewards/rejected": -2.167754554748535, "step": 3300 }, { "epoch": 0.17496620994885115, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101414822.4, "logits/rejected": -8995021.333333334, "logps/chosen": -455.36513671875, "logps/rejected": -225.58256022135416, "loss": 0.3639, "rewards/chosen": 0.17031677961349487, "rewards/margins": 1.5783256729443866, "rewards/rejected": -1.4080088933308919, "step": 3301 }, { "epoch": 0.17501921395065329, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26017382.0, "logits/rejected": -29735312.0, "logps/chosen": -270.7447509765625, "logps/rejected": -400.3629150390625, "loss": 0.3786, "rewards/chosen": -0.057669080793857574, "rewards/margins": 1.3439885005354881, "rewards/rejected": -1.4016575813293457, "step": 3302 }, { "epoch": 0.17507221795245542, "grad_norm": 50.5, "kl": 0.12987756729125977, "learning_rate": 5e-07, "logits/chosen": -35813200.0, "logits/rejected": 175234640.0, "logps/chosen": -236.69232177734375, "logps/rejected": -310.15802001953125, "loss": 0.3146, "rewards/chosen": -0.0013360083103179932, "rewards/margins": 2.0849452912807465, "rewards/rejected": -2.0862812995910645, "step": 3303 }, { "epoch": 0.17512522195425756, "grad_norm": 76.5, "kl": 0.2761116027832031, "learning_rate": 5e-07, "logits/chosen": -18154794.0, "logits/rejected": -33003960.0, "logps/chosen": -439.3584899902344, "logps/rejected": -322.534912109375, "loss": 0.2237, "rewards/chosen": 1.2249267101287842, "rewards/margins": 2.6359182198842364, "rewards/rejected": -1.4109915097554524, "step": 3304 }, { "epoch": 0.17517822595605967, "grad_norm": 38.0, "kl": 0.8981332778930664, "learning_rate": 5e-07, "logits/chosen": 4692590.8, "logits/rejected": -24582536.0, "logps/chosen": -98.22020874023437, "logps/rejected": -279.30710856119794, "loss": 0.3934, "rewards/chosen": 0.06823535561561585, "rewards/margins": 1.437585312128067, "rewards/rejected": -1.3693499565124512, "step": 3305 }, { "epoch": 0.1752312299578618, "grad_norm": 48.75, "kl": 0.5743408203125, "learning_rate": 5e-07, "logits/chosen": -18750960.0, "logits/rejected": -38123336.0, "logps/chosen": -350.754736328125, "logps/rejected": -459.7130533854167, "loss": 0.2633, "rewards/chosen": 0.5733999729156494, "rewards/margins": 3.5700319131215412, "rewards/rejected": -2.996631940205892, "step": 3306 }, { "epoch": 0.17528423395966394, "grad_norm": 51.75, "kl": 0.7397499084472656, "learning_rate": 5e-07, "logits/chosen": -25864696.0, "logits/rejected": -4750660.666666667, "logps/chosen": -212.5447021484375, "logps/rejected": -130.4835001627604, "loss": 0.3483, "rewards/chosen": 0.3318509578704834, "rewards/margins": 1.8529033184051513, "rewards/rejected": -1.521052360534668, "step": 3307 }, { "epoch": 0.17533723796146608, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54246444.8, "logits/rejected": -5690643.333333333, "logps/chosen": -521.731640625, "logps/rejected": -127.76472981770833, "loss": 0.4141, "rewards/chosen": 0.12579132318496705, "rewards/margins": 0.8958093444506328, "rewards/rejected": -0.7700180212656657, "step": 3308 }, { "epoch": 0.17539024196326822, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1659092.625, "logits/rejected": -55064328.0, "logps/chosen": -111.73609924316406, "logps/rejected": -396.48394775390625, "loss": 0.3287, "rewards/chosen": -0.08652453869581223, "rewards/margins": 1.9564458802342415, "rewards/rejected": -2.0429704189300537, "step": 3309 }, { "epoch": 0.17544324596507035, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48234997.333333336, "logits/rejected": -28953971.2, "logps/chosen": -360.4060465494792, "logps/rejected": -328.4212890625, "loss": 0.2988, "rewards/chosen": -0.046062727769215904, "rewards/margins": 1.8512158672014873, "rewards/rejected": -1.8972785949707032, "step": 3310 }, { "epoch": 0.1754962499668725, "grad_norm": 47.25, "kl": 1.1935501098632812, "learning_rate": 5e-07, "logits/chosen": -7325273.0, "logits/rejected": -26174242.0, "logps/chosen": -240.17388916015625, "logps/rejected": -423.0068359375, "loss": 0.2938, "rewards/chosen": 0.7689605951309204, "rewards/margins": 2.2964223623275757, "rewards/rejected": -1.5274617671966553, "step": 3311 }, { "epoch": 0.17554925396867463, "grad_norm": 69.0, "kl": 0.3757362365722656, "learning_rate": 5e-07, "logits/chosen": -41446008.0, "logits/rejected": -10886213.0, "logps/chosen": -221.20735677083334, "logps/rejected": -250.40586853027344, "loss": 0.3646, "rewards/chosen": 0.45529528458913165, "rewards/margins": 1.6711499293645222, "rewards/rejected": -1.2158546447753906, "step": 3312 }, { "epoch": 0.17560225797047677, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8076953.5, "logits/rejected": -9603718.0, "logps/chosen": -187.52041625976562, "logps/rejected": -263.75384521484375, "loss": 0.3057, "rewards/chosen": -0.3062378168106079, "rewards/margins": 1.3087610006332397, "rewards/rejected": -1.6149988174438477, "step": 3313 }, { "epoch": 0.1756552619722789, "grad_norm": 60.75, "kl": 2.1491870880126953, "learning_rate": 5e-07, "logits/chosen": 42091829.333333336, "logits/rejected": -9251620.0, "logps/chosen": -697.8234049479166, "logps/rejected": -344.7714111328125, "loss": 0.265, "rewards/chosen": 0.7746704419453939, "rewards/margins": 2.3622496922810874, "rewards/rejected": -1.5875792503356934, "step": 3314 }, { "epoch": 0.17570826597408104, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18411862.0, "logits/rejected": -39913512.0, "logps/chosen": -107.28889465332031, "logps/rejected": -293.0491027832031, "loss": 0.3457, "rewards/chosen": -0.26093974709510803, "rewards/margins": 1.9123891294002533, "rewards/rejected": -2.1733288764953613, "step": 3315 }, { "epoch": 0.17576126997588318, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46986101.333333336, "logits/rejected": -13732888.0, "logps/chosen": -332.30128987630206, "logps/rejected": -212.099365234375, "loss": 0.2985, "rewards/chosen": -0.20058506727218628, "rewards/margins": 1.7560986638069154, "rewards/rejected": -1.9566837310791017, "step": 3316 }, { "epoch": 0.17581427397768531, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2952680.6, "logits/rejected": -27245922.666666668, "logps/chosen": -189.5516357421875, "logps/rejected": -362.8014729817708, "loss": 0.3858, "rewards/chosen": 0.18580760955810546, "rewards/margins": 1.3199132919311523, "rewards/rejected": -1.1341056823730469, "step": 3317 }, { "epoch": 0.17586727797948745, "grad_norm": 52.25, "kl": 0.5837745666503906, "learning_rate": 5e-07, "logits/chosen": -18385176.0, "logits/rejected": -31539084.0, "logps/chosen": -357.99151611328125, "logps/rejected": -213.6237030029297, "loss": 0.329, "rewards/chosen": 0.343843549489975, "rewards/margins": 1.7310806214809418, "rewards/rejected": -1.3872370719909668, "step": 3318 }, { "epoch": 0.1759202819812896, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25773283.2, "logits/rejected": -23558794.666666668, "logps/chosen": -292.117626953125, "logps/rejected": -210.7007853190104, "loss": 0.3465, "rewards/chosen": 0.19917985200881957, "rewards/margins": 1.9983226895332336, "rewards/rejected": -1.799142837524414, "step": 3319 }, { "epoch": 0.17597328598309173, "grad_norm": 61.75, "kl": 0.9437141418457031, "learning_rate": 5e-07, "logits/chosen": -36471144.0, "logits/rejected": -9869400.0, "logps/chosen": -785.597412109375, "logps/rejected": -149.8971710205078, "loss": 0.2678, "rewards/chosen": 0.9251044988632202, "rewards/margins": 2.2130602598190308, "rewards/rejected": -1.2879557609558105, "step": 3320 }, { "epoch": 0.17602628998489386, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21126446.0, "logits/rejected": -23634444.0, "logps/chosen": -245.9805450439453, "logps/rejected": -573.9705200195312, "loss": 0.3121, "rewards/chosen": -0.0807342529296875, "rewards/margins": 2.400630235671997, "rewards/rejected": -2.4813644886016846, "step": 3321 }, { "epoch": 0.176079293986696, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23261678.0, "logits/rejected": -3167691.0, "logps/chosen": -327.340576171875, "logps/rejected": -386.97796630859375, "loss": 0.3003, "rewards/chosen": 0.3125818371772766, "rewards/margins": 2.219918429851532, "rewards/rejected": -1.9073365926742554, "step": 3322 }, { "epoch": 0.17613229798849814, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32454165.333333332, "logits/rejected": -26881116.8, "logps/chosen": -297.7237548828125, "logps/rejected": -154.14306640625, "loss": 0.325, "rewards/chosen": 0.053729633490244545, "rewards/margins": 1.3797319451967875, "rewards/rejected": -1.326002311706543, "step": 3323 }, { "epoch": 0.17618530199030027, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7842293.333333333, "logits/rejected": -12964980.8, "logps/chosen": -703.7403971354166, "logps/rejected": -262.46201171875, "loss": 0.2688, "rewards/chosen": 0.06101177136103312, "rewards/margins": 2.0960505286852515, "rewards/rejected": -2.0350387573242186, "step": 3324 }, { "epoch": 0.1762383059921024, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24777466.666666668, "logits/rejected": -24409396.0, "logps/chosen": -178.75602213541666, "logps/rejected": -356.16204833984375, "loss": 0.4273, "rewards/chosen": -0.18889602025349936, "rewards/margins": 2.5469042460123696, "rewards/rejected": -2.735800266265869, "step": 3325 }, { "epoch": 0.17629130999390455, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20699298.0, "logits/rejected": -12317460.0, "logps/chosen": -268.2172546386719, "logps/rejected": -86.22537231445312, "loss": 0.2891, "rewards/chosen": 0.7000325322151184, "rewards/margins": 1.941177785396576, "rewards/rejected": -1.2411452531814575, "step": 3326 }, { "epoch": 0.1763443139957067, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62833552.0, "logits/rejected": -51963136.0, "logps/chosen": -435.9909973144531, "logps/rejected": -279.87359619140625, "loss": 0.2642, "rewards/chosen": 0.26157093048095703, "rewards/margins": 2.6654086112976074, "rewards/rejected": -2.4038376808166504, "step": 3327 }, { "epoch": 0.17639731799750882, "grad_norm": 54.75, "kl": 1.0091743469238281, "learning_rate": 5e-07, "logits/chosen": -10239557.714285715, "logits/rejected": -507296.6875, "logps/chosen": -225.41788155691964, "logps/rejected": -51.39362335205078, "loss": 0.4618, "rewards/chosen": 0.06163663523537772, "rewards/margins": 0.9934398021016803, "rewards/rejected": -0.9318031668663025, "step": 3328 }, { "epoch": 0.17645032199931096, "grad_norm": 45.5, "kl": 0.5017261505126953, "learning_rate": 5e-07, "logits/chosen": -27696437.333333332, "logits/rejected": -58976665.6, "logps/chosen": -306.785400390625, "logps/rejected": -351.88837890625, "loss": 0.305, "rewards/chosen": 0.0767139991124471, "rewards/margins": 1.9285265525182087, "rewards/rejected": -1.8518125534057617, "step": 3329 }, { "epoch": 0.1765033260011131, "grad_norm": 59.5, "kl": 0.5599174499511719, "learning_rate": 5e-07, "logits/chosen": -12693354.0, "logits/rejected": -36837080.0, "logps/chosen": -320.18798828125, "logps/rejected": -751.8670654296875, "loss": 0.2875, "rewards/chosen": 0.19015580415725708, "rewards/margins": 3.358021557331085, "rewards/rejected": -3.167865753173828, "step": 3330 }, { "epoch": 0.1765563300029152, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3376153.75, "logits/rejected": -3082505.5, "logps/chosen": -340.6154479980469, "logps/rejected": -317.2935791015625, "loss": 0.3197, "rewards/chosen": 0.03330719470977783, "rewards/margins": 2.0259543657302856, "rewards/rejected": -1.9926471710205078, "step": 3331 }, { "epoch": 0.17660933400471734, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 35818384.0, "logits/rejected": -62541868.8, "logps/chosen": -358.8333333333333, "logps/rejected": -283.0475830078125, "loss": 0.335, "rewards/chosen": -0.012301127115885416, "rewards/margins": 1.5065880139668781, "rewards/rejected": -1.5188891410827636, "step": 3332 }, { "epoch": 0.17666233800651948, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7505727.5, "logits/rejected": 5868196.0, "logps/chosen": -150.09744262695312, "logps/rejected": -157.59908040364584, "loss": 0.3248, "rewards/chosen": 0.37771034240722656, "rewards/margins": 1.3146124680836997, "rewards/rejected": -0.936902125676473, "step": 3333 }, { "epoch": 0.17671534200832162, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51222324.0, "logits/rejected": -17374512.0, "logps/chosen": -627.16796875, "logps/rejected": -376.3761393229167, "loss": 0.2776, "rewards/chosen": 0.2678634524345398, "rewards/margins": 1.8786473472913106, "rewards/rejected": -1.6107838948567708, "step": 3334 }, { "epoch": 0.17676834601012376, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31628192.0, "logits/rejected": -11815919.2, "logps/chosen": -172.3285929361979, "logps/rejected": -128.0777587890625, "loss": 0.3236, "rewards/chosen": -0.21740086873372397, "rewards/margins": 1.6575119654337567, "rewards/rejected": -1.8749128341674806, "step": 3335 }, { "epoch": 0.1768213500119259, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14954551.0, "logits/rejected": -17770630.0, "logps/chosen": -168.89581298828125, "logps/rejected": -454.7952575683594, "loss": 0.4025, "rewards/chosen": -0.19023609161376953, "rewards/margins": 1.1511139869689941, "rewards/rejected": -1.3413500785827637, "step": 3336 }, { "epoch": 0.17687435401372803, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -13335434.0, "logps/rejected": -270.84033203125, "loss": 0.1843, "rewards/rejected": -1.6462937593460083, "step": 3337 }, { "epoch": 0.17692735801553017, "grad_norm": 40.75, "kl": 0.8307199478149414, "learning_rate": 5e-07, "logits/chosen": 58626.8125, "logits/rejected": -19586096.0, "logps/chosen": -111.08062744140625, "logps/rejected": -279.4981384277344, "loss": 0.3209, "rewards/chosen": 0.47104111313819885, "rewards/margins": 2.1026441156864166, "rewards/rejected": -1.6316030025482178, "step": 3338 }, { "epoch": 0.1769803620173323, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32057898.666666668, "logits/rejected": -15166020.8, "logps/chosen": -248.75860595703125, "logps/rejected": -362.51123046875, "loss": 0.2395, "rewards/chosen": 0.570440928141276, "rewards/margins": 2.5095854441324867, "rewards/rejected": -1.9391445159912108, "step": 3339 }, { "epoch": 0.17703336601913444, "grad_norm": 69.5, "kl": 0.21541595458984375, "learning_rate": 5e-07, "logits/chosen": -53525200.0, "logits/rejected": -11049741.333333334, "logps/chosen": -502.813427734375, "logps/rejected": -170.5063680013021, "loss": 0.3131, "rewards/chosen": 0.48273682594299316, "rewards/margins": 2.0074594020843506, "rewards/rejected": -1.5247225761413574, "step": 3340 }, { "epoch": 0.17708637002093658, "grad_norm": 33.5, "kl": 0.060672760009765625, "learning_rate": 5e-07, "logits/chosen": -6908913.0, "logits/rejected": -5631390.0, "logps/chosen": -247.9944305419922, "logps/rejected": -206.72227478027344, "loss": 0.2812, "rewards/chosen": 0.5513136386871338, "rewards/margins": 2.4707210063934326, "rewards/rejected": -1.9194073677062988, "step": 3341 }, { "epoch": 0.17713937402273872, "grad_norm": 60.75, "kl": 0.8810653686523438, "learning_rate": 5e-07, "logits/chosen": -12825190.4, "logits/rejected": -26981461.333333332, "logps/chosen": -224.4411865234375, "logps/rejected": -672.7152506510416, "loss": 0.4123, "rewards/chosen": -0.33685827255249023, "rewards/margins": 2.386101245880127, "rewards/rejected": -2.722959518432617, "step": 3342 }, { "epoch": 0.17719237802454085, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22318534.0, "logits/rejected": -19979354.666666668, "logps/chosen": -489.47760009765625, "logps/rejected": -307.1815999348958, "loss": 0.2383, "rewards/chosen": 0.12141571193933487, "rewards/margins": 2.1716845209399858, "rewards/rejected": -2.050268809000651, "step": 3343 }, { "epoch": 0.177245382026343, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7030601.5, "logits/rejected": -40422016.0, "logps/chosen": -17.039424896240234, "logps/rejected": -415.07986886160717, "loss": 0.2035, "rewards/chosen": 1.3468687534332275, "rewards/margins": 3.057670559201922, "rewards/rejected": -1.7108018057686942, "step": 3344 }, { "epoch": 0.17729838602814513, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10368284.666666666, "logits/rejected": -3157866.0, "logps/chosen": -114.90812174479167, "logps/rejected": -107.867431640625, "loss": 0.35, "rewards/chosen": -0.07797590891520183, "rewards/margins": 1.087207539876302, "rewards/rejected": -1.1651834487915038, "step": 3345 }, { "epoch": 0.17735139002994726, "grad_norm": 48.0, "kl": 0.4225006103515625, "learning_rate": 5e-07, "logits/chosen": -6697513.0, "logits/rejected": -21800816.0, "logps/chosen": -459.28387451171875, "logps/rejected": -475.368408203125, "loss": 0.1897, "rewards/chosen": 1.009497046470642, "rewards/margins": 2.8655998309453325, "rewards/rejected": -1.8561027844746907, "step": 3346 }, { "epoch": 0.1774043940317494, "grad_norm": 40.5, "kl": 0.07400894165039062, "learning_rate": 5e-07, "logits/chosen": -14383110.4, "logits/rejected": -22436640.0, "logps/chosen": -221.81064453125, "logps/rejected": -192.21258544921875, "loss": 0.3178, "rewards/chosen": 0.16968445777893065, "rewards/margins": 2.668819793065389, "rewards/rejected": -2.4991353352864585, "step": 3347 }, { "epoch": 0.17745739803355154, "grad_norm": 60.5, "kl": 0.1792449951171875, "learning_rate": 5e-07, "logits/chosen": -30946342.0, "logits/rejected": -51992360.0, "logps/chosen": -517.494384765625, "logps/rejected": -206.49424743652344, "loss": 0.3146, "rewards/chosen": 0.48485565185546875, "rewards/margins": 1.7512381076812744, "rewards/rejected": -1.2663824558258057, "step": 3348 }, { "epoch": 0.17751040203535368, "grad_norm": 54.25, "kl": 1.7431869506835938, "learning_rate": 5e-07, "logits/chosen": -12060097.333333334, "logits/rejected": 1855096.75, "logps/chosen": -255.29296875, "logps/rejected": -119.77661895751953, "loss": 0.4612, "rewards/chosen": 0.11724484960238139, "rewards/margins": 1.222637305657069, "rewards/rejected": -1.1053924560546875, "step": 3349 }, { "epoch": 0.1775634060371558, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33940985.6, "logits/rejected": -5760085.333333333, "logps/chosen": -226.5893310546875, "logps/rejected": -222.17647298177084, "loss": 0.4197, "rewards/chosen": -0.09698790311813354, "rewards/margins": 1.3704469799995422, "rewards/rejected": -1.4674348831176758, "step": 3350 }, { "epoch": 0.17761641003895795, "grad_norm": 53.5, "kl": 0.20312118530273438, "learning_rate": 5e-07, "logits/chosen": -12998090.0, "logits/rejected": -16006322.0, "logps/chosen": -246.71290588378906, "logps/rejected": -300.77142333984375, "loss": 0.3557, "rewards/chosen": 0.10171985626220703, "rewards/margins": 1.3124281167984009, "rewards/rejected": -1.2107082605361938, "step": 3351 }, { "epoch": 0.1776694140407601, "grad_norm": 53.0, "kl": 0.31321144104003906, "learning_rate": 5e-07, "logits/chosen": 1336568.8, "logits/rejected": 4612709.333333333, "logps/chosen": -208.85576171875, "logps/rejected": -256.3916829427083, "loss": 0.3298, "rewards/chosen": 0.49691152572631836, "rewards/margins": 2.065436204274495, "rewards/rejected": -1.568524678548177, "step": 3352 }, { "epoch": 0.17772241804256222, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4866693.333333333, "logits/rejected": -38679884.8, "logps/chosen": -47.06444295247396, "logps/rejected": -253.0044921875, "loss": 0.2057, "rewards/chosen": 0.7002477645874023, "rewards/margins": 2.843665885925293, "rewards/rejected": -2.1434181213378904, "step": 3353 }, { "epoch": 0.17777542204436436, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40381026.666666664, "logits/rejected": -11770428.8, "logps/chosen": -246.85052490234375, "logps/rejected": -174.97802734375, "loss": 0.2872, "rewards/chosen": 0.1400092045466105, "rewards/margins": 1.7172313610712688, "rewards/rejected": -1.5772221565246582, "step": 3354 }, { "epoch": 0.1778284260461665, "grad_norm": 54.25, "kl": 1.1647834777832031, "learning_rate": 5e-07, "logits/chosen": -39684720.0, "logits/rejected": -4688259.5, "logps/chosen": -294.86944580078125, "logps/rejected": -180.39346313476562, "loss": 0.3076, "rewards/chosen": 0.682725191116333, "rewards/margins": 2.273568868637085, "rewards/rejected": -1.590843677520752, "step": 3355 }, { "epoch": 0.1778814300479686, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4159814.0, "logits/rejected": -14579578.666666666, "logps/chosen": -272.929052734375, "logps/rejected": -224.9555460611979, "loss": 0.3501, "rewards/chosen": 0.11536872386932373, "rewards/margins": 2.051115234692891, "rewards/rejected": -1.9357465108235676, "step": 3356 }, { "epoch": 0.17793443404977075, "grad_norm": 54.5, "kl": 0.38552093505859375, "learning_rate": 5e-07, "logits/chosen": -16202099.42857143, "logits/rejected": 4869808.0, "logps/chosen": -248.05215890066964, "logps/rejected": -55.295127868652344, "loss": 0.411, "rewards/chosen": 0.4510202407836914, "rewards/margins": 0.6008028090000153, "rewards/rejected": -0.14978256821632385, "step": 3357 }, { "epoch": 0.17798743805157288, "grad_norm": 65.5, "kl": 0.8520889282226562, "learning_rate": 5e-07, "logits/chosen": -29019292.0, "logits/rejected": -9271812.0, "logps/chosen": -350.269775390625, "logps/rejected": -376.576904296875, "loss": 0.3438, "rewards/chosen": 0.21529807150363922, "rewards/margins": 1.6839896589517593, "rewards/rejected": -1.4686915874481201, "step": 3358 }, { "epoch": 0.17804044205337502, "grad_norm": 89.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65375045.333333336, "logits/rejected": -25361868.8, "logps/chosen": -1222.7080891927083, "logps/rejected": -337.8151123046875, "loss": 0.2584, "rewards/chosen": 0.1684168577194214, "rewards/margins": 2.1174437284469603, "rewards/rejected": -1.949026870727539, "step": 3359 }, { "epoch": 0.17809344605517716, "grad_norm": 87.0, "kl": 0.7823562622070312, "learning_rate": 5e-07, "logits/chosen": 4480341.0, "logits/rejected": -1326839.0, "logps/chosen": -541.9494018554688, "logps/rejected": -397.812744140625, "loss": 0.3148, "rewards/chosen": 0.7676080465316772, "rewards/margins": 1.905826210975647, "rewards/rejected": -1.1382181644439697, "step": 3360 }, { "epoch": 0.1781464500569793, "grad_norm": 74.5, "kl": 1.1753034591674805, "learning_rate": 5e-07, "logits/chosen": -8306476.666666667, "logits/rejected": 9343895.2, "logps/chosen": -667.0286865234375, "logps/rejected": -392.7201904296875, "loss": 0.2903, "rewards/chosen": 0.47218286991119385, "rewards/margins": 1.8023983240127563, "rewards/rejected": -1.3302154541015625, "step": 3361 }, { "epoch": 0.17819945405878143, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55465576.0, "logits/rejected": -10208526.0, "logps/chosen": -735.4864501953125, "logps/rejected": -328.236083984375, "loss": 0.2854, "rewards/chosen": 0.496027410030365, "rewards/margins": 2.078214108943939, "rewards/rejected": -1.5821866989135742, "step": 3362 }, { "epoch": 0.17825245806058357, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88238037.33333333, "logits/rejected": 22138676.8, "logps/chosen": -453.150634765625, "logps/rejected": -275.20732421875, "loss": 0.3568, "rewards/chosen": 0.10959168275197347, "rewards/margins": 1.5432232936223347, "rewards/rejected": -1.4336316108703613, "step": 3363 }, { "epoch": 0.1783054620623857, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12091777.0, "logits/rejected": -13337469.333333334, "logps/chosen": -175.25987243652344, "logps/rejected": -335.3193766276042, "loss": 0.292, "rewards/chosen": -0.12757617235183716, "rewards/margins": 1.7979351878166199, "rewards/rejected": -1.925511360168457, "step": 3364 }, { "epoch": 0.17835846606418784, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41434512.0, "logits/rejected": -37801229.333333336, "logps/chosen": -348.47027587890625, "logps/rejected": -331.4365234375, "loss": 0.2573, "rewards/chosen": -0.5448654294013977, "rewards/margins": 1.7799167831738791, "rewards/rejected": -2.324782212575277, "step": 3365 }, { "epoch": 0.17841147006598998, "grad_norm": 59.0, "kl": 0.43314361572265625, "learning_rate": 5e-07, "logits/chosen": -63929648.0, "logits/rejected": -45749092.0, "logps/chosen": -366.26318359375, "logps/rejected": -650.3787841796875, "loss": 0.3944, "rewards/chosen": 0.13677467902501425, "rewards/margins": 2.183634420235952, "rewards/rejected": -2.0468597412109375, "step": 3366 }, { "epoch": 0.17846447406779212, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25540956.8, "logits/rejected": -34764325.333333336, "logps/chosen": -205.73984375, "logps/rejected": -236.51627604166666, "loss": 0.4017, "rewards/chosen": -0.05161375999450683, "rewards/margins": 1.309275992711385, "rewards/rejected": -1.3608897527058919, "step": 3367 }, { "epoch": 0.17851747806959425, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44591244.0, "logits/rejected": -6104002.0, "logps/chosen": -294.9235534667969, "logps/rejected": -106.75178527832031, "loss": 0.3103, "rewards/chosen": 0.44331875443458557, "rewards/margins": 1.8078040182590485, "rewards/rejected": -1.364485263824463, "step": 3368 }, { "epoch": 0.1785704820713964, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12875894.0, "logits/rejected": -86910808.0, "logps/chosen": -139.46250915527344, "logps/rejected": -423.9720153808594, "loss": 0.3562, "rewards/chosen": -0.5097282528877258, "rewards/margins": 1.739496648311615, "rewards/rejected": -2.249224901199341, "step": 3369 }, { "epoch": 0.17862348607319853, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1368352.6666666667, "logits/rejected": -42096988.8, "logps/chosen": -154.40746053059897, "logps/rejected": -426.53994140625, "loss": 0.2283, "rewards/chosen": 0.6443827152252197, "rewards/margins": 2.9133701801300047, "rewards/rejected": -2.268987464904785, "step": 3370 }, { "epoch": 0.17867649007500067, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43199052.8, "logits/rejected": -17554440.0, "logps/chosen": -265.687353515625, "logps/rejected": -384.3055013020833, "loss": 0.3535, "rewards/chosen": 0.18304145336151123, "rewards/margins": 1.8271899620691936, "rewards/rejected": -1.6441485087076824, "step": 3371 }, { "epoch": 0.1787294940768028, "grad_norm": 54.25, "kl": 0.4729461669921875, "learning_rate": 5e-07, "logits/chosen": -12483648.8, "logits/rejected": -42350925.333333336, "logps/chosen": -186.4662109375, "logps/rejected": -314.69504801432294, "loss": 0.4156, "rewards/chosen": -0.3218820333480835, "rewards/margins": 1.5595846732457477, "rewards/rejected": -1.8814667065938313, "step": 3372 }, { "epoch": 0.17878249807860494, "grad_norm": 65.5, "kl": 1.616830825805664, "learning_rate": 5e-07, "logits/chosen": -16930572.0, "logits/rejected": -7776653.5, "logps/chosen": -575.9638264973959, "logps/rejected": -340.4740905761719, "loss": 0.3226, "rewards/chosen": 0.7229953606923422, "rewards/margins": 2.5432453950246177, "rewards/rejected": -1.8202500343322754, "step": 3373 }, { "epoch": 0.17883550208040708, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10157274.0, "logits/rejected": -68810931.2, "logps/chosen": -162.6670125325521, "logps/rejected": -451.22197265625, "loss": 0.2488, "rewards/chosen": 0.10330582658449809, "rewards/margins": 2.7532457451025643, "rewards/rejected": -2.6499399185180663, "step": 3374 }, { "epoch": 0.17888850608220921, "grad_norm": 46.75, "kl": 0.8537578582763672, "learning_rate": 5e-07, "logits/chosen": -413878.8, "logits/rejected": -2406563.1666666665, "logps/chosen": -196.5559326171875, "logps/rejected": -80.78879801432292, "loss": 0.3059, "rewards/chosen": 0.7913867950439453, "rewards/margins": 2.1226921399434406, "rewards/rejected": -1.3313053448994954, "step": 3375 }, { "epoch": 0.17894151008401135, "grad_norm": 49.75, "kl": 1.1920166015625, "learning_rate": 5e-07, "logits/chosen": -17580212.0, "logits/rejected": -11387658.0, "logps/chosen": -339.98712158203125, "logps/rejected": -225.7503662109375, "loss": 0.3576, "rewards/chosen": 0.0036380887031555176, "rewards/margins": 1.8747732043266296, "rewards/rejected": -1.8711351156234741, "step": 3376 }, { "epoch": 0.1789945140858135, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3046301.714285714, "logits/rejected": -41437800.0, "logps/chosen": -178.53862653459822, "logps/rejected": -678.1298217773438, "loss": 0.4206, "rewards/chosen": 0.16467947619301931, "rewards/margins": 2.1048345054898943, "rewards/rejected": -1.940155029296875, "step": 3377 }, { "epoch": 0.17904751808761563, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30398764.8, "logits/rejected": -55578773.333333336, "logps/chosen": -379.1024169921875, "logps/rejected": -316.73828125, "loss": 0.3449, "rewards/chosen": 0.3146139860153198, "rewards/margins": 1.9022274096806844, "rewards/rejected": -1.5876134236653645, "step": 3378 }, { "epoch": 0.17910052208941776, "grad_norm": 41.0, "kl": 0.059871673583984375, "learning_rate": 5e-07, "logits/chosen": -920874.8125, "logits/rejected": -30797826.0, "logps/chosen": -112.0809326171875, "logps/rejected": -290.5027160644531, "loss": 0.3005, "rewards/chosen": 0.4356631338596344, "rewards/margins": 1.8986097872257233, "rewards/rejected": -1.4629466533660889, "step": 3379 }, { "epoch": 0.1791535260912199, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26614362.666666668, "logits/rejected": -20951636.0, "logps/chosen": -235.91914876302084, "logps/rejected": -424.73236083984375, "loss": 0.3649, "rewards/chosen": 0.18972321351369223, "rewards/margins": 2.551374872525533, "rewards/rejected": -2.361651659011841, "step": 3380 }, { "epoch": 0.179206530093022, "grad_norm": 88.0, "kl": 0.7231216430664062, "learning_rate": 5e-07, "logits/chosen": -54673606.4, "logits/rejected": 57803232.0, "logps/chosen": -544.590087890625, "logps/rejected": -348.94384765625, "loss": 0.3232, "rewards/chosen": 0.5864145278930664, "rewards/margins": 2.0075189590454103, "rewards/rejected": -1.4211044311523438, "step": 3381 }, { "epoch": 0.17925953409482415, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22989362.0, "logits/rejected": -70247768.0, "logps/chosen": -404.44342041015625, "logps/rejected": -542.99755859375, "loss": 0.2066, "rewards/chosen": 0.8852630853652954, "rewards/margins": 3.259147047996521, "rewards/rejected": -2.3738839626312256, "step": 3382 }, { "epoch": 0.17931253809662628, "grad_norm": 56.5, "kl": 0.8269748687744141, "learning_rate": 5e-07, "logits/chosen": -13968444.0, "logits/rejected": -73006485.33333333, "logps/chosen": -910.9200439453125, "logps/rejected": -293.2512613932292, "loss": 0.2254, "rewards/chosen": 0.8719436526298523, "rewards/margins": 2.721983293692271, "rewards/rejected": -1.8500396410624187, "step": 3383 }, { "epoch": 0.17936554209842842, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33644188.8, "logits/rejected": -41733413.333333336, "logps/chosen": -313.90546875, "logps/rejected": -444.7047119140625, "loss": 0.3179, "rewards/chosen": 0.21429550647735596, "rewards/margins": 2.4584707816441855, "rewards/rejected": -2.2441752751668296, "step": 3384 }, { "epoch": 0.17941854610023056, "grad_norm": 75.0, "kl": 1.0296134948730469, "learning_rate": 5e-07, "logits/chosen": -33390688.0, "logits/rejected": -21010028.0, "logps/chosen": -457.18251953125, "logps/rejected": -389.4091796875, "loss": 0.3062, "rewards/chosen": 0.5154470443725586, "rewards/margins": 2.6014938990275063, "rewards/rejected": -2.0860468546549478, "step": 3385 }, { "epoch": 0.1794715501020327, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29090869.333333332, "logits/rejected": -71556627.2, "logps/chosen": -227.5246378580729, "logps/rejected": -336.555029296875, "loss": 0.2746, "rewards/chosen": -0.30978671709696454, "rewards/margins": 1.9187533219655355, "rewards/rejected": -2.2285400390625, "step": 3386 }, { "epoch": 0.17952455410383483, "grad_norm": 50.25, "kl": 0.9450817108154297, "learning_rate": 5e-07, "logits/chosen": -10398343.2, "logits/rejected": -17713197.333333332, "logps/chosen": -410.534130859375, "logps/rejected": -356.0947265625, "loss": 0.2934, "rewards/chosen": 0.7291860103607177, "rewards/margins": 2.4691202958424885, "rewards/rejected": -1.7399342854817708, "step": 3387 }, { "epoch": 0.17957755810563697, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11319991.0, "logits/rejected": -33595232.0, "logps/chosen": -103.76055908203125, "logps/rejected": -306.6444498697917, "loss": 0.2799, "rewards/chosen": 0.18965452909469604, "rewards/margins": 1.6261266271273296, "rewards/rejected": -1.4364720980326335, "step": 3388 }, { "epoch": 0.1796305621074391, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27949314.666666668, "logits/rejected": -33896080.0, "logps/chosen": -114.25400797526042, "logps/rejected": -425.17144775390625, "loss": 0.3927, "rewards/chosen": 0.1625648240248362, "rewards/margins": 1.6138938168684642, "rewards/rejected": -1.451328992843628, "step": 3389 }, { "epoch": 0.17968356610924124, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20757160.0, "logits/rejected": -31102357.333333332, "logps/chosen": -270.4844665527344, "logps/rejected": -317.8986002604167, "loss": 0.2554, "rewards/chosen": 0.36394327878952026, "rewards/margins": 1.846487859884898, "rewards/rejected": -1.4825445810953777, "step": 3390 }, { "epoch": 0.17973657011104338, "grad_norm": 57.5, "kl": 0.264801025390625, "learning_rate": 5e-07, "logits/chosen": -10031379.0, "logits/rejected": -84230552.0, "logps/chosen": -257.04290771484375, "logps/rejected": -200.47457885742188, "loss": 0.4128, "rewards/chosen": 0.1353553831577301, "rewards/margins": 0.8288993537425995, "rewards/rejected": -0.6935439705848694, "step": 3391 }, { "epoch": 0.17978957411284552, "grad_norm": 54.75, "kl": 0.5808448791503906, "learning_rate": 5e-07, "logits/chosen": -24858890.0, "logits/rejected": 19414594.0, "logps/chosen": -276.66259765625, "logps/rejected": -256.1123046875, "loss": 0.4752, "rewards/chosen": -0.3876497447490692, "rewards/margins": 0.437673956155777, "rewards/rejected": -0.8253237009048462, "step": 3392 }, { "epoch": 0.17984257811464766, "grad_norm": 58.0, "kl": 0.3365459442138672, "learning_rate": 5e-07, "logits/chosen": -54968144.0, "logits/rejected": -6833734.5, "logps/chosen": -365.62744140625, "logps/rejected": -212.90557861328125, "loss": 0.3438, "rewards/chosen": 0.1418260633945465, "rewards/margins": 1.6439277231693268, "rewards/rejected": -1.5021016597747803, "step": 3393 }, { "epoch": 0.1798955821164498, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63092106.666666664, "logits/rejected": -10216134.4, "logps/chosen": -588.9552408854166, "logps/rejected": -414.197021484375, "loss": 0.2926, "rewards/chosen": 0.27471617857615155, "rewards/margins": 1.9429521481196086, "rewards/rejected": -1.668235969543457, "step": 3394 }, { "epoch": 0.17994858611825193, "grad_norm": 62.75, "kl": 0.41790008544921875, "learning_rate": 5e-07, "logits/chosen": -18934604.0, "logits/rejected": 1298135.5, "logps/chosen": -507.82440185546875, "logps/rejected": -111.53964233398438, "loss": 0.3421, "rewards/chosen": 0.4819905757904053, "rewards/margins": 1.4995661973953247, "rewards/rejected": -1.0175756216049194, "step": 3395 }, { "epoch": 0.18000159012005407, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28071885.714285713, "logits/rejected": -139062032.0, "logps/chosen": -315.43603515625, "logps/rejected": -1477.9876708984375, "loss": 0.359, "rewards/chosen": 0.37379612241472515, "rewards/margins": 6.728861945016043, "rewards/rejected": -6.355065822601318, "step": 3396 }, { "epoch": 0.1800545941218562, "grad_norm": 60.25, "kl": 0.72613525390625, "learning_rate": 5e-07, "logits/chosen": -63934224.0, "logits/rejected": -25579384.0, "logps/chosen": -441.3509928385417, "logps/rejected": -323.0410400390625, "loss": 0.2625, "rewards/chosen": 0.6136576334635416, "rewards/margins": 2.029044500986735, "rewards/rejected": -1.4153868675231933, "step": 3397 }, { "epoch": 0.18010759812365834, "grad_norm": 50.75, "kl": 0.36167335510253906, "learning_rate": 5e-07, "logits/chosen": 18404882.0, "logits/rejected": -24898581.333333332, "logps/chosen": -867.0426635742188, "logps/rejected": -259.37961832682294, "loss": 0.2288, "rewards/chosen": 1.2606945037841797, "rewards/margins": 2.842637538909912, "rewards/rejected": -1.5819430351257324, "step": 3398 }, { "epoch": 0.18016060212546048, "grad_norm": 49.75, "kl": 0.18194818496704102, "learning_rate": 5e-07, "logits/chosen": -36827702.85714286, "logits/rejected": 16639928.0, "logps/chosen": -239.62515694754464, "logps/rejected": -721.721435546875, "loss": 0.3968, "rewards/chosen": 0.269834041595459, "rewards/margins": 2.86921763420105, "rewards/rejected": -2.599383592605591, "step": 3399 }, { "epoch": 0.18021360612726262, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33286762.0, "logits/rejected": -38177128.0, "logps/chosen": -276.60430908203125, "logps/rejected": -400.21331787109375, "loss": 0.3639, "rewards/chosen": -0.4517613649368286, "rewards/margins": 1.5508421659469604, "rewards/rejected": -2.002603530883789, "step": 3400 }, { "epoch": 0.18026661012906475, "grad_norm": 77.0, "kl": 1.4497642517089844, "learning_rate": 5e-07, "logits/chosen": -51303337.6, "logits/rejected": -41448645.333333336, "logps/chosen": -623.471875, "logps/rejected": -436.6716715494792, "loss": 0.3424, "rewards/chosen": 0.5437352657318115, "rewards/margins": 2.7663692633310952, "rewards/rejected": -2.2226339975992837, "step": 3401 }, { "epoch": 0.1803196141308669, "grad_norm": 49.25, "kl": 0.15952301025390625, "learning_rate": 5e-07, "logits/chosen": -11002798.0, "logits/rejected": -43762582.85714286, "logps/chosen": -488.1031188964844, "logps/rejected": -376.23238699776783, "loss": 0.1887, "rewards/chosen": 0.4159393310546875, "rewards/margins": 2.2088279724121094, "rewards/rejected": -1.7928886413574219, "step": 3402 }, { "epoch": 0.18037261813266903, "grad_norm": 62.5, "kl": 0.5557289123535156, "learning_rate": 5e-07, "logits/chosen": -19138422.0, "logits/rejected": -11171771.0, "logps/chosen": -407.59674072265625, "logps/rejected": -208.03453063964844, "loss": 0.3213, "rewards/chosen": 0.584058403968811, "rewards/margins": 1.7447642087936401, "rewards/rejected": -1.160705804824829, "step": 3403 }, { "epoch": 0.18042562213447116, "grad_norm": 78.5, "kl": 0.4489402770996094, "learning_rate": 5e-07, "logits/chosen": 30228021.333333332, "logits/rejected": -7452173.5, "logps/chosen": -490.4222412109375, "logps/rejected": -188.75497436523438, "loss": 0.3267, "rewards/chosen": 0.4608375628789266, "rewards/margins": 2.755693872769674, "rewards/rejected": -2.294856309890747, "step": 3404 }, { "epoch": 0.1804786261362733, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21657082.0, "logits/rejected": -247762.875, "logps/chosen": -243.22059631347656, "logps/rejected": -83.80908203125, "loss": 0.4227, "rewards/chosen": -0.14812473952770233, "rewards/margins": 1.1043554097414017, "rewards/rejected": -1.252480149269104, "step": 3405 }, { "epoch": 0.18053163013807544, "grad_norm": 53.25, "kl": 2.44482421875, "learning_rate": 5e-07, "logits/chosen": -49345872.0, "logits/rejected": -58042768.0, "logps/chosen": -598.026123046875, "logps/rejected": -476.652099609375, "loss": 0.3762, "rewards/chosen": 0.6696121692657471, "rewards/margins": 2.265432119369507, "rewards/rejected": -1.5958199501037598, "step": 3406 }, { "epoch": 0.18058463413987755, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51459930.666666664, "logits/rejected": -21877731.2, "logps/chosen": -347.1361490885417, "logps/rejected": -292.414404296875, "loss": 0.2822, "rewards/chosen": -0.06318715214729309, "rewards/margins": 2.1395178496837617, "rewards/rejected": -2.2027050018310548, "step": 3407 }, { "epoch": 0.18063763814167969, "grad_norm": 51.5, "kl": 0.28576087951660156, "learning_rate": 5e-07, "logits/chosen": -59123082.666666664, "logits/rejected": -3789820.8, "logps/chosen": -168.91280110677084, "logps/rejected": -215.3271484375, "loss": 0.3459, "rewards/chosen": 0.0954335629940033, "rewards/margins": 1.2146247327327728, "rewards/rejected": -1.1191911697387695, "step": 3408 }, { "epoch": 0.18069064214348182, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9675330.666666666, "logits/rejected": -40696528.0, "logps/chosen": -118.166259765625, "logps/rejected": -174.0430908203125, "loss": 0.4614, "rewards/chosen": -0.18922754128774008, "rewards/margins": 1.1207616726557414, "rewards/rejected": -1.3099892139434814, "step": 3409 }, { "epoch": 0.18074364614528396, "grad_norm": 51.75, "kl": 1.4894256591796875, "learning_rate": 5e-07, "logits/chosen": -14964376.0, "logits/rejected": -47337656.0, "logps/chosen": -317.40875244140625, "logps/rejected": -489.9165954589844, "loss": 0.2432, "rewards/chosen": 0.7519500851631165, "rewards/margins": 3.158966362476349, "rewards/rejected": -2.4070162773132324, "step": 3410 }, { "epoch": 0.1807966501470861, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7658699.2, "logits/rejected": -648826.0416666666, "logps/chosen": -130.63306884765626, "logps/rejected": -209.81595865885416, "loss": 0.4073, "rewards/chosen": 0.2861460208892822, "rewards/margins": 0.8781542936960856, "rewards/rejected": -0.5920082728068033, "step": 3411 }, { "epoch": 0.18084965414888823, "grad_norm": 49.75, "kl": 0.6111860275268555, "learning_rate": 5e-07, "logits/chosen": -28528464.0, "logits/rejected": -47220288.0, "logps/chosen": -231.6379150390625, "logps/rejected": -450.7615559895833, "loss": 0.3184, "rewards/chosen": 0.3708514928817749, "rewards/margins": 2.4313308159510294, "rewards/rejected": -2.0604793230692544, "step": 3412 }, { "epoch": 0.18090265815069037, "grad_norm": 45.25, "kl": 0.27822113037109375, "learning_rate": 5e-07, "logits/chosen": -26853138.0, "logits/rejected": -34004588.0, "logps/chosen": -291.09100341796875, "logps/rejected": -189.6666717529297, "loss": 0.3851, "rewards/chosen": -0.2593238949775696, "rewards/margins": 1.2069113850593567, "rewards/rejected": -1.4662352800369263, "step": 3413 }, { "epoch": 0.1809556621524925, "grad_norm": 95.5, "kl": 3.334522247314453, "learning_rate": 5e-07, "logits/chosen": -49024676.571428575, "logits/rejected": -60484384.0, "logps/chosen": -507.84842354910717, "logps/rejected": -466.2687683105469, "loss": 0.4388, "rewards/chosen": 0.4776858261653355, "rewards/margins": 2.01631190095629, "rewards/rejected": -1.5386260747909546, "step": 3414 }, { "epoch": 0.18100866615429465, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38945177.6, "logits/rejected": 109222794.66666667, "logps/chosen": -261.182275390625, "logps/rejected": -351.1547444661458, "loss": 0.4164, "rewards/chosen": -0.317801308631897, "rewards/margins": 1.7350897868474326, "rewards/rejected": -2.0528910954793296, "step": 3415 }, { "epoch": 0.18106167015609678, "grad_norm": 40.0, "kl": 0.059350013732910156, "learning_rate": 5e-07, "logits/chosen": -12777434.0, "logits/rejected": -6222837.142857143, "logps/chosen": -189.50662231445312, "logps/rejected": -261.75697544642856, "loss": 0.3109, "rewards/chosen": -0.5411453247070312, "rewards/margins": 0.6575521741594588, "rewards/rejected": -1.19869749886649, "step": 3416 }, { "epoch": 0.18111467415789892, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7529597.6, "logits/rejected": -42397810.666666664, "logps/chosen": -328.3755859375, "logps/rejected": -655.5163981119791, "loss": 0.2595, "rewards/chosen": 0.8557771682739258, "rewards/margins": 3.7752705256144203, "rewards/rejected": -2.9194933573404946, "step": 3417 }, { "epoch": 0.18116767815970106, "grad_norm": 63.0, "kl": 0.6931915283203125, "learning_rate": 5e-07, "logits/chosen": -30580240.0, "logits/rejected": -61242712.0, "logps/chosen": -384.52500697544644, "logps/rejected": -428.5671691894531, "loss": 0.4455, "rewards/chosen": 0.07459390163421631, "rewards/margins": 2.5040160417556763, "rewards/rejected": -2.42942214012146, "step": 3418 }, { "epoch": 0.1812206821615032, "grad_norm": 70.0, "kl": 2.478710174560547, "learning_rate": 5e-07, "logits/chosen": -32257312.0, "logits/rejected": 2937895.3333333335, "logps/chosen": -669.2888671875, "logps/rejected": -218.60308837890625, "loss": 0.4213, "rewards/chosen": 0.6180553913116456, "rewards/margins": 1.0558282216389974, "rewards/rejected": -0.43777283032735187, "step": 3419 }, { "epoch": 0.18127368616330533, "grad_norm": 40.75, "kl": 0.13202476501464844, "learning_rate": 5e-07, "logits/chosen": -10978139.0, "logits/rejected": -18186480.0, "logps/chosen": -158.21461486816406, "logps/rejected": -239.74090576171875, "loss": 0.3163, "rewards/chosen": 0.17902636528015137, "rewards/margins": 1.787025809288025, "rewards/rejected": -1.6079994440078735, "step": 3420 }, { "epoch": 0.18132669016510747, "grad_norm": 52.0, "kl": 0.6780242919921875, "learning_rate": 5e-07, "logits/chosen": -130797130.66666667, "logits/rejected": -68112345.6, "logps/chosen": -631.711181640625, "logps/rejected": -281.669970703125, "loss": 0.2459, "rewards/chosen": 0.8315969308217367, "rewards/margins": 2.46101663907369, "rewards/rejected": -1.629419708251953, "step": 3421 }, { "epoch": 0.1813796941669096, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8543852.0, "logits/rejected": -13723907.2, "logps/chosen": -382.9155680338542, "logps/rejected": -110.5200927734375, "loss": 0.3268, "rewards/chosen": 0.3539690574010213, "rewards/margins": 1.3719101508458456, "rewards/rejected": -1.0179410934448243, "step": 3422 }, { "epoch": 0.18143269816871174, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44167452.0, "logits/rejected": -22404872.0, "logps/chosen": -632.3914184570312, "logps/rejected": -272.7239074707031, "loss": 0.2788, "rewards/chosen": 0.5819786190986633, "rewards/margins": 2.6490851044654846, "rewards/rejected": -2.0671064853668213, "step": 3423 }, { "epoch": 0.18148570217051388, "grad_norm": 57.5, "kl": 1.3423595428466797, "learning_rate": 5e-07, "logits/chosen": -26111530.666666668, "logits/rejected": -16765495.0, "logps/chosen": -358.1785481770833, "logps/rejected": -194.22494506835938, "loss": 0.4272, "rewards/chosen": 0.14417240023612976, "rewards/margins": 1.6225111186504364, "rewards/rejected": -1.4783387184143066, "step": 3424 }, { "epoch": 0.18153870617231602, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10583720.0, "logits/rejected": -17158284.0, "logps/chosen": -538.8218383789062, "logps/rejected": -183.43212890625, "loss": 0.2819, "rewards/chosen": 0.2538864016532898, "rewards/margins": 1.6411630511283875, "rewards/rejected": -1.3872766494750977, "step": 3425 }, { "epoch": 0.18159171017411815, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5940024.0, "logits/rejected": -49314521.6, "logps/chosen": -13.39621607462565, "logps/rejected": -238.3736083984375, "loss": 0.3605, "rewards/chosen": -0.21653195222218832, "rewards/margins": 1.077947513262431, "rewards/rejected": -1.2944794654846192, "step": 3426 }, { "epoch": 0.1816447141759203, "grad_norm": 59.0, "kl": 0.07567310333251953, "learning_rate": 5e-07, "logits/chosen": -26950732.8, "logits/rejected": -54212197.333333336, "logps/chosen": -277.9674072265625, "logps/rejected": -253.44148763020834, "loss": 0.3799, "rewards/chosen": 0.055408167839050296, "rewards/margins": 1.525106914838155, "rewards/rejected": -1.4696987469991047, "step": 3427 }, { "epoch": 0.18169771817772243, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54327088.0, "logits/rejected": -21902672.0, "logps/chosen": -297.4083251953125, "logps/rejected": -392.876220703125, "loss": 0.2165, "rewards/chosen": 0.1868583709001541, "rewards/margins": 2.4097743382056556, "rewards/rejected": -2.2229159673055015, "step": 3428 }, { "epoch": 0.18175072217952457, "grad_norm": 60.75, "kl": 0.4825897216796875, "learning_rate": 5e-07, "logits/chosen": -13126207.0, "logits/rejected": -52963880.0, "logps/chosen": -494.25732421875, "logps/rejected": -258.167236328125, "loss": 0.3058, "rewards/chosen": 0.6353752017021179, "rewards/margins": 1.8609194159507751, "rewards/rejected": -1.2255442142486572, "step": 3429 }, { "epoch": 0.1818037261813267, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45585724.0, "logits/rejected": -29583608.0, "logps/chosen": -408.5390625, "logps/rejected": -424.77935791015625, "loss": 0.3048, "rewards/chosen": 0.04005032032728195, "rewards/margins": 2.179135613143444, "rewards/rejected": -2.139085292816162, "step": 3430 }, { "epoch": 0.18185673018312884, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16653333.333333334, "logits/rejected": -29241868.8, "logps/chosen": -112.35831705729167, "logps/rejected": -227.103271484375, "loss": 0.3986, "rewards/chosen": 0.1638893187046051, "rewards/margins": 0.7567326605319977, "rewards/rejected": -0.5928433418273926, "step": 3431 }, { "epoch": 0.18190973418493095, "grad_norm": 39.0, "kl": 0.05491781234741211, "learning_rate": 5e-07, "logits/chosen": -44951.458333333336, "logits/rejected": -79697036.8, "logps/chosen": -193.13983154296875, "logps/rejected": -162.64691162109375, "loss": 0.353, "rewards/chosen": 0.30325116713841754, "rewards/margins": 1.2434089620908102, "rewards/rejected": -0.9401577949523926, "step": 3432 }, { "epoch": 0.1819627381867331, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1588830.3333333333, "logits/rejected": -21661849.6, "logps/chosen": -181.61564127604166, "logps/rejected": -319.101513671875, "loss": 0.307, "rewards/chosen": -0.005559672911961873, "rewards/margins": 1.5923136333624524, "rewards/rejected": -1.5978733062744142, "step": 3433 }, { "epoch": 0.18201574218853522, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8514388.0, "logits/rejected": -52242741.333333336, "logps/chosen": -166.114306640625, "logps/rejected": -133.07294718424478, "loss": 0.3439, "rewards/chosen": 0.5924414157867431, "rewards/margins": 1.3813449700673421, "rewards/rejected": -0.788903554280599, "step": 3434 }, { "epoch": 0.18206874619033736, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11040785.333333334, "logits/rejected": -20501764.8, "logps/chosen": -351.0159098307292, "logps/rejected": -280.65849609375, "loss": 0.2238, "rewards/chosen": 0.77434770266215, "rewards/margins": 2.6606048742930093, "rewards/rejected": -1.8862571716308594, "step": 3435 }, { "epoch": 0.1821217501921395, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42223958.4, "logits/rejected": -41327290.666666664, "logps/chosen": -497.262060546875, "logps/rejected": -438.0436197916667, "loss": 0.2956, "rewards/chosen": 0.43758420944213866, "rewards/margins": 2.6530449867248533, "rewards/rejected": -2.215460777282715, "step": 3436 }, { "epoch": 0.18217475419394163, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11066653.0, "logits/rejected": -9942113.0, "logps/chosen": -147.28851318359375, "logps/rejected": -409.6290283203125, "loss": 0.2812, "rewards/chosen": 0.25770676136016846, "rewards/margins": 2.3837519884109497, "rewards/rejected": -2.1260452270507812, "step": 3437 }, { "epoch": 0.18222775819574377, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18873804.8, "logits/rejected": -15132570.666666666, "logps/chosen": -225.7040283203125, "logps/rejected": -114.02952067057292, "loss": 0.3347, "rewards/chosen": 0.4739020347595215, "rewards/margins": 1.7896423816680909, "rewards/rejected": -1.3157403469085693, "step": 3438 }, { "epoch": 0.1822807621975459, "grad_norm": 53.75, "kl": 1.2740135192871094, "learning_rate": 5e-07, "logits/chosen": -32297088.0, "logits/rejected": 19448828.0, "logps/chosen": -347.259814453125, "logps/rejected": -515.4835205078125, "loss": 0.2739, "rewards/chosen": 0.6146166801452637, "rewards/margins": 3.668138154347738, "rewards/rejected": -3.053521474202474, "step": 3439 }, { "epoch": 0.18233376619934805, "grad_norm": 48.0, "kl": 0.72723388671875, "learning_rate": 5e-07, "logits/chosen": -39579347.2, "logits/rejected": -22511130.666666668, "logps/chosen": -331.175146484375, "logps/rejected": -276.7877604166667, "loss": 0.3061, "rewards/chosen": 0.6117562294006348, "rewards/margins": 2.2310322761535644, "rewards/rejected": -1.6192760467529297, "step": 3440 }, { "epoch": 0.18238677020115018, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36139760.0, "logits/rejected": 15314992.0, "logps/chosen": -501.5813802083333, "logps/rejected": -393.9357666015625, "loss": 0.2046, "rewards/chosen": 0.851306676864624, "rewards/margins": 2.9659232616424562, "rewards/rejected": -2.114616584777832, "step": 3441 }, { "epoch": 0.18243977420295232, "grad_norm": 65.5, "kl": 2.272674560546875, "learning_rate": 5e-07, "logits/chosen": -33987668.0, "logits/rejected": -67879856.0, "logps/chosen": -1104.4913330078125, "logps/rejected": -411.3358459472656, "loss": 0.2783, "rewards/chosen": 0.9207908511161804, "rewards/margins": 2.2471172213554382, "rewards/rejected": -1.3263263702392578, "step": 3442 }, { "epoch": 0.18249277820475446, "grad_norm": 58.75, "kl": 0.4419403076171875, "learning_rate": 5e-07, "logits/chosen": -8066812.0, "logits/rejected": -35093184.0, "logps/chosen": -359.850341796875, "logps/rejected": -141.80978393554688, "loss": 0.4001, "rewards/chosen": 0.12517772118250528, "rewards/margins": 2.008001665274302, "rewards/rejected": -1.8828239440917969, "step": 3443 }, { "epoch": 0.1825457822065566, "grad_norm": 60.0, "kl": 0.041515350341796875, "learning_rate": 5e-07, "logits/chosen": -83207372.8, "logits/rejected": 3415502.6666666665, "logps/chosen": -338.523046875, "logps/rejected": -296.6756998697917, "loss": 0.3078, "rewards/chosen": 0.5001940727233887, "rewards/margins": 2.1709329287211103, "rewards/rejected": -1.6707388559977214, "step": 3444 }, { "epoch": 0.18259878620835873, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4507977.666666667, "logits/rejected": -43879801.6, "logps/chosen": -59.95087178548177, "logps/rejected": -435.42109375, "loss": 0.2961, "rewards/chosen": -0.5723246335983276, "rewards/margins": 1.8349982500076294, "rewards/rejected": -2.407322883605957, "step": 3445 }, { "epoch": 0.18265179021016087, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43452484.0, "logits/rejected": -75134624.0, "logps/chosen": -162.71566772460938, "logps/rejected": -623.1361694335938, "loss": 0.312, "rewards/chosen": -0.060480013489723206, "rewards/margins": 2.4184576123952866, "rewards/rejected": -2.4789376258850098, "step": 3446 }, { "epoch": 0.182704794211963, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34085513.6, "logits/rejected": -66991306.666666664, "logps/chosen": -182.460107421875, "logps/rejected": -495.4734293619792, "loss": 0.3809, "rewards/chosen": -0.025089114904403687, "rewards/margins": 1.6537790199120839, "rewards/rejected": -1.6788681348164876, "step": 3447 }, { "epoch": 0.18275779821376514, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1661187.25, "logits/rejected": -74091536.0, "logps/chosen": -198.27536010742188, "logps/rejected": -341.9723815917969, "loss": 0.2746, "rewards/chosen": 0.2528463304042816, "rewards/margins": 2.40930637717247, "rewards/rejected": -2.1564600467681885, "step": 3448 }, { "epoch": 0.18281080221556728, "grad_norm": 44.5, "kl": 0.45021820068359375, "learning_rate": 5e-07, "logits/chosen": -51351400.0, "logits/rejected": -94315112.0, "logps/chosen": -340.41766357421875, "logps/rejected": -686.8782958984375, "loss": 0.3211, "rewards/chosen": 0.1341584324836731, "rewards/margins": 2.6453247666358948, "rewards/rejected": -2.5111663341522217, "step": 3449 }, { "epoch": 0.18286380621736942, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23962950.4, "logits/rejected": -30827589.333333332, "logps/chosen": -302.2339599609375, "logps/rejected": -460.0182291666667, "loss": 0.2813, "rewards/chosen": 0.4613210678100586, "rewards/margins": 3.0809868494669597, "rewards/rejected": -2.619665781656901, "step": 3450 }, { "epoch": 0.18291681021917156, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2235246.3333333335, "logits/rejected": 129963814.4, "logps/chosen": -143.10430908203125, "logps/rejected": -529.590771484375, "loss": 0.2842, "rewards/chosen": 0.06885427236557007, "rewards/margins": 2.4432677626609802, "rewards/rejected": -2.37441349029541, "step": 3451 }, { "epoch": 0.1829698142209737, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11072881.0, "logits/rejected": -27689426.0, "logps/chosen": -106.75407409667969, "logps/rejected": -263.393310546875, "loss": 0.3554, "rewards/chosen": -0.0854543149471283, "rewards/margins": 1.438134342432022, "rewards/rejected": -1.5235886573791504, "step": 3452 }, { "epoch": 0.18302281822277583, "grad_norm": 42.5, "kl": 0.017345428466796875, "learning_rate": 5e-07, "logits/chosen": -7660384.0, "logits/rejected": -8654945.0, "logps/chosen": -114.59431966145833, "logps/rejected": -263.2698974609375, "loss": 0.3722, "rewards/chosen": 0.252649188041687, "rewards/margins": 2.049627900123596, "rewards/rejected": -1.7969787120819092, "step": 3453 }, { "epoch": 0.18307582222457797, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20848794.0, "logits/rejected": -14835404.0, "logps/chosen": -281.3294372558594, "logps/rejected": -276.349853515625, "loss": 0.2908, "rewards/chosen": 0.3536837100982666, "rewards/margins": 2.176828980445862, "rewards/rejected": -1.8231452703475952, "step": 3454 }, { "epoch": 0.1831288262263801, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19456312.0, "logits/rejected": -4709294.0, "logps/chosen": -291.80816650390625, "logps/rejected": -173.8516845703125, "loss": 0.4293, "rewards/chosen": -0.06816083192825317, "rewards/margins": 1.5475541949272156, "rewards/rejected": -1.6157150268554688, "step": 3455 }, { "epoch": 0.18318183022818224, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10463104.0, "logits/rejected": -26168164.0, "logps/chosen": -214.0565185546875, "logps/rejected": -486.91937255859375, "loss": 0.2959, "rewards/chosen": -0.026007860898971558, "rewards/margins": 2.7792380154132843, "rewards/rejected": -2.805245876312256, "step": 3456 }, { "epoch": 0.18323483422998435, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4904528.0, "logits/rejected": -69486549.33333333, "logps/chosen": -171.4330078125, "logps/rejected": -800.955078125, "loss": 0.2622, "rewards/chosen": 0.6387880325317383, "rewards/margins": 3.719739087422689, "rewards/rejected": -3.0809510548909507, "step": 3457 }, { "epoch": 0.1832878382317865, "grad_norm": 77.0, "kl": 0.2394256591796875, "learning_rate": 5e-07, "logits/chosen": 10035274.0, "logits/rejected": -101175152.0, "logps/chosen": -889.013916015625, "logps/rejected": -329.70440673828125, "loss": 0.3244, "rewards/chosen": 0.3032256066799164, "rewards/margins": 1.7292003333568573, "rewards/rejected": -1.425974726676941, "step": 3458 }, { "epoch": 0.18334084223358862, "grad_norm": 50.5, "kl": 0.36940765380859375, "learning_rate": 5e-07, "logits/chosen": -32048442.0, "logits/rejected": -50348844.0, "logps/chosen": -342.39398193359375, "logps/rejected": -229.78050231933594, "loss": 0.3577, "rewards/chosen": -0.008179470896720886, "rewards/margins": 1.391901209950447, "rewards/rejected": -1.400080680847168, "step": 3459 }, { "epoch": 0.18339384623539076, "grad_norm": 63.0, "kl": 2.5067367553710938, "learning_rate": 5e-07, "logits/chosen": -4008714.5, "logits/rejected": -29913766.0, "logps/chosen": -477.91168212890625, "logps/rejected": -308.0852355957031, "loss": 0.2947, "rewards/chosen": 0.8279536366462708, "rewards/margins": 2.494928300380707, "rewards/rejected": -1.666974663734436, "step": 3460 }, { "epoch": 0.1834468502371929, "grad_norm": 47.0, "kl": 0.03634071350097656, "learning_rate": 5e-07, "logits/chosen": -11389638.0, "logits/rejected": -4963370.5, "logps/chosen": -398.3016662597656, "logps/rejected": -152.30357360839844, "loss": 0.3428, "rewards/chosen": 0.3955938518047333, "rewards/margins": 1.6425852477550507, "rewards/rejected": -1.2469913959503174, "step": 3461 }, { "epoch": 0.18349985423899504, "grad_norm": 58.25, "kl": 0.19561004638671875, "learning_rate": 5e-07, "logits/chosen": -18049462.85714286, "logits/rejected": -7671171.5, "logps/chosen": -191.57756696428572, "logps/rejected": -119.79371643066406, "loss": 0.4347, "rewards/chosen": 0.12772519247872488, "rewards/margins": 1.8183884109769548, "rewards/rejected": -1.69066321849823, "step": 3462 }, { "epoch": 0.18355285824079717, "grad_norm": 46.5, "kl": 0.7344532012939453, "learning_rate": 5e-07, "logits/chosen": -11767680.0, "logits/rejected": -5844439.333333333, "logps/chosen": -310.2405029296875, "logps/rejected": -108.7778828938802, "loss": 0.387, "rewards/chosen": 0.2712252140045166, "rewards/margins": 1.3827428817749023, "rewards/rejected": -1.1115176677703857, "step": 3463 }, { "epoch": 0.1836058622425993, "grad_norm": 43.5, "kl": 0.5732212066650391, "learning_rate": 5e-07, "logits/chosen": -25153366.0, "logits/rejected": -100582704.0, "logps/chosen": -144.421875, "logps/rejected": -167.1686248779297, "loss": 0.3318, "rewards/chosen": 0.5564316511154175, "rewards/margins": 1.7132961750030518, "rewards/rejected": -1.1568645238876343, "step": 3464 }, { "epoch": 0.18365886624440145, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -367192.0, "logits/rejected": -26367280.0, "logps/chosen": -271.705859375, "logps/rejected": -186.74881998697916, "loss": 0.3965, "rewards/chosen": 0.2069300651550293, "rewards/margins": 1.024499003092448, "rewards/rejected": -0.8175689379374186, "step": 3465 }, { "epoch": 0.18371187024620358, "grad_norm": 47.75, "kl": 0.17123031616210938, "learning_rate": 5e-07, "logits/chosen": 17095932.0, "logits/rejected": -39254089.6, "logps/chosen": -244.29073079427084, "logps/rejected": -300.5431640625, "loss": 0.2555, "rewards/chosen": 0.060253908236821495, "rewards/margins": 2.569505693515142, "rewards/rejected": -2.5092517852783205, "step": 3466 }, { "epoch": 0.18376487424800572, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13633280.0, "logits/rejected": 72774112.0, "logps/chosen": -329.56585693359375, "logps/rejected": -462.7690124511719, "loss": 0.3398, "rewards/chosen": -0.14160558581352234, "rewards/margins": 3.031652480363846, "rewards/rejected": -3.173258066177368, "step": 3467 }, { "epoch": 0.18381787824980786, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32574682.0, "logits/rejected": -40611900.0, "logps/chosen": -190.70523071289062, "logps/rejected": -399.79547119140625, "loss": 0.2885, "rewards/chosen": 0.1937119960784912, "rewards/margins": 2.2378618717193604, "rewards/rejected": -2.044149875640869, "step": 3468 }, { "epoch": 0.18387088225161, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56877120.0, "logits/rejected": -88668000.0, "logps/chosen": -313.643798828125, "logps/rejected": -457.8578287760417, "loss": 0.2872, "rewards/chosen": 0.3667656898498535, "rewards/margins": 3.097977034250895, "rewards/rejected": -2.7312113444010415, "step": 3469 }, { "epoch": 0.18392388625341213, "grad_norm": 59.0, "kl": 0.9648551940917969, "learning_rate": 5e-07, "logits/chosen": -31609376.0, "logits/rejected": -27625276.0, "logps/chosen": -352.2212320963542, "logps/rejected": -390.2659912109375, "loss": 0.3553, "rewards/chosen": 0.3876768747965495, "rewards/margins": 2.484219710032145, "rewards/rejected": -2.0965428352355957, "step": 3470 }, { "epoch": 0.18397689025521427, "grad_norm": 44.0, "kl": 0.45168304443359375, "learning_rate": 5e-07, "logits/chosen": -21353026.0, "logits/rejected": -30472992.0, "logps/chosen": -250.10659790039062, "logps/rejected": -576.1729736328125, "loss": 0.257, "rewards/chosen": 0.5187076926231384, "rewards/margins": 2.796705186367035, "rewards/rejected": -2.2779974937438965, "step": 3471 }, { "epoch": 0.1840298942570164, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10403594.0, "logits/rejected": -12855621.0, "logps/chosen": -219.94712829589844, "logps/rejected": -158.5926971435547, "loss": 0.34, "rewards/chosen": 0.10557413101196289, "rewards/margins": 1.5444395542144775, "rewards/rejected": -1.4388654232025146, "step": 3472 }, { "epoch": 0.18408289825881854, "grad_norm": 59.5, "kl": 0.6520004272460938, "learning_rate": 5e-07, "logits/chosen": -35426312.0, "logits/rejected": -9358569.0, "logps/chosen": -383.29693603515625, "logps/rejected": -135.55679321289062, "loss": 0.4117, "rewards/chosen": 0.29866543412208557, "rewards/margins": 0.9312750995159149, "rewards/rejected": -0.6326096653938293, "step": 3473 }, { "epoch": 0.18413590226062068, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23193541.333333332, "logits/rejected": -3284463.0, "logps/chosen": -192.31475830078125, "logps/rejected": -192.38485717773438, "loss": 0.4095, "rewards/chosen": 0.007693260908126831, "rewards/margins": 1.8240871131420135, "rewards/rejected": -1.8163938522338867, "step": 3474 }, { "epoch": 0.18418890626242282, "grad_norm": 48.0, "kl": 0.4770164489746094, "learning_rate": 5e-07, "logits/chosen": -34210265.6, "logits/rejected": -43608477.333333336, "logps/chosen": -347.5275634765625, "logps/rejected": -275.68902587890625, "loss": 0.2847, "rewards/chosen": 0.5033538818359375, "rewards/margins": 3.207428741455078, "rewards/rejected": -2.7040748596191406, "step": 3475 }, { "epoch": 0.18424191026422496, "grad_norm": 60.0, "kl": 0.8243732452392578, "learning_rate": 5e-07, "logits/chosen": -23141546.0, "logits/rejected": -16088824.0, "logps/chosen": -588.3402099609375, "logps/rejected": -490.01983642578125, "loss": 0.2179, "rewards/chosen": 1.0459752082824707, "rewards/margins": 3.158982753753662, "rewards/rejected": -2.1130075454711914, "step": 3476 }, { "epoch": 0.1842949142660271, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103748790.85714285, "logits/rejected": -6886319.0, "logps/chosen": -408.13881138392856, "logps/rejected": -64.8791732788086, "loss": 0.434, "rewards/chosen": 0.0529533965247018, "rewards/margins": 2.791256104196821, "rewards/rejected": -2.738302707672119, "step": 3477 }, { "epoch": 0.18434791826782923, "grad_norm": 84.0, "kl": 0.717717170715332, "learning_rate": 5e-07, "logits/chosen": -17586950.4, "logits/rejected": -4771748.0, "logps/chosen": -237.624951171875, "logps/rejected": -228.08951822916666, "loss": 0.3815, "rewards/chosen": 0.30836513042449953, "rewards/margins": 1.4719655752182006, "rewards/rejected": -1.1636004447937012, "step": 3478 }, { "epoch": 0.18440092226963137, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13556042.666666666, "logits/rejected": -18970896.0, "logps/chosen": -180.4468994140625, "logps/rejected": -68.21659851074219, "loss": 0.4573, "rewards/chosen": -0.026395410299301147, "rewards/margins": 0.9090835750102997, "rewards/rejected": -0.9354789853096008, "step": 3479 }, { "epoch": 0.1844539262714335, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -15130098.0, "logps/rejected": -218.7781982421875, "loss": 0.1725, "rewards/rejected": -1.7706221342086792, "step": 3480 }, { "epoch": 0.18450693027323564, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27323123.2, "logits/rejected": -11489648.0, "logps/chosen": -205.55205078125, "logps/rejected": -205.35738118489584, "loss": 0.3823, "rewards/chosen": 0.1379407525062561, "rewards/margins": 1.5699520389238995, "rewards/rejected": -1.4320112864176433, "step": 3481 }, { "epoch": 0.18455993427503778, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25054248.0, "logits/rejected": -11113454.4, "logps/chosen": -315.10341389973956, "logps/rejected": -200.7191162109375, "loss": 0.2193, "rewards/chosen": 0.8384223779042562, "rewards/margins": 2.572428878148397, "rewards/rejected": -1.7340065002441407, "step": 3482 }, { "epoch": 0.1846129382768399, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10175528.0, "logits/rejected": -39330064.0, "logps/chosen": -201.47921752929688, "logps/rejected": -420.5677083333333, "loss": 0.2832, "rewards/chosen": -0.32866334915161133, "rewards/margins": 1.3369879722595215, "rewards/rejected": -1.6656513214111328, "step": 3483 }, { "epoch": 0.18466594227864203, "grad_norm": 60.0, "kl": 0.5035743713378906, "learning_rate": 5e-07, "logits/chosen": 58046960.0, "logits/rejected": -55135292.0, "logps/chosen": -374.4539794921875, "logps/rejected": -483.20770263671875, "loss": 0.3085, "rewards/chosen": 0.4476393163204193, "rewards/margins": 2.176613301038742, "rewards/rejected": -1.7289739847183228, "step": 3484 }, { "epoch": 0.18471894628044416, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4338889.0, "logits/rejected": -26467498.0, "logps/chosen": -163.1924591064453, "logps/rejected": -268.913330078125, "loss": 0.315, "rewards/chosen": 0.24543750286102295, "rewards/margins": 1.7805830240249634, "rewards/rejected": -1.5351455211639404, "step": 3485 }, { "epoch": 0.1847719502822463, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15740208.0, "logits/rejected": -45818970.666666664, "logps/chosen": -228.4683349609375, "logps/rejected": -414.948974609375, "loss": 0.3961, "rewards/chosen": -0.1394671082496643, "rewards/margins": 1.5453648289044697, "rewards/rejected": -1.684831937154134, "step": 3486 }, { "epoch": 0.18482495428404844, "grad_norm": 55.75, "kl": 0.334136962890625, "learning_rate": 5e-07, "logits/chosen": -15450731.0, "logits/rejected": -23400586.0, "logps/chosen": -385.3441162109375, "logps/rejected": -750.7505493164062, "loss": 0.3222, "rewards/chosen": 0.1291603147983551, "rewards/margins": 3.7773071825504303, "rewards/rejected": -3.648146867752075, "step": 3487 }, { "epoch": 0.18487795828585057, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2866401.5, "logits/rejected": -23154954.666666668, "logps/chosen": -34.59521484375, "logps/rejected": -212.440185546875, "loss": 0.3306, "rewards/chosen": -0.13239115476608276, "rewards/margins": 0.9941975871721904, "rewards/rejected": -1.1265887419382732, "step": 3488 }, { "epoch": 0.1849309622876527, "grad_norm": 65.0, "kl": 2.7087020874023438, "learning_rate": 5e-07, "logits/chosen": -18579824.0, "logits/rejected": -22039614.0, "logps/chosen": -492.357177734375, "logps/rejected": -285.93548583984375, "loss": 0.4641, "rewards/chosen": 0.2330372929573059, "rewards/margins": 1.6315316557884216, "rewards/rejected": -1.3984943628311157, "step": 3489 }, { "epoch": 0.18498396628945485, "grad_norm": 60.25, "kl": 0.5293807983398438, "learning_rate": 5e-07, "logits/chosen": -65702661.333333336, "logits/rejected": -26380040.0, "logps/chosen": -992.8545735677084, "logps/rejected": -348.363671875, "loss": 0.1898, "rewards/chosen": 0.9381184577941895, "rewards/margins": 3.2051974296569825, "rewards/rejected": -2.267078971862793, "step": 3490 }, { "epoch": 0.18503697029125699, "grad_norm": 68.0, "kl": 0.23024368286132812, "learning_rate": 5e-07, "logits/chosen": -36588617.6, "logits/rejected": -23389813.333333332, "logps/chosen": -452.34736328125, "logps/rejected": -421.6044108072917, "loss": 0.3151, "rewards/chosen": 0.5324841499328613, "rewards/margins": 1.9908713658650714, "rewards/rejected": -1.4583872159322102, "step": 3491 }, { "epoch": 0.18508997429305912, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31347546.666666668, "logits/rejected": -39186816.0, "logps/chosen": -252.73824055989584, "logps/rejected": -365.4720458984375, "loss": 0.4616, "rewards/chosen": -0.20261325438817343, "rewards/margins": 1.854180673758189, "rewards/rejected": -2.0567939281463623, "step": 3492 }, { "epoch": 0.18514297829486126, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10408506.0, "logits/rejected": -18618577.333333332, "logps/chosen": -97.02674102783203, "logps/rejected": -149.29886881510416, "loss": 0.3224, "rewards/chosen": -0.3562886118888855, "rewards/margins": 0.9830000996589661, "rewards/rejected": -1.3392887115478516, "step": 3493 }, { "epoch": 0.1851959822966634, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32094114.666666668, "logits/rejected": -4767658.4, "logps/chosen": -101.05553181966145, "logps/rejected": -370.4974365234375, "loss": 0.3814, "rewards/chosen": -0.328169306119283, "rewards/margins": 0.9384536186854044, "rewards/rejected": -1.2666229248046874, "step": 3494 }, { "epoch": 0.18524898629846553, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9352993.333333334, "logits/rejected": -35338644.0, "logps/chosen": -64.28066507975261, "logps/rejected": -268.2452087402344, "loss": 0.4853, "rewards/chosen": -0.34635762373606366, "rewards/margins": 1.387410004933675, "rewards/rejected": -1.7337676286697388, "step": 3495 }, { "epoch": 0.18530199030026767, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3584996.5, "logits/rejected": -2812142.8571428573, "logps/chosen": -204.15179443359375, "logps/rejected": -349.60532924107144, "loss": 0.154, "rewards/chosen": 0.13164977729320526, "rewards/margins": 2.7106924844639644, "rewards/rejected": -2.579042707170759, "step": 3496 }, { "epoch": 0.1853549943020698, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37732832.0, "logits/rejected": -8978339.0, "logps/chosen": -485.9993489583333, "logps/rejected": -199.47315979003906, "loss": 0.3608, "rewards/chosen": 0.4527859687805176, "rewards/margins": 1.5592496395111084, "rewards/rejected": -1.1064636707305908, "step": 3497 }, { "epoch": 0.18540799830387195, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16266198.0, "logits/rejected": -16074369.0, "logps/chosen": -333.64190673828125, "logps/rejected": -241.35977172851562, "loss": 0.2297, "rewards/chosen": 0.6220541596412659, "rewards/margins": 2.7866483330726624, "rewards/rejected": -2.1645941734313965, "step": 3498 }, { "epoch": 0.18546100230567408, "grad_norm": 68.5, "kl": 1.2297563552856445, "learning_rate": 5e-07, "logits/chosen": -32976538.666666668, "logits/rejected": -28021756.0, "logps/chosen": -384.0939534505208, "logps/rejected": -302.7585754394531, "loss": 0.4251, "rewards/chosen": 0.24538089831670126, "rewards/margins": 1.2673826416333516, "rewards/rejected": -1.0220017433166504, "step": 3499 }, { "epoch": 0.18551400630747622, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16359613.333333334, "logits/rejected": 16601466.0, "logps/chosen": -315.14186604817706, "logps/rejected": -270.8547058105469, "loss": 0.4585, "rewards/chosen": -0.10506363709767659, "rewards/margins": 0.9535752932230631, "rewards/rejected": -1.0586389303207397, "step": 3500 }, { "epoch": 0.18556701030927836, "grad_norm": 59.5, "kl": 0.5011205673217773, "learning_rate": 5e-07, "logits/chosen": -44714560.0, "logits/rejected": -53708404.0, "logps/chosen": -214.44084821428572, "logps/rejected": -610.5369873046875, "loss": 0.4382, "rewards/chosen": 0.07437984432492938, "rewards/margins": 3.239565534251077, "rewards/rejected": -3.1651856899261475, "step": 3501 }, { "epoch": 0.1856200143110805, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62757040.0, "logits/rejected": -28813004.8, "logps/chosen": -550.4761962890625, "logps/rejected": -339.42626953125, "loss": 0.1842, "rewards/chosen": 0.8013773759206136, "rewards/margins": 3.0199595292409263, "rewards/rejected": -2.2185821533203125, "step": 3502 }, { "epoch": 0.18567301831288263, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9428799.0, "logits/rejected": -18475236.0, "logps/chosen": -393.7099914550781, "logps/rejected": -225.50906372070312, "loss": 0.3046, "rewards/chosen": 0.7959384918212891, "rewards/margins": 1.721321702003479, "rewards/rejected": -0.9253832101821899, "step": 3503 }, { "epoch": 0.18572602231468477, "grad_norm": 71.0, "kl": 1.0414705276489258, "learning_rate": 5e-07, "logits/chosen": -4342634.666666667, "logits/rejected": -28147360.0, "logps/chosen": -222.6195068359375, "logps/rejected": -369.196728515625, "loss": 0.2817, "rewards/chosen": 0.6489845116933187, "rewards/margins": 1.8719520409901937, "rewards/rejected": -1.222967529296875, "step": 3504 }, { "epoch": 0.1857790263164869, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5137351.6, "logits/rejected": -57205024.0, "logps/chosen": -146.37880859375, "logps/rejected": -567.3355305989584, "loss": 0.3586, "rewards/chosen": 0.14447498321533203, "rewards/margins": 2.513024648030599, "rewards/rejected": -2.368549664815267, "step": 3505 }, { "epoch": 0.18583203031828904, "grad_norm": 75.5, "kl": 1.1198883056640625, "learning_rate": 5e-07, "logits/chosen": -77989888.0, "logits/rejected": -11912180.0, "logps/chosen": -437.39410400390625, "logps/rejected": -296.5241394042969, "loss": 0.3351, "rewards/chosen": 0.26208189129829407, "rewards/margins": 1.8190378248691559, "rewards/rejected": -1.5569559335708618, "step": 3506 }, { "epoch": 0.18588503432009118, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8106922.0, "logits/rejected": -11635221.333333334, "logps/chosen": -308.44134521484375, "logps/rejected": -236.57661946614584, "loss": 0.2434, "rewards/chosen": 0.8772781491279602, "rewards/margins": 2.2714940508206682, "rewards/rejected": -1.3942159016927083, "step": 3507 }, { "epoch": 0.1859380383218933, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12908558.0, "logits/rejected": -12839549.333333334, "logps/chosen": -596.8345336914062, "logps/rejected": -191.3831583658854, "loss": 0.266, "rewards/chosen": 0.705670177936554, "rewards/margins": 1.9123190840085347, "rewards/rejected": -1.2066489060719807, "step": 3508 }, { "epoch": 0.18599104232369543, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21515801.333333332, "logits/rejected": -16194267.2, "logps/chosen": -314.5197347005208, "logps/rejected": -237.9399658203125, "loss": 0.2584, "rewards/chosen": 0.48957622051239014, "rewards/margins": 2.3852604627609253, "rewards/rejected": -1.8956842422485352, "step": 3509 }, { "epoch": 0.18604404632549756, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37646040.0, "logits/rejected": -25229584.0, "logps/chosen": -345.0301920572917, "logps/rejected": -456.685400390625, "loss": 0.2065, "rewards/chosen": 0.581279476483663, "rewards/margins": 3.2294575770696006, "rewards/rejected": -2.6481781005859375, "step": 3510 }, { "epoch": 0.1860970503272997, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24950074.666666668, "logits/rejected": -20828702.0, "logps/chosen": -308.14703369140625, "logps/rejected": -263.8404235839844, "loss": 0.4328, "rewards/chosen": 0.12026199698448181, "rewards/margins": 0.9029219448566437, "rewards/rejected": -0.7826599478721619, "step": 3511 }, { "epoch": 0.18615005432910184, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22138340.0, "logits/rejected": -10096132.0, "logps/chosen": -253.4172821044922, "logps/rejected": -113.69676208496094, "loss": 0.3181, "rewards/chosen": 0.23962892591953278, "rewards/margins": 1.7825809866189957, "rewards/rejected": -1.542952060699463, "step": 3512 }, { "epoch": 0.18620305833090398, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28098050.0, "logits/rejected": 4733296.0, "logps/chosen": -411.9550476074219, "logps/rejected": -213.73245239257812, "loss": 0.3129, "rewards/chosen": 0.6369922757148743, "rewards/margins": 2.0148438811302185, "rewards/rejected": -1.3778516054153442, "step": 3513 }, { "epoch": 0.1862560623327061, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31428784.0, "logits/rejected": -46309120.0, "logps/chosen": -292.8585510253906, "logps/rejected": -376.76834542410717, "loss": 0.1922, "rewards/chosen": 1.4124481678009033, "rewards/margins": 3.0592490264347623, "rewards/rejected": -1.6468008586338587, "step": 3514 }, { "epoch": 0.18630906633450825, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40436085.333333336, "logits/rejected": -44903353.6, "logps/chosen": -419.0369873046875, "logps/rejected": -319.676416015625, "loss": 0.3535, "rewards/chosen": -0.33021748065948486, "rewards/margins": 1.1036916494369506, "rewards/rejected": -1.4339091300964355, "step": 3515 }, { "epoch": 0.1863620703363104, "grad_norm": 46.0, "kl": 0.2385730743408203, "learning_rate": 5e-07, "logits/chosen": -15966020.0, "logits/rejected": -35558310.4, "logps/chosen": -182.5804443359375, "logps/rejected": -290.0150390625, "loss": 0.2907, "rewards/chosen": 0.6639361381530762, "rewards/margins": 2.0534396171569824, "rewards/rejected": -1.3895034790039062, "step": 3516 }, { "epoch": 0.18641507433811252, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81858608.0, "logits/rejected": -17207045.714285713, "logps/chosen": -431.74566650390625, "logps/rejected": -367.76876395089283, "loss": 0.2767, "rewards/chosen": 0.2002105712890625, "rewards/margins": 1.4645498820713587, "rewards/rejected": -1.2643393107822962, "step": 3517 }, { "epoch": 0.18646807833991466, "grad_norm": 50.75, "kl": 0.3603553771972656, "learning_rate": 5e-07, "logits/chosen": -56718613.333333336, "logits/rejected": -54413299.2, "logps/chosen": -372.4654134114583, "logps/rejected": -286.2634033203125, "loss": 0.2845, "rewards/chosen": -0.044366454084714256, "rewards/margins": 1.7467733393112819, "rewards/rejected": -1.791139793395996, "step": 3518 }, { "epoch": 0.1865210823417168, "grad_norm": 51.0, "kl": 0.6683626174926758, "learning_rate": 5e-07, "logits/chosen": -15923091.2, "logits/rejected": -12695830.666666666, "logps/chosen": -358.8781494140625, "logps/rejected": -222.24003092447916, "loss": 0.3404, "rewards/chosen": 0.4064481258392334, "rewards/margins": 2.000271876653035, "rewards/rejected": -1.593823750813802, "step": 3519 }, { "epoch": 0.18657408634351894, "grad_norm": 56.5, "kl": 0.2276763916015625, "learning_rate": 5e-07, "logits/chosen": -50468864.0, "logits/rejected": -44255128.0, "logps/chosen": -343.0605061848958, "logps/rejected": -377.80712890625, "loss": 0.4155, "rewards/chosen": -0.07095528642336528, "rewards/margins": 2.187885751326879, "rewards/rejected": -2.258841037750244, "step": 3520 }, { "epoch": 0.18662709034532107, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15028814.0, "logits/rejected": -36540261.333333336, "logps/chosen": -193.6829833984375, "logps/rejected": -386.0260823567708, "loss": 0.187, "rewards/chosen": 0.42653733491897583, "rewards/margins": 2.6163501540819802, "rewards/rejected": -2.1898128191630044, "step": 3521 }, { "epoch": 0.1866800943471232, "grad_norm": 52.5, "kl": 0.14812660217285156, "learning_rate": 5e-07, "logits/chosen": -29927156.0, "logits/rejected": -9453592.0, "logps/chosen": -264.06304931640625, "logps/rejected": -367.0564270019531, "loss": 0.2713, "rewards/chosen": 0.5082153677940369, "rewards/margins": 2.515824019908905, "rewards/rejected": -2.007608652114868, "step": 3522 }, { "epoch": 0.18673309834892535, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24819092.0, "logits/rejected": -11539608.0, "logps/chosen": -255.7813720703125, "logps/rejected": -119.41998291015625, "loss": 0.282, "rewards/chosen": 0.2831977903842926, "rewards/margins": 1.6231105228265126, "rewards/rejected": -1.33991273244222, "step": 3523 }, { "epoch": 0.18678610235072748, "grad_norm": 67.5, "kl": 2.415180206298828, "learning_rate": 5e-07, "logits/chosen": -20817688.0, "logits/rejected": 2932189.5, "logps/chosen": -450.73779296875, "logps/rejected": -301.3689880371094, "loss": 0.4414, "rewards/chosen": 0.2829178373018901, "rewards/margins": 1.3481633464495342, "rewards/rejected": -1.065245509147644, "step": 3524 }, { "epoch": 0.18683910635252962, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9305924.0, "logits/rejected": -5986797.2, "logps/chosen": -193.5997517903646, "logps/rejected": -164.8784912109375, "loss": 0.3218, "rewards/chosen": 0.1909390687942505, "rewards/margins": 1.4878451585769654, "rewards/rejected": -1.296906089782715, "step": 3525 }, { "epoch": 0.18689211035433176, "grad_norm": 70.0, "kl": 0.14016342163085938, "learning_rate": 5e-07, "logits/chosen": -2567964.5714285714, "logits/rejected": -89088768.0, "logps/chosen": -281.203125, "logps/rejected": -367.5303649902344, "loss": 0.469, "rewards/chosen": -0.0762612053326198, "rewards/margins": 2.4094291499682834, "rewards/rejected": -2.4856903553009033, "step": 3526 }, { "epoch": 0.1869451143561339, "grad_norm": 55.25, "kl": 0.9742965698242188, "learning_rate": 5e-07, "logits/chosen": -26815547.2, "logits/rejected": -45035413.333333336, "logps/chosen": -307.740625, "logps/rejected": -340.8822428385417, "loss": 0.3714, "rewards/chosen": 0.2894716739654541, "rewards/margins": 1.6334607283274334, "rewards/rejected": -1.3439890543619792, "step": 3527 }, { "epoch": 0.18699811835793603, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26395044.0, "logits/rejected": -38296388.0, "logps/chosen": -236.0662841796875, "logps/rejected": -332.3862609863281, "loss": 0.3893, "rewards/chosen": 0.08759155869483948, "rewards/margins": 1.4359914362430573, "rewards/rejected": -1.3483998775482178, "step": 3528 }, { "epoch": 0.18705112235973817, "grad_norm": 61.75, "kl": 0.23479461669921875, "learning_rate": 5e-07, "logits/chosen": -25337952.0, "logits/rejected": -25123224.0, "logps/chosen": -463.5595397949219, "logps/rejected": -248.6357421875, "loss": 0.3023, "rewards/chosen": 0.1740356683731079, "rewards/margins": 1.426292061805725, "rewards/rejected": -1.2522563934326172, "step": 3529 }, { "epoch": 0.1871041263615403, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20457794.0, "logits/rejected": -104400776.0, "logps/chosen": -314.62841796875, "logps/rejected": -416.23126220703125, "loss": 0.2824, "rewards/chosen": 0.2850528955459595, "rewards/margins": 2.5699011087417603, "rewards/rejected": -2.284848213195801, "step": 3530 }, { "epoch": 0.18715713036334244, "grad_norm": 59.75, "kl": 0.5828933715820312, "learning_rate": 5e-07, "logits/chosen": -14477878.0, "logits/rejected": 4571624.0, "logps/chosen": -300.8040466308594, "logps/rejected": -460.59912109375, "loss": 0.2422, "rewards/chosen": 0.40071165561676025, "rewards/margins": 2.1006494760513306, "rewards/rejected": -1.6999378204345703, "step": 3531 }, { "epoch": 0.18721013436514458, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2893519.75, "logits/rejected": -27369480.0, "logps/chosen": -41.670658111572266, "logps/rejected": -405.0355224609375, "loss": 0.2596, "rewards/chosen": -0.26826241612434387, "rewards/margins": 1.8372319241364798, "rewards/rejected": -2.1054943402608237, "step": 3532 }, { "epoch": 0.1872631383669467, "grad_norm": 49.25, "kl": 0.16571807861328125, "learning_rate": 5e-07, "logits/chosen": -5565870.333333333, "logits/rejected": -26108758.0, "logps/chosen": -199.12896728515625, "logps/rejected": -239.8370361328125, "loss": 0.4722, "rewards/chosen": -0.15776746471722922, "rewards/margins": 0.9180312852064768, "rewards/rejected": -1.075798749923706, "step": 3533 }, { "epoch": 0.18731614236874883, "grad_norm": 38.0, "kl": 0.182342529296875, "learning_rate": 5e-07, "logits/chosen": -14097398.0, "logits/rejected": -23919012.0, "logps/chosen": -89.74966430664062, "logps/rejected": -153.19534301757812, "loss": 0.2747, "rewards/chosen": 0.5309773683547974, "rewards/margins": 2.204503655433655, "rewards/rejected": -1.6735262870788574, "step": 3534 }, { "epoch": 0.18736914637055097, "grad_norm": 71.0, "kl": 2.0252456665039062, "learning_rate": 5e-07, "logits/chosen": -10006024.0, "logits/rejected": -4301118.666666667, "logps/chosen": -464.0822265625, "logps/rejected": -370.4724934895833, "loss": 0.3373, "rewards/chosen": 0.6640856742858887, "rewards/margins": 2.1795860608418782, "rewards/rejected": -1.5155003865559895, "step": 3535 }, { "epoch": 0.1874221503723531, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32135028.0, "logits/rejected": -12811651.0, "logps/chosen": -343.0528869628906, "logps/rejected": -193.09393310546875, "loss": 0.3223, "rewards/chosen": 0.36237889528274536, "rewards/margins": 1.598957121372223, "rewards/rejected": -1.2365782260894775, "step": 3536 }, { "epoch": 0.18747515437415524, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17275069.333333332, "logits/rejected": -25789964.8, "logps/chosen": -279.69114176432294, "logps/rejected": -420.1154296875, "loss": 0.2833, "rewards/chosen": -0.08787842591603597, "rewards/margins": 2.0887281338373818, "rewards/rejected": -2.176606559753418, "step": 3537 }, { "epoch": 0.18752815837595738, "grad_norm": 49.5, "kl": 0.14278030395507812, "learning_rate": 5e-07, "logits/chosen": -57149572.0, "logits/rejected": -21498406.0, "logps/chosen": -300.2320556640625, "logps/rejected": -225.54080200195312, "loss": 0.3412, "rewards/chosen": -0.24165114760398865, "rewards/margins": 1.7410087883472443, "rewards/rejected": -1.982659935951233, "step": 3538 }, { "epoch": 0.1875811623777595, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12774800.0, "logits/rejected": -18291772.8, "logps/chosen": -195.72347005208334, "logps/rejected": -325.85498046875, "loss": 0.3579, "rewards/chosen": -0.3990791241327922, "rewards/margins": 1.1125210841496784, "rewards/rejected": -1.5116002082824707, "step": 3539 }, { "epoch": 0.18763416637956165, "grad_norm": 46.0, "kl": 0.2834815979003906, "learning_rate": 5e-07, "logits/chosen": -54400064.0, "logits/rejected": -40620809.6, "logps/chosen": -349.440673828125, "logps/rejected": -316.931103515625, "loss": 0.2371, "rewards/chosen": 0.5946533679962158, "rewards/margins": 3.0546616077423097, "rewards/rejected": -2.460008239746094, "step": 3540 }, { "epoch": 0.1876871703813638, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10584444.0, "logits/rejected": -5882892.666666667, "logps/chosen": -279.39227294921875, "logps/rejected": -140.0324910481771, "loss": 0.327, "rewards/chosen": 0.3375968933105469, "rewards/margins": 1.2714506785074868, "rewards/rejected": -0.9338537851969401, "step": 3541 }, { "epoch": 0.18774017438316593, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2533082.0, "logits/rejected": -11172755.42857143, "logps/chosen": -29.801219940185547, "logps/rejected": -298.3247767857143, "loss": 0.2396, "rewards/chosen": 0.45857927203178406, "rewards/margins": 1.819747315985816, "rewards/rejected": -1.361168043954032, "step": 3542 }, { "epoch": 0.18779317838496806, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65659513.6, "logits/rejected": -14118408.0, "logps/chosen": -374.166455078125, "logps/rejected": -393.399169921875, "loss": 0.262, "rewards/chosen": 0.6302929878234863, "rewards/margins": 3.228557046254476, "rewards/rejected": -2.5982640584309897, "step": 3543 }, { "epoch": 0.1878461823867702, "grad_norm": 51.25, "kl": 0.467193603515625, "learning_rate": 5e-07, "logits/chosen": -21029730.0, "logits/rejected": -25311164.0, "logps/chosen": -241.87425231933594, "logps/rejected": -210.94825744628906, "loss": 0.3527, "rewards/chosen": 0.2118077576160431, "rewards/margins": 1.494561344385147, "rewards/rejected": -1.282753586769104, "step": 3544 }, { "epoch": 0.18789918638857234, "grad_norm": 53.0, "kl": 1.4265785217285156, "learning_rate": 5e-07, "logits/chosen": -41958728.0, "logits/rejected": -29430774.0, "logps/chosen": -289.96356201171875, "logps/rejected": -307.32391357421875, "loss": 0.3777, "rewards/chosen": 0.0628373920917511, "rewards/margins": 2.153009444475174, "rewards/rejected": -2.090172052383423, "step": 3545 }, { "epoch": 0.18795219039037447, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94024104.0, "logits/rejected": -10607546.666666666, "logps/chosen": -381.75921630859375, "logps/rejected": -237.0945027669271, "loss": 0.2851, "rewards/chosen": 0.3687225580215454, "rewards/margins": 1.5932331482569377, "rewards/rejected": -1.2245105902353923, "step": 3546 }, { "epoch": 0.1880051943921766, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29873193.6, "logits/rejected": -25969216.0, "logps/chosen": -381.682373046875, "logps/rejected": -336.6021728515625, "loss": 0.3171, "rewards/chosen": 0.3928208351135254, "rewards/margins": 2.205774148305257, "rewards/rejected": -1.8129533131917317, "step": 3547 }, { "epoch": 0.18805819839397875, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8697587.42857143, "logits/rejected": -6047785.0, "logps/chosen": -116.18623570033482, "logps/rejected": -46.233001708984375, "loss": 0.4517, "rewards/chosen": 0.20434858117784774, "rewards/margins": 0.49031852398599896, "rewards/rejected": -0.28596994280815125, "step": 3548 }, { "epoch": 0.18811120239578089, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21670486.4, "logits/rejected": -49629781.333333336, "logps/chosen": -201.17337646484376, "logps/rejected": -407.8795979817708, "loss": 0.4011, "rewards/chosen": -0.11040763854980469, "rewards/margins": 1.699480088551839, "rewards/rejected": -1.8098877271016438, "step": 3549 }, { "epoch": 0.18816420639758302, "grad_norm": 81.0, "kl": 3.9095001220703125, "learning_rate": 5e-07, "logits/chosen": -66039688.0, "logits/rejected": -30127620.0, "logps/chosen": -801.3324584960938, "logps/rejected": -288.82513427734375, "loss": 0.3494, "rewards/chosen": 0.7230287790298462, "rewards/margins": 2.226415753364563, "rewards/rejected": -1.5033869743347168, "step": 3550 }, { "epoch": 0.18821721039938516, "grad_norm": 68.5, "kl": 0.595088005065918, "learning_rate": 5e-07, "logits/chosen": -26145138.0, "logits/rejected": 110530544.0, "logps/chosen": -535.9744262695312, "logps/rejected": -227.8214111328125, "loss": 0.339, "rewards/chosen": 0.5250434875488281, "rewards/margins": 1.4641698598861694, "rewards/rejected": -0.9391263723373413, "step": 3551 }, { "epoch": 0.1882702144011873, "grad_norm": 51.0, "kl": 0.14957046508789062, "learning_rate": 5e-07, "logits/chosen": -2891588.0, "logits/rejected": -30243272.0, "logps/chosen": -424.932861328125, "logps/rejected": -289.04534912109375, "loss": 0.2593, "rewards/chosen": 0.59173583984375, "rewards/margins": 2.458258271217346, "rewards/rejected": -1.8665224313735962, "step": 3552 }, { "epoch": 0.18832321840298943, "grad_norm": 71.5, "kl": 1.10858154296875, "learning_rate": 5e-07, "logits/chosen": -27852640.0, "logits/rejected": -9261208.0, "logps/chosen": -596.1210123697916, "logps/rejected": -140.9537353515625, "loss": 0.3314, "rewards/chosen": 0.9994756380716959, "rewards/margins": 1.397443930308024, "rewards/rejected": -0.3979682922363281, "step": 3553 }, { "epoch": 0.18837622240479157, "grad_norm": 43.0, "kl": 0.31733131408691406, "learning_rate": 5e-07, "logits/chosen": -17525154.0, "logits/rejected": -15176406.0, "logps/chosen": -183.5657501220703, "logps/rejected": -126.72444915771484, "loss": 0.4193, "rewards/chosen": -0.16335947811603546, "rewards/margins": 0.7460271567106247, "rewards/rejected": -0.9093866348266602, "step": 3554 }, { "epoch": 0.1884292264065937, "grad_norm": 66.5, "kl": 0.5971794128417969, "learning_rate": 5e-07, "logits/chosen": -29149331.2, "logits/rejected": -41454677.333333336, "logps/chosen": -353.802490234375, "logps/rejected": -560.980712890625, "loss": 0.2942, "rewards/chosen": 0.43131532669067385, "rewards/margins": 2.7891766866048178, "rewards/rejected": -2.357861359914144, "step": 3555 }, { "epoch": 0.18848223040839585, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17671676.8, "logits/rejected": -26319413.333333332, "logps/chosen": -299.89970703125, "logps/rejected": -299.4001057942708, "loss": 0.3989, "rewards/chosen": 0.07691956162452698, "rewards/margins": 1.2192886730035146, "rewards/rejected": -1.1423691113789876, "step": 3556 }, { "epoch": 0.18853523441019798, "grad_norm": 60.25, "kl": 0.10586166381835938, "learning_rate": 5e-07, "logits/chosen": -69387322.66666667, "logits/rejected": -16637209.6, "logps/chosen": -364.1781005859375, "logps/rejected": -345.3388916015625, "loss": 0.2986, "rewards/chosen": 0.22525330384572348, "rewards/margins": 1.660550888379415, "rewards/rejected": -1.4352975845336915, "step": 3557 }, { "epoch": 0.18858823841200012, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16476448.0, "logits/rejected": -18788726.4, "logps/chosen": -510.2307535807292, "logps/rejected": -154.93812255859376, "loss": 0.2115, "rewards/chosen": 1.1694340705871582, "rewards/margins": 2.684881496429443, "rewards/rejected": -1.5154474258422852, "step": 3558 }, { "epoch": 0.18864124241380223, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51515824.0, "logits/rejected": -14187686.4, "logps/chosen": -408.544677734375, "logps/rejected": -222.1146240234375, "loss": 0.3681, "rewards/chosen": -0.2684641679128011, "rewards/margins": 0.9823400656382244, "rewards/rejected": -1.2508042335510254, "step": 3559 }, { "epoch": 0.18869424641560437, "grad_norm": 49.5, "kl": 0.6301021575927734, "learning_rate": 5e-07, "logits/chosen": -16023901.333333334, "logits/rejected": -15580428.8, "logps/chosen": -538.4927571614584, "logps/rejected": -510.41953125, "loss": 0.2002, "rewards/chosen": 0.4529673258463542, "rewards/margins": 3.513337771097819, "rewards/rejected": -3.060370445251465, "step": 3560 }, { "epoch": 0.1887472504174065, "grad_norm": 54.5, "kl": 0.7359657287597656, "learning_rate": 5e-07, "logits/chosen": -14219080.0, "logits/rejected": -2554921.6666666665, "logps/chosen": -246.8435546875, "logps/rejected": -188.08553059895834, "loss": 0.3309, "rewards/chosen": 0.41816158294677735, "rewards/margins": 2.0470883369445803, "rewards/rejected": -1.6289267539978027, "step": 3561 }, { "epoch": 0.18880025441920864, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8331946.4, "logits/rejected": -45417840.0, "logps/chosen": -120.127490234375, "logps/rejected": -399.7526448567708, "loss": 0.3724, "rewards/chosen": -0.03281651139259338, "rewards/margins": 1.9134725511074067, "rewards/rejected": -1.9462890625, "step": 3562 }, { "epoch": 0.18885325842101078, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22015454.0, "logits/rejected": -2472888.75, "logps/chosen": -186.9502410888672, "logps/rejected": -308.7130126953125, "loss": 0.3136, "rewards/chosen": -0.1750919073820114, "rewards/margins": 2.2404075413942337, "rewards/rejected": -2.415499448776245, "step": 3563 }, { "epoch": 0.18890626242281292, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19056629.333333332, "logits/rejected": -29783030.4, "logps/chosen": -349.7102457682292, "logps/rejected": -430.766796875, "loss": 0.2275, "rewards/chosen": 0.34503785769144696, "rewards/margins": 2.7284895102183024, "rewards/rejected": -2.3834516525268556, "step": 3564 }, { "epoch": 0.18895926642461505, "grad_norm": 62.5, "kl": 0.36354637145996094, "learning_rate": 5e-07, "logits/chosen": -15182361.142857144, "logits/rejected": -24013098.0, "logps/chosen": -277.0750034877232, "logps/rejected": -545.8380126953125, "loss": 0.4192, "rewards/chosen": 0.18920091220310756, "rewards/margins": 2.487809453691755, "rewards/rejected": -2.2986085414886475, "step": 3565 }, { "epoch": 0.1890122704264172, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60692192.0, "logits/rejected": -12235084.0, "logps/chosen": -269.14593505859375, "logps/rejected": -455.90740966796875, "loss": 0.2699, "rewards/chosen": 0.12504330277442932, "rewards/margins": 2.7622129023075104, "rewards/rejected": -2.637169599533081, "step": 3566 }, { "epoch": 0.18906527442821933, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12690679.0, "logits/rejected": -29363925.333333332, "logps/chosen": -89.65748596191406, "logps/rejected": -329.35675048828125, "loss": 0.2603, "rewards/chosen": -0.07520179450511932, "rewards/margins": 1.5730702430009842, "rewards/rejected": -1.6482720375061035, "step": 3567 }, { "epoch": 0.18911827843002146, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3641705.0, "logits/rejected": -37152412.0, "logps/chosen": -218.955078125, "logps/rejected": -447.73345947265625, "loss": 0.3176, "rewards/chosen": 0.17279481887817383, "rewards/margins": 1.8247487545013428, "rewards/rejected": -1.651953935623169, "step": 3568 }, { "epoch": 0.1891712824318236, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -113392952.0, "logits/rejected": -413348.0, "logps/chosen": -316.86431884765625, "logps/rejected": -441.1109619140625, "loss": 0.2368, "rewards/chosen": -0.2901649475097656, "rewards/margins": 1.8144137064615884, "rewards/rejected": -2.104578653971354, "step": 3569 }, { "epoch": 0.18922428643362574, "grad_norm": 57.5, "kl": 1.1853179931640625, "learning_rate": 5e-07, "logits/chosen": -12494353.0, "logits/rejected": -46752152.0, "logps/chosen": -530.1798095703125, "logps/rejected": -585.7034912109375, "loss": 0.2088, "rewards/chosen": 1.0528168678283691, "rewards/margins": 3.4408364295959473, "rewards/rejected": -2.388019561767578, "step": 3570 }, { "epoch": 0.18927729043542788, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17737206.0, "logits/rejected": -29239840.0, "logps/chosen": -151.04318237304688, "logps/rejected": -451.0869445800781, "loss": 0.2675, "rewards/chosen": 0.43076157569885254, "rewards/margins": 2.625680685043335, "rewards/rejected": -2.1949191093444824, "step": 3571 }, { "epoch": 0.18933029443723, "grad_norm": 47.5, "kl": 0.4252815246582031, "learning_rate": 5e-07, "logits/chosen": -20228526.666666668, "logits/rejected": -22029996.8, "logps/chosen": -419.5615234375, "logps/rejected": -190.6283447265625, "loss": 0.2837, "rewards/chosen": 0.48311007022857666, "rewards/margins": 1.900169587135315, "rewards/rejected": -1.4170595169067384, "step": 3572 }, { "epoch": 0.18938329843903215, "grad_norm": 65.0, "kl": 0.2952117919921875, "learning_rate": 5e-07, "logits/chosen": -32102764.8, "logits/rejected": -246214.33333333334, "logps/chosen": -630.61357421875, "logps/rejected": -136.04649861653647, "loss": 0.3607, "rewards/chosen": 0.6690390110015869, "rewards/margins": 1.3525969187418618, "rewards/rejected": -0.683557907740275, "step": 3573 }, { "epoch": 0.1894363024408343, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3930720.75, "logits/rejected": -18487414.0, "logps/chosen": -163.0706024169922, "logps/rejected": -171.16644287109375, "loss": 0.3143, "rewards/chosen": 0.3431810140609741, "rewards/margins": 1.6797292232513428, "rewards/rejected": -1.3365482091903687, "step": 3574 }, { "epoch": 0.18948930644263642, "grad_norm": 94.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22741730.0, "logits/rejected": 25446348.0, "logps/chosen": -288.109375, "logps/rejected": -390.0985107421875, "loss": 0.3666, "rewards/chosen": 0.3245546221733093, "rewards/margins": 1.179081916809082, "rewards/rejected": -0.8545272946357727, "step": 3575 }, { "epoch": 0.18954231044443856, "grad_norm": 47.75, "kl": 0.4561271667480469, "learning_rate": 5e-07, "logits/chosen": -26684132.0, "logits/rejected": -4461073.5, "logps/chosen": -295.76593017578125, "logps/rejected": -237.225830078125, "loss": 0.2433, "rewards/chosen": 0.7083816528320312, "rewards/margins": 3.5078344345092773, "rewards/rejected": -2.799452781677246, "step": 3576 }, { "epoch": 0.1895953144462407, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8299570.0, "logits/rejected": -16933340.8, "logps/chosen": -162.60039265950522, "logps/rejected": -159.16348876953126, "loss": 0.2717, "rewards/chosen": 0.43554170926411945, "rewards/margins": 1.9847070535024007, "rewards/rejected": -1.5491653442382813, "step": 3577 }, { "epoch": 0.18964831844804284, "grad_norm": 53.75, "kl": 0.2573089599609375, "learning_rate": 5e-07, "logits/chosen": -11559164.0, "logits/rejected": 1613150.0, "logps/chosen": -325.484912109375, "logps/rejected": -74.32583618164062, "loss": 0.3752, "rewards/chosen": 0.07613441348075867, "rewards/margins": 1.6727138062318165, "rewards/rejected": -1.5965793927510579, "step": 3578 }, { "epoch": 0.18970132244984497, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43527500.0, "logits/rejected": -26661192.0, "logps/chosen": -275.80413818359375, "logps/rejected": -387.883544921875, "loss": 0.3431, "rewards/chosen": -0.07656002044677734, "rewards/margins": 1.6364942789077759, "rewards/rejected": -1.7130542993545532, "step": 3579 }, { "epoch": 0.1897543264516471, "grad_norm": 58.75, "kl": 1.2206897735595703, "learning_rate": 5e-07, "logits/chosen": -50292176.0, "logits/rejected": -56184536.0, "logps/chosen": -485.1160481770833, "logps/rejected": -154.23861694335938, "loss": 0.2966, "rewards/chosen": 0.7140328884124756, "rewards/margins": 3.407181739807129, "rewards/rejected": -2.6931488513946533, "step": 3580 }, { "epoch": 0.18980733045344925, "grad_norm": 51.25, "kl": 0.17083358764648438, "learning_rate": 5e-07, "logits/chosen": -33771076.0, "logits/rejected": -24791164.0, "logps/chosen": -273.96868896484375, "logps/rejected": -234.48007202148438, "loss": 0.3108, "rewards/chosen": 0.3440931439399719, "rewards/margins": 1.8134663701057434, "rewards/rejected": -1.4693732261657715, "step": 3581 }, { "epoch": 0.18986033445525138, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11020536.0, "logits/rejected": -44801392.0, "logps/chosen": -244.43159993489584, "logps/rejected": -532.07041015625, "loss": 0.2984, "rewards/chosen": 0.10425619284311931, "rewards/margins": 1.7832290093104046, "rewards/rejected": -1.6789728164672852, "step": 3582 }, { "epoch": 0.18991333845705352, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19987948.0, "logits/rejected": -7089730.4, "logps/chosen": -168.25056966145834, "logps/rejected": -271.625927734375, "loss": 0.3299, "rewards/chosen": 0.3427836100260417, "rewards/margins": 1.5501699129740398, "rewards/rejected": -1.207386302947998, "step": 3583 }, { "epoch": 0.18996634245885563, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19470996.0, "logits/rejected": -52669252.0, "logps/chosen": -352.8747863769531, "logps/rejected": -514.505859375, "loss": 0.3149, "rewards/chosen": 0.02364959940314293, "rewards/margins": 2.206712629646063, "rewards/rejected": -2.18306303024292, "step": 3584 }, { "epoch": 0.19001934646065777, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35843236.571428575, "logits/rejected": -10570324.0, "logps/chosen": -327.27608816964283, "logps/rejected": -934.8323364257812, "loss": 0.4729, "rewards/chosen": -0.1648258822304862, "rewards/margins": 6.79569057055882, "rewards/rejected": -6.960516452789307, "step": 3585 }, { "epoch": 0.1900723504624599, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20875140.0, "logits/rejected": 14397453.333333334, "logps/chosen": -103.05436706542969, "logps/rejected": -353.3927001953125, "loss": 0.3314, "rewards/chosen": -0.31083232164382935, "rewards/margins": 0.9439492424329121, "rewards/rejected": -1.2547815640767415, "step": 3586 }, { "epoch": 0.19012535446426204, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31718412.0, "logits/rejected": -9549280.0, "logps/chosen": -234.8071746826172, "logps/rejected": -116.64910888671875, "loss": 0.3409, "rewards/chosen": 0.2217174470424652, "rewards/margins": 1.6154726445674896, "rewards/rejected": -1.3937551975250244, "step": 3587 }, { "epoch": 0.19017835846606418, "grad_norm": 65.0, "kl": 2.6488075256347656, "learning_rate": 5e-07, "logits/chosen": 6724271.333333333, "logits/rejected": -22522488.0, "logps/chosen": -702.1122233072916, "logps/rejected": -189.62855224609376, "loss": 0.2709, "rewards/chosen": 0.9248514970143636, "rewards/margins": 2.7601832230885823, "rewards/rejected": -1.8353317260742188, "step": 3588 }, { "epoch": 0.19023136246786632, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15006761.0, "logits/rejected": -23797428.0, "logps/chosen": -198.70106506347656, "logps/rejected": -409.7466125488281, "loss": 0.4188, "rewards/chosen": -0.6885793209075928, "rewards/margins": 0.9435876607894897, "rewards/rejected": -1.6321669816970825, "step": 3589 }, { "epoch": 0.19028436646966845, "grad_norm": 59.5, "kl": 0.6841926574707031, "learning_rate": 5e-07, "logits/chosen": -22883261.333333332, "logits/rejected": -58227417.6, "logps/chosen": -636.8634440104166, "logps/rejected": -450.412060546875, "loss": 0.2681, "rewards/chosen": 0.02753448486328125, "rewards/margins": 2.155104064941406, "rewards/rejected": -2.127569580078125, "step": 3590 }, { "epoch": 0.1903373704714706, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22860178.666666668, "logits/rejected": -28587270.4, "logps/chosen": -369.3657633463542, "logps/rejected": -323.46357421875, "loss": 0.2895, "rewards/chosen": 0.39370473225911456, "rewards/margins": 1.8185594876607258, "rewards/rejected": -1.4248547554016113, "step": 3591 }, { "epoch": 0.19039037447327273, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3437882.3333333335, "logits/rejected": -5603504.8, "logps/chosen": -175.09025065104166, "logps/rejected": -54.95833740234375, "loss": 0.3413, "rewards/chosen": -0.13810521364212036, "rewards/margins": 1.2666717410087585, "rewards/rejected": -1.4047769546508788, "step": 3592 }, { "epoch": 0.19044337847507486, "grad_norm": 62.5, "kl": 0.2701301574707031, "learning_rate": 5e-07, "logits/chosen": -28469476.0, "logits/rejected": -17675756.0, "logps/chosen": -243.60775756835938, "logps/rejected": -275.7647705078125, "loss": 0.2952, "rewards/chosen": 0.16470642387866974, "rewards/margins": 2.165169671177864, "rewards/rejected": -2.0004632472991943, "step": 3593 }, { "epoch": 0.190496382476877, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52086168.0, "logits/rejected": -25394418.0, "logps/chosen": -484.2826843261719, "logps/rejected": -284.1918029785156, "loss": 0.2639, "rewards/chosen": 0.3676872253417969, "rewards/margins": 2.4496688842773438, "rewards/rejected": -2.081981658935547, "step": 3594 }, { "epoch": 0.19054938647867914, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62279690.666666664, "logits/rejected": -12258731.2, "logps/chosen": -156.82872517903647, "logps/rejected": -219.9156005859375, "loss": 0.3133, "rewards/chosen": -0.010566840569178263, "rewards/margins": 1.5171585698922474, "rewards/rejected": -1.5277254104614257, "step": 3595 }, { "epoch": 0.19060239048048128, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30094542.0, "logits/rejected": -35433236.0, "logps/chosen": -288.6261291503906, "logps/rejected": -224.4256134033203, "loss": 0.3958, "rewards/chosen": -0.16726809740066528, "rewards/margins": 1.0101551413536072, "rewards/rejected": -1.1774232387542725, "step": 3596 }, { "epoch": 0.1906553944822834, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20758550.0, "logits/rejected": -14651092.0, "logps/chosen": -192.93215942382812, "logps/rejected": -348.28436279296875, "loss": 0.2839, "rewards/chosen": 0.3651348352432251, "rewards/margins": 2.230380058288574, "rewards/rejected": -1.8652452230453491, "step": 3597 }, { "epoch": 0.19070839848408555, "grad_norm": 45.25, "kl": 0.17087173461914062, "learning_rate": 5e-07, "logits/chosen": -21363918.0, "logits/rejected": -35094906.666666664, "logps/chosen": -395.42193603515625, "logps/rejected": -483.0654296875, "loss": 0.2206, "rewards/chosen": 0.0870576873421669, "rewards/margins": 2.6846217488249144, "rewards/rejected": -2.5975640614827475, "step": 3598 }, { "epoch": 0.1907614024858877, "grad_norm": 52.0, "kl": 1.1531257629394531, "learning_rate": 5e-07, "logits/chosen": -12136124.0, "logits/rejected": -55107640.0, "logps/chosen": -165.50874837239584, "logps/rejected": -233.36917114257812, "loss": 0.3995, "rewards/chosen": 0.1541301210721334, "rewards/margins": 2.4142688711484275, "rewards/rejected": -2.260138750076294, "step": 3599 }, { "epoch": 0.19081440648768982, "grad_norm": 55.0, "kl": 0.5396041870117188, "learning_rate": 5e-07, "logits/chosen": -9181937.333333334, "logits/rejected": -30139232.0, "logps/chosen": -297.35813395182294, "logps/rejected": -223.362744140625, "loss": 0.3472, "rewards/chosen": 0.5651947259902954, "rewards/margins": 1.329826283454895, "rewards/rejected": -0.7646315574645997, "step": 3600 }, { "epoch": 0.19086741048949196, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27053768.0, "logits/rejected": -32033484.0, "logps/chosen": -245.35079956054688, "logps/rejected": -295.67535400390625, "loss": 0.2899, "rewards/chosen": 0.511472225189209, "rewards/margins": 1.889714241027832, "rewards/rejected": -1.378242015838623, "step": 3601 }, { "epoch": 0.1909204144912941, "grad_norm": 55.25, "kl": 0.22978973388671875, "learning_rate": 5e-07, "logits/chosen": -12696848.0, "logits/rejected": 4133868.0, "logps/chosen": -275.1455485026042, "logps/rejected": -47.777870178222656, "loss": 0.3885, "rewards/chosen": 0.3525311549504598, "rewards/margins": 1.5703466733296711, "rewards/rejected": -1.2178155183792114, "step": 3602 }, { "epoch": 0.19097341849309624, "grad_norm": 66.0, "kl": 1.1651878356933594, "learning_rate": 5e-07, "logits/chosen": -41085666.666666664, "logits/rejected": -18201648.0, "logps/chosen": -486.2177734375, "logps/rejected": -421.06024169921875, "loss": 0.3668, "rewards/chosen": 0.5895004272460938, "rewards/margins": 1.6582077741622925, "rewards/rejected": -1.0687073469161987, "step": 3603 }, { "epoch": 0.19102642249489837, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21913538.666666668, "logits/rejected": 4283988.0, "logps/chosen": -369.4560546875, "logps/rejected": -371.6742919921875, "loss": 0.3078, "rewards/chosen": -0.19206543763478598, "rewards/margins": 1.488692084948222, "rewards/rejected": -1.6807575225830078, "step": 3604 }, { "epoch": 0.1910794264967005, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57370858.666666664, "logits/rejected": -12141841.6, "logps/chosen": -328.8089192708333, "logps/rejected": -181.38563232421876, "loss": 0.3327, "rewards/chosen": 0.6436309019724528, "rewards/margins": 1.4643614927927653, "rewards/rejected": -0.8207305908203125, "step": 3605 }, { "epoch": 0.19113243049850265, "grad_norm": 57.0, "kl": 0.7150669097900391, "learning_rate": 5e-07, "logits/chosen": -26757996.8, "logits/rejected": -2834662.6666666665, "logps/chosen": -339.5417236328125, "logps/rejected": -116.14312744140625, "loss": 0.2892, "rewards/chosen": 0.9058682441711425, "rewards/margins": 2.043480110168457, "rewards/rejected": -1.1376118659973145, "step": 3606 }, { "epoch": 0.19118543450030479, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7152396.0, "logits/rejected": -31172829.333333332, "logps/chosen": -219.05712890625, "logps/rejected": -228.18721516927084, "loss": 0.3666, "rewards/chosen": 0.02923004627227783, "rewards/margins": 1.9631944576899212, "rewards/rejected": -1.9339644114176433, "step": 3607 }, { "epoch": 0.19123843850210692, "grad_norm": 48.75, "kl": 0.3179473876953125, "learning_rate": 5e-07, "logits/chosen": -18406569.6, "logits/rejected": -6615970.666666667, "logps/chosen": -209.4199951171875, "logps/rejected": -118.47172037760417, "loss": 0.3091, "rewards/chosen": 0.4203042030334473, "rewards/margins": 2.3071345329284667, "rewards/rejected": -1.8868303298950195, "step": 3608 }, { "epoch": 0.19129144250390903, "grad_norm": 55.0, "kl": 0.36687660217285156, "learning_rate": 5e-07, "logits/chosen": -48768036.571428575, "logits/rejected": -2672477.5, "logps/chosen": -152.45277622767858, "logps/rejected": -101.9052734375, "loss": 0.4619, "rewards/chosen": 0.11250074420656477, "rewards/margins": 0.9385864308902195, "rewards/rejected": -0.8260856866836548, "step": 3609 }, { "epoch": 0.19134444650571117, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51187804.8, "logits/rejected": -29042370.666666668, "logps/chosen": -296.67626953125, "logps/rejected": -255.60076904296875, "loss": 0.3428, "rewards/chosen": 0.2175915479660034, "rewards/margins": 1.8739575783411662, "rewards/rejected": -1.6563660303751628, "step": 3610 }, { "epoch": 0.1913974505075133, "grad_norm": 48.75, "kl": 0.09462738037109375, "learning_rate": 5e-07, "logits/chosen": -50134496.0, "logits/rejected": -11996114.285714285, "logps/chosen": -216.64822387695312, "logps/rejected": -201.757568359375, "loss": 0.2369, "rewards/chosen": -0.02098541334271431, "rewards/margins": 1.454777362623385, "rewards/rejected": -1.4757627759660994, "step": 3611 }, { "epoch": 0.19145045450931544, "grad_norm": 75.0, "kl": 3.5090179443359375, "learning_rate": 5e-07, "logits/chosen": -20010878.666666668, "logits/rejected": -26139336.0, "logps/chosen": -597.0894775390625, "logps/rejected": -236.74609375, "loss": 0.3684, "rewards/chosen": 0.8277180989583334, "rewards/margins": 2.4342114528020224, "rewards/rejected": -1.606493353843689, "step": 3612 }, { "epoch": 0.19150345851111758, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19717218.0, "logits/rejected": -19293010.0, "logps/chosen": -312.7281188964844, "logps/rejected": -394.4917907714844, "loss": 0.3069, "rewards/chosen": 0.49837207794189453, "rewards/margins": 2.2927156686782837, "rewards/rejected": -1.7943435907363892, "step": 3613 }, { "epoch": 0.19155646251291972, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20824010.0, "logits/rejected": -1185581.3333333333, "logps/chosen": -208.4959716796875, "logps/rejected": -291.7091064453125, "loss": 0.3039, "rewards/chosen": 0.06248035281896591, "rewards/margins": 1.4828250234325726, "rewards/rejected": -1.4203446706136067, "step": 3614 }, { "epoch": 0.19160946651472185, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -144030.6875, "logits/rejected": -52565776.0, "logps/chosen": -242.83656311035156, "logps/rejected": -349.17645263671875, "loss": 0.268, "rewards/chosen": 0.6456419229507446, "rewards/margins": 2.2668988704681396, "rewards/rejected": -1.621256947517395, "step": 3615 }, { "epoch": 0.191662470516524, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21022106.0, "logits/rejected": -11122883.0, "logps/chosen": -395.7933044433594, "logps/rejected": -357.26092529296875, "loss": 0.3473, "rewards/chosen": 0.1185886338353157, "rewards/margins": 1.5932683423161507, "rewards/rejected": -1.474679708480835, "step": 3616 }, { "epoch": 0.19171547451832613, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58924492.0, "logits/rejected": -32747030.0, "logps/chosen": -155.65476989746094, "logps/rejected": -382.0306396484375, "loss": 0.3193, "rewards/chosen": -0.05702095851302147, "rewards/margins": 2.113953772932291, "rewards/rejected": -2.1709747314453125, "step": 3617 }, { "epoch": 0.19176847852012827, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93193120.0, "logits/rejected": -15567288.0, "logps/chosen": -546.092041015625, "logps/rejected": -349.1340637207031, "loss": 0.363, "rewards/chosen": -0.24249878525733948, "rewards/margins": 1.798053354024887, "rewards/rejected": -2.0405521392822266, "step": 3618 }, { "epoch": 0.1918214825219304, "grad_norm": 40.0, "kl": 0.06866455078125, "learning_rate": 5e-07, "logits/chosen": -15992713.0, "logits/rejected": -25817661.333333332, "logps/chosen": -207.77171325683594, "logps/rejected": -275.6842854817708, "loss": 0.1943, "rewards/chosen": 0.2536637485027313, "rewards/margins": 2.492643723885218, "rewards/rejected": -2.238979975382487, "step": 3619 }, { "epoch": 0.19187448652373254, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 71427208.0, "logits/rejected": -11659793.333333334, "logps/chosen": -60.23554229736328, "logps/rejected": -174.6922403971354, "loss": 0.2546, "rewards/chosen": 0.009918596595525742, "rewards/margins": 1.8375238440930843, "rewards/rejected": -1.8276052474975586, "step": 3620 }, { "epoch": 0.19192749052553468, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25735117.333333332, "logits/rejected": -18291222.4, "logps/chosen": -245.70796712239584, "logps/rejected": -385.87041015625, "loss": 0.2123, "rewards/chosen": 0.5518035888671875, "rewards/margins": 2.9793878555297852, "rewards/rejected": -2.4275842666625977, "step": 3621 }, { "epoch": 0.19198049452733681, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57615480.0, "logits/rejected": -21620306.666666668, "logps/chosen": -344.5340881347656, "logps/rejected": -263.80072021484375, "loss": 0.3247, "rewards/chosen": 0.09390487521886826, "rewards/margins": 1.1230087106426556, "rewards/rejected": -1.0291038354237874, "step": 3622 }, { "epoch": 0.19203349852913895, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 812639.0, "logits/rejected": -16861474.666666668, "logps/chosen": -232.17642211914062, "logps/rejected": -237.6537068684896, "loss": 0.2492, "rewards/chosen": -0.11552724242210388, "rewards/margins": 1.8350202937920888, "rewards/rejected": -1.9505475362141926, "step": 3623 }, { "epoch": 0.1920865025309411, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48924464.0, "logits/rejected": -29098931.2, "logps/chosen": -420.6804606119792, "logps/rejected": -551.4654296875, "loss": 0.1787, "rewards/chosen": 0.7058522701263428, "rewards/margins": 3.3982520580291746, "rewards/rejected": -2.692399787902832, "step": 3624 }, { "epoch": 0.19213950653274323, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36970036.0, "logits/rejected": -22306808.0, "logps/chosen": -310.8706359863281, "logps/rejected": -162.57847595214844, "loss": 0.3307, "rewards/chosen": 0.20430642366409302, "rewards/margins": 1.57720547914505, "rewards/rejected": -1.372899055480957, "step": 3625 }, { "epoch": 0.19219251053454536, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12436425.333333334, "logits/rejected": -51410448.0, "logps/chosen": -263.6527506510417, "logps/rejected": -262.6287536621094, "loss": 0.3506, "rewards/chosen": 0.44518017768859863, "rewards/margins": 2.0858635902404785, "rewards/rejected": -1.6406834125518799, "step": 3626 }, { "epoch": 0.1922455145363475, "grad_norm": 68.0, "kl": 0.10997867584228516, "learning_rate": 5e-07, "logits/chosen": -42857216.0, "logits/rejected": -51917552.0, "logps/chosen": -285.4539306640625, "logps/rejected": -454.5799560546875, "loss": 0.328, "rewards/chosen": 0.47966575622558594, "rewards/margins": 1.938682238260905, "rewards/rejected": -1.459016482035319, "step": 3627 }, { "epoch": 0.19229851853814964, "grad_norm": 97.5, "kl": 4.699895858764648, "learning_rate": 5e-07, "logits/chosen": -37728411.428571425, "logits/rejected": 396186624.0, "logps/chosen": -824.6594587053571, "logps/rejected": -452.62353515625, "loss": 0.42, "rewards/chosen": 0.704580579485212, "rewards/margins": 1.8538633925574166, "rewards/rejected": -1.1492828130722046, "step": 3628 }, { "epoch": 0.19235152253995177, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24551946.666666668, "logits/rejected": -12559532.8, "logps/chosen": -151.26126098632812, "logps/rejected": -200.2971923828125, "loss": 0.2776, "rewards/chosen": 0.7022629578908285, "rewards/margins": 1.8863032182057697, "rewards/rejected": -1.1840402603149414, "step": 3629 }, { "epoch": 0.1924045265417539, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17277178.666666668, "logits/rejected": -6166633.6, "logps/chosen": -367.0804850260417, "logps/rejected": -289.9677490234375, "loss": 0.2972, "rewards/chosen": 0.1106613278388977, "rewards/margins": 2.1950826764106752, "rewards/rejected": -2.0844213485717775, "step": 3630 }, { "epoch": 0.19245753054355605, "grad_norm": 55.5, "kl": 0.2022838592529297, "learning_rate": 5e-07, "logits/chosen": -14699684.8, "logits/rejected": -28750048.0, "logps/chosen": -261.4078125, "logps/rejected": -339.8590901692708, "loss": 0.3092, "rewards/chosen": 0.4289459228515625, "rewards/margins": 2.178788248697917, "rewards/rejected": -1.7498423258463542, "step": 3631 }, { "epoch": 0.1925105345453582, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50001221.333333336, "logits/rejected": -32854896.0, "logps/chosen": -352.5830078125, "logps/rejected": -256.94375, "loss": 0.2468, "rewards/chosen": 0.3494720458984375, "rewards/margins": 2.239731788635254, "rewards/rejected": -1.8902597427368164, "step": 3632 }, { "epoch": 0.19256353854716032, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12989636.0, "logits/rejected": -4439910.5, "logps/chosen": -112.78997802734375, "logps/rejected": -160.96311950683594, "loss": 0.3078, "rewards/chosen": 0.23544491827487946, "rewards/margins": 2.234206333756447, "rewards/rejected": -1.9987614154815674, "step": 3633 }, { "epoch": 0.19261654254896246, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35515994.666666664, "logits/rejected": 24875763.2, "logps/chosen": -635.4957682291666, "logps/rejected": -285.81064453125, "loss": 0.2654, "rewards/chosen": 0.40339461962382, "rewards/margins": 2.3773390928904217, "rewards/rejected": -1.9739444732666016, "step": 3634 }, { "epoch": 0.19266954655076457, "grad_norm": 60.75, "kl": 0.1420440673828125, "learning_rate": 5e-07, "logits/chosen": -23825920.0, "logits/rejected": -56655576.0, "logps/chosen": -329.60672433035717, "logps/rejected": -641.5016479492188, "loss": 0.4275, "rewards/chosen": 0.08316395112446376, "rewards/margins": 3.5632421714918956, "rewards/rejected": -3.4800782203674316, "step": 3635 }, { "epoch": 0.1927225505525667, "grad_norm": 47.0, "kl": 1.1426887512207031, "learning_rate": 5e-07, "logits/chosen": -28979542.4, "logits/rejected": -27031968.0, "logps/chosen": -266.600634765625, "logps/rejected": -241.3578084309896, "loss": 0.3648, "rewards/chosen": 0.27167272567749023, "rewards/margins": 1.888665994008382, "rewards/rejected": -1.6169932683308919, "step": 3636 }, { "epoch": 0.19277555455436884, "grad_norm": 85.0, "kl": 1.103729248046875, "learning_rate": 5e-07, "logits/chosen": -17219536.0, "logits/rejected": -17947613.333333332, "logps/chosen": -470.4287109375, "logps/rejected": -420.9283447265625, "loss": 0.2834, "rewards/chosen": 0.8668018341064453, "rewards/margins": 2.4766655921936036, "rewards/rejected": -1.6098637580871582, "step": 3637 }, { "epoch": 0.19282855855617098, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46121748.0, "logits/rejected": -8493514.0, "logps/chosen": -563.5538940429688, "logps/rejected": -361.67584228515625, "loss": 0.2745, "rewards/chosen": 0.5770881772041321, "rewards/margins": 2.4348438382148743, "rewards/rejected": -1.8577556610107422, "step": 3638 }, { "epoch": 0.19288156255797312, "grad_norm": 59.75, "kl": 0.40489768981933594, "learning_rate": 5e-07, "logits/chosen": -30854877.333333332, "logits/rejected": -5708897.5, "logps/chosen": -372.7799479166667, "logps/rejected": -50.877864837646484, "loss": 0.4211, "rewards/chosen": 0.31458983818689984, "rewards/margins": 0.989246944586436, "rewards/rejected": -0.6746571063995361, "step": 3639 }, { "epoch": 0.19293456655977526, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40071177.14285714, "logits/rejected": 7558234.0, "logps/chosen": -315.826171875, "logps/rejected": -87.59097290039062, "loss": 0.4557, "rewards/chosen": 0.14582017489842006, "rewards/margins": 0.6060695094721658, "rewards/rejected": -0.4602493345737457, "step": 3640 }, { "epoch": 0.1929875705615774, "grad_norm": 72.5, "kl": 0.138946533203125, "learning_rate": 5e-07, "logits/chosen": -54629062.4, "logits/rejected": 13335650.666666666, "logps/chosen": -618.006640625, "logps/rejected": -168.7509765625, "loss": 0.4426, "rewards/chosen": -0.1713830590248108, "rewards/margins": 0.9490640997886658, "rewards/rejected": -1.1204471588134766, "step": 3641 }, { "epoch": 0.19304057456337953, "grad_norm": 63.75, "kl": 1.8814697265625, "learning_rate": 5e-07, "logits/chosen": -33178464.0, "logits/rejected": -25359850.0, "logps/chosen": -435.0776774088542, "logps/rejected": -302.130615234375, "loss": 0.3853, "rewards/chosen": 0.2581835587819417, "rewards/margins": 3.63144858678182, "rewards/rejected": -3.373265027999878, "step": 3642 }, { "epoch": 0.19309357856518167, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19800128.0, "logits/rejected": -21698052.0, "logps/chosen": -130.48602294921875, "logps/rejected": -323.1748352050781, "loss": 0.4107, "rewards/chosen": -0.3376983404159546, "rewards/margins": 0.8956412076950073, "rewards/rejected": -1.233339548110962, "step": 3643 }, { "epoch": 0.1931465825669838, "grad_norm": 112.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37664024.0, "logits/rejected": -49388710.4, "logps/chosen": -364.8662923177083, "logps/rejected": -180.92376708984375, "loss": 0.3338, "rewards/chosen": -0.14793052275975546, "rewards/margins": 1.2492385903994243, "rewards/rejected": -1.3971691131591797, "step": 3644 }, { "epoch": 0.19319958656878594, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11097469.333333334, "logits/rejected": -22299611.2, "logps/chosen": -309.3870035807292, "logps/rejected": -363.334814453125, "loss": 0.2159, "rewards/chosen": 0.3861094315846761, "rewards/margins": 2.935070498784383, "rewards/rejected": -2.548961067199707, "step": 3645 }, { "epoch": 0.19325259057058808, "grad_norm": 46.75, "kl": 0.007121562957763672, "learning_rate": 5e-07, "logits/chosen": -62384616.0, "logits/rejected": -11406468.0, "logps/chosen": -353.0624694824219, "logps/rejected": -121.58083089192708, "loss": 0.3305, "rewards/chosen": 0.33106154203414917, "rewards/margins": 1.2190059224764505, "rewards/rejected": -0.8879443804423014, "step": 3646 }, { "epoch": 0.19330559457239022, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27207622.4, "logits/rejected": -22372698.666666668, "logps/chosen": -698.82001953125, "logps/rejected": -251.68743896484375, "loss": 0.3017, "rewards/chosen": 0.5073190212249756, "rewards/margins": 2.287902084986369, "rewards/rejected": -1.7805830637613933, "step": 3647 }, { "epoch": 0.19335859857419235, "grad_norm": 47.5, "kl": 1.8090457916259766, "learning_rate": 5e-07, "logits/chosen": -38946483.2, "logits/rejected": -24111226.666666668, "logps/chosen": -546.471142578125, "logps/rejected": -668.6068522135416, "loss": 0.2639, "rewards/chosen": 0.8405855178833008, "rewards/margins": 3.7976790746053064, "rewards/rejected": -2.9570935567220054, "step": 3648 }, { "epoch": 0.1934116025759945, "grad_norm": 55.75, "kl": 0.022281646728515625, "learning_rate": 5e-07, "logits/chosen": -85705016.0, "logits/rejected": -25717662.0, "logps/chosen": -371.38641357421875, "logps/rejected": -209.982666015625, "loss": 0.2993, "rewards/chosen": -0.03618631511926651, "rewards/margins": 2.4131266102194786, "rewards/rejected": -2.449312925338745, "step": 3649 }, { "epoch": 0.19346460657779663, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42450386.666666664, "logits/rejected": -33065728.0, "logps/chosen": -364.6871337890625, "logps/rejected": -393.98992919921875, "loss": 0.2918, "rewards/chosen": 0.6685603459676107, "rewards/margins": 2.8028882344563804, "rewards/rejected": -2.1343278884887695, "step": 3650 }, { "epoch": 0.19351761057959876, "grad_norm": 62.0, "kl": 1.1420974731445312, "learning_rate": 5e-07, "logits/chosen": -18225672.0, "logits/rejected": -38918544.0, "logps/chosen": -649.2023518880209, "logps/rejected": -341.3932800292969, "loss": 0.3732, "rewards/chosen": 0.3612145185470581, "rewards/margins": 2.1113080978393555, "rewards/rejected": -1.7500935792922974, "step": 3651 }, { "epoch": 0.1935706145814009, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 894883.6666666666, "logits/rejected": -27153766.4, "logps/chosen": -231.7584431966146, "logps/rejected": -212.6456787109375, "loss": 0.2928, "rewards/chosen": 0.8310019175211588, "rewards/margins": 1.8183211962382, "rewards/rejected": -0.9873192787170411, "step": 3652 }, { "epoch": 0.19362361858320304, "grad_norm": 86.5, "kl": 0.5129165649414062, "learning_rate": 5e-07, "logits/chosen": -31705529.6, "logits/rejected": -39082549.333333336, "logps/chosen": -323.689208984375, "logps/rejected": -447.5994466145833, "loss": 0.3684, "rewards/chosen": 0.46247272491455077, "rewards/margins": 1.9715013186136883, "rewards/rejected": -1.5090285936991374, "step": 3653 }, { "epoch": 0.19367662258500518, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32510772.0, "logits/rejected": -33088724.0, "logps/chosen": -385.5508117675781, "logps/rejected": -271.4505920410156, "loss": 0.3364, "rewards/chosen": -0.14251059293746948, "rewards/margins": 1.9257763028144836, "rewards/rejected": -2.068286895751953, "step": 3654 }, { "epoch": 0.1937296265868073, "grad_norm": 47.25, "kl": 0.3597879409790039, "learning_rate": 5e-07, "logits/chosen": 8070886.666666667, "logits/rejected": -29078694.4, "logps/chosen": -65.15578206380208, "logps/rejected": -304.69638671875, "loss": 0.2524, "rewards/chosen": 0.4767828782399495, "rewards/margins": 2.3350796540578207, "rewards/rejected": -1.858296775817871, "step": 3655 }, { "epoch": 0.19378263058860945, "grad_norm": 54.0, "kl": 0.1372833251953125, "learning_rate": 5e-07, "logits/chosen": -23606389.333333332, "logits/rejected": -180849.625, "logps/chosen": -165.86957804361978, "logps/rejected": -49.9482421875, "loss": 0.4851, "rewards/chosen": 0.030021660029888153, "rewards/margins": 0.23650797456502914, "rewards/rejected": -0.206486314535141, "step": 3656 }, { "epoch": 0.1938356345904116, "grad_norm": 29.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3855703.75, "logits/rejected": -12844453.333333334, "logps/chosen": -199.30738830566406, "logps/rejected": -420.463134765625, "loss": 0.1749, "rewards/chosen": 0.26670876145362854, "rewards/margins": 2.9752207497755685, "rewards/rejected": -2.70851198832194, "step": 3657 }, { "epoch": 0.19388863859221372, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17445556.0, "logits/rejected": 31880146.666666668, "logps/chosen": -487.91949462890625, "logps/rejected": -353.2117513020833, "loss": 0.3124, "rewards/chosen": 0.202564999461174, "rewards/margins": 1.4565633585055668, "rewards/rejected": -1.2539983590443928, "step": 3658 }, { "epoch": 0.19394164259401586, "grad_norm": 47.5, "kl": 0.25037384033203125, "learning_rate": 5e-07, "logits/chosen": -25797762.0, "logits/rejected": -14485980.0, "logps/chosen": -186.24935913085938, "logps/rejected": -273.83856201171875, "loss": 0.2998, "rewards/chosen": 0.3235210180282593, "rewards/margins": 2.1279157400131226, "rewards/rejected": -1.8043947219848633, "step": 3659 }, { "epoch": 0.19399464659581797, "grad_norm": 45.25, "kl": 1.6008148193359375, "learning_rate": 5e-07, "logits/chosen": -32897401.6, "logits/rejected": -19575068.0, "logps/chosen": -527.73125, "logps/rejected": -280.95599365234375, "loss": 0.3352, "rewards/chosen": 0.6906763553619385, "rewards/margins": 2.717941586176554, "rewards/rejected": -2.0272652308146157, "step": 3660 }, { "epoch": 0.1940476505976201, "grad_norm": 80.0, "kl": 0.028141021728515625, "learning_rate": 5e-07, "logits/chosen": -22583478.85714286, "logits/rejected": -2217901.75, "logps/chosen": -450.3357631138393, "logps/rejected": -77.45924377441406, "loss": 0.4424, "rewards/chosen": 0.12704316207340785, "rewards/margins": 1.1466843230383736, "rewards/rejected": -1.0196411609649658, "step": 3661 }, { "epoch": 0.19410065459942225, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6152915.0, "logits/rejected": -39854901.333333336, "logps/chosen": -112.06629943847656, "logps/rejected": -464.7061360677083, "loss": 0.247, "rewards/chosen": -0.18416938185691833, "rewards/margins": 2.0056400994459787, "rewards/rejected": -2.189809481302897, "step": 3662 }, { "epoch": 0.19415365860122438, "grad_norm": 54.75, "kl": 1.11932373046875, "learning_rate": 5e-07, "logits/chosen": -39035900.8, "logits/rejected": -21420097.333333332, "logps/chosen": -440.390185546875, "logps/rejected": -264.3653564453125, "loss": 0.3798, "rewards/chosen": 0.19672009944915772, "rewards/margins": 1.7926762660344442, "rewards/rejected": -1.5959561665852864, "step": 3663 }, { "epoch": 0.19420666260302652, "grad_norm": 36.5, "kl": 0.279693603515625, "learning_rate": 5e-07, "logits/chosen": -6077472.0, "logits/rejected": -31436936.0, "logps/chosen": -175.40220642089844, "logps/rejected": -289.1998291015625, "loss": 0.2238, "rewards/chosen": 0.6983265280723572, "rewards/margins": 2.8783106207847595, "rewards/rejected": -2.1799840927124023, "step": 3664 }, { "epoch": 0.19425966660482866, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24907698.666666668, "logits/rejected": -11566790.4, "logps/chosen": -206.59476725260416, "logps/rejected": -238.142431640625, "loss": 0.2542, "rewards/chosen": 0.8467667897542318, "rewards/margins": 2.297458585103353, "rewards/rejected": -1.4506917953491212, "step": 3665 }, { "epoch": 0.1943126706066308, "grad_norm": 83.5, "kl": 1.7688674926757812, "learning_rate": 5e-07, "logits/chosen": -34920933.333333336, "logits/rejected": -18120846.0, "logps/chosen": -637.1330159505209, "logps/rejected": -197.09674072265625, "loss": 0.4034, "rewards/chosen": 0.37679187456766766, "rewards/margins": 1.9629664818445842, "rewards/rejected": -1.5861746072769165, "step": 3666 }, { "epoch": 0.19436567460843293, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72202457.6, "logits/rejected": -21060561.333333332, "logps/chosen": -270.23720703125, "logps/rejected": -373.8514404296875, "loss": 0.293, "rewards/chosen": 0.4343863487243652, "rewards/margins": 2.625585651397705, "rewards/rejected": -2.19119930267334, "step": 3667 }, { "epoch": 0.19441867861023507, "grad_norm": 44.75, "kl": 0.15224456787109375, "learning_rate": 5e-07, "logits/chosen": -35170256.0, "logits/rejected": -11087704.0, "logps/chosen": -361.8529357910156, "logps/rejected": -323.3692626953125, "loss": 0.3193, "rewards/chosen": 0.11151637881994247, "rewards/margins": 2.423238180577755, "rewards/rejected": -2.3117218017578125, "step": 3668 }, { "epoch": 0.1944716826120372, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14178244.0, "logits/rejected": -6163920.5, "logps/chosen": -125.6038818359375, "logps/rejected": -226.09103393554688, "loss": 0.3065, "rewards/chosen": 0.3150388300418854, "rewards/margins": 1.8092389404773712, "rewards/rejected": -1.4942001104354858, "step": 3669 }, { "epoch": 0.19452468661383934, "grad_norm": 60.0, "kl": 0.556488037109375, "learning_rate": 5e-07, "logits/chosen": -14773360.0, "logits/rejected": 4721954.0, "logps/chosen": -290.8189697265625, "logps/rejected": -131.78907267252603, "loss": 0.3059, "rewards/chosen": 0.6449345588684082, "rewards/margins": 2.073835277557373, "rewards/rejected": -1.4289007186889648, "step": 3670 }, { "epoch": 0.19457769061564148, "grad_norm": 44.5, "kl": 0.8247346878051758, "learning_rate": 5e-07, "logits/chosen": -5303029.666666667, "logits/rejected": -36748198.4, "logps/chosen": -277.16648356119794, "logps/rejected": -491.51171875, "loss": 0.2672, "rewards/chosen": 0.43204553922017414, "rewards/margins": 2.501145346959432, "rewards/rejected": -2.0690998077392577, "step": 3671 }, { "epoch": 0.19463069461744362, "grad_norm": 50.75, "kl": 0.5396270751953125, "learning_rate": 5e-07, "logits/chosen": -31840066.666666668, "logits/rejected": -59479840.0, "logps/chosen": -367.6344807942708, "logps/rejected": -249.763427734375, "loss": 0.2859, "rewards/chosen": 0.5738568305969238, "rewards/margins": 2.118226718902588, "rewards/rejected": -1.544369888305664, "step": 3672 }, { "epoch": 0.19468369861924575, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72247296.0, "logits/rejected": -9162602.285714285, "logps/chosen": -779.1948852539062, "logps/rejected": -179.2303466796875, "loss": 0.2779, "rewards/chosen": 0.4537719786167145, "rewards/margins": 1.6745122969150543, "rewards/rejected": -1.2207403182983398, "step": 3673 }, { "epoch": 0.1947367026210479, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19761868.0, "logits/rejected": -18636098.0, "logps/chosen": -232.79220581054688, "logps/rejected": -500.3263244628906, "loss": 0.2975, "rewards/chosen": 0.3521243631839752, "rewards/margins": 2.921884208917618, "rewards/rejected": -2.5697598457336426, "step": 3674 }, { "epoch": 0.19478970662285003, "grad_norm": 51.25, "kl": 1.4202651977539062, "learning_rate": 5e-07, "logits/chosen": -29585324.0, "logits/rejected": -33150948.0, "logps/chosen": -401.6961975097656, "logps/rejected": -283.3778076171875, "loss": 0.3485, "rewards/chosen": 0.09925347566604614, "rewards/margins": 1.8189072012901306, "rewards/rejected": -1.7196537256240845, "step": 3675 }, { "epoch": 0.19484271062465217, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26504296.0, "logits/rejected": 19315832.0, "logps/chosen": -229.40858459472656, "logps/rejected": -692.6116943359375, "loss": 0.2459, "rewards/chosen": 0.4013214409351349, "rewards/margins": 3.486698418855667, "rewards/rejected": -3.0853769779205322, "step": 3676 }, { "epoch": 0.1948957146264543, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -111775530.66666667, "logits/rejected": -12604387.2, "logps/chosen": -866.7888997395834, "logps/rejected": -221.6337890625, "loss": 0.3074, "rewards/chosen": 0.5410858392715454, "rewards/margins": 1.7467132806777954, "rewards/rejected": -1.20562744140625, "step": 3677 }, { "epoch": 0.19494871862825644, "grad_norm": 56.5, "kl": 0.046179771423339844, "learning_rate": 5e-07, "logits/chosen": -50135872.0, "logits/rejected": -51701896.0, "logps/chosen": -234.57613118489584, "logps/rejected": -454.2889404296875, "loss": 0.4343, "rewards/chosen": -0.17817898591359457, "rewards/margins": 2.2688668171564736, "rewards/rejected": -2.4470458030700684, "step": 3678 }, { "epoch": 0.19500172263005858, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28112448.0, "logits/rejected": -36381456.0, "logps/chosen": -189.2069580078125, "logps/rejected": -415.1627604166667, "loss": 0.3228, "rewards/chosen": 0.3165191411972046, "rewards/margins": 2.9011957248051963, "rewards/rejected": -2.5846765836079917, "step": 3679 }, { "epoch": 0.19505472663186071, "grad_norm": 74.5, "kl": 1.8476505279541016, "learning_rate": 5e-07, "logits/chosen": -39733778.28571428, "logits/rejected": -24506708.0, "logps/chosen": -359.02232142857144, "logps/rejected": -327.36871337890625, "loss": 0.4125, "rewards/chosen": 0.5296057292393276, "rewards/margins": 1.2069433757237027, "rewards/rejected": -0.677337646484375, "step": 3680 }, { "epoch": 0.19510773063366285, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55171381.333333336, "logits/rejected": -91354361.6, "logps/chosen": -220.12027994791666, "logps/rejected": -265.6227783203125, "loss": 0.3448, "rewards/chosen": -0.1704728603363037, "rewards/margins": 1.3579403400421142, "rewards/rejected": -1.5284132003784179, "step": 3681 }, { "epoch": 0.195160734635465, "grad_norm": 60.0, "kl": 0.18756580352783203, "learning_rate": 5e-07, "logits/chosen": -58356755.2, "logits/rejected": -50507893.333333336, "logps/chosen": -153.1065185546875, "logps/rejected": -430.1685384114583, "loss": 0.3019, "rewards/chosen": 0.3448871374130249, "rewards/margins": 2.67326549688975, "rewards/rejected": -2.328378359476725, "step": 3682 }, { "epoch": 0.19521373863726713, "grad_norm": 62.75, "kl": 0.12975311279296875, "learning_rate": 5e-07, "logits/chosen": -59420184.0, "logits/rejected": -40698658.666666664, "logps/chosen": -491.4392395019531, "logps/rejected": -345.1146647135417, "loss": 0.3028, "rewards/chosen": 0.3021438717842102, "rewards/margins": 1.6223907272020976, "rewards/rejected": -1.3202468554178874, "step": 3683 }, { "epoch": 0.19526674263906926, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21376461.333333332, "logits/rejected": -16815414.4, "logps/chosen": -260.47552490234375, "logps/rejected": -250.65517578125, "loss": 0.225, "rewards/chosen": 0.2070818543434143, "rewards/margins": 2.6983152985572816, "rewards/rejected": -2.4912334442138673, "step": 3684 }, { "epoch": 0.19531974664087137, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28967042.0, "logits/rejected": -39321568.0, "logps/chosen": -197.9832763671875, "logps/rejected": -524.97509765625, "loss": 0.2933, "rewards/chosen": 0.0559602789580822, "rewards/margins": 2.9763382486999035, "rewards/rejected": -2.9203779697418213, "step": 3685 }, { "epoch": 0.1953727506426735, "grad_norm": 93.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53460329.6, "logits/rejected": 46463.416666666664, "logps/chosen": -237.681982421875, "logps/rejected": -106.41647338867188, "loss": 0.3626, "rewards/chosen": -0.004442179203033447, "rewards/margins": 2.0258101185162865, "rewards/rejected": -2.03025229771932, "step": 3686 }, { "epoch": 0.19542575464447565, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24955572.0, "logits/rejected": -846220.0, "logps/chosen": -385.28582763671875, "logps/rejected": -321.73345947265625, "loss": 0.215, "rewards/chosen": 0.9958990812301636, "rewards/margins": 3.233847498893738, "rewards/rejected": -2.237948417663574, "step": 3687 }, { "epoch": 0.19547875864627778, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7797296.666666667, "logits/rejected": -10017393.6, "logps/chosen": -379.0560302734375, "logps/rejected": -306.708056640625, "loss": 0.2451, "rewards/chosen": 0.4972831805547078, "rewards/margins": 2.39264984925588, "rewards/rejected": -1.8953666687011719, "step": 3688 }, { "epoch": 0.19553176264807992, "grad_norm": 81.0, "kl": 0.8431739807128906, "learning_rate": 5e-07, "logits/chosen": -77273555.2, "logits/rejected": -34861117.333333336, "logps/chosen": -1401.9927734375, "logps/rejected": -223.5173136393229, "loss": 0.2349, "rewards/chosen": 0.989412498474121, "rewards/margins": 3.145470015207926, "rewards/rejected": -2.156057516733805, "step": 3689 }, { "epoch": 0.19558476664988206, "grad_norm": 50.0, "kl": 0.451873779296875, "learning_rate": 5e-07, "logits/chosen": -54594128.0, "logits/rejected": -36971308.0, "logps/chosen": -433.19000244140625, "logps/rejected": -617.4056396484375, "loss": 0.2413, "rewards/chosen": 0.5161327719688416, "rewards/margins": 3.133380115032196, "rewards/rejected": -2.6172473430633545, "step": 3690 }, { "epoch": 0.1956377706516842, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55473941.333333336, "logits/rejected": -73729.0, "logps/chosen": -302.10325113932294, "logps/rejected": -276.5822448730469, "loss": 0.37, "rewards/chosen": 0.2631215453147888, "rewards/margins": 1.933995544910431, "rewards/rejected": -1.670873999595642, "step": 3691 }, { "epoch": 0.19569077465348633, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26323800.0, "logits/rejected": -33769408.0, "logps/chosen": -222.49176025390625, "logps/rejected": -384.534619140625, "loss": 0.2718, "rewards/chosen": 0.6788607438405355, "rewards/margins": 2.582870944341024, "rewards/rejected": -1.9040102005004882, "step": 3692 }, { "epoch": 0.19574377865528847, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7341888.666666667, "logits/rejected": -16657934.4, "logps/chosen": -260.0113932291667, "logps/rejected": -179.901708984375, "loss": 0.2841, "rewards/chosen": 0.535925030708313, "rewards/margins": 2.044015336036682, "rewards/rejected": -1.5080903053283692, "step": 3693 }, { "epoch": 0.1957967826570906, "grad_norm": 60.0, "kl": 0.6331100463867188, "learning_rate": 5e-07, "logits/chosen": -22465064.0, "logits/rejected": 14259480.0, "logps/chosen": -295.37957763671875, "logps/rejected": -299.906982421875, "loss": 0.3682, "rewards/chosen": 0.054562002420425415, "rewards/margins": 1.3731949627399445, "rewards/rejected": -1.318632960319519, "step": 3694 }, { "epoch": 0.19584978665889274, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28744354.666666668, "logits/rejected": -36987641.6, "logps/chosen": -605.0059000651041, "logps/rejected": -199.6568359375, "loss": 0.2756, "rewards/chosen": 0.644273598988851, "rewards/margins": 1.8926529248555504, "rewards/rejected": -1.2483793258666993, "step": 3695 }, { "epoch": 0.19590279066069488, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28855712.0, "logits/rejected": -13205396.0, "logps/chosen": -276.6980285644531, "logps/rejected": -378.5672912597656, "loss": 0.2681, "rewards/chosen": 0.24830614030361176, "rewards/margins": 2.6561680883169174, "rewards/rejected": -2.4078619480133057, "step": 3696 }, { "epoch": 0.19595579466249702, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51757412.0, "logits/rejected": -35479076.0, "logps/chosen": -375.10174560546875, "logps/rejected": -330.5160827636719, "loss": 0.3819, "rewards/chosen": -0.3338829278945923, "rewards/margins": 1.6537142992019653, "rewards/rejected": -1.9875972270965576, "step": 3697 }, { "epoch": 0.19600879866429916, "grad_norm": 54.0, "kl": 1.0252056121826172, "learning_rate": 5e-07, "logits/chosen": -24230891.2, "logits/rejected": -26922792.0, "logps/chosen": -374.2055419921875, "logps/rejected": -342.3737386067708, "loss": 0.3055, "rewards/chosen": 0.45692954063415525, "rewards/margins": 2.640184418360392, "rewards/rejected": -2.183254877726237, "step": 3698 }, { "epoch": 0.1960618026661013, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38154128.0, "logits/rejected": -18648828.0, "logps/chosen": -335.2357482910156, "logps/rejected": -365.59466552734375, "loss": 0.2883, "rewards/chosen": 0.44283032417297363, "rewards/margins": 2.2726941108703613, "rewards/rejected": -1.8298637866973877, "step": 3699 }, { "epoch": 0.19611480666790343, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56187152.0, "logits/rejected": -24518912.0, "logps/chosen": -290.5420227050781, "logps/rejected": -207.11575317382812, "loss": 0.296, "rewards/chosen": 0.15179443359375, "rewards/margins": 2.2175450325012207, "rewards/rejected": -2.0657505989074707, "step": 3700 }, { "epoch": 0.19616781066970557, "grad_norm": 55.5, "kl": 0.06485748291015625, "learning_rate": 5e-07, "logits/chosen": -25171262.4, "logits/rejected": 12999541.333333334, "logps/chosen": -305.0072998046875, "logps/rejected": -599.4012044270834, "loss": 0.3154, "rewards/chosen": 0.5267296314239502, "rewards/margins": 4.104263989130656, "rewards/rejected": -3.5775343577067056, "step": 3701 }, { "epoch": 0.1962208146715077, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14013584.0, "logits/rejected": -65679402.666666664, "logps/chosen": -178.27658081054688, "logps/rejected": -457.4518636067708, "loss": 0.167, "rewards/chosen": 0.6194944381713867, "rewards/margins": 3.027010917663574, "rewards/rejected": -2.4075164794921875, "step": 3702 }, { "epoch": 0.19627381867330984, "grad_norm": 55.5, "kl": 0.5549516677856445, "learning_rate": 5e-07, "logits/chosen": -34980672.0, "logits/rejected": -41656301.333333336, "logps/chosen": -269.5148193359375, "logps/rejected": -457.7461751302083, "loss": 0.3394, "rewards/chosen": 0.4523135185241699, "rewards/margins": 2.030092652638753, "rewards/rejected": -1.5777791341145833, "step": 3703 }, { "epoch": 0.19632682267511198, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21740617.6, "logits/rejected": -29270488.0, "logps/chosen": -221.246337890625, "logps/rejected": -146.1300048828125, "loss": 0.4154, "rewards/chosen": 0.0012507617473602294, "rewards/margins": 1.0430318693319958, "rewards/rejected": -1.0417811075846355, "step": 3704 }, { "epoch": 0.19637982667691412, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21247738.666666668, "logits/rejected": -22368136.0, "logps/chosen": -430.9562581380208, "logps/rejected": -274.6859375, "loss": 0.215, "rewards/chosen": 0.9280478159586588, "rewards/margins": 2.655000940958659, "rewards/rejected": -1.726953125, "step": 3705 }, { "epoch": 0.19643283067871625, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46981328.0, "logits/rejected": -39289632.0, "logps/chosen": -408.35699462890625, "logps/rejected": -347.7342529296875, "loss": 0.2027, "rewards/chosen": 0.6662338376045227, "rewards/margins": 2.9573766191800437, "rewards/rejected": -2.291142781575521, "step": 3706 }, { "epoch": 0.1964858346805184, "grad_norm": 63.0, "kl": 1.5930862426757812, "learning_rate": 5e-07, "logits/chosen": -33530365.333333332, "logits/rejected": -7184680.0, "logps/chosen": -370.8068033854167, "logps/rejected": -146.91030883789062, "loss": 0.3807, "rewards/chosen": 0.6323515574137369, "rewards/margins": 1.513508121172587, "rewards/rejected": -0.8811565637588501, "step": 3707 }, { "epoch": 0.19653883868232053, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32228022.0, "logits/rejected": 3137263.5, "logps/chosen": -326.2406311035156, "logps/rejected": -197.12083435058594, "loss": 0.4034, "rewards/chosen": -0.034617796540260315, "rewards/margins": 0.9364024251699448, "rewards/rejected": -0.9710202217102051, "step": 3708 }, { "epoch": 0.19659184268412266, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4899441.0, "logits/rejected": -39621824.0, "logps/chosen": -179.01007080078125, "logps/rejected": -460.175537109375, "loss": 0.3349, "rewards/chosen": 0.19775009155273438, "rewards/margins": 1.6422392129898071, "rewards/rejected": -1.4444891214370728, "step": 3709 }, { "epoch": 0.19664484668592477, "grad_norm": 71.0, "kl": 0.26024627685546875, "learning_rate": 5e-07, "logits/chosen": -42795885.71428572, "logits/rejected": 7096344.0, "logps/chosen": -386.51217215401783, "logps/rejected": -88.1975326538086, "loss": 0.4137, "rewards/chosen": 0.467358010155814, "rewards/margins": 0.5225734657474926, "rewards/rejected": -0.05521545559167862, "step": 3710 }, { "epoch": 0.1966978506877269, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43507318.4, "logits/rejected": -40908069.333333336, "logps/chosen": -63.56636962890625, "logps/rejected": -326.4920654296875, "loss": 0.3964, "rewards/chosen": -0.164483642578125, "rewards/margins": 1.5778101921081542, "rewards/rejected": -1.7422938346862793, "step": 3711 }, { "epoch": 0.19675085468952905, "grad_norm": 59.75, "kl": 0.058063507080078125, "learning_rate": 5e-07, "logits/chosen": -2288882.6666666665, "logits/rejected": -7805989.0, "logps/chosen": -337.7766520182292, "logps/rejected": -228.7296142578125, "loss": 0.3714, "rewards/chosen": 0.23093966643015543, "rewards/margins": 2.014825622240702, "rewards/rejected": -1.7838859558105469, "step": 3712 }, { "epoch": 0.19680385869133118, "grad_norm": 54.5, "kl": 1.4584159851074219, "learning_rate": 5e-07, "logits/chosen": -16449902.4, "logits/rejected": 1096350.3333333333, "logps/chosen": -308.980859375, "logps/rejected": -181.82476806640625, "loss": 0.3845, "rewards/chosen": 0.26006031036376953, "rewards/margins": 1.7279079755147297, "rewards/rejected": -1.4678476651509602, "step": 3713 }, { "epoch": 0.19685686269313332, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9418174.0, "logits/rejected": -4219564.0, "logps/chosen": -276.9936218261719, "logps/rejected": -88.92139434814453, "loss": 0.2763, "rewards/chosen": 0.542898416519165, "rewards/margins": 2.3059911727905273, "rewards/rejected": -1.7630927562713623, "step": 3714 }, { "epoch": 0.19690986669493546, "grad_norm": 60.25, "kl": 1.5216751098632812, "learning_rate": 5e-07, "logits/chosen": 19921898.666666668, "logits/rejected": -57946099.2, "logps/chosen": -547.20654296875, "logps/rejected": -365.58515625, "loss": 0.2852, "rewards/chosen": 0.6376688877741495, "rewards/margins": 2.21753663221995, "rewards/rejected": -1.5798677444458007, "step": 3715 }, { "epoch": 0.1969628706967376, "grad_norm": 40.25, "kl": 1.7292327880859375, "learning_rate": 5e-07, "logits/chosen": -15734929.333333334, "logits/rejected": -30114688.0, "logps/chosen": -390.6979573567708, "logps/rejected": -384.39794921875, "loss": 0.1523, "rewards/chosen": 1.2824559211730957, "rewards/margins": 4.026297283172608, "rewards/rejected": -2.743841361999512, "step": 3716 }, { "epoch": 0.19701587469853973, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48913656.0, "logits/rejected": -33365922.666666668, "logps/chosen": -363.38409423828125, "logps/rejected": -291.98927815755206, "loss": 0.2299, "rewards/chosen": 0.2302345335483551, "rewards/margins": 2.2397919396559396, "rewards/rejected": -2.0095574061075845, "step": 3717 }, { "epoch": 0.19706887870034187, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11478894.4, "logits/rejected": -17428390.666666668, "logps/chosen": -211.5072265625, "logps/rejected": -344.330810546875, "loss": 0.4422, "rewards/chosen": -0.21641833782196046, "rewards/margins": 1.2433483680089314, "rewards/rejected": -1.4597667058308919, "step": 3718 }, { "epoch": 0.197121882702144, "grad_norm": 76.0, "kl": 1.8476181030273438, "learning_rate": 5e-07, "logits/chosen": -27571630.0, "logps/chosen": -253.7353057861328, "loss": 0.5568, "rewards/chosen": -0.05876933038234711, "step": 3719 }, { "epoch": 0.19717488670394615, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62914576.0, "logits/rejected": -27596181.333333332, "logps/chosen": -396.16278076171875, "logps/rejected": -295.24648030598956, "loss": 0.2822, "rewards/chosen": 0.07998733222484589, "rewards/margins": 1.5681875199079514, "rewards/rejected": -1.4882001876831055, "step": 3720 }, { "epoch": 0.19722789070574828, "grad_norm": 56.25, "kl": 0.3135719299316406, "learning_rate": 5e-07, "logits/chosen": -43781237.333333336, "logits/rejected": 2300841.0, "logps/chosen": -566.0520833333334, "logps/rejected": -122.98275756835938, "loss": 0.3874, "rewards/chosen": 0.5672899881998698, "rewards/margins": 1.2367487947146096, "rewards/rejected": -0.66945880651474, "step": 3721 }, { "epoch": 0.19728089470755042, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26403452.0, "logits/rejected": -7499613.0, "logps/chosen": -342.66845703125, "logps/rejected": -321.15911865234375, "loss": 0.35, "rewards/chosen": 0.16711503267288208, "rewards/margins": 1.42775958776474, "rewards/rejected": -1.260644555091858, "step": 3722 }, { "epoch": 0.19733389870935256, "grad_norm": 51.75, "kl": 0.3488636016845703, "learning_rate": 5e-07, "logits/chosen": -30606518.4, "logits/rejected": -34687413.333333336, "logps/chosen": -305.659521484375, "logps/rejected": -135.23528035481772, "loss": 0.3269, "rewards/chosen": 0.28314130306243895, "rewards/margins": 2.7142485062281287, "rewards/rejected": -2.43110720316569, "step": 3723 }, { "epoch": 0.1973869027111547, "grad_norm": 48.0, "kl": 0.09492111206054688, "learning_rate": 5e-07, "logits/chosen": -4088984.0, "logits/rejected": -4533028.0, "logps/chosen": -230.2503865559896, "logps/rejected": -248.9842041015625, "loss": 0.2908, "rewards/chosen": 0.16949077447255453, "rewards/margins": 1.8329951842625936, "rewards/rejected": -1.663504409790039, "step": 3724 }, { "epoch": 0.19743990671295683, "grad_norm": 50.75, "kl": 0.8418221473693848, "learning_rate": 5e-07, "logits/chosen": -15859705.0, "logits/rejected": -9894287.333333334, "logps/chosen": -1049.892578125, "logps/rejected": -139.3128662109375, "loss": 0.2733, "rewards/chosen": 0.16418683528900146, "rewards/margins": 1.7761377096176147, "rewards/rejected": -1.6119508743286133, "step": 3725 }, { "epoch": 0.19749291071475897, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5441044.8, "logits/rejected": -30509408.0, "logps/chosen": -95.33194580078126, "logps/rejected": -582.0453287760416, "loss": 0.3255, "rewards/chosen": 0.04588108062744141, "rewards/margins": 3.430901972452799, "rewards/rejected": -3.385020891825358, "step": 3726 }, { "epoch": 0.1975459147165611, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37661811.2, "logits/rejected": -7738510.0, "logps/chosen": -242.7996826171875, "logps/rejected": -209.02445475260416, "loss": 0.379, "rewards/chosen": 0.04350614547729492, "rewards/margins": 1.563175360361735, "rewards/rejected": -1.5196692148844402, "step": 3727 }, { "epoch": 0.19759891871836324, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28312006.4, "logits/rejected": -61815701.333333336, "logps/chosen": -280.5203857421875, "logps/rejected": -219.97591145833334, "loss": 0.3665, "rewards/chosen": 0.03964822292327881, "rewards/margins": 1.8508311986923218, "rewards/rejected": -1.811182975769043, "step": 3728 }, { "epoch": 0.19765192272016538, "grad_norm": 48.0, "kl": 0.3707160949707031, "learning_rate": 5e-07, "logits/chosen": 12975321.6, "logits/rejected": -54767589.333333336, "logps/chosen": -380.26845703125, "logps/rejected": -405.3968505859375, "loss": 0.3163, "rewards/chosen": 0.34966654777526857, "rewards/margins": 2.4348671754201257, "rewards/rejected": -2.085200627644857, "step": 3729 }, { "epoch": 0.19770492672196752, "grad_norm": 54.0, "kl": 0.7867012023925781, "learning_rate": 5e-07, "logits/chosen": -16784461.333333332, "logits/rejected": -20998960.0, "logps/chosen": -295.1703287760417, "logps/rejected": -209.72747802734375, "loss": 0.373, "rewards/chosen": 0.41206196943918866, "rewards/margins": 1.4375831286112468, "rewards/rejected": -1.025521159172058, "step": 3730 }, { "epoch": 0.19775793072376965, "grad_norm": 58.0, "kl": 1.9796409606933594, "learning_rate": 5e-07, "logits/chosen": 556115.5, "logits/rejected": -9794765.714285715, "logps/chosen": -2927.265869140625, "logps/rejected": -114.93160574776786, "loss": 0.2912, "rewards/chosen": 2.0860352516174316, "rewards/margins": 3.086451939174107, "rewards/rejected": -1.0004166875566756, "step": 3731 }, { "epoch": 0.1978109347255718, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44165148.0, "logits/rejected": -29386284.0, "logps/chosen": -324.740478515625, "logps/rejected": -327.968994140625, "loss": 0.3531, "rewards/chosen": 0.025212865322828293, "rewards/margins": 1.625784020870924, "rewards/rejected": -1.6005711555480957, "step": 3732 }, { "epoch": 0.19786393872737393, "grad_norm": 166.0, "kl": 0.17681884765625, "learning_rate": 5e-07, "logits/chosen": 18373588.0, "logits/rejected": -28103696.0, "logps/chosen": -337.1887512207031, "logps/rejected": -210.83314514160156, "loss": 0.2923, "rewards/chosen": 0.8393825888633728, "rewards/margins": 1.9434691071510315, "rewards/rejected": -1.1040865182876587, "step": 3733 }, { "epoch": 0.19791694272917607, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29332014.0, "logits/rejected": -13563750.857142856, "logps/chosen": -201.83901977539062, "logps/rejected": -180.13375418526786, "loss": 0.2596, "rewards/chosen": -0.4071106016635895, "rewards/margins": 1.0515782705375127, "rewards/rejected": -1.4586888722011022, "step": 3734 }, { "epoch": 0.1979699467309782, "grad_norm": 85.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86003882.66666667, "logits/rejected": -38302128.0, "logps/chosen": -299.13706461588544, "logps/rejected": -481.9412109375, "loss": 0.1947, "rewards/chosen": 0.605703870455424, "rewards/margins": 3.2235442558924357, "rewards/rejected": -2.617840385437012, "step": 3735 }, { "epoch": 0.1980229507327803, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12911387.0, "logits/rejected": 6816013.333333333, "logps/chosen": -281.6057434082031, "logps/rejected": -298.609130859375, "loss": 0.213, "rewards/chosen": 0.21912497282028198, "rewards/margins": 2.15316520134608, "rewards/rejected": -1.9340402285257976, "step": 3736 }, { "epoch": 0.19807595473458245, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29653254.4, "logits/rejected": -30400890.666666668, "logps/chosen": -278.1898681640625, "logps/rejected": -222.71183268229166, "loss": 0.3585, "rewards/chosen": 0.19599449634552002, "rewards/margins": 1.664232850074768, "rewards/rejected": -1.468238353729248, "step": 3737 }, { "epoch": 0.19812895873638459, "grad_norm": 68.5, "kl": 0.1024322509765625, "learning_rate": 5e-07, "logits/chosen": -18558502.666666668, "logits/rejected": -10949248.0, "logps/chosen": -318.10137939453125, "logps/rejected": -142.6088409423828, "loss": 0.3628, "rewards/chosen": 0.23534661531448364, "rewards/margins": 2.3257935643196106, "rewards/rejected": -2.090446949005127, "step": 3738 }, { "epoch": 0.19818196273818672, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47539552.0, "logits/rejected": -28812616.0, "logps/chosen": -488.0924072265625, "logps/rejected": -375.266845703125, "loss": 0.2468, "rewards/chosen": 0.4950847625732422, "rewards/margins": 2.7619125843048096, "rewards/rejected": -2.2668278217315674, "step": 3739 }, { "epoch": 0.19823496673998886, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12311588.0, "logits/rejected": 16341362.0, "logps/chosen": -115.8975830078125, "logps/rejected": -172.32794189453125, "loss": 0.3426, "rewards/chosen": 0.48024141788482666, "rewards/margins": 2.1155136823654175, "rewards/rejected": -1.6352722644805908, "step": 3740 }, { "epoch": 0.198287970741791, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32040793.6, "logits/rejected": -14800050.666666666, "logps/chosen": -259.8705078125, "logps/rejected": -456.4754638671875, "loss": 0.3429, "rewards/chosen": 0.08197997808456421, "rewards/margins": 2.1505520224571226, "rewards/rejected": -2.0685720443725586, "step": 3741 }, { "epoch": 0.19834097474359313, "grad_norm": 55.75, "kl": 1.4824600219726562, "learning_rate": 5e-07, "logits/chosen": -38985536.0, "logits/rejected": -27124852.0, "logps/chosen": -304.50429280598956, "logps/rejected": -312.74285888671875, "loss": 0.3576, "rewards/chosen": 0.3979169925053914, "rewards/margins": 3.182501037915548, "rewards/rejected": -2.7845840454101562, "step": 3742 }, { "epoch": 0.19839397874539527, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48476469.333333336, "logits/rejected": -37150985.6, "logps/chosen": -285.5688069661458, "logps/rejected": -505.429931640625, "loss": 0.2428, "rewards/chosen": 0.4794169267018636, "rewards/margins": 2.3664244492848714, "rewards/rejected": -1.8870075225830079, "step": 3743 }, { "epoch": 0.1984469827471974, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109288048.0, "logits/rejected": -15806889.333333334, "logps/chosen": -598.2548828125, "logps/rejected": -190.5454305013021, "loss": 0.2196, "rewards/chosen": 0.21338807046413422, "rewards/margins": 2.3692165464162827, "rewards/rejected": -2.1558284759521484, "step": 3744 }, { "epoch": 0.19849998674899955, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23546330.666666668, "logits/rejected": -53207980.8, "logps/chosen": -206.31465657552084, "logps/rejected": -413.993310546875, "loss": 0.2679, "rewards/chosen": 0.2028841773668925, "rewards/margins": 2.5572756568590798, "rewards/rejected": -2.3543914794921874, "step": 3745 }, { "epoch": 0.19855299075080168, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -338902.0, "logits/rejected": -18818294.666666668, "logps/chosen": -24.070266723632812, "logps/rejected": -229.41097005208334, "loss": 0.3582, "rewards/chosen": -0.6589719653129578, "rewards/margins": 0.4769247968991597, "rewards/rejected": -1.1358967622121174, "step": 3746 }, { "epoch": 0.19860599475260382, "grad_norm": 82.5, "kl": 1.09197998046875, "learning_rate": 5e-07, "logits/chosen": -42141296.0, "logits/rejected": -22244398.0, "logps/chosen": -547.1124267578125, "logps/rejected": -270.5320739746094, "loss": 0.34, "rewards/chosen": 0.3648742735385895, "rewards/margins": 1.899096518754959, "rewards/rejected": -1.5342222452163696, "step": 3747 }, { "epoch": 0.19865899875440596, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3449519.0, "logits/rejected": -5907926.0, "logps/chosen": -108.2130355834961, "logps/rejected": -259.8261413574219, "loss": 0.3002, "rewards/chosen": 0.3539440631866455, "rewards/margins": 2.100642681121826, "rewards/rejected": -1.7466986179351807, "step": 3748 }, { "epoch": 0.1987120027562081, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40709804.0, "logits/rejected": -31738288.0, "logps/chosen": -238.2452392578125, "logps/rejected": -357.8810729980469, "loss": 0.3544, "rewards/chosen": -0.01629525236785412, "rewards/margins": 1.7673991341143847, "rewards/rejected": -1.7836943864822388, "step": 3749 }, { "epoch": 0.19876500675801023, "grad_norm": 54.75, "kl": 0.6574478149414062, "learning_rate": 5e-07, "logits/chosen": -25259930.666666668, "logits/rejected": -14169497.6, "logps/chosen": -305.3121744791667, "logps/rejected": -300.861572265625, "loss": 0.3184, "rewards/chosen": 0.00608132282892863, "rewards/margins": 1.5049938480059306, "rewards/rejected": -1.498912525177002, "step": 3750 }, { "epoch": 0.19881801075981237, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10376117.0, "logits/rejected": -21666422.0, "logps/chosen": -256.34173583984375, "logps/rejected": -332.4402770996094, "loss": 0.3277, "rewards/chosen": 0.2857162058353424, "rewards/margins": 1.7478841841220856, "rewards/rejected": -1.4621679782867432, "step": 3751 }, { "epoch": 0.1988710147616145, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21519963.2, "logits/rejected": -9661206.666666666, "logps/chosen": -279.8944580078125, "logps/rejected": -442.6515299479167, "loss": 0.3724, "rewards/chosen": -0.003536558151245117, "rewards/margins": 2.034702761967977, "rewards/rejected": -2.038239320119222, "step": 3752 }, { "epoch": 0.19892401876341664, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15053660.8, "logits/rejected": -63395936.0, "logps/chosen": -215.996435546875, "logps/rejected": -265.0038248697917, "loss": 0.4295, "rewards/chosen": 0.000367128849029541, "rewards/margins": 0.8925060073534647, "rewards/rejected": -0.8921388785044352, "step": 3753 }, { "epoch": 0.19897702276521878, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16746433.0, "logits/rejected": -5469102.0, "logps/chosen": -257.9028015136719, "logps/rejected": -254.71563720703125, "loss": 0.3374, "rewards/chosen": 0.011759579181671143, "rewards/margins": 1.8447534441947937, "rewards/rejected": -1.8329938650131226, "step": 3754 }, { "epoch": 0.19903002676702092, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11135587.333333334, "logits/rejected": -25466857.6, "logps/chosen": -345.0811767578125, "logps/rejected": -342.9779296875, "loss": 0.2831, "rewards/chosen": -0.046482344468434654, "rewards/margins": 2.1559096614519753, "rewards/rejected": -2.20239200592041, "step": 3755 }, { "epoch": 0.19908303076882305, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25034116.8, "logits/rejected": -5071581.333333333, "logps/chosen": -318.9879150390625, "logps/rejected": -390.478271484375, "loss": 0.3055, "rewards/chosen": 0.5727431297302246, "rewards/margins": 2.1760583241780598, "rewards/rejected": -1.6033151944478352, "step": 3756 }, { "epoch": 0.1991360347706252, "grad_norm": 79.5, "kl": 0.27567481994628906, "learning_rate": 5e-07, "logits/chosen": -33867589.333333336, "logits/rejected": -73379240.0, "logps/chosen": -474.8729654947917, "logps/rejected": -207.81304931640625, "loss": 0.3876, "rewards/chosen": 0.3168523112932841, "rewards/margins": 1.4681684772173564, "rewards/rejected": -1.1513161659240723, "step": 3757 }, { "epoch": 0.19918903877242733, "grad_norm": 53.0, "kl": 0.19709014892578125, "learning_rate": 5e-07, "logits/chosen": -2646621.5, "logits/rejected": -19937217.333333332, "logps/chosen": -469.9014892578125, "logps/rejected": -261.3951416015625, "loss": 0.2794, "rewards/chosen": 0.09658204019069672, "rewards/margins": 1.5675569623708725, "rewards/rejected": -1.4709749221801758, "step": 3758 }, { "epoch": 0.19924204277422947, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4013352.6666666665, "logits/rejected": -36037686.4, "logps/chosen": -30.186871846516926, "logps/rejected": -312.905517578125, "loss": 0.2791, "rewards/chosen": 0.34965940316518146, "rewards/margins": 2.0212879737218223, "rewards/rejected": -1.6716285705566407, "step": 3759 }, { "epoch": 0.1992950467760316, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36595368.0, "logits/rejected": -17949270.666666668, "logps/chosen": -248.64883422851562, "logps/rejected": -256.3834228515625, "loss": 0.2559, "rewards/chosen": 0.33862781524658203, "rewards/margins": 1.8599748611450195, "rewards/rejected": -1.5213470458984375, "step": 3760 }, { "epoch": 0.1993480507778337, "grad_norm": 63.25, "kl": 1.8501434326171875, "learning_rate": 5e-07, "logits/chosen": -46082304.0, "logits/rejected": 77592944.0, "logps/chosen": -372.76708984375, "logps/rejected": -131.97674560546875, "loss": 0.4985, "rewards/chosen": 0.008488510336194719, "rewards/margins": 1.9692139370100838, "rewards/rejected": -1.9607254266738892, "step": 3761 }, { "epoch": 0.19940105477963585, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21980256.0, "logits/rejected": -9603522.0, "logps/chosen": -435.7652587890625, "logps/rejected": -189.85415649414062, "loss": 0.3652, "rewards/chosen": 0.3410896062850952, "rewards/margins": 1.1651642322540283, "rewards/rejected": -0.8240746259689331, "step": 3762 }, { "epoch": 0.199454058781438, "grad_norm": 55.25, "kl": 1.6021499633789062, "learning_rate": 5e-07, "logits/chosen": -42993141.333333336, "logits/rejected": -26267457.6, "logps/chosen": -518.1865234375, "logps/rejected": -326.8965087890625, "loss": 0.2271, "rewards/chosen": 0.8802591959635416, "rewards/margins": 3.0913166681925452, "rewards/rejected": -2.2110574722290037, "step": 3763 }, { "epoch": 0.19950706278324012, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41018661.333333336, "logits/rejected": 7505022.0, "logps/chosen": -367.1161295572917, "logps/rejected": -239.00485229492188, "loss": 0.4529, "rewards/chosen": -0.03906091054280599, "rewards/margins": 1.2256436745325725, "rewards/rejected": -1.2647045850753784, "step": 3764 }, { "epoch": 0.19956006678504226, "grad_norm": 36.5, "kl": 0.12330436706542969, "learning_rate": 5e-07, "logits/chosen": -29824876.0, "logits/rejected": -13681712.0, "logps/chosen": -128.68215942382812, "logps/rejected": -323.0670166015625, "loss": 0.3456, "rewards/chosen": -0.14428719878196716, "rewards/margins": 1.9687873423099518, "rewards/rejected": -2.113074541091919, "step": 3765 }, { "epoch": 0.1996130707868444, "grad_norm": 106.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8230489.5, "logits/rejected": 36591364.0, "logps/chosen": -266.76397705078125, "logps/rejected": -556.8761596679688, "loss": 0.3479, "rewards/chosen": 0.39518776535987854, "rewards/margins": 1.3706813752651215, "rewards/rejected": -0.9754936099052429, "step": 3766 }, { "epoch": 0.19966607478864654, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52735925.333333336, "logits/rejected": -1269315.3, "logps/chosen": -791.7925618489584, "logps/rejected": -329.7418212890625, "loss": 0.2662, "rewards/chosen": 0.10414632161458333, "rewards/margins": 2.1382907231648765, "rewards/rejected": -2.034144401550293, "step": 3767 }, { "epoch": 0.19971907879044867, "grad_norm": 62.0, "kl": 0.8017158508300781, "learning_rate": 5e-07, "logits/chosen": -34234992.0, "logits/rejected": -5910104.5, "logps/chosen": -374.1039225260417, "logps/rejected": -317.47711181640625, "loss": 0.3039, "rewards/chosen": 0.7653289635976156, "rewards/margins": 2.304817279179891, "rewards/rejected": -1.5394883155822754, "step": 3768 }, { "epoch": 0.1997720827922508, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15671825.6, "logits/rejected": -35663221.333333336, "logps/chosen": -431.38779296875, "logps/rejected": -228.47672526041666, "loss": 0.3109, "rewards/chosen": 0.47502622604370115, "rewards/margins": 2.5512976010640465, "rewards/rejected": -2.076271375020345, "step": 3769 }, { "epoch": 0.19982508679405295, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69643546.66666667, "logits/rejected": -44425644.8, "logps/chosen": -441.8435872395833, "logps/rejected": -618.3833984375, "loss": 0.3207, "rewards/chosen": -0.038029988606770836, "rewards/margins": 2.3310368220011393, "rewards/rejected": -2.3690668106079102, "step": 3770 }, { "epoch": 0.19987809079585508, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63744120.0, "logits/rejected": -18716608.0, "logps/chosen": -351.32342529296875, "logps/rejected": -316.92718505859375, "loss": 0.422, "rewards/chosen": -0.5830375552177429, "rewards/margins": 1.0779042840003967, "rewards/rejected": -1.6609418392181396, "step": 3771 }, { "epoch": 0.19993109479765722, "grad_norm": 54.0, "kl": 0.34213733673095703, "learning_rate": 5e-07, "logits/chosen": -51446585.6, "logits/rejected": -11544430.666666666, "logps/chosen": -470.721484375, "logps/rejected": -180.41780598958334, "loss": 0.3336, "rewards/chosen": 0.38619611263275144, "rewards/margins": 1.9624358574549357, "rewards/rejected": -1.5762397448221843, "step": 3772 }, { "epoch": 0.19998409879945936, "grad_norm": 52.5, "kl": 0.12405776977539062, "learning_rate": 5e-07, "logits/chosen": -23347460.8, "logits/rejected": -86077797.33333333, "logps/chosen": -315.35712890625, "logps/rejected": -370.7156982421875, "loss": 0.3662, "rewards/chosen": 0.15491745471954346, "rewards/margins": 1.7029041210810345, "rewards/rejected": -1.547986666361491, "step": 3773 }, { "epoch": 0.2000371028012615, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30060048.0, "logits/rejected": -21864544.0, "logps/chosen": -227.31802368164062, "logps/rejected": -376.2958068847656, "loss": 0.3204, "rewards/chosen": 0.2761169672012329, "rewards/margins": 1.7705010175704956, "rewards/rejected": -1.4943840503692627, "step": 3774 }, { "epoch": 0.20009010680306363, "grad_norm": 50.25, "kl": 0.6697654724121094, "learning_rate": 5e-07, "logits/chosen": -11158841.0, "logits/rejected": -1332990.75, "logps/chosen": -707.5193481445312, "logps/rejected": -205.36825561523438, "loss": 0.336, "rewards/chosen": 0.4763171672821045, "rewards/margins": 1.6268813610076904, "rewards/rejected": -1.150564193725586, "step": 3775 }, { "epoch": 0.20014311080486577, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25232300.0, "logits/rejected": -1598110.0, "logps/chosen": -353.1188659667969, "logps/rejected": -679.153076171875, "loss": 0.2665, "rewards/chosen": 0.347494900226593, "rewards/margins": 3.1131675839424133, "rewards/rejected": -2.7656726837158203, "step": 3776 }, { "epoch": 0.2001961148066679, "grad_norm": 50.25, "kl": 1.5638542175292969, "learning_rate": 5e-07, "logits/chosen": -24681598.0, "logits/rejected": -18439844.0, "logps/chosen": -401.1655578613281, "logps/rejected": -275.7989196777344, "loss": 0.2926, "rewards/chosen": 0.39272481203079224, "rewards/margins": 2.081834614276886, "rewards/rejected": -1.6891098022460938, "step": 3777 }, { "epoch": 0.20024911880847004, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36927706.666666664, "logits/rejected": -10934658.0, "logps/chosen": -241.0606486002604, "logps/rejected": -160.95997619628906, "loss": 0.3895, "rewards/chosen": 0.12057749430338542, "rewards/margins": 2.137607971827189, "rewards/rejected": -2.0170304775238037, "step": 3778 }, { "epoch": 0.20030212281027218, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49866496.0, "logits/rejected": -37643272.0, "logps/chosen": -196.5503946940104, "logps/rejected": -421.319091796875, "loss": 0.4056, "rewards/chosen": 0.02679567039012909, "rewards/margins": 3.3499491661787033, "rewards/rejected": -3.323153495788574, "step": 3779 }, { "epoch": 0.20035512681207432, "grad_norm": 57.5, "kl": 1.0336074829101562, "learning_rate": 5e-07, "logits/chosen": -32827728.0, "logits/rejected": -2969874.0, "logps/chosen": -376.59775390625, "logps/rejected": -154.82618204752603, "loss": 0.3093, "rewards/chosen": 0.6985981464385986, "rewards/margins": 2.3146965503692627, "rewards/rejected": -1.616098403930664, "step": 3780 }, { "epoch": 0.20040813081387646, "grad_norm": 68.5, "kl": 3.370086669921875, "learning_rate": 5e-07, "logits/chosen": -29459702.0, "logits/rejected": -29808830.0, "logps/chosen": -660.1849365234375, "logps/rejected": -295.40679931640625, "loss": 0.2495, "rewards/chosen": 1.3898506164550781, "rewards/margins": 3.196060061454773, "rewards/rejected": -1.8062094449996948, "step": 3781 }, { "epoch": 0.2004611348156786, "grad_norm": 64.5, "kl": 0.381439208984375, "learning_rate": 5e-07, "logits/chosen": -73701734.4, "logits/rejected": -9819250.666666666, "logps/chosen": -253.5567138671875, "logps/rejected": -547.2376302083334, "loss": 0.404, "rewards/chosen": 0.00036163330078125, "rewards/margins": 2.1408515294392907, "rewards/rejected": -2.1404898961385093, "step": 3782 }, { "epoch": 0.20051413881748073, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 75789704.0, "logits/rejected": -35173080.0, "logps/chosen": -414.11956787109375, "logps/rejected": -228.4686737060547, "loss": 0.4072, "rewards/chosen": -0.14159919321537018, "rewards/margins": 1.1003311723470688, "rewards/rejected": -1.241930365562439, "step": 3783 }, { "epoch": 0.20056714281928287, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20335586.666666668, "logits/rejected": -69453696.0, "logps/chosen": -211.30025227864584, "logps/rejected": -371.11533203125, "loss": 0.3718, "rewards/chosen": -0.9233012199401855, "rewards/margins": 0.7077589988708497, "rewards/rejected": -1.6310602188110352, "step": 3784 }, { "epoch": 0.200620146821085, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19673422.666666668, "logits/rejected": -43501240.0, "logps/chosen": -175.2252197265625, "logps/rejected": -481.49853515625, "loss": 0.4038, "rewards/chosen": 0.03318742414315542, "rewards/margins": 2.012150083978971, "rewards/rejected": -1.9789626598358154, "step": 3785 }, { "epoch": 0.20067315082288711, "grad_norm": 62.0, "kl": 0.32221031188964844, "learning_rate": 5e-07, "logits/chosen": -23968716.0, "logits/rejected": -31074952.0, "logps/chosen": -302.0971984863281, "logps/rejected": -314.022216796875, "loss": 0.3385, "rewards/chosen": 0.4048309624195099, "rewards/margins": 1.5529620945453644, "rewards/rejected": -1.1481311321258545, "step": 3786 }, { "epoch": 0.20072615482468925, "grad_norm": 57.0, "kl": 1.5179519653320312, "learning_rate": 5e-07, "logits/chosen": -38943290.666666664, "logits/rejected": -46061481.6, "logps/chosen": -670.37939453125, "logps/rejected": -372.72021484375, "loss": 0.1989, "rewards/chosen": 0.7727310657501221, "rewards/margins": 3.146440935134888, "rewards/rejected": -2.373709869384766, "step": 3787 }, { "epoch": 0.2007791588264914, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30421785.6, "logits/rejected": -25621426.666666668, "logps/chosen": -181.3660400390625, "logps/rejected": -272.19472249348956, "loss": 0.3851, "rewards/chosen": -0.09510639905929566, "rewards/margins": 1.6665711839993793, "rewards/rejected": -1.761677583058675, "step": 3788 }, { "epoch": 0.20083216282829353, "grad_norm": 61.75, "kl": 2.1162147521972656, "learning_rate": 5e-07, "logits/chosen": -35493270.4, "logits/rejected": -9611678.666666666, "logps/chosen": -363.290234375, "logps/rejected": -282.5965983072917, "loss": 0.3677, "rewards/chosen": 0.4877138137817383, "rewards/margins": 1.7696026961008708, "rewards/rejected": -1.2818888823191326, "step": 3789 }, { "epoch": 0.20088516683009566, "grad_norm": 70.5, "kl": 1.3907546997070312, "learning_rate": 5e-07, "logits/chosen": -45241193.14285714, "logits/rejected": -503354.65625, "logps/chosen": -636.5545479910714, "logps/rejected": -277.8701477050781, "loss": 0.3323, "rewards/chosen": 0.7244648933410645, "rewards/margins": 3.9223287105560303, "rewards/rejected": -3.197863817214966, "step": 3790 }, { "epoch": 0.2009381708318978, "grad_norm": 58.5, "kl": 0.20037841796875, "learning_rate": 5e-07, "logits/chosen": -61718976.0, "logits/rejected": -24434928.0, "logps/chosen": -349.633203125, "logps/rejected": -269.4088541666667, "loss": 0.3744, "rewards/chosen": -0.05364029407501221, "rewards/margins": 1.7632208585739135, "rewards/rejected": -1.8168611526489258, "step": 3791 }, { "epoch": 0.20099117483369994, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27989012.0, "logits/rejected": 54203052.0, "logps/chosen": -331.13623046875, "logps/rejected": -653.8538818359375, "loss": 0.3233, "rewards/chosen": 0.3208163380622864, "rewards/margins": 1.9823091626167297, "rewards/rejected": -1.6614928245544434, "step": 3792 }, { "epoch": 0.20104417883550207, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24685920.0, "logits/rejected": -96558624.0, "logps/chosen": -501.657861328125, "logps/rejected": -411.9243977864583, "loss": 0.3403, "rewards/chosen": 0.2607052087783813, "rewards/margins": 2.1201574563980103, "rewards/rejected": -1.859452247619629, "step": 3793 }, { "epoch": 0.2010971828373042, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50568582.4, "logits/rejected": -122294.33333333333, "logps/chosen": -266.433349609375, "logps/rejected": -265.49713134765625, "loss": 0.3547, "rewards/chosen": 0.3164837837219238, "rewards/margins": 1.681724198659261, "rewards/rejected": -1.3652404149373372, "step": 3794 }, { "epoch": 0.20115018683910635, "grad_norm": 62.0, "kl": 2.142576217651367, "learning_rate": 5e-07, "logits/chosen": -20690745.6, "logits/rejected": -26519616.0, "logps/chosen": -385.7051513671875, "logps/rejected": -292.47601318359375, "loss": 0.3649, "rewards/chosen": 0.563758373260498, "rewards/margins": 1.7914896806081135, "rewards/rejected": -1.2277313073476155, "step": 3795 }, { "epoch": 0.20120319084090849, "grad_norm": 58.75, "kl": 0.5441169738769531, "learning_rate": 5e-07, "logits/chosen": -24816246.0, "logits/rejected": -21674894.0, "logps/chosen": -412.3586120605469, "logps/rejected": -115.8794174194336, "loss": 0.3355, "rewards/chosen": 0.32599106431007385, "rewards/margins": 1.5427482426166534, "rewards/rejected": -1.2167571783065796, "step": 3796 }, { "epoch": 0.20125619484271062, "grad_norm": 52.25, "kl": 0.15816402435302734, "learning_rate": 5e-07, "logits/chosen": -10239254.0, "logits/rejected": -8775106.0, "logps/chosen": -240.4326171875, "logps/rejected": -227.75430297851562, "loss": 0.4051, "rewards/chosen": -0.1709451675415039, "rewards/margins": 1.049633502960205, "rewards/rejected": -1.220578670501709, "step": 3797 }, { "epoch": 0.20130919884451276, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14592872.0, "logits/rejected": -33246870.4, "logps/chosen": -205.78645833333334, "logps/rejected": -314.8986083984375, "loss": 0.2912, "rewards/chosen": 0.1293874184290568, "rewards/margins": 1.7744571129481, "rewards/rejected": -1.645069694519043, "step": 3798 }, { "epoch": 0.2013622028463149, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22481288.0, "logits/rejected": -41940133.333333336, "logps/chosen": -135.25836181640625, "logps/rejected": -380.1732584635417, "loss": 0.2309, "rewards/chosen": -0.2505660951137543, "rewards/margins": 1.96018718679746, "rewards/rejected": -2.2107532819112143, "step": 3799 }, { "epoch": 0.20141520684811703, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32511196.0, "logits/rejected": -18514604.0, "logps/chosen": -768.0235595703125, "logps/rejected": -479.8033447265625, "loss": 0.2795, "rewards/chosen": 0.24439647793769836, "rewards/margins": 2.512126237154007, "rewards/rejected": -2.2677297592163086, "step": 3800 }, { "epoch": 0.20146821084991917, "grad_norm": 43.0, "kl": 1.2640514373779297, "learning_rate": 5e-07, "logits/chosen": -8053008.5, "logits/rejected": -4789023.0, "logps/chosen": -1007.1377563476562, "logps/rejected": -184.0543416341146, "loss": 0.246, "rewards/chosen": 1.2938939332962036, "rewards/margins": 2.6434158881505327, "rewards/rejected": -1.3495219548543294, "step": 3801 }, { "epoch": 0.2015212148517213, "grad_norm": 36.0, "kl": 0.9854583740234375, "learning_rate": 5e-07, "logits/chosen": -4971772.8, "logits/rejected": -24776954.666666668, "logps/chosen": -406.569921875, "logps/rejected": -207.6929931640625, "loss": 0.3335, "rewards/chosen": 0.5171592712402344, "rewards/margins": 2.5075459162394207, "rewards/rejected": -1.9903866449991863, "step": 3802 }, { "epoch": 0.20157421885352345, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11751881.6, "logits/rejected": -17047942.666666668, "logps/chosen": -224.5724365234375, "logps/rejected": -531.4871012369791, "loss": 0.2488, "rewards/chosen": 0.4983789920806885, "rewards/margins": 4.22769538561503, "rewards/rejected": -3.7293163935343423, "step": 3803 }, { "epoch": 0.20162722285532558, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29966724.0, "logits/rejected": -29957096.0, "logps/chosen": -245.9066162109375, "logps/rejected": -365.457275390625, "loss": 0.319, "rewards/chosen": 0.3242260217666626, "rewards/margins": 1.7274643182754517, "rewards/rejected": -1.403238296508789, "step": 3804 }, { "epoch": 0.20168022685712772, "grad_norm": 65.0, "kl": 0.5863571166992188, "learning_rate": 5e-07, "logits/chosen": -8391485.6, "logits/rejected": -19356838.666666668, "logps/chosen": -471.07607421875, "logps/rejected": -170.1517333984375, "loss": 0.3104, "rewards/chosen": 0.8182823181152343, "rewards/margins": 1.8232009728749592, "rewards/rejected": -1.0049186547597249, "step": 3805 }, { "epoch": 0.20173323085892986, "grad_norm": 38.0, "kl": 0.04532623291015625, "learning_rate": 5e-07, "logits/chosen": -108837.95833333333, "logits/rejected": -9475403.2, "logps/chosen": -123.50478108723958, "logps/rejected": -288.4049072265625, "loss": 0.1902, "rewards/chosen": 0.7210559050242106, "rewards/margins": 3.152938381830851, "rewards/rejected": -2.4318824768066407, "step": 3806 }, { "epoch": 0.201786234860732, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6798735.5, "logits/rejected": -39621906.28571428, "logps/chosen": -429.5081481933594, "logps/rejected": -358.93994140625, "loss": 0.1717, "rewards/chosen": 1.3670532703399658, "rewards/margins": 3.2815892015184676, "rewards/rejected": -1.9145359311785017, "step": 3807 }, { "epoch": 0.20183923886253413, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -152087744.0, "logits/rejected": -36917572.0, "logps/chosen": -287.0328674316406, "logps/rejected": -424.4471435546875, "loss": 0.3292, "rewards/chosen": -0.1438276767730713, "rewards/margins": 2.7526116371154785, "rewards/rejected": -2.89643931388855, "step": 3808 }, { "epoch": 0.20189224286433627, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48473076.0, "logits/rejected": -14414785.142857144, "logps/chosen": -227.65957641601562, "logps/rejected": -241.84451729910714, "loss": 0.2072, "rewards/chosen": 0.5088226199150085, "rewards/margins": 2.178097903728485, "rewards/rejected": -1.6692752838134766, "step": 3809 }, { "epoch": 0.2019452468661384, "grad_norm": 49.25, "kl": 0.1289825439453125, "learning_rate": 5e-07, "logits/chosen": -20225718.666666668, "logits/rejected": -30947046.4, "logps/chosen": -201.7518513997396, "logps/rejected": -327.8398681640625, "loss": 0.2957, "rewards/chosen": 0.01945610096057256, "rewards/margins": 1.8250473027427991, "rewards/rejected": -1.8055912017822267, "step": 3810 }, { "epoch": 0.20199825086794054, "grad_norm": 54.0, "kl": 5.255271911621094, "learning_rate": 5e-07, "logits/chosen": 2326495.6, "logits/rejected": -3873697.6666666665, "logps/chosen": -615.909765625, "logps/rejected": -82.93961588541667, "loss": 0.4163, "rewards/chosen": 0.8047337532043457, "rewards/margins": 1.988639195760091, "rewards/rejected": -1.1839054425557454, "step": 3811 }, { "epoch": 0.20205125486974265, "grad_norm": 72.5, "kl": 1.5052413940429688, "learning_rate": 5e-07, "logits/chosen": -52904540.8, "logits/rejected": -8977466.666666666, "logps/chosen": -421.303662109375, "logps/rejected": -393.7235514322917, "loss": 0.3389, "rewards/chosen": 0.5252720832824707, "rewards/margins": 2.1868402481079103, "rewards/rejected": -1.6615681648254395, "step": 3812 }, { "epoch": 0.2021042588715448, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48631048.0, "logits/rejected": -20798970.666666668, "logps/chosen": -202.4306640625, "logps/rejected": -273.5120849609375, "loss": 0.3743, "rewards/chosen": -0.35228195786476135, "rewards/margins": 0.7501243849595387, "rewards/rejected": -1.1024063428243, "step": 3813 }, { "epoch": 0.20215726287334693, "grad_norm": 61.0, "kl": 0.6226654052734375, "learning_rate": 5e-07, "logits/chosen": -94652088.0, "logits/rejected": -8772272.0, "logps/chosen": -484.0739440917969, "logps/rejected": -158.96713256835938, "loss": 0.3919, "rewards/chosen": -0.25328418612480164, "rewards/margins": 1.3649348318576813, "rewards/rejected": -1.618219017982483, "step": 3814 }, { "epoch": 0.20221026687514906, "grad_norm": 74.0, "kl": 2.178617000579834, "learning_rate": 5e-07, "logits/chosen": -44269770.666666664, "logits/rejected": -3513112.75, "logps/chosen": -409.1960042317708, "logps/rejected": -573.8974609375, "loss": 0.377, "rewards/chosen": 0.518058975537618, "rewards/margins": 2.683862646420797, "rewards/rejected": -2.1658036708831787, "step": 3815 }, { "epoch": 0.2022632708769512, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16620228.0, "logits/rejected": -22999848.0, "logps/chosen": -418.1630554199219, "logps/rejected": -320.32659912109375, "loss": 0.3702, "rewards/chosen": -0.1687820553779602, "rewards/margins": 1.2536140084266663, "rewards/rejected": -1.4223960638046265, "step": 3816 }, { "epoch": 0.20231627487875334, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10439135.333333334, "logits/rejected": -32994630.4, "logps/chosen": -145.48173014322916, "logps/rejected": -176.53323974609376, "loss": 0.2834, "rewards/chosen": 0.17559383312861124, "rewards/margins": 1.763990859190623, "rewards/rejected": -1.5883970260620117, "step": 3817 }, { "epoch": 0.20236927888055548, "grad_norm": 54.75, "kl": 0.3870201110839844, "learning_rate": 5e-07, "logits/chosen": -43405376.0, "logits/rejected": -26573756.8, "logps/chosen": -746.5589192708334, "logps/rejected": -284.40419921875, "loss": 0.2102, "rewards/chosen": 1.0745442708333333, "rewards/margins": 2.8542456944783527, "rewards/rejected": -1.7797014236450195, "step": 3818 }, { "epoch": 0.2024222828823576, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5225578.0, "logits/rejected": -33087366.4, "logps/chosen": -129.97065226236978, "logps/rejected": -310.004541015625, "loss": 0.272, "rewards/chosen": 0.41299347082773846, "rewards/margins": 1.9962549606959026, "rewards/rejected": -1.583261489868164, "step": 3819 }, { "epoch": 0.20247528688415975, "grad_norm": 76.5, "kl": 0.03167724609375, "learning_rate": 5e-07, "logits/chosen": -41910710.85714286, "logits/rejected": -31166286.0, "logps/chosen": -368.1821986607143, "logps/rejected": -192.712890625, "loss": 0.4507, "rewards/chosen": 0.17717443193708146, "rewards/margins": 0.7698929735592434, "rewards/rejected": -0.5927185416221619, "step": 3820 }, { "epoch": 0.2025282908859619, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12927144.0, "logits/rejected": -24051902.4, "logps/chosen": -342.4175211588542, "logps/rejected": -274.895263671875, "loss": 0.259, "rewards/chosen": 0.46388689676920575, "rewards/margins": 2.2838167826334637, "rewards/rejected": -1.8199298858642579, "step": 3821 }, { "epoch": 0.20258129488776402, "grad_norm": 53.25, "kl": 2.759883403778076, "learning_rate": 5e-07, "logits/chosen": -26959174.4, "logits/rejected": -8609242.0, "logps/chosen": -475.7765625, "logps/rejected": -144.69783528645834, "loss": 0.342, "rewards/chosen": 0.6957735061645508, "rewards/margins": 1.8788557688395184, "rewards/rejected": -1.1830822626749675, "step": 3822 }, { "epoch": 0.20263429888956616, "grad_norm": 51.0, "kl": 0.8911361694335938, "learning_rate": 5e-07, "logits/chosen": -57298259.2, "logits/rejected": -42218909.333333336, "logps/chosen": -364.33603515625, "logps/rejected": -358.3826904296875, "loss": 0.3611, "rewards/chosen": 0.38759140968322753, "rewards/margins": 2.5053128083546956, "rewards/rejected": -2.1177213986714682, "step": 3823 }, { "epoch": 0.2026873028913683, "grad_norm": 49.0, "kl": 0.7111129760742188, "learning_rate": 5e-07, "logits/chosen": -27832112.0, "logits/rejected": -15759595.0, "logps/chosen": -598.2276611328125, "logps/rejected": -289.7449951171875, "loss": 0.2515, "rewards/chosen": 0.6274784207344055, "rewards/margins": 3.0240162014961243, "rewards/rejected": -2.3965377807617188, "step": 3824 }, { "epoch": 0.20274030689317044, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36486917.333333336, "logits/rejected": -14009800.0, "logps/chosen": -262.40850830078125, "logps/rejected": -358.98232421875, "loss": 0.2477, "rewards/chosen": 0.4236447811126709, "rewards/margins": 2.5555551052093506, "rewards/rejected": -2.1319103240966797, "step": 3825 }, { "epoch": 0.20279331089497257, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1933251.0, "logits/rejected": -30026212.57142857, "logps/chosen": -27.932472229003906, "logps/rejected": -337.93589564732144, "loss": 0.2198, "rewards/chosen": -0.11094742268323898, "rewards/margins": 1.949984191783837, "rewards/rejected": -2.060931614467076, "step": 3826 }, { "epoch": 0.2028463148967747, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16594682.666666666, "logits/rejected": -19778153.6, "logps/chosen": -172.7602742513021, "logps/rejected": -125.715478515625, "loss": 0.3228, "rewards/chosen": -0.09793243805567424, "rewards/margins": 1.4598231275876363, "rewards/rejected": -1.5577555656433106, "step": 3827 }, { "epoch": 0.20289931889857685, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28633008.0, "logits/rejected": -18523356.0, "logps/chosen": -417.2802429199219, "logps/rejected": -459.9938659667969, "loss": 0.3047, "rewards/chosen": 0.19240225851535797, "rewards/margins": 2.4893108755350113, "rewards/rejected": -2.2969086170196533, "step": 3828 }, { "epoch": 0.20295232290037898, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11716330.0, "logits/rejected": -64571108.0, "logps/chosen": -54.621437072753906, "logps/rejected": -423.84808349609375, "loss": 0.3598, "rewards/chosen": -0.24102988839149475, "rewards/margins": 1.5420452058315277, "rewards/rejected": -1.7830750942230225, "step": 3829 }, { "epoch": 0.20300532690218112, "grad_norm": 39.75, "kl": 0.008142471313476562, "learning_rate": 5e-07, "logits/chosen": -3236751.6666666665, "logits/rejected": -10085790.4, "logps/chosen": -162.46356201171875, "logps/rejected": -277.2177734375, "loss": 0.2988, "rewards/chosen": 0.36870721975962323, "rewards/margins": 2.209292356173197, "rewards/rejected": -1.8405851364135741, "step": 3830 }, { "epoch": 0.20305833090398326, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8987605.333333334, "logits/rejected": -65013324.8, "logps/chosen": -158.97022501627603, "logps/rejected": -293.946337890625, "loss": 0.2236, "rewards/chosen": 0.5207465092341105, "rewards/margins": 2.78559197584788, "rewards/rejected": -2.2648454666137696, "step": 3831 }, { "epoch": 0.2031113349057854, "grad_norm": 51.75, "kl": 0.02321338653564453, "learning_rate": 5e-07, "logits/chosen": 35284096.0, "logits/rejected": -31577088.0, "logps/chosen": -376.3060607910156, "logps/rejected": -613.3762613932291, "loss": 0.2828, "rewards/chosen": 0.08998031914234161, "rewards/margins": 1.7695842136939366, "rewards/rejected": -1.679603894551595, "step": 3832 }, { "epoch": 0.20316433890758753, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63313941.333333336, "logits/rejected": -3930022.0, "logps/chosen": -410.3316650390625, "logps/rejected": -54.23952865600586, "loss": 0.3961, "rewards/chosen": 0.20346248149871826, "rewards/margins": 1.3994442224502563, "rewards/rejected": -1.195981740951538, "step": 3833 }, { "epoch": 0.20321734290938967, "grad_norm": 52.75, "kl": 0.2939777374267578, "learning_rate": 5e-07, "logits/chosen": -17482755.2, "logits/rejected": -10887046.0, "logps/chosen": -332.7917236328125, "logps/rejected": -429.4957275390625, "loss": 0.2786, "rewards/chosen": 0.6911468505859375, "rewards/margins": 3.2152843475341797, "rewards/rejected": -2.524137496948242, "step": 3834 }, { "epoch": 0.2032703469111918, "grad_norm": 65.0, "kl": 0.5393905639648438, "learning_rate": 5e-07, "logits/chosen": -51949068.8, "logits/rejected": -15810940.0, "logps/chosen": -409.63984375, "logps/rejected": -213.91035970052084, "loss": 0.4115, "rewards/chosen": 0.09050483703613281, "rewards/margins": 1.203860870997111, "rewards/rejected": -1.1133560339609783, "step": 3835 }, { "epoch": 0.20332335091299394, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3353345.3333333335, "logits/rejected": -14765638.4, "logps/chosen": -121.28238932291667, "logps/rejected": -337.3611328125, "loss": 0.3339, "rewards/chosen": -0.11600581804911296, "rewards/margins": 1.4912695725758869, "rewards/rejected": -1.607275390625, "step": 3836 }, { "epoch": 0.20337635491479605, "grad_norm": 65.5, "kl": 0.10010528564453125, "learning_rate": 5e-07, "logits/chosen": -39639168.0, "logits/rejected": -7020484.0, "logps/chosen": -502.3638916015625, "logps/rejected": -178.55079650878906, "loss": 0.4009, "rewards/chosen": -0.054693251848220825, "rewards/margins": 0.9887842833995819, "rewards/rejected": -1.0434775352478027, "step": 3837 }, { "epoch": 0.2034293589165982, "grad_norm": 57.25, "kl": 0.42971038818359375, "learning_rate": 5e-07, "logits/chosen": -32686365.333333332, "logits/rejected": -10491228.0, "logps/chosen": -442.9774983723958, "logps/rejected": -245.09007263183594, "loss": 0.2541, "rewards/chosen": 0.8483227094014486, "rewards/margins": 3.6682804425557456, "rewards/rejected": -2.819957733154297, "step": 3838 }, { "epoch": 0.20348236291840033, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63916480.0, "logits/rejected": -13649662.0, "logps/chosen": -299.7865804036458, "logps/rejected": -179.70657348632812, "loss": 0.354, "rewards/chosen": 0.2571250995000203, "rewards/margins": 3.5389191706975303, "rewards/rejected": -3.2817940711975098, "step": 3839 }, { "epoch": 0.20353536692020247, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6973326.666666667, "logits/rejected": -43727715.2, "logps/chosen": -215.8291015625, "logps/rejected": -270.83369140625, "loss": 0.3388, "rewards/chosen": 0.35707680384318036, "rewards/margins": 1.438003698984782, "rewards/rejected": -1.0809268951416016, "step": 3840 }, { "epoch": 0.2035883709220046, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15944870.4, "logits/rejected": -450144.0, "logps/chosen": -349.2365478515625, "logps/rejected": -552.0928955078125, "loss": 0.3348, "rewards/chosen": 0.3133674144744873, "rewards/margins": 2.5459965546925862, "rewards/rejected": -2.232629140218099, "step": 3841 }, { "epoch": 0.20364137492380674, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36763384.0, "logits/rejected": -17701741.714285713, "logps/chosen": -435.2121887207031, "logps/rejected": -351.7949916294643, "loss": 0.2027, "rewards/chosen": -0.06649170070886612, "rewards/margins": 1.838755987584591, "rewards/rejected": -1.905247688293457, "step": 3842 }, { "epoch": 0.20369437892560888, "grad_norm": 56.25, "kl": 1.6838350296020508, "learning_rate": 5e-07, "logits/chosen": -4885392.8, "logits/rejected": -23135141.333333332, "logps/chosen": -400.63583984375, "logps/rejected": -192.01900227864584, "loss": 0.3626, "rewards/chosen": 0.5936242580413819, "rewards/margins": 2.0620999177296957, "rewards/rejected": -1.4684756596883137, "step": 3843 }, { "epoch": 0.203747382927411, "grad_norm": 60.0, "kl": 1.3397836685180664, "learning_rate": 5e-07, "logits/chosen": -23364062.4, "logits/rejected": -44497533.333333336, "logps/chosen": -288.282373046875, "logps/rejected": -335.3668212890625, "loss": 0.394, "rewards/chosen": 0.4285780906677246, "rewards/margins": 1.4835791905721027, "rewards/rejected": -1.0550010999043782, "step": 3844 }, { "epoch": 0.20380038692921315, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49318752.0, "logits/rejected": -53440501.333333336, "logps/chosen": -510.87969970703125, "logps/rejected": -403.9300130208333, "loss": 0.1937, "rewards/chosen": 0.5500320196151733, "rewards/margins": 2.508329113324483, "rewards/rejected": -1.9582970937093098, "step": 3845 }, { "epoch": 0.2038533909310153, "grad_norm": 102.5, "kl": 0.25115203857421875, "learning_rate": 5e-07, "logits/chosen": -51627888.0, "logits/rejected": 2976917.3333333335, "logps/chosen": -254.64842224121094, "logps/rejected": -398.1473795572917, "loss": 0.332, "rewards/chosen": -0.5283752679824829, "rewards/margins": 0.6487586895624797, "rewards/rejected": -1.1771339575449626, "step": 3846 }, { "epoch": 0.20390639493281743, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3853570.0, "logits/rejected": -24455644.8, "logps/chosen": -136.24739583333334, "logps/rejected": -303.365625, "loss": 0.2401, "rewards/chosen": 1.2510957717895508, "rewards/margins": 2.5650710105895995, "rewards/rejected": -1.3139752388000487, "step": 3847 }, { "epoch": 0.20395939893461956, "grad_norm": 54.0, "kl": 0.5192184448242188, "learning_rate": 5e-07, "logits/chosen": -21498284.0, "logits/rejected": -29751336.0, "logps/chosen": -372.166015625, "logps/rejected": -483.58428955078125, "loss": 0.2332, "rewards/chosen": 0.7038183212280273, "rewards/margins": 3.141268730163574, "rewards/rejected": -2.437450408935547, "step": 3848 }, { "epoch": 0.2040124029364217, "grad_norm": 40.75, "kl": 0.9404191970825195, "learning_rate": 5e-07, "logits/chosen": -21116084.0, "logits/rejected": -2811091.0, "logps/chosen": -104.92038981119792, "logps/rejected": -191.9351806640625, "loss": 0.4918, "rewards/chosen": -0.196647842725118, "rewards/margins": 1.044257918993632, "rewards/rejected": -1.24090576171875, "step": 3849 }, { "epoch": 0.20406540693822384, "grad_norm": 47.25, "kl": 0.26386451721191406, "learning_rate": 5e-07, "logits/chosen": 3643465.0, "logits/rejected": -25523168.0, "logps/chosen": -276.72003173828125, "logps/rejected": -280.55340576171875, "loss": 0.2576, "rewards/chosen": 0.5782455205917358, "rewards/margins": 2.6637781858444214, "rewards/rejected": -2.0855326652526855, "step": 3850 }, { "epoch": 0.20411841094002597, "grad_norm": 34.25, "kl": 0.8884763717651367, "learning_rate": 5e-07, "logits/chosen": -3237604.6666666665, "logits/rejected": -33026000.0, "logps/chosen": -145.0949910481771, "logps/rejected": -335.14666748046875, "loss": 0.289, "rewards/chosen": 0.7810855706532797, "rewards/margins": 3.113504489262899, "rewards/rejected": -2.332418918609619, "step": 3851 }, { "epoch": 0.2041714149418281, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67783098.66666667, "logits/rejected": -30638147.2, "logps/chosen": -208.48089599609375, "logps/rejected": -561.838720703125, "loss": 0.2399, "rewards/chosen": 0.3714803059895833, "rewards/margins": 2.428258260091146, "rewards/rejected": -2.0567779541015625, "step": 3852 }, { "epoch": 0.20422441894363025, "grad_norm": 91.0, "kl": 2.505401611328125, "learning_rate": 5e-07, "logits/chosen": 22716619.42857143, "logits/rejected": -44110344.0, "logps/chosen": -605.9460100446429, "logps/rejected": -552.2291259765625, "loss": 0.3579, "rewards/chosen": 0.7689313888549805, "rewards/margins": 2.508262515068054, "rewards/rejected": -1.7393311262130737, "step": 3853 }, { "epoch": 0.20427742294543239, "grad_norm": 62.5, "kl": 0.2729988098144531, "learning_rate": 5e-07, "logits/chosen": -94172288.0, "logits/rejected": -48511052.8, "logps/chosen": -763.2859700520834, "logps/rejected": -449.785302734375, "loss": 0.2603, "rewards/chosen": 0.40994465351104736, "rewards/margins": 2.283923077583313, "rewards/rejected": -1.8739784240722657, "step": 3854 }, { "epoch": 0.20433042694723452, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47600245.333333336, "logits/rejected": -14783101.0, "logps/chosen": -405.7740071614583, "logps/rejected": -288.0215148925781, "loss": 0.3955, "rewards/chosen": 0.19704411427179971, "rewards/margins": 1.525440792242686, "rewards/rejected": -1.3283966779708862, "step": 3855 }, { "epoch": 0.20438343094903666, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50606604.0, "logits/rejected": -13971786.666666666, "logps/chosen": -323.69891357421875, "logps/rejected": -194.75447591145834, "loss": 0.2926, "rewards/chosen": 0.3869720697402954, "rewards/margins": 1.5508105357487996, "rewards/rejected": -1.1638384660085042, "step": 3856 }, { "epoch": 0.2044364349508388, "grad_norm": 67.5, "kl": 2.2503433227539062, "learning_rate": 5e-07, "logits/chosen": -16670166.4, "logits/rejected": -61584938.666666664, "logps/chosen": -882.76689453125, "logps/rejected": -390.8609212239583, "loss": 0.207, "rewards/chosen": 1.4777243614196778, "rewards/margins": 3.5539621988932293, "rewards/rejected": -2.0762378374735513, "step": 3857 }, { "epoch": 0.20448943895264093, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2605990.6666666665, "logits/rejected": -36431091.2, "logps/chosen": -255.92232259114584, "logps/rejected": -481.91064453125, "loss": 0.2835, "rewards/chosen": 0.3547176520029704, "rewards/margins": 1.9632789770762127, "rewards/rejected": -1.6085613250732422, "step": 3858 }, { "epoch": 0.20454244295444307, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27187466.666666668, "logits/rejected": -15163308.0, "logps/chosen": -250.422607421875, "logps/rejected": -395.5906066894531, "loss": 0.357, "rewards/chosen": 0.4802076816558838, "rewards/margins": 1.986945629119873, "rewards/rejected": -1.5067379474639893, "step": 3859 }, { "epoch": 0.2045954469562452, "grad_norm": 58.75, "kl": 0.7085533142089844, "learning_rate": 5e-07, "logits/chosen": -54579884.8, "logits/rejected": -70003168.0, "logps/chosen": -340.415478515625, "logps/rejected": -793.36572265625, "loss": 0.3994, "rewards/chosen": -0.09386718273162842, "rewards/margins": 3.0253761212031045, "rewards/rejected": -3.119243303934733, "step": 3860 }, { "epoch": 0.20464845095804735, "grad_norm": 37.75, "kl": 0.5563602447509766, "learning_rate": 5e-07, "logits/chosen": -17637668.0, "logits/rejected": -8016865.333333333, "logps/chosen": -165.58120727539062, "logps/rejected": -125.51678466796875, "loss": 0.2638, "rewards/chosen": 0.3108787536621094, "rewards/margins": 1.7203224500020344, "rewards/rejected": -1.409443696339925, "step": 3861 }, { "epoch": 0.20470145495984945, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4186009.3333333335, "logits/rejected": -12423473.6, "logps/chosen": -64.78761291503906, "logps/rejected": -204.9511962890625, "loss": 0.3041, "rewards/chosen": 0.03865312536557516, "rewards/margins": 1.6103456874688467, "rewards/rejected": -1.5716925621032716, "step": 3862 }, { "epoch": 0.2047544589616516, "grad_norm": 47.75, "kl": 0.23192214965820312, "learning_rate": 5e-07, "logits/chosen": -21095419.2, "logits/rejected": -10065770.0, "logps/chosen": -289.076513671875, "logps/rejected": -139.88902791341147, "loss": 0.3154, "rewards/chosen": 0.8077415466308594, "rewards/margins": 1.8155570665995282, "rewards/rejected": -1.0078155199686687, "step": 3863 }, { "epoch": 0.20480746296345373, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46877024.0, "logits/rejected": -33751482.666666664, "logps/chosen": -651.802734375, "logps/rejected": -309.0821126302083, "loss": 0.2188, "rewards/chosen": 0.6857666373252869, "rewards/margins": 2.482289731502533, "rewards/rejected": -1.796523094177246, "step": 3864 }, { "epoch": 0.20486046696525587, "grad_norm": 28.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10125390.0, "logits/rejected": -34181408.0, "logps/chosen": -197.74520874023438, "logps/rejected": -366.7551676432292, "loss": 0.1882, "rewards/chosen": 0.4311053454875946, "rewards/margins": 2.809795747200648, "rewards/rejected": -2.3786904017130532, "step": 3865 }, { "epoch": 0.204913470967058, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19973204.0, "logits/rejected": -5826340.0, "logps/chosen": -271.4002685546875, "logps/rejected": -370.2316487630208, "loss": 0.1711, "rewards/chosen": 1.257075309753418, "rewards/margins": 3.158215681711833, "rewards/rejected": -1.9011403719584148, "step": 3866 }, { "epoch": 0.20496647496886014, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 240055.5, "logits/rejected": -35577267.2, "logps/chosen": -89.65562947591145, "logps/rejected": -577.80263671875, "loss": 0.2987, "rewards/chosen": -0.11450666189193726, "rewards/margins": 3.018232595920563, "rewards/rejected": -3.1327392578125, "step": 3867 }, { "epoch": 0.20501947897066228, "grad_norm": 53.75, "kl": 0.027065277099609375, "learning_rate": 5e-07, "logits/chosen": -24929816.0, "logits/rejected": -14558991.0, "logps/chosen": -356.9190368652344, "logps/rejected": -199.92803955078125, "loss": 0.2791, "rewards/chosen": 0.7681121826171875, "rewards/margins": 2.143433094024658, "rewards/rejected": -1.3753209114074707, "step": 3868 }, { "epoch": 0.20507248297246441, "grad_norm": 61.0, "kl": 0.0108642578125, "learning_rate": 5e-07, "logits/chosen": -51637354.666666664, "logits/rejected": -52690868.0, "logps/chosen": -363.3060709635417, "logps/rejected": -318.04339599609375, "loss": 0.3656, "rewards/chosen": 0.37873323758443195, "rewards/margins": 1.9760161240895588, "rewards/rejected": -1.597282886505127, "step": 3869 }, { "epoch": 0.20512548697426655, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55797724.0, "logits/rejected": -26526946.285714287, "logps/chosen": -188.79086303710938, "logps/rejected": -329.72391183035717, "loss": 0.2169, "rewards/chosen": 0.658251941204071, "rewards/margins": 2.2829873647008623, "rewards/rejected": -1.6247354234967912, "step": 3870 }, { "epoch": 0.2051784909760687, "grad_norm": 43.5, "kl": 0.057422637939453125, "learning_rate": 5e-07, "logits/chosen": -16507752.0, "logits/rejected": -13416734.0, "logps/chosen": -172.51346842447916, "logps/rejected": -310.0530090332031, "loss": 0.4078, "rewards/chosen": 0.04801072676976522, "rewards/margins": 1.7391622265179951, "rewards/rejected": -1.69115149974823, "step": 3871 }, { "epoch": 0.20523149497787083, "grad_norm": 38.0, "kl": 0.3803558349609375, "learning_rate": 5e-07, "logits/chosen": 4203123.5, "logits/rejected": -49846628.571428575, "logps/chosen": -453.07440185546875, "logps/rejected": -386.89271763392856, "loss": 0.1156, "rewards/chosen": 0.3572631776332855, "rewards/margins": 3.122260387454714, "rewards/rejected": -2.7649972098214284, "step": 3872 }, { "epoch": 0.20528449897967296, "grad_norm": 60.0, "kl": 1.12261962890625, "learning_rate": 5e-07, "logits/chosen": -41632128.0, "logits/rejected": -62362768.0, "logps/chosen": -535.29677734375, "logps/rejected": -390.239990234375, "loss": 0.3241, "rewards/chosen": 0.6244997501373291, "rewards/margins": 2.6489812056223547, "rewards/rejected": -2.024481455485026, "step": 3873 }, { "epoch": 0.2053375029814751, "grad_norm": 60.75, "kl": 0.44775390625, "learning_rate": 5e-07, "logits/chosen": -65974952.0, "logits/rejected": -41133952.0, "logps/chosen": -576.0202026367188, "logps/rejected": -426.415283203125, "loss": 0.3173, "rewards/chosen": 0.4372100830078125, "rewards/margins": 1.6258398691813152, "rewards/rejected": -1.1886297861735027, "step": 3874 }, { "epoch": 0.20539050698327724, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45808682.666666664, "logits/rejected": -37414032.0, "logps/chosen": -266.5950927734375, "logps/rejected": -362.233642578125, "loss": 0.2993, "rewards/chosen": 0.13177159428596497, "rewards/margins": 2.09683011174202, "rewards/rejected": -1.9650585174560546, "step": 3875 }, { "epoch": 0.20544351098507938, "grad_norm": 50.0, "kl": 0.160247802734375, "learning_rate": 5e-07, "logits/chosen": -36099253.333333336, "logits/rejected": -5132228.0, "logps/chosen": -506.0629475911458, "logps/rejected": -233.407763671875, "loss": 0.2857, "rewards/chosen": 0.7990880012512207, "rewards/margins": 1.9550893783569336, "rewards/rejected": -1.156001377105713, "step": 3876 }, { "epoch": 0.2054965149868815, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14953806.666666666, "logits/rejected": -38094444.8, "logps/chosen": -252.904541015625, "logps/rejected": -370.2690185546875, "loss": 0.242, "rewards/chosen": -0.05235010385513306, "rewards/margins": 2.7159266829490663, "rewards/rejected": -2.7682767868041993, "step": 3877 }, { "epoch": 0.20554951898868365, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -120104984.0, "logits/rejected": -34497664.0, "logps/chosen": -611.87451171875, "logps/rejected": -397.7565104166667, "loss": 0.198, "rewards/chosen": 0.7948395013809204, "rewards/margins": 2.8043635288874307, "rewards/rejected": -2.0095240275065103, "step": 3878 }, { "epoch": 0.2056025229904858, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25739682.0, "logits/rejected": -71264576.0, "logps/chosen": -197.13833618164062, "logps/rejected": -557.115478515625, "loss": 0.2623, "rewards/chosen": 0.11144786328077316, "rewards/margins": 3.183468632400036, "rewards/rejected": -3.0720207691192627, "step": 3879 }, { "epoch": 0.20565552699228792, "grad_norm": 42.75, "kl": 0.2655954360961914, "learning_rate": 5e-07, "logits/chosen": -31243088.0, "logits/rejected": -39313712.0, "logps/chosen": -275.3018493652344, "logps/rejected": -576.71142578125, "loss": 0.3458, "rewards/chosen": 0.339371919631958, "rewards/margins": 3.1719248294830322, "rewards/rejected": -2.832552909851074, "step": 3880 }, { "epoch": 0.20570853099409006, "grad_norm": 66.0, "kl": 1.4797821044921875, "learning_rate": 5e-07, "logits/chosen": -9205014.857142856, "logits/rejected": -60805580.0, "logps/chosen": -267.3887939453125, "logps/rejected": -529.4373779296875, "loss": 0.3963, "rewards/chosen": 0.3833679471697126, "rewards/margins": 4.113058498927525, "rewards/rejected": -3.7296905517578125, "step": 3881 }, { "epoch": 0.2057615349958922, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17821162.666666668, "logits/rejected": -11100964.0, "logps/chosen": -124.84372965494792, "logps/rejected": -334.799755859375, "loss": 0.3243, "rewards/chosen": -0.5121307373046875, "rewards/margins": 1.2910169601440429, "rewards/rejected": -1.8031476974487304, "step": 3882 }, { "epoch": 0.20581453899769434, "grad_norm": 63.75, "kl": 0.6814041137695312, "learning_rate": 5e-07, "logits/chosen": -30705202.285714287, "logits/rejected": 6811952.0, "logps/chosen": -297.0043247767857, "logps/rejected": -122.90462493896484, "loss": 0.4107, "rewards/chosen": 0.47738650866917204, "rewards/margins": 0.6890907628195626, "rewards/rejected": -0.21170425415039062, "step": 3883 }, { "epoch": 0.20586754299949647, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9162952.0, "logits/rejected": -16880017.333333332, "logps/chosen": -592.0694580078125, "logps/rejected": -372.025390625, "loss": 0.3111, "rewards/chosen": -0.18685302138328552, "rewards/margins": 1.1760233938694, "rewards/rejected": -1.3628764152526855, "step": 3884 }, { "epoch": 0.2059205470012986, "grad_norm": 46.25, "kl": 0.8727569580078125, "learning_rate": 5e-07, "logits/chosen": -23572339.2, "logits/rejected": -37440426.666666664, "logps/chosen": -275.132373046875, "logps/rejected": -290.27036539713544, "loss": 0.3318, "rewards/chosen": 0.2709000825881958, "rewards/margins": 2.5117080926895143, "rewards/rejected": -2.2408080101013184, "step": 3885 }, { "epoch": 0.20597355100310075, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39108000.0, "logits/rejected": -13335414.4, "logps/chosen": -232.385009765625, "logps/rejected": -166.19677734375, "loss": 0.3068, "rewards/chosen": 0.3937258720397949, "rewards/margins": 1.7263882637023926, "rewards/rejected": -1.3326623916625977, "step": 3886 }, { "epoch": 0.20602655500490288, "grad_norm": 50.25, "kl": 0.02776336669921875, "learning_rate": 5e-07, "logits/chosen": -9957553.0, "logits/rejected": -31842220.0, "logps/chosen": -429.6451416015625, "logps/rejected": -383.33642578125, "loss": 0.2429, "rewards/chosen": 0.6168149709701538, "rewards/margins": 3.3672879934310913, "rewards/rejected": -2.7504730224609375, "step": 3887 }, { "epoch": 0.206079559006705, "grad_norm": 47.0, "kl": 0.13199996948242188, "learning_rate": 5e-07, "logits/chosen": -51223144.0, "logits/rejected": -60978164.0, "logps/chosen": -288.0986022949219, "logps/rejected": -358.64239501953125, "loss": 0.3319, "rewards/chosen": 0.24912983179092407, "rewards/margins": 1.9634451270103455, "rewards/rejected": -1.7143152952194214, "step": 3888 }, { "epoch": 0.20613256300850713, "grad_norm": 64.0, "kl": 2.2594213485717773, "learning_rate": 5e-07, "logits/chosen": -1498650.6666666667, "logits/rejected": -20624200.0, "logps/chosen": -827.5861002604166, "logps/rejected": -450.841357421875, "loss": 0.2027, "rewards/chosen": 1.0452001889546711, "rewards/margins": 3.668638261159261, "rewards/rejected": -2.62343807220459, "step": 3889 }, { "epoch": 0.20618556701030927, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27367517.333333332, "logits/rejected": -27224425.6, "logps/chosen": -151.81205240885416, "logps/rejected": -271.9494873046875, "loss": 0.3063, "rewards/chosen": -0.2057765523592631, "rewards/margins": 1.7258251627286274, "rewards/rejected": -1.9316017150878906, "step": 3890 }, { "epoch": 0.2062385710121114, "grad_norm": 86.0, "kl": 4.092430114746094, "learning_rate": 5e-07, "logits/chosen": -81401829.33333333, "logits/rejected": -38410425.6, "logps/chosen": -769.8104654947916, "logps/rejected": -451.24521484375, "loss": 0.2923, "rewards/chosen": 0.8437489668528239, "rewards/margins": 2.916515843073527, "rewards/rejected": -2.072766876220703, "step": 3891 }, { "epoch": 0.20629157501391354, "grad_norm": 66.5, "kl": 1.5497055053710938, "learning_rate": 5e-07, "logits/chosen": -70441196.8, "logits/rejected": -43806778.666666664, "logps/chosen": -398.09140625, "logps/rejected": -394.7649739583333, "loss": 0.35, "rewards/chosen": 0.36972932815551757, "rewards/margins": 2.166477171579997, "rewards/rejected": -1.7967478434244792, "step": 3892 }, { "epoch": 0.20634457901571568, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29305117.333333332, "logits/rejected": 22590004.0, "logps/chosen": -364.4781901041667, "logps/rejected": -370.0464172363281, "loss": 0.4113, "rewards/chosen": 0.027196228504180908, "rewards/margins": 1.701592743396759, "rewards/rejected": -1.6743965148925781, "step": 3893 }, { "epoch": 0.20639758301751782, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16250621.0, "logits/rejected": -51822250.666666664, "logps/chosen": -127.69708251953125, "logps/rejected": -457.7826334635417, "loss": 0.2221, "rewards/chosen": -0.026611320674419403, "rewards/margins": 2.35488510876894, "rewards/rejected": -2.3814964294433594, "step": 3894 }, { "epoch": 0.20645058701931995, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19988980.0, "logits/rejected": -24305209.6, "logps/chosen": -261.40358479817706, "logps/rejected": -270.1807373046875, "loss": 0.3155, "rewards/chosen": -0.1589245597521464, "rewards/margins": 1.5184886177380879, "rewards/rejected": -1.6774131774902343, "step": 3895 }, { "epoch": 0.2065035910211221, "grad_norm": 44.75, "kl": 0.6515178680419922, "learning_rate": 5e-07, "logits/chosen": -43776810.666666664, "logits/rejected": -18055451.2, "logps/chosen": -202.20487467447916, "logps/rejected": -248.6029541015625, "loss": 0.3362, "rewards/chosen": 0.2660805384318034, "rewards/margins": 1.4732035319010417, "rewards/rejected": -1.2071229934692382, "step": 3896 }, { "epoch": 0.20655659502292423, "grad_norm": 38.0, "kl": 0.4316291809082031, "learning_rate": 5e-07, "logits/chosen": -10021921.333333334, "logits/rejected": -38807180.8, "logps/chosen": -152.94901529947916, "logps/rejected": -551.758544921875, "loss": 0.2125, "rewards/chosen": 0.6561758915583292, "rewards/margins": 2.943488017717997, "rewards/rejected": -2.287312126159668, "step": 3897 }, { "epoch": 0.20660959902472636, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42553176.0, "logits/rejected": -27422224.0, "logps/chosen": -372.0478515625, "logps/rejected": -518.7379557291666, "loss": 0.1997, "rewards/chosen": 0.4958130121231079, "rewards/margins": 3.6462494134902954, "rewards/rejected": -3.1504364013671875, "step": 3898 }, { "epoch": 0.2066626030265285, "grad_norm": 95.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14785630.4, "logits/rejected": -60419605.333333336, "logps/chosen": -411.065771484375, "logps/rejected": -385.0939127604167, "loss": 0.3535, "rewards/chosen": 0.11326916217803955, "rewards/margins": 1.9147923707962036, "rewards/rejected": -1.801523208618164, "step": 3899 }, { "epoch": 0.20671560702833064, "grad_norm": 56.25, "kl": 0.15163421630859375, "learning_rate": 5e-07, "logits/chosen": -45708966.4, "logits/rejected": -35088842.666666664, "logps/chosen": -362.62177734375, "logps/rejected": -255.2528076171875, "loss": 0.3324, "rewards/chosen": 0.631956958770752, "rewards/margins": 1.7436902364095053, "rewards/rejected": -1.1117332776387532, "step": 3900 }, { "epoch": 0.20676861103013278, "grad_norm": 59.25, "kl": 0.7933731079101562, "learning_rate": 5e-07, "logits/chosen": -46069624.0, "logits/rejected": -27696230.0, "logps/chosen": -410.121337890625, "logps/rejected": -210.80162048339844, "loss": 0.3015, "rewards/chosen": 0.4185172915458679, "rewards/margins": 2.286024034023285, "rewards/rejected": -1.867506742477417, "step": 3901 }, { "epoch": 0.2068216150319349, "grad_norm": 50.0, "kl": 0.29297637939453125, "learning_rate": 5e-07, "logits/chosen": -7246384.8, "logits/rejected": -16077814.666666666, "logps/chosen": -291.980517578125, "logps/rejected": -109.68747965494792, "loss": 0.3371, "rewards/chosen": 0.4375319480895996, "rewards/margins": 1.9760751724243164, "rewards/rejected": -1.5385432243347168, "step": 3902 }, { "epoch": 0.20687461903373705, "grad_norm": 46.5, "kl": 1.3586788177490234, "learning_rate": 5e-07, "logits/chosen": -52720213.333333336, "logits/rejected": 17913776.0, "logps/chosen": -232.1677449544271, "logps/rejected": -565.4863891601562, "loss": 0.4657, "rewards/chosen": 0.03321864207585653, "rewards/margins": 2.9181649883588157, "rewards/rejected": -2.884946346282959, "step": 3903 }, { "epoch": 0.2069276230355392, "grad_norm": 62.5, "kl": 0.06575775146484375, "learning_rate": 5e-07, "logits/chosen": -119444824.0, "logits/rejected": -16992302.0, "logps/chosen": -482.85601806640625, "logps/rejected": -190.85662841796875, "loss": 0.3776, "rewards/chosen": 0.07305678725242615, "rewards/margins": 1.1303265988826752, "rewards/rejected": -1.057269811630249, "step": 3904 }, { "epoch": 0.20698062703734132, "grad_norm": 57.0, "kl": 2.0390586853027344, "learning_rate": 5e-07, "logits/chosen": -51306624.0, "logits/rejected": -70355032.0, "logps/chosen": -478.1112060546875, "logps/rejected": -498.3074951171875, "loss": 0.3347, "rewards/chosen": 0.28599661588668823, "rewards/margins": 2.178097903728485, "rewards/rejected": -1.8921012878417969, "step": 3905 }, { "epoch": 0.20703363103914346, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4020537.0, "logits/rejected": -59340196.0, "logps/chosen": -84.2304458618164, "logps/rejected": -499.380615234375, "loss": 0.2984, "rewards/chosen": -0.13264095783233643, "rewards/margins": 2.910788893699646, "rewards/rejected": -3.0434298515319824, "step": 3906 }, { "epoch": 0.2070866350409456, "grad_norm": 53.25, "kl": 0.9030256271362305, "learning_rate": 5e-07, "logits/chosen": -43199880.0, "logits/rejected": -45624592.0, "logps/chosen": -272.04833984375, "logps/rejected": -402.69793701171875, "loss": 0.408, "rewards/chosen": 0.1616895099480947, "rewards/margins": 1.9826962848504384, "rewards/rejected": -1.8210067749023438, "step": 3907 }, { "epoch": 0.20713963904274774, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9675501.333333334, "logits/rejected": -26875267.2, "logps/chosen": -207.11991373697916, "logps/rejected": -279.129736328125, "loss": 0.3603, "rewards/chosen": -0.6393477916717529, "rewards/margins": 1.1216866016387939, "rewards/rejected": -1.7610343933105468, "step": 3908 }, { "epoch": 0.20719264304454987, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3409668.6666666665, "logits/rejected": -54373433.6, "logps/chosen": -20.204962412516277, "logps/rejected": -436.495263671875, "loss": 0.278, "rewards/chosen": 0.25792547067006427, "rewards/margins": 2.680327280362447, "rewards/rejected": -2.4224018096923827, "step": 3909 }, { "epoch": 0.207245647046352, "grad_norm": 54.0, "kl": 2.614100456237793, "learning_rate": 5e-07, "logits/chosen": -8254870.666666667, "logits/rejected": -46832044.0, "logps/chosen": -191.37615966796875, "logps/rejected": -446.6942443847656, "loss": 0.3932, "rewards/chosen": 0.4970564842224121, "rewards/margins": 2.0531082153320312, "rewards/rejected": -1.5560517311096191, "step": 3910 }, { "epoch": 0.20729865104815415, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37996772.0, "logits/rejected": -29376398.0, "logps/chosen": -209.9481964111328, "logps/rejected": -378.173828125, "loss": 0.3528, "rewards/chosen": 0.12412568926811218, "rewards/margins": 1.395779699087143, "rewards/rejected": -1.2716540098190308, "step": 3911 }, { "epoch": 0.20735165504995628, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19320260.0, "logits/rejected": -26830190.4, "logps/chosen": -200.54410807291666, "logps/rejected": -350.081103515625, "loss": 0.2418, "rewards/chosen": 0.24851886431376138, "rewards/margins": 2.41532146135966, "rewards/rejected": -2.1668025970458986, "step": 3912 }, { "epoch": 0.2074046590517584, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10847559.333333334, "logits/rejected": -13380936.8, "logps/chosen": -137.8281046549479, "logps/rejected": -336.772265625, "loss": 0.2499, "rewards/chosen": 0.5309341748555502, "rewards/margins": 2.564853318532308, "rewards/rejected": -2.0339191436767576, "step": 3913 }, { "epoch": 0.20745766305356053, "grad_norm": 60.75, "kl": 1.9466562271118164, "learning_rate": 5e-07, "logits/chosen": -28530280.0, "logits/rejected": -34298448.0, "logps/chosen": -564.7158203125, "logps/rejected": -178.03564453125, "loss": 0.3455, "rewards/chosen": 0.9291083812713623, "rewards/margins": 2.02534556388855, "rewards/rejected": -1.0962371826171875, "step": 3914 }, { "epoch": 0.20751066705536267, "grad_norm": 50.5, "kl": 0.011627197265625, "learning_rate": 5e-07, "logits/chosen": -42264840.0, "logits/rejected": -11009317.0, "logps/chosen": -324.14312744140625, "logps/rejected": -132.05809020996094, "loss": 0.2682, "rewards/chosen": 0.48271864652633667, "rewards/margins": 2.397584021091461, "rewards/rejected": -1.9148653745651245, "step": 3915 }, { "epoch": 0.2075636710571648, "grad_norm": 48.0, "kl": 0.13346481323242188, "learning_rate": 5e-07, "logits/chosen": -56392394.666666664, "logits/rejected": -15214376.0, "logps/chosen": -471.1202799479167, "logps/rejected": -244.144775390625, "loss": 0.2468, "rewards/chosen": 0.6275217533111572, "rewards/margins": 2.7367327213287354, "rewards/rejected": -2.109210968017578, "step": 3916 }, { "epoch": 0.20761667505896694, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38289864.0, "logits/rejected": -19731260.0, "logps/chosen": -273.23822021484375, "logps/rejected": -138.9109344482422, "loss": 0.311, "rewards/chosen": 0.221712127327919, "rewards/margins": 1.8185869604349136, "rewards/rejected": -1.5968748331069946, "step": 3917 }, { "epoch": 0.20766967906076908, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51667648.0, "logits/rejected": -8638476.8, "logps/chosen": -407.8170166015625, "logps/rejected": -227.247119140625, "loss": 0.2442, "rewards/chosen": 0.507288654645284, "rewards/margins": 2.197115238507589, "rewards/rejected": -1.6898265838623048, "step": 3918 }, { "epoch": 0.20772268306257122, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40139184.0, "logits/rejected": -23279426.666666668, "logps/chosen": -229.8668212890625, "logps/rejected": -250.02545166015625, "loss": 0.406, "rewards/chosen": -0.03591949939727783, "rewards/margins": 1.2414485692977906, "rewards/rejected": -1.2773680686950684, "step": 3919 }, { "epoch": 0.20777568706437335, "grad_norm": 46.5, "kl": 0.03994941711425781, "learning_rate": 5e-07, "logits/chosen": -43864744.0, "logits/rejected": -25563072.0, "logps/chosen": -273.107421875, "logps/rejected": -285.4168395996094, "loss": 0.2742, "rewards/chosen": 0.4269867241382599, "rewards/margins": 2.5934403240680695, "rewards/rejected": -2.1664535999298096, "step": 3920 }, { "epoch": 0.2078286910661755, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9277016.0, "logits/rejected": -3044455.0, "logps/chosen": -257.7811686197917, "logps/rejected": -249.4080810546875, "loss": 0.2946, "rewards/chosen": 0.6233388582865397, "rewards/margins": 1.8271373430887858, "rewards/rejected": -1.203798484802246, "step": 3921 }, { "epoch": 0.20788169506797763, "grad_norm": 42.25, "kl": 0.3287391662597656, "learning_rate": 5e-07, "logits/chosen": -14558390.0, "logits/rejected": -54349192.0, "logps/chosen": -206.73556518554688, "logps/rejected": -299.8948669433594, "loss": 0.285, "rewards/chosen": 0.5898663997650146, "rewards/margins": 2.173133611679077, "rewards/rejected": -1.5832672119140625, "step": 3922 }, { "epoch": 0.20793469906977977, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15321578.666666666, "logits/rejected": -13027377.6, "logps/chosen": -303.3165283203125, "logps/rejected": -283.192822265625, "loss": 0.2803, "rewards/chosen": 0.6714212894439697, "rewards/margins": 1.84768385887146, "rewards/rejected": -1.1762625694274902, "step": 3923 }, { "epoch": 0.2079877030715819, "grad_norm": 43.25, "kl": 0.077911376953125, "learning_rate": 5e-07, "logits/chosen": -64444016.0, "logits/rejected": -12032220.0, "logps/chosen": -272.2583923339844, "logps/rejected": -206.5679931640625, "loss": 0.2876, "rewards/chosen": 0.43086662888526917, "rewards/margins": 2.4904641211032867, "rewards/rejected": -2.0595974922180176, "step": 3924 }, { "epoch": 0.20804070707338404, "grad_norm": 60.0, "kl": 0.1540994644165039, "learning_rate": 5e-07, "logits/chosen": -35528288.0, "logits/rejected": -18668126.4, "logps/chosen": -717.7959798177084, "logps/rejected": -138.8763916015625, "loss": 0.2723, "rewards/chosen": 0.7706146240234375, "rewards/margins": 2.067587375640869, "rewards/rejected": -1.2969727516174316, "step": 3925 }, { "epoch": 0.20809371107518618, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44172252.0, "logits/rejected": -38035300.0, "logps/chosen": -202.36032104492188, "logps/rejected": -441.3624267578125, "loss": 0.2994, "rewards/chosen": -0.11840939521789551, "rewards/margins": 2.5730881690979004, "rewards/rejected": -2.691497564315796, "step": 3926 }, { "epoch": 0.20814671507698831, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -130309672.0, "logits/rejected": -19986924.0, "logps/chosen": -444.895751953125, "logps/rejected": -349.17327880859375, "loss": 0.3555, "rewards/chosen": 0.23501814901828766, "rewards/margins": 1.3541795164346695, "rewards/rejected": -1.1191613674163818, "step": 3927 }, { "epoch": 0.20819971907879045, "grad_norm": 47.5, "kl": 0.3224678039550781, "learning_rate": 5e-07, "logits/chosen": 490000.875, "logits/rejected": -43090508.8, "logps/chosen": -76.48887125651042, "logps/rejected": -189.3436767578125, "loss": 0.32, "rewards/chosen": 0.28780625263849896, "rewards/margins": 1.502508286635081, "rewards/rejected": -1.214702033996582, "step": 3928 }, { "epoch": 0.2082527230805926, "grad_norm": 65.5, "kl": 1.3979759216308594, "learning_rate": 5e-07, "logits/chosen": 3326544.5, "logits/rejected": -44963450.666666664, "logps/chosen": -1288.293701171875, "logps/rejected": -419.4519449869792, "loss": 0.1881, "rewards/chosen": 0.568434476852417, "rewards/margins": 3.020226081212362, "rewards/rejected": -2.451791604359945, "step": 3929 }, { "epoch": 0.20830572708239473, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24466372.0, "logits/rejected": -28849664.0, "logps/chosen": -746.0858154296875, "logps/rejected": -333.4980163574219, "loss": 0.313, "rewards/chosen": 0.4749334156513214, "rewards/margins": 2.0377797186374664, "rewards/rejected": -1.562846302986145, "step": 3930 }, { "epoch": 0.20835873108419686, "grad_norm": 58.0, "kl": 2.91351318359375, "learning_rate": 5e-07, "logits/chosen": -25159504.0, "logits/rejected": -27202186.0, "logps/chosen": -634.2567545572916, "logps/rejected": -460.5389404296875, "loss": 0.3794, "rewards/chosen": 0.6757442156473795, "rewards/margins": 1.9844983021418252, "rewards/rejected": -1.3087540864944458, "step": 3931 }, { "epoch": 0.208411735085999, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -161768857.6, "logits/rejected": -14048672.0, "logps/chosen": -216.183642578125, "logps/rejected": -266.2054036458333, "loss": 0.3557, "rewards/chosen": 0.07251297235488892, "rewards/margins": 1.905578641096751, "rewards/rejected": -1.833065668741862, "step": 3932 }, { "epoch": 0.20846473908780114, "grad_norm": 131.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66301124.0, "logits/rejected": -18413812.0, "logps/chosen": -1679.805419921875, "logps/rejected": -339.1471252441406, "loss": 0.3556, "rewards/chosen": 0.03645649552345276, "rewards/margins": 1.6575590670108795, "rewards/rejected": -1.6211025714874268, "step": 3933 }, { "epoch": 0.20851774308960327, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3953665.0, "logits/rejected": -40961212.8, "logps/chosen": -226.28767903645834, "logps/rejected": -353.8686279296875, "loss": 0.3235, "rewards/chosen": 0.16379525264104208, "rewards/margins": 1.521339770158132, "rewards/rejected": -1.35754451751709, "step": 3934 }, { "epoch": 0.2085707470914054, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18952458.666666668, "logits/rejected": -26237372.8, "logps/chosen": -224.00189208984375, "logps/rejected": -535.882373046875, "loss": 0.3187, "rewards/chosen": -0.07799314459164937, "rewards/margins": 2.185613117615382, "rewards/rejected": -2.2636062622070314, "step": 3935 }, { "epoch": 0.20862375109320755, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23459132.0, "logits/rejected": 2291182.25, "logps/chosen": -202.05345153808594, "logps/rejected": -198.25942993164062, "loss": 0.3323, "rewards/chosen": 0.12526030838489532, "rewards/margins": 1.7452901154756546, "rewards/rejected": -1.6200298070907593, "step": 3936 }, { "epoch": 0.20867675509500969, "grad_norm": 41.75, "kl": 0.0827178955078125, "learning_rate": 5e-07, "logits/chosen": -21692968.0, "logits/rejected": -21172038.4, "logps/chosen": -200.41923014322916, "logps/rejected": -164.376416015625, "loss": 0.2792, "rewards/chosen": 0.8302021821339926, "rewards/margins": 2.0996662934621177, "rewards/rejected": -1.269464111328125, "step": 3937 }, { "epoch": 0.2087297590968118, "grad_norm": 56.75, "kl": 0.00191497802734375, "learning_rate": 5e-07, "logits/chosen": -28928716.0, "logits/rejected": -40295504.0, "logps/chosen": -162.78224182128906, "logps/rejected": -446.134033203125, "loss": 0.3324, "rewards/chosen": 0.3379303812980652, "rewards/margins": 1.7820129990577698, "rewards/rejected": -1.4440826177597046, "step": 3938 }, { "epoch": 0.20878276309861393, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16219736.0, "logits/rejected": -40557248.0, "logps/chosen": -210.73924037388392, "logps/rejected": -543.4254150390625, "loss": 0.3902, "rewards/chosen": 0.2965846742902483, "rewards/margins": 3.12724883215768, "rewards/rejected": -2.8306641578674316, "step": 3939 }, { "epoch": 0.20883576710041607, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42792306.666666664, "logits/rejected": -64648928.0, "logps/chosen": -379.8006184895833, "logps/rejected": -511.0541076660156, "loss": 0.413, "rewards/chosen": -0.00985377033551534, "rewards/margins": 2.029373715321223, "rewards/rejected": -2.0392274856567383, "step": 3940 }, { "epoch": 0.2088887711022182, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6702242.666666667, "logits/rejected": -15085833.6, "logps/chosen": -146.5666300455729, "logps/rejected": -264.99638671875, "loss": 0.3103, "rewards/chosen": 0.19103850920995077, "rewards/margins": 1.5658889730771381, "rewards/rejected": -1.3748504638671875, "step": 3941 }, { "epoch": 0.20894177510402034, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41377580.8, "logits/rejected": -10667892.0, "logps/chosen": -350.701025390625, "logps/rejected": -307.423828125, "loss": 0.2905, "rewards/chosen": 0.6748282432556152, "rewards/margins": 2.4424770991007487, "rewards/rejected": -1.7676488558451335, "step": 3942 }, { "epoch": 0.20899477910582248, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24029588.0, "logits/rejected": -75885328.0, "logps/chosen": -119.39309692382812, "logps/rejected": -322.61981201171875, "loss": 0.3344, "rewards/chosen": 0.16365833580493927, "rewards/margins": 1.8069948703050613, "rewards/rejected": -1.643336534500122, "step": 3943 }, { "epoch": 0.20904778310762462, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64627936.0, "logits/rejected": -36523020.8, "logps/chosen": -304.2257486979167, "logps/rejected": -320.103173828125, "loss": 0.2379, "rewards/chosen": 0.28479208548863727, "rewards/margins": 2.432781740029653, "rewards/rejected": -2.1479896545410155, "step": 3944 }, { "epoch": 0.20910078710942676, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72635824.0, "logits/rejected": -1646608.25, "logps/chosen": -386.8150329589844, "logps/rejected": -132.03895568847656, "loss": 0.4127, "rewards/chosen": 0.028188418596982956, "rewards/margins": 0.818485926836729, "rewards/rejected": -0.7902975082397461, "step": 3945 }, { "epoch": 0.2091537911112289, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22745680.0, "logits/rejected": 46056336.0, "logps/chosen": -213.990673828125, "logps/rejected": -404.7209065755208, "loss": 0.2892, "rewards/chosen": 0.5465744018554688, "rewards/margins": 2.7673590660095213, "rewards/rejected": -2.2207846641540527, "step": 3946 }, { "epoch": 0.20920679511303103, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53490152.0, "logits/rejected": -52679990.85714286, "logps/chosen": -222.61868286132812, "logps/rejected": -198.544921875, "loss": 0.3408, "rewards/chosen": -0.8422042727470398, "rewards/margins": 0.1626131449426924, "rewards/rejected": -1.0048174176897322, "step": 3947 }, { "epoch": 0.20925979911483317, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29710456.0, "logits/rejected": -20903652.0, "logps/chosen": -376.9588317871094, "logps/rejected": -331.1625061035156, "loss": 0.2741, "rewards/chosen": 0.5852606296539307, "rewards/margins": 2.2437397241592407, "rewards/rejected": -1.65847909450531, "step": 3948 }, { "epoch": 0.2093128031166353, "grad_norm": 59.5, "kl": 0.4512825012207031, "learning_rate": 5e-07, "logits/chosen": -20890298.666666668, "logits/rejected": -6950564.8, "logps/chosen": -342.6295572916667, "logps/rejected": -271.15927734375, "loss": 0.3307, "rewards/chosen": 0.2681312561035156, "rewards/margins": 1.4249929428100585, "rewards/rejected": -1.1568616867065429, "step": 3949 }, { "epoch": 0.20936580711843744, "grad_norm": 53.0, "kl": 0.5384140014648438, "learning_rate": 5e-07, "logits/chosen": -49946780.0, "logits/rejected": -5747190.0, "logps/chosen": -391.08306884765625, "logps/rejected": -293.0543212890625, "loss": 0.3343, "rewards/chosen": 0.11436577141284943, "rewards/margins": 1.9012814313173294, "rewards/rejected": -1.78691565990448, "step": 3950 }, { "epoch": 0.20941881112023958, "grad_norm": 50.0, "kl": 0.193115234375, "learning_rate": 5e-07, "logits/chosen": -31789075.2, "logits/rejected": -8785445.333333334, "logps/chosen": -251.140283203125, "logps/rejected": -460.861572265625, "loss": 0.3468, "rewards/chosen": 0.14419219493865967, "rewards/margins": 1.9742164373397828, "rewards/rejected": -1.830024242401123, "step": 3951 }, { "epoch": 0.20947181512204172, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 41217132.0, "logits/rejected": -10849257.0, "logps/chosen": -224.71958923339844, "logps/rejected": -273.64984130859375, "loss": 0.3278, "rewards/chosen": 6.24675303697586e-05, "rewards/margins": 2.0450301188975573, "rewards/rejected": -2.0449676513671875, "step": 3952 }, { "epoch": 0.20952481912384385, "grad_norm": 64.0, "kl": 0.3812065124511719, "learning_rate": 5e-07, "logits/chosen": -31477910.0, "logits/rejected": -4023770.6666666665, "logps/chosen": -456.2401428222656, "logps/rejected": -234.196533203125, "loss": 0.2745, "rewards/chosen": 0.940571665763855, "rewards/margins": 2.180890917778015, "rewards/rejected": -1.2403192520141602, "step": 3953 }, { "epoch": 0.209577823125646, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33579474.666666664, "logits/rejected": -27403545.6, "logps/chosen": -214.140625, "logps/rejected": -319.997021484375, "loss": 0.2391, "rewards/chosen": 1.176192839940389, "rewards/margins": 2.632680400212606, "rewards/rejected": -1.4564875602722167, "step": 3954 }, { "epoch": 0.20963082712744813, "grad_norm": 45.75, "kl": 0.13560771942138672, "learning_rate": 5e-07, "logits/chosen": -22137184.0, "logits/rejected": -26099237.333333332, "logps/chosen": -360.0241455078125, "logps/rejected": -269.2802734375, "loss": 0.3455, "rewards/chosen": 0.3782409429550171, "rewards/margins": 2.0495256185531616, "rewards/rejected": -1.6712846755981445, "step": 3955 }, { "epoch": 0.20968383112925026, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11610818.0, "logits/rejected": -28539984.0, "logps/chosen": -576.9287109375, "logps/rejected": -402.431640625, "loss": 0.2348, "rewards/chosen": -0.31629639863967896, "rewards/margins": 1.2854178718158178, "rewards/rejected": -1.6017142704554967, "step": 3956 }, { "epoch": 0.2097368351310524, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25020264.0, "logits/rejected": -47699376.0, "logps/chosen": -233.2308553059896, "logps/rejected": -174.8087646484375, "loss": 0.3106, "rewards/chosen": -0.06664733091990153, "rewards/margins": 1.790499504407247, "rewards/rejected": -1.8571468353271485, "step": 3957 }, { "epoch": 0.20978983913285454, "grad_norm": 80.0, "kl": 1.032728672027588, "learning_rate": 5e-07, "logits/chosen": -44357600.0, "logits/rejected": -20777580.0, "logps/chosen": -371.9208577473958, "logps/rejected": -204.27371215820312, "loss": 0.3819, "rewards/chosen": 0.3088194529215495, "rewards/margins": 2.6771041552225747, "rewards/rejected": -2.3682847023010254, "step": 3958 }, { "epoch": 0.20984284313465668, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7531532.5, "logits/rejected": -17724557.333333332, "logps/chosen": -70.67235565185547, "logps/rejected": -170.4379679361979, "loss": 0.3166, "rewards/chosen": -0.29617273807525635, "rewards/margins": 1.3925161759058635, "rewards/rejected": -1.6886889139811199, "step": 3959 }, { "epoch": 0.2098958471364588, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33799229.333333336, "logits/rejected": -31142089.6, "logps/chosen": -187.0321044921875, "logps/rejected": -563.407373046875, "loss": 0.2391, "rewards/chosen": 0.2161637544631958, "rewards/margins": 2.790906834602356, "rewards/rejected": -2.57474308013916, "step": 3960 }, { "epoch": 0.20994885113826095, "grad_norm": 42.75, "kl": 0.5062713623046875, "learning_rate": 5e-07, "logits/chosen": -15322688.0, "logits/rejected": -6133809.2, "logps/chosen": -266.7928059895833, "logps/rejected": -257.133203125, "loss": 0.3409, "rewards/chosen": -0.13421769936879477, "rewards/margins": 1.3832389275232952, "rewards/rejected": -1.51745662689209, "step": 3961 }, { "epoch": 0.2100018551400631, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30376586.0, "logits/rejected": 1494443.6666666667, "logps/chosen": -268.76031494140625, "logps/rejected": -216.13385009765625, "loss": 0.3264, "rewards/chosen": -0.027500249445438385, "rewards/margins": 1.1375008250276248, "rewards/rejected": -1.1650010744730632, "step": 3962 }, { "epoch": 0.21005485914186522, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1516410.3333333333, "logits/rejected": -45694070.4, "logps/chosen": -246.04205322265625, "logps/rejected": -253.1920654296875, "loss": 0.2891, "rewards/chosen": 0.008125513792037964, "rewards/margins": 1.789125269651413, "rewards/rejected": -1.780999755859375, "step": 3963 }, { "epoch": 0.21010786314366733, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27384784.0, "logits/rejected": -54124485.333333336, "logps/chosen": -163.15546875, "logps/rejected": -451.6029459635417, "loss": 0.3444, "rewards/chosen": 0.11089926958084106, "rewards/margins": 2.335844655831655, "rewards/rejected": -2.224945386250814, "step": 3964 }, { "epoch": 0.21016086714546947, "grad_norm": 63.5, "kl": 0.9317693710327148, "learning_rate": 5e-07, "logits/chosen": -24199797.333333332, "logits/rejected": -525764.75, "logps/chosen": -394.8389078776042, "logps/rejected": -90.47810363769531, "loss": 0.4339, "rewards/chosen": 0.3636314471562703, "rewards/margins": 0.6908442576726277, "rewards/rejected": -0.3272128105163574, "step": 3965 }, { "epoch": 0.2102138711472716, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33381018.666666668, "logits/rejected": -31548601.6, "logps/chosen": -157.86805216471353, "logps/rejected": -302.9139892578125, "loss": 0.2879, "rewards/chosen": -0.0853367547194163, "rewards/margins": 1.813425185283025, "rewards/rejected": -1.8987619400024414, "step": 3966 }, { "epoch": 0.21026687514907375, "grad_norm": 59.5, "kl": 0.09223556518554688, "learning_rate": 5e-07, "logits/chosen": -9701296.0, "logits/rejected": -16953965.333333332, "logps/chosen": -253.647412109375, "logps/rejected": -413.5748697916667, "loss": 0.3351, "rewards/chosen": 0.3525161027908325, "rewards/margins": 2.033844558397929, "rewards/rejected": -1.6813284556070964, "step": 3967 }, { "epoch": 0.21031987915087588, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 604300586.6666666, "logits/rejected": -9720411.2, "logps/chosen": -413.0989990234375, "logps/rejected": -356.941943359375, "loss": 0.3186, "rewards/chosen": -0.21104647715886435, "rewards/margins": 1.7092231074968975, "rewards/rejected": -1.9202695846557618, "step": 3968 }, { "epoch": 0.21037288315267802, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18285040.0, "logits/rejected": -14600827.2, "logps/chosen": -159.22283935546875, "logps/rejected": -320.5265625, "loss": 0.3379, "rewards/chosen": 0.10185286402702332, "rewards/margins": 1.2912666022777557, "rewards/rejected": -1.1894137382507324, "step": 3969 }, { "epoch": 0.21042588715448016, "grad_norm": 57.0, "kl": 0.20318984985351562, "learning_rate": 5e-07, "logits/chosen": -51496377.6, "logits/rejected": -18314590.666666668, "logps/chosen": -428.484130859375, "logps/rejected": -233.4947713216146, "loss": 0.3455, "rewards/chosen": 0.1432602047920227, "rewards/margins": 2.235990250110626, "rewards/rejected": -2.0927300453186035, "step": 3970 }, { "epoch": 0.2104788911562823, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6434700.0, "logits/rejected": -36393312.0, "logps/chosen": -198.4554443359375, "logps/rejected": -558.5780029296875, "loss": 0.2773, "rewards/chosen": 0.5850290060043335, "rewards/margins": 2.3000322580337524, "rewards/rejected": -1.715003252029419, "step": 3971 }, { "epoch": 0.21053189515808443, "grad_norm": 54.25, "kl": 0.211395263671875, "learning_rate": 5e-07, "logits/chosen": -27967573.333333332, "logits/rejected": 484086784.0, "logps/chosen": -175.774169921875, "logps/rejected": -443.02105712890625, "loss": 0.4284, "rewards/chosen": 0.06455506881078084, "rewards/margins": 1.1978795727094014, "rewards/rejected": -1.1333245038986206, "step": 3972 }, { "epoch": 0.21058489915988657, "grad_norm": 50.25, "kl": 0.07168197631835938, "learning_rate": 5e-07, "logits/chosen": -10018566.0, "logits/rejected": -33073296.0, "logps/chosen": -620.071044921875, "logps/rejected": -468.9509582519531, "loss": 0.2563, "rewards/chosen": 0.6305500268936157, "rewards/margins": 2.5712231397628784, "rewards/rejected": -1.9406731128692627, "step": 3973 }, { "epoch": 0.2106379031616887, "grad_norm": 45.0, "kl": 0.24697494506835938, "learning_rate": 5e-07, "logits/chosen": -7437983.2, "logits/rejected": -16620420.0, "logps/chosen": -204.713818359375, "logps/rejected": -145.47950236002603, "loss": 0.3041, "rewards/chosen": 0.6424170970916748, "rewards/margins": 2.0528581778208417, "rewards/rejected": -1.4104410807291667, "step": 3974 }, { "epoch": 0.21069090716349084, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16297993.333333334, "logits/rejected": -21699512.0, "logps/chosen": -63.441792805989586, "logps/rejected": -278.740283203125, "loss": 0.3101, "rewards/chosen": 0.1753442088762919, "rewards/margins": 1.5350969592730206, "rewards/rejected": -1.3597527503967286, "step": 3975 }, { "epoch": 0.21074391116529298, "grad_norm": 61.25, "kl": 0.3283500671386719, "learning_rate": 5e-07, "logits/chosen": -12278423.0, "logits/rejected": -17663128.0, "logps/chosen": -364.22119140625, "logps/rejected": -251.3813934326172, "loss": 0.2792, "rewards/chosen": 0.5470072031021118, "rewards/margins": 2.2838536500930786, "rewards/rejected": -1.7368464469909668, "step": 3976 }, { "epoch": 0.21079691516709512, "grad_norm": 39.75, "kl": 0.08335399627685547, "learning_rate": 5e-07, "logits/chosen": 1613800.1666666667, "logits/rejected": -16856854.4, "logps/chosen": -24.736648559570312, "logps/rejected": -322.81181640625, "loss": 0.3532, "rewards/chosen": -0.254756232102712, "rewards/margins": 1.2883206645647685, "rewards/rejected": -1.5430768966674804, "step": 3977 }, { "epoch": 0.21084991916889725, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37813768.0, "logits/rejected": 61208.0, "logps/chosen": -216.95048014322916, "logps/rejected": -270.978857421875, "loss": 0.3752, "rewards/chosen": -0.5279748439788818, "rewards/margins": 0.9371784687042237, "rewards/rejected": -1.4651533126831056, "step": 3978 }, { "epoch": 0.2109029231706994, "grad_norm": 66.5, "kl": 0.10369873046875, "learning_rate": 5e-07, "logits/chosen": -35231142.4, "logits/rejected": -20150605.333333332, "logps/chosen": -437.5904296875, "logps/rejected": -369.9342854817708, "loss": 0.369, "rewards/chosen": 0.21982789039611816, "rewards/margins": 1.5695704619089763, "rewards/rejected": -1.3497425715128581, "step": 3979 }, { "epoch": 0.21095592717250153, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7596159.5, "logits/rejected": -23933874.0, "logps/chosen": -227.34410095214844, "logps/rejected": -363.8174133300781, "loss": 0.2489, "rewards/chosen": 0.6956139206886292, "rewards/margins": 2.5133913159370422, "rewards/rejected": -1.817777395248413, "step": 3980 }, { "epoch": 0.21100893117430367, "grad_norm": 52.75, "kl": 0.11538505554199219, "learning_rate": 5e-07, "logits/chosen": -15640132.8, "logits/rejected": -11732113.333333334, "logps/chosen": -293.5821533203125, "logps/rejected": -148.70552571614584, "loss": 0.3973, "rewards/chosen": -0.08772628903388976, "rewards/margins": 1.5558420677979787, "rewards/rejected": -1.6435683568318684, "step": 3981 }, { "epoch": 0.2110619351761058, "grad_norm": 35.75, "kl": 0.23043155670166016, "learning_rate": 5e-07, "logits/chosen": -34321674.666666664, "logits/rejected": -12431753.6, "logps/chosen": -147.25865681966147, "logps/rejected": -147.480859375, "loss": 0.3621, "rewards/chosen": 0.3336265484491984, "rewards/margins": 1.2136540333429973, "rewards/rejected": -0.8800274848937988, "step": 3982 }, { "epoch": 0.21111493917790794, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5691394.857142857, "logits/rejected": -12212228.0, "logps/chosen": -163.00430733816964, "logps/rejected": -386.9759826660156, "loss": 0.4898, "rewards/chosen": -0.17647559302193777, "rewards/margins": 1.714930704661778, "rewards/rejected": -1.8914062976837158, "step": 3983 }, { "epoch": 0.21116794317971008, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45736176.0, "logits/rejected": -79737.8125, "logps/chosen": -361.69736328125, "logps/rejected": -190.07421875, "loss": 0.3783, "rewards/chosen": 0.055413514375686646, "rewards/margins": 1.759908785422643, "rewards/rejected": -1.7044952710469563, "step": 3984 }, { "epoch": 0.21122094718151221, "grad_norm": 49.5, "kl": 0.5115432739257812, "learning_rate": 5e-07, "logits/chosen": -87674032.0, "logits/rejected": -91131016.0, "logps/chosen": -374.5614318847656, "logps/rejected": -326.841552734375, "loss": 0.2971, "rewards/chosen": 0.10302696377038956, "rewards/margins": 2.4221399798989296, "rewards/rejected": -2.31911301612854, "step": 3985 }, { "epoch": 0.21127395118331435, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46511434.666666664, "logits/rejected": 408790208.0, "logps/chosen": -234.65767415364584, "logps/rejected": -707.3840942382812, "loss": 0.4283, "rewards/chosen": -0.03443986177444458, "rewards/margins": 2.546909272670746, "rewards/rejected": -2.5813491344451904, "step": 3986 }, { "epoch": 0.2113269551851165, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -255242.25, "logits/rejected": -12301642.0, "logps/chosen": -393.6826171875, "logps/rejected": -284.8077697753906, "loss": 0.3126, "rewards/chosen": 0.5253616571426392, "rewards/margins": 1.890414834022522, "rewards/rejected": -1.3650531768798828, "step": 3987 }, { "epoch": 0.21137995918691863, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11349793.333333334, "logits/rejected": -38540052.0, "logps/chosen": -167.22221883138022, "logps/rejected": -627.607421875, "loss": 0.3319, "rewards/chosen": 0.4881613254547119, "rewards/margins": 2.762824058532715, "rewards/rejected": -2.274662733078003, "step": 3988 }, { "epoch": 0.21143296318872074, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44809952.0, "logits/rejected": -80001669.33333333, "logps/chosen": -417.830078125, "logps/rejected": -338.92148844401044, "loss": 0.3551, "rewards/chosen": 0.09969239234924317, "rewards/margins": 1.8609591007232666, "rewards/rejected": -1.7612667083740234, "step": 3989 }, { "epoch": 0.21148596719052287, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34011968.0, "logits/rejected": -18201416.0, "logps/chosen": -288.4774169921875, "logps/rejected": -214.46083068847656, "loss": 0.2998, "rewards/chosen": 0.1282055675983429, "rewards/margins": 2.1461730301380157, "rewards/rejected": -2.017967462539673, "step": 3990 }, { "epoch": 0.211538971192325, "grad_norm": 36.75, "kl": 0.1874523162841797, "learning_rate": 5e-07, "logits/chosen": 1995991.0, "logits/rejected": -5007283.2, "logps/chosen": -59.177042643229164, "logps/rejected": -636.1861328125, "loss": 0.2809, "rewards/chosen": -0.16163470347722372, "rewards/margins": 2.1716946880022685, "rewards/rejected": -2.3333293914794924, "step": 3991 }, { "epoch": 0.21159197519412715, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5513151.333333333, "logits/rejected": -31141686.4, "logps/chosen": -132.1837158203125, "logps/rejected": -258.047412109375, "loss": 0.2577, "rewards/chosen": 0.6988060474395752, "rewards/margins": 2.3844795703887938, "rewards/rejected": -1.6856735229492188, "step": 3992 }, { "epoch": 0.21164497919592928, "grad_norm": 46.0, "kl": 0.5696067810058594, "learning_rate": 5e-07, "logits/chosen": -18619028.0, "logits/rejected": -26280276.0, "logps/chosen": -273.6371154785156, "logps/rejected": -352.8935546875, "loss": 0.3179, "rewards/chosen": 0.28052079677581787, "rewards/margins": 2.542570471763611, "rewards/rejected": -2.262049674987793, "step": 3993 }, { "epoch": 0.21169798319773142, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30458476.8, "logits/rejected": -25732336.0, "logps/chosen": -405.830615234375, "logps/rejected": -233.965576171875, "loss": 0.3439, "rewards/chosen": 0.18051837682723998, "rewards/margins": 1.8879403154055279, "rewards/rejected": -1.7074219385782878, "step": 3994 }, { "epoch": 0.21175098719953356, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16533715.2, "logits/rejected": -498795.1666666667, "logps/chosen": -165.1358642578125, "logps/rejected": -82.80671691894531, "loss": 0.4286, "rewards/chosen": -0.27071199417114256, "rewards/margins": 1.1418216069539389, "rewards/rejected": -1.4125336011250813, "step": 3995 }, { "epoch": 0.2118039912013357, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34812572.0, "logits/rejected": -10070491.333333334, "logps/chosen": -314.43798828125, "logps/rejected": -167.75485229492188, "loss": 0.2339, "rewards/chosen": 0.9000656604766846, "rewards/margins": 2.548432429631551, "rewards/rejected": -1.6483667691548665, "step": 3996 }, { "epoch": 0.21185699520313783, "grad_norm": 50.25, "kl": 1.1011390686035156, "learning_rate": 5e-07, "logits/chosen": -18099691.2, "logits/rejected": -13891100.0, "logps/chosen": -363.479443359375, "logps/rejected": -122.91868082682292, "loss": 0.3709, "rewards/chosen": 0.0757571280002594, "rewards/margins": 1.8122778952121734, "rewards/rejected": -1.736520767211914, "step": 3997 }, { "epoch": 0.21190999920493997, "grad_norm": 62.5, "kl": 0.8187255859375, "learning_rate": 5e-07, "logits/chosen": -11082469.0, "logits/rejected": -52443188.0, "logps/chosen": -632.3472290039062, "logps/rejected": -180.07949829101562, "loss": 0.3707, "rewards/chosen": 0.4869537353515625, "rewards/margins": 1.3131316304206848, "rewards/rejected": -0.8261778950691223, "step": 3998 }, { "epoch": 0.2119630032067421, "grad_norm": 50.75, "kl": 0.8755531311035156, "learning_rate": 5e-07, "logits/chosen": -35603484.0, "logits/rejected": -12527294.0, "logps/chosen": -231.119873046875, "logps/rejected": -228.3079833984375, "loss": 0.3688, "rewards/chosen": 0.3439965844154358, "rewards/margins": 1.6025519967079163, "rewards/rejected": -1.2585554122924805, "step": 3999 }, { "epoch": 0.21201600720854424, "grad_norm": 54.5, "kl": 0.4593181610107422, "learning_rate": 5e-07, "logits/chosen": -31632309.333333332, "logits/rejected": -64026952.0, "logps/chosen": -188.44512939453125, "logps/rejected": -355.43853759765625, "loss": 0.4585, "rewards/chosen": -0.18343142668406168, "rewards/margins": 1.7462182839711506, "rewards/rejected": -1.9296497106552124, "step": 4000 }, { "epoch": 0.21206901121034638, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10802292.0, "logits/rejected": -22147512.0, "logps/chosen": -389.6166076660156, "logps/rejected": -460.4733479817708, "loss": 0.2047, "rewards/chosen": 0.7893860340118408, "rewards/margins": 2.600306113560994, "rewards/rejected": -1.8109200795491536, "step": 4001 }, { "epoch": 0.21212201521214852, "grad_norm": 48.0, "kl": 0.5647716522216797, "learning_rate": 5e-07, "logits/chosen": -34085276.8, "logits/rejected": -7383472.0, "logps/chosen": -166.166943359375, "logps/rejected": -679.8269856770834, "loss": 0.3397, "rewards/chosen": 0.2457895278930664, "rewards/margins": 2.5214529037475586, "rewards/rejected": -2.275663375854492, "step": 4002 }, { "epoch": 0.21217501921395066, "grad_norm": 45.25, "kl": 2.0669097900390625, "learning_rate": 5e-07, "logits/chosen": -39020634.666666664, "logits/rejected": -12380051.2, "logps/chosen": -572.1944173177084, "logps/rejected": -207.408349609375, "loss": 0.2561, "rewards/chosen": 1.09027099609375, "rewards/margins": 2.959920310974121, "rewards/rejected": -1.869649314880371, "step": 4003 }, { "epoch": 0.2122280232157528, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19327348.0, "logits/rejected": -38911764.0, "logps/chosen": -328.02972412109375, "logps/rejected": -365.6262512207031, "loss": 0.3511, "rewards/chosen": 0.09794864058494568, "rewards/margins": 1.4964788854122162, "rewards/rejected": -1.3985302448272705, "step": 4004 }, { "epoch": 0.21228102721755493, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51364490.666666664, "logits/rejected": -8374240.0, "logps/chosen": -123.05657958984375, "logps/rejected": -168.4815673828125, "loss": 0.3174, "rewards/chosen": -0.12544276316960654, "rewards/margins": 1.4486639300982158, "rewards/rejected": -1.5741066932678223, "step": 4005 }, { "epoch": 0.21233403121935707, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51312448.0, "logits/rejected": -17392236.0, "logps/chosen": -179.7369842529297, "logps/rejected": -245.87420654296875, "loss": 0.3839, "rewards/chosen": -0.3104551434516907, "rewards/margins": 1.3039990067481995, "rewards/rejected": -1.6144541501998901, "step": 4006 }, { "epoch": 0.2123870352211592, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10741286.0, "logits/rejected": -50900464.0, "logps/chosen": -270.2440490722656, "logps/rejected": -443.2103271484375, "loss": 0.2738, "rewards/chosen": 0.14306393265724182, "rewards/margins": 2.525320440530777, "rewards/rejected": -2.382256507873535, "step": 4007 }, { "epoch": 0.21244003922296134, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27347902.0, "logits/rejected": -18375026.0, "logps/chosen": -252.40216064453125, "logps/rejected": -183.657470703125, "loss": 0.3509, "rewards/chosen": 0.336658239364624, "rewards/margins": 1.5832242965698242, "rewards/rejected": -1.2465660572052002, "step": 4008 }, { "epoch": 0.21249304322476348, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9218025.0, "logits/rejected": -100037192.0, "logps/chosen": -694.682373046875, "logps/rejected": -505.408935546875, "loss": 0.2052, "rewards/chosen": 0.7770485281944275, "rewards/margins": 3.5799805521965027, "rewards/rejected": -2.802932024002075, "step": 4009 }, { "epoch": 0.21254604722656562, "grad_norm": 56.25, "kl": 0.06216621398925781, "learning_rate": 5e-07, "logits/chosen": -12327616.0, "logits/rejected": 6635999.5, "logps/chosen": -255.35972595214844, "logps/rejected": -198.3957977294922, "loss": 0.421, "rewards/chosen": -0.03223379701375961, "rewards/margins": 0.6429237499833107, "rewards/rejected": -0.6751575469970703, "step": 4010 }, { "epoch": 0.21259905122836775, "grad_norm": 53.0, "kl": 0.16005229949951172, "learning_rate": 5e-07, "logits/chosen": -47792870.4, "logits/rejected": -23143984.0, "logps/chosen": -277.472802734375, "logps/rejected": -147.62517293294272, "loss": 0.4332, "rewards/chosen": -0.08288147449493408, "rewards/margins": 0.930047329266866, "rewards/rejected": -1.0129288037618, "step": 4011 }, { "epoch": 0.2126520552301699, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8608169.0, "logits/rejected": -42409876.0, "logps/chosen": -127.80143737792969, "logps/rejected": -239.03248596191406, "loss": 0.378, "rewards/chosen": 0.049721717834472656, "rewards/margins": 1.159485101699829, "rewards/rejected": -1.1097633838653564, "step": 4012 }, { "epoch": 0.21270505923197203, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73733242.66666667, "logits/rejected": -69903641.6, "logps/chosen": -417.4060872395833, "logps/rejected": -483.79404296875, "loss": 0.2275, "rewards/chosen": 0.4081268310546875, "rewards/margins": 2.7376171112060548, "rewards/rejected": -2.3294902801513673, "step": 4013 }, { "epoch": 0.21275806323377414, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26918048.0, "logits/rejected": -11789426.0, "logps/chosen": -192.44735717773438, "logps/rejected": -221.02354431152344, "loss": 0.4084, "rewards/chosen": -0.26922541856765747, "rewards/margins": 1.004412591457367, "rewards/rejected": -1.2736380100250244, "step": 4014 }, { "epoch": 0.21281106723557627, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54696992.0, "logits/rejected": -51671780.0, "logps/chosen": -493.163330078125, "logps/rejected": -672.3084716796875, "loss": 0.2915, "rewards/chosen": 0.024697110056877136, "rewards/margins": 2.461245819926262, "rewards/rejected": -2.4365487098693848, "step": 4015 }, { "epoch": 0.2128640712373784, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5618982.666666667, "logits/rejected": -36697651.2, "logps/chosen": -138.85445149739584, "logps/rejected": -303.537451171875, "loss": 0.3176, "rewards/chosen": 0.24245198567708334, "rewards/margins": 1.883515485127767, "rewards/rejected": -1.6410634994506836, "step": 4016 }, { "epoch": 0.21291707523918055, "grad_norm": 43.5, "kl": 0.30091285705566406, "learning_rate": 5e-07, "logits/chosen": -2428198.4, "logits/rejected": -62350709.333333336, "logps/chosen": -220.5006591796875, "logps/rejected": -352.6442057291667, "loss": 0.3814, "rewards/chosen": 0.02810360789299011, "rewards/margins": 1.9871963838736217, "rewards/rejected": -1.9590927759806316, "step": 4017 }, { "epoch": 0.21297007924098268, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21357080.0, "logits/rejected": -31298245.333333332, "logps/chosen": -689.830322265625, "logps/rejected": -412.9191080729167, "loss": 0.2539, "rewards/chosen": 0.4497482180595398, "rewards/margins": 2.0419984062512713, "rewards/rejected": -1.5922501881917317, "step": 4018 }, { "epoch": 0.21302308324278482, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12808728.0, "logits/rejected": -1281994.4, "logps/chosen": -179.02876790364584, "logps/rejected": -73.0399169921875, "loss": 0.3832, "rewards/chosen": -0.06795768936475118, "rewards/margins": 0.898485658566157, "rewards/rejected": -0.9664433479309082, "step": 4019 }, { "epoch": 0.21307608724458696, "grad_norm": 56.5, "kl": 1.2757072448730469, "learning_rate": 5e-07, "logits/chosen": 5619063.0, "logits/rejected": -38821528.0, "logps/chosen": -1279.482421875, "logps/rejected": -454.7318522135417, "loss": 0.1735, "rewards/chosen": 1.638798475265503, "rewards/margins": 3.4970115820566816, "rewards/rejected": -1.8582131067911785, "step": 4020 }, { "epoch": 0.2131290912463891, "grad_norm": 81.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 89169523.2, "logits/rejected": -27177485.333333332, "logps/chosen": -463.74970703125, "logps/rejected": -349.6216227213542, "loss": 0.4068, "rewards/chosen": -0.23323183059692382, "rewards/margins": 1.4940621376037597, "rewards/rejected": -1.7272939682006836, "step": 4021 }, { "epoch": 0.21318209524819123, "grad_norm": 46.75, "kl": 0.33858680725097656, "learning_rate": 5e-07, "logits/chosen": -2247112.1666666665, "logits/rejected": 1159422.4, "logps/chosen": -253.85066731770834, "logps/rejected": -228.233935546875, "loss": 0.3046, "rewards/chosen": 0.8246275583902994, "rewards/margins": 1.8544727007548012, "rewards/rejected": -1.0298451423645019, "step": 4022 }, { "epoch": 0.21323509924999337, "grad_norm": 58.0, "kl": 1.3674163818359375, "learning_rate": 5e-07, "logits/chosen": -56202662.4, "logits/rejected": -10724296.0, "logps/chosen": -302.092578125, "logps/rejected": -293.95994059244794, "loss": 0.4105, "rewards/chosen": 0.2903388261795044, "rewards/margins": 1.5165961503982544, "rewards/rejected": -1.22625732421875, "step": 4023 }, { "epoch": 0.2132881032517955, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25781253.333333332, "logits/rejected": -2495497.4, "logps/chosen": -254.67313639322916, "logps/rejected": -92.19945678710937, "loss": 0.3264, "rewards/chosen": 0.15248235066731772, "rewards/margins": 1.618447240193685, "rewards/rejected": -1.4659648895263673, "step": 4024 }, { "epoch": 0.21334110725359764, "grad_norm": 68.0, "kl": 0.45665740966796875, "learning_rate": 5e-07, "logits/chosen": -6414255.0, "logits/rejected": -3046643.0, "logps/chosen": -332.04656982421875, "logps/rejected": -620.2857666015625, "loss": 0.3376, "rewards/chosen": -0.04450682923197746, "rewards/margins": 2.574200589209795, "rewards/rejected": -2.6187074184417725, "step": 4025 }, { "epoch": 0.21339411125539978, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8418796.0, "logits/rejected": -2168338.0, "logps/chosen": -236.4661376953125, "logps/rejected": -262.6824137369792, "loss": 0.444, "rewards/chosen": -0.05005713701248169, "rewards/margins": 1.1110693176587423, "rewards/rejected": -1.1611264546712239, "step": 4026 }, { "epoch": 0.21344711525720192, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10402961.0, "logps/chosen": -206.14572143554688, "loss": 0.5284, "rewards/chosen": -0.11017577350139618, "step": 4027 }, { "epoch": 0.21350011925900406, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48122778.666666664, "logits/rejected": -11691363.2, "logps/chosen": -339.5920003255208, "logps/rejected": -142.345556640625, "loss": 0.4224, "rewards/chosen": -0.25820714235305786, "rewards/margins": 0.45613259077072144, "rewards/rejected": -0.7143397331237793, "step": 4028 }, { "epoch": 0.2135531232608062, "grad_norm": 129.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61189139.2, "logits/rejected": -6716023.333333333, "logps/chosen": -289.00556640625, "logps/rejected": -180.3604939778646, "loss": 0.4414, "rewards/chosen": -0.21456236839294435, "rewards/margins": 0.9212515830993653, "rewards/rejected": -1.1358139514923096, "step": 4029 }, { "epoch": 0.21360612726260833, "grad_norm": 49.75, "kl": 0.04754638671875, "learning_rate": 5e-07, "logits/chosen": -49676540.0, "logits/rejected": -15041163.0, "logps/chosen": -325.40740966796875, "logps/rejected": -180.95233154296875, "loss": 0.2701, "rewards/chosen": 0.27408939599990845, "rewards/margins": 2.7863243222236633, "rewards/rejected": -2.512234926223755, "step": 4030 }, { "epoch": 0.21365913126441047, "grad_norm": 54.25, "kl": 0.6870040893554688, "learning_rate": 5e-07, "logits/chosen": -21849328.0, "logits/rejected": -122718592.0, "logps/chosen": -231.8934326171875, "logps/rejected": -276.4709879557292, "loss": 0.3417, "rewards/chosen": 0.18116860389709472, "rewards/margins": 2.4464841365814207, "rewards/rejected": -2.265315532684326, "step": 4031 }, { "epoch": 0.2137121352662126, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30047300.0, "logits/rejected": -670594.6666666666, "logps/chosen": -573.79541015625, "logps/rejected": -392.0045166015625, "loss": 0.263, "rewards/chosen": 0.10164336860179901, "rewards/margins": 1.808220351735751, "rewards/rejected": -1.706576983133952, "step": 4032 }, { "epoch": 0.21376513926801474, "grad_norm": 62.25, "kl": 0.5336532592773438, "learning_rate": 5e-07, "logits/chosen": -22676361.6, "logits/rejected": -14177276.0, "logps/chosen": -489.140087890625, "logps/rejected": -245.53411865234375, "loss": 0.3566, "rewards/chosen": 0.17528535127639772, "rewards/margins": 2.086432468891144, "rewards/rejected": -1.911147117614746, "step": 4033 }, { "epoch": 0.21381814326981688, "grad_norm": 45.0, "kl": 0.42937755584716797, "learning_rate": 5e-07, "logits/chosen": 8756983.0, "logits/rejected": 6361946.666666667, "logps/chosen": -440.72845458984375, "logps/rejected": -144.48273722330728, "loss": 0.2822, "rewards/chosen": 0.9693104028701782, "rewards/margins": 2.027484854062398, "rewards/rejected": -1.05817445119222, "step": 4034 }, { "epoch": 0.21387114727161902, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24138352.0, "logits/rejected": 4099189.0, "logps/chosen": -703.88134765625, "logps/rejected": -282.11199951171875, "loss": 0.3468, "rewards/chosen": 0.3400602340698242, "rewards/margins": 1.4368748664855957, "rewards/rejected": -1.0968146324157715, "step": 4035 }, { "epoch": 0.21392415127342115, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3081703.0, "logits/rejected": -13285937.333333334, "logps/chosen": -328.41973876953125, "logps/rejected": -212.3985392252604, "loss": 0.2725, "rewards/chosen": 0.5130302906036377, "rewards/margins": 1.989020586013794, "rewards/rejected": -1.4759902954101562, "step": 4036 }, { "epoch": 0.2139771552752233, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61684704.0, "logits/rejected": -39787781.333333336, "logps/chosen": -559.267822265625, "logps/rejected": -534.6309407552084, "loss": 0.1807, "rewards/chosen": 0.34103167057037354, "rewards/margins": 3.135578433672587, "rewards/rejected": -2.7945467631022134, "step": 4037 }, { "epoch": 0.21403015927702543, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2245187.5, "logits/rejected": 2618636.5714285714, "logps/chosen": -41.32630920410156, "logps/rejected": -280.04603794642856, "loss": 0.2453, "rewards/chosen": -0.7806640863418579, "rewards/margins": 1.1247379609516688, "rewards/rejected": -1.9054020472935267, "step": 4038 }, { "epoch": 0.21408316327882757, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62087232.0, "logits/rejected": -34869349.333333336, "logps/chosen": -234.7707061767578, "logps/rejected": -364.4525553385417, "loss": 0.227, "rewards/chosen": 0.16012878715991974, "rewards/margins": 2.2038153360287347, "rewards/rejected": -2.043686548868815, "step": 4039 }, { "epoch": 0.21413616728062967, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63797424.0, "logits/rejected": -19379230.0, "logps/chosen": -516.1369018554688, "logps/rejected": -204.93052673339844, "loss": 0.3116, "rewards/chosen": 0.24892465770244598, "rewards/margins": 1.8882614821195602, "rewards/rejected": -1.6393368244171143, "step": 4040 }, { "epoch": 0.2141891712824318, "grad_norm": 52.75, "kl": 0.7212285995483398, "learning_rate": 5e-07, "logits/chosen": -12286829.333333334, "logits/rejected": -37281208.0, "logps/chosen": -189.23699951171875, "logps/rejected": -497.9685974121094, "loss": 0.4333, "rewards/chosen": -0.031580751140912376, "rewards/margins": 1.8818148920933406, "rewards/rejected": -1.913395643234253, "step": 4041 }, { "epoch": 0.21424217528423395, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 32246560.0, "logits/rejected": -40031444.0, "logps/chosen": -267.75360107421875, "logps/rejected": -452.8229064941406, "loss": 0.2922, "rewards/chosen": 0.15806227922439575, "rewards/margins": 2.818104088306427, "rewards/rejected": -2.6600418090820312, "step": 4042 }, { "epoch": 0.21429517928603609, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50056387.2, "logits/rejected": -1047936.0, "logps/chosen": -535.366845703125, "logps/rejected": -380.4299723307292, "loss": 0.4, "rewards/chosen": -0.058322155475616456, "rewards/margins": 1.5200868805249532, "rewards/rejected": -1.5784090360005696, "step": 4043 }, { "epoch": 0.21434818328783822, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73759477.33333333, "logits/rejected": -22019363.2, "logps/chosen": -360.7748616536458, "logps/rejected": -283.38955078125, "loss": 0.348, "rewards/chosen": -0.4296773274739583, "rewards/margins": 1.354180399576823, "rewards/rejected": -1.7838577270507812, "step": 4044 }, { "epoch": 0.21440118728964036, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52048160.0, "logits/rejected": -34162280.0, "logps/chosen": -363.1499328613281, "logps/rejected": -297.32781982421875, "loss": 0.3482, "rewards/chosen": -0.2234768122434616, "rewards/margins": 1.8045216351747513, "rewards/rejected": -2.027998447418213, "step": 4045 }, { "epoch": 0.2144541912914425, "grad_norm": 54.0, "kl": 0.29386138916015625, "learning_rate": 5e-07, "logits/chosen": -28240358.4, "logits/rejected": 156733834.66666666, "logps/chosen": -432.8921875, "logps/rejected": -751.2134602864584, "loss": 0.2719, "rewards/chosen": 0.7041150093078613, "rewards/margins": 3.071994241078695, "rewards/rejected": -2.3678792317708335, "step": 4046 }, { "epoch": 0.21450719529324463, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9574127.0, "logits/rejected": 67706024.0, "logps/chosen": -123.77011108398438, "logps/rejected": -424.3481140136719, "loss": 0.3643, "rewards/chosen": -0.09013195335865021, "rewards/margins": 1.711918517947197, "rewards/rejected": -1.8020504713058472, "step": 4047 }, { "epoch": 0.21456019929504677, "grad_norm": 50.75, "kl": 0.021337509155273438, "learning_rate": 5e-07, "logits/chosen": -22426870.4, "logits/rejected": -34768277.333333336, "logps/chosen": -225.88720703125, "logps/rejected": -382.26513671875, "loss": 0.4149, "rewards/chosen": -0.35723557472229006, "rewards/margins": 1.5934208393096925, "rewards/rejected": -1.9506564140319824, "step": 4048 }, { "epoch": 0.2146132032968489, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40808524.0, "logits/rejected": -39220620.0, "logps/chosen": -394.99456787109375, "logps/rejected": -294.0069580078125, "loss": 0.2558, "rewards/chosen": 0.6698997616767883, "rewards/margins": 2.482708752155304, "rewards/rejected": -1.8128089904785156, "step": 4049 }, { "epoch": 0.21466620729865105, "grad_norm": 48.25, "kl": 0.2126016616821289, "learning_rate": 5e-07, "logits/chosen": -28945414.4, "logits/rejected": 8102265.333333333, "logps/chosen": -355.475927734375, "logps/rejected": -306.406982421875, "loss": 0.2993, "rewards/chosen": 0.5645242691040039, "rewards/margins": 2.3379717826843263, "rewards/rejected": -1.7734475135803223, "step": 4050 }, { "epoch": 0.21471921130045318, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45301024.0, "logits/rejected": -795452.5, "logps/chosen": -489.720458984375, "logps/rejected": -342.7269775390625, "loss": 0.29, "rewards/chosen": 0.09717889626820882, "rewards/margins": 1.7723433097203571, "rewards/rejected": -1.6751644134521484, "step": 4051 }, { "epoch": 0.21477221530225532, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22367134.0, "logits/rejected": -6443784.666666667, "logps/chosen": -153.83189392089844, "logps/rejected": -326.0255126953125, "loss": 0.2291, "rewards/chosen": 0.6033178567886353, "rewards/margins": 2.4431326786677046, "rewards/rejected": -1.839814821879069, "step": 4052 }, { "epoch": 0.21482521930405746, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -27901024.0, "logps/rejected": -412.9771423339844, "loss": 0.1438, "rewards/rejected": -2.021245002746582, "step": 4053 }, { "epoch": 0.2148782233058596, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11241186.666666666, "logits/rejected": -39919481.6, "logps/chosen": -382.2027180989583, "logps/rejected": -503.01962890625, "loss": 0.165, "rewards/chosen": 1.109108527501424, "rewards/margins": 4.065686209996541, "rewards/rejected": -2.956577682495117, "step": 4054 }, { "epoch": 0.21493122730766173, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17657322.0, "logits/rejected": -34472256.0, "logps/chosen": -332.5433654785156, "logps/rejected": -489.92010498046875, "loss": 0.3396, "rewards/chosen": -0.1086650863289833, "rewards/margins": 1.9980803951621056, "rewards/rejected": -2.106745481491089, "step": 4055 }, { "epoch": 0.21498423130946387, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75593264.0, "logits/rejected": -34788432.0, "logps/chosen": -548.5701904296875, "logps/rejected": -310.95359293619794, "loss": 0.2742, "rewards/chosen": 0.533424437046051, "rewards/margins": 2.238512893517812, "rewards/rejected": -1.705088456471761, "step": 4056 }, { "epoch": 0.215037235311266, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23153252.0, "logits/rejected": -25860476.0, "logps/chosen": -206.25094604492188, "logps/rejected": -310.2256774902344, "loss": 0.3628, "rewards/chosen": -0.29549065232276917, "rewards/margins": 1.466266244649887, "rewards/rejected": -1.7617568969726562, "step": 4057 }, { "epoch": 0.21509023931306814, "grad_norm": 51.0, "kl": 0.20554161071777344, "learning_rate": 5e-07, "logits/chosen": -66765296.0, "logits/rejected": -26316184.0, "logps/chosen": -464.3907877604167, "logps/rejected": -300.6263427734375, "loss": 0.2862, "rewards/chosen": 0.24924516677856445, "rewards/margins": 2.0449923515319823, "rewards/rejected": -1.7957471847534179, "step": 4058 }, { "epoch": 0.21514324331487028, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35725092.0, "logits/rejected": -62397384.0, "logps/chosen": -374.3369140625, "logps/rejected": -318.02874755859375, "loss": 0.2724, "rewards/chosen": 0.5776781439781189, "rewards/margins": 2.3364580273628235, "rewards/rejected": -1.7587798833847046, "step": 4059 }, { "epoch": 0.21519624731667242, "grad_norm": 60.75, "kl": 0.41768932342529297, "learning_rate": 5e-07, "logits/chosen": -23681136.0, "logits/rejected": -3769656.25, "logps/chosen": -348.1605224609375, "logps/rejected": -99.57101440429688, "loss": 0.3397, "rewards/chosen": 0.2534578740596771, "rewards/margins": 1.7033950984477997, "rewards/rejected": -1.4499372243881226, "step": 4060 }, { "epoch": 0.21524925131847455, "grad_norm": 68.5, "kl": 0.5785484313964844, "learning_rate": 5e-07, "logits/chosen": -69457772.8, "logits/rejected": -71152522.66666667, "logps/chosen": -340.2487060546875, "logps/rejected": -409.490966796875, "loss": 0.3425, "rewards/chosen": 0.33418855667114256, "rewards/margins": 2.113080596923828, "rewards/rejected": -1.7788920402526855, "step": 4061 }, { "epoch": 0.2153022553202767, "grad_norm": 41.75, "kl": 0.07495880126953125, "learning_rate": 5e-07, "logits/chosen": -11689961.0, "logits/rejected": 201327520.0, "logps/chosen": -264.8762512207031, "logps/rejected": -426.96710205078125, "loss": 0.2461, "rewards/chosen": 0.46125662326812744, "rewards/margins": 3.0067096948623657, "rewards/rejected": -2.5454530715942383, "step": 4062 }, { "epoch": 0.21535525932207883, "grad_norm": 65.0, "kl": 1.1574935913085938, "learning_rate": 5e-07, "logits/chosen": -30351884.8, "logits/rejected": -38290765.333333336, "logps/chosen": -623.3119140625, "logps/rejected": -209.6611328125, "loss": 0.4042, "rewards/chosen": 0.25998334884643554, "rewards/margins": 1.728160349527995, "rewards/rejected": -1.4681770006815593, "step": 4063 }, { "epoch": 0.21540826332388097, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15575509.0, "logits/rejected": -42852251.428571425, "logps/chosen": -301.1882019042969, "logps/rejected": -389.6469029017857, "loss": 0.2006, "rewards/chosen": -0.5079193115234375, "rewards/margins": 1.3952478681291853, "rewards/rejected": -1.9031671796526228, "step": 4064 }, { "epoch": 0.21546126732568308, "grad_norm": 48.75, "kl": 1.7533292770385742, "learning_rate": 5e-07, "logits/chosen": -55095152.0, "logits/rejected": -11128349.0, "logps/chosen": -348.3241882324219, "logps/rejected": -207.63427734375, "loss": 0.3613, "rewards/chosen": 0.007201403379440308, "rewards/margins": 1.3791479766368866, "rewards/rejected": -1.3719465732574463, "step": 4065 }, { "epoch": 0.2155142713274852, "grad_norm": 59.5, "kl": 0.9759864807128906, "learning_rate": 5e-07, "logits/chosen": -20711690.666666668, "logits/rejected": -77171992.0, "logps/chosen": -444.0846761067708, "logps/rejected": -706.592041015625, "loss": 0.28, "rewards/chosen": 0.7376564343770345, "rewards/margins": 4.410784085591634, "rewards/rejected": -3.6731276512145996, "step": 4066 }, { "epoch": 0.21556727532928735, "grad_norm": 49.25, "kl": 0.18397808074951172, "learning_rate": 5e-07, "logits/chosen": -20724580.0, "logits/rejected": -21995980.0, "logps/chosen": -330.2135009765625, "logps/rejected": -243.26441955566406, "loss": 0.3275, "rewards/chosen": 0.5304741859436035, "rewards/margins": 1.6425931453704834, "rewards/rejected": -1.1121189594268799, "step": 4067 }, { "epoch": 0.2156202793310895, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9827054.666666666, "logits/rejected": -3897288.0, "logps/chosen": -337.22182210286456, "logps/rejected": -364.785302734375, "loss": 0.2496, "rewards/chosen": 0.49086999893188477, "rewards/margins": 2.30108003616333, "rewards/rejected": -1.8102100372314454, "step": 4068 }, { "epoch": 0.21567328333289162, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44505308.8, "logits/rejected": -27714290.666666668, "logps/chosen": -574.9439453125, "logps/rejected": -93.2566630045573, "loss": 0.3107, "rewards/chosen": 0.7643335342407227, "rewards/margins": 1.7973862965901692, "rewards/rejected": -1.0330527623494465, "step": 4069 }, { "epoch": 0.21572628733469376, "grad_norm": 49.5, "kl": 0.7143440246582031, "learning_rate": 5e-07, "logits/chosen": -36584636.0, "logits/rejected": -26359986.666666668, "logps/chosen": -471.44830322265625, "logps/rejected": -281.594482421875, "loss": 0.2991, "rewards/chosen": 0.3295150697231293, "rewards/margins": 1.6174948314825695, "rewards/rejected": -1.2879797617594402, "step": 4070 }, { "epoch": 0.2157792913364959, "grad_norm": 39.5, "kl": 1.0989761352539062, "learning_rate": 5e-07, "logits/chosen": -25964272.0, "logits/rejected": -9879243.0, "logps/chosen": -191.83880615234375, "logps/rejected": -176.9468994140625, "loss": 0.281, "rewards/chosen": 0.3694801330566406, "rewards/margins": 2.5937979221343994, "rewards/rejected": -2.224317789077759, "step": 4071 }, { "epoch": 0.21583229533829804, "grad_norm": 55.75, "kl": 0.2502021789550781, "learning_rate": 5e-07, "logits/chosen": -34146704.0, "logits/rejected": 14456496.0, "logps/chosen": -386.5591796875, "logps/rejected": -91.67303466796875, "loss": 0.363, "rewards/chosen": 0.295650577545166, "rewards/margins": 2.099924182891846, "rewards/rejected": -1.8042736053466797, "step": 4072 }, { "epoch": 0.21588529934010017, "grad_norm": 54.5, "kl": 0.14713287353515625, "learning_rate": 5e-07, "logits/chosen": -44173196.8, "logits/rejected": -16506162.666666666, "logps/chosen": -277.10908203125, "logps/rejected": -322.5743815104167, "loss": 0.3842, "rewards/chosen": -0.28886749744415285, "rewards/margins": 2.5286221265792848, "rewards/rejected": -2.8174896240234375, "step": 4073 }, { "epoch": 0.2159383033419023, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24988186.666666668, "logits/rejected": -17020105.6, "logps/chosen": -222.7128702799479, "logps/rejected": -204.180712890625, "loss": 0.2986, "rewards/chosen": 0.3441757361094157, "rewards/margins": 1.9476757208506268, "rewards/rejected": -1.603499984741211, "step": 4074 }, { "epoch": 0.21599130734370445, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26810066.0, "logits/rejected": -21270612.57142857, "logps/chosen": -460.2240295410156, "logps/rejected": -251.39927455357142, "loss": 0.2484, "rewards/chosen": 0.27154237031936646, "rewards/margins": 1.7338400823729379, "rewards/rejected": -1.4622977120535714, "step": 4075 }, { "epoch": 0.21604431134550658, "grad_norm": 58.0, "kl": 1.3565254211425781, "learning_rate": 5e-07, "logits/chosen": -41702512.0, "logits/rejected": -5198088.5, "logps/chosen": -441.7080891927083, "logps/rejected": -202.8086700439453, "loss": 0.3724, "rewards/chosen": 0.46426061789194745, "rewards/margins": 2.009491960207621, "rewards/rejected": -1.5452313423156738, "step": 4076 }, { "epoch": 0.21609731534730872, "grad_norm": 51.0, "kl": 0.20537948608398438, "learning_rate": 5e-07, "logits/chosen": -48243608.0, "logits/rejected": -15769090.666666666, "logps/chosen": -465.55194091796875, "logps/rejected": -149.80423990885416, "loss": 0.3312, "rewards/chosen": 0.9296097159385681, "rewards/margins": 1.6249416867891946, "rewards/rejected": -0.6953319708506266, "step": 4077 }, { "epoch": 0.21615031934911086, "grad_norm": 56.5, "kl": 0.7506141662597656, "learning_rate": 5e-07, "logits/chosen": -5413224.8, "logits/rejected": -46698410.666666664, "logps/chosen": -325.845068359375, "logps/rejected": -380.5895589192708, "loss": 0.4072, "rewards/chosen": -0.05833314061164856, "rewards/margins": 1.6856175442536672, "rewards/rejected": -1.7439506848653157, "step": 4078 }, { "epoch": 0.216203323350913, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2035346.5, "logits/rejected": 811167.9, "logps/chosen": -39.426859537760414, "logps/rejected": -392.8810791015625, "loss": 0.3339, "rewards/chosen": -0.8418131669362386, "rewards/margins": 1.4206740538279212, "rewards/rejected": -2.26248722076416, "step": 4079 }, { "epoch": 0.21625632735271513, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1445242.1666666667, "logits/rejected": -36863446.4, "logps/chosen": -83.2735087076823, "logps/rejected": -245.282275390625, "loss": 0.4129, "rewards/chosen": -0.8730972607930502, "rewards/margins": 0.7031522432963052, "rewards/rejected": -1.5762495040893554, "step": 4080 }, { "epoch": 0.21630933135451727, "grad_norm": 57.0, "kl": 0.21356964111328125, "learning_rate": 5e-07, "logits/chosen": -50365974.85714286, "logits/rejected": -20733916.0, "logps/chosen": -264.04171316964283, "logps/rejected": -465.9506530761719, "loss": 0.3806, "rewards/chosen": 0.3534369468688965, "rewards/margins": 3.1674842834472656, "rewards/rejected": -2.814047336578369, "step": 4081 }, { "epoch": 0.2163623353563194, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17848508.0, "logits/rejected": -41610656.0, "logps/chosen": -307.642822265625, "logps/rejected": -214.07872009277344, "loss": 0.3283, "rewards/chosen": 0.30722981691360474, "rewards/margins": 1.737323820590973, "rewards/rejected": -1.4300940036773682, "step": 4082 }, { "epoch": 0.21641533935812154, "grad_norm": 36.25, "kl": 0.18346214294433594, "learning_rate": 5e-07, "logits/chosen": -4721995.0, "logits/rejected": -42271968.0, "logps/chosen": -174.56103515625, "logps/rejected": -199.2537384033203, "loss": 0.2832, "rewards/chosen": 0.5459246039390564, "rewards/margins": 2.3697145581245422, "rewards/rejected": -1.8237899541854858, "step": 4083 }, { "epoch": 0.21646834335992368, "grad_norm": 44.0, "kl": 0.12498092651367188, "learning_rate": 5e-07, "logits/chosen": -18024408.0, "logits/rejected": -101774200.0, "logps/chosen": -200.7288360595703, "logps/rejected": -207.0853729248047, "loss": 0.296, "rewards/chosen": 0.46461221575737, "rewards/margins": 1.8739342391490936, "rewards/rejected": -1.4093220233917236, "step": 4084 }, { "epoch": 0.21652134736172582, "grad_norm": 52.75, "kl": 0.6995372772216797, "learning_rate": 5e-07, "logits/chosen": -15340568.0, "logits/rejected": -27910137.6, "logps/chosen": -534.8893229166666, "logps/rejected": -355.29775390625, "loss": 0.2952, "rewards/chosen": 0.4319334030151367, "rewards/margins": 2.0520177841186524, "rewards/rejected": -1.6200843811035157, "step": 4085 }, { "epoch": 0.21657435136352796, "grad_norm": 43.75, "kl": 0.1890869140625, "learning_rate": 5e-07, "logits/chosen": -16087993.333333334, "logits/rejected": -26137209.6, "logps/chosen": -182.509521484375, "logps/rejected": -269.94228515625, "loss": 0.271, "rewards/chosen": 0.4008583227793376, "rewards/margins": 1.8960313955942791, "rewards/rejected": -1.4951730728149415, "step": 4086 }, { "epoch": 0.2166273553653301, "grad_norm": 52.5, "kl": 0.7444076538085938, "learning_rate": 5e-07, "logits/chosen": -43769852.8, "logits/rejected": -14009418.666666666, "logps/chosen": -363.9785400390625, "logps/rejected": -289.5320231119792, "loss": 0.2911, "rewards/chosen": 0.5031907558441162, "rewards/margins": 2.8458038171132407, "rewards/rejected": -2.3426130612691245, "step": 4087 }, { "epoch": 0.21668035936713223, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19773732.0, "logits/rejected": -21880718.4, "logps/chosen": -299.4248046875, "logps/rejected": -297.693115234375, "loss": 0.3166, "rewards/chosen": 0.3289240598678589, "rewards/margins": 1.5703840970993042, "rewards/rejected": -1.2414600372314453, "step": 4088 }, { "epoch": 0.21673336336893437, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29788160.0, "logits/rejected": -4198570.0, "logps/chosen": -613.462158203125, "logps/rejected": -218.7125701904297, "loss": 0.3399, "rewards/chosen": 0.20596161484718323, "rewards/margins": 1.6795579493045807, "rewards/rejected": -1.4735963344573975, "step": 4089 }, { "epoch": 0.21678636737073648, "grad_norm": 55.75, "kl": 0.5706787109375, "learning_rate": 5e-07, "logits/chosen": -36834867.2, "logits/rejected": -11300104.0, "logps/chosen": -305.274365234375, "logps/rejected": -430.2755126953125, "loss": 0.2901, "rewards/chosen": 0.5926672935485839, "rewards/margins": 2.6996245702107746, "rewards/rejected": -2.106957276662191, "step": 4090 }, { "epoch": 0.21683937137253861, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1157455.5, "logits/rejected": -38984365.333333336, "logps/chosen": -196.657470703125, "logps/rejected": -375.548095703125, "loss": 0.2178, "rewards/chosen": 0.49943381547927856, "rewards/margins": 2.5712345242500305, "rewards/rejected": -2.071800708770752, "step": 4091 }, { "epoch": 0.21689237537434075, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30960629.333333332, "logits/rejected": -40351142.4, "logps/chosen": -386.2365315755208, "logps/rejected": -470.305224609375, "loss": 0.2413, "rewards/chosen": 0.21632283926010132, "rewards/margins": 3.008809220790863, "rewards/rejected": -2.792486381530762, "step": 4092 }, { "epoch": 0.2169453793761429, "grad_norm": 49.75, "kl": 0.11614608764648438, "learning_rate": 5e-07, "logits/chosen": -11901435.0, "logits/rejected": -36863672.0, "logps/chosen": -157.8590545654297, "logps/rejected": -420.21136474609375, "loss": 0.3786, "rewards/chosen": -0.22801196575164795, "rewards/margins": 1.5613062381744385, "rewards/rejected": -1.7893182039260864, "step": 4093 }, { "epoch": 0.21699838337794503, "grad_norm": 56.5, "kl": 0.5789108276367188, "learning_rate": 5e-07, "logits/chosen": -31107641.6, "logits/rejected": -46416730.666666664, "logps/chosen": -442.68759765625, "logps/rejected": -313.1940511067708, "loss": 0.4001, "rewards/chosen": 0.04989721775054932, "rewards/margins": 1.273802622159322, "rewards/rejected": -1.2239054044087727, "step": 4094 }, { "epoch": 0.21705138737974716, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2353449.0, "logits/rejected": -27207468.0, "logps/chosen": -139.7502899169922, "logps/rejected": -358.5863037109375, "loss": 0.3267, "rewards/chosen": -0.11115151643753052, "rewards/margins": 2.4446975588798523, "rewards/rejected": -2.555849075317383, "step": 4095 }, { "epoch": 0.2171043913815493, "grad_norm": 86.5, "kl": 0.7514190673828125, "learning_rate": 5e-07, "logits/chosen": -54857452.0, "logits/rejected": -13822507.0, "logps/chosen": -1003.6907958984375, "logps/rejected": -475.41925048828125, "loss": 0.2626, "rewards/chosen": 0.500978946685791, "rewards/margins": 2.6310672760009766, "rewards/rejected": -2.1300883293151855, "step": 4096 }, { "epoch": 0.21715739538335144, "grad_norm": 54.5, "kl": 0.7422714233398438, "learning_rate": 5e-07, "logits/chosen": -36104821.333333336, "logits/rejected": -15639871.0, "logps/chosen": -261.0713297526042, "logps/rejected": -310.91241455078125, "loss": 0.3809, "rewards/chosen": 0.5240242878595988, "rewards/margins": 1.3045342961947122, "rewards/rejected": -0.7805100083351135, "step": 4097 }, { "epoch": 0.21721039938515357, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20357102.0, "logits/rejected": -65038084.0, "logps/chosen": -232.48890686035156, "logps/rejected": -476.4337158203125, "loss": 0.3632, "rewards/chosen": 0.014528512954711914, "rewards/margins": 1.4967461824417114, "rewards/rejected": -1.4822176694869995, "step": 4098 }, { "epoch": 0.2172634033869557, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8097816.0, "logits/rejected": -5131956.8, "logps/chosen": -134.90302530924478, "logps/rejected": -476.7623046875, "loss": 0.242, "rewards/chosen": 0.2954135735829671, "rewards/margins": 2.888040526707967, "rewards/rejected": -2.592626953125, "step": 4099 }, { "epoch": 0.21731640738875785, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37234182.4, "logits/rejected": -10086178.666666666, "logps/chosen": -253.2009521484375, "logps/rejected": -200.375244140625, "loss": 0.4433, "rewards/chosen": -0.3049816131591797, "rewards/margins": 1.1014316240946451, "rewards/rejected": -1.406413237253825, "step": 4100 }, { "epoch": 0.21736941139055999, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1659404.75, "logits/rejected": -41726709.333333336, "logps/chosen": -141.94833374023438, "logps/rejected": -305.12109375, "loss": 0.2536, "rewards/chosen": 0.21778623759746552, "rewards/margins": 1.9686731547117233, "rewards/rejected": -1.7508869171142578, "step": 4101 }, { "epoch": 0.21742241539236212, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76569984.0, "logits/rejected": -56219434.666666664, "logps/chosen": -415.7144775390625, "logps/rejected": -207.11444091796875, "loss": 0.2625, "rewards/chosen": -0.1262405514717102, "rewards/margins": 1.691930631796519, "rewards/rejected": -1.8181711832682292, "step": 4102 }, { "epoch": 0.21747541939416426, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5833448.0, "logits/rejected": -37421305.6, "logps/chosen": -191.17989095052084, "logps/rejected": -461.345068359375, "loss": 0.2547, "rewards/chosen": 0.1357111632823944, "rewards/margins": 2.5933100402355196, "rewards/rejected": -2.457598876953125, "step": 4103 }, { "epoch": 0.2175284233959664, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38008633.6, "logits/rejected": -99420021.33333333, "logps/chosen": -389.080078125, "logps/rejected": -408.6531982421875, "loss": 0.3241, "rewards/chosen": 0.22530090808868408, "rewards/margins": 2.329924384752909, "rewards/rejected": -2.104623476664225, "step": 4104 }, { "epoch": 0.21758142739776853, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24050990.0, "logits/rejected": -15605504.0, "logps/chosen": -318.97625732421875, "logps/rejected": -165.57220458984375, "loss": 0.2651, "rewards/chosen": 0.31360530853271484, "rewards/margins": 1.844375451405843, "rewards/rejected": -1.5307701428731282, "step": 4105 }, { "epoch": 0.21763443139957067, "grad_norm": 46.5, "kl": 0.6685266494750977, "learning_rate": 5e-07, "logits/chosen": 3838705.5, "logits/rejected": -40511900.0, "logps/chosen": -213.44418334960938, "logps/rejected": -298.674072265625, "loss": 0.3053, "rewards/chosen": 0.652460515499115, "rewards/margins": 2.0251105427742004, "rewards/rejected": -1.3726500272750854, "step": 4106 }, { "epoch": 0.2176874354013728, "grad_norm": 35.5, "kl": 0.1677083969116211, "learning_rate": 5e-07, "logits/chosen": -8844130.666666666, "logits/rejected": -12664781.6, "logps/chosen": -50.51048787434896, "logps/rejected": -121.5012451171875, "loss": 0.2983, "rewards/chosen": 0.0652200182278951, "rewards/margins": 1.808691354592641, "rewards/rejected": -1.743471336364746, "step": 4107 }, { "epoch": 0.21774043940317495, "grad_norm": 63.5, "kl": 0.36756134033203125, "learning_rate": 5e-07, "logits/chosen": -24836572.8, "logits/rejected": -31550517.333333332, "logps/chosen": -439.74345703125, "logps/rejected": -458.0269775390625, "loss": 0.2606, "rewards/chosen": 0.8182989120483398, "rewards/margins": 2.7349401473999024, "rewards/rejected": -1.9166412353515625, "step": 4108 }, { "epoch": 0.21779344340497708, "grad_norm": 57.25, "kl": 0.7657985687255859, "learning_rate": 5e-07, "logits/chosen": -48136131.2, "logits/rejected": -22572597.333333332, "logps/chosen": -464.37109375, "logps/rejected": -493.7977701822917, "loss": 0.3221, "rewards/chosen": 0.33418233394622804, "rewards/margins": 2.816739630699158, "rewards/rejected": -2.4825572967529297, "step": 4109 }, { "epoch": 0.21784644740677922, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24586162.0, "logits/rejected": -19487690.666666668, "logps/chosen": -297.533203125, "logps/rejected": -401.9164225260417, "loss": 0.2323, "rewards/chosen": 0.12840425968170166, "rewards/margins": 2.5915669997533164, "rewards/rejected": -2.4631627400716147, "step": 4110 }, { "epoch": 0.21789945140858136, "grad_norm": 72.0, "kl": 2.1572189331054688, "learning_rate": 5e-07, "logits/chosen": -693814.8571428572, "logits/rejected": -3965431.25, "logps/chosen": -276.08719308035717, "logps/rejected": -110.87495422363281, "loss": 0.4968, "rewards/chosen": 0.06858171735491071, "rewards/margins": 1.3230906895228796, "rewards/rejected": -1.2545089721679688, "step": 4111 }, { "epoch": 0.2179524554103835, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1656276.3333333333, "logits/rejected": -27373145.6, "logps/chosen": -249.76422119140625, "logps/rejected": -426.9314453125, "loss": 0.2589, "rewards/chosen": -0.020784194270769756, "rewards/margins": 2.378832428654035, "rewards/rejected": -2.3996166229248046, "step": 4112 }, { "epoch": 0.21800545941218563, "grad_norm": 51.0, "kl": 2.1830949783325195, "learning_rate": 5e-07, "logits/chosen": -37195509.333333336, "logits/rejected": -56708176.0, "logps/chosen": -398.9529622395833, "logps/rejected": -470.9429626464844, "loss": 0.3758, "rewards/chosen": 0.4159732659657796, "rewards/margins": 3.3435245354970298, "rewards/rejected": -2.92755126953125, "step": 4113 }, { "epoch": 0.21805846341398777, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28448980.0, "logits/rejected": 23140204.0, "logps/chosen": -174.14169311523438, "logps/rejected": -305.60406494140625, "loss": 0.4202, "rewards/chosen": -0.1780402660369873, "rewards/margins": 0.932955265045166, "rewards/rejected": -1.1109955310821533, "step": 4114 }, { "epoch": 0.2181114674157899, "grad_norm": 59.5, "kl": 0.024570465087890625, "learning_rate": 5e-07, "logits/chosen": 74297221.33333333, "logits/rejected": -21487658.0, "logps/chosen": -230.89469401041666, "logps/rejected": -592.4800415039062, "loss": 0.3957, "rewards/chosen": 0.29581667979558307, "rewards/margins": 1.4507056673367817, "rewards/rejected": -1.1548889875411987, "step": 4115 }, { "epoch": 0.21816447141759202, "grad_norm": 47.25, "kl": 0.19645118713378906, "learning_rate": 5e-07, "logits/chosen": -47053814.4, "logits/rejected": -48345136.0, "logps/chosen": -133.96258544921875, "logps/rejected": -556.6824951171875, "loss": 0.3951, "rewards/chosen": -0.28853650093078614, "rewards/margins": 2.24407057762146, "rewards/rejected": -2.532607078552246, "step": 4116 }, { "epoch": 0.21821747541939415, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6570206.666666667, "logits/rejected": -44327657.6, "logps/chosen": -435.3972981770833, "logps/rejected": -330.428076171875, "loss": 0.25, "rewards/chosen": 0.49215900897979736, "rewards/margins": 2.1637282609939574, "rewards/rejected": -1.67156925201416, "step": 4117 }, { "epoch": 0.2182704794211963, "grad_norm": 60.0, "kl": 0.22444915771484375, "learning_rate": 5e-07, "logits/chosen": -15911157.0, "logits/rejected": -16319938.0, "logps/chosen": -406.52191162109375, "logps/rejected": -141.77621459960938, "loss": 0.3664, "rewards/chosen": 0.2739192843437195, "rewards/margins": 1.2244155406951904, "rewards/rejected": -0.950496256351471, "step": 4118 }, { "epoch": 0.21832348342299843, "grad_norm": 69.0, "kl": 1.536306381225586, "learning_rate": 5e-07, "logits/chosen": -39605372.0, "logits/rejected": -32078468.0, "logps/chosen": -764.0557861328125, "logps/rejected": -346.24114990234375, "loss": 0.2989, "rewards/chosen": 0.7803223133087158, "rewards/margins": 2.552114248275757, "rewards/rejected": -1.771791934967041, "step": 4119 }, { "epoch": 0.21837648742480056, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17737748.0, "logits/rejected": -23707312.0, "logps/chosen": -476.4515380859375, "logps/rejected": -376.4466959635417, "loss": 0.2526, "rewards/chosen": 0.05929955840110779, "rewards/margins": 1.8638182580471039, "rewards/rejected": -1.804518699645996, "step": 4120 }, { "epoch": 0.2184294914266027, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8170456.0, "logits/rejected": -9277456.666666666, "logps/chosen": -82.48897094726563, "logps/rejected": -209.06754557291666, "loss": 0.4239, "rewards/chosen": 0.09664101600646972, "rewards/margins": 0.7980041027069091, "rewards/rejected": -0.7013630867004395, "step": 4121 }, { "epoch": 0.21848249542840484, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12960278.666666666, "logits/rejected": -86883097.6, "logps/chosen": -114.269287109375, "logps/rejected": -639.98330078125, "loss": 0.1982, "rewards/chosen": 0.28515295187632245, "rewards/margins": 3.6187020699183146, "rewards/rejected": -3.3335491180419923, "step": 4122 }, { "epoch": 0.21853549943020698, "grad_norm": 54.25, "kl": 0.09755706787109375, "learning_rate": 5e-07, "logits/chosen": -27097610.666666668, "logits/rejected": -65826163.2, "logps/chosen": -288.64501953125, "logps/rejected": -371.702978515625, "loss": 0.2891, "rewards/chosen": 0.19755248228708902, "rewards/margins": 1.7672909657160443, "rewards/rejected": -1.5697384834289552, "step": 4123 }, { "epoch": 0.2185885034320091, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23551392.0, "logits/rejected": -19629225.6, "logps/chosen": -69.33992004394531, "logps/rejected": -280.742626953125, "loss": 0.2779, "rewards/chosen": 0.20027567942937216, "rewards/margins": 2.2442068775494897, "rewards/rejected": -2.0439311981201174, "step": 4124 }, { "epoch": 0.21864150743381125, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42517082.666666664, "logits/rejected": 28759993.6, "logps/chosen": -346.3136393229167, "logps/rejected": -402.52880859375, "loss": 0.2889, "rewards/chosen": 0.010903427998224894, "rewards/margins": 1.8106581429640454, "rewards/rejected": -1.7997547149658204, "step": 4125 }, { "epoch": 0.2186945114356134, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16930196.0, "logits/rejected": -25882806.4, "logps/chosen": -190.3770548502604, "logps/rejected": -453.197998046875, "loss": 0.2679, "rewards/chosen": -0.2462566296259562, "rewards/margins": 2.126205643018087, "rewards/rejected": -2.372462272644043, "step": 4126 }, { "epoch": 0.21874751543741552, "grad_norm": 55.75, "kl": 3.0432491302490234, "learning_rate": 5e-07, "logits/chosen": -114909017.6, "logits/rejected": -14204846.666666666, "logps/chosen": -577.99638671875, "logps/rejected": -313.66552734375, "loss": 0.2904, "rewards/chosen": 0.9110549926757813, "rewards/margins": 3.2500444094340004, "rewards/rejected": -2.3389894167582193, "step": 4127 }, { "epoch": 0.21880051943921766, "grad_norm": 55.0, "kl": 0.32610321044921875, "learning_rate": 5e-07, "logits/chosen": -5245536.0, "logits/rejected": -69398984.0, "logps/chosen": -284.0609130859375, "logps/rejected": -411.9128723144531, "loss": 0.2843, "rewards/chosen": 0.12542419135570526, "rewards/margins": 2.691928431391716, "rewards/rejected": -2.5665042400360107, "step": 4128 }, { "epoch": 0.2188535234410198, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42991989.333333336, "logits/rejected": -28426816.0, "logps/chosen": -207.379638671875, "logps/rejected": -481.90283203125, "loss": 0.287, "rewards/chosen": -0.011530684928099314, "rewards/margins": 2.3325092320640883, "rewards/rejected": -2.3440399169921875, "step": 4129 }, { "epoch": 0.21890652744282194, "grad_norm": 76.5, "kl": 0.1646556854248047, "learning_rate": 5e-07, "logits/chosen": 31408122.0, "logits/rejected": -5894106.0, "logps/chosen": -484.9832458496094, "logps/rejected": -184.48666381835938, "loss": 0.2901, "rewards/chosen": 0.6230522394180298, "rewards/margins": 2.0289417505264282, "rewards/rejected": -1.4058895111083984, "step": 4130 }, { "epoch": 0.21895953144462407, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20648968.0, "logits/rejected": -1876544.75, "logps/chosen": -312.0328369140625, "logps/rejected": -228.06715393066406, "loss": 0.3552, "rewards/chosen": -0.17178060114383698, "rewards/margins": 1.5224711745977402, "rewards/rejected": -1.6942517757415771, "step": 4131 }, { "epoch": 0.2190125354464262, "grad_norm": 54.0, "kl": 0.6608982086181641, "learning_rate": 5e-07, "logits/chosen": -54940768.0, "logits/rejected": -8475569.6, "logps/chosen": -379.7727864583333, "logps/rejected": -210.2923095703125, "loss": 0.2892, "rewards/chosen": 0.3683939774831136, "rewards/margins": 1.825555690129598, "rewards/rejected": -1.4571617126464844, "step": 4132 }, { "epoch": 0.21906553944822835, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58910553.6, "logits/rejected": -24425498.666666668, "logps/chosen": -465.900537109375, "logps/rejected": -303.27121988932294, "loss": 0.3865, "rewards/chosen": 0.08824829459190368, "rewards/margins": 1.3248994767665863, "rewards/rejected": -1.2366511821746826, "step": 4133 }, { "epoch": 0.21911854345003048, "grad_norm": 63.5, "kl": 0.18542861938476562, "learning_rate": 5e-07, "logits/chosen": -30055909.333333332, "logits/rejected": 1745579.5, "logps/chosen": -770.4859212239584, "logps/rejected": -52.7552490234375, "loss": 0.3789, "rewards/chosen": 0.38003591696421307, "rewards/margins": 1.577618400255839, "rewards/rejected": -1.197582483291626, "step": 4134 }, { "epoch": 0.21917154745183262, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13202712.0, "logits/rejected": -18194364.0, "logps/chosen": -604.7069702148438, "logps/rejected": -233.856201171875, "loss": 0.3719, "rewards/chosen": 0.2830728590488434, "rewards/margins": 1.1640180051326752, "rewards/rejected": -0.8809451460838318, "step": 4135 }, { "epoch": 0.21922455145363476, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6212613.333333333, "logits/rejected": -71299840.0, "logps/chosen": -100.92962646484375, "logps/rejected": -769.2318115234375, "loss": 0.4393, "rewards/chosen": -0.20005313555399576, "rewards/margins": 2.3856311639149985, "rewards/rejected": -2.585684299468994, "step": 4136 }, { "epoch": 0.2192775554554369, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23621970.666666668, "logits/rejected": -16118774.0, "logps/chosen": -290.8836263020833, "logps/rejected": -176.93435668945312, "loss": 0.4423, "rewards/chosen": 0.07507432003815968, "rewards/margins": 0.8371986200412115, "rewards/rejected": -0.7621243000030518, "step": 4137 }, { "epoch": 0.21933055945723903, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33439120.0, "logits/rejected": -14584333.0, "logps/chosen": -266.9444580078125, "logps/rejected": -351.66900634765625, "loss": 0.3945, "rewards/chosen": 0.2932235797246297, "rewards/margins": 2.183228691418966, "rewards/rejected": -1.890005111694336, "step": 4138 }, { "epoch": 0.21938356345904117, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28732413.333333332, "logits/rejected": -83686.4, "logps/chosen": -56.9974365234375, "logps/rejected": -154.698583984375, "loss": 0.3337, "rewards/chosen": 0.26147447029749554, "rewards/margins": 1.4612618962923687, "rewards/rejected": -1.199787425994873, "step": 4139 }, { "epoch": 0.2194365674608433, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10490887.333333334, "logits/rejected": -28369148.8, "logps/chosen": -265.8029378255208, "logps/rejected": -453.832177734375, "loss": 0.221, "rewards/chosen": 0.6277937491734823, "rewards/margins": 2.9077749808629356, "rewards/rejected": -2.279981231689453, "step": 4140 }, { "epoch": 0.21948957146264542, "grad_norm": 58.25, "kl": 1.1952705383300781, "learning_rate": 5e-07, "logits/chosen": -36044660.0, "logits/rejected": -11577570.0, "logps/chosen": -633.5816650390625, "logps/rejected": -217.76190185546875, "loss": 0.2637, "rewards/chosen": 0.7480928897857666, "rewards/margins": 3.399441957473755, "rewards/rejected": -2.6513490676879883, "step": 4141 }, { "epoch": 0.21954257546444755, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72056736.0, "logits/rejected": -36843008.0, "logps/chosen": -374.7212727864583, "logps/rejected": -355.12216796875, "loss": 0.2969, "rewards/chosen": -0.13467025756835938, "rewards/margins": 1.8953046798706055, "rewards/rejected": -2.029974937438965, "step": 4142 }, { "epoch": 0.2195955794662497, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31410632.0, "logits/rejected": -49673500.0, "logps/chosen": -257.14630126953125, "logps/rejected": -424.752685546875, "loss": 0.3101, "rewards/chosen": 0.1734318733215332, "rewards/margins": 2.2120699882507324, "rewards/rejected": -2.038638114929199, "step": 4143 }, { "epoch": 0.21964858346805183, "grad_norm": 52.0, "kl": 1.0651636123657227, "learning_rate": 5e-07, "logits/chosen": -27023856.0, "logits/rejected": -124193920.0, "logps/chosen": -285.6234944661458, "logps/rejected": -473.00360107421875, "loss": 0.325, "rewards/chosen": 0.519634485244751, "rewards/margins": 3.2461955547332764, "rewards/rejected": -2.7265610694885254, "step": 4144 }, { "epoch": 0.21970158746985397, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52202476.8, "logits/rejected": -21199962.666666668, "logps/chosen": -266.97568359375, "logps/rejected": -291.41957600911456, "loss": 0.4156, "rewards/chosen": -0.08522777557373047, "rewards/margins": 1.125237496693929, "rewards/rejected": -1.2104652722676594, "step": 4145 }, { "epoch": 0.2197545914716561, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25542736.0, "logits/rejected": -29602988.8, "logps/chosen": -579.7469889322916, "logps/rejected": -356.75283203125, "loss": 0.2334, "rewards/chosen": 0.5337142944335938, "rewards/margins": 2.573141098022461, "rewards/rejected": -2.039426803588867, "step": 4146 }, { "epoch": 0.21980759547345824, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31246101.333333332, "logits/rejected": -45241382.4, "logps/chosen": -285.73581949869794, "logps/rejected": -472.4513671875, "loss": 0.3111, "rewards/chosen": 0.0032100677490234375, "rewards/margins": 1.7351245880126953, "rewards/rejected": -1.7319145202636719, "step": 4147 }, { "epoch": 0.21986059947526038, "grad_norm": 59.25, "kl": 0.5111541748046875, "learning_rate": 5e-07, "logits/chosen": -57960016.0, "logits/rejected": -56005248.0, "logps/chosen": -546.4620361328125, "logps/rejected": -320.00712890625, "loss": 0.2513, "rewards/chosen": 0.5310490926106771, "rewards/margins": 2.1767608006795247, "rewards/rejected": -1.6457117080688477, "step": 4148 }, { "epoch": 0.2199136034770625, "grad_norm": 47.5, "kl": 1.0776252746582031, "learning_rate": 5e-07, "logits/chosen": -5022819.0, "logits/rejected": -3748255.0, "logps/chosen": -280.61309814453125, "logps/rejected": -190.79360961914062, "loss": 0.3249, "rewards/chosen": 0.5044002532958984, "rewards/margins": 1.9205079078674316, "rewards/rejected": -1.4161076545715332, "step": 4149 }, { "epoch": 0.21996660747886465, "grad_norm": 56.5, "kl": 1.3035306930541992, "learning_rate": 5e-07, "logits/chosen": -24915488.0, "logits/rejected": -20794782.0, "logps/chosen": -732.4578857421875, "logps/rejected": -304.33660888671875, "loss": 0.1917, "rewards/chosen": 1.1128510236740112, "rewards/margins": 3.24370276927948, "rewards/rejected": -2.1308517456054688, "step": 4150 }, { "epoch": 0.2200196114806668, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21713352.0, "logits/rejected": -36204112.0, "logps/chosen": -304.23779296875, "logps/rejected": -300.8756408691406, "loss": 0.3201, "rewards/chosen": 0.19185811281204224, "rewards/margins": 1.8236375451087952, "rewards/rejected": -1.631779432296753, "step": 4151 }, { "epoch": 0.22007261548246893, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37578378.666666664, "logits/rejected": -56702940.0, "logps/chosen": -428.0002034505208, "logps/rejected": -359.70611572265625, "loss": 0.3141, "rewards/chosen": 0.5611335833867391, "rewards/margins": 3.2344581683476767, "rewards/rejected": -2.6733245849609375, "step": 4152 }, { "epoch": 0.22012561948427106, "grad_norm": 40.75, "kl": 0.2278881072998047, "learning_rate": 5e-07, "logits/chosen": -4383469.6, "logits/rejected": 11988982.666666666, "logps/chosen": -192.92579345703126, "logps/rejected": -303.1216634114583, "loss": 0.2595, "rewards/chosen": 0.6890531063079834, "rewards/margins": 2.9994844277699793, "rewards/rejected": -2.3104313214619956, "step": 4153 }, { "epoch": 0.2201786234860732, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62531824.0, "logits/rejected": -30674197.333333332, "logps/chosen": -427.7041320800781, "logps/rejected": -343.4593098958333, "loss": 0.2234, "rewards/chosen": 0.007321164011955261, "rewards/margins": 2.1306511213382087, "rewards/rejected": -2.1233299573262534, "step": 4154 }, { "epoch": 0.22023162748787534, "grad_norm": 58.5, "kl": 0.543670654296875, "learning_rate": 5e-07, "logits/chosen": -42504362.666666664, "logits/rejected": -43278960.0, "logps/chosen": -232.1716512044271, "logps/rejected": -292.276611328125, "loss": 0.4139, "rewards/chosen": 0.12919013698895773, "rewards/margins": 1.7011871834595997, "rewards/rejected": -1.571997046470642, "step": 4155 }, { "epoch": 0.22028463148967747, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50438112.0, "logits/rejected": -27597033.6, "logps/chosen": -561.1310221354166, "logps/rejected": -232.0625, "loss": 0.3789, "rewards/chosen": -0.034388224283854164, "rewards/margins": 0.8300615628560385, "rewards/rejected": -0.8644497871398926, "step": 4156 }, { "epoch": 0.2203376354914796, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16403337.333333334, "logits/rejected": -72316360.0, "logps/chosen": -255.30509440104166, "logps/rejected": -399.9754943847656, "loss": 0.4389, "rewards/chosen": -0.19411540031433105, "rewards/margins": 2.480571746826172, "rewards/rejected": -2.674687147140503, "step": 4157 }, { "epoch": 0.22039063949328175, "grad_norm": 67.5, "kl": 3.2871923446655273, "learning_rate": 5e-07, "logits/chosen": -27255980.0, "logits/rejected": -28883482.0, "logps/chosen": -371.9240417480469, "logps/rejected": -146.15040588378906, "loss": 0.3147, "rewards/chosen": 1.1394309997558594, "rewards/margins": 2.357836961746216, "rewards/rejected": -1.2184059619903564, "step": 4158 }, { "epoch": 0.22044364349508389, "grad_norm": 60.75, "kl": 1.6164512634277344, "learning_rate": 5e-07, "logits/chosen": -26593664.0, "logits/rejected": 5948561.0, "logps/chosen": -585.2357177734375, "logps/rejected": -385.957763671875, "loss": 0.2794, "rewards/chosen": 0.4738677144050598, "rewards/margins": 2.8035603165626526, "rewards/rejected": -2.3296926021575928, "step": 4159 }, { "epoch": 0.22049664749688602, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2630772.6666666665, "logits/rejected": -23327328.0, "logps/chosen": -73.68310546875, "logps/rejected": -275.3395751953125, "loss": 0.3048, "rewards/chosen": -0.08657671014467876, "rewards/margins": 1.8028726001580555, "rewards/rejected": -1.8894493103027343, "step": 4160 }, { "epoch": 0.22054965149868816, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64217520.0, "logits/rejected": -5668294.0, "logps/chosen": -467.1315104166667, "logps/rejected": -203.744384765625, "loss": 0.2987, "rewards/chosen": 0.44300846258799237, "rewards/margins": 1.7011507431666057, "rewards/rejected": -1.2581422805786133, "step": 4161 }, { "epoch": 0.2206026555004903, "grad_norm": 62.25, "kl": 0.724395751953125, "learning_rate": 5e-07, "logits/chosen": -23484790.4, "logits/rejected": -28176224.0, "logps/chosen": -394.821337890625, "logps/rejected": -402.6229654947917, "loss": 0.3207, "rewards/chosen": 0.2544825792312622, "rewards/margins": 2.9684572140375773, "rewards/rejected": -2.713974634806315, "step": 4162 }, { "epoch": 0.22065565950229243, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45582307.2, "logits/rejected": -30696586.666666668, "logps/chosen": -235.6743408203125, "logps/rejected": -546.5032145182291, "loss": 0.3857, "rewards/chosen": -0.26331367492675783, "rewards/margins": 2.0713142712910972, "rewards/rejected": -2.334627946217855, "step": 4163 }, { "epoch": 0.22070866350409457, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38339907.2, "logits/rejected": -39036848.0, "logps/chosen": -224.374853515625, "logps/rejected": -471.7847086588542, "loss": 0.3855, "rewards/chosen": 0.03183901309967041, "rewards/margins": 1.5127787192662556, "rewards/rejected": -1.4809397061665852, "step": 4164 }, { "epoch": 0.2207616675058967, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17434482.0, "logits/rejected": -8765154.0, "logps/chosen": -162.42904663085938, "logps/rejected": -221.49917602539062, "loss": 0.3046, "rewards/chosen": 0.18180671334266663, "rewards/margins": 1.9655453264713287, "rewards/rejected": -1.783738613128662, "step": 4165 }, { "epoch": 0.22081467150769882, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29451952.0, "logits/rejected": -27655228.8, "logps/chosen": -340.5876057942708, "logps/rejected": -193.96044921875, "loss": 0.3702, "rewards/chosen": -0.2462839682896932, "rewards/margins": 1.035910169283549, "rewards/rejected": -1.2821941375732422, "step": 4166 }, { "epoch": 0.22086767550950095, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69962768.0, "logits/rejected": -33715696.0, "logps/chosen": -324.3475341796875, "logps/rejected": -413.175537109375, "loss": 0.372, "rewards/chosen": -0.2224903106689453, "rewards/margins": 1.6966729164123535, "rewards/rejected": -1.9191632270812988, "step": 4167 }, { "epoch": 0.2209206795113031, "grad_norm": 48.25, "kl": 0.5227642059326172, "learning_rate": 5e-07, "logits/chosen": -19993224.0, "logits/rejected": -29740252.0, "logps/chosen": -289.93798828125, "logps/rejected": -249.6600799560547, "loss": 0.295, "rewards/chosen": 0.6228156089782715, "rewards/margins": 3.497912883758545, "rewards/rejected": -2.8750972747802734, "step": 4168 }, { "epoch": 0.22097368351310523, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9101828.0, "logits/rejected": 82367104.0, "logps/chosen": -378.51983642578125, "logps/rejected": -506.35430908203125, "loss": 0.2946, "rewards/chosen": 0.3268989324569702, "rewards/margins": 2.4298113584518433, "rewards/rejected": -2.102912425994873, "step": 4169 }, { "epoch": 0.22102668751490737, "grad_norm": 53.5, "kl": 0.45160675048828125, "learning_rate": 5e-07, "logits/chosen": -63494101.333333336, "logits/rejected": -11408906.4, "logps/chosen": -508.9021809895833, "logps/rejected": -271.497314453125, "loss": 0.29, "rewards/chosen": -0.09859110911687215, "rewards/margins": 2.09817707935969, "rewards/rejected": -2.1967681884765624, "step": 4170 }, { "epoch": 0.2210796915167095, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75652096.0, "logits/rejected": -9438169.142857144, "logps/chosen": -666.831787109375, "logps/rejected": -143.8941650390625, "loss": 0.1978, "rewards/chosen": 1.5478394031524658, "rewards/margins": 3.1197694710322788, "rewards/rejected": -1.5719300678798132, "step": 4171 }, { "epoch": 0.22113269551851164, "grad_norm": 54.0, "kl": 1.0946674346923828, "learning_rate": 5e-07, "logits/chosen": -27295433.6, "logits/rejected": -9341520.0, "logps/chosen": -355.998876953125, "logps/rejected": -603.4273274739584, "loss": 0.3661, "rewards/chosen": 0.017894327640533447, "rewards/margins": 2.6149014433224997, "rewards/rejected": -2.5970071156819663, "step": 4172 }, { "epoch": 0.22118569952031378, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25259338.666666668, "logits/rejected": -28891756.8, "logps/chosen": -273.07102457682294, "logps/rejected": -326.72880859375, "loss": 0.2802, "rewards/chosen": 0.5519452095031738, "rewards/margins": 1.9708731651306153, "rewards/rejected": -1.4189279556274415, "step": 4173 }, { "epoch": 0.22123870352211591, "grad_norm": 81.5, "kl": 1.334808349609375, "learning_rate": 5e-07, "logits/chosen": -44956124.0, "logits/rejected": 14969232.0, "logps/chosen": -919.2991943359375, "logps/rejected": -406.96484375, "loss": 0.2978, "rewards/chosen": 0.40387773513793945, "rewards/margins": 2.014233350753784, "rewards/rejected": -1.6103556156158447, "step": 4174 }, { "epoch": 0.22129170752391805, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37855232.0, "logits/rejected": -8538477.333333334, "logps/chosen": -144.04977416992188, "logps/rejected": -227.81827799479166, "loss": 0.3507, "rewards/chosen": -0.39553070068359375, "rewards/margins": 0.8185745875040691, "rewards/rejected": -1.2141052881876628, "step": 4175 }, { "epoch": 0.2213447115257202, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4058658.5, "logits/rejected": -27010859.42857143, "logps/chosen": -3.9039459228515625, "logps/rejected": -268.28499930245533, "loss": 0.2505, "rewards/chosen": 0.032509613782167435, "rewards/margins": 1.5676658637821674, "rewards/rejected": -1.53515625, "step": 4176 }, { "epoch": 0.22139771552752233, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14147908.0, "logits/rejected": -24127414.0, "logps/chosen": -55.5385856628418, "logps/rejected": -228.76991271972656, "loss": 0.3807, "rewards/chosen": -0.38094520568847656, "rewards/margins": 1.3402438163757324, "rewards/rejected": -1.721189022064209, "step": 4177 }, { "epoch": 0.22145071952932446, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3872367.25, "logits/rejected": -21307396.0, "logps/chosen": -54.41471862792969, "logps/rejected": -530.3838500976562, "loss": 0.3653, "rewards/chosen": -0.19233939051628113, "rewards/margins": 1.68754044175148, "rewards/rejected": -1.8798798322677612, "step": 4178 }, { "epoch": 0.2215037235311266, "grad_norm": 54.25, "kl": 2.5673751831054688, "learning_rate": 5e-07, "logits/chosen": -42854925.333333336, "logits/rejected": -35424787.2, "logps/chosen": -539.41015625, "logps/rejected": -454.37763671875, "loss": 0.2652, "rewards/chosen": 0.5895299116770426, "rewards/margins": 2.7562886397043864, "rewards/rejected": -2.166758728027344, "step": 4179 }, { "epoch": 0.22155672753292874, "grad_norm": 53.0, "kl": 1.6640853881835938, "learning_rate": 5e-07, "logits/chosen": -26570616.0, "logits/rejected": -28097046.0, "logps/chosen": -242.8480224609375, "logps/rejected": -207.46609497070312, "loss": 0.3677, "rewards/chosen": 0.5521610180536906, "rewards/margins": 1.9335391918818154, "rewards/rejected": -1.381378173828125, "step": 4180 }, { "epoch": 0.22160973153473087, "grad_norm": 42.75, "kl": 0.14273786544799805, "learning_rate": 5e-07, "logits/chosen": -11705748.8, "logits/rejected": -15807185.333333334, "logps/chosen": -115.76453857421875, "logps/rejected": -284.9843343098958, "loss": 0.3669, "rewards/chosen": 0.018049129843711854, "rewards/margins": 1.8358658641576766, "rewards/rejected": -1.8178167343139648, "step": 4181 }, { "epoch": 0.221662735536533, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5313290.5, "logits/rejected": -18430402.285714287, "logps/chosen": -279.54595947265625, "logps/rejected": -240.25997488839286, "loss": 0.2119, "rewards/chosen": 1.2932952642440796, "rewards/margins": 2.8084366151264737, "rewards/rejected": -1.5151413508823939, "step": 4182 }, { "epoch": 0.22171573953833515, "grad_norm": 94.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57312533.333333336, "logits/rejected": -61075033.6, "logps/chosen": -317.77880859375, "logps/rejected": -496.726416015625, "loss": 0.271, "rewards/chosen": 0.053307597835858665, "rewards/margins": 2.076763026913007, "rewards/rejected": -2.0234554290771483, "step": 4183 }, { "epoch": 0.2217687435401373, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2027108.3333333333, "logits/rejected": -44151596.8, "logps/chosen": -126.94666544596355, "logps/rejected": -281.6924072265625, "loss": 0.2856, "rewards/chosen": 0.4691607157389323, "rewards/margins": 1.8481753031412762, "rewards/rejected": -1.3790145874023438, "step": 4184 }, { "epoch": 0.22182174754193942, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69751840.0, "logits/rejected": -3855035.0, "logps/chosen": -255.9850311279297, "logps/rejected": -76.26078796386719, "loss": 0.3333, "rewards/chosen": 0.36645498871803284, "rewards/margins": 1.5490638315677643, "rewards/rejected": -1.1826088428497314, "step": 4185 }, { "epoch": 0.22187475154374156, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5363591.5, "logits/rejected": -17902746.666666668, "logps/chosen": -156.3660888671875, "logps/rejected": -432.32666015625, "loss": 0.1662, "rewards/chosen": 0.7624947428703308, "rewards/margins": 3.181948204835256, "rewards/rejected": -2.4194534619649253, "step": 4186 }, { "epoch": 0.2219277555455437, "grad_norm": 54.25, "kl": 1.6085796356201172, "learning_rate": 5e-07, "logits/chosen": -16456032.0, "logits/rejected": -7706348.666666667, "logps/chosen": -383.85791015625, "logps/rejected": -576.0179036458334, "loss": 0.2651, "rewards/chosen": 0.8699122428894043, "rewards/margins": 3.544910208384196, "rewards/rejected": -2.6749979654947915, "step": 4187 }, { "epoch": 0.22198075954734584, "grad_norm": 46.5, "kl": 0.24793052673339844, "learning_rate": 5e-07, "logits/chosen": -20523492.0, "logits/rejected": -41353916.0, "logps/chosen": -201.62925720214844, "logps/rejected": -681.09326171875, "loss": 0.3303, "rewards/chosen": 0.015435799956321716, "rewards/margins": 1.8907805532217026, "rewards/rejected": -1.8753447532653809, "step": 4188 }, { "epoch": 0.22203376354914797, "grad_norm": 52.0, "kl": 0.8396568298339844, "learning_rate": 5e-07, "logits/chosen": -10174406.4, "logits/rejected": -53455232.0, "logps/chosen": -415.652294921875, "logps/rejected": -399.537109375, "loss": 0.2851, "rewards/chosen": 0.8276973724365234, "rewards/margins": 2.6289262135823566, "rewards/rejected": -1.8012288411458333, "step": 4189 }, { "epoch": 0.2220867675509501, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6831768.0, "logits/rejected": -30871770.666666668, "logps/chosen": -260.94580078125, "logps/rejected": -477.4769694010417, "loss": 0.2635, "rewards/chosen": 0.7189011573791504, "rewards/margins": 3.0991772015889487, "rewards/rejected": -2.3802760442097983, "step": 4190 }, { "epoch": 0.22213977155275225, "grad_norm": 60.25, "kl": 1.1209068298339844, "learning_rate": 5e-07, "logits/chosen": -25986044.0, "logits/rejected": -11990756.0, "logps/chosen": -319.060791015625, "logps/rejected": -302.3968505859375, "loss": 0.3919, "rewards/chosen": 0.1508493423461914, "rewards/margins": 1.4005670547485352, "rewards/rejected": -1.2497177124023438, "step": 4191 }, { "epoch": 0.22219277555455436, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3914180.0, "logits/rejected": 289639.9, "logps/chosen": -93.06103515625, "logps/rejected": -86.4801513671875, "loss": 0.4242, "rewards/chosen": -0.5459294319152832, "rewards/margins": 0.3859834671020508, "rewards/rejected": -0.931912899017334, "step": 4192 }, { "epoch": 0.2222457795563565, "grad_norm": 57.25, "kl": 0.5442581176757812, "learning_rate": 5e-07, "logits/chosen": -17323689.6, "logits/rejected": -23118906.666666668, "logps/chosen": -737.7634765625, "logps/rejected": -129.76596069335938, "loss": 0.3289, "rewards/chosen": 0.4322805881500244, "rewards/margins": 2.056767193476359, "rewards/rejected": -1.6244866053263347, "step": 4193 }, { "epoch": 0.22229878355815863, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -99412085.33333333, "logits/rejected": -18718033.6, "logps/chosen": -388.0902099609375, "logps/rejected": -201.5576416015625, "loss": 0.3325, "rewards/chosen": -0.02191162109375, "rewards/margins": 1.3987771987915039, "rewards/rejected": -1.4206888198852539, "step": 4194 }, { "epoch": 0.22235178755996077, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7547543.0, "logits/rejected": -5611611.428571428, "logps/chosen": -305.2377624511719, "logps/rejected": -154.66746303013392, "loss": 0.2205, "rewards/chosen": 1.1260406970977783, "rewards/margins": 2.4658712318965366, "rewards/rejected": -1.3398305347987585, "step": 4195 }, { "epoch": 0.2224047915617629, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8940996.0, "logits/rejected": -15325160.0, "logps/chosen": -178.4120076497396, "logps/rejected": -263.747119140625, "loss": 0.303, "rewards/chosen": 0.0375600258509318, "rewards/margins": 1.6961989800135295, "rewards/rejected": -1.6586389541625977, "step": 4196 }, { "epoch": 0.22245779556356504, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56256712.0, "logits/rejected": -19346354.0, "logps/chosen": -182.65789794921875, "logps/rejected": -459.4403381347656, "loss": 0.2859, "rewards/chosen": 0.05505190044641495, "rewards/margins": 2.586299516260624, "rewards/rejected": -2.531247615814209, "step": 4197 }, { "epoch": 0.22251079956536718, "grad_norm": 58.25, "kl": 3.9389514923095703, "learning_rate": 5e-07, "logits/chosen": -46024810.666666664, "logits/rejected": 5976101.0, "logps/chosen": -633.0667317708334, "logps/rejected": -696.2039794921875, "loss": 0.3112, "rewards/chosen": 0.8917003472646078, "rewards/margins": 4.928326686223348, "rewards/rejected": -4.03662633895874, "step": 4198 }, { "epoch": 0.22256380356716932, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31299584.0, "logits/rejected": -16571558.4, "logps/chosen": -223.4918212890625, "logps/rejected": -289.939892578125, "loss": 0.3295, "rewards/chosen": -0.2070310115814209, "rewards/margins": 1.4854097843170166, "rewards/rejected": -1.6924407958984375, "step": 4199 }, { "epoch": 0.22261680756897145, "grad_norm": 43.5, "kl": 0.09992027282714844, "learning_rate": 5e-07, "logits/chosen": -59264768.0, "logits/rejected": -14316822.4, "logps/chosen": -273.220458984375, "logps/rejected": -221.872216796875, "loss": 0.3014, "rewards/chosen": 0.03329748411973318, "rewards/margins": 1.918905394275983, "rewards/rejected": -1.88560791015625, "step": 4200 }, { "epoch": 0.2226698115707736, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1388251.4, "logits/rejected": -65939920.0, "logps/chosen": -96.91381225585937, "logps/rejected": -488.2078450520833, "loss": 0.3747, "rewards/chosen": -0.28590874671936034, "rewards/margins": 2.6530115922292077, "rewards/rejected": -2.938920338948568, "step": 4201 }, { "epoch": 0.22272281557257573, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57783680.0, "logits/rejected": -1995025.5, "logps/chosen": -362.3860168457031, "logps/rejected": -278.6918640136719, "loss": 0.327, "rewards/chosen": -0.11908715963363647, "rewards/margins": 2.5348077416419983, "rewards/rejected": -2.6538949012756348, "step": 4202 }, { "epoch": 0.22277581957437786, "grad_norm": 60.25, "kl": 1.455230712890625, "learning_rate": 5e-07, "logits/chosen": -43375300.571428575, "logits/rejected": -8303427.0, "logps/chosen": -292.89254324776783, "logps/rejected": -341.9808349609375, "loss": 0.4645, "rewards/chosen": 0.07825125115258354, "rewards/margins": 2.8813427771840776, "rewards/rejected": -2.803091526031494, "step": 4203 }, { "epoch": 0.22282882357618, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17179420.0, "logits/rejected": -14938351.0, "logps/chosen": -462.9449462890625, "logps/rejected": -162.1764373779297, "loss": 0.3039, "rewards/chosen": 0.17343120276927948, "rewards/margins": 1.9710097759962082, "rewards/rejected": -1.7975785732269287, "step": 4204 }, { "epoch": 0.22288182757798214, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28179565.333333332, "logits/rejected": -23744248.0, "logps/chosen": -172.36895751953125, "logps/rejected": -327.12445068359375, "loss": 0.4126, "rewards/chosen": 0.18008214235305786, "rewards/margins": 1.1393916010856628, "rewards/rejected": -0.959309458732605, "step": 4205 }, { "epoch": 0.22293483157978428, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14377421.333333334, "logits/rejected": -5358372.0, "logps/chosen": -484.246337890625, "logps/rejected": -243.2599609375, "loss": 0.2672, "rewards/chosen": 0.5171213944753011, "rewards/margins": 2.1894667466481526, "rewards/rejected": -1.6723453521728515, "step": 4206 }, { "epoch": 0.2229878355815864, "grad_norm": 49.75, "kl": 0.2682771682739258, "learning_rate": 5e-07, "logits/chosen": -30710723.2, "logits/rejected": -27132594.666666668, "logps/chosen": -242.055078125, "logps/rejected": -211.4625447591146, "loss": 0.3456, "rewards/chosen": 0.1461189031600952, "rewards/margins": 2.628729764620463, "rewards/rejected": -2.4826108614603677, "step": 4207 }, { "epoch": 0.22304083958338855, "grad_norm": 49.5, "kl": 0.5239486694335938, "learning_rate": 5e-07, "logits/chosen": 3238098.3333333335, "logits/rejected": -27884496.0, "logps/chosen": -286.39752197265625, "logps/rejected": -402.2436279296875, "loss": 0.251, "rewards/chosen": 0.43494145075480145, "rewards/margins": 2.638553206125895, "rewards/rejected": -2.2036117553710937, "step": 4208 }, { "epoch": 0.2230938435851907, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4753952.0, "logits/rejected": -27417666.666666668, "logps/chosen": -25.768558502197266, "logps/rejected": -175.5995076497396, "loss": 0.3325, "rewards/chosen": -0.2137460708618164, "rewards/margins": 0.9200314680735271, "rewards/rejected": -1.1337775389353435, "step": 4209 }, { "epoch": 0.22314684758699282, "grad_norm": 45.5, "kl": 0.3087158203125, "learning_rate": 5e-07, "logits/chosen": -50043212.8, "logits/rejected": -34688733.333333336, "logps/chosen": -358.2326171875, "logps/rejected": -224.1228230794271, "loss": 0.2672, "rewards/chosen": 0.7473116397857666, "rewards/margins": 2.6229156335194905, "rewards/rejected": -1.8756039937337239, "step": 4210 }, { "epoch": 0.22319985158879496, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11975637.0, "logits/rejected": -21169749.333333332, "logps/chosen": -55.716346740722656, "logps/rejected": -313.28955078125, "loss": 0.1861, "rewards/chosen": 0.2754003405570984, "rewards/margins": 2.6994447906812034, "rewards/rejected": -2.424044450124105, "step": 4211 }, { "epoch": 0.2232528555905971, "grad_norm": 62.75, "kl": 1.660675048828125, "learning_rate": 5e-07, "logits/chosen": -43413716.0, "logits/rejected": -58346716.0, "logps/chosen": -921.3206787109375, "logps/rejected": -357.636962890625, "loss": 0.2494, "rewards/chosen": 1.0070396661758423, "rewards/margins": 2.5697022676467896, "rewards/rejected": -1.5626626014709473, "step": 4212 }, { "epoch": 0.22330585959239924, "grad_norm": 65.0, "kl": 0.33318519592285156, "learning_rate": 5e-07, "logits/chosen": -36443628.8, "logits/rejected": -13350346.666666666, "logps/chosen": -266.9092041015625, "logps/rejected": -320.1673583984375, "loss": 0.3218, "rewards/chosen": 0.5244277000427247, "rewards/margins": 2.292935339609782, "rewards/rejected": -1.7685076395670574, "step": 4213 }, { "epoch": 0.22335886359420137, "grad_norm": 60.0, "kl": 1.7317390441894531, "learning_rate": 5e-07, "logits/chosen": -33429216.0, "logits/rejected": -13815258.0, "logps/chosen": -284.8603515625, "logps/rejected": -599.95947265625, "loss": 0.3845, "rewards/chosen": 0.35142040252685547, "rewards/margins": 2.481928825378418, "rewards/rejected": -2.1305084228515625, "step": 4214 }, { "epoch": 0.2234118675960035, "grad_norm": 61.0, "kl": 1.1078224182128906, "learning_rate": 5e-07, "logits/chosen": -9480201.333333334, "logits/rejected": -24694626.0, "logps/chosen": -477.8339436848958, "logps/rejected": -191.98683166503906, "loss": 0.3431, "rewards/chosen": 0.5396936337153116, "rewards/margins": 2.337604562441508, "rewards/rejected": -1.7979109287261963, "step": 4215 }, { "epoch": 0.22346487159780565, "grad_norm": 79.0, "kl": 2.8409500122070312, "learning_rate": 5e-07, "logits/chosen": -37628456.0, "logits/rejected": -18094464.0, "logps/chosen": -720.8886108398438, "logps/rejected": -184.7646484375, "loss": 0.3709, "rewards/chosen": 0.9734055399894714, "rewards/margins": 1.6803881525993347, "rewards/rejected": -0.7069826126098633, "step": 4216 }, { "epoch": 0.22351787559960776, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33722384.0, "logits/rejected": -40809392.0, "logps/chosen": -242.26896158854166, "logps/rejected": -300.005029296875, "loss": 0.2527, "rewards/chosen": 0.42613879839579266, "rewards/margins": 2.1157153288523354, "rewards/rejected": -1.6895765304565429, "step": 4217 }, { "epoch": 0.2235708796014099, "grad_norm": 48.0, "kl": 0.459014892578125, "learning_rate": 5e-07, "logits/chosen": -83431296.0, "logits/rejected": -36609962.666666664, "logps/chosen": -218.620751953125, "logps/rejected": -508.6215006510417, "loss": 0.3838, "rewards/chosen": -0.16019690036773682, "rewards/margins": 2.120085120201111, "rewards/rejected": -2.2802820205688477, "step": 4218 }, { "epoch": 0.22362388360321203, "grad_norm": 74.0, "kl": 0.017859935760498047, "learning_rate": 5e-07, "logits/chosen": -13957406.4, "logits/rejected": -3492981.3333333335, "logps/chosen": -346.7796630859375, "logps/rejected": -566.0770670572916, "loss": 0.4086, "rewards/chosen": -0.1949093222618103, "rewards/margins": 1.7394616723060607, "rewards/rejected": -1.934370994567871, "step": 4219 }, { "epoch": 0.22367688760501417, "grad_norm": 40.0, "kl": 0.00446319580078125, "learning_rate": 5e-07, "logits/chosen": -64957194.666666664, "logits/rejected": -9883456.8, "logps/chosen": -250.25604248046875, "logps/rejected": -227.770068359375, "loss": 0.2473, "rewards/chosen": 0.26519761482874554, "rewards/margins": 2.2838427106539405, "rewards/rejected": -2.018645095825195, "step": 4220 }, { "epoch": 0.2237298916068163, "grad_norm": 50.75, "kl": 0.5781307220458984, "learning_rate": 5e-07, "logits/chosen": -28457216.0, "logits/rejected": -58969504.0, "logps/chosen": -265.8673502604167, "logps/rejected": -111.389892578125, "loss": 0.4278, "rewards/chosen": 0.12210744619369507, "rewards/margins": 1.4897500872612, "rewards/rejected": -1.3676426410675049, "step": 4221 }, { "epoch": 0.22378289560861844, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 27844.25, "logits/rejected": -29869458.0, "logps/chosen": -104.3844985961914, "logps/rejected": -172.5032958984375, "loss": 0.3967, "rewards/chosen": 0.16072410345077515, "rewards/margins": 0.9931262135505676, "rewards/rejected": -0.8324021100997925, "step": 4222 }, { "epoch": 0.22383589961042058, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23234552.0, "logits/rejected": -35999472.0, "logps/chosen": -326.6856384277344, "logps/rejected": -331.10703822544644, "loss": 0.1938, "rewards/chosen": 1.3111084699630737, "rewards/margins": 3.2441290616989136, "rewards/rejected": -1.9330205917358398, "step": 4223 }, { "epoch": 0.22388890361222272, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6013272.0, "logits/rejected": 2205322.0, "logps/chosen": -99.89849853515625, "logps/rejected": -348.1314453125, "loss": 0.323, "rewards/chosen": -0.24413414796193442, "rewards/margins": 1.5787002484003703, "rewards/rejected": -1.8228343963623046, "step": 4224 }, { "epoch": 0.22394190761402485, "grad_norm": 37.25, "kl": 0.34287071228027344, "learning_rate": 5e-07, "logits/chosen": -24835285.333333332, "logits/rejected": -10398004.8, "logps/chosen": -198.1928507486979, "logps/rejected": -132.5700927734375, "loss": 0.2861, "rewards/chosen": 0.5200487772623698, "rewards/margins": 1.9567830721537272, "rewards/rejected": -1.4367342948913575, "step": 4225 }, { "epoch": 0.223994911615827, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31244640.0, "logits/rejected": -7426244.8, "logps/chosen": -136.52530924479166, "logps/rejected": -317.2328369140625, "loss": 0.2956, "rewards/chosen": -0.4844746987024943, "rewards/margins": 1.8977837165196736, "rewards/rejected": -2.382258415222168, "step": 4226 }, { "epoch": 0.22404791561762913, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19638116.0, "logits/rejected": -27552869.333333332, "logps/chosen": -100.95075225830078, "logps/rejected": -371.6910807291667, "loss": 0.2391, "rewards/chosen": 0.170196533203125, "rewards/margins": 2.114995002746582, "rewards/rejected": -1.944798469543457, "step": 4227 }, { "epoch": 0.22410091961943127, "grad_norm": 53.5, "kl": 0.7039718627929688, "learning_rate": 5e-07, "logits/chosen": 1426316.25, "logits/rejected": -1401368.0, "logps/chosen": -306.59088134765625, "logps/rejected": -334.1194661458333, "loss": 0.2934, "rewards/chosen": 0.26499253511428833, "rewards/margins": 1.8318578203519185, "rewards/rejected": -1.5668652852376301, "step": 4228 }, { "epoch": 0.2241539236212334, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3213208.8, "logits/rejected": -1865629.3333333333, "logps/chosen": -184.94619140625, "logps/rejected": -80.57982381184895, "loss": 0.2835, "rewards/chosen": 0.6969318389892578, "rewards/margins": 2.7334394454956055, "rewards/rejected": -2.0365076065063477, "step": 4229 }, { "epoch": 0.22420692762303554, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -980181.4375, "logits/rejected": -16759740.0, "logps/chosen": -192.60511779785156, "logps/rejected": -214.30247497558594, "loss": 0.3844, "rewards/chosen": 0.04886952042579651, "rewards/margins": 1.1913550198078156, "rewards/rejected": -1.142485499382019, "step": 4230 }, { "epoch": 0.22425993162483768, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42354624.0, "logits/rejected": -32358278.0, "logps/chosen": -430.6746012369792, "logps/rejected": -234.8496551513672, "loss": 0.3282, "rewards/chosen": 0.3836025396982829, "rewards/margins": 2.7597526709238687, "rewards/rejected": -2.376150131225586, "step": 4231 }, { "epoch": 0.22431293562663981, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16093083.0, "logits/rejected": -23600066.0, "logps/chosen": -307.38201904296875, "logps/rejected": -146.74880981445312, "loss": 0.2896, "rewards/chosen": 0.5832607746124268, "rewards/margins": 2.126786470413208, "rewards/rejected": -1.5435256958007812, "step": 4232 }, { "epoch": 0.22436593962844195, "grad_norm": 55.75, "kl": 0.40238189697265625, "learning_rate": 5e-07, "logits/chosen": -45984892.8, "logits/rejected": -13123037.333333334, "logps/chosen": -355.1886962890625, "logps/rejected": -205.6135050455729, "loss": 0.4179, "rewards/chosen": 0.20519800186157228, "rewards/margins": 0.8548034350077311, "rewards/rejected": -0.6496054331461588, "step": 4233 }, { "epoch": 0.2244189436302441, "grad_norm": 44.0, "kl": 0.8653745651245117, "learning_rate": 5e-07, "logits/chosen": -11953217.0, "logits/rejected": -28696464.0, "logps/chosen": -223.47457885742188, "logps/rejected": -173.91757202148438, "loss": 0.3832, "rewards/chosen": -0.018107205629348755, "rewards/margins": 1.2782709300518036, "rewards/rejected": -1.2963781356811523, "step": 4234 }, { "epoch": 0.22447194763204623, "grad_norm": 61.5, "kl": 0.0362701416015625, "learning_rate": 5e-07, "logits/chosen": 981098.8, "logits/rejected": -12420636.0, "logps/chosen": -278.756103515625, "logps/rejected": -546.2146809895834, "loss": 0.4001, "rewards/chosen": -0.12828311920166016, "rewards/margins": 2.1684292793273925, "rewards/rejected": -2.2967123985290527, "step": 4235 }, { "epoch": 0.22452495163384836, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36264396.8, "logits/rejected": -74049888.0, "logps/chosen": -344.42802734375, "logps/rejected": -370.0454915364583, "loss": 0.2967, "rewards/chosen": 0.2999658823013306, "rewards/margins": 2.9639582872390746, "rewards/rejected": -2.663992404937744, "step": 4236 }, { "epoch": 0.2245779556356505, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20478377.6, "logits/rejected": -23244365.333333332, "logps/chosen": -167.16409912109376, "logps/rejected": -285.528076171875, "loss": 0.3473, "rewards/chosen": 0.17136597633361816, "rewards/margins": 2.014013210932414, "rewards/rejected": -1.8426472345987956, "step": 4237 }, { "epoch": 0.22463095963745264, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91729000.0, "logits/rejected": -28206730.0, "logps/chosen": -256.836669921875, "logps/rejected": -273.04095458984375, "loss": 0.3046, "rewards/chosen": 0.2524135708808899, "rewards/margins": 2.2383511662483215, "rewards/rejected": -1.9859375953674316, "step": 4238 }, { "epoch": 0.22468396363925477, "grad_norm": 53.75, "kl": 0.5351409912109375, "learning_rate": 5e-07, "logits/chosen": -63312533.333333336, "logits/rejected": -43267718.4, "logps/chosen": -385.078369140625, "logps/rejected": -441.23095703125, "loss": 0.2502, "rewards/chosen": 0.31581828991572064, "rewards/margins": 2.4959555427233378, "rewards/rejected": -2.1801372528076173, "step": 4239 }, { "epoch": 0.2247369676410569, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11902948.8, "logits/rejected": -7258342.0, "logps/chosen": -352.5453369140625, "logps/rejected": -152.48335774739584, "loss": 0.4085, "rewards/chosen": -0.12157607078552246, "rewards/margins": 1.5915571053822835, "rewards/rejected": -1.713133176167806, "step": 4240 }, { "epoch": 0.22478997164285905, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20801754.0, "logits/rejected": -1882116.6666666667, "logps/chosen": -138.20948791503906, "logps/rejected": -324.2447916666667, "loss": 0.269, "rewards/chosen": 0.6257780194282532, "rewards/margins": 1.8716269930203755, "rewards/rejected": -1.2458489735921223, "step": 4241 }, { "epoch": 0.22484297564466116, "grad_norm": 62.0, "kl": 1.5258255004882812, "learning_rate": 5e-07, "logits/chosen": -29976250.0, "logits/rejected": -43686264.0, "logps/chosen": -342.11688232421875, "logps/rejected": -435.2615661621094, "loss": 0.2385, "rewards/chosen": 0.7353939414024353, "rewards/margins": 3.3688639998435974, "rewards/rejected": -2.633470058441162, "step": 4242 }, { "epoch": 0.2248959796464633, "grad_norm": 43.25, "kl": 0.7122879028320312, "learning_rate": 5e-07, "logits/chosen": -42881362.666666664, "logits/rejected": -24528913.6, "logps/chosen": -248.51200358072916, "logps/rejected": -272.92763671875, "loss": 0.2891, "rewards/chosen": -0.15924599766731262, "rewards/margins": 2.006568592786789, "rewards/rejected": -2.1658145904541017, "step": 4243 }, { "epoch": 0.22494898364826543, "grad_norm": 57.5, "kl": 0.6663970947265625, "learning_rate": 5e-07, "logits/chosen": -29125733.333333332, "logits/rejected": -45001030.4, "logps/chosen": -286.2533365885417, "logps/rejected": -331.96689453125, "loss": 0.3082, "rewards/chosen": 0.5035995642344157, "rewards/margins": 1.7038654486338296, "rewards/rejected": -1.200265884399414, "step": 4244 }, { "epoch": 0.22500198765006757, "grad_norm": 48.75, "kl": 1.8274202346801758, "learning_rate": 5e-07, "logits/chosen": -13978156.8, "logits/rejected": -66481637.333333336, "logps/chosen": -254.50478515625, "logps/rejected": -572.7246907552084, "loss": 0.2835, "rewards/chosen": 0.6678973197937011, "rewards/margins": 3.60522247950236, "rewards/rejected": -2.9373251597086587, "step": 4245 }, { "epoch": 0.2250549916518697, "grad_norm": 49.75, "kl": 0.9596328735351562, "learning_rate": 5e-07, "logits/chosen": -7484758.666666667, "logits/rejected": -2760536.0, "logps/chosen": -151.57145182291666, "logps/rejected": -171.7537841796875, "loss": 0.4015, "rewards/chosen": 0.2096067468325297, "rewards/margins": 1.3540648023287456, "rewards/rejected": -1.1444580554962158, "step": 4246 }, { "epoch": 0.22510799565367184, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2606731.5, "logits/rejected": -28139746.285714287, "logps/chosen": -0.5955982208251953, "logps/rejected": -412.32017299107144, "loss": 0.1973, "rewards/chosen": 0.007981300354003906, "rewards/margins": 1.934997831072126, "rewards/rejected": -1.9270165307181222, "step": 4247 }, { "epoch": 0.22516099965547398, "grad_norm": 75.0, "kl": 2.1265735626220703, "learning_rate": 5e-07, "logits/chosen": -18842772.0, "logits/rejected": 41388688.0, "logps/chosen": -904.0022786458334, "logps/rejected": -248.72119140625, "loss": 0.2833, "rewards/chosen": 0.6767709255218506, "rewards/margins": 2.5454617977142333, "rewards/rejected": -1.8686908721923827, "step": 4248 }, { "epoch": 0.22521400365727612, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7453939.5, "logits/rejected": -26081002.666666668, "logps/chosen": -200.40216064453125, "logps/rejected": -297.9482421875, "loss": 0.2209, "rewards/chosen": 0.23604507744312286, "rewards/margins": 2.1998790452877683, "rewards/rejected": -1.9638339678446453, "step": 4249 }, { "epoch": 0.22526700765907826, "grad_norm": 48.5, "kl": 0.07535171508789062, "learning_rate": 5e-07, "logits/chosen": -38754643.2, "logits/rejected": -12877649.333333334, "logps/chosen": -417.604931640625, "logps/rejected": -267.05491129557294, "loss": 0.3065, "rewards/chosen": 0.5171450614929199, "rewards/margins": 2.420874341328939, "rewards/rejected": -1.9037292798360188, "step": 4250 }, { "epoch": 0.2253200116608804, "grad_norm": 41.5, "kl": 0.24367046356201172, "learning_rate": 5e-07, "logits/chosen": -32373472.0, "logits/rejected": -33459526.0, "logps/chosen": -171.37311662946428, "logps/rejected": -448.29400634765625, "loss": 0.3411, "rewards/chosen": 0.5632606233869281, "rewards/margins": 3.057370730808803, "rewards/rejected": -2.494110107421875, "step": 4251 }, { "epoch": 0.22537301566268253, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25779678.0, "logits/rejected": -40373376.0, "logps/chosen": -247.02001953125, "logps/rejected": -204.77847290039062, "loss": 0.3555, "rewards/chosen": 0.10374145209789276, "rewards/margins": 1.5157267302274704, "rewards/rejected": -1.4119852781295776, "step": 4252 }, { "epoch": 0.22542601966448467, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15407122.666666666, "logits/rejected": -9508379.2, "logps/chosen": -224.4186808268229, "logps/rejected": -289.0330322265625, "loss": 0.264, "rewards/chosen": 0.32933255036671955, "rewards/margins": 2.0360384066899617, "rewards/rejected": -1.7067058563232422, "step": 4253 }, { "epoch": 0.2254790236662868, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69060720.0, "logits/rejected": -63545241.6, "logps/chosen": -297.053955078125, "logps/rejected": -295.1465576171875, "loss": 0.3021, "rewards/chosen": -0.28478749593098956, "rewards/margins": 1.703804651896159, "rewards/rejected": -1.9885921478271484, "step": 4254 }, { "epoch": 0.22553202766808894, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35125498.666666664, "logits/rejected": -19466985.6, "logps/chosen": -205.2001953125, "logps/rejected": -215.1251708984375, "loss": 0.3398, "rewards/chosen": -0.3055505156517029, "rewards/margins": 1.2957359910011292, "rewards/rejected": -1.6012865066528321, "step": 4255 }, { "epoch": 0.22558503166989108, "grad_norm": 38.5, "kl": 0.1496124267578125, "learning_rate": 5e-07, "logits/chosen": -21098222.0, "logits/rejected": -26296344.0, "logps/chosen": -181.5292205810547, "logps/rejected": -468.5196533203125, "loss": 0.2391, "rewards/chosen": 0.8166926503181458, "rewards/margins": 2.973961651325226, "rewards/rejected": -2.15726900100708, "step": 4256 }, { "epoch": 0.22563803567169322, "grad_norm": 60.25, "kl": 1.1898612976074219, "learning_rate": 5e-07, "logits/chosen": -33023520.0, "logits/rejected": -20151394.666666668, "logps/chosen": -332.4900146484375, "logps/rejected": -153.153076171875, "loss": 0.3477, "rewards/chosen": 0.4654675483703613, "rewards/margins": 1.9787385622660318, "rewards/rejected": -1.5132710138956706, "step": 4257 }, { "epoch": 0.22569103967349535, "grad_norm": 40.5, "kl": 0.35733890533447266, "learning_rate": 5e-07, "logits/chosen": 9796748.0, "logits/rejected": -1622887.0, "logps/chosen": -19.226163864135742, "logps/rejected": -438.0771484375, "loss": 0.2352, "rewards/chosen": 0.336615651845932, "rewards/margins": 2.4450717866420746, "rewards/rejected": -2.1084561347961426, "step": 4258 }, { "epoch": 0.2257440436752975, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38141504.0, "logits/rejected": 3721488.0, "logps/chosen": -328.197265625, "logps/rejected": -428.07635498046875, "loss": 0.2911, "rewards/chosen": 0.2031915783882141, "rewards/margins": 2.5369189381599426, "rewards/rejected": -2.3337273597717285, "step": 4259 }, { "epoch": 0.22579704767709963, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1032988.25, "logits/rejected": -36945816.0, "logps/chosen": -49.5896110534668, "logps/rejected": -418.1875305175781, "loss": 0.2954, "rewards/chosen": 0.12089840322732925, "rewards/margins": 2.3322569504380226, "rewards/rejected": -2.2113585472106934, "step": 4260 }, { "epoch": 0.22585005167890176, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17618732.8, "logits/rejected": -17842248.0, "logps/chosen": -245.5425537109375, "logps/rejected": -141.60868326822916, "loss": 0.3785, "rewards/chosen": 0.2529563903808594, "rewards/margins": 1.3148181438446045, "rewards/rejected": -1.0618617534637451, "step": 4261 }, { "epoch": 0.2259030556807039, "grad_norm": 59.25, "kl": 0.105926513671875, "learning_rate": 5e-07, "logits/chosen": -45588376.0, "logits/rejected": -42533712.0, "logps/chosen": -815.3451538085938, "logps/rejected": -260.8955078125, "loss": 0.282, "rewards/chosen": 0.555493950843811, "rewards/margins": 2.1321197748184204, "rewards/rejected": -1.5766258239746094, "step": 4262 }, { "epoch": 0.22595605968250604, "grad_norm": 61.0, "kl": 1.3580455780029297, "learning_rate": 5e-07, "logits/chosen": -2510056.0, "logits/rejected": -26128914.666666668, "logps/chosen": -171.6509521484375, "logps/rejected": -376.6211751302083, "loss": 0.4481, "rewards/chosen": -0.17778708934783935, "rewards/margins": 1.3090038855870565, "rewards/rejected": -1.4867909749348958, "step": 4263 }, { "epoch": 0.22600906368430818, "grad_norm": 70.0, "kl": 0.6147041320800781, "learning_rate": 5e-07, "logits/chosen": -27516089.6, "logits/rejected": -8515166.666666666, "logps/chosen": -338.375537109375, "logps/rejected": -230.63370768229166, "loss": 0.3959, "rewards/chosen": -0.007791731506586075, "rewards/margins": 1.5208180211484432, "rewards/rejected": -1.5286097526550293, "step": 4264 }, { "epoch": 0.2260620676861103, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72412069.33333333, "logits/rejected": 2639227.2, "logps/chosen": -257.38515218098956, "logps/rejected": -454.769384765625, "loss": 0.236, "rewards/chosen": 0.7080963452657064, "rewards/margins": 2.7460417111714683, "rewards/rejected": -2.037945365905762, "step": 4265 }, { "epoch": 0.22611507168791245, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25588794.0, "logits/rejected": 23206184.0, "logps/chosen": -364.03564453125, "logps/rejected": -354.8536376953125, "loss": 0.2719, "rewards/chosen": 0.46136248111724854, "rewards/margins": 2.2965612411499023, "rewards/rejected": -1.8351987600326538, "step": 4266 }, { "epoch": 0.2261680756897146, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4498016.0, "logits/rejected": -7327127.2, "logps/chosen": -265.78603108723956, "logps/rejected": -133.10361328125, "loss": 0.2605, "rewards/chosen": 0.5201317071914673, "rewards/margins": 2.3078739404678346, "rewards/rejected": -1.7877422332763673, "step": 4267 }, { "epoch": 0.2262210796915167, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34531344.0, "logits/rejected": -346019.75, "logps/chosen": -236.61520385742188, "logps/rejected": -140.8776092529297, "loss": 0.3269, "rewards/chosen": 0.5530683994293213, "rewards/margins": 1.5618089437484741, "rewards/rejected": -1.0087405443191528, "step": 4268 }, { "epoch": 0.22627408369331883, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23257573.333333332, "logits/rejected": -2257934.4, "logps/chosen": -297.55702718098956, "logps/rejected": -154.745361328125, "loss": 0.2966, "rewards/chosen": -0.14978242913881937, "rewards/margins": 1.612710322936376, "rewards/rejected": -1.7624927520751954, "step": 4269 }, { "epoch": 0.22632708769512097, "grad_norm": 50.25, "kl": 0.28295135498046875, "learning_rate": 5e-07, "logits/chosen": -28792290.666666668, "logits/rejected": -20661908.8, "logps/chosen": -255.1926066080729, "logps/rejected": -237.6901611328125, "loss": 0.3601, "rewards/chosen": 0.016428634524345398, "rewards/margins": 1.3859938353300094, "rewards/rejected": -1.369565200805664, "step": 4270 }, { "epoch": 0.2263800916969231, "grad_norm": 46.25, "kl": 0.33768463134765625, "learning_rate": 5e-07, "logits/chosen": -32145754.0, "logits/rejected": -29076500.0, "logps/chosen": -473.68658447265625, "logps/rejected": -145.32252502441406, "loss": 0.267, "rewards/chosen": 0.7526934146881104, "rewards/margins": 2.2606589794158936, "rewards/rejected": -1.5079655647277832, "step": 4271 }, { "epoch": 0.22643309569872525, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3568360.5, "logits/rejected": -37661174.85714286, "logps/chosen": -418.09515380859375, "logps/rejected": -215.57550920758928, "loss": 0.1981, "rewards/chosen": 0.8574615716934204, "rewards/margins": 2.417763352394104, "rewards/rejected": -1.5603017807006836, "step": 4272 }, { "epoch": 0.22648609970052738, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4166109.0, "logits/rejected": -9457561.333333334, "logps/chosen": -201.23165893554688, "logps/rejected": -100.388916015625, "loss": 0.2927, "rewards/chosen": 0.12627562880516052, "rewards/margins": 1.395121345917384, "rewards/rejected": -1.2688457171122234, "step": 4273 }, { "epoch": 0.22653910370232952, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19689354.666666668, "logits/rejected": -7616528.0, "logps/chosen": -182.98221842447916, "logps/rejected": -232.08010864257812, "loss": 0.3633, "rewards/chosen": 0.2190018097559611, "rewards/margins": 2.3266525665918985, "rewards/rejected": -2.1076507568359375, "step": 4274 }, { "epoch": 0.22659210770413166, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21264236.8, "logits/rejected": -37770885.333333336, "logps/chosen": -339.1281005859375, "logps/rejected": -418.8467610677083, "loss": 0.3648, "rewards/chosen": 0.12059440612792968, "rewards/margins": 2.4715436935424804, "rewards/rejected": -2.350949287414551, "step": 4275 }, { "epoch": 0.2266451117059338, "grad_norm": 48.75, "kl": 0.9628429412841797, "learning_rate": 5e-07, "logits/chosen": -11515185.333333334, "logits/rejected": -37460628.0, "logps/chosen": -223.62015787760416, "logps/rejected": -374.89996337890625, "loss": 0.4007, "rewards/chosen": 0.1985448201497396, "rewards/margins": 2.7400949796040854, "rewards/rejected": -2.5415501594543457, "step": 4276 }, { "epoch": 0.22669811570773593, "grad_norm": 60.75, "kl": 1.954838752746582, "learning_rate": 5e-07, "logits/chosen": -24710896.0, "logits/rejected": -9504816.0, "logps/chosen": -610.5025024414062, "logps/rejected": -329.1269836425781, "loss": 0.3538, "rewards/chosen": 0.6019085049629211, "rewards/margins": 1.8504806160926819, "rewards/rejected": -1.2485721111297607, "step": 4277 }, { "epoch": 0.22675111970953807, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66400576.0, "logits/rejected": -52780860.8, "logps/chosen": -379.0735677083333, "logps/rejected": -449.194677734375, "loss": 0.2825, "rewards/chosen": 0.5022725264231364, "rewards/margins": 2.30171004931132, "rewards/rejected": -1.7994375228881836, "step": 4278 }, { "epoch": 0.2268041237113402, "grad_norm": 52.0, "kl": 0.12604045867919922, "learning_rate": 5e-07, "logits/chosen": -9072492.0, "logits/rejected": -21657332.8, "logps/chosen": -225.58964029947916, "logps/rejected": -209.85458984375, "loss": 0.2351, "rewards/chosen": 0.718218723932902, "rewards/margins": 2.7324784437815346, "rewards/rejected": -2.0142597198486327, "step": 4279 }, { "epoch": 0.22685712771314234, "grad_norm": 88.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38411914.666666664, "logits/rejected": -66096404.0, "logps/chosen": -475.4725341796875, "logps/rejected": -453.62420654296875, "loss": 0.3356, "rewards/chosen": 0.4289561112721761, "rewards/margins": 2.3132150967915854, "rewards/rejected": -1.8842589855194092, "step": 4280 }, { "epoch": 0.22691013171494448, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52146681.6, "logits/rejected": 3224115.6666666665, "logps/chosen": -192.76260986328126, "logps/rejected": -130.76261393229166, "loss": 0.3323, "rewards/chosen": 0.41776285171508787, "rewards/margins": 1.9010946909586588, "rewards/rejected": -1.483331839243571, "step": 4281 }, { "epoch": 0.22696313571674662, "grad_norm": 83.5, "kl": 0.25952911376953125, "learning_rate": 5e-07, "logits/chosen": -28497237.333333332, "logits/rejected": -21038016.0, "logps/chosen": -435.6415608723958, "logps/rejected": -333.4407958984375, "loss": 0.3308, "rewards/chosen": 0.7823789914449056, "rewards/margins": 1.4896591504414878, "rewards/rejected": -0.707280158996582, "step": 4282 }, { "epoch": 0.22701613971854875, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28371020.8, "logits/rejected": 11598848.0, "logps/chosen": -337.32822265625, "logps/rejected": -811.2627766927084, "loss": 0.3537, "rewards/chosen": 0.22919983863830568, "rewards/margins": 2.7649670759836833, "rewards/rejected": -2.5357672373453775, "step": 4283 }, { "epoch": 0.2270691437203509, "grad_norm": 60.25, "kl": 1.7998027801513672, "learning_rate": 5e-07, "logits/chosen": -14498941.333333334, "logits/rejected": 7987158.4, "logps/chosen": -787.2461751302084, "logps/rejected": -172.6110595703125, "loss": 0.1779, "rewards/chosen": 1.0952779452006023, "rewards/margins": 3.3530923525492353, "rewards/rejected": -2.257814407348633, "step": 4284 }, { "epoch": 0.22712214772215303, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14596523.42857143, "logits/rejected": -48870804.0, "logps/chosen": -217.91671316964286, "logps/rejected": -434.3951416015625, "loss": 0.5086, "rewards/chosen": -0.23931680406842912, "rewards/margins": 1.4406789711543493, "rewards/rejected": -1.6799957752227783, "step": 4285 }, { "epoch": 0.22717515172395517, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61962960.0, "logits/rejected": -14824314.666666666, "logps/chosen": -193.90618896484375, "logps/rejected": -307.84063720703125, "loss": 0.2134, "rewards/chosen": 0.25660666823387146, "rewards/margins": 2.130222409963608, "rewards/rejected": -1.8736157417297363, "step": 4286 }, { "epoch": 0.2272281557257573, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15326752.0, "logits/rejected": -50417881.6, "logps/chosen": -899.7711588541666, "logps/rejected": -348.26669921875, "loss": 0.2629, "rewards/chosen": 0.7174042065938314, "rewards/margins": 2.2918917973836264, "rewards/rejected": -1.5744875907897948, "step": 4287 }, { "epoch": 0.22728115972755944, "grad_norm": 57.0, "kl": 0.7315292358398438, "learning_rate": 5e-07, "logits/chosen": -2939207.5, "logits/rejected": -33942148.0, "logps/chosen": -286.63116455078125, "logps/rejected": -197.40185546875, "loss": 0.3949, "rewards/chosen": 0.11781008541584015, "rewards/margins": 1.1234985142946243, "rewards/rejected": -1.0056884288787842, "step": 4288 }, { "epoch": 0.22733416372936158, "grad_norm": 56.75, "kl": 1.2127604484558105, "learning_rate": 5e-07, "logits/chosen": -61599408.0, "logits/rejected": 168852.325, "logps/chosen": -290.29433186848956, "logps/rejected": -301.5271484375, "loss": 0.3404, "rewards/chosen": 0.23302382230758667, "rewards/margins": 1.4235108256340028, "rewards/rejected": -1.190487003326416, "step": 4289 }, { "epoch": 0.22738716773116371, "grad_norm": 45.25, "kl": 0.6989631652832031, "learning_rate": 5e-07, "logits/chosen": -11675511.0, "logits/rejected": 6062328.0, "logps/chosen": -469.96087646484375, "logps/rejected": -126.85758972167969, "loss": 0.2381, "rewards/chosen": 1.177947998046875, "rewards/margins": 2.889755368232727, "rewards/rejected": -1.711807370185852, "step": 4290 }, { "epoch": 0.22744017173296585, "grad_norm": 70.0, "kl": 2.407984733581543, "learning_rate": 5e-07, "logits/chosen": -30020842.666666668, "logits/rejected": -897861.3125, "logps/chosen": -407.5043131510417, "logps/rejected": -56.18683624267578, "loss": 0.3933, "rewards/chosen": 0.754241148630778, "rewards/margins": 1.3516164819399514, "rewards/rejected": -0.5973753333091736, "step": 4291 }, { "epoch": 0.227493175734768, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9973068.8, "logits/rejected": -46947845.333333336, "logps/chosen": -248.42783203125, "logps/rejected": -323.451171875, "loss": 0.3069, "rewards/chosen": 0.26533875465393064, "rewards/margins": 2.5858540693918863, "rewards/rejected": -2.3205153147379556, "step": 4292 }, { "epoch": 0.2275461797365701, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19283184.0, "logits/rejected": -43836994.666666664, "logps/chosen": -246.959228515625, "logps/rejected": -370.4039306640625, "loss": 0.3624, "rewards/chosen": -0.04370390772819519, "rewards/margins": 2.5330131510893503, "rewards/rejected": -2.5767170588175454, "step": 4293 }, { "epoch": 0.22759918373837223, "grad_norm": 50.25, "kl": 0.17080307006835938, "learning_rate": 5e-07, "logits/chosen": -33536309.333333332, "logits/rejected": -26256067.2, "logps/chosen": -254.246337890625, "logps/rejected": -460.549365234375, "loss": 0.2151, "rewards/chosen": 0.6143173376719157, "rewards/margins": 2.799625031153361, "rewards/rejected": -2.1853076934814455, "step": 4294 }, { "epoch": 0.22765218774017437, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2850218.5, "logits/rejected": -9519687.333333334, "logps/chosen": -59.04118347167969, "logps/rejected": -100.70554606119792, "loss": 0.2809, "rewards/chosen": 0.6096603274345398, "rewards/margins": 1.813744326432546, "rewards/rejected": -1.204083998998006, "step": 4295 }, { "epoch": 0.2277051917419765, "grad_norm": 57.75, "kl": 0.8003654479980469, "learning_rate": 5e-07, "logits/chosen": -735500.3125, "logits/rejected": -47726956.0, "logps/chosen": -204.75051879882812, "logps/rejected": -167.4417724609375, "loss": 0.4501, "rewards/chosen": -0.05672338604927063, "rewards/margins": 0.5290052592754364, "rewards/rejected": -0.585728645324707, "step": 4296 }, { "epoch": 0.22775819574377865, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -38064704.0, "logps/rejected": -362.02532958984375, "loss": 0.1862, "rewards/rejected": -1.8732789754867554, "step": 4297 }, { "epoch": 0.22781119974558078, "grad_norm": 51.0, "kl": 0.13715362548828125, "learning_rate": 5e-07, "logits/chosen": -13363989.0, "logits/rejected": -46837808.0, "logps/chosen": -174.25927734375, "logps/rejected": -370.46502685546875, "loss": 0.3172, "rewards/chosen": 0.6110836267471313, "rewards/margins": 2.0417189598083496, "rewards/rejected": -1.4306353330612183, "step": 4298 }, { "epoch": 0.22786420374738292, "grad_norm": 108.5, "kl": 3.455150604248047, "learning_rate": 5e-07, "logits/chosen": -44395110.4, "logits/rejected": -35699544.0, "logps/chosen": -812.0103515625, "logps/rejected": -486.9831949869792, "loss": 0.3588, "rewards/chosen": 0.5441110134124756, "rewards/margins": 2.978734095891317, "rewards/rejected": -2.4346230824788413, "step": 4299 }, { "epoch": 0.22791720774918506, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27030286.0, "logits/rejected": -26136742.0, "logps/chosen": -147.74139404296875, "logps/rejected": -279.0033264160156, "loss": 0.3202, "rewards/chosen": 0.31486549973487854, "rewards/margins": 2.62993660569191, "rewards/rejected": -2.3150711059570312, "step": 4300 }, { "epoch": 0.2279702117509872, "grad_norm": 93.0, "kl": 1.8335113525390625, "learning_rate": 5e-07, "logits/chosen": -24823417.6, "logits/rejected": -71408938.66666667, "logps/chosen": -406.0557373046875, "logps/rejected": -213.70426432291666, "loss": 0.4326, "rewards/chosen": -0.0340386152267456, "rewards/margins": 1.4623905102411907, "rewards/rejected": -1.4964291254679363, "step": 4301 }, { "epoch": 0.22802321575278933, "grad_norm": 43.5, "kl": 0.382110595703125, "learning_rate": 5e-07, "logits/chosen": -13828620.0, "logits/rejected": -5753962.0, "logps/chosen": -111.127685546875, "logps/rejected": -286.72509765625, "loss": 0.2598, "rewards/chosen": 0.22084704041481018, "rewards/margins": 2.0304974615573883, "rewards/rejected": -1.8096504211425781, "step": 4302 }, { "epoch": 0.22807621975459147, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14919225.6, "logits/rejected": -30705082.666666668, "logps/chosen": -297.770361328125, "logps/rejected": -544.6681315104166, "loss": 0.2653, "rewards/chosen": 0.5957925319671631, "rewards/margins": 3.000187317530314, "rewards/rejected": -2.404394785563151, "step": 4303 }, { "epoch": 0.2281292237563936, "grad_norm": 57.0, "kl": 0.0009250640869140625, "learning_rate": 5e-07, "logits/chosen": -11939238.666666666, "logits/rejected": 28358.9, "logps/chosen": -104.23704020182292, "logps/rejected": -366.724462890625, "loss": 0.2997, "rewards/chosen": 0.3145304520924886, "rewards/margins": 1.746520217259725, "rewards/rejected": -1.4319897651672364, "step": 4304 }, { "epoch": 0.22818222775819574, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13601631.0, "logits/rejected": -30140152.0, "logps/chosen": -331.0198974609375, "logps/rejected": -473.5720520019531, "loss": 0.2673, "rewards/chosen": 0.3266223669052124, "rewards/margins": 2.8389028310775757, "rewards/rejected": -2.5122804641723633, "step": 4305 }, { "epoch": 0.22823523175999788, "grad_norm": 51.75, "kl": 0.4629364013671875, "learning_rate": 5e-07, "logits/chosen": -29431945.6, "logits/rejected": -44249429.333333336, "logps/chosen": -247.465380859375, "logps/rejected": -513.2199300130209, "loss": 0.2602, "rewards/chosen": 0.5153931140899658, "rewards/margins": 3.9136605421702066, "rewards/rejected": -3.3982674280802407, "step": 4306 }, { "epoch": 0.22828823576180002, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11373334.0, "logits/rejected": -23113784.0, "logps/chosen": -323.4044494628906, "logps/rejected": -109.88536071777344, "loss": 0.3156, "rewards/chosen": 0.4355226457118988, "rewards/margins": 1.820973128080368, "rewards/rejected": -1.3854504823684692, "step": 4307 }, { "epoch": 0.22834123976360216, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9233493.6, "logits/rejected": -83741765.33333333, "logps/chosen": -141.3032958984375, "logps/rejected": -1183.5035807291667, "loss": 0.3645, "rewards/chosen": -0.1833634614944458, "rewards/margins": 4.654081161816914, "rewards/rejected": -4.83744462331136, "step": 4308 }, { "epoch": 0.2283942437654043, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38824732.0, "logits/rejected": -6944949.0, "logps/chosen": -255.20294189453125, "logps/rejected": -253.01808166503906, "loss": 0.3644, "rewards/chosen": 0.04744162783026695, "rewards/margins": 1.4948874972760677, "rewards/rejected": -1.4474458694458008, "step": 4309 }, { "epoch": 0.22844724776720643, "grad_norm": 51.5, "kl": 0.32973480224609375, "learning_rate": 5e-07, "logits/chosen": -29758451.2, "logits/rejected": -9315414.666666666, "logps/chosen": -182.57783203125, "logps/rejected": -325.15887451171875, "loss": 0.3703, "rewards/chosen": 0.20079851150512695, "rewards/margins": 1.9156812032063801, "rewards/rejected": -1.7148826917012532, "step": 4310 }, { "epoch": 0.22850025176900857, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28284128.0, "logits/rejected": -95301552.0, "logps/chosen": -220.62355041503906, "logps/rejected": -504.88671875, "loss": 0.306, "rewards/chosen": -0.02599439211189747, "rewards/margins": 2.6206404250115156, "rewards/rejected": -2.646634817123413, "step": 4311 }, { "epoch": 0.2285532557708107, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42092762.666666664, "logits/rejected": -46393030.4, "logps/chosen": -409.7567952473958, "logps/rejected": -260.91533203125, "loss": 0.1965, "rewards/chosen": 1.123687744140625, "rewards/margins": 2.957958221435547, "rewards/rejected": -1.8342704772949219, "step": 4312 }, { "epoch": 0.22860625977261284, "grad_norm": 50.5, "kl": 0.24828338623046875, "learning_rate": 5e-07, "logits/chosen": -27429876.0, "logits/rejected": 15523371.0, "logps/chosen": -264.0975036621094, "logps/rejected": -390.6390380859375, "loss": 0.3773, "rewards/chosen": 0.24102868139743805, "rewards/margins": 1.2502658516168594, "rewards/rejected": -1.0092371702194214, "step": 4313 }, { "epoch": 0.22865926377441498, "grad_norm": 54.0, "kl": 0.6779403686523438, "learning_rate": 5e-07, "logits/chosen": -47741048.0, "logits/rejected": -33942544.0, "logps/chosen": -446.84332275390625, "logps/rejected": -318.4398193359375, "loss": 0.293, "rewards/chosen": 0.3951606750488281, "rewards/margins": 2.6231932640075684, "rewards/rejected": -2.2280325889587402, "step": 4314 }, { "epoch": 0.22871226777621712, "grad_norm": 46.25, "kl": 0.00579071044921875, "learning_rate": 5e-07, "logits/chosen": -33246762.666666668, "logits/rejected": -45593286.4, "logps/chosen": -434.5865478515625, "logps/rejected": -396.1488037109375, "loss": 0.2047, "rewards/chosen": 0.45833130677541095, "rewards/margins": 3.067458732922872, "rewards/rejected": -2.609127426147461, "step": 4315 }, { "epoch": 0.22876527177801925, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22852134.0, "logits/rejected": -47868330.666666664, "logps/chosen": -68.3935546875, "logps/rejected": -342.7859700520833, "loss": 0.2201, "rewards/chosen": -0.018636606633663177, "rewards/margins": 2.063942529261112, "rewards/rejected": -2.0825791358947754, "step": 4316 }, { "epoch": 0.2288182757798214, "grad_norm": 63.75, "kl": 0.3334236145019531, "learning_rate": 5e-07, "logits/chosen": -31245557.333333332, "logits/rejected": 9657434.0, "logps/chosen": -319.2872721354167, "logps/rejected": -306.31622314453125, "loss": 0.3747, "rewards/chosen": 0.31299487749735516, "rewards/margins": 2.194028298060099, "rewards/rejected": -1.8810334205627441, "step": 4317 }, { "epoch": 0.2288712797816235, "grad_norm": 133.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58482184.0, "logits/rejected": 20691066.666666668, "logps/chosen": -426.38311767578125, "logps/rejected": -351.0147705078125, "loss": 0.351, "rewards/chosen": -0.7270050644874573, "rewards/margins": 0.4932210644086201, "rewards/rejected": -1.2202261288960774, "step": 4318 }, { "epoch": 0.22892428378342564, "grad_norm": 74.5, "kl": 1.31097412109375, "learning_rate": 5e-07, "logits/chosen": -23250524.8, "logits/rejected": -11498368.0, "logps/chosen": -338.536474609375, "logps/rejected": -117.3421122233073, "loss": 0.3188, "rewards/chosen": 0.5833616256713867, "rewards/margins": 2.337657769521077, "rewards/rejected": -1.7542961438496907, "step": 4319 }, { "epoch": 0.22897728778522777, "grad_norm": 57.25, "kl": 2.557973861694336, "learning_rate": 5e-07, "logits/chosen": -84687718.4, "logits/rejected": -79090458.66666667, "logps/chosen": -543.200146484375, "logps/rejected": -305.1526692708333, "loss": 0.3597, "rewards/chosen": 0.609257173538208, "rewards/margins": 2.099139165878296, "rewards/rejected": -1.489881992340088, "step": 4320 }, { "epoch": 0.2290302917870299, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42751564.0, "logits/rejected": -15051745.333333334, "logps/chosen": -157.6849822998047, "logps/rejected": -440.0521647135417, "loss": 0.2018, "rewards/chosen": 0.032566070556640625, "rewards/margins": 2.5067286491394043, "rewards/rejected": -2.4741625785827637, "step": 4321 }, { "epoch": 0.22908329578883205, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3089998.0, "logits/rejected": -17156128.0, "logps/chosen": -115.13580322265625, "logps/rejected": -393.081787109375, "loss": 0.2931, "rewards/chosen": -0.08474999666213989, "rewards/margins": 1.9618602633476256, "rewards/rejected": -2.0466102600097655, "step": 4322 }, { "epoch": 0.22913629979063418, "grad_norm": 56.5, "kl": 0.008209228515625, "learning_rate": 5e-07, "logits/chosen": -30694517.333333332, "logits/rejected": -65366764.0, "logps/chosen": -572.1185709635416, "logps/rejected": -433.8470458984375, "loss": 0.2483, "rewards/chosen": 1.1202363173166912, "rewards/margins": 3.598283449808757, "rewards/rejected": -2.4780471324920654, "step": 4323 }, { "epoch": 0.22918930379243632, "grad_norm": 58.75, "kl": 0.37664794921875, "learning_rate": 5e-07, "logits/chosen": -15904665.6, "logits/rejected": -4928694.666666667, "logps/chosen": -333.1412109375, "logps/rejected": -326.53208414713544, "loss": 0.4185, "rewards/chosen": -0.3115811347961426, "rewards/margins": 1.6951444943745932, "rewards/rejected": -2.006725629170736, "step": 4324 }, { "epoch": 0.22924230779423846, "grad_norm": 39.0, "kl": 0.03176689147949219, "learning_rate": 5e-07, "logits/chosen": -31275818.0, "logits/rejected": -16662823.0, "logps/chosen": -201.0371856689453, "logps/rejected": -397.579833984375, "loss": 0.2771, "rewards/chosen": 0.33759593963623047, "rewards/margins": 2.609023094177246, "rewards/rejected": -2.2714271545410156, "step": 4325 }, { "epoch": 0.2292953117960406, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 60203354.666666664, "logits/rejected": 46598972.8, "logps/chosen": -102.0269266764323, "logps/rejected": -280.4228515625, "loss": 0.3005, "rewards/chosen": 0.2238054076830546, "rewards/margins": 1.8484739104906718, "rewards/rejected": -1.624668502807617, "step": 4326 }, { "epoch": 0.22934831579784273, "grad_norm": 52.25, "kl": 1.1115398406982422, "learning_rate": 5e-07, "logits/chosen": -36554100.0, "logits/rejected": 1624159.25, "logps/chosen": -352.9222106933594, "logps/rejected": -278.6348876953125, "loss": 0.2788, "rewards/chosen": 0.8935074210166931, "rewards/margins": 2.136822760105133, "rewards/rejected": -1.24331533908844, "step": 4327 }, { "epoch": 0.22940131979964487, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10439602.0, "logits/rejected": -4428358.4, "logps/chosen": -72.41982523600261, "logps/rejected": -352.3444091796875, "loss": 0.3457, "rewards/chosen": -0.15712000926335654, "rewards/margins": 1.1066981593767802, "rewards/rejected": -1.2638181686401366, "step": 4328 }, { "epoch": 0.229454323801447, "grad_norm": 49.75, "kl": 1.0359821319580078, "learning_rate": 5e-07, "logits/chosen": -12654838.4, "logits/rejected": -18796701.333333332, "logps/chosen": -245.66533203125, "logps/rejected": -161.19451904296875, "loss": 0.4153, "rewards/chosen": 0.12752275466918944, "rewards/margins": 1.300093936920166, "rewards/rejected": -1.1725711822509766, "step": 4329 }, { "epoch": 0.22950732780324914, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24046650.666666668, "logits/rejected": -8349542.4, "logps/chosen": -144.47903442382812, "logps/rejected": -188.249072265625, "loss": 0.3009, "rewards/chosen": -0.22648398081461588, "rewards/margins": 1.6792989095052082, "rewards/rejected": -1.9057828903198242, "step": 4330 }, { "epoch": 0.22956033180505128, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 457908.0, "logits/rejected": -10240238.4, "logps/chosen": -251.8277791341146, "logps/rejected": -332.0725830078125, "loss": 0.2871, "rewards/chosen": -0.08256810903549194, "rewards/margins": 1.790006124973297, "rewards/rejected": -1.872574234008789, "step": 4331 }, { "epoch": 0.22961333580685342, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16503273.0, "logits/rejected": -27835060.0, "logps/chosen": -179.02505493164062, "logps/rejected": -185.5787811279297, "loss": 0.2679, "rewards/chosen": 0.3353964388370514, "rewards/margins": 2.5653566420078278, "rewards/rejected": -2.2299602031707764, "step": 4332 }, { "epoch": 0.22966633980865556, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7661938.0, "logits/rejected": -27527124.0, "logps/chosen": -188.73704528808594, "logps/rejected": -251.86007690429688, "loss": 0.2709, "rewards/chosen": 0.5899561643600464, "rewards/margins": 2.5146549940109253, "rewards/rejected": -1.924698829650879, "step": 4333 }, { "epoch": 0.2297193438104577, "grad_norm": 50.25, "kl": 0.5968666076660156, "learning_rate": 5e-07, "logits/chosen": -63687728.0, "logits/rejected": -22935668.0, "logps/chosen": -344.4105224609375, "logps/rejected": -394.54925537109375, "loss": 0.2484, "rewards/chosen": 0.7304651737213135, "rewards/margins": 2.845259189605713, "rewards/rejected": -2.1147940158843994, "step": 4334 }, { "epoch": 0.22977234781225983, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27395160.0, "logits/rejected": -45054168.0, "logps/chosen": -316.0050964355469, "logps/rejected": -789.06005859375, "loss": 0.264, "rewards/chosen": 0.5983992218971252, "rewards/margins": 3.616694748401642, "rewards/rejected": -3.0182955265045166, "step": 4335 }, { "epoch": 0.22982535181406197, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18847768.0, "logits/rejected": -25820883.2, "logps/chosen": -107.65498860677083, "logps/rejected": -332.6831787109375, "loss": 0.3275, "rewards/chosen": -0.10718193650245667, "rewards/margins": 1.4590473115444182, "rewards/rejected": -1.566229248046875, "step": 4336 }, { "epoch": 0.2298783558158641, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45574092.0, "logits/rejected": -4457064.0, "logps/chosen": -376.797607421875, "logps/rejected": -189.6220906575521, "loss": 0.2652, "rewards/chosen": 0.05410308390855789, "rewards/margins": 2.009516266485055, "rewards/rejected": -1.9554131825764973, "step": 4337 }, { "epoch": 0.22993135981766624, "grad_norm": 47.0, "kl": 2.1876373291015625, "learning_rate": 5e-07, "logits/chosen": -7554268.8, "logits/rejected": -39666053.333333336, "logps/chosen": -254.12470703125, "logps/rejected": -189.8212890625, "loss": 0.3204, "rewards/chosen": 0.6243336200714111, "rewards/margins": 2.68576709429423, "rewards/rejected": -2.061433474222819, "step": 4338 }, { "epoch": 0.22998436381946838, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55302794.666666664, "logits/rejected": -52929920.0, "logps/chosen": -458.7155354817708, "logps/rejected": -296.775390625, "loss": 0.4406, "rewards/chosen": -0.15057003498077393, "rewards/margins": 1.4376400709152222, "rewards/rejected": -1.588210105895996, "step": 4339 }, { "epoch": 0.23003736782127052, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50948544.0, "logits/rejected": -28389828.0, "logps/chosen": -387.034423828125, "logps/rejected": -322.9364318847656, "loss": 0.4244, "rewards/chosen": -0.12089474995930989, "rewards/margins": 1.6838874022165935, "rewards/rejected": -1.8047821521759033, "step": 4340 }, { "epoch": 0.23009037182307265, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40153755.428571425, "logits/rejected": 1771928.0, "logps/chosen": -237.73123604910714, "logps/rejected": -28.30474090576172, "loss": 0.49, "rewards/chosen": -0.02287478106362479, "rewards/margins": 0.3558230016912733, "rewards/rejected": -0.37869778275489807, "step": 4341 }, { "epoch": 0.2301433758248748, "grad_norm": 50.5, "kl": 0.45813751220703125, "learning_rate": 5e-07, "logits/chosen": -70692512.0, "logits/rejected": -29018934.4, "logps/chosen": -277.3934733072917, "logps/rejected": -223.982421875, "loss": 0.2741, "rewards/chosen": 0.3481786251068115, "rewards/margins": 2.089476156234741, "rewards/rejected": -1.7412975311279297, "step": 4342 }, { "epoch": 0.23019637982667693, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22638170.666666668, "logits/rejected": -12617689.6, "logps/chosen": -312.4428304036458, "logps/rejected": -153.946337890625, "loss": 0.2929, "rewards/chosen": 0.2292714516321818, "rewards/margins": 1.703319493929545, "rewards/rejected": -1.4740480422973632, "step": 4343 }, { "epoch": 0.23024938382847904, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27381666.666666668, "logits/rejected": -2859468.0, "logps/chosen": -396.7311197916667, "logps/rejected": -355.1654296875, "loss": 0.2662, "rewards/chosen": 0.5586420694986979, "rewards/margins": 2.5258034388224284, "rewards/rejected": -1.9671613693237304, "step": 4344 }, { "epoch": 0.23030238783028117, "grad_norm": 54.0, "kl": 0.5151615142822266, "learning_rate": 5e-07, "logits/chosen": -41356856.0, "logits/rejected": -73504448.0, "logps/chosen": -174.724365234375, "logps/rejected": -479.02642822265625, "loss": 0.3085, "rewards/chosen": -0.01620183140039444, "rewards/margins": 2.4279457554221153, "rewards/rejected": -2.4441475868225098, "step": 4345 }, { "epoch": 0.2303553918320833, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31379093.333333332, "logits/rejected": -12811162.4, "logps/chosen": -215.69989013671875, "logps/rejected": -153.4169677734375, "loss": 0.3121, "rewards/chosen": 0.11073457201321919, "rewards/margins": 1.6492664476235708, "rewards/rejected": -1.5385318756103517, "step": 4346 }, { "epoch": 0.23040839583388545, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5602717.5, "logits/rejected": -3917381.5, "logps/chosen": -162.5619659423828, "logps/rejected": -93.04708862304688, "loss": 0.4008, "rewards/chosen": 0.16236276924610138, "rewards/margins": 0.8369665890932083, "rewards/rejected": -0.6746038198471069, "step": 4347 }, { "epoch": 0.23046139983568759, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58032636.0, "logits/rejected": -6177159.0, "logps/chosen": -202.66665649414062, "logps/rejected": -366.4780578613281, "loss": 0.3564, "rewards/chosen": -0.15675488114356995, "rewards/margins": 1.4296419322490692, "rewards/rejected": -1.5863968133926392, "step": 4348 }, { "epoch": 0.23051440383748972, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17409110.666666668, "logits/rejected": -41119139.2, "logps/chosen": -70.55305480957031, "logps/rejected": -121.93662109375, "loss": 0.3226, "rewards/chosen": -0.3395702044169108, "rewards/margins": 1.594983704884847, "rewards/rejected": -1.9345539093017579, "step": 4349 }, { "epoch": 0.23056740783929186, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1857422.875, "logits/rejected": -21371750.0, "logps/chosen": -82.40784454345703, "logps/rejected": -212.05311584472656, "loss": 0.2595, "rewards/chosen": 0.5239828824996948, "rewards/margins": 2.4840668439865112, "rewards/rejected": -1.9600839614868164, "step": 4350 }, { "epoch": 0.230620411841094, "grad_norm": 50.5, "kl": 1.4094772338867188, "learning_rate": 5e-07, "logits/chosen": -49083973.333333336, "logits/rejected": -7858207.2, "logps/chosen": -304.4209798177083, "logps/rejected": -153.370947265625, "loss": 0.3703, "rewards/chosen": 0.3655797640482585, "rewards/margins": 1.421707884470622, "rewards/rejected": -1.0561281204223634, "step": 4351 }, { "epoch": 0.23067341584289613, "grad_norm": 67.0, "kl": 0.8812904357910156, "learning_rate": 5e-07, "logits/chosen": -81782266.66666667, "logits/rejected": -37935896.0, "logps/chosen": -398.83203125, "logps/rejected": -81.2878189086914, "loss": 0.4861, "rewards/chosen": -0.21333009004592896, "rewards/margins": 1.1623730063438416, "rewards/rejected": -1.3757030963897705, "step": 4352 }, { "epoch": 0.23072641984469827, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20466096.0, "logits/rejected": -25481552.0, "logps/chosen": -148.95028686523438, "logps/rejected": -315.46380615234375, "loss": 0.4345, "rewards/chosen": -0.3050472140312195, "rewards/margins": 0.6755919456481934, "rewards/rejected": -0.9806391596794128, "step": 4353 }, { "epoch": 0.2307794238465004, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63465401.6, "logits/rejected": -54742048.0, "logps/chosen": -443.69365234375, "logps/rejected": -117.14385986328125, "loss": 0.3735, "rewards/chosen": 0.28833534717559817, "rewards/margins": 1.336454971631368, "rewards/rejected": -1.0481196244557698, "step": 4354 }, { "epoch": 0.23083242784830255, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62729062.4, "logits/rejected": -40478274.666666664, "logps/chosen": -524.694287109375, "logps/rejected": -544.677001953125, "loss": 0.2596, "rewards/chosen": 0.7384472846984863, "rewards/margins": 2.94612512588501, "rewards/rejected": -2.2076778411865234, "step": 4355 }, { "epoch": 0.23088543185010468, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27174322.666666668, "logits/rejected": -19389812.8, "logps/chosen": -502.0082194010417, "logps/rejected": -284.237255859375, "loss": 0.3135, "rewards/chosen": -0.3680125077565511, "rewards/margins": 1.609024159113566, "rewards/rejected": -1.9770366668701171, "step": 4356 }, { "epoch": 0.23093843585190682, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58360505.6, "logits/rejected": 2337749.3333333335, "logps/chosen": -381.647412109375, "logps/rejected": -87.0680440266927, "loss": 0.3904, "rewards/chosen": 0.11893446445465088, "rewards/margins": 1.1687625964482624, "rewards/rejected": -1.0498281319936116, "step": 4357 }, { "epoch": 0.23099143985370896, "grad_norm": 74.0, "kl": 3.6512908935546875, "learning_rate": 5e-07, "logits/chosen": -19592880.0, "logits/rejected": 620908.6666666666, "logps/chosen": -638.572607421875, "logps/rejected": -68.5020243326823, "loss": 0.317, "rewards/chosen": 0.9776476860046387, "rewards/margins": 1.927484099070231, "rewards/rejected": -0.9498364130655924, "step": 4358 }, { "epoch": 0.2310444438555111, "grad_norm": 62.5, "kl": 1.4375114440917969, "learning_rate": 5e-07, "logits/chosen": 8873014.666666666, "logits/rejected": -64797576.0, "logps/chosen": -495.8395182291667, "logps/rejected": -370.24908447265625, "loss": 0.3943, "rewards/chosen": 0.4577023983001709, "rewards/margins": 1.6290998458862305, "rewards/rejected": -1.1713974475860596, "step": 4359 }, { "epoch": 0.23109744785731323, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7314233.333333333, "logits/rejected": -13937252.8, "logps/chosen": -530.8550618489584, "logps/rejected": -281.6017578125, "loss": 0.2658, "rewards/chosen": 0.34410401185353595, "rewards/margins": 2.00247593720754, "rewards/rejected": -1.658371925354004, "step": 4360 }, { "epoch": 0.23115045185911537, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9861426.0, "logits/rejected": -39820016.0, "logps/chosen": -323.7940673828125, "logps/rejected": -265.75701904296875, "loss": 0.2643, "rewards/chosen": 0.6844776272773743, "rewards/margins": 2.2791159749031067, "rewards/rejected": -1.5946383476257324, "step": 4361 }, { "epoch": 0.2312034558609175, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28466142.0, "logits/rejected": -20630024.0, "logps/chosen": -347.8161926269531, "logps/rejected": -310.1985778808594, "loss": 0.2638, "rewards/chosen": 0.5708077549934387, "rewards/margins": 3.0797465443611145, "rewards/rejected": -2.508938789367676, "step": 4362 }, { "epoch": 0.23125645986271964, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16963562.666666668, "logits/rejected": -26358892.8, "logps/chosen": -149.05616251627603, "logps/rejected": -266.633544921875, "loss": 0.3545, "rewards/chosen": -0.8271708488464355, "rewards/margins": 1.0954209327697755, "rewards/rejected": -1.922591781616211, "step": 4363 }, { "epoch": 0.23130946386452178, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60297240.0, "logits/rejected": -27746473.14285714, "logps/chosen": -492.10595703125, "logps/rejected": -193.26041085379464, "loss": 0.2736, "rewards/chosen": 0.34927064180374146, "rewards/margins": 1.4533934167453222, "rewards/rejected": -1.1041227749415807, "step": 4364 }, { "epoch": 0.23136246786632392, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 61773322.666666664, "logits/rejected": -21677417.6, "logps/chosen": -607.2915852864584, "logps/rejected": -427.565869140625, "loss": 0.2978, "rewards/chosen": -0.012558231751124064, "rewards/margins": 2.006705035765966, "rewards/rejected": -2.01926326751709, "step": 4365 }, { "epoch": 0.23141547186812605, "grad_norm": 49.5, "kl": 0.8202953338623047, "learning_rate": 5e-07, "logits/chosen": -47332662.4, "logits/rejected": -20359461.333333332, "logps/chosen": -248.569580078125, "logps/rejected": -334.5726318359375, "loss": 0.353, "rewards/chosen": 0.23541715145111083, "rewards/margins": 2.070724431673686, "rewards/rejected": -1.835307280222575, "step": 4366 }, { "epoch": 0.2314684758699282, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1988737.8333333333, "logits/rejected": -22858761.6, "logps/chosen": -75.64338684082031, "logps/rejected": -469.52099609375, "loss": 0.2712, "rewards/chosen": 0.014987240235010782, "rewards/margins": 2.631484089295069, "rewards/rejected": -2.6164968490600584, "step": 4367 }, { "epoch": 0.23152147987173033, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23223004.0, "logits/rejected": 4554598.0, "logps/chosen": -172.38119506835938, "logps/rejected": -46.42879867553711, "loss": 0.4552, "rewards/chosen": -0.1790449172258377, "rewards/margins": 0.36784039437770844, "rewards/rejected": -0.5468853116035461, "step": 4368 }, { "epoch": 0.23157448387353244, "grad_norm": 54.25, "kl": 0.3873100280761719, "learning_rate": 5e-07, "logits/chosen": -17987802.666666668, "logits/rejected": -36571187.2, "logps/chosen": -738.4783528645834, "logps/rejected": -331.1479736328125, "loss": 0.2274, "rewards/chosen": 0.7623214721679688, "rewards/margins": 2.918001174926758, "rewards/rejected": -2.155679702758789, "step": 4369 }, { "epoch": 0.23162748787533458, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2370477.0, "logits/rejected": -17904120.0, "logps/chosen": -28.282455444335938, "logps/rejected": -242.70693359375, "loss": 0.3244, "rewards/chosen": -0.34525394439697266, "rewards/margins": 1.569894790649414, "rewards/rejected": -1.9151487350463867, "step": 4370 }, { "epoch": 0.2316804918771367, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40389208.0, "logits/rejected": -29861868.0, "logps/chosen": -373.25286865234375, "logps/rejected": -339.45654296875, "loss": 0.3279, "rewards/chosen": 0.12153090536594391, "rewards/margins": 1.6493858247995377, "rewards/rejected": -1.5278549194335938, "step": 4371 }, { "epoch": 0.23173349587893885, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6387229.5, "logits/rejected": -1294784.142857143, "logps/chosen": -61.64198684692383, "logps/rejected": -396.8594447544643, "loss": 0.2451, "rewards/chosen": -0.7613838315010071, "rewards/margins": 0.9840333546910967, "rewards/rejected": -1.7454171861921037, "step": 4372 }, { "epoch": 0.231786499880741, "grad_norm": 58.5, "kl": 1.500732421875, "learning_rate": 5e-07, "logits/chosen": -36251738.666666664, "logits/rejected": -3172619.5, "logps/chosen": -430.3123372395833, "logps/rejected": -94.3867416381836, "loss": 0.3546, "rewards/chosen": 0.584752082824707, "rewards/margins": 1.769073247909546, "rewards/rejected": -1.1843211650848389, "step": 4373 }, { "epoch": 0.23183950388254312, "grad_norm": 61.75, "kl": 0.2882537841796875, "learning_rate": 5e-07, "logits/chosen": -44131168.0, "logits/rejected": -13509098.0, "logps/chosen": -421.1347961425781, "logps/rejected": -154.75270080566406, "loss": 0.3688, "rewards/chosen": 0.39962083101272583, "rewards/margins": 1.2520825862884521, "rewards/rejected": -0.8524617552757263, "step": 4374 }, { "epoch": 0.23189250788434526, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36498920.0, "logits/rejected": -13198856.0, "logps/chosen": -329.3781433105469, "logps/rejected": -241.721435546875, "loss": 0.3388, "rewards/chosen": 0.27623483538627625, "rewards/margins": 1.4431564509868622, "rewards/rejected": -1.166921615600586, "step": 4375 }, { "epoch": 0.2319455118861474, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31832096.0, "logits/rejected": -25655477.333333332, "logps/chosen": -308.51376953125, "logps/rejected": -367.44677734375, "loss": 0.349, "rewards/chosen": 0.2462012767791748, "rewards/margins": 1.8823187351226807, "rewards/rejected": -1.6361174583435059, "step": 4376 }, { "epoch": 0.23199851588794954, "grad_norm": 42.0, "kl": 0.29340553283691406, "learning_rate": 5e-07, "logits/chosen": -7576709.333333333, "logits/rejected": -20888064.0, "logps/chosen": -133.66401163736978, "logps/rejected": -225.0515380859375, "loss": 0.3483, "rewards/chosen": -0.28676287333170575, "rewards/margins": 1.3463002522786458, "rewards/rejected": -1.6330631256103516, "step": 4377 }, { "epoch": 0.23205151988975167, "grad_norm": 53.0, "kl": 0.28650665283203125, "learning_rate": 5e-07, "logits/chosen": -55137013.333333336, "logits/rejected": 1780451.2, "logps/chosen": -334.2303059895833, "logps/rejected": -180.24593505859374, "loss": 0.2462, "rewards/chosen": 0.5880088408788046, "rewards/margins": 2.284713133176168, "rewards/rejected": -1.6967042922973632, "step": 4378 }, { "epoch": 0.2321045238915538, "grad_norm": 56.25, "kl": 0.127593994140625, "learning_rate": 5e-07, "logits/chosen": -71973120.0, "logits/rejected": -21578168.0, "logps/chosen": -530.9627075195312, "logps/rejected": -382.044921875, "loss": 0.2415, "rewards/chosen": 0.6179039478302002, "rewards/margins": 2.9085476398468018, "rewards/rejected": -2.2906436920166016, "step": 4379 }, { "epoch": 0.23215752789335595, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11629865.0, "logits/rejected": -24633330.666666668, "logps/chosen": -134.08462524414062, "logps/rejected": -397.4842529296875, "loss": 0.2677, "rewards/chosen": 0.0317077562212944, "rewards/margins": 2.0736242855588594, "rewards/rejected": -2.041916529337565, "step": 4380 }, { "epoch": 0.23221053189515808, "grad_norm": 65.5, "kl": 0.25829315185546875, "learning_rate": 5e-07, "logits/chosen": -15807924.8, "logits/rejected": 14772632.0, "logps/chosen": -417.982861328125, "logps/rejected": -130.21464029947916, "loss": 0.3393, "rewards/chosen": 0.6027496337890625, "rewards/margins": 1.6993105093638103, "rewards/rejected": -1.0965608755747478, "step": 4381 }, { "epoch": 0.23226353589696022, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46764105.6, "logits/rejected": -6871528.666666667, "logps/chosen": -402.33427734375, "logps/rejected": -231.74774169921875, "loss": 0.3192, "rewards/chosen": 0.3213775634765625, "rewards/margins": 2.1198521931966146, "rewards/rejected": -1.798474629720052, "step": 4382 }, { "epoch": 0.23231653989876236, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29359636.0, "logits/rejected": -45545160.0, "logps/chosen": -328.84564208984375, "logps/rejected": -501.703125, "loss": 0.2856, "rewards/chosen": 0.01886139065027237, "rewards/margins": 2.4715715423226357, "rewards/rejected": -2.4527101516723633, "step": 4383 }, { "epoch": 0.2323695439005645, "grad_norm": 67.0, "kl": 0.8790206909179688, "learning_rate": 5e-07, "logits/chosen": 12823139.2, "logits/rejected": -19158408.0, "logps/chosen": -380.174658203125, "logps/rejected": -492.4105631510417, "loss": 0.295, "rewards/chosen": 0.5987859725952148, "rewards/margins": 2.9994644482930504, "rewards/rejected": -2.4006784756978354, "step": 4384 }, { "epoch": 0.23242254790236663, "grad_norm": 41.25, "kl": 0.3696918487548828, "learning_rate": 5e-07, "logits/chosen": 4689914.666666667, "logits/rejected": -44937155.2, "logps/chosen": -38.24796040852865, "logps/rejected": -280.6422119140625, "loss": 0.3108, "rewards/chosen": 0.4936087528864543, "rewards/margins": 1.7323650280634564, "rewards/rejected": -1.238756275177002, "step": 4385 }, { "epoch": 0.23247555190416877, "grad_norm": 37.5, "kl": 0.24420166015625, "learning_rate": 5e-07, "logits/chosen": -41734232.0, "logits/rejected": -46801328.0, "logps/chosen": -859.32763671875, "logps/rejected": -426.8705240885417, "loss": 0.1294, "rewards/chosen": 1.613887071609497, "rewards/margins": 3.935206174850464, "rewards/rejected": -2.321319103240967, "step": 4386 }, { "epoch": 0.2325285559059709, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3819293.0, "logits/rejected": -18778534.666666668, "logps/chosen": -150.59715270996094, "logps/rejected": -202.96354166666666, "loss": 0.2585, "rewards/chosen": 0.2744041383266449, "rewards/margins": 2.087857812643051, "rewards/rejected": -1.8134536743164062, "step": 4387 }, { "epoch": 0.23258155990777304, "grad_norm": 58.75, "kl": 1.896252155303955, "learning_rate": 5e-07, "logits/chosen": -2438658.6666666665, "logits/rejected": -15839373.0, "logps/chosen": -555.0236409505209, "logps/rejected": -99.81106567382812, "loss": 0.3265, "rewards/chosen": 0.6245935360590616, "rewards/margins": 3.0182588497797647, "rewards/rejected": -2.393665313720703, "step": 4388 }, { "epoch": 0.23263456390957518, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79537733.33333333, "logits/rejected": -29687964.8, "logps/chosen": -404.6907552083333, "logps/rejected": -319.33984375, "loss": 0.2686, "rewards/chosen": -0.08677979310353597, "rewards/margins": 2.1655311504999797, "rewards/rejected": -2.252310943603516, "step": 4389 }, { "epoch": 0.23268756791137732, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38864501.333333336, "logits/rejected": -27348236.8, "logps/chosen": -594.7954915364584, "logps/rejected": -393.0390380859375, "loss": 0.1925, "rewards/chosen": 1.0643049875895183, "rewards/margins": 3.5605694452921552, "rewards/rejected": -2.4962644577026367, "step": 4390 }, { "epoch": 0.23274057191317946, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5808622.0, "logits/rejected": -34440608.0, "logps/chosen": -197.58935546875, "logps/rejected": -311.6717122395833, "loss": 0.2078, "rewards/chosen": 0.6769284009933472, "rewards/margins": 2.4902423620224, "rewards/rejected": -1.8133139610290527, "step": 4391 }, { "epoch": 0.2327935759149816, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35709082.666666664, "logits/rejected": 460421.6, "logps/chosen": -194.12748209635416, "logps/rejected": -299.609423828125, "loss": 0.3061, "rewards/chosen": 0.1578547259171804, "rewards/margins": 1.9406838198502858, "rewards/rejected": -1.7828290939331055, "step": 4392 }, { "epoch": 0.23284657991678373, "grad_norm": 47.25, "kl": 1.0462045669555664, "learning_rate": 5e-07, "logits/chosen": -18026472.0, "logits/rejected": -1575808.5, "logps/chosen": -183.56427001953125, "logps/rejected": -93.60110473632812, "loss": 0.4787, "rewards/chosen": -0.5246942639350891, "rewards/margins": 0.4287123680114746, "rewards/rejected": -0.9534066319465637, "step": 4393 }, { "epoch": 0.23289958391858584, "grad_norm": 49.25, "kl": 0.0245819091796875, "learning_rate": 5e-07, "logits/chosen": -6296201.333333333, "logits/rejected": -35486886.4, "logps/chosen": -263.006591796875, "logps/rejected": -339.410791015625, "loss": 0.3009, "rewards/chosen": 0.0917762815952301, "rewards/margins": 1.9089624464511872, "rewards/rejected": -1.817186164855957, "step": 4394 }, { "epoch": 0.23295258792038798, "grad_norm": 76.5, "kl": 0.40249156951904297, "learning_rate": 5e-07, "logits/chosen": -28667635.2, "logits/rejected": 127935520.0, "logps/chosen": -354.14580078125, "logps/rejected": -492.7469075520833, "loss": 0.3516, "rewards/chosen": 0.2652399778366089, "rewards/margins": 1.7658156792322797, "rewards/rejected": -1.5005757013956706, "step": 4395 }, { "epoch": 0.23300559192219011, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11213748.0, "logits/rejected": -13647584.0, "logps/chosen": -80.1946309407552, "logps/rejected": -187.3733154296875, "loss": 0.3118, "rewards/chosen": -0.12201557556788127, "rewards/margins": 1.6260433157285055, "rewards/rejected": -1.7480588912963868, "step": 4396 }, { "epoch": 0.23305859592399225, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36521104.0, "logits/rejected": -12572494.4, "logps/chosen": -415.6318359375, "logps/rejected": -249.36201171875, "loss": 0.2156, "rewards/chosen": 0.7309447924296061, "rewards/margins": 2.605022589365641, "rewards/rejected": -1.8740777969360352, "step": 4397 }, { "epoch": 0.2331115999257944, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19968373.333333332, "logits/rejected": -24896988.0, "logps/chosen": -172.8261515299479, "logps/rejected": -178.2926483154297, "loss": 0.4147, "rewards/chosen": -0.04938310384750366, "rewards/margins": 2.0071621537208557, "rewards/rejected": -2.0565452575683594, "step": 4398 }, { "epoch": 0.23316460392759653, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2149612.8, "logits/rejected": -14316698.666666666, "logps/chosen": -86.24899291992188, "logps/rejected": -592.4778645833334, "loss": 0.3146, "rewards/chosen": 0.12082206010818482, "rewards/margins": 3.948093203703562, "rewards/rejected": -3.8272711435953775, "step": 4399 }, { "epoch": 0.23321760792939866, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1042671.0, "logits/rejected": -28175464.0, "logps/chosen": -222.3421173095703, "logps/rejected": -395.5433654785156, "loss": 0.2862, "rewards/chosen": 0.39788180589675903, "rewards/margins": 2.88178688287735, "rewards/rejected": -2.483905076980591, "step": 4400 }, { "epoch": 0.2332706119312008, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60955760.0, "logits/rejected": -23406656.0, "logps/chosen": -241.63259887695312, "logps/rejected": -357.5096028645833, "loss": 0.2389, "rewards/chosen": -0.08842393755912781, "rewards/margins": 1.9462125599384308, "rewards/rejected": -2.0346364974975586, "step": 4401 }, { "epoch": 0.23332361593300294, "grad_norm": 61.25, "kl": 0.3708662986755371, "learning_rate": 5e-07, "logits/chosen": -44966868.0, "logits/rejected": -13522860.0, "logps/chosen": -362.092529296875, "logps/rejected": -180.854736328125, "loss": 0.3386, "rewards/chosen": 0.28451624512672424, "rewards/margins": 1.7129699289798737, "rewards/rejected": -1.4284536838531494, "step": 4402 }, { "epoch": 0.23337661993480507, "grad_norm": 47.0, "kl": 1.0791587829589844, "learning_rate": 5e-07, "logits/chosen": -38453568.0, "logits/rejected": -66232232.0, "logps/chosen": -601.5247802734375, "logps/rejected": -614.832275390625, "loss": 0.2133, "rewards/chosen": 1.1976219415664673, "rewards/margins": 3.7295972108840942, "rewards/rejected": -2.531975269317627, "step": 4403 }, { "epoch": 0.2334296239366072, "grad_norm": 43.5, "kl": 0.1266632080078125, "learning_rate": 5e-07, "logits/chosen": -3569023.6666666665, "logits/rejected": -13375123.2, "logps/chosen": -259.5067545572917, "logps/rejected": -242.150146484375, "loss": 0.2698, "rewards/chosen": 0.3139763077100118, "rewards/margins": 1.9775688370068867, "rewards/rejected": -1.663592529296875, "step": 4404 }, { "epoch": 0.23348262793840935, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51431605.333333336, "logits/rejected": -15379116.8, "logps/chosen": -694.4921875, "logps/rejected": -459.864453125, "loss": 0.1992, "rewards/chosen": 0.5654836098353068, "rewards/margins": 4.203616086641947, "rewards/rejected": -3.6381324768066405, "step": 4405 }, { "epoch": 0.23353563194021149, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24153232.0, "logits/rejected": -45927408.0, "logps/chosen": -360.18865966796875, "logps/rejected": -292.85711669921875, "loss": 0.3775, "rewards/chosen": 0.5142496824264526, "rewards/margins": 1.254742980003357, "rewards/rejected": -0.7404932975769043, "step": 4406 }, { "epoch": 0.23358863594201362, "grad_norm": 72.5, "kl": 2.109638214111328, "learning_rate": 5e-07, "logits/chosen": -36216605.333333336, "logits/rejected": -88139592.0, "logps/chosen": -436.5752766927083, "logps/rejected": -334.79351806640625, "loss": 0.3833, "rewards/chosen": 0.49263548851013184, "rewards/margins": 1.741398811340332, "rewards/rejected": -1.2487633228302002, "step": 4407 }, { "epoch": 0.23364163994381576, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14740896.0, "logits/rejected": -44383050.666666664, "logps/chosen": -262.66083984375, "logps/rejected": -447.1500651041667, "loss": 0.2779, "rewards/chosen": 0.4657552242279053, "rewards/margins": 3.0090760389963784, "rewards/rejected": -2.543320814768473, "step": 4408 }, { "epoch": 0.2336946439456179, "grad_norm": 47.0, "kl": 0.1820964813232422, "learning_rate": 5e-07, "logits/chosen": -18557604.0, "logits/rejected": -1269251.5, "logps/chosen": -201.49147033691406, "logps/rejected": -267.6590576171875, "loss": 0.3237, "rewards/chosen": 0.12300701439380646, "rewards/margins": 1.9724726229906082, "rewards/rejected": -1.8494656085968018, "step": 4409 }, { "epoch": 0.23374764794742003, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15921033.333333334, "logits/rejected": 6135630.4, "logps/chosen": -222.80753580729166, "logps/rejected": -432.346826171875, "loss": 0.2517, "rewards/chosen": 0.2949591477711995, "rewards/margins": 2.402997859319051, "rewards/rejected": -2.1080387115478514, "step": 4410 }, { "epoch": 0.23380065194922217, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27234144.0, "logits/rejected": -25115018.0, "logps/chosen": -179.8983154296875, "logps/rejected": -291.7731628417969, "loss": 0.3374, "rewards/chosen": -0.04363007843494415, "rewards/margins": 2.1246295422315598, "rewards/rejected": -2.168259620666504, "step": 4411 }, { "epoch": 0.2338536559510243, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30456395.42857143, "logits/rejected": -16195907.0, "logps/chosen": -198.43022809709822, "logps/rejected": -269.9624938964844, "loss": 0.4728, "rewards/chosen": -0.0623134970664978, "rewards/margins": 1.3031192421913147, "rewards/rejected": -1.3654327392578125, "step": 4412 }, { "epoch": 0.23390665995282645, "grad_norm": 56.25, "kl": 0.2574272155761719, "learning_rate": 5e-07, "logits/chosen": -40360293.333333336, "logits/rejected": -3797232.8, "logps/chosen": -315.81064860026044, "logps/rejected": -107.93997802734376, "loss": 0.3609, "rewards/chosen": -0.07978310187657674, "rewards/margins": 0.9990725080172221, "rewards/rejected": -1.078855609893799, "step": 4413 }, { "epoch": 0.23395966395462858, "grad_norm": 51.5, "kl": 0.1427459716796875, "learning_rate": 5e-07, "logits/chosen": -48357704.0, "logits/rejected": -21720652.0, "logps/chosen": -355.88983154296875, "logps/rejected": -242.40118408203125, "loss": 0.3045, "rewards/chosen": 0.5992605090141296, "rewards/margins": 1.9948453307151794, "rewards/rejected": -1.3955848217010498, "step": 4414 }, { "epoch": 0.23401266795643072, "grad_norm": 71.0, "kl": 2.0548553466796875, "learning_rate": 5e-07, "logits/chosen": -35976185.14285714, "logits/rejected": -98313488.0, "logps/chosen": -569.8915318080357, "logps/rejected": -745.9859619140625, "loss": 0.4179, "rewards/chosen": 0.40631532669067383, "rewards/margins": 2.7011518478393555, "rewards/rejected": -2.2948365211486816, "step": 4415 }, { "epoch": 0.23406567195823286, "grad_norm": 42.5, "kl": 0.2793998718261719, "learning_rate": 5e-07, "logits/chosen": -21736308.0, "logits/rejected": -34386292.0, "logps/chosen": -184.87603759765625, "logps/rejected": -235.0818328857422, "loss": 0.3527, "rewards/chosen": -0.05322644114494324, "rewards/margins": 1.561311036348343, "rewards/rejected": -1.6145374774932861, "step": 4416 }, { "epoch": 0.234118675960035, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17306992.0, "logits/rejected": -11066948.8, "logps/chosen": -393.9716389973958, "logps/rejected": -276.08173828125, "loss": 0.2393, "rewards/chosen": 0.06895955403645833, "rewards/margins": 2.915862019856771, "rewards/rejected": -2.8469024658203126, "step": 4417 }, { "epoch": 0.23417167996183713, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23156762.0, "logits/rejected": -37623908.0, "logps/chosen": -344.19158935546875, "logps/rejected": -310.4742736816406, "loss": 0.3307, "rewards/chosen": 0.0063568055629730225, "rewards/margins": 1.7413166463375092, "rewards/rejected": -1.7349598407745361, "step": 4418 }, { "epoch": 0.23422468396363927, "grad_norm": 50.75, "kl": 2.391468048095703, "learning_rate": 5e-07, "logits/chosen": -9332243.2, "logits/rejected": -76005749.33333333, "logps/chosen": -534.45439453125, "logps/rejected": -284.595947265625, "loss": 0.316, "rewards/chosen": 0.8205035209655762, "rewards/margins": 2.6800636927286785, "rewards/rejected": -1.8595601717631023, "step": 4419 }, { "epoch": 0.23427768796544138, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91616312.0, "logits/rejected": -25073781.333333332, "logps/chosen": -369.8636474609375, "logps/rejected": -208.60308837890625, "loss": 0.3061, "rewards/chosen": -0.141490176320076, "rewards/margins": 1.3567531555891037, "rewards/rejected": -1.4982433319091797, "step": 4420 }, { "epoch": 0.23433069196724352, "grad_norm": 54.5, "kl": 0.8824615478515625, "learning_rate": 5e-07, "logits/chosen": -31137816.0, "logits/rejected": -13378948.0, "logps/chosen": -411.36859130859375, "logps/rejected": -129.79904174804688, "loss": 0.3315, "rewards/chosen": 0.45824527740478516, "rewards/margins": 1.6957401037216187, "rewards/rejected": -1.2374948263168335, "step": 4421 }, { "epoch": 0.23438369596904565, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15899333.0, "logps/chosen": -196.96575927734375, "loss": 0.4736, "rewards/chosen": 0.11316454410552979, "step": 4422 }, { "epoch": 0.2344366999708478, "grad_norm": 65.5, "kl": 1.2286157608032227, "learning_rate": 5e-07, "logits/chosen": -41323317.333333336, "logits/rejected": -11411614.0, "logps/chosen": -286.21136474609375, "logps/rejected": -154.8827667236328, "loss": 0.3944, "rewards/chosen": 0.4203222592671712, "rewards/margins": 1.352769096692403, "rewards/rejected": -0.9324468374252319, "step": 4423 }, { "epoch": 0.23448970397264993, "grad_norm": 65.0, "kl": 0.08611297607421875, "learning_rate": 5e-07, "logits/chosen": -47273136.0, "logits/rejected": -40214952.0, "logps/chosen": -342.0863850911458, "logps/rejected": -314.98614501953125, "loss": 0.3824, "rewards/chosen": 0.16479673981666565, "rewards/margins": 2.2806886732578278, "rewards/rejected": -2.115891933441162, "step": 4424 }, { "epoch": 0.23454270797445206, "grad_norm": 69.0, "kl": 1.3555583953857422, "learning_rate": 5e-07, "logits/chosen": -54295184.0, "logits/rejected": -43158512.0, "logps/chosen": -621.3689575195312, "logps/rejected": -343.96685791015625, "loss": 0.3132, "rewards/chosen": 0.6154369115829468, "rewards/margins": 2.1512691974639893, "rewards/rejected": -1.5358322858810425, "step": 4425 }, { "epoch": 0.2345957119762542, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12661340.0, "logits/rejected": 46668810.666666664, "logps/chosen": -203.20455932617188, "logps/rejected": -316.6196695963542, "loss": 0.219, "rewards/chosen": 0.5921825170516968, "rewards/margins": 2.4140989383061724, "rewards/rejected": -1.8219164212544758, "step": 4426 }, { "epoch": 0.23464871597805634, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16110024.0, "logits/rejected": -31205092.0, "logps/chosen": -61.15758514404297, "logps/rejected": -444.380126953125, "loss": 0.3167, "rewards/chosen": 0.10060515999794006, "rewards/margins": 2.3251902163028717, "rewards/rejected": -2.2245850563049316, "step": 4427 }, { "epoch": 0.23470171997985848, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25486188.0, "logits/rejected": -48235984.0, "logps/chosen": -277.9344482421875, "logps/rejected": -274.20013427734375, "loss": 0.2844, "rewards/chosen": 0.2591651976108551, "rewards/margins": 2.2402974665164948, "rewards/rejected": -1.9811322689056396, "step": 4428 }, { "epoch": 0.2347547239816606, "grad_norm": 60.75, "kl": 1.3434410095214844, "learning_rate": 5e-07, "logits/chosen": -33438054.0, "logits/rejected": -144302576.0, "logps/chosen": -655.2855224609375, "logps/rejected": -240.50816345214844, "loss": 0.2799, "rewards/chosen": 0.797157883644104, "rewards/margins": 2.4191267490386963, "rewards/rejected": -1.6219688653945923, "step": 4429 }, { "epoch": 0.23480772798346275, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47151712.0, "logits/rejected": -47094944.0, "logps/chosen": -190.68405151367188, "logps/rejected": -375.57159423828125, "loss": 0.322, "rewards/chosen": -0.20858660340309143, "rewards/margins": 2.4044290482997894, "rewards/rejected": -2.613015651702881, "step": 4430 }, { "epoch": 0.2348607319852649, "grad_norm": 40.25, "kl": 0.24331092834472656, "learning_rate": 5e-07, "logits/chosen": -17900312.0, "logits/rejected": -29372144.0, "logps/chosen": -173.02850341796875, "logps/rejected": -275.8909912109375, "loss": 0.3077, "rewards/chosen": 0.32039256890614826, "rewards/margins": 1.756206472714742, "rewards/rejected": -1.4358139038085938, "step": 4431 }, { "epoch": 0.23491373598706702, "grad_norm": 72.0, "kl": 1.1654541492462158, "learning_rate": 5e-07, "logits/chosen": 49233574.4, "logits/rejected": -24646728.0, "logps/chosen": -289.601123046875, "logps/rejected": -205.6754150390625, "loss": 0.4293, "rewards/chosen": -0.10372419357299804, "rewards/margins": 1.4207170486450196, "rewards/rejected": -1.5244412422180176, "step": 4432 }, { "epoch": 0.23496673998886916, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45231476.0, "logits/rejected": -48942888.0, "logps/chosen": -227.74554443359375, "logps/rejected": -225.31765747070312, "loss": 0.3901, "rewards/chosen": -0.5769264698028564, "rewards/margins": 1.2562694549560547, "rewards/rejected": -1.8331959247589111, "step": 4433 }, { "epoch": 0.2350197439906713, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101477465.6, "logits/rejected": -10478554.666666666, "logps/chosen": -170.3285400390625, "logps/rejected": -325.23956298828125, "loss": 0.4005, "rewards/chosen": 0.14331893920898436, "rewards/margins": 1.5871642112731934, "rewards/rejected": -1.443845272064209, "step": 4434 }, { "epoch": 0.23507274799247344, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30250992.0, "logits/rejected": -9977146.0, "logps/chosen": -454.95880126953125, "logps/rejected": -280.5009765625, "loss": 0.3364, "rewards/chosen": -0.020648933947086334, "rewards/margins": 2.059246562421322, "rewards/rejected": -2.079895496368408, "step": 4435 }, { "epoch": 0.23512575199427557, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31756664.0, "logits/rejected": -33890202.666666664, "logps/chosen": -237.82473754882812, "logps/rejected": -210.516357421875, "loss": 0.2884, "rewards/chosen": 0.33434203267097473, "rewards/margins": 1.6277613937854767, "rewards/rejected": -1.293419361114502, "step": 4436 }, { "epoch": 0.2351787559960777, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1827575.3333333333, "logits/rejected": -14821576.0, "logps/chosen": -200.5908203125, "logps/rejected": -256.052783203125, "loss": 0.2696, "rewards/chosen": 0.3554453452428182, "rewards/margins": 2.10226407845815, "rewards/rejected": -1.746818733215332, "step": 4437 }, { "epoch": 0.23523175999787985, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101958192.0, "logits/rejected": -17835664.0, "logps/chosen": -817.6332397460938, "logps/rejected": -223.70267740885416, "loss": 0.2158, "rewards/chosen": 1.1031494140625, "rewards/margins": 2.585920810699463, "rewards/rejected": -1.482771396636963, "step": 4438 }, { "epoch": 0.23528476399968198, "grad_norm": 67.0, "kl": 2.4244346618652344, "learning_rate": 5e-07, "logits/chosen": -32178330.666666668, "logits/rejected": -23426626.0, "logps/chosen": -323.9843343098958, "logps/rejected": -267.4818115234375, "loss": 0.4035, "rewards/chosen": 0.5297067165374756, "rewards/margins": 1.6891361474990845, "rewards/rejected": -1.1594294309616089, "step": 4439 }, { "epoch": 0.23533776800148412, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14408342.4, "logits/rejected": 8478997.333333334, "logps/chosen": -238.176025390625, "logps/rejected": -296.9217122395833, "loss": 0.3506, "rewards/chosen": 0.4252768516540527, "rewards/margins": 1.5140046437581378, "rewards/rejected": -1.0887277921040852, "step": 4440 }, { "epoch": 0.23539077200328626, "grad_norm": 62.0, "kl": 0.4497976303100586, "learning_rate": 5e-07, "logits/chosen": -61881429.333333336, "logits/rejected": 605399.625, "logps/chosen": -472.438720703125, "logps/rejected": -386.8069763183594, "loss": 0.3609, "rewards/chosen": 0.35653889179229736, "rewards/margins": 2.1580954790115356, "rewards/rejected": -1.8015565872192383, "step": 4441 }, { "epoch": 0.2354437760050884, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40652576.0, "logits/rejected": -7883404.0, "logps/chosen": -275.2834228515625, "logps/rejected": -443.8128662109375, "loss": 0.3555, "rewards/chosen": 0.12056484222412109, "rewards/margins": 2.0811227798461913, "rewards/rejected": -1.9605579376220703, "step": 4442 }, { "epoch": 0.23549678000689053, "grad_norm": 52.25, "kl": 1.0463485717773438, "learning_rate": 5e-07, "logits/chosen": -21607854.0, "logits/rejected": -67170224.0, "logps/chosen": -357.6742858886719, "logps/rejected": -702.2618408203125, "loss": 0.2236, "rewards/chosen": 0.910596489906311, "rewards/margins": 3.145938992500305, "rewards/rejected": -2.235342502593994, "step": 4443 }, { "epoch": 0.23554978400869267, "grad_norm": 79.0, "kl": 1.350982666015625, "learning_rate": 5e-07, "logits/chosen": 2168825.1666666665, "logits/rejected": -44744440.0, "logps/chosen": -515.97705078125, "logps/rejected": -296.189697265625, "loss": 0.4044, "rewards/chosen": 0.3890407880147298, "rewards/margins": 1.4427567323048909, "rewards/rejected": -1.0537159442901611, "step": 4444 }, { "epoch": 0.23560278801049478, "grad_norm": 59.5, "kl": 0.5820679664611816, "learning_rate": 5e-07, "logits/chosen": -77184032.0, "logits/rejected": -45526896.0, "logps/chosen": -284.2593180338542, "logps/rejected": -458.1950988769531, "loss": 0.3846, "rewards/chosen": 0.21183514595031738, "rewards/margins": 3.3024723529815674, "rewards/rejected": -3.09063720703125, "step": 4445 }, { "epoch": 0.23565579201229692, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9844098.666666666, "logits/rejected": -43894185.6, "logps/chosen": -434.7386881510417, "logps/rejected": -313.973828125, "loss": 0.2473, "rewards/chosen": 0.4818468888600667, "rewards/margins": 2.428776248296102, "rewards/rejected": -1.946929359436035, "step": 4446 }, { "epoch": 0.23570879601409905, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47179312.0, "logits/rejected": -19814361.6, "logps/chosen": -314.6186930338542, "logps/rejected": -187.4295654296875, "loss": 0.3066, "rewards/chosen": 0.07175241907437642, "rewards/margins": 1.619276585181554, "rewards/rejected": -1.5475241661071777, "step": 4447 }, { "epoch": 0.2357618000159012, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7396225.333333333, "logits/rejected": -37305731.2, "logps/chosen": -172.29471842447916, "logps/rejected": -246.105126953125, "loss": 0.3147, "rewards/chosen": 0.17174301544825235, "rewards/margins": 1.4719089547793072, "rewards/rejected": -1.3001659393310547, "step": 4448 }, { "epoch": 0.23581480401770333, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49281744.0, "logits/rejected": 5530907.0, "logps/chosen": -534.67451171875, "logps/rejected": -402.6484375, "loss": 0.3022, "rewards/chosen": 0.4269354820251465, "rewards/margins": 3.144837347666422, "rewards/rejected": -2.717901865641276, "step": 4449 }, { "epoch": 0.23586780801950546, "grad_norm": 67.5, "kl": 0.5909519195556641, "learning_rate": 5e-07, "logits/chosen": -15092590.0, "logits/rejected": -57715636.0, "logps/chosen": -605.3228759765625, "logps/rejected": -533.2105102539062, "loss": 0.3229, "rewards/chosen": 0.15931132435798645, "rewards/margins": 2.2809288799762726, "rewards/rejected": -2.121617555618286, "step": 4450 }, { "epoch": 0.2359208120213076, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60110140.0, "logits/rejected": -11119517.333333334, "logps/chosen": -669.4879150390625, "logps/rejected": -309.1324462890625, "loss": 0.2243, "rewards/chosen": 0.7777175903320312, "rewards/margins": 2.4024984041849775, "rewards/rejected": -1.624780813852946, "step": 4451 }, { "epoch": 0.23597381602310974, "grad_norm": 64.5, "kl": 1.3631858825683594, "learning_rate": 5e-07, "logits/chosen": -20993548.0, "logits/rejected": -45060420.0, "logps/chosen": -381.6578776041667, "logps/rejected": -154.59950256347656, "loss": 0.352, "rewards/chosen": 0.6311095555623373, "rewards/margins": 2.0331557591756186, "rewards/rejected": -1.4020462036132812, "step": 4452 }, { "epoch": 0.23602682002491188, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -100420912.0, "logits/rejected": -42404144.0, "logps/chosen": -361.13031005859375, "logps/rejected": -507.6290283203125, "loss": 0.2801, "rewards/chosen": 0.36381322145462036, "rewards/margins": 2.3075392842292786, "rewards/rejected": -1.9437260627746582, "step": 4453 }, { "epoch": 0.236079824026714, "grad_norm": 52.75, "kl": 0.2541484832763672, "learning_rate": 5e-07, "logits/chosen": -877382.2, "logits/rejected": -13665581.333333334, "logps/chosen": -192.6420654296875, "logps/rejected": -438.3734130859375, "loss": 0.3876, "rewards/chosen": 0.03068070411682129, "rewards/margins": 1.6638910452524822, "rewards/rejected": -1.6332103411356609, "step": 4454 }, { "epoch": 0.23613282802851615, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1473604.875, "logits/rejected": -28650586.666666668, "logps/chosen": -38.04149627685547, "logps/rejected": -422.5059814453125, "loss": 0.2052, "rewards/chosen": -0.15944471955299377, "rewards/margins": 2.3756485879421234, "rewards/rejected": -2.535093307495117, "step": 4455 }, { "epoch": 0.2361858320303183, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2980477.25, "logits/rejected": -29174501.333333332, "logps/chosen": -100.51756286621094, "logps/rejected": -236.50679524739584, "loss": 0.2933, "rewards/chosen": -0.057553481310606, "rewards/margins": 1.3582688656946023, "rewards/rejected": -1.4158223470052083, "step": 4456 }, { "epoch": 0.23623883603212043, "grad_norm": 100.5, "kl": 2.0985031127929688, "learning_rate": 5e-07, "logits/chosen": 32556848.0, "logits/rejected": -25694566.0, "logps/chosen": -464.0040690104167, "logps/rejected": -574.5145874023438, "loss": 0.3934, "rewards/chosen": 0.2764439781506856, "rewards/margins": 3.0730457504590354, "rewards/rejected": -2.7966017723083496, "step": 4457 }, { "epoch": 0.23629184003392256, "grad_norm": 45.5, "kl": 0.06845855712890625, "learning_rate": 5e-07, "logits/chosen": -105937184.0, "logits/rejected": -12689456.0, "logps/chosen": -427.00323486328125, "logps/rejected": -387.5520833333333, "loss": 0.2667, "rewards/chosen": 0.351766973733902, "rewards/margins": 1.924780438343684, "rewards/rejected": -1.573013464609782, "step": 4458 }, { "epoch": 0.2363448440357247, "grad_norm": 65.0, "kl": 1.823770523071289, "learning_rate": 5e-07, "logits/chosen": 11590210.666666666, "logits/rejected": -59973156.0, "logps/chosen": -414.959228515625, "logps/rejected": -184.11932373046875, "loss": 0.3758, "rewards/chosen": 0.8156353632609049, "rewards/margins": 1.3695950309435525, "rewards/rejected": -0.5539596676826477, "step": 4459 }, { "epoch": 0.23639784803752684, "grad_norm": 58.75, "kl": 0.6894645690917969, "learning_rate": 5e-07, "logits/chosen": -48917258.666666664, "logits/rejected": -16534029.0, "logps/chosen": -396.6987711588542, "logps/rejected": -98.24790954589844, "loss": 0.4157, "rewards/chosen": 0.2702288230260213, "rewards/margins": 1.366815408070882, "rewards/rejected": -1.0965865850448608, "step": 4460 }, { "epoch": 0.23645085203932897, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 126557120.0, "logits/rejected": -34413157.333333336, "logps/chosen": -486.85028076171875, "logps/rejected": -264.8422444661458, "loss": 0.1932, "rewards/chosen": 0.4065658748149872, "rewards/margins": 2.707507977883021, "rewards/rejected": -2.3009421030680337, "step": 4461 }, { "epoch": 0.2365038560411311, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18074508.0, "logits/rejected": -30995363.2, "logps/chosen": -189.23828125, "logps/rejected": -363.430517578125, "loss": 0.317, "rewards/chosen": 0.14692999919255575, "rewards/margins": 1.5728329380353292, "rewards/rejected": -1.4259029388427735, "step": 4462 }, { "epoch": 0.23655686004293325, "grad_norm": 42.5, "kl": 0.08753204345703125, "learning_rate": 5e-07, "logits/chosen": -15057548.8, "logits/rejected": -8746740.0, "logps/chosen": -307.6297119140625, "logps/rejected": -428.5105387369792, "loss": 0.263, "rewards/chosen": 0.4917306423187256, "rewards/margins": 3.683033577601115, "rewards/rejected": -3.191302935282389, "step": 4463 }, { "epoch": 0.23660986404473539, "grad_norm": 61.25, "kl": 0.5337638854980469, "learning_rate": 5e-07, "logits/chosen": -7019584.5, "logits/rejected": -36955188.0, "logps/chosen": -1119.441650390625, "logps/rejected": -384.7549133300781, "loss": 0.2356, "rewards/chosen": 1.2228400707244873, "rewards/margins": 2.864134192466736, "rewards/rejected": -1.6412941217422485, "step": 4464 }, { "epoch": 0.23666286804653752, "grad_norm": 54.25, "kl": 0.03580284118652344, "learning_rate": 5e-07, "logits/chosen": 16577785.6, "logits/rejected": -9621656.0, "logps/chosen": -236.6349609375, "logps/rejected": -259.9931233723958, "loss": 0.344, "rewards/chosen": 0.5277814865112305, "rewards/margins": 1.5254881381988525, "rewards/rejected": -0.9977066516876221, "step": 4465 }, { "epoch": 0.23671587204833966, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19527216.0, "logits/rejected": -24041280.0, "logps/chosen": -366.2906494140625, "logps/rejected": -182.60316467285156, "loss": 0.3704, "rewards/chosen": -0.25522077083587646, "rewards/margins": 1.596280574798584, "rewards/rejected": -1.8515013456344604, "step": 4466 }, { "epoch": 0.2367688760501418, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30739424.0, "logits/rejected": -47157798.4, "logps/chosen": -187.7021687825521, "logps/rejected": -341.925439453125, "loss": 0.2932, "rewards/chosen": -0.13719356060028076, "rewards/margins": 2.4098092317581177, "rewards/rejected": -2.5470027923583984, "step": 4467 }, { "epoch": 0.23682188005194393, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25270186.666666668, "logits/rejected": -11775671.2, "logps/chosen": -635.181884765625, "logps/rejected": -155.92364501953125, "loss": 0.3274, "rewards/chosen": 0.3204002380371094, "rewards/margins": 1.4973627090454102, "rewards/rejected": -1.1769624710083009, "step": 4468 }, { "epoch": 0.23687488405374607, "grad_norm": 69.0, "kl": 2.6886062622070312, "learning_rate": 5e-07, "logits/chosen": -32937277.333333332, "logits/rejected": -13324156.0, "logps/chosen": -348.7461751302083, "logps/rejected": -129.31997680664062, "loss": 0.3801, "rewards/chosen": 0.5738228559494019, "rewards/margins": 2.4104703664779663, "rewards/rejected": -1.8366475105285645, "step": 4469 }, { "epoch": 0.23692788805554818, "grad_norm": 97.5, "kl": 3.2738037109375, "learning_rate": 5e-07, "logits/chosen": -54981094.4, "logits/rejected": -14541361.333333334, "logps/chosen": -750.80849609375, "logps/rejected": -195.7215372721354, "loss": 0.4041, "rewards/chosen": 0.62916579246521, "rewards/margins": 1.9151601632436117, "rewards/rejected": -1.2859943707784016, "step": 4470 }, { "epoch": 0.23698089205735032, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6272083.0, "logits/rejected": -30266290.666666668, "logps/chosen": -169.97247314453125, "logps/rejected": -268.41693115234375, "loss": 0.3778, "rewards/chosen": -1.083519458770752, "rewards/margins": 0.09399954477945971, "rewards/rejected": -1.1775190035502117, "step": 4471 }, { "epoch": 0.23703389605915245, "grad_norm": 51.0, "kl": 0.21106338500976562, "learning_rate": 5e-07, "logits/chosen": -12735140.0, "logits/rejected": -22777836.0, "logps/chosen": -122.2594223022461, "logps/rejected": -656.2088623046875, "loss": 0.2763, "rewards/chosen": 0.20819950103759766, "rewards/margins": 3.17130446434021, "rewards/rejected": -2.9631049633026123, "step": 4472 }, { "epoch": 0.2370869000609546, "grad_norm": 74.0, "kl": 1.2004280090332031, "learning_rate": 5e-07, "logits/chosen": -49085424.0, "logits/rejected": -29328880.0, "logps/chosen": -756.0829467773438, "logps/rejected": -557.7108764648438, "loss": 0.3174, "rewards/chosen": 0.3655296266078949, "rewards/margins": 2.4314495027065277, "rewards/rejected": -2.065919876098633, "step": 4473 }, { "epoch": 0.23713990406275673, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51884544.0, "logits/rejected": 3071592.5, "logps/chosen": -263.15945870535717, "logps/rejected": -208.88339233398438, "loss": 0.4037, "rewards/chosen": 0.1821859564099993, "rewards/margins": 4.5582326139722555, "rewards/rejected": -4.376046657562256, "step": 4474 }, { "epoch": 0.23719290806455887, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14783090.0, "logits/rejected": -32890446.0, "logps/chosen": -245.91018676757812, "logps/rejected": -239.61326599121094, "loss": 0.4235, "rewards/chosen": -0.41728365421295166, "rewards/margins": 0.7621105909347534, "rewards/rejected": -1.179394245147705, "step": 4475 }, { "epoch": 0.237245912066361, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55590997.333333336, "logits/rejected": -69549772.8, "logps/chosen": -626.3946940104166, "logps/rejected": -231.8154541015625, "loss": 0.2678, "rewards/chosen": 0.7665875752766927, "rewards/margins": 2.0097220738728843, "rewards/rejected": -1.2431344985961914, "step": 4476 }, { "epoch": 0.23729891606816314, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 312955.6666666667, "logits/rejected": -37850252.8, "logps/chosen": -234.2615763346354, "logps/rejected": -542.7859375, "loss": 0.1779, "rewards/chosen": 0.6864622433980306, "rewards/margins": 3.3003593762715657, "rewards/rejected": -2.613897132873535, "step": 4477 }, { "epoch": 0.23735192006996528, "grad_norm": 51.75, "kl": 1.1999177932739258, "learning_rate": 5e-07, "logits/chosen": -45134432.0, "logits/rejected": -30200582.0, "logps/chosen": -194.47030203683036, "logps/rejected": -116.3989028930664, "loss": 0.4379, "rewards/chosen": 0.2884939398084368, "rewards/margins": 1.4193907465253557, "rewards/rejected": -1.130896806716919, "step": 4478 }, { "epoch": 0.23740492407176741, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36091973.333333336, "logits/rejected": -46532323.2, "logps/chosen": -315.86236572265625, "logps/rejected": -601.2158203125, "loss": 0.2285, "rewards/chosen": 0.8686418533325195, "rewards/margins": 2.611141014099121, "rewards/rejected": -1.7424991607666016, "step": 4479 }, { "epoch": 0.23745792807356955, "grad_norm": 30.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2771671.0, "logits/rejected": -76289664.0, "logps/chosen": -39.635955810546875, "logps/rejected": -298.4613037109375, "loss": 0.3466, "rewards/chosen": -0.059072159230709076, "rewards/margins": 1.7540586963295937, "rewards/rejected": -1.8131308555603027, "step": 4480 }, { "epoch": 0.2375109320753717, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41855052.8, "logits/rejected": -1155048.0, "logps/chosen": -247.381201171875, "logps/rejected": -348.8046875, "loss": 0.3428, "rewards/chosen": 0.16838154792785645, "rewards/margins": 2.072886006037394, "rewards/rejected": -1.9045044581095378, "step": 4481 }, { "epoch": 0.23756393607717383, "grad_norm": 65.0, "kl": 1.4365997314453125, "learning_rate": 5e-07, "logits/chosen": -23899260.8, "logits/rejected": -44199456.0, "logps/chosen": -516.653515625, "logps/rejected": -279.7209879557292, "loss": 0.3566, "rewards/chosen": 0.5014820098876953, "rewards/margins": 2.030040582021077, "rewards/rejected": -1.528558572133382, "step": 4482 }, { "epoch": 0.23761694007897596, "grad_norm": 60.75, "kl": 0.09337520599365234, "learning_rate": 5e-07, "logits/chosen": -111364761.6, "logits/rejected": 7913550.666666667, "logps/chosen": -375.2377685546875, "logps/rejected": -285.6653238932292, "loss": 0.4213, "rewards/chosen": -0.12399781942367553, "rewards/margins": 1.450751809279124, "rewards/rejected": -1.5747496287027996, "step": 4483 }, { "epoch": 0.2376699440807781, "grad_norm": 56.5, "kl": 1.4610381126403809, "learning_rate": 5e-07, "logits/chosen": -7419744.0, "logits/rejected": -17126262.0, "logps/chosen": -482.7593994140625, "logps/rejected": -144.59820556640625, "loss": 0.4095, "rewards/chosen": 0.41008418798446655, "rewards/margins": 1.1237757205963135, "rewards/rejected": -0.7136915326118469, "step": 4484 }, { "epoch": 0.23772294808258024, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34769394.666666664, "logits/rejected": -39883814.4, "logps/chosen": -546.140625, "logps/rejected": -208.33251953125, "loss": 0.2702, "rewards/chosen": 0.39419809977213544, "rewards/margins": 2.1733959833780925, "rewards/rejected": -1.779197883605957, "step": 4485 }, { "epoch": 0.23777595208438237, "grad_norm": 66.5, "kl": 0.9997329711914062, "learning_rate": 5e-07, "logits/chosen": -62418009.6, "logits/rejected": -20268040.0, "logps/chosen": -541.25498046875, "logps/rejected": -436.047607421875, "loss": 0.3405, "rewards/chosen": 0.6671765327453614, "rewards/margins": 1.9422289212544759, "rewards/rejected": -1.2750523885091145, "step": 4486 }, { "epoch": 0.2378289560861845, "grad_norm": 73.5, "kl": 0.35565185546875, "learning_rate": 5e-07, "logits/chosen": 23183229.333333332, "logits/rejected": 2661307.5, "logps/chosen": -413.6930745442708, "logps/rejected": -145.8878173828125, "loss": 0.4276, "rewards/chosen": 0.036905611554781594, "rewards/margins": 1.5137855658928554, "rewards/rejected": -1.4768799543380737, "step": 4487 }, { "epoch": 0.23788196008798665, "grad_norm": 41.25, "kl": 0.13271141052246094, "learning_rate": 5e-07, "logits/chosen": -28149588.0, "logits/rejected": -53466720.0, "logps/chosen": -361.6981201171875, "logps/rejected": -494.1350911458333, "loss": 0.1816, "rewards/chosen": 0.4094299376010895, "rewards/margins": 2.955473651488622, "rewards/rejected": -2.5460437138875327, "step": 4488 }, { "epoch": 0.2379349640897888, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34683980.8, "logits/rejected": -14014328.0, "logps/chosen": -340.2996337890625, "logps/rejected": -274.80706787109375, "loss": 0.2668, "rewards/chosen": 0.5068712711334229, "rewards/margins": 3.002497402826945, "rewards/rejected": -2.495626131693522, "step": 4489 }, { "epoch": 0.23798796809159092, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33981836.0, "logits/rejected": -67879813.33333333, "logps/chosen": -152.74501037597656, "logps/rejected": -410.3754475911458, "loss": 0.2199, "rewards/chosen": -0.44767630100250244, "rewards/margins": 2.264212965965271, "rewards/rejected": -2.7118892669677734, "step": 4490 }, { "epoch": 0.23804097209339306, "grad_norm": 85.0, "kl": 0.26068878173828125, "learning_rate": 5e-07, "logits/chosen": -67166899.2, "logits/rejected": -24332285.333333332, "logps/chosen": -303.50888671875, "logps/rejected": -222.68326822916666, "loss": 0.3339, "rewards/chosen": 0.5183923721313477, "rewards/margins": 1.9064799626668294, "rewards/rejected": -1.3880875905354817, "step": 4491 }, { "epoch": 0.2380939760951952, "grad_norm": 63.25, "kl": 1.443197250366211, "learning_rate": 5e-07, "logits/chosen": 6674119.333333333, "logits/rejected": -30932300.0, "logps/chosen": -279.7631022135417, "logps/rejected": -381.1225891113281, "loss": 0.446, "rewards/chosen": 0.17747588952382407, "rewards/margins": 1.5011539061864216, "rewards/rejected": -1.3236780166625977, "step": 4492 }, { "epoch": 0.23814698009699733, "grad_norm": 52.0, "kl": 0.025081634521484375, "learning_rate": 5e-07, "logits/chosen": -77150984.0, "logits/rejected": -36793572.0, "logps/chosen": -344.02447509765625, "logps/rejected": -496.5968017578125, "loss": 0.2876, "rewards/chosen": 0.034694477915763855, "rewards/margins": 2.5804963558912277, "rewards/rejected": -2.545801877975464, "step": 4493 }, { "epoch": 0.23819998409879947, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22474577.6, "logits/rejected": -16989133.333333332, "logps/chosen": -307.64736328125, "logps/rejected": -415.746826171875, "loss": 0.2719, "rewards/chosen": 0.7536786079406739, "rewards/margins": 2.8059851010640466, "rewards/rejected": -2.0523064931233725, "step": 4494 }, { "epoch": 0.2382529881006016, "grad_norm": 57.75, "kl": 0.6256752014160156, "learning_rate": 5e-07, "logits/chosen": -57794848.0, "logits/rejected": -52950597.333333336, "logps/chosen": -484.2365234375, "logps/rejected": -583.1630859375, "loss": 0.2453, "rewards/chosen": 0.9553384780883789, "rewards/margins": 3.2543884913126626, "rewards/rejected": -2.2990500132242837, "step": 4495 }, { "epoch": 0.23830599210240372, "grad_norm": 49.75, "kl": 0.7429561614990234, "learning_rate": 5e-07, "logits/chosen": -17567422.4, "logits/rejected": -41658138.666666664, "logps/chosen": -79.55836181640625, "logps/rejected": -742.421875, "loss": 0.3615, "rewards/chosen": 0.2503936767578125, "rewards/margins": 2.7146718978881834, "rewards/rejected": -2.464278221130371, "step": 4496 }, { "epoch": 0.23835899610420586, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4078209.25, "logits/rejected": -19615080.0, "logps/chosen": -327.98516845703125, "logps/rejected": -612.96728515625, "loss": 0.2825, "rewards/chosen": -0.1800074577331543, "rewards/margins": 3.281299352645874, "rewards/rejected": -3.4613068103790283, "step": 4497 }, { "epoch": 0.238412000106008, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16509029.0, "logits/rejected": -12117840.0, "logps/chosen": -268.827392578125, "logps/rejected": -434.2847595214844, "loss": 0.3727, "rewards/chosen": 0.15433797240257263, "rewards/margins": 1.254567950963974, "rewards/rejected": -1.1002299785614014, "step": 4498 }, { "epoch": 0.23846500410781013, "grad_norm": 69.5, "kl": 0.9042510986328125, "learning_rate": 5e-07, "logits/chosen": -59782259.2, "logits/rejected": -3511580.3333333335, "logps/chosen": -498.6109375, "logps/rejected": -273.66656494140625, "loss": 0.3646, "rewards/chosen": 0.49224491119384767, "rewards/margins": 1.5372163772583007, "rewards/rejected": -1.0449714660644531, "step": 4499 }, { "epoch": 0.23851800810961227, "grad_norm": 48.25, "kl": 2.163209915161133, "learning_rate": 5e-07, "logits/chosen": -12361468.0, "logits/rejected": 20981840.0, "logps/chosen": -588.5302734375, "logps/rejected": -193.18349609375, "loss": 0.2443, "rewards/chosen": 1.090663194656372, "rewards/margins": 2.4151724338531495, "rewards/rejected": -1.3245092391967774, "step": 4500 }, { "epoch": 0.2385710121114144, "grad_norm": 53.5, "kl": 0.027740478515625, "learning_rate": 5e-07, "logits/chosen": -46238860.0, "logits/rejected": 2410135.25, "logps/chosen": -298.79583740234375, "logps/rejected": -307.3782653808594, "loss": 0.3681, "rewards/chosen": 0.1076708734035492, "rewards/margins": 1.4102477729320526, "rewards/rejected": -1.3025768995285034, "step": 4501 }, { "epoch": 0.23862401611321654, "grad_norm": 52.75, "kl": 1.7447776794433594, "learning_rate": 5e-07, "logits/chosen": -62841912.0, "logits/rejected": -14125531.0, "logps/chosen": -438.4810485839844, "logps/rejected": -422.3506774902344, "loss": 0.2874, "rewards/chosen": 0.7255843877792358, "rewards/margins": 3.411662220954895, "rewards/rejected": -2.686077833175659, "step": 4502 }, { "epoch": 0.23867702011501868, "grad_norm": 45.5, "kl": 0.9476394653320312, "learning_rate": 5e-07, "logits/chosen": -52938244.0, "logits/rejected": -7768920.5, "logps/chosen": -363.111083984375, "logps/rejected": -168.7562255859375, "loss": 0.3121, "rewards/chosen": 0.5561531186103821, "rewards/margins": 2.321759283542633, "rewards/rejected": -1.765606164932251, "step": 4503 }, { "epoch": 0.23873002411682082, "grad_norm": 54.0, "kl": 0.399627685546875, "learning_rate": 5e-07, "logits/chosen": -46678948.0, "logits/rejected": -20374160.0, "logps/chosen": -420.82891845703125, "logps/rejected": -387.1618347167969, "loss": 0.2593, "rewards/chosen": 0.6364462375640869, "rewards/margins": 2.6033769845962524, "rewards/rejected": -1.9669307470321655, "step": 4504 }, { "epoch": 0.23878302811862295, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8999093.333333334, "logits/rejected": -12437372.0, "logps/chosen": -454.5087890625, "logps/rejected": -291.8973876953125, "loss": 0.3763, "rewards/chosen": -0.8592915534973145, "rewards/margins": 0.95895357131958, "rewards/rejected": -1.8182451248168945, "step": 4505 }, { "epoch": 0.2388360321204251, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39527429.333333336, "logits/rejected": -38022412.8, "logps/chosen": -390.4823811848958, "logps/rejected": -346.93837890625, "loss": 0.2511, "rewards/chosen": 0.4499053955078125, "rewards/margins": 2.517902946472168, "rewards/rejected": -2.0679975509643556, "step": 4506 }, { "epoch": 0.23888903612222723, "grad_norm": 50.25, "kl": 0.26658058166503906, "learning_rate": 5e-07, "logits/chosen": -4156789.3333333335, "logits/rejected": -22867232.0, "logps/chosen": -97.43258666992188, "logps/rejected": -255.6930389404297, "loss": 0.4172, "rewards/chosen": 0.0164709376792113, "rewards/margins": 1.5644270467261474, "rewards/rejected": -1.547956109046936, "step": 4507 }, { "epoch": 0.23894204012402936, "grad_norm": 100.5, "kl": 1.7046241760253906, "learning_rate": 5e-07, "logits/chosen": -9767152.0, "logits/rejected": -11208595.0, "logps/chosen": -1002.5743408203125, "logps/rejected": -374.6619567871094, "loss": 0.3869, "rewards/chosen": 0.2865663766860962, "rewards/margins": 1.4028635025024414, "rewards/rejected": -1.1162971258163452, "step": 4508 }, { "epoch": 0.2389950441258315, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23821506.666666668, "logits/rejected": -20334152.0, "logps/chosen": -295.7688802083333, "logps/rejected": -270.139013671875, "loss": 0.2514, "rewards/chosen": 0.4689773718516032, "rewards/margins": 2.273553482691447, "rewards/rejected": -1.8045761108398437, "step": 4509 }, { "epoch": 0.23904804812763364, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31772736.0, "logits/rejected": 3883220.25, "logps/chosen": -804.5408325195312, "logps/rejected": -276.5985107421875, "loss": 0.2242, "rewards/chosen": 1.0947970151901245, "rewards/margins": 3.060840606689453, "rewards/rejected": -1.9660435914993286, "step": 4510 }, { "epoch": 0.23910105212943578, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53850277.333333336, "logits/rejected": -16602806.4, "logps/chosen": -405.5420328776042, "logps/rejected": -223.9251953125, "loss": 0.3022, "rewards/chosen": 0.597706119219462, "rewards/margins": 1.8480990330378213, "rewards/rejected": -1.2503929138183594, "step": 4511 }, { "epoch": 0.2391540561312379, "grad_norm": 67.5, "kl": 0.8741989135742188, "learning_rate": 5e-07, "logits/chosen": 2186692.714285714, "logits/rejected": -29399542.0, "logps/chosen": -430.33628627232144, "logps/rejected": -598.3511352539062, "loss": 0.3947, "rewards/chosen": 0.35833910533360075, "rewards/margins": 3.7807696206229076, "rewards/rejected": -3.4224305152893066, "step": 4512 }, { "epoch": 0.23920706013304005, "grad_norm": 52.25, "kl": 0.5129709243774414, "learning_rate": 5e-07, "logits/chosen": -8136808.0, "logits/rejected": -41556632.0, "logps/chosen": -256.03082275390625, "logps/rejected": -338.9940490722656, "loss": 0.2492, "rewards/chosen": 0.7718251943588257, "rewards/margins": 2.980766177177429, "rewards/rejected": -2.2089409828186035, "step": 4513 }, { "epoch": 0.2392600641348422, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 879602.25, "logits/rejected": -5827072.0, "logps/chosen": -51.81145477294922, "logps/rejected": -376.7759602864583, "loss": 0.263, "rewards/chosen": -0.7189017534255981, "rewards/margins": 1.2432435750961304, "rewards/rejected": -1.9621453285217285, "step": 4514 }, { "epoch": 0.23931306813664432, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16176056.0, "logits/rejected": -13679317.333333334, "logps/chosen": -182.2574462890625, "logps/rejected": -464.2555338541667, "loss": 0.3387, "rewards/chosen": -0.09479619264602661, "rewards/margins": 2.6455167015393575, "rewards/rejected": -2.7403128941853843, "step": 4515 }, { "epoch": 0.23936607213844646, "grad_norm": 49.25, "kl": 0.5062503814697266, "learning_rate": 5e-07, "logits/chosen": -23241945.14285714, "logits/rejected": -11928185.0, "logps/chosen": -135.56497628348214, "logps/rejected": -73.39903259277344, "loss": 0.4873, "rewards/chosen": -0.08460744789668492, "rewards/margins": 1.5473559243338448, "rewards/rejected": -1.6319633722305298, "step": 4516 }, { "epoch": 0.2394190761402486, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41002232.0, "logits/rejected": -7471052.0, "logps/chosen": -426.498291015625, "logps/rejected": -155.3187255859375, "loss": 0.2866, "rewards/chosen": 1.0389766693115234, "rewards/margins": 2.062256097793579, "rewards/rejected": -1.0232794284820557, "step": 4517 }, { "epoch": 0.23947208014205074, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37726301.333333336, "logits/rejected": -11328326.0, "logps/chosen": -308.473388671875, "logps/rejected": -249.43292236328125, "loss": 0.4628, "rewards/chosen": -0.3344821532567342, "rewards/margins": 1.530102292696635, "rewards/rejected": -1.8645844459533691, "step": 4518 }, { "epoch": 0.23952508414385287, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2056505.1666666667, "logits/rejected": -38733961.6, "logps/chosen": -189.8202107747396, "logps/rejected": -304.23896484375, "loss": 0.2561, "rewards/chosen": 0.6658132473627726, "rewards/margins": 2.2231868664423624, "rewards/rejected": -1.55737361907959, "step": 4519 }, { "epoch": 0.239578088145655, "grad_norm": 42.0, "kl": 0.043666839599609375, "learning_rate": 5e-07, "logits/chosen": -6184278.5, "logits/rejected": -7219281.333333333, "logps/chosen": -190.54006958007812, "logps/rejected": -113.32748413085938, "loss": 0.2759, "rewards/chosen": 0.7788587808609009, "rewards/margins": 2.0065931876500445, "rewards/rejected": -1.2277344067891438, "step": 4520 }, { "epoch": 0.23963109214745712, "grad_norm": 50.25, "kl": 3.0035743713378906, "learning_rate": 5e-07, "logits/chosen": -15115297.0, "logits/rejected": -36198800.0, "logps/chosen": -697.1696166992188, "logps/rejected": -327.49188232421875, "loss": 0.278, "rewards/chosen": 1.189186453819275, "rewards/margins": 2.7069222927093506, "rewards/rejected": -1.5177358388900757, "step": 4521 }, { "epoch": 0.23968409614925926, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12811519.2, "logits/rejected": -38149272.0, "logps/chosen": -305.6392822265625, "logps/rejected": -422.1338704427083, "loss": 0.3734, "rewards/chosen": -0.1604226589202881, "rewards/margins": 2.2450551827748617, "rewards/rejected": -2.40547784169515, "step": 4522 }, { "epoch": 0.2397371001510614, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29751636.0, "logits/rejected": -61645344.0, "logps/chosen": -328.841796875, "logps/rejected": -382.39007568359375, "loss": 0.2188, "rewards/chosen": 1.120461106300354, "rewards/margins": 2.9951547384262085, "rewards/rejected": -1.8746936321258545, "step": 4523 }, { "epoch": 0.23979010415286353, "grad_norm": 65.0, "kl": 0.8851165771484375, "learning_rate": 5e-07, "logits/chosen": -39331091.2, "logits/rejected": -24918229.333333332, "logps/chosen": -212.9115478515625, "logps/rejected": -419.6584879557292, "loss": 0.3382, "rewards/chosen": 0.5703635215759277, "rewards/margins": 2.005742073059082, "rewards/rejected": -1.4353785514831543, "step": 4524 }, { "epoch": 0.23984310815466567, "grad_norm": 62.25, "kl": 1.4811248779296875, "learning_rate": 5e-07, "logits/chosen": -31560480.0, "logits/rejected": -58948917.333333336, "logps/chosen": -333.0749267578125, "logps/rejected": -330.11753336588544, "loss": 0.3366, "rewards/chosen": 0.5091290950775147, "rewards/margins": 2.7815374215443933, "rewards/rejected": -2.2724083264668784, "step": 4525 }, { "epoch": 0.2398961121564678, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54810741.333333336, "logits/rejected": -43466144.0, "logps/chosen": -321.06129964192706, "logps/rejected": -258.3749755859375, "loss": 0.2634, "rewards/chosen": 0.159407248099645, "rewards/margins": 2.217096151908239, "rewards/rejected": -2.057688903808594, "step": 4526 }, { "epoch": 0.23994911615826994, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19055364.0, "logits/rejected": -22804677.333333332, "logps/chosen": -354.1827392578125, "logps/rejected": -297.7430013020833, "loss": 0.3147, "rewards/chosen": -0.004602812230587006, "rewards/margins": 1.874684748550256, "rewards/rejected": -1.879287560780843, "step": 4527 }, { "epoch": 0.24000212016007208, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18904222.4, "logits/rejected": -69026922.66666667, "logps/chosen": -324.428076171875, "logps/rejected": -455.8521728515625, "loss": 0.3208, "rewards/chosen": 0.19751592874526977, "rewards/margins": 2.986189329624176, "rewards/rejected": -2.7886734008789062, "step": 4528 }, { "epoch": 0.24005512416187422, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56164892.0, "logits/rejected": -11407938.0, "logps/chosen": -442.9323425292969, "logps/rejected": -351.261962890625, "loss": 0.3031, "rewards/chosen": 0.09507675468921661, "rewards/margins": 2.542965844273567, "rewards/rejected": -2.4478890895843506, "step": 4529 }, { "epoch": 0.24010812816367635, "grad_norm": 43.25, "kl": 0.5325469970703125, "learning_rate": 5e-07, "logits/chosen": -6481923.333333333, "logits/rejected": -187422800.0, "logps/chosen": -139.0525105794271, "logps/rejected": -165.90570068359375, "loss": 0.3759, "rewards/chosen": 0.26360857486724854, "rewards/margins": 2.3697680234909058, "rewards/rejected": -2.1061594486236572, "step": 4530 }, { "epoch": 0.2401611321654785, "grad_norm": 61.5, "kl": 1.0305099487304688, "learning_rate": 5e-07, "logits/chosen": -25211421.333333332, "logits/rejected": -44689264.0, "logps/chosen": -362.3756103515625, "logps/rejected": -103.52287292480469, "loss": 0.4293, "rewards/chosen": 0.2633681297302246, "rewards/margins": 1.1470080614089966, "rewards/rejected": -0.883639931678772, "step": 4531 }, { "epoch": 0.24021413616728063, "grad_norm": 83.0, "kl": 0.8368778228759766, "learning_rate": 5e-07, "logits/chosen": -39899120.0, "logits/rejected": -478246.0, "logps/chosen": -270.363037109375, "logps/rejected": -374.98828125, "loss": 0.3745, "rewards/chosen": 0.49383392333984377, "rewards/margins": 1.3684514204661051, "rewards/rejected": -0.8746174971262614, "step": 4532 }, { "epoch": 0.24026714016908277, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12236418.0, "logits/rejected": -42205180.0, "logps/chosen": -344.36260986328125, "logps/rejected": -61.036956787109375, "loss": 0.3854, "rewards/chosen": 0.20839348435401917, "rewards/margins": 1.0157150328159332, "rewards/rejected": -0.8073215484619141, "step": 4533 }, { "epoch": 0.2403201441708849, "grad_norm": 43.0, "kl": 0.21483421325683594, "learning_rate": 5e-07, "logits/chosen": -25928012.8, "logits/rejected": -46215402.666666664, "logps/chosen": -178.60147705078126, "logps/rejected": -354.69775390625, "loss": 0.3294, "rewards/chosen": 0.1402444839477539, "rewards/margins": 2.7012465794881186, "rewards/rejected": -2.5610020955403647, "step": 4534 }, { "epoch": 0.24037314817268704, "grad_norm": 73.0, "kl": 1.497314453125, "learning_rate": 5e-07, "logits/chosen": 21395990.4, "logits/rejected": -35502397.333333336, "logps/chosen": -483.52158203125, "logps/rejected": -411.6376546223958, "loss": 0.3352, "rewards/chosen": 0.3899643898010254, "rewards/margins": 2.8618817011515296, "rewards/rejected": -2.4719173113505044, "step": 4535 }, { "epoch": 0.24042615217448918, "grad_norm": 59.0, "kl": 1.7944984436035156, "learning_rate": 5e-07, "logits/chosen": -33256454.4, "logits/rejected": -4242626.666666667, "logps/chosen": -445.986279296875, "logps/rejected": -225.7727254231771, "loss": 0.3033, "rewards/chosen": 1.0396013259887695, "rewards/margins": 2.6370978355407715, "rewards/rejected": -1.597496509552002, "step": 4536 }, { "epoch": 0.24047915617629131, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47376248.0, "logits/rejected": -6538892.666666667, "logps/chosen": -189.0654296875, "logps/rejected": -300.1514078776042, "loss": 0.2629, "rewards/chosen": -0.07326126098632812, "rewards/margins": 1.66918150583903, "rewards/rejected": -1.7424427668253581, "step": 4537 }, { "epoch": 0.24053216017809345, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40425689.6, "logits/rejected": -32055168.0, "logps/chosen": -249.08330078125, "logps/rejected": -294.61572265625, "loss": 0.4147, "rewards/chosen": -0.24927423000335694, "rewards/margins": 1.519734724362691, "rewards/rejected": -1.769008954366048, "step": 4538 }, { "epoch": 0.2405851641798956, "grad_norm": 53.25, "kl": 0.6640090942382812, "learning_rate": 5e-07, "logits/chosen": -5603522.666666667, "logits/rejected": -78974579.2, "logps/chosen": -402.5444742838542, "logps/rejected": -534.446044921875, "loss": 0.1958, "rewards/chosen": 1.008650779724121, "rewards/margins": 3.8880205154418945, "rewards/rejected": -2.8793697357177734, "step": 4539 }, { "epoch": 0.24063816818169773, "grad_norm": 49.75, "kl": 0.1724834442138672, "learning_rate": 5e-07, "logits/chosen": -42401683.2, "logits/rejected": -12264909.333333334, "logps/chosen": -302.0382568359375, "logps/rejected": -322.0255940755208, "loss": 0.3404, "rewards/chosen": 0.14166368246078492, "rewards/margins": 2.1522106846173608, "rewards/rejected": -2.0105470021565757, "step": 4540 }, { "epoch": 0.24069117218349986, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47586844.0, "logits/rejected": -30414014.0, "logps/chosen": -416.4870300292969, "logps/rejected": -440.1447448730469, "loss": 0.2915, "rewards/chosen": 0.8864361047744751, "rewards/margins": 2.6977776288986206, "rewards/rejected": -1.8113415241241455, "step": 4541 }, { "epoch": 0.240744176185302, "grad_norm": 66.5, "kl": 0.3297691345214844, "learning_rate": 5e-07, "logits/chosen": -40184384.0, "logits/rejected": -13983078.0, "logps/chosen": -337.2344970703125, "logps/rejected": -268.13543701171875, "loss": 0.3426, "rewards/chosen": -0.1382366120815277, "rewards/margins": 1.7861949503421783, "rewards/rejected": -1.924431562423706, "step": 4542 }, { "epoch": 0.24079718018710414, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13915809.6, "logits/rejected": -30511952.0, "logps/chosen": -234.474267578125, "logps/rejected": -467.8680013020833, "loss": 0.289, "rewards/chosen": 0.5426437377929687, "rewards/margins": 2.7504358609517414, "rewards/rejected": -2.207792123158773, "step": 4543 }, { "epoch": 0.24085018418890627, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10342576.0, "logits/rejected": 8315795.2, "logps/chosen": -452.5128173828125, "logps/rejected": -185.07001953125, "loss": 0.2905, "rewards/chosen": 0.6501695315043131, "rewards/margins": 1.9503806749979655, "rewards/rejected": -1.3002111434936523, "step": 4544 }, { "epoch": 0.2409031881907084, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39492965.333333336, "logits/rejected": -20878728.0, "logps/chosen": -150.1754150390625, "logps/rejected": -331.8428955078125, "loss": 0.3376, "rewards/chosen": -0.10253715515136719, "rewards/margins": 1.5742813110351563, "rewards/rejected": -1.6768184661865235, "step": 4545 }, { "epoch": 0.24095619219251052, "grad_norm": 66.0, "kl": 2.312246322631836, "learning_rate": 5e-07, "logits/chosen": -19984420.0, "logits/rejected": -11897548.0, "logps/chosen": -961.6697998046875, "logps/rejected": -148.99697875976562, "loss": 0.2439, "rewards/chosen": 0.9359956979751587, "rewards/margins": 2.5851588249206543, "rewards/rejected": -1.6491631269454956, "step": 4546 }, { "epoch": 0.24100919619431266, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -111295210.66666667, "logits/rejected": -13643950.4, "logps/chosen": -249.566162109375, "logps/rejected": -237.870166015625, "loss": 0.4092, "rewards/chosen": -0.20494844516118368, "rewards/margins": 0.850866679350535, "rewards/rejected": -1.0558151245117187, "step": 4547 }, { "epoch": 0.2410622001961148, "grad_norm": 68.0, "kl": 1.241476058959961, "learning_rate": 5e-07, "logits/chosen": -25719611.2, "logits/rejected": 94156.77083333333, "logps/chosen": -496.9111328125, "logps/rejected": -97.61409505208333, "loss": 0.3499, "rewards/chosen": 0.41019654273986816, "rewards/margins": 1.5430519580841064, "rewards/rejected": -1.1328554153442383, "step": 4548 }, { "epoch": 0.24111520419791693, "grad_norm": 62.0, "kl": 0.2622489929199219, "learning_rate": 5e-07, "logits/chosen": -16485072.0, "logits/rejected": 4757918.0, "logps/chosen": -199.8546142578125, "logps/rejected": -24.625816345214844, "loss": 0.5369, "rewards/chosen": -0.19694321496146067, "rewards/margins": 0.15170753853661673, "rewards/rejected": -0.3486507534980774, "step": 4549 }, { "epoch": 0.24116820819971907, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14485707.0, "logits/rejected": -56852808.0, "logps/chosen": -246.66888427734375, "logps/rejected": -618.503173828125, "loss": 0.3151, "rewards/chosen": -0.09555597603321075, "rewards/margins": 2.4198000878095627, "rewards/rejected": -2.5153560638427734, "step": 4550 }, { "epoch": 0.2412212122015212, "grad_norm": 49.75, "kl": 0.681736946105957, "learning_rate": 5e-07, "logits/chosen": 7920602.0, "logits/rejected": -22766294.0, "logps/chosen": -556.3858642578125, "logps/rejected": -258.3805847167969, "loss": 0.2816, "rewards/chosen": 0.6765429973602295, "rewards/margins": 2.813126802444458, "rewards/rejected": -2.1365838050842285, "step": 4551 }, { "epoch": 0.24127421620332334, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56530356.0, "logits/rejected": -14298483.0, "logps/chosen": -192.060791015625, "logps/rejected": -274.5792236328125, "loss": 0.2943, "rewards/chosen": 0.1411559134721756, "rewards/margins": 2.2439555674791336, "rewards/rejected": -2.102799654006958, "step": 4552 }, { "epoch": 0.24132722020512548, "grad_norm": 60.0, "kl": 0.28849029541015625, "learning_rate": 5e-07, "logits/chosen": -61202752.0, "logits/rejected": -1547222.6666666667, "logps/chosen": -407.7875244140625, "logps/rejected": -337.9386393229167, "loss": 0.3752, "rewards/chosen": 0.09898680448532104, "rewards/margins": 1.6671234965324402, "rewards/rejected": -1.5681366920471191, "step": 4553 }, { "epoch": 0.24138022420692762, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -329510.5, "logits/rejected": -18634942.4, "logps/chosen": -167.22933959960938, "logps/rejected": -198.516162109375, "loss": 0.3666, "rewards/chosen": -0.5515259901682535, "rewards/margins": 0.7186318238576254, "rewards/rejected": -1.270157814025879, "step": 4554 }, { "epoch": 0.24143322820872976, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26126322.285714287, "logits/rejected": -55196760.0, "logps/chosen": -222.49968610491072, "logps/rejected": -190.2353057861328, "loss": 0.5049, "rewards/chosen": -0.24739112172807967, "rewards/margins": 1.6255581208637782, "rewards/rejected": -1.872949242591858, "step": 4555 }, { "epoch": 0.2414862322105319, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14819192.0, "logits/rejected": -36263450.666666664, "logps/chosen": -314.93194580078125, "logps/rejected": -378.8516438802083, "loss": 0.2874, "rewards/chosen": -0.7517068982124329, "rewards/margins": 1.3664778272310891, "rewards/rejected": -2.118184725443522, "step": 4556 }, { "epoch": 0.24153923621233403, "grad_norm": 65.5, "kl": 1.7804794311523438, "learning_rate": 5e-07, "logits/chosen": -9430184.0, "logits/rejected": -9265288.0, "logps/chosen": -351.445556640625, "logps/rejected": -153.293212890625, "loss": 0.3877, "rewards/chosen": 0.5150420665740967, "rewards/margins": 1.856468915939331, "rewards/rejected": -1.3414268493652344, "step": 4557 }, { "epoch": 0.24159224021413617, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50136245.333333336, "logits/rejected": -15144198.0, "logps/chosen": -196.8959757486979, "logps/rejected": -169.58255004882812, "loss": 0.4452, "rewards/chosen": 0.008660828073819479, "rewards/margins": 0.96238545080026, "rewards/rejected": -0.9537246227264404, "step": 4558 }, { "epoch": 0.2416452442159383, "grad_norm": 47.5, "kl": 0.4786949157714844, "learning_rate": 5e-07, "logits/chosen": -19095074.0, "logits/rejected": -8196881.0, "logps/chosen": -445.2816467285156, "logps/rejected": -153.69529724121094, "loss": 0.2585, "rewards/chosen": 0.7718753218650818, "rewards/margins": 2.582474648952484, "rewards/rejected": -1.8105993270874023, "step": 4559 }, { "epoch": 0.24169824821774044, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4145728.3333333335, "logits/rejected": -48982454.4, "logps/chosen": -222.6993408203125, "logps/rejected": -165.13240966796874, "loss": 0.2437, "rewards/chosen": 0.8912905057271322, "rewards/margins": 2.2808574994405113, "rewards/rejected": -1.389566993713379, "step": 4560 }, { "epoch": 0.24175125221954258, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2107861.0, "logits/rejected": -16746747.2, "logps/chosen": -267.20263671875, "logps/rejected": -239.217822265625, "loss": 0.2885, "rewards/chosen": 0.9287114938100179, "rewards/margins": 2.1758637269337973, "rewards/rejected": -1.2471522331237792, "step": 4561 }, { "epoch": 0.24180425622134472, "grad_norm": 58.25, "kl": 2.152202606201172, "learning_rate": 5e-07, "logits/chosen": -32790924.8, "logits/rejected": -10086090.666666666, "logps/chosen": -370.0197265625, "logps/rejected": -197.93994140625, "loss": 0.3731, "rewards/chosen": 0.5889477729797363, "rewards/margins": 1.719023068745931, "rewards/rejected": -1.1300752957661946, "step": 4562 }, { "epoch": 0.24185726022314685, "grad_norm": 45.25, "kl": 0.136322021484375, "learning_rate": 5e-07, "logits/chosen": -31487872.0, "logits/rejected": -9883912.0, "logps/chosen": -417.8243001302083, "logps/rejected": -200.02177734375, "loss": 0.2466, "rewards/chosen": 0.5136006673177084, "rewards/margins": 2.309993680318197, "rewards/rejected": -1.7963930130004884, "step": 4563 }, { "epoch": 0.241910264224949, "grad_norm": 50.75, "kl": 0.6792697906494141, "learning_rate": 5e-07, "logits/chosen": -42433670.4, "logits/rejected": -82369632.0, "logps/chosen": -402.429736328125, "logps/rejected": -569.6680501302084, "loss": 0.3204, "rewards/chosen": 0.28815245628356934, "rewards/margins": 2.6457809607187905, "rewards/rejected": -2.357628504435221, "step": 4564 }, { "epoch": 0.24196326822675113, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2873608.0, "logits/rejected": -24056962.0, "logps/chosen": -282.0511474609375, "logps/rejected": -265.6000061035156, "loss": 0.2713, "rewards/chosen": 0.5196479558944702, "rewards/margins": 2.394264817237854, "rewards/rejected": -1.8746168613433838, "step": 4565 }, { "epoch": 0.24201627222855326, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68707504.0, "logits/rejected": -28955840.0, "logps/chosen": -374.6428527832031, "logps/rejected": -397.1839599609375, "loss": 0.3232, "rewards/chosen": 0.14865170419216156, "rewards/margins": 1.9152439683675766, "rewards/rejected": -1.766592264175415, "step": 4566 }, { "epoch": 0.2420692762303554, "grad_norm": 56.25, "kl": 0.48084545135498047, "learning_rate": 5e-07, "logits/chosen": -20503088.0, "logits/rejected": -15824530.666666666, "logps/chosen": -355.70587158203125, "logps/rejected": -250.77596028645834, "loss": 0.2476, "rewards/chosen": 0.5398727655410767, "rewards/margins": 2.024012843767802, "rewards/rejected": -1.4841400782267253, "step": 4567 }, { "epoch": 0.24212228023215754, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10505994.0, "logits/rejected": -2916207.3333333335, "logps/chosen": -144.824951171875, "logps/rejected": -297.33506266276044, "loss": 0.3359, "rewards/chosen": -0.5628620386123657, "rewards/margins": 1.2047206958134968, "rewards/rejected": -1.7675827344258626, "step": 4568 }, { "epoch": 0.24217528423395968, "grad_norm": 70.0, "kl": 1.6548805236816406, "learning_rate": 5e-07, "logits/chosen": -60708264.0, "logits/rejected": 12360209.0, "logps/chosen": -784.1726684570312, "logps/rejected": -223.03416442871094, "loss": 0.3569, "rewards/chosen": 0.630282461643219, "rewards/margins": 1.3819757103919983, "rewards/rejected": -0.7516932487487793, "step": 4569 }, { "epoch": 0.2422282882357618, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28152456.0, "logits/rejected": -36546248.0, "logps/chosen": -149.9037628173828, "logps/rejected": -251.10943603515625, "loss": 0.3572, "rewards/chosen": 0.1518464982509613, "rewards/margins": 1.373783677816391, "rewards/rejected": -1.2219371795654297, "step": 4570 }, { "epoch": 0.24228129223756395, "grad_norm": 42.75, "kl": 0.9640140533447266, "learning_rate": 5e-07, "logits/chosen": -27300645.333333332, "logits/rejected": -133149016.0, "logps/chosen": -141.6165568033854, "logps/rejected": -572.1173706054688, "loss": 0.3599, "rewards/chosen": 0.28986865282058716, "rewards/margins": 3.6956868767738342, "rewards/rejected": -3.405818223953247, "step": 4571 }, { "epoch": 0.24233429623936606, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1453065.5, "logits/rejected": -14498885.333333334, "logps/chosen": -35.663414001464844, "logps/rejected": -175.6388956705729, "loss": 0.3401, "rewards/chosen": -0.0074333250522613525, "rewards/margins": 1.0505991876125336, "rewards/rejected": -1.058032512664795, "step": 4572 }, { "epoch": 0.2423873002411682, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34248916.0, "logits/rejected": -39315452.0, "logps/chosen": -394.8581237792969, "logps/rejected": -414.7608337402344, "loss": 0.2991, "rewards/chosen": 0.3635249733924866, "rewards/margins": 2.1290488839149475, "rewards/rejected": -1.765523910522461, "step": 4573 }, { "epoch": 0.24244030424297033, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10549264.0, "logits/rejected": -28830396.0, "logps/chosen": -227.66468811035156, "logps/rejected": -333.1151123046875, "loss": 0.3126, "rewards/chosen": 0.2726319432258606, "rewards/margins": 1.841520369052887, "rewards/rejected": -1.5688884258270264, "step": 4574 }, { "epoch": 0.24249330824477247, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7011087.333333333, "logits/rejected": -28636438.4, "logps/chosen": -57.962178548177086, "logps/rejected": -298.849658203125, "loss": 0.316, "rewards/chosen": -0.04014631857474645, "rewards/margins": 1.5784634913007418, "rewards/rejected": -1.6186098098754882, "step": 4575 }, { "epoch": 0.2425463122465746, "grad_norm": 80.5, "kl": 0.5471649169921875, "learning_rate": 5e-07, "logits/chosen": -61382357.333333336, "logits/rejected": -60631988.0, "logps/chosen": -646.8743896484375, "logps/rejected": -588.0850219726562, "loss": 0.33, "rewards/chosen": 0.5622148116429647, "rewards/margins": 2.9702105124791465, "rewards/rejected": -2.4079957008361816, "step": 4576 }, { "epoch": 0.24259931624837675, "grad_norm": 72.5, "kl": 0.23892974853515625, "learning_rate": 5e-07, "logits/chosen": -35432776.0, "logits/rejected": -15163614.0, "logps/chosen": -379.2088216145833, "logps/rejected": -398.42620849609375, "loss": 0.4181, "rewards/chosen": 0.05119126538435618, "rewards/margins": 2.290505583087603, "rewards/rejected": -2.239314317703247, "step": 4577 }, { "epoch": 0.24265232025017888, "grad_norm": 46.0, "kl": 1.0017852783203125, "learning_rate": 5e-07, "logits/chosen": -17010162.0, "logits/rejected": -13676620.0, "logps/chosen": -779.9612426757812, "logps/rejected": -256.8066813151042, "loss": 0.172, "rewards/chosen": 1.2998710870742798, "rewards/margins": 3.286786675453186, "rewards/rejected": -1.9869155883789062, "step": 4578 }, { "epoch": 0.24270532425198102, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29627612.0, "logits/rejected": -18787292.0, "logps/chosen": -483.54400634765625, "logps/rejected": -275.5853271484375, "loss": 0.2133, "rewards/chosen": 1.0820999145507812, "rewards/margins": 3.4646615982055664, "rewards/rejected": -2.382561683654785, "step": 4579 }, { "epoch": 0.24275832825378316, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86750816.0, "logits/rejected": -33602517.333333336, "logps/chosen": -290.2458740234375, "logps/rejected": -191.5852254231771, "loss": 0.3305, "rewards/chosen": 0.34230427742004393, "rewards/margins": 1.8346904595692952, "rewards/rejected": -1.4923861821492512, "step": 4580 }, { "epoch": 0.2428113322555853, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33593274.666666664, "logits/rejected": -3158535.2, "logps/chosen": -143.5225830078125, "logps/rejected": -513.557470703125, "loss": 0.196, "rewards/chosen": 0.4429456790288289, "rewards/margins": 3.7153165896733604, "rewards/rejected": -3.2723709106445313, "step": 4581 }, { "epoch": 0.24286433625738743, "grad_norm": 46.25, "kl": 1.015228271484375, "learning_rate": 5e-07, "logits/chosen": -78837592.0, "logits/rejected": -140436800.0, "logps/chosen": -629.98388671875, "logps/rejected": -329.3198649088542, "loss": 0.1739, "rewards/chosen": 1.102685570716858, "rewards/margins": 3.4591373999913535, "rewards/rejected": -2.3564518292744956, "step": 4582 }, { "epoch": 0.24291734025918957, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5072577.6, "logits/rejected": -29565376.0, "logps/chosen": -155.5859619140625, "logps/rejected": -181.15555826822916, "loss": 0.4135, "rewards/chosen": -0.238007116317749, "rewards/margins": 1.5588848908742268, "rewards/rejected": -1.7968920071919758, "step": 4583 }, { "epoch": 0.2429703442609917, "grad_norm": 55.25, "kl": 0.28616905212402344, "learning_rate": 5e-07, "logits/chosen": -30811618.666666668, "logits/rejected": -24416636.0, "logps/chosen": -234.72281901041666, "logps/rejected": -151.49520874023438, "loss": 0.459, "rewards/chosen": -0.17057836055755615, "rewards/margins": 1.570568561553955, "rewards/rejected": -1.7411469221115112, "step": 4584 }, { "epoch": 0.24302334826279384, "grad_norm": 69.0, "kl": 1.6557331085205078, "learning_rate": 5e-07, "logits/chosen": -22282873.333333332, "logits/rejected": -689315.3125, "logps/chosen": -421.1567789713542, "logps/rejected": -65.31529998779297, "loss": 0.4069, "rewards/chosen": 0.4421783685684204, "rewards/margins": 1.414170265197754, "rewards/rejected": -0.9719918966293335, "step": 4585 }, { "epoch": 0.24307635226459598, "grad_norm": 51.5, "kl": 0.8379135131835938, "learning_rate": 5e-07, "logits/chosen": -15754937.6, "logits/rejected": -43760954.666666664, "logps/chosen": -186.33670654296876, "logps/rejected": -300.1793619791667, "loss": 0.4024, "rewards/chosen": -0.04788598418235779, "rewards/margins": 1.5009294251600902, "rewards/rejected": -1.548815409342448, "step": 4586 }, { "epoch": 0.24312935626639812, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10684716.8, "logits/rejected": -14192645.333333334, "logps/chosen": -125.2919677734375, "logps/rejected": -348.6798095703125, "loss": 0.3849, "rewards/chosen": -0.16291409730911255, "rewards/margins": 1.7865168452262878, "rewards/rejected": -1.9494309425354004, "step": 4587 }, { "epoch": 0.24318236026820025, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4274386.0, "logits/rejected": -7584934.0, "logps/chosen": -198.5410919189453, "logps/rejected": -209.38418579101562, "loss": 0.371, "rewards/chosen": 0.2640109956264496, "rewards/margins": 1.289620965719223, "rewards/rejected": -1.0256099700927734, "step": 4588 }, { "epoch": 0.2432353642700024, "grad_norm": 65.5, "kl": 1.4209060668945312, "learning_rate": 5e-07, "logits/chosen": -31332547.2, "logits/rejected": 4698065.0, "logps/chosen": -499.009228515625, "logps/rejected": -65.87748209635417, "loss": 0.3831, "rewards/chosen": 0.6131136894226075, "rewards/margins": 1.3289923350016277, "rewards/rejected": -0.7158786455790201, "step": 4589 }, { "epoch": 0.24328836827180453, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4862514.285714285, "logits/rejected": -400979.6875, "logps/chosen": -121.97719029017857, "logps/rejected": -85.40835571289062, "loss": 0.4792, "rewards/chosen": -0.08518074240003314, "rewards/margins": 1.3850509098597936, "rewards/rejected": -1.4702316522598267, "step": 4590 }, { "epoch": 0.24334137227360667, "grad_norm": 43.75, "kl": 0.37775230407714844, "learning_rate": 5e-07, "logits/chosen": -514099.25, "logits/rejected": 4561986.4, "logps/chosen": -242.80098470052084, "logps/rejected": -183.02818603515624, "loss": 0.2594, "rewards/chosen": 0.7408839066823324, "rewards/margins": 2.1313917001088463, "rewards/rejected": -1.3905077934265138, "step": 4591 }, { "epoch": 0.2433943762754088, "grad_norm": 55.25, "kl": 1.18280029296875, "learning_rate": 5e-07, "logits/chosen": -22500388.0, "logits/rejected": -4010998.0, "logps/chosen": -638.756103515625, "logps/rejected": -388.8728942871094, "loss": 0.2287, "rewards/chosen": 1.186418056488037, "rewards/margins": 3.013962507247925, "rewards/rejected": -1.8275444507598877, "step": 4592 }, { "epoch": 0.24344738027721094, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27930589.333333332, "logits/rejected": -21002251.2, "logps/chosen": -189.76200358072916, "logps/rejected": -195.5691650390625, "loss": 0.2948, "rewards/chosen": -0.09488589564959209, "rewards/margins": 1.6188395758469898, "rewards/rejected": -1.713725471496582, "step": 4593 }, { "epoch": 0.24350038427901308, "grad_norm": 56.0, "kl": 0.4674072265625, "learning_rate": 5e-07, "logits/chosen": -22468921.6, "logits/rejected": 15812780.0, "logps/chosen": -275.9286376953125, "logps/rejected": -456.5668131510417, "loss": 0.3648, "rewards/chosen": 0.001211261749267578, "rewards/margins": 2.0875874519348145, "rewards/rejected": -2.086376190185547, "step": 4594 }, { "epoch": 0.24355338828081521, "grad_norm": 71.0, "kl": 0.31708621978759766, "learning_rate": 5e-07, "logits/chosen": -58221477.333333336, "logits/rejected": -27390394.0, "logps/chosen": -556.5308024088541, "logps/rejected": -334.13873291015625, "loss": 0.4002, "rewards/chosen": 0.111990491549174, "rewards/margins": 2.281011621157328, "rewards/rejected": -2.1690211296081543, "step": 4595 }, { "epoch": 0.24360639228261735, "grad_norm": 75.5, "kl": 1.7224254608154297, "learning_rate": 5e-07, "logits/chosen": -23564286.4, "logits/rejected": -40866853.333333336, "logps/chosen": -754.72646484375, "logps/rejected": -331.67799886067706, "loss": 0.3424, "rewards/chosen": 0.41976561546325686, "rewards/margins": 1.9096633434295653, "rewards/rejected": -1.4898977279663086, "step": 4596 }, { "epoch": 0.24365939628441946, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29352304.0, "logits/rejected": -5557903.0, "logps/chosen": -304.01068115234375, "logps/rejected": -179.89369201660156, "loss": 0.2974, "rewards/chosen": 0.6331738233566284, "rewards/margins": 1.9157358407974243, "rewards/rejected": -1.282562017440796, "step": 4597 }, { "epoch": 0.2437124002862216, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38143584.0, "logits/rejected": -24068896.0, "logps/chosen": -300.80084228515625, "logps/rejected": -291.7640904017857, "loss": 0.192, "rewards/chosen": 0.7570129632949829, "rewards/margins": 2.9472325699669972, "rewards/rejected": -2.1902196066720143, "step": 4598 }, { "epoch": 0.24376540428802373, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47504280.0, "logits/rejected": -15325906.666666666, "logps/chosen": -575.043212890625, "logps/rejected": -133.9330037434896, "loss": 0.3294, "rewards/chosen": -0.07933806627988815, "rewards/margins": 1.0671587064862251, "rewards/rejected": -1.1464967727661133, "step": 4599 }, { "epoch": 0.24381840828982587, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26310648.0, "logits/rejected": -31335449.6, "logps/chosen": -128.7823689778646, "logps/rejected": -478.180029296875, "loss": 0.2753, "rewards/chosen": 0.21221856276194254, "rewards/margins": 2.2482746998469034, "rewards/rejected": -2.036056137084961, "step": 4600 }, { "epoch": 0.243871412291628, "grad_norm": 48.25, "kl": 2.561163902282715, "learning_rate": 5e-07, "logits/chosen": -27334988.8, "logits/rejected": 1816840.6666666667, "logps/chosen": -545.690234375, "logps/rejected": -36.37218221028646, "loss": 0.381, "rewards/chosen": 0.6942131042480468, "rewards/margins": 1.8141780217488606, "rewards/rejected": -1.1199649175008137, "step": 4601 }, { "epoch": 0.24392441629343015, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3275905.25, "logits/rejected": -13866194.285714285, "logps/chosen": -10.467418670654297, "logps/rejected": -394.31459263392856, "loss": 0.2503, "rewards/chosen": 0.010547637939453125, "rewards/margins": 1.9264963694981165, "rewards/rejected": -1.9159487315586634, "step": 4602 }, { "epoch": 0.24397742029523228, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36085366.4, "logits/rejected": -53907546.666666664, "logps/chosen": -332.56572265625, "logps/rejected": -388.8321126302083, "loss": 0.3391, "rewards/chosen": 0.15596375465393067, "rewards/margins": 2.7362688541412354, "rewards/rejected": -2.5803050994873047, "step": 4603 }, { "epoch": 0.24403042429703442, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3880236.0, "logits/rejected": -9355494.4, "logps/chosen": -296.3524169921875, "logps/rejected": -269.20029296875, "loss": 0.3062, "rewards/chosen": 0.052764892578125, "rewards/margins": 1.9144403457641601, "rewards/rejected": -1.8616754531860351, "step": 4604 }, { "epoch": 0.24408342829883656, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9004204.0, "logits/rejected": -34279158.4, "logps/chosen": -214.8947957356771, "logps/rejected": -311.3410888671875, "loss": 0.2861, "rewards/chosen": 0.09377798438072205, "rewards/margins": 1.8637689054012299, "rewards/rejected": -1.7699909210205078, "step": 4605 }, { "epoch": 0.2441364323006387, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94430704.0, "logits/rejected": -32044370.666666668, "logps/chosen": -322.17169189453125, "logps/rejected": -352.15966796875, "loss": 0.2094, "rewards/chosen": 0.15232086181640625, "rewards/margins": 2.8507506052652993, "rewards/rejected": -2.698429743448893, "step": 4606 }, { "epoch": 0.24418943630244083, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17474154.0, "logits/rejected": -58727958.85714286, "logps/chosen": -239.0344696044922, "logps/rejected": -297.26346261160717, "loss": 0.3252, "rewards/chosen": -0.2955993711948395, "rewards/margins": 0.7247954308986664, "rewards/rejected": -1.0203948020935059, "step": 4607 }, { "epoch": 0.24424244030424297, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17694018.0, "logits/rejected": -9405631.333333334, "logps/chosen": -143.26551818847656, "logps/rejected": -249.27652994791666, "loss": 0.3118, "rewards/chosen": -0.13022193312644958, "rewards/margins": 1.318210353453954, "rewards/rejected": -1.4484322865804036, "step": 4608 }, { "epoch": 0.2442954443060451, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16728470.0, "logits/rejected": -1778696.1666666667, "logps/chosen": -90.54263305664062, "logps/rejected": -301.0445963541667, "loss": 0.2807, "rewards/chosen": -0.02133578062057495, "rewards/margins": 1.4223891695340474, "rewards/rejected": -1.4437249501546223, "step": 4609 }, { "epoch": 0.24434844830784724, "grad_norm": 38.75, "kl": 0.13478660583496094, "learning_rate": 5e-07, "logits/chosen": -51093556.0, "logits/rejected": -32090884.0, "logps/chosen": -765.8146362304688, "logps/rejected": -507.7637939453125, "loss": 0.2274, "rewards/chosen": 0.8896116018295288, "rewards/margins": 3.567160487174988, "rewards/rejected": -2.677548885345459, "step": 4610 }, { "epoch": 0.24440145230964938, "grad_norm": 82.5, "kl": 2.528961181640625, "learning_rate": 5e-07, "logits/chosen": -46242138.666666664, "logits/rejected": -37985560.0, "logps/chosen": -741.0734049479166, "logps/rejected": -398.7724609375, "loss": 0.3696, "rewards/chosen": 0.5884501139322916, "rewards/margins": 2.3971659342447915, "rewards/rejected": -1.8087158203125, "step": 4611 }, { "epoch": 0.24445445631145152, "grad_norm": 45.0, "kl": 0.14107322692871094, "learning_rate": 5e-07, "logits/chosen": 587344.875, "logits/rejected": -14701448.0, "logps/chosen": -128.57891845703125, "logps/rejected": -359.4156494140625, "loss": 0.3281, "rewards/chosen": 0.21689248085021973, "rewards/margins": 1.7221043109893799, "rewards/rejected": -1.5052118301391602, "step": 4612 }, { "epoch": 0.24450746031325366, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39386096.0, "logits/rejected": -31356612.0, "logps/chosen": -281.947998046875, "logps/rejected": -206.2742156982422, "loss": 0.3202, "rewards/chosen": 0.23031030595302582, "rewards/margins": 1.8587450832128525, "rewards/rejected": -1.6284347772598267, "step": 4613 }, { "epoch": 0.2445604643150558, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32877464.0, "logits/rejected": -37968870.4, "logps/chosen": -191.00541178385416, "logps/rejected": -377.802783203125, "loss": 0.2802, "rewards/chosen": -0.011958817640940348, "rewards/margins": 2.0264626224835713, "rewards/rejected": -2.0384214401245115, "step": 4614 }, { "epoch": 0.24461346831685793, "grad_norm": 43.0, "kl": 0.5295677185058594, "learning_rate": 5e-07, "logits/chosen": -21739642.666666668, "logits/rejected": -27158080.0, "logps/chosen": -263.91046142578125, "logps/rejected": -346.689892578125, "loss": 0.2774, "rewards/chosen": 0.3899892171223958, "rewards/margins": 2.51292241414388, "rewards/rejected": -2.122933197021484, "step": 4615 }, { "epoch": 0.24466647231866007, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -593576.0, "logits/rejected": -11975044.0, "logps/chosen": -100.39566040039062, "logps/rejected": -501.04815673828125, "loss": 0.3599, "rewards/chosen": -0.031387291848659515, "rewards/margins": 2.662760056555271, "rewards/rejected": -2.6941473484039307, "step": 4616 }, { "epoch": 0.2447194763204622, "grad_norm": 56.75, "kl": 0.8400726318359375, "learning_rate": 5e-07, "logits/chosen": -44428496.0, "logits/rejected": -28838050.666666668, "logps/chosen": -256.8415771484375, "logps/rejected": -293.47332763671875, "loss": 0.4061, "rewards/chosen": 0.0020767331123352053, "rewards/margins": 1.711856472492218, "rewards/rejected": -1.7097797393798828, "step": 4617 }, { "epoch": 0.24477248032226434, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36880294.85714286, "logits/rejected": -58178224.0, "logps/chosen": -357.05650111607144, "logps/rejected": -402.18707275390625, "loss": 0.3937, "rewards/chosen": 0.2997217518942697, "rewards/margins": 2.9998438698904857, "rewards/rejected": -2.700122117996216, "step": 4618 }, { "epoch": 0.24482548432406648, "grad_norm": 57.5, "kl": 0.9746208190917969, "learning_rate": 5e-07, "logits/chosen": -11118430.0, "logits/rejected": -9427858.0, "logps/chosen": -268.0831604003906, "logps/rejected": -312.3577880859375, "loss": 0.3714, "rewards/chosen": 0.07944060862064362, "rewards/margins": 1.4496756941080093, "rewards/rejected": -1.3702350854873657, "step": 4619 }, { "epoch": 0.24487848832586862, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40920866.666666664, "logits/rejected": -45341897.6, "logps/chosen": -335.640380859375, "logps/rejected": -277.07275390625, "loss": 0.2871, "rewards/chosen": 0.5342061519622803, "rewards/margins": 2.2600772380828857, "rewards/rejected": -1.7258710861206055, "step": 4620 }, { "epoch": 0.24493149232767075, "grad_norm": 59.25, "kl": 0.028482437133789062, "learning_rate": 5e-07, "logits/chosen": -44712628.0, "logits/rejected": 3681833.5, "logps/chosen": -307.4270324707031, "logps/rejected": -477.5953369140625, "loss": 0.3241, "rewards/chosen": 0.14565487205982208, "rewards/margins": 1.9922648221254349, "rewards/rejected": -1.8466099500656128, "step": 4621 }, { "epoch": 0.24498449632947286, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4881918.0, "logits/rejected": -14945413.333333334, "logps/chosen": -55.58512878417969, "logps/rejected": -374.2515869140625, "loss": 0.2019, "rewards/chosen": 0.1897689700126648, "rewards/margins": 3.0019707878430686, "rewards/rejected": -2.812201817830404, "step": 4622 }, { "epoch": 0.245037500331275, "grad_norm": 30.875, "kl": 0.04549407958984375, "learning_rate": 5e-07, "logits/chosen": -22418874.666666668, "logits/rejected": -24457088.0, "logps/chosen": -139.49411010742188, "logps/rejected": -241.7423095703125, "loss": 0.2182, "rewards/chosen": 0.2957555850346883, "rewards/margins": 3.089550026257833, "rewards/rejected": -2.7937944412231444, "step": 4623 }, { "epoch": 0.24509050433307714, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49655466.666666664, "logits/rejected": -6491979.2, "logps/chosen": -602.6534830729166, "logps/rejected": -125.8116455078125, "loss": 0.2781, "rewards/chosen": 0.5615530808766683, "rewards/margins": 1.9129713853200276, "rewards/rejected": -1.3514183044433594, "step": 4624 }, { "epoch": 0.24514350833487927, "grad_norm": 55.75, "kl": 1.1869487762451172, "learning_rate": 5e-07, "logits/chosen": -14296416.0, "logps/chosen": -242.27352905273438, "loss": 0.3999, "rewards/chosen": 0.5794451236724854, "step": 4625 }, { "epoch": 0.2451965123366814, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13396801.333333334, "logits/rejected": 1860256.8, "logps/chosen": -218.28519694010416, "logps/rejected": -399.1901123046875, "loss": 0.2819, "rewards/chosen": -0.3744558095932007, "rewards/margins": 2.020820450782776, "rewards/rejected": -2.3952762603759767, "step": 4626 }, { "epoch": 0.24524951633848355, "grad_norm": 47.5, "kl": 0.3719215393066406, "learning_rate": 5e-07, "logits/chosen": -43247757.333333336, "logits/rejected": -6166846.4, "logps/chosen": -292.56813557942706, "logps/rejected": -248.1512451171875, "loss": 0.269, "rewards/chosen": 0.895305871963501, "rewards/margins": 2.4324340343475344, "rewards/rejected": -1.5371281623840332, "step": 4627 }, { "epoch": 0.24530252034028568, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50434868.0, "logits/rejected": -15185661.0, "logps/chosen": -280.5799560546875, "logps/rejected": -185.83526611328125, "loss": 0.3738, "rewards/chosen": 0.04285565763711929, "rewards/margins": 1.2465267553925514, "rewards/rejected": -1.2036710977554321, "step": 4628 }, { "epoch": 0.24535552434208782, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23316546.0, "logits/rejected": -40246264.0, "logps/chosen": -192.43325805664062, "logps/rejected": -357.82745361328125, "loss": 0.2726, "rewards/chosen": 0.3610069155693054, "rewards/margins": 2.345287024974823, "rewards/rejected": -1.9842801094055176, "step": 4629 }, { "epoch": 0.24540852834388996, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43395944.0, "logits/rejected": -11004680.666666666, "logps/chosen": -270.6877746582031, "logps/rejected": -235.44954427083334, "loss": 0.2403, "rewards/chosen": 0.8455474972724915, "rewards/margins": 2.6337577303250628, "rewards/rejected": -1.7882102330525715, "step": 4630 }, { "epoch": 0.2454615323456921, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31683318.4, "logits/rejected": -21818240.0, "logps/chosen": -283.302880859375, "logps/rejected": -112.5030008951823, "loss": 0.4858, "rewards/chosen": -0.20505821704864502, "rewards/margins": 0.41088553269704187, "rewards/rejected": -0.6159437497456869, "step": 4631 }, { "epoch": 0.24551453634749423, "grad_norm": 51.0, "kl": 0.384979248046875, "learning_rate": 5e-07, "logits/chosen": -50436528.0, "logits/rejected": -35020736.0, "logps/chosen": -464.33587646484375, "logps/rejected": -335.2626546223958, "loss": 0.2889, "rewards/chosen": 0.12163849174976349, "rewards/margins": 1.616152639190356, "rewards/rejected": -1.4945141474405925, "step": 4632 }, { "epoch": 0.24556754034929637, "grad_norm": 54.75, "kl": 2.1465587615966797, "learning_rate": 5e-07, "logits/chosen": -54750.5, "logits/rejected": -21176958.666666668, "logps/chosen": -824.1249389648438, "logps/rejected": -251.7708536783854, "loss": 0.1959, "rewards/chosen": 1.4861938953399658, "rewards/margins": 2.9664814472198486, "rewards/rejected": -1.4802875518798828, "step": 4633 }, { "epoch": 0.2456205443510985, "grad_norm": 53.0, "kl": 0.4978020191192627, "learning_rate": 5e-07, "logits/chosen": -26296317.333333332, "logits/rejected": 15224592.0, "logps/chosen": -183.66621907552084, "logps/rejected": -235.3412109375, "loss": 0.2984, "rewards/chosen": 0.21054989099502563, "rewards/margins": 1.9199477791786195, "rewards/rejected": -1.7093978881835938, "step": 4634 }, { "epoch": 0.24567354835290064, "grad_norm": 33.75, "kl": 0.19371795654296875, "learning_rate": 5e-07, "logits/chosen": -17185740.0, "logits/rejected": -6816566.4, "logps/chosen": -53.97689310709635, "logps/rejected": -235.844677734375, "loss": 0.2772, "rewards/chosen": -0.04629809161027273, "rewards/margins": 2.2529717753330867, "rewards/rejected": -2.2992698669433596, "step": 4635 }, { "epoch": 0.24572655235470278, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2419096.0, "logits/rejected": -42129328.0, "logps/chosen": -415.2472839355469, "logps/rejected": -331.7477111816406, "loss": 0.3533, "rewards/chosen": -0.09550777077674866, "rewards/margins": 1.4973903596401215, "rewards/rejected": -1.5928981304168701, "step": 4636 }, { "epoch": 0.24577955635650492, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76256234.66666667, "logits/rejected": -7813331.2, "logps/chosen": -463.0846354166667, "logps/rejected": -115.2171875, "loss": 0.2516, "rewards/chosen": 0.561896284421285, "rewards/margins": 2.2410325606664023, "rewards/rejected": -1.6791362762451172, "step": 4637 }, { "epoch": 0.24583256035830706, "grad_norm": 60.0, "kl": 0.8694496154785156, "learning_rate": 5e-07, "logits/chosen": -39223680.0, "logits/rejected": -21173174.0, "logps/chosen": -372.0712483723958, "logps/rejected": -611.2041625976562, "loss": 0.3843, "rewards/chosen": 0.1665184199810028, "rewards/margins": 3.0704369246959686, "rewards/rejected": -2.903918504714966, "step": 4638 }, { "epoch": 0.2458855643601092, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4965895.0, "logits/rejected": -7659266.5, "logps/chosen": -60.106781005859375, "logps/rejected": -361.0769958496094, "loss": 0.3772, "rewards/chosen": -0.43732282519340515, "rewards/margins": 1.6164019405841827, "rewards/rejected": -2.053724765777588, "step": 4639 }, { "epoch": 0.24593856836191133, "grad_norm": 45.5, "kl": 0.008533477783203125, "learning_rate": 5e-07, "logits/chosen": -15496406.4, "logits/rejected": 3671554.6666666665, "logps/chosen": -171.14632568359374, "logps/rejected": -87.07875569661458, "loss": 0.3927, "rewards/chosen": 0.12313148975372315, "rewards/margins": 1.3667469580968221, "rewards/rejected": -1.2436154683430989, "step": 4640 }, { "epoch": 0.24599157236371347, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27425080.0, "logits/rejected": -22391624.0, "logps/chosen": -256.4412536621094, "logps/rejected": -426.3982340494792, "loss": 0.2871, "rewards/chosen": -0.023295022547245026, "rewards/margins": 2.0254210457205772, "rewards/rejected": -2.0487160682678223, "step": 4641 }, { "epoch": 0.2460445763655156, "grad_norm": 44.75, "kl": 0.10091781616210938, "learning_rate": 5e-07, "logits/chosen": -37503141.333333336, "logits/rejected": -41218028.8, "logps/chosen": -180.55289713541666, "logps/rejected": -152.69766845703126, "loss": 0.3723, "rewards/chosen": 0.4182347853978475, "rewards/margins": 1.1021878798802693, "rewards/rejected": -0.6839530944824219, "step": 4642 }, { "epoch": 0.24609758036731774, "grad_norm": 59.5, "kl": 1.4643135070800781, "learning_rate": 5e-07, "logits/chosen": -38080457.6, "logits/rejected": -26749453.333333332, "logps/chosen": -410.183837890625, "logps/rejected": -178.45513916015625, "loss": 0.3982, "rewards/chosen": 0.5111583232879638, "rewards/margins": 0.9033814430236816, "rewards/rejected": -0.3922231197357178, "step": 4643 }, { "epoch": 0.24615058436911988, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11187467.2, "logits/rejected": -46942688.0, "logps/chosen": -80.86661376953126, "logps/rejected": -549.385009765625, "loss": 0.3836, "rewards/chosen": -0.07152082920074462, "rewards/margins": 2.1802778005599976, "rewards/rejected": -2.251798629760742, "step": 4644 }, { "epoch": 0.24620358837092202, "grad_norm": 37.75, "kl": 0.23929691314697266, "learning_rate": 5e-07, "logits/chosen": -6662802.0, "logits/rejected": -27092613.333333332, "logps/chosen": -116.03675537109375, "logps/rejected": -380.6455891927083, "loss": 0.4044, "rewards/chosen": -0.10791878700256348, "rewards/margins": 1.6307789007822673, "rewards/rejected": -1.7386976877848308, "step": 4645 }, { "epoch": 0.24625659237272415, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7590852.0, "logits/rejected": -15639460.8, "logps/chosen": -286.82065836588544, "logps/rejected": -332.9795654296875, "loss": 0.3339, "rewards/chosen": -0.12452888488769531, "rewards/margins": 1.35863037109375, "rewards/rejected": -1.4831592559814453, "step": 4646 }, { "epoch": 0.24630959637452626, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27065682.0, "logits/rejected": -40710328.0, "logps/chosen": -402.7347412109375, "logps/rejected": -259.684814453125, "loss": 0.3713, "rewards/chosen": 0.20055429637432098, "rewards/margins": 1.1869392842054367, "rewards/rejected": -0.9863849878311157, "step": 4647 }, { "epoch": 0.2463626003763284, "grad_norm": 54.5, "kl": 0.9083175659179688, "learning_rate": 5e-07, "logits/chosen": -86797274.66666667, "logits/rejected": -13945145.6, "logps/chosen": -494.4849446614583, "logps/rejected": -480.733203125, "loss": 0.1997, "rewards/chosen": 0.9588308334350586, "rewards/margins": 3.0691249847412108, "rewards/rejected": -2.110294151306152, "step": 4648 }, { "epoch": 0.24641560437813054, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69075573.33333333, "logits/rejected": -50661670.4, "logps/chosen": -489.5136311848958, "logps/rejected": -372.683984375, "loss": 0.2595, "rewards/chosen": 0.10975545644760132, "rewards/margins": 2.074057328701019, "rewards/rejected": -1.964301872253418, "step": 4649 }, { "epoch": 0.24646860837993267, "grad_norm": 61.0, "kl": 0.37278175354003906, "learning_rate": 5e-07, "logits/chosen": -69143952.0, "logits/rejected": -4745965.5, "logps/chosen": -847.77734375, "logps/rejected": -209.1416015625, "loss": 0.2575, "rewards/chosen": 0.7568145990371704, "rewards/margins": 2.428050398826599, "rewards/rejected": -1.6712357997894287, "step": 4650 }, { "epoch": 0.2465216123817348, "grad_norm": 51.25, "kl": 2.0460739135742188, "learning_rate": 5e-07, "logits/chosen": -22099057.333333332, "logits/rejected": -37978265.6, "logps/chosen": -420.1249186197917, "logps/rejected": -361.6470703125, "loss": 0.2823, "rewards/chosen": 0.20705058177312216, "rewards/margins": 2.5103079517682394, "rewards/rejected": -2.303257369995117, "step": 4651 }, { "epoch": 0.24657461638353695, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36133740.0, "logits/rejected": -34743548.0, "logps/chosen": -328.1227111816406, "logps/rejected": -171.04373168945312, "loss": 0.3473, "rewards/chosen": 0.02575259655714035, "rewards/margins": 1.579260639846325, "rewards/rejected": -1.5535080432891846, "step": 4652 }, { "epoch": 0.24662762038533909, "grad_norm": 51.75, "kl": 0.6493759155273438, "learning_rate": 5e-07, "logits/chosen": -22805961.14285714, "logits/rejected": -15826002.0, "logps/chosen": -527.4017159598214, "logps/rejected": -236.99388122558594, "loss": 0.3964, "rewards/chosen": 0.5084805147988456, "rewards/margins": 2.4463711636407037, "rewards/rejected": -1.937890648841858, "step": 4653 }, { "epoch": 0.24668062438714122, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24697956.0, "logits/rejected": -17330408.0, "logps/chosen": -440.4312438964844, "logps/rejected": -101.31243896484375, "loss": 0.2965, "rewards/chosen": 0.26628416776657104, "rewards/margins": 1.4518954157829285, "rewards/rejected": -1.1856112480163574, "step": 4654 }, { "epoch": 0.24673362838894336, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36421992.0, "logits/rejected": -33103974.85714286, "logps/chosen": -187.25201416015625, "logps/rejected": -430.8738490513393, "loss": 0.1631, "rewards/chosen": 0.0055908202193677425, "rewards/margins": 2.4634563990070353, "rewards/rejected": -2.4578655787876675, "step": 4655 }, { "epoch": 0.2467866323907455, "grad_norm": 46.75, "kl": 0.04765892028808594, "learning_rate": 5e-07, "logits/chosen": 4015350.5, "logits/rejected": -16945330.666666668, "logps/chosen": -208.4542999267578, "logps/rejected": -372.7256266276042, "loss": 0.1982, "rewards/chosen": 1.0711262226104736, "rewards/margins": 3.2164334456125894, "rewards/rejected": -2.1453072230021157, "step": 4656 }, { "epoch": 0.24683963639254763, "grad_norm": 70.0, "kl": 0.5883092880249023, "learning_rate": 5e-07, "logits/chosen": -56563936.0, "logits/rejected": -39736484.0, "logps/chosen": -429.6728922526042, "logps/rejected": -250.0381622314453, "loss": 0.383, "rewards/chosen": 0.3651203711827596, "rewards/margins": 1.7737949689229329, "rewards/rejected": -1.4086745977401733, "step": 4657 }, { "epoch": 0.24689264039434977, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33786597.333333336, "logits/rejected": -15057795.2, "logps/chosen": -503.6046956380208, "logps/rejected": -299.819384765625, "loss": 0.2557, "rewards/chosen": 0.5226369301478068, "rewards/margins": 2.2642812172571816, "rewards/rejected": -1.741644287109375, "step": 4658 }, { "epoch": 0.2469456443961519, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9863818.0, "logits/rejected": -30314426.666666668, "logps/chosen": -72.9229736328125, "logps/rejected": -436.238525390625, "loss": 0.1869, "rewards/chosen": 0.14170321822166443, "rewards/margins": 2.7727584143479667, "rewards/rejected": -2.6310551961263022, "step": 4659 }, { "epoch": 0.24699864839795405, "grad_norm": 58.5, "kl": 0.39870643615722656, "learning_rate": 5e-07, "logits/chosen": -27272088.0, "logits/rejected": -2123282.0, "logps/chosen": -285.5849609375, "logps/rejected": -634.0169677734375, "loss": 0.2852, "rewards/chosen": 0.4687443971633911, "rewards/margins": 2.349091053009033, "rewards/rejected": -1.880346655845642, "step": 4660 }, { "epoch": 0.24705165239975618, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78529964.8, "logits/rejected": -150223776.0, "logps/chosen": -231.783251953125, "logps/rejected": -251.66341145833334, "loss": 0.3061, "rewards/chosen": 0.21569862365722656, "rewards/margins": 3.0171652475992836, "rewards/rejected": -2.801466623942057, "step": 4661 }, { "epoch": 0.24710465640155832, "grad_norm": 94.0, "kl": 7.025923728942871, "learning_rate": 5e-07, "logits/chosen": -40477242.666666664, "logits/rejected": 9834821.0, "logps/chosen": -944.2086588541666, "logps/rejected": -267.4072570800781, "loss": 0.3388, "rewards/chosen": 1.3858022689819336, "rewards/margins": 3.166386604309082, "rewards/rejected": -1.7805843353271484, "step": 4662 }, { "epoch": 0.24715766040336046, "grad_norm": 220.0, "kl": 0.22608184814453125, "learning_rate": 5e-07, "logits/chosen": -33691058.28571428, "logits/rejected": -9883708.0, "logps/chosen": -245.19093540736608, "logps/rejected": -66.31722259521484, "loss": 0.4673, "rewards/chosen": -0.028396529810769216, "rewards/margins": 1.713225679738181, "rewards/rejected": -1.7416222095489502, "step": 4663 }, { "epoch": 0.2472106644051626, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12461034.4, "logits/rejected": -10222331.333333334, "logps/chosen": -233.4560791015625, "logps/rejected": -345.859130859375, "loss": 0.3635, "rewards/chosen": 0.34407663345336914, "rewards/margins": 1.5949438412984211, "rewards/rejected": -1.250867207845052, "step": 4664 }, { "epoch": 0.24726366840696473, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37957072.0, "logits/rejected": 6646443.5, "logps/chosen": -404.8666687011719, "logps/rejected": -330.0174560546875, "loss": 0.3874, "rewards/chosen": 0.24694693088531494, "rewards/margins": 1.2821320295333862, "rewards/rejected": -1.0351850986480713, "step": 4665 }, { "epoch": 0.24731667240876687, "grad_norm": 46.25, "kl": 0.23219776153564453, "learning_rate": 5e-07, "logits/chosen": -34180276.0, "logits/rejected": -13892196.0, "logps/chosen": -229.04986572265625, "logps/rejected": -351.244140625, "loss": 0.1828, "rewards/chosen": 1.1378555297851562, "rewards/margins": 3.112086931864421, "rewards/rejected": -1.9742314020792644, "step": 4666 }, { "epoch": 0.247369676410569, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22236510.666666668, "logits/rejected": -644854.25, "logps/chosen": -152.3607177734375, "logps/rejected": -43.06081008911133, "loss": 0.3854, "rewards/chosen": 0.3597203493118286, "rewards/margins": 1.2683624029159546, "rewards/rejected": -0.908642053604126, "step": 4667 }, { "epoch": 0.24742268041237114, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -123246856.0, "logits/rejected": -122068928.0, "logps/chosen": -430.57073974609375, "logps/rejected": -513.5570678710938, "loss": 0.294, "rewards/chosen": 0.24822846055030823, "rewards/margins": 2.198087364435196, "rewards/rejected": -1.9498589038848877, "step": 4668 }, { "epoch": 0.24747568441417328, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18573624.0, "logits/rejected": -4295720.5, "logps/chosen": -165.0333048502604, "logps/rejected": -213.861083984375, "loss": 0.384, "rewards/chosen": 0.19058730204900107, "rewards/margins": 2.0224573810895285, "rewards/rejected": -1.8318700790405273, "step": 4669 }, { "epoch": 0.24752868841597542, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50437482.666666664, "logits/rejected": 63029529.6, "logps/chosen": -224.19942220052084, "logps/rejected": -346.2162353515625, "loss": 0.2937, "rewards/chosen": -0.04417082667350769, "rewards/margins": 1.6953387558460236, "rewards/rejected": -1.7395095825195312, "step": 4670 }, { "epoch": 0.24758169241777755, "grad_norm": 78.5, "kl": 2.1705856323242188, "learning_rate": 5e-07, "logits/chosen": -33765541.333333336, "logits/rejected": -14135592.0, "logps/chosen": -400.2465006510417, "logps/rejected": -337.6591491699219, "loss": 0.4042, "rewards/chosen": 0.3734657367070516, "rewards/margins": 2.367736061414083, "rewards/rejected": -1.9942703247070312, "step": 4671 }, { "epoch": 0.2476346964195797, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6978129.333333333, "logits/rejected": -14263542.4, "logps/chosen": -191.43526204427084, "logps/rejected": -355.9612060546875, "loss": 0.3559, "rewards/chosen": -1.133697748184204, "rewards/margins": 0.9243796825408936, "rewards/rejected": -2.0580774307250977, "step": 4672 }, { "epoch": 0.2476877004213818, "grad_norm": 51.25, "kl": 0.10699844360351562, "learning_rate": 5e-07, "logits/chosen": -40274424.0, "logits/rejected": 4141800.6666666665, "logps/chosen": -246.83413696289062, "logps/rejected": -404.8341878255208, "loss": 0.2641, "rewards/chosen": 0.18493500351905823, "rewards/margins": 1.7425464689731598, "rewards/rejected": -1.5576114654541016, "step": 4673 }, { "epoch": 0.24774070442318394, "grad_norm": 68.5, "kl": 0.2822761535644531, "learning_rate": 5e-07, "logits/chosen": -44370976.0, "logits/rejected": -41767603.2, "logps/chosen": -349.48876953125, "logps/rejected": -363.4482666015625, "loss": 0.2107, "rewards/chosen": 0.5885305007298788, "rewards/margins": 2.9385745604832967, "rewards/rejected": -2.350044059753418, "step": 4674 }, { "epoch": 0.24779370842498608, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 43017573.333333336, "logits/rejected": -18707376.0, "logps/chosen": -222.24275716145834, "logps/rejected": -303.27435302734375, "loss": 0.4175, "rewards/chosen": 0.047504618763923645, "rewards/margins": 1.4665628224611282, "rewards/rejected": -1.4190582036972046, "step": 4675 }, { "epoch": 0.2478467124267882, "grad_norm": 63.5, "kl": 0.5269756317138672, "learning_rate": 5e-07, "logits/chosen": -19602612.0, "logits/rejected": -75535008.0, "logps/chosen": -545.2091674804688, "logps/rejected": -191.9669952392578, "loss": 0.2794, "rewards/chosen": 0.26990360021591187, "rewards/margins": 2.76497620344162, "rewards/rejected": -2.495072603225708, "step": 4676 }, { "epoch": 0.24789971642859035, "grad_norm": 61.25, "kl": 1.2933502197265625, "learning_rate": 5e-07, "logits/chosen": -57006784.0, "logits/rejected": -8818184.0, "logps/chosen": -437.9291015625, "logps/rejected": -253.9796142578125, "loss": 0.3789, "rewards/chosen": 0.5978582859039306, "rewards/margins": 1.4258256117502848, "rewards/rejected": -0.8279673258463541, "step": 4677 }, { "epoch": 0.2479527204303925, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15297272.0, "logits/rejected": -7470774.285714285, "logps/chosen": -369.4122619628906, "logps/rejected": -185.70474679129464, "loss": 0.18, "rewards/chosen": 0.6589019894599915, "rewards/margins": 2.5149651510374884, "rewards/rejected": -1.8560631615774972, "step": 4678 }, { "epoch": 0.24800572443219462, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8988382.0, "logits/rejected": -39982948.0, "logps/chosen": -666.68115234375, "logps/rejected": -319.0335693359375, "loss": 0.2926, "rewards/chosen": 0.20176085829734802, "rewards/margins": 2.554423898458481, "rewards/rejected": -2.352663040161133, "step": 4679 }, { "epoch": 0.24805872843399676, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24057178.666666668, "logits/rejected": -75057196.8, "logps/chosen": -429.05859375, "logps/rejected": -258.9245849609375, "loss": 0.2323, "rewards/chosen": 0.7461995283762614, "rewards/margins": 2.4349000136057537, "rewards/rejected": -1.6887004852294922, "step": 4680 }, { "epoch": 0.2481117324357989, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86573664.0, "logits/rejected": -27264013.333333332, "logps/chosen": -418.3121337890625, "logps/rejected": -149.2996826171875, "loss": 0.3294, "rewards/chosen": -0.22816163301467896, "rewards/margins": 0.9154590964317322, "rewards/rejected": -1.1436207294464111, "step": 4681 }, { "epoch": 0.24816473643760104, "grad_norm": 79.0, "kl": 1.2021312713623047, "learning_rate": 5e-07, "logits/chosen": -28912034.0, "logits/rejected": -6822923.5, "logps/chosen": -485.0414123535156, "logps/rejected": -82.25100708007812, "loss": 0.2489, "rewards/chosen": 1.0707725286483765, "rewards/margins": 2.4081664085388184, "rewards/rejected": -1.337393879890442, "step": 4682 }, { "epoch": 0.24821774043940317, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26925984.0, "logits/rejected": -43906949.333333336, "logps/chosen": -171.9612548828125, "logps/rejected": -594.4217936197916, "loss": 0.3357, "rewards/chosen": -0.037323829531669614, "rewards/margins": 3.0299400915702184, "rewards/rejected": -3.067263921101888, "step": 4683 }, { "epoch": 0.2482707444412053, "grad_norm": 59.25, "kl": 0.06858062744140625, "learning_rate": 5e-07, "logits/chosen": -37561701.333333336, "logits/rejected": -102828616.0, "logps/chosen": -319.38535563151044, "logps/rejected": -652.9976806640625, "loss": 0.3222, "rewards/chosen": 0.479419469833374, "rewards/margins": 2.657785654067993, "rewards/rejected": -2.178366184234619, "step": 4684 }, { "epoch": 0.24832374844300745, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43125740.8, "logits/rejected": -16312372.0, "logps/chosen": -293.1396240234375, "logps/rejected": -490.6693522135417, "loss": 0.3156, "rewards/chosen": 0.33582923412322996, "rewards/margins": 2.405299480756124, "rewards/rejected": -2.069470246632894, "step": 4685 }, { "epoch": 0.24837675244480958, "grad_norm": 26.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3781534.3333333335, "logits/rejected": -47888838.4, "logps/chosen": -40.01783752441406, "logps/rejected": -303.4720703125, "loss": 0.2879, "rewards/chosen": 0.019550641377766926, "rewards/margins": 1.9911547978719075, "rewards/rejected": -1.9716041564941407, "step": 4686 }, { "epoch": 0.24842975644661172, "grad_norm": 60.0, "kl": 0.792724609375, "learning_rate": 5e-07, "logits/chosen": -24155936.0, "logits/rejected": 1899280.625, "logps/chosen": -320.56466238839283, "logps/rejected": -91.82560729980469, "loss": 0.4119, "rewards/chosen": 0.37802042279924664, "rewards/margins": 1.4312255552836826, "rewards/rejected": -1.053205132484436, "step": 4687 }, { "epoch": 0.24848276044841386, "grad_norm": 56.0, "kl": 1.2146549224853516, "learning_rate": 5e-07, "logits/chosen": -27094890.0, "logits/rejected": 12070960.0, "logps/chosen": -338.2337341308594, "logps/rejected": -210.73553466796875, "loss": 0.3663, "rewards/chosen": 0.5764167904853821, "rewards/margins": 1.351232886314392, "rewards/rejected": -0.77481609582901, "step": 4688 }, { "epoch": 0.248535764450216, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45497077.333333336, "logits/rejected": -6836663.2, "logps/chosen": -187.2651163736979, "logps/rejected": -216.75146484375, "loss": 0.3214, "rewards/chosen": 0.07535126805305481, "rewards/margins": 1.3914206802845002, "rewards/rejected": -1.3160694122314454, "step": 4689 }, { "epoch": 0.24858876845201813, "grad_norm": 58.25, "kl": 0.2545623779296875, "learning_rate": 5e-07, "logits/chosen": -12530249.6, "logits/rejected": -30058885.333333332, "logps/chosen": -367.2636962890625, "logps/rejected": -281.94281005859375, "loss": 0.2515, "rewards/chosen": 0.8099171638488769, "rewards/margins": 3.3692185401916506, "rewards/rejected": -2.5593013763427734, "step": 4690 }, { "epoch": 0.24864177245382027, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 407349.5, "logits/rejected": -2194168.0, "logps/chosen": -222.0821533203125, "logps/rejected": -317.11346435546875, "loss": 0.2828, "rewards/chosen": 0.4207124710083008, "rewards/margins": 2.242741584777832, "rewards/rejected": -1.8220291137695312, "step": 4691 }, { "epoch": 0.2486947764556224, "grad_norm": 52.0, "kl": 0.0941925048828125, "learning_rate": 5e-07, "logits/chosen": -46883366.4, "logits/rejected": -23821965.333333332, "logps/chosen": -538.607666015625, "logps/rejected": -327.71364339192706, "loss": 0.3142, "rewards/chosen": 0.791200590133667, "rewards/margins": 1.9688383261362712, "rewards/rejected": -1.1776377360026042, "step": 4692 }, { "epoch": 0.24874778045742454, "grad_norm": 50.0, "kl": 0.7917251586914062, "learning_rate": 5e-07, "logits/chosen": -50324056.0, "logits/rejected": -84640392.0, "logps/chosen": -388.7507629394531, "logps/rejected": -459.06842041015625, "loss": 0.3653, "rewards/chosen": -0.14925850927829742, "rewards/margins": 2.0075427144765854, "rewards/rejected": -2.156801223754883, "step": 4693 }, { "epoch": 0.24880078445922668, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24464558.0, "logits/rejected": -37642428.0, "logps/chosen": -202.07608032226562, "logps/rejected": -534.744140625, "loss": 0.2884, "rewards/chosen": -0.06254217028617859, "rewards/margins": 2.876025468111038, "rewards/rejected": -2.938567638397217, "step": 4694 }, { "epoch": 0.24885378846102882, "grad_norm": 42.25, "kl": 1.2785797119140625, "learning_rate": 5e-07, "logits/chosen": -10942036.0, "logits/rejected": -16822205.333333332, "logps/chosen": -267.2381103515625, "logps/rejected": -309.4496663411458, "loss": 0.3278, "rewards/chosen": 0.6654414653778076, "rewards/margins": 2.6124430815378825, "rewards/rejected": -1.947001616160075, "step": 4695 }, { "epoch": 0.24890679246283096, "grad_norm": 70.0, "kl": 0.6483650207519531, "learning_rate": 5e-07, "logits/chosen": -71564288.0, "logits/rejected": -12168010.666666666, "logps/chosen": -379.1470703125, "logps/rejected": -164.57805379231772, "loss": 0.4104, "rewards/chosen": 0.28951904773712156, "rewards/margins": 1.1679515600204469, "rewards/rejected": -0.8784325122833252, "step": 4696 }, { "epoch": 0.2489597964646331, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52708092.0, "logits/rejected": -42933936.0, "logps/chosen": -228.06504821777344, "logps/rejected": -618.3435465494791, "loss": 0.2009, "rewards/chosen": -0.012152478098869324, "rewards/margins": 2.882877543568611, "rewards/rejected": -2.8950300216674805, "step": 4697 }, { "epoch": 0.2490128004664352, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91003200.0, "logits/rejected": 32736819.2, "logps/chosen": -565.9217122395834, "logps/rejected": -621.30322265625, "loss": 0.3034, "rewards/chosen": 0.045896401007970176, "rewards/margins": 2.1887047429879507, "rewards/rejected": -2.1428083419799804, "step": 4698 }, { "epoch": 0.24906580446823734, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106927392.0, "logits/rejected": -29652844.8, "logps/chosen": -706.6376953125, "logps/rejected": -476.62294921875, "loss": 0.193, "rewards/chosen": 0.7515747547149658, "rewards/margins": 2.8912353992462156, "rewards/rejected": -2.13966064453125, "step": 4699 }, { "epoch": 0.24911880847003948, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48740906.666666664, "logits/rejected": -31721091.2, "logps/chosen": -254.1846720377604, "logps/rejected": -265.3317138671875, "loss": 0.2314, "rewards/chosen": 0.39616847038269043, "rewards/margins": 2.388472890853882, "rewards/rejected": -1.9923044204711915, "step": 4700 }, { "epoch": 0.2491718124718416, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30991706.666666668, "logits/rejected": -40126451.2, "logps/chosen": -321.2659505208333, "logps/rejected": -377.77099609375, "loss": 0.2535, "rewards/chosen": 0.2970428466796875, "rewards/margins": 2.2089984893798826, "rewards/rejected": -1.9119556427001954, "step": 4701 }, { "epoch": 0.24922481647364375, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9479216.0, "logits/rejected": -18339043.42857143, "logps/chosen": -330.8764343261719, "logps/rejected": -361.45054408482144, "loss": 0.2418, "rewards/chosen": 1.3921416997909546, "rewards/margins": 2.697201405252729, "rewards/rejected": -1.3050597054617745, "step": 4702 }, { "epoch": 0.2492778204754459, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26741120.0, "logits/rejected": -31876806.0, "logps/chosen": -175.22403971354166, "logps/rejected": -465.48834228515625, "loss": 0.4404, "rewards/chosen": -0.2548369765281677, "rewards/margins": 2.2720962166786194, "rewards/rejected": -2.526933193206787, "step": 4703 }, { "epoch": 0.24933082447724803, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13901274.0, "logits/rejected": -31283877.333333332, "logps/chosen": -390.2684020996094, "logps/rejected": -377.0658365885417, "loss": 0.2238, "rewards/chosen": 0.25987473130226135, "rewards/margins": 2.3427959183851876, "rewards/rejected": -2.0829211870829263, "step": 4704 }, { "epoch": 0.24938382847905016, "grad_norm": 37.75, "kl": 1.0457687377929688, "learning_rate": 5e-07, "logits/chosen": -12095590.0, "logits/rejected": -10280224.0, "logps/chosen": -316.64764404296875, "logps/rejected": -171.52536010742188, "loss": 0.2625, "rewards/chosen": 1.2056888341903687, "rewards/margins": 2.2336446046829224, "rewards/rejected": -1.0279557704925537, "step": 4705 }, { "epoch": 0.2494368324808523, "grad_norm": 47.25, "kl": 1.0039310455322266, "learning_rate": 5e-07, "logits/chosen": -54076656.0, "logits/rejected": 85870080.0, "logps/chosen": -952.1250610351562, "logps/rejected": -318.432861328125, "loss": 0.2998, "rewards/chosen": 0.6833986043930054, "rewards/margins": 2.815665602684021, "rewards/rejected": -2.1322669982910156, "step": 4706 }, { "epoch": 0.24948983648265444, "grad_norm": 79.0, "kl": 2.8803024291992188, "learning_rate": 5e-07, "logits/chosen": -87463193.6, "logits/rejected": -18057478.666666668, "logps/chosen": -752.39443359375, "logps/rejected": -518.5317789713541, "loss": 0.3443, "rewards/chosen": 0.7210650444030762, "rewards/margins": 2.862907886505127, "rewards/rejected": -2.141842842102051, "step": 4707 }, { "epoch": 0.24954284048445657, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66699744.0, "logits/rejected": -20443294.0, "logps/chosen": -374.9736633300781, "logps/rejected": -263.48358154296875, "loss": 0.334, "rewards/chosen": 0.12675915658473969, "rewards/margins": 1.8463573306798935, "rewards/rejected": -1.7195981740951538, "step": 4708 }, { "epoch": 0.2495958444862587, "grad_norm": 35.25, "kl": 0.4168586730957031, "learning_rate": 5e-07, "logits/chosen": -16161091.0, "logits/rejected": -1674233.625, "logps/chosen": -206.65731811523438, "logps/rejected": -141.14117431640625, "loss": 0.3681, "rewards/chosen": -0.3388540744781494, "rewards/margins": 1.5634231567382812, "rewards/rejected": -1.9022772312164307, "step": 4709 }, { "epoch": 0.24964884848806085, "grad_norm": 45.25, "kl": 0.214385986328125, "learning_rate": 5e-07, "logits/chosen": 3388944.6666666665, "logits/rejected": -35014003.2, "logps/chosen": -600.879638671875, "logps/rejected": -460.170458984375, "loss": 0.1983, "rewards/chosen": 0.8744216759999593, "rewards/margins": 3.255314048131307, "rewards/rejected": -2.3808923721313477, "step": 4710 }, { "epoch": 0.24970185248986299, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28376396.8, "logits/rejected": -31521738.666666668, "logps/chosen": -113.9320556640625, "logps/rejected": -276.6191813151042, "loss": 0.4175, "rewards/chosen": -0.4765657901763916, "rewards/margins": 1.9774322986602784, "rewards/rejected": -2.45399808883667, "step": 4711 }, { "epoch": 0.24975485649166512, "grad_norm": 51.75, "kl": 1.3001041412353516, "learning_rate": 5e-07, "logits/chosen": -10078612.0, "logits/rejected": -44797404.0, "logps/chosen": -341.6677551269531, "logps/rejected": -272.099853515625, "loss": 0.3586, "rewards/chosen": 0.17171576619148254, "rewards/margins": 1.6942656338214874, "rewards/rejected": -1.5225498676300049, "step": 4712 }, { "epoch": 0.24980786049346726, "grad_norm": 46.25, "kl": 0.3671417236328125, "learning_rate": 5e-07, "logits/chosen": -29721994.666666668, "logits/rejected": -26370078.4, "logps/chosen": -264.7685546875, "logps/rejected": -334.49072265625, "loss": 0.2751, "rewards/chosen": 0.47708078225453693, "rewards/margins": 2.2810545523961387, "rewards/rejected": -1.8039737701416017, "step": 4713 }, { "epoch": 0.2498608644952694, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50676952.0, "logits/rejected": -53970299.428571425, "logps/chosen": -327.23272705078125, "logps/rejected": -368.9258510044643, "loss": 0.1496, "rewards/chosen": -0.308013916015625, "rewards/margins": 2.261622837611607, "rewards/rejected": -2.569636753627232, "step": 4714 }, { "epoch": 0.24991386849707153, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -844327.0, "logits/rejected": -19074500.0, "logps/chosen": -453.1298522949219, "logps/rejected": -419.4832763671875, "loss": 0.1749, "rewards/chosen": 0.68719482421875, "rewards/margins": 2.8782620429992676, "rewards/rejected": -2.1910672187805176, "step": 4715 }, { "epoch": 0.24996687249887367, "grad_norm": 61.25, "kl": 0.049304962158203125, "learning_rate": 5e-07, "logits/chosen": -29392504.0, "logits/rejected": -29712424.0, "logps/chosen": -391.5002034505208, "logps/rejected": -165.30703735351562, "loss": 0.4653, "rewards/chosen": -0.039661029974619545, "rewards/margins": 0.8016549547513326, "rewards/rejected": -0.8413159847259521, "step": 4716 }, { "epoch": 0.2500198765006758, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31225212.8, "logits/rejected": -26109850.666666668, "logps/chosen": -453.45078125, "logps/rejected": -214.4801025390625, "loss": 0.2865, "rewards/chosen": 0.7954086303710938, "rewards/margins": 2.7618054389953612, "rewards/rejected": -1.9663968086242676, "step": 4717 }, { "epoch": 0.25007288050247795, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13269200.0, "logits/rejected": -31745018.0, "logps/chosen": -71.81851959228516, "logps/rejected": -272.06097412109375, "loss": 0.3561, "rewards/chosen": -0.36119213700294495, "rewards/margins": 1.7593160569667816, "rewards/rejected": -2.1205081939697266, "step": 4718 }, { "epoch": 0.2501258845042801, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44448730.666666664, "logits/rejected": -7545099.2, "logps/chosen": -390.2772216796875, "logps/rejected": -179.9878173828125, "loss": 0.2181, "rewards/chosen": 0.7340273062388102, "rewards/margins": 2.570227829615275, "rewards/rejected": -1.8362005233764649, "step": 4719 }, { "epoch": 0.2501788885060822, "grad_norm": 43.25, "kl": 0.2291736602783203, "learning_rate": 5e-07, "logits/chosen": -108439328.0, "logits/rejected": -16468840.0, "logps/chosen": -482.2149353027344, "logps/rejected": -195.81278483072916, "loss": 0.2029, "rewards/chosen": 0.2506774961948395, "rewards/margins": 2.6960990329583487, "rewards/rejected": -2.4454215367635093, "step": 4720 }, { "epoch": 0.25023189250788436, "grad_norm": 43.75, "kl": 2.132688522338867, "learning_rate": 5e-07, "logits/chosen": -13235600.0, "logits/rejected": -15666713.0, "logps/chosen": -382.9452209472656, "logps/rejected": -159.21649169921875, "loss": 0.3222, "rewards/chosen": 0.40525388717651367, "rewards/margins": 1.7727081775665283, "rewards/rejected": -1.3674542903900146, "step": 4721 }, { "epoch": 0.2502848965096865, "grad_norm": 45.25, "kl": 0.19384098052978516, "learning_rate": 5e-07, "logits/chosen": 9857908.0, "logits/rejected": -28999518.0, "logps/chosen": -126.7956034342448, "logps/rejected": -443.6451721191406, "loss": 0.4164, "rewards/chosen": 0.08759119113286336, "rewards/margins": 1.8291295965512593, "rewards/rejected": -1.741538405418396, "step": 4722 }, { "epoch": 0.25033790051148863, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -166429936.0, "logits/rejected": -18900035.42857143, "logps/chosen": -340.03082275390625, "logps/rejected": -267.74074009486606, "loss": 0.2811, "rewards/chosen": -0.35698243975639343, "rewards/margins": 1.1753843128681183, "rewards/rejected": -1.5323667526245117, "step": 4723 }, { "epoch": 0.25039090451329077, "grad_norm": 49.25, "kl": 0.00726318359375, "learning_rate": 5e-07, "logits/chosen": -43317032.0, "logits/rejected": -5442140.0, "logps/chosen": -437.00189208984375, "logps/rejected": -432.9942321777344, "loss": 0.2343, "rewards/chosen": 0.6639640927314758, "rewards/margins": 2.8515313267707825, "rewards/rejected": -2.1875672340393066, "step": 4724 }, { "epoch": 0.2504439085150929, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 436749.3333333333, "logits/rejected": -44015344.0, "logps/chosen": -99.08766682942708, "logps/rejected": -421.197265625, "loss": 0.2737, "rewards/chosen": -0.22548097372055054, "rewards/margins": 2.1737465500831603, "rewards/rejected": -2.399227523803711, "step": 4725 }, { "epoch": 0.25049691251689504, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13473634.0, "logits/rejected": -37353064.0, "logps/chosen": -172.0286865234375, "logps/rejected": -618.85498046875, "loss": 0.2364, "rewards/chosen": 0.6765978336334229, "rewards/margins": 4.041057825088501, "rewards/rejected": -3.364459991455078, "step": 4726 }, { "epoch": 0.2505499165186972, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16011329.0, "logits/rejected": -54489448.0, "logps/chosen": -257.35028076171875, "logps/rejected": -314.35052490234375, "loss": 0.3776, "rewards/chosen": 0.0059721097350120544, "rewards/margins": 1.2553526982665062, "rewards/rejected": -1.2493805885314941, "step": 4727 }, { "epoch": 0.2506029205204993, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43031258.666666664, "logits/rejected": 1641384.0, "logps/chosen": -245.20751953125, "logps/rejected": -440.691162109375, "loss": 0.3851, "rewards/chosen": 0.18937732776006064, "rewards/margins": 2.1634695728619895, "rewards/rejected": -1.9740922451019287, "step": 4728 }, { "epoch": 0.25065592452230145, "grad_norm": 46.75, "kl": 0.18548107147216797, "learning_rate": 5e-07, "logits/chosen": -18276898.0, "logits/rejected": 17497556.0, "logps/chosen": -223.37770080566406, "logps/rejected": -362.9313659667969, "loss": 0.3922, "rewards/chosen": -0.15194319188594818, "rewards/margins": 1.209133043885231, "rewards/rejected": -1.3610762357711792, "step": 4729 }, { "epoch": 0.2507089285241036, "grad_norm": 46.0, "kl": 1.8573112487792969, "learning_rate": 5e-07, "logits/chosen": -10703261.0, "logits/rejected": -13334200.0, "logps/chosen": -350.4342041015625, "logps/rejected": -194.6612548828125, "loss": 0.2565, "rewards/chosen": 1.4319438934326172, "rewards/margins": 2.660784959793091, "rewards/rejected": -1.2288410663604736, "step": 4730 }, { "epoch": 0.25076193252590573, "grad_norm": 59.0, "kl": 0.6056365966796875, "learning_rate": 5e-07, "logits/chosen": -26944940.8, "logits/rejected": 187218496.0, "logps/chosen": -408.5791748046875, "logps/rejected": -241.89029947916666, "loss": 0.3195, "rewards/chosen": 0.5331612586975097, "rewards/margins": 1.997720177968343, "rewards/rejected": -1.4645589192708333, "step": 4731 }, { "epoch": 0.25081493652770787, "grad_norm": 42.25, "kl": 0.1431427001953125, "learning_rate": 5e-07, "logits/chosen": -58349226.666666664, "logits/rejected": -30964524.8, "logps/chosen": -551.7266438802084, "logps/rejected": -306.7380859375, "loss": 0.1866, "rewards/chosen": 1.040978988011678, "rewards/margins": 3.1181415398915604, "rewards/rejected": -2.0771625518798826, "step": 4732 }, { "epoch": 0.25086794052951, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9133766.0, "logits/rejected": -2554159.2, "logps/chosen": -94.44129435221355, "logps/rejected": -96.61064453125, "loss": 0.3369, "rewards/chosen": -0.01327286163965861, "rewards/margins": 1.3748866041501362, "rewards/rejected": -1.3881594657897949, "step": 4733 }, { "epoch": 0.25092094453131214, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40616240.0, "logits/rejected": -23191060.0, "logps/chosen": -303.867431640625, "logps/rejected": -166.9163360595703, "loss": 0.3818, "rewards/chosen": -0.5091468691825867, "rewards/margins": 1.476754605770111, "rewards/rejected": -1.9859014749526978, "step": 4734 }, { "epoch": 0.2509739485331143, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19652928.0, "logits/rejected": -29731240.0, "logps/chosen": -221.55331420898438, "logps/rejected": -422.0679117838542, "loss": 0.2639, "rewards/chosen": -0.3231361508369446, "rewards/margins": 1.8294692238171897, "rewards/rejected": -2.1526053746541343, "step": 4735 }, { "epoch": 0.25102695253491636, "grad_norm": 47.25, "kl": 1.4118690490722656, "learning_rate": 5e-07, "logits/chosen": -28991658.666666668, "logits/rejected": 497039.9375, "logps/chosen": -221.30867513020834, "logps/rejected": -40.03950500488281, "loss": 0.4763, "rewards/chosen": 0.07757231593132019, "rewards/margins": 0.9640646278858185, "rewards/rejected": -0.8864923119544983, "step": 4736 }, { "epoch": 0.2510799565367185, "grad_norm": 60.25, "kl": 1.0610198974609375, "learning_rate": 5e-07, "logits/chosen": -38751333.333333336, "logits/rejected": -18757206.4, "logps/chosen": -570.9156087239584, "logps/rejected": -196.57122802734375, "loss": 0.2659, "rewards/chosen": 0.6559712886810303, "rewards/margins": 2.499229001998901, "rewards/rejected": -1.8432577133178711, "step": 4737 }, { "epoch": 0.25113296053852063, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1388299.4166666667, "logits/rejected": -1223336.3, "logps/chosen": -53.864420572916664, "logps/rejected": -110.331396484375, "loss": 0.346, "rewards/chosen": -0.027835845947265625, "rewards/margins": 1.2018854141235351, "rewards/rejected": -1.2297212600708007, "step": 4738 }, { "epoch": 0.25118596454032277, "grad_norm": 53.0, "kl": 0.5678291320800781, "learning_rate": 5e-07, "logits/chosen": -32817131.42857143, "logits/rejected": -26835516.0, "logps/chosen": -292.30324009486606, "logps/rejected": -387.962890625, "loss": 0.3319, "rewards/chosen": 0.7232653754098075, "rewards/margins": 2.0654650585992, "rewards/rejected": -1.342199683189392, "step": 4739 }, { "epoch": 0.2512389685421249, "grad_norm": 117.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14222772.8, "logits/rejected": 140533.33333333334, "logps/chosen": -241.7976318359375, "logps/rejected": -276.8096110026042, "loss": 0.4033, "rewards/chosen": 0.3013119697570801, "rewards/margins": 0.9535353978474935, "rewards/rejected": -0.6522234280904134, "step": 4740 }, { "epoch": 0.25129197254392704, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16486524.0, "logits/rejected": 6417376.0, "logps/chosen": -42.58224105834961, "logps/rejected": -489.1597377232143, "loss": 0.1751, "rewards/chosen": -0.4780139923095703, "rewards/margins": 2.1498938969203403, "rewards/rejected": -2.6279078892299106, "step": 4741 }, { "epoch": 0.2513449765457292, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17864414.0, "logits/rejected": -23185282.0, "logps/chosen": -267.2015380859375, "logps/rejected": -454.8494873046875, "loss": 0.2968, "rewards/chosen": 0.22067490220069885, "rewards/margins": 2.6524697840213776, "rewards/rejected": -2.4317948818206787, "step": 4742 }, { "epoch": 0.2513979805475313, "grad_norm": 50.0, "kl": 0.22670745849609375, "learning_rate": 5e-07, "logits/chosen": -52664920.0, "logits/rejected": -38034152.0, "logps/chosen": -371.16290283203125, "logps/rejected": -380.23492431640625, "loss": 0.3162, "rewards/chosen": 0.3095240592956543, "rewards/margins": 1.9767441749572754, "rewards/rejected": -1.667220115661621, "step": 4743 }, { "epoch": 0.25145098454933346, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12924696.0, "logits/rejected": -29496044.8, "logps/chosen": -241.92936197916666, "logps/rejected": -417.95439453125, "loss": 0.3263, "rewards/chosen": -0.22144432862599692, "rewards/margins": 1.609727470080058, "rewards/rejected": -1.8311717987060547, "step": 4744 }, { "epoch": 0.2515039885511356, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8264107.2, "logits/rejected": -53008426.666666664, "logps/chosen": -172.62991943359376, "logps/rejected": -119.0835673014323, "loss": 0.3619, "rewards/chosen": 0.16765921115875243, "rewards/margins": 2.6253065347671507, "rewards/rejected": -2.4576473236083984, "step": 4745 }, { "epoch": 0.25155699255293773, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43391076.0, "logits/rejected": -384296.0, "logps/chosen": -332.0755920410156, "logps/rejected": -321.9038899739583, "loss": 0.2077, "rewards/chosen": 0.4312072992324829, "rewards/margins": 2.4587045907974243, "rewards/rejected": -2.0274972915649414, "step": 4746 }, { "epoch": 0.25160999655473987, "grad_norm": 63.25, "kl": 0.35077953338623047, "learning_rate": 5e-07, "logits/chosen": -41135600.0, "logits/rejected": -12443998.0, "logps/chosen": -442.4144287109375, "logps/rejected": -157.7810516357422, "loss": 0.3319, "rewards/chosen": 0.6139652729034424, "rewards/margins": 1.5358144044876099, "rewards/rejected": -0.9218491315841675, "step": 4747 }, { "epoch": 0.251663000556542, "grad_norm": 39.0, "kl": 0.2092456817626953, "learning_rate": 5e-07, "logits/chosen": -12469389.0, "logits/rejected": -15764834.285714285, "logps/chosen": -26.076557159423828, "logps/rejected": -277.07882254464283, "loss": 0.3167, "rewards/chosen": -0.6990179419517517, "rewards/margins": 0.5603749326297216, "rewards/rejected": -1.2593928745814733, "step": 4748 }, { "epoch": 0.25171600455834414, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12574341.0, "logits/rejected": -53516458.666666664, "logps/chosen": -208.37399291992188, "logps/rejected": -443.2862141927083, "loss": 0.2479, "rewards/chosen": -0.13587942719459534, "rewards/margins": 2.1856997112433114, "rewards/rejected": -2.3215791384379068, "step": 4749 }, { "epoch": 0.2517690085601463, "grad_norm": 49.75, "kl": 0.09437370300292969, "learning_rate": 5e-07, "logits/chosen": -30584984.0, "logits/rejected": -5695960.5, "logps/chosen": -251.883056640625, "logps/rejected": -152.07826232910156, "loss": 0.4052, "rewards/chosen": 0.048964242140452065, "rewards/margins": 1.7555477420488994, "rewards/rejected": -1.7065834999084473, "step": 4750 }, { "epoch": 0.2518220125619484, "grad_norm": 48.5, "kl": 0.15991592407226562, "learning_rate": 5e-07, "logits/chosen": -21410142.666666668, "logits/rejected": -6161849.6, "logps/chosen": -483.082275390625, "logps/rejected": -131.633935546875, "loss": 0.3091, "rewards/chosen": 0.4199833075205485, "rewards/margins": 1.6508939901987711, "rewards/rejected": -1.2309106826782226, "step": 4751 }, { "epoch": 0.25187501656375055, "grad_norm": 48.0, "kl": 2.327911376953125, "learning_rate": 5e-07, "logits/chosen": -19586902.0, "logits/rejected": -65797400.0, "logps/chosen": -703.3981323242188, "logps/rejected": -499.89453125, "loss": 0.2861, "rewards/chosen": 0.9260091781616211, "rewards/margins": 2.8000330924987793, "rewards/rejected": -1.8740239143371582, "step": 4752 }, { "epoch": 0.2519280205655527, "grad_norm": 41.25, "kl": 0.08263587951660156, "learning_rate": 5e-07, "logits/chosen": 1341775.0, "logits/rejected": -18835446.666666668, "logps/chosen": -95.9133056640625, "logps/rejected": -246.01798502604166, "loss": 0.352, "rewards/chosen": 0.15247353315353393, "rewards/margins": 2.0415637532869977, "rewards/rejected": -1.8890902201334636, "step": 4753 }, { "epoch": 0.2519810245673548, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15378012.8, "logits/rejected": 4784524.666666667, "logps/chosen": -195.83896484375, "logps/rejected": -224.22772216796875, "loss": 0.3471, "rewards/chosen": 0.2071892261505127, "rewards/margins": 1.9820998350779215, "rewards/rejected": -1.774910608927409, "step": 4754 }, { "epoch": 0.25203402856915696, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63392616.0, "logits/rejected": -4658877.666666667, "logps/chosen": -503.1047668457031, "logps/rejected": -287.2416585286458, "loss": 0.2989, "rewards/chosen": -0.10193786770105362, "rewards/margins": 1.2822099352876346, "rewards/rejected": -1.3841478029886882, "step": 4755 }, { "epoch": 0.2520870325709591, "grad_norm": 41.0, "kl": 1.1327400207519531, "learning_rate": 5e-07, "logits/chosen": 20044886.0, "logits/rejected": -8942780.0, "logps/chosen": -181.95054626464844, "logps/rejected": -277.3392028808594, "loss": 0.2835, "rewards/chosen": 0.39049044251441956, "rewards/margins": 2.593834787607193, "rewards/rejected": -2.2033443450927734, "step": 4756 }, { "epoch": 0.25214003657276124, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29349229.333333332, "logits/rejected": -34391344.0, "logps/chosen": -208.60847981770834, "logps/rejected": -293.7254638671875, "loss": 0.3166, "rewards/chosen": -0.30889713764190674, "rewards/margins": 1.5585848569869996, "rewards/rejected": -1.8674819946289063, "step": 4757 }, { "epoch": 0.2521930405745634, "grad_norm": 65.0, "kl": 0.874359130859375, "learning_rate": 5e-07, "logits/chosen": -32925900.8, "logits/rejected": -11334874.666666666, "logps/chosen": -572.55078125, "logps/rejected": -196.7056681315104, "loss": 0.335, "rewards/chosen": 0.682197904586792, "rewards/margins": 2.034279203414917, "rewards/rejected": -1.352081298828125, "step": 4758 }, { "epoch": 0.2522460445763655, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21898702.4, "logits/rejected": -26503592.0, "logps/chosen": -282.707861328125, "logps/rejected": -226.1860555013021, "loss": 0.2887, "rewards/chosen": 0.6563942909240723, "rewards/margins": 2.5786790529886883, "rewards/rejected": -1.922284762064616, "step": 4759 }, { "epoch": 0.25229904857816765, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68400656.0, "logits/rejected": -43023976.0, "logps/chosen": -548.0223388671875, "logps/rejected": -165.5313720703125, "loss": 0.369, "rewards/chosen": -0.15157166123390198, "rewards/margins": 1.2863270938396454, "rewards/rejected": -1.4378987550735474, "step": 4760 }, { "epoch": 0.2523520525799698, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26419862.4, "logits/rejected": -10255594.0, "logps/chosen": -206.557666015625, "logps/rejected": -555.7609049479166, "loss": 0.3432, "rewards/chosen": 0.1685237169265747, "rewards/margins": 2.536078063646952, "rewards/rejected": -2.3675543467203775, "step": 4761 }, { "epoch": 0.2524050565817719, "grad_norm": 75.5, "kl": 0.7434768676757812, "learning_rate": 5e-07, "logits/chosen": -19301628.8, "logits/rejected": 13902306.666666666, "logps/chosen": -451.471826171875, "logps/rejected": -255.77596028645834, "loss": 0.3192, "rewards/chosen": 0.684330415725708, "rewards/margins": 1.9494645913441975, "rewards/rejected": -1.2651341756184895, "step": 4762 }, { "epoch": 0.25245806058357406, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44845648.0, "logits/rejected": -9457075.0, "logps/chosen": -137.6493682861328, "logps/rejected": -171.80902099609375, "loss": 0.3329, "rewards/chosen": -0.13512879610061646, "rewards/margins": 1.819890558719635, "rewards/rejected": -1.9550193548202515, "step": 4763 }, { "epoch": 0.2525110645853762, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31937346.666666668, "logits/rejected": -15869596.8, "logps/chosen": -279.91725667317706, "logps/rejected": -273.9030029296875, "loss": 0.237, "rewards/chosen": 0.25850828488667804, "rewards/margins": 2.5888957818349203, "rewards/rejected": -2.330387496948242, "step": 4764 }, { "epoch": 0.25256406858717834, "grad_norm": 59.75, "kl": 0.2931394577026367, "learning_rate": 5e-07, "logits/chosen": -22901088.0, "logits/rejected": -28215442.666666668, "logps/chosen": -387.909521484375, "logps/rejected": -431.2801920572917, "loss": 0.3328, "rewards/chosen": 0.5694097995758056, "rewards/margins": 1.7441815853118896, "rewards/rejected": -1.174771785736084, "step": 4765 }, { "epoch": 0.2526170725889805, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44646429.333333336, "logits/rejected": -29442604.8, "logps/chosen": -339.861572265625, "logps/rejected": -264.927392578125, "loss": 0.2777, "rewards/chosen": 0.3956619103749593, "rewards/margins": 2.0005538781483967, "rewards/rejected": -1.6048919677734375, "step": 4766 }, { "epoch": 0.2526700765907826, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25234092.0, "logits/rejected": -22569098.0, "logps/chosen": -181.85531616210938, "logps/rejected": -161.0455780029297, "loss": 0.4193, "rewards/chosen": -0.19137844443321228, "rewards/margins": 0.8256794512271881, "rewards/rejected": -1.0170578956604004, "step": 4767 }, { "epoch": 0.25272308059258475, "grad_norm": 54.25, "kl": 0.5081787109375, "learning_rate": 5e-07, "logits/chosen": -8936786.0, "logits/rejected": -14775846.0, "logps/chosen": -349.061767578125, "logps/rejected": -264.0221862792969, "loss": 0.3864, "rewards/chosen": 0.11054554581642151, "rewards/margins": 1.2506615221500397, "rewards/rejected": -1.1401159763336182, "step": 4768 }, { "epoch": 0.2527760845943869, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75518.66666666667, "logits/rejected": -38027337.6, "logps/chosen": -130.72555541992188, "logps/rejected": -371.683935546875, "loss": 0.243, "rewards/chosen": 0.5221513112386068, "rewards/margins": 2.620090993245443, "rewards/rejected": -2.097939682006836, "step": 4769 }, { "epoch": 0.252829088596189, "grad_norm": 50.5, "kl": 1.775146484375, "learning_rate": 5e-07, "logits/chosen": -32556024.0, "logits/rejected": -10901261.6, "logps/chosen": -499.518798828125, "logps/rejected": -174.4086181640625, "loss": 0.2546, "rewards/chosen": 0.939079761505127, "rewards/margins": 2.6370327949523924, "rewards/rejected": -1.6979530334472657, "step": 4770 }, { "epoch": 0.25288209259799116, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24711472.0, "logits/rejected": -17776836.0, "logps/chosen": -298.369775390625, "logps/rejected": -266.17014567057294, "loss": 0.4136, "rewards/chosen": -0.18173325061798096, "rewards/margins": 1.2411631345748901, "rewards/rejected": -1.422896385192871, "step": 4771 }, { "epoch": 0.2529350965997933, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27200916.0, "logits/rejected": 5679202.5, "logps/chosen": -304.20916748046875, "logps/rejected": -260.4577331542969, "loss": 0.2051, "rewards/chosen": 0.8230383396148682, "rewards/margins": 3.2951037883758545, "rewards/rejected": -2.4720654487609863, "step": 4772 }, { "epoch": 0.25298810060159543, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33405571.2, "logits/rejected": -24866021.333333332, "logps/chosen": -115.05648193359374, "logps/rejected": -345.0277913411458, "loss": 0.365, "rewards/chosen": -0.18534364700317382, "rewards/margins": 2.399936612447103, "rewards/rejected": -2.585280259450277, "step": 4773 }, { "epoch": 0.25304110460339757, "grad_norm": 51.0, "kl": 0.587885856628418, "learning_rate": 5e-07, "logits/chosen": -13229562.285714285, "logits/rejected": 60409472.0, "logps/chosen": -179.56342424665178, "logps/rejected": -224.45248413085938, "loss": 0.3925, "rewards/chosen": 0.4740692547389439, "rewards/margins": 1.5818176439830234, "rewards/rejected": -1.1077483892440796, "step": 4774 }, { "epoch": 0.2530941086051997, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30294521.6, "logits/rejected": -39333021.333333336, "logps/chosen": -188.41571044921875, "logps/rejected": -567.0680745442709, "loss": 0.3008, "rewards/chosen": 0.41979207992553713, "rewards/margins": 2.8109908739725746, "rewards/rejected": -2.3911987940470376, "step": 4775 }, { "epoch": 0.25314711260700185, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52617344.0, "logits/rejected": -57372792.0, "logps/chosen": -431.7926940917969, "logps/rejected": -370.2845153808594, "loss": 0.2569, "rewards/chosen": 0.3546340763568878, "rewards/margins": 2.771959573030472, "rewards/rejected": -2.417325496673584, "step": 4776 }, { "epoch": 0.253200116608804, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48941754.666666664, "logits/rejected": -38766099.2, "logps/chosen": -345.4951171875, "logps/rejected": -338.09619140625, "loss": 0.2325, "rewards/chosen": 0.5613154172897339, "rewards/margins": 2.75574996471405, "rewards/rejected": -2.1944345474243163, "step": 4777 }, { "epoch": 0.2532531206106061, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3116853.25, "logits/rejected": -5791864.0, "logps/chosen": -62.28078079223633, "logps/rejected": -213.29744466145834, "loss": 0.2971, "rewards/chosen": -0.1800277680158615, "rewards/margins": 1.4647598614295323, "rewards/rejected": -1.6447876294453938, "step": 4778 }, { "epoch": 0.25330612461240826, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10029512.0, "logits/rejected": -40173554.28571428, "logps/chosen": -24.743877410888672, "logps/rejected": -461.9934779575893, "loss": 0.1919, "rewards/chosen": -0.06520634144544601, "rewards/margins": 1.8604118302464485, "rewards/rejected": -1.9256181716918945, "step": 4779 }, { "epoch": 0.2533591286142104, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -143491872.0, "logits/rejected": -20636118.666666668, "logps/chosen": -371.9939880371094, "logps/rejected": -331.3744710286458, "loss": 0.2566, "rewards/chosen": -0.21645812690258026, "rewards/margins": 1.8877708464860916, "rewards/rejected": -2.104228973388672, "step": 4780 }, { "epoch": 0.25341213261601253, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35360188.0, "logits/rejected": -56090640.0, "logps/chosen": -358.3634948730469, "logps/rejected": -313.67034912109375, "loss": 0.2977, "rewards/chosen": 0.5441761016845703, "rewards/margins": 2.417095899581909, "rewards/rejected": -1.8729197978973389, "step": 4781 }, { "epoch": 0.25346513661781467, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27068978.285714287, "logits/rejected": -58691792.0, "logps/chosen": -327.2407924107143, "logps/rejected": -687.9202270507812, "loss": 0.3563, "rewards/chosen": 0.43521370206560406, "rewards/margins": 3.342348711831229, "rewards/rejected": -2.907135009765625, "step": 4782 }, { "epoch": 0.2535181406196168, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41398148.0, "logits/rejected": -87259656.0, "logps/chosen": -292.75128173828125, "logps/rejected": -477.8979797363281, "loss": 0.3405, "rewards/chosen": 0.19893188774585724, "rewards/margins": 1.46620412170887, "rewards/rejected": -1.2672722339630127, "step": 4783 }, { "epoch": 0.25357114462141894, "grad_norm": 67.0, "kl": 1.3401832580566406, "learning_rate": 5e-07, "logits/chosen": -38250601.6, "logits/rejected": -1810824.6666666667, "logps/chosen": -429.058837890625, "logps/rejected": -326.3172607421875, "loss": 0.3315, "rewards/chosen": 0.6626352310180664, "rewards/margins": 1.9387898286183676, "rewards/rejected": -1.276154597600301, "step": 4784 }, { "epoch": 0.2536241486232211, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45647018.666666664, "logits/rejected": -126197.625, "logps/chosen": -261.8729654947917, "logps/rejected": -72.19418334960938, "loss": 0.3737, "rewards/chosen": 0.14945844809214273, "rewards/margins": 2.479546586672465, "rewards/rejected": -2.3300881385803223, "step": 4785 }, { "epoch": 0.2536771526250232, "grad_norm": 61.25, "kl": 0.751983642578125, "learning_rate": 5e-07, "logits/chosen": -11189173.333333334, "logits/rejected": -28148612.0, "logps/chosen": -516.9667154947916, "logps/rejected": -171.91873168945312, "loss": 0.3808, "rewards/chosen": 0.5858196417490641, "rewards/margins": 2.458148996035258, "rewards/rejected": -1.8723293542861938, "step": 4786 }, { "epoch": 0.2537301566268253, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3364560.6666666665, "logits/rejected": -286423.0, "logps/chosen": -45.67376200358073, "logps/rejected": -400.7879638671875, "loss": 0.2823, "rewards/chosen": 0.13024603327115378, "rewards/margins": 2.0941719671090446, "rewards/rejected": -1.9639259338378907, "step": 4787 }, { "epoch": 0.25378316062862744, "grad_norm": 63.25, "kl": 0.20914840698242188, "learning_rate": 5e-07, "logits/chosen": -17392300.57142857, "logits/rejected": -44809880.0, "logps/chosen": -294.8069545200893, "logps/rejected": -430.22686767578125, "loss": 0.4643, "rewards/chosen": -0.049225066389356344, "rewards/margins": 2.469882036958422, "rewards/rejected": -2.5191071033477783, "step": 4788 }, { "epoch": 0.2538361646304296, "grad_norm": 47.5, "kl": 0.12737464904785156, "learning_rate": 5e-07, "logits/chosen": -53010176.0, "logits/rejected": -26705284.8, "logps/chosen": -168.90789794921875, "logps/rejected": -317.050146484375, "loss": 0.2869, "rewards/chosen": -0.1557872792085012, "rewards/margins": 2.1061719874540965, "rewards/rejected": -2.261959266662598, "step": 4789 }, { "epoch": 0.2538891686322317, "grad_norm": 52.5, "kl": 0.3827781677246094, "learning_rate": 5e-07, "logits/chosen": -40227884.0, "logits/rejected": -20392376.0, "logps/chosen": -342.35498046875, "logps/rejected": -132.0237579345703, "loss": 0.3361, "rewards/chosen": 0.2827069163322449, "rewards/margins": 1.691229522228241, "rewards/rejected": -1.408522605895996, "step": 4790 }, { "epoch": 0.25394217263403385, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30176208.0, "logits/rejected": -12253433.333333334, "logps/chosen": -205.15753173828125, "logps/rejected": -313.64837646484375, "loss": 0.2561, "rewards/chosen": 0.6639190912246704, "rewards/margins": 2.041210452715556, "rewards/rejected": -1.3772913614908855, "step": 4791 }, { "epoch": 0.253995176635836, "grad_norm": 77.5, "kl": 1.5962390899658203, "learning_rate": 5e-07, "logits/chosen": -44153741.71428572, "logits/rejected": -8580065.0, "logps/chosen": -405.89065987723217, "logps/rejected": -99.21356201171875, "loss": 0.3839, "rewards/chosen": 0.5349901744297573, "rewards/margins": 2.0496805735996793, "rewards/rejected": -1.5146903991699219, "step": 4792 }, { "epoch": 0.2540481806376381, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49545913.6, "logits/rejected": -19247968.0, "logps/chosen": -351.655029296875, "logps/rejected": -192.76749674479166, "loss": 0.4135, "rewards/chosen": -0.11215469837188721, "rewards/margins": 1.2370174169540404, "rewards/rejected": -1.3491721153259277, "step": 4793 }, { "epoch": 0.25410118463944026, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8580995.2, "logits/rejected": -20635749.333333332, "logps/chosen": -250.288818359375, "logps/rejected": -471.8448486328125, "loss": 0.2722, "rewards/chosen": 0.5144241333007813, "rewards/margins": 2.89771728515625, "rewards/rejected": -2.3832931518554688, "step": 4794 }, { "epoch": 0.2541541886412424, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23371938.0, "logits/rejected": -24302836.0, "logps/chosen": -237.7583465576172, "logps/rejected": -472.5130310058594, "loss": 0.2852, "rewards/chosen": 0.04313831031322479, "rewards/margins": 2.6484988182783127, "rewards/rejected": -2.605360507965088, "step": 4795 }, { "epoch": 0.25420719264304453, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89543184.0, "logits/rejected": -22903248.0, "logps/chosen": -254.4633331298828, "logps/rejected": -448.0753479003906, "loss": 0.2705, "rewards/chosen": 0.23998557031154633, "rewards/margins": 2.655361756682396, "rewards/rejected": -2.4153761863708496, "step": 4796 }, { "epoch": 0.25426019664484667, "grad_norm": 62.0, "kl": 1.7549896240234375, "learning_rate": 5e-07, "logits/chosen": -71624025.6, "logits/rejected": 7833713.333333333, "logps/chosen": -447.944189453125, "logps/rejected": -462.0736490885417, "loss": 0.3798, "rewards/chosen": 0.4670724868774414, "rewards/margins": 2.1472686131795244, "rewards/rejected": -1.6801961263020833, "step": 4797 }, { "epoch": 0.2543132006466488, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2575986.75, "logits/rejected": -18880141.333333332, "logps/chosen": -185.8543243408203, "logps/rejected": -357.0456949869792, "loss": 0.2052, "rewards/chosen": 0.6990095376968384, "rewards/margins": 2.5494660933812456, "rewards/rejected": -1.8504565556844075, "step": 4798 }, { "epoch": 0.25436620464845094, "grad_norm": 81.5, "kl": 1.4352607727050781, "learning_rate": 5e-07, "logits/chosen": -71727824.0, "logits/rejected": -3963944.8, "logps/chosen": -705.6119791666666, "logps/rejected": -180.5966552734375, "loss": 0.3573, "rewards/chosen": 0.5986287196477255, "rewards/margins": 1.4863487323125204, "rewards/rejected": -0.8877200126647949, "step": 4799 }, { "epoch": 0.2544192086502531, "grad_norm": 47.25, "kl": 2.5616302490234375, "learning_rate": 5e-07, "logits/chosen": -33998483.2, "logits/rejected": -16842874.666666668, "logps/chosen": -822.31650390625, "logps/rejected": -247.08695475260416, "loss": 0.336, "rewards/chosen": 0.9387874603271484, "rewards/margins": 2.681959629058838, "rewards/rejected": -1.7431721687316895, "step": 4800 }, { "epoch": 0.2544722126520552, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49425824.0, "logits/rejected": 8245837.333333333, "logps/chosen": -365.5063720703125, "logps/rejected": -353.2962239583333, "loss": 0.3601, "rewards/chosen": 0.04432617127895355, "rewards/margins": 2.221454009413719, "rewards/rejected": -2.1771278381347656, "step": 4801 }, { "epoch": 0.25452521665385736, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24313634.0, "logits/rejected": -4202266.0, "logps/chosen": -286.8629150390625, "logps/rejected": -93.23622131347656, "loss": 0.4091, "rewards/chosen": -0.2673564851284027, "rewards/margins": 1.0577141344547272, "rewards/rejected": -1.3250706195831299, "step": 4802 }, { "epoch": 0.2545782206556595, "grad_norm": 42.75, "kl": 0.45812034606933594, "learning_rate": 5e-07, "logits/chosen": -7970555.0, "logits/rejected": -41297960.0, "logps/chosen": -204.6785888671875, "logps/rejected": -360.8898620605469, "loss": 0.3173, "rewards/chosen": 0.3488914668560028, "rewards/margins": 2.0414901673793793, "rewards/rejected": -1.6925987005233765, "step": 4803 }, { "epoch": 0.25463122465746163, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25530557.333333332, "logits/rejected": -14844028.8, "logps/chosen": -290.46962483723956, "logps/rejected": -362.5227294921875, "loss": 0.2554, "rewards/chosen": 0.41657861073811847, "rewards/margins": 2.384363301595052, "rewards/rejected": -1.9677846908569336, "step": 4804 }, { "epoch": 0.25468422865926377, "grad_norm": 40.75, "kl": 0.14617919921875, "learning_rate": 5e-07, "logits/chosen": -34501724.0, "logits/rejected": -39374052.571428575, "logps/chosen": -247.31939697265625, "logps/rejected": -270.620849609375, "loss": 0.152, "rewards/chosen": 1.2430022954940796, "rewards/margins": 3.200140629495893, "rewards/rejected": -1.9571383340018136, "step": 4805 }, { "epoch": 0.2547372326610659, "grad_norm": 36.5, "kl": 0.4857940673828125, "learning_rate": 5e-07, "logits/chosen": -43264912.0, "logits/rejected": -26274642.666666668, "logps/chosen": -242.25021362304688, "logps/rejected": -290.4993896484375, "loss": 0.212, "rewards/chosen": 0.665796160697937, "rewards/margins": 2.530283172925313, "rewards/rejected": -1.8644870122273762, "step": 4806 }, { "epoch": 0.25479023666286804, "grad_norm": 34.5, "kl": 0.6669845581054688, "learning_rate": 5e-07, "logits/chosen": -43198285.333333336, "logits/rejected": -34517996.8, "logps/chosen": -215.74702962239584, "logps/rejected": -280.0821533203125, "loss": 0.2598, "rewards/chosen": 0.3677204449971517, "rewards/margins": 2.333323319753011, "rewards/rejected": -1.9656028747558594, "step": 4807 }, { "epoch": 0.2548432406646702, "grad_norm": 46.25, "kl": 0.307586669921875, "learning_rate": 5e-07, "logits/chosen": -25865664.0, "logits/rejected": -1597286.6, "logps/chosen": -487.6865234375, "logps/rejected": -88.258642578125, "loss": 0.2822, "rewards/chosen": 0.26343870162963867, "rewards/margins": 2.218901348114014, "rewards/rejected": -1.955462646484375, "step": 4808 }, { "epoch": 0.2548962446664723, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18837980.8, "logits/rejected": -39804373.333333336, "logps/chosen": -153.9534423828125, "logps/rejected": -461.5443115234375, "loss": 0.2884, "rewards/chosen": 0.5959049224853515, "rewards/margins": 2.6906667709350587, "rewards/rejected": -2.094761848449707, "step": 4809 }, { "epoch": 0.25494924866827445, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13876212.0, "logits/rejected": -30383568.0, "logps/chosen": -274.5273132324219, "logps/rejected": -236.9563446044922, "loss": 0.3056, "rewards/chosen": 0.28815725445747375, "rewards/margins": 2.050362855195999, "rewards/rejected": -1.7622056007385254, "step": 4810 }, { "epoch": 0.2550022526700766, "grad_norm": 52.0, "kl": 0.0224761962890625, "learning_rate": 5e-07, "logits/chosen": -49685813.333333336, "logits/rejected": 7013608.0, "logps/chosen": -435.5029296875, "logps/rejected": -288.4568359375, "loss": 0.2506, "rewards/chosen": 0.6327738761901855, "rewards/margins": 2.4269049644470213, "rewards/rejected": -1.794131088256836, "step": 4811 }, { "epoch": 0.2550552566718787, "grad_norm": 56.5, "kl": 0.8612899780273438, "learning_rate": 5e-07, "logits/chosen": -54647123.2, "logits/rejected": 1495095.8333333333, "logps/chosen": -287.1673828125, "logps/rejected": -143.6044718424479, "loss": 0.377, "rewards/chosen": 0.10449483394622802, "rewards/margins": 1.8590230385462443, "rewards/rejected": -1.7545282046000164, "step": 4812 }, { "epoch": 0.25510826067368086, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21751230.4, "logits/rejected": 759697.3333333334, "logps/chosen": -299.2012451171875, "logps/rejected": -393.4797770182292, "loss": 0.358, "rewards/chosen": 0.014790952205657959, "rewards/margins": 2.32714980840683, "rewards/rejected": -2.312358856201172, "step": 4813 }, { "epoch": 0.255161264675483, "grad_norm": 50.0, "kl": 0.8051986694335938, "learning_rate": 5e-07, "logits/chosen": -64996024.0, "logits/rejected": -38845040.0, "logps/chosen": -381.8163757324219, "logps/rejected": -311.6661376953125, "loss": 0.3123, "rewards/chosen": 0.5803368091583252, "rewards/margins": 2.0961551666259766, "rewards/rejected": -1.5158183574676514, "step": 4814 }, { "epoch": 0.25521426867728514, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39791993.6, "logits/rejected": -7947520.0, "logps/chosen": -211.824560546875, "logps/rejected": -201.4331258138021, "loss": 0.3538, "rewards/chosen": 0.14954239130020142, "rewards/margins": 1.7413774132728577, "rewards/rejected": -1.5918350219726562, "step": 4815 }, { "epoch": 0.2552672726790873, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91496424.0, "logits/rejected": -13277.5, "logps/chosen": -339.8642883300781, "logps/rejected": -77.56754302978516, "loss": 0.4148, "rewards/chosen": -0.1741413176059723, "rewards/margins": 1.1153765618801117, "rewards/rejected": -1.289517879486084, "step": 4816 }, { "epoch": 0.2553202766808894, "grad_norm": 57.75, "kl": 0.6915950775146484, "learning_rate": 5e-07, "logits/chosen": -72086824.0, "logits/rejected": -10078996.0, "logps/chosen": -747.7272338867188, "logps/rejected": -76.40535736083984, "loss": 0.2832, "rewards/chosen": 0.8742870092391968, "rewards/margins": 2.1322134733200073, "rewards/rejected": -1.2579264640808105, "step": 4817 }, { "epoch": 0.25537328068269155, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36981328.0, "logits/rejected": -528081.125, "logps/chosen": -277.7236328125, "logps/rejected": -92.74244689941406, "loss": 0.353, "rewards/chosen": 0.41817665100097656, "rewards/margins": 1.9292579889297485, "rewards/rejected": -1.511081337928772, "step": 4818 }, { "epoch": 0.2554262846844937, "grad_norm": 54.5, "kl": 0.01678466796875, "learning_rate": 5e-07, "logits/chosen": -31876720.0, "logits/rejected": 25047708.0, "logps/chosen": -283.5161946614583, "logps/rejected": -507.82080078125, "loss": 0.39, "rewards/chosen": 0.09154446919759114, "rewards/margins": 2.4573386510213218, "rewards/rejected": -2.3657941818237305, "step": 4819 }, { "epoch": 0.2554792886862958, "grad_norm": 66.0, "kl": 1.08868408203125, "learning_rate": 5e-07, "logits/chosen": -49165968.0, "logits/rejected": -50528876.8, "logps/chosen": -424.4359537760417, "logps/rejected": -412.736474609375, "loss": 0.332, "rewards/chosen": 0.19360198577245077, "rewards/margins": 1.5094657858212788, "rewards/rejected": -1.315863800048828, "step": 4820 }, { "epoch": 0.25553229268809796, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 54799464.0, "logits/rejected": -17351973.333333332, "logps/chosen": -381.66717529296875, "logps/rejected": -231.62255859375, "loss": 0.2855, "rewards/chosen": 0.4831603765487671, "rewards/margins": 1.771662433942159, "rewards/rejected": -1.2885020573933919, "step": 4821 }, { "epoch": 0.2555852966899001, "grad_norm": 46.75, "kl": 0.000926971435546875, "learning_rate": 5e-07, "logits/chosen": -24077988.0, "logits/rejected": -25920792.0, "logps/chosen": -314.360107421875, "logps/rejected": -363.56646728515625, "loss": 0.2477, "rewards/chosen": 0.5940189361572266, "rewards/margins": 2.8706936836242676, "rewards/rejected": -2.276674747467041, "step": 4822 }, { "epoch": 0.25563830069170224, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12566850.666666666, "logits/rejected": -467006.3125, "logps/chosen": -221.39520263671875, "logps/rejected": -105.39724731445312, "loss": 0.4499, "rewards/chosen": -0.1632437308629354, "rewards/margins": 1.2404625813166301, "rewards/rejected": -1.4037063121795654, "step": 4823 }, { "epoch": 0.2556913046935044, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52181941.333333336, "logits/rejected": 302574438.4, "logps/chosen": -358.52783203125, "logps/rejected": -509.465625, "loss": 0.227, "rewards/chosen": 0.5000274976094564, "rewards/margins": 2.5639341672261557, "rewards/rejected": -2.063906669616699, "step": 4824 }, { "epoch": 0.2557443086953065, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7112187.0, "logits/rejected": -12584034.0, "logps/chosen": -240.2354736328125, "logps/rejected": -458.9639892578125, "loss": 0.2207, "rewards/chosen": 0.6258643865585327, "rewards/margins": 3.1853665113449097, "rewards/rejected": -2.559502124786377, "step": 4825 }, { "epoch": 0.25579731269710865, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11678402.4, "logits/rejected": -44995504.0, "logps/chosen": -226.1538330078125, "logps/rejected": -326.6165771484375, "loss": 0.3542, "rewards/chosen": 0.4970210552215576, "rewards/margins": 1.5054782390594483, "rewards/rejected": -1.0084571838378906, "step": 4826 }, { "epoch": 0.2558503166989108, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23287880.0, "logits/rejected": -24141349.333333332, "logps/chosen": -432.82818603515625, "logps/rejected": -321.89129638671875, "loss": 0.3007, "rewards/chosen": -0.193623349070549, "rewards/margins": 1.3254987746477127, "rewards/rejected": -1.5191221237182617, "step": 4827 }, { "epoch": 0.2559033207007129, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33352984.0, "logits/rejected": -22366281.6, "logps/chosen": -321.3802083333333, "logps/rejected": -506.605908203125, "loss": 0.1819, "rewards/chosen": 0.4676371415456136, "rewards/margins": 3.8425391038258874, "rewards/rejected": -3.3749019622802736, "step": 4828 }, { "epoch": 0.25595632470251506, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5221164.666666667, "logits/rejected": 5565105.0, "logps/chosen": -159.57967122395834, "logps/rejected": -398.21044921875, "loss": 0.4016, "rewards/chosen": 0.12659049034118652, "rewards/margins": 1.83133065700531, "rewards/rejected": -1.7047401666641235, "step": 4829 }, { "epoch": 0.2560093287043172, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20101112.0, "logits/rejected": -18749028.0, "logps/chosen": -239.7886962890625, "logps/rejected": -440.3429260253906, "loss": 0.3011, "rewards/chosen": 0.08323673903942108, "rewards/margins": 2.554554983973503, "rewards/rejected": -2.471318244934082, "step": 4830 }, { "epoch": 0.25606233270611933, "grad_norm": 54.75, "kl": 1.367959976196289, "learning_rate": 5e-07, "logits/chosen": -26357513.6, "logits/rejected": -20637556.0, "logps/chosen": -525.919140625, "logps/rejected": -320.585205078125, "loss": 0.2363, "rewards/chosen": 1.1420449256896972, "rewards/margins": 3.0343684196472167, "rewards/rejected": -1.8923234939575195, "step": 4831 }, { "epoch": 0.25611533670792147, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43620588.0, "logits/rejected": -50467770.666666664, "logps/chosen": -168.50889587402344, "logps/rejected": -271.8291015625, "loss": 0.3517, "rewards/chosen": -0.3575730323791504, "rewards/margins": 0.7317773501078289, "rewards/rejected": -1.0893503824869792, "step": 4832 }, { "epoch": 0.2561683407097236, "grad_norm": 54.5, "kl": 0.6716461181640625, "learning_rate": 5e-07, "logits/chosen": -33228812.8, "logits/rejected": -41477464.0, "logps/chosen": -432.630224609375, "logps/rejected": -550.2239990234375, "loss": 0.3642, "rewards/chosen": 0.12553129196166993, "rewards/margins": 2.7813030242919923, "rewards/rejected": -2.6557717323303223, "step": 4833 }, { "epoch": 0.25622134471152574, "grad_norm": 47.0, "kl": 0.7308731079101562, "learning_rate": 5e-07, "logits/chosen": -19970846.0, "logits/rejected": -32700232.0, "logps/chosen": -301.1282958984375, "logps/rejected": -398.3521321614583, "loss": 0.2446, "rewards/chosen": 0.3374794125556946, "rewards/margins": 2.2813606063524885, "rewards/rejected": -1.9438811937967937, "step": 4834 }, { "epoch": 0.2562743487133279, "grad_norm": 73.5, "kl": 0.11643218994140625, "learning_rate": 5e-07, "logits/chosen": -16985746.666666668, "logits/rejected": -13381303.0, "logps/chosen": -429.3759765625, "logps/rejected": -74.86825561523438, "loss": 0.4299, "rewards/chosen": 0.1991752783457438, "rewards/margins": 0.7849018375078837, "rewards/rejected": -0.5857265591621399, "step": 4835 }, { "epoch": 0.25632735271513, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7049437.333333333, "logits/rejected": -2842747.0, "logps/chosen": -217.09322102864584, "logps/rejected": -335.00543212890625, "loss": 0.4358, "rewards/chosen": 0.09810336430867513, "rewards/margins": 0.9055847724278768, "rewards/rejected": -0.8074814081192017, "step": 4836 }, { "epoch": 0.2563803567169321, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18537449.333333332, "logits/rejected": -25806483.2, "logps/chosen": -183.62418619791666, "logps/rejected": -242.366455078125, "loss": 0.3054, "rewards/chosen": -0.08229075868924458, "rewards/margins": 1.5388850073019664, "rewards/rejected": -1.621175765991211, "step": 4837 }, { "epoch": 0.25643336071873424, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12859841.0, "logits/rejected": -53996245.333333336, "logps/chosen": -376.48309326171875, "logps/rejected": -375.3912760416667, "loss": 0.2293, "rewards/chosen": 0.4548565149307251, "rewards/margins": 2.448970119158427, "rewards/rejected": -1.994113604227702, "step": 4838 }, { "epoch": 0.2564863647205364, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7704808.5, "logits/rejected": -60254256.0, "logps/chosen": -368.7435302734375, "logps/rejected": -337.7290954589844, "loss": 0.3399, "rewards/chosen": 0.11989239603281021, "rewards/margins": 2.090480960905552, "rewards/rejected": -1.9705885648727417, "step": 4839 }, { "epoch": 0.2565393687223385, "grad_norm": 64.5, "kl": 0.11587810516357422, "learning_rate": 5e-07, "logits/chosen": -41091520.0, "logits/rejected": -86297432.0, "logps/chosen": -423.9403076171875, "logps/rejected": -278.45513916015625, "loss": 0.2403, "rewards/chosen": 0.5442700386047363, "rewards/margins": 2.7462515830993652, "rewards/rejected": -2.201981544494629, "step": 4840 }, { "epoch": 0.25659237272414065, "grad_norm": 37.0, "kl": 0.5212898254394531, "learning_rate": 5e-07, "logits/chosen": -25859748.0, "logits/rejected": -20294724.0, "logps/chosen": -141.44415283203125, "logps/rejected": -460.1826171875, "loss": 0.2989, "rewards/chosen": 0.33814266324043274, "rewards/margins": 2.749945431947708, "rewards/rejected": -2.4118027687072754, "step": 4841 }, { "epoch": 0.2566453767259428, "grad_norm": 62.25, "kl": 1.57855224609375, "learning_rate": 5e-07, "logits/chosen": -9514108.0, "logits/rejected": 2962199.0, "logps/chosen": -264.5218200683594, "logps/rejected": -313.537841796875, "loss": 0.3286, "rewards/chosen": 0.3817729949951172, "rewards/margins": 2.4630656242370605, "rewards/rejected": -2.0812926292419434, "step": 4842 }, { "epoch": 0.2566983807277449, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51419452.0, "logits/rejected": -28571594.666666668, "logps/chosen": -429.32403564453125, "logps/rejected": -554.2102864583334, "loss": 0.3025, "rewards/chosen": -0.6582382321357727, "rewards/margins": 1.1296274065971375, "rewards/rejected": -1.7878656387329102, "step": 4843 }, { "epoch": 0.25675138472954706, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38459512.0, "logits/rejected": -45210009.6, "logps/chosen": -309.5012613932292, "logps/rejected": -313.2773681640625, "loss": 0.2789, "rewards/chosen": 0.3898773193359375, "rewards/margins": 2.012233352661133, "rewards/rejected": -1.6223560333251954, "step": 4844 }, { "epoch": 0.2568043887313492, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63811248.0, "logits/rejected": 11263608.0, "logps/chosen": -587.7751057942709, "logps/rejected": -190.8950927734375, "loss": 0.244, "rewards/chosen": 0.6375840902328491, "rewards/margins": 2.437102484703064, "rewards/rejected": -1.7995183944702149, "step": 4845 }, { "epoch": 0.25685739273315134, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36557189.333333336, "logits/rejected": -18552270.4, "logps/chosen": -393.1029866536458, "logps/rejected": -215.1751953125, "loss": 0.284, "rewards/chosen": 0.24279427528381348, "rewards/margins": 1.960970163345337, "rewards/rejected": -1.7181758880615234, "step": 4846 }, { "epoch": 0.2569103967349535, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42901424.0, "logits/rejected": -15523704.0, "logps/chosen": -409.36728515625, "logps/rejected": -328.93902587890625, "loss": 0.4176, "rewards/chosen": -0.07542389035224914, "rewards/margins": 1.0937131901582082, "rewards/rejected": -1.1691370805104573, "step": 4847 }, { "epoch": 0.2569634007367556, "grad_norm": 59.0, "kl": 3.1758460998535156, "learning_rate": 5e-07, "logits/chosen": -21879817.14285714, "logits/rejected": 2509113.25, "logps/chosen": -308.5371791294643, "logps/rejected": -312.9749450683594, "loss": 0.4346, "rewards/chosen": 0.42126127651759554, "rewards/margins": 2.738671507154192, "rewards/rejected": -2.3174102306365967, "step": 4848 }, { "epoch": 0.25701640473855775, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51484496.0, "logits/rejected": -48356940.8, "logps/chosen": -316.1514485677083, "logps/rejected": -308.9898681640625, "loss": 0.353, "rewards/chosen": -0.6205767393112183, "rewards/margins": 1.3793927431106567, "rewards/rejected": -1.999969482421875, "step": 4849 }, { "epoch": 0.2570694087403599, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11652340.0, "logits/rejected": -12850944.0, "logps/chosen": -30.955217361450195, "logps/rejected": -230.9410196940104, "loss": 0.3234, "rewards/chosen": 0.036303892731666565, "rewards/margins": 1.2038374493519466, "rewards/rejected": -1.16753355662028, "step": 4850 }, { "epoch": 0.257122412742162, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1242565.3333333333, "logits/rejected": -18574076.8, "logps/chosen": -412.3643391927083, "logps/rejected": -295.9208740234375, "loss": 0.2883, "rewards/chosen": -0.1044921875, "rewards/margins": 1.807170295715332, "rewards/rejected": -1.911662483215332, "step": 4851 }, { "epoch": 0.25717541674396416, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45654156.8, "logits/rejected": -10073747.333333334, "logps/chosen": -459.31748046875, "logps/rejected": -235.20792643229166, "loss": 0.3722, "rewards/chosen": 0.2608889818191528, "rewards/margins": 1.6179330428441365, "rewards/rejected": -1.3570440610249836, "step": 4852 }, { "epoch": 0.2572284207457663, "grad_norm": 51.75, "kl": 2.4107322692871094, "learning_rate": 5e-07, "logits/chosen": -40011216.0, "logits/rejected": -13916560.0, "logps/chosen": -363.341552734375, "logps/rejected": -228.79623413085938, "loss": 0.414, "rewards/chosen": 0.3725423415501912, "rewards/margins": 2.1381402810414634, "rewards/rejected": -1.765597939491272, "step": 4853 }, { "epoch": 0.25728142474756843, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9443970.0, "logits/rejected": -101627488.0, "logps/chosen": -205.4701690673828, "logps/rejected": -316.1643981933594, "loss": 0.3646, "rewards/chosen": -0.22631406784057617, "rewards/margins": 1.6508708000183105, "rewards/rejected": -1.8771848678588867, "step": 4854 }, { "epoch": 0.25733442874937057, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25243472.0, "logits/rejected": -32205696.0, "logps/chosen": -317.5749104817708, "logps/rejected": -198.9778076171875, "loss": 0.2396, "rewards/chosen": 0.40753380457560223, "rewards/margins": 2.3859199206034343, "rewards/rejected": -1.978386116027832, "step": 4855 }, { "epoch": 0.2573874327511727, "grad_norm": 61.0, "kl": 1.3554458618164062, "learning_rate": 5e-07, "logits/chosen": -62635509.333333336, "logits/rejected": -23037184.0, "logps/chosen": -331.7638753255208, "logps/rejected": -213.14996337890625, "loss": 0.3959, "rewards/chosen": 0.38342610994974774, "rewards/margins": 1.881524125734965, "rewards/rejected": -1.4980980157852173, "step": 4856 }, { "epoch": 0.25744043675297484, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44025724.0, "logits/rejected": -42424490.666666664, "logps/chosen": -552.9193115234375, "logps/rejected": -230.30668131510416, "loss": 0.2504, "rewards/chosen": 0.9580719470977783, "rewards/margins": 2.404552062352498, "rewards/rejected": -1.44648011525472, "step": 4857 }, { "epoch": 0.257493440754777, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48251264.0, "logits/rejected": -597285.625, "logps/chosen": -222.26751708984375, "logps/rejected": -35.053131103515625, "loss": 0.4677, "rewards/chosen": -0.334727942943573, "rewards/margins": 0.279117226600647, "rewards/rejected": -0.61384516954422, "step": 4858 }, { "epoch": 0.2575464447565791, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82186192.0, "logits/rejected": -2964547.0, "logps/chosen": -455.3468017578125, "logps/rejected": -502.5989685058594, "loss": 0.3334, "rewards/chosen": -0.15217667818069458, "rewards/margins": 1.8679930567741394, "rewards/rejected": -2.020169734954834, "step": 4859 }, { "epoch": 0.25759944875838126, "grad_norm": 51.25, "kl": 1.2506141662597656, "learning_rate": 5e-07, "logits/chosen": 19377465.333333332, "logits/rejected": -15050443.2, "logps/chosen": -370.7775065104167, "logps/rejected": -155.8095458984375, "loss": 0.2688, "rewards/chosen": 0.5343470176060995, "rewards/margins": 2.3706371863683064, "rewards/rejected": -1.836290168762207, "step": 4860 }, { "epoch": 0.2576524527601834, "grad_norm": 60.75, "kl": 0.183074951171875, "learning_rate": 5e-07, "logits/chosen": -87066528.0, "logits/rejected": -14289175.0, "logps/chosen": -364.4527893066406, "logps/rejected": -215.46054077148438, "loss": 0.3617, "rewards/chosen": 0.1577156037092209, "rewards/margins": 1.3627962321043015, "rewards/rejected": -1.2050806283950806, "step": 4861 }, { "epoch": 0.25770545676198553, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11112276.0, "logits/rejected": -10483474.666666666, "logps/chosen": -259.2018737792969, "logps/rejected": -142.0386962890625, "loss": 0.2676, "rewards/chosen": 0.06821175664663315, "rewards/margins": 1.862026256819566, "rewards/rejected": -1.7938145001729329, "step": 4862 }, { "epoch": 0.25775846076378767, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56942164.0, "logits/rejected": -26059988.0, "logps/chosen": -252.45147705078125, "logps/rejected": -385.5824890136719, "loss": 0.3019, "rewards/chosen": 0.11525154113769531, "rewards/margins": 2.294431686401367, "rewards/rejected": -2.179180145263672, "step": 4863 }, { "epoch": 0.2578114647655898, "grad_norm": 63.25, "kl": 0.3782625198364258, "learning_rate": 5e-07, "logits/chosen": -16774912.0, "logits/rejected": -10792792.0, "logps/chosen": -225.7377726236979, "logps/rejected": -287.3946533203125, "loss": 0.4383, "rewards/chosen": 0.13431484500567117, "rewards/margins": 0.9649365643660227, "rewards/rejected": -0.8306217193603516, "step": 4864 }, { "epoch": 0.25786446876739194, "grad_norm": 36.75, "kl": 0.5829463005065918, "learning_rate": 5e-07, "logits/chosen": -2977619.75, "logits/rejected": -27385004.0, "logps/chosen": -115.72111511230469, "logps/rejected": -250.08642578125, "loss": 0.3011, "rewards/chosen": 0.13010433316230774, "rewards/margins": 2.6601217091083527, "rewards/rejected": -2.530017375946045, "step": 4865 }, { "epoch": 0.2579174727691941, "grad_norm": 45.25, "kl": 0.49477386474609375, "learning_rate": 5e-07, "logits/chosen": -57970004.0, "logits/rejected": -20736460.0, "logps/chosen": -225.15415954589844, "logps/rejected": -264.40191650390625, "loss": 0.3608, "rewards/chosen": -0.09783878922462463, "rewards/margins": 1.718450278043747, "rewards/rejected": -1.8162890672683716, "step": 4866 }, { "epoch": 0.2579704767709962, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23251700.8, "logits/rejected": -46002197.333333336, "logps/chosen": -255.007666015625, "logps/rejected": -503.5886637369792, "loss": 0.2969, "rewards/chosen": 0.2801643133163452, "rewards/margins": 3.0644215663274132, "rewards/rejected": -2.784257253011068, "step": 4867 }, { "epoch": 0.25802348077279835, "grad_norm": 63.75, "kl": 0.35086822509765625, "learning_rate": 5e-07, "logits/chosen": 56952480.0, "logits/rejected": -13680244.8, "logps/chosen": -433.3301595052083, "logps/rejected": -136.235546875, "loss": 0.3033, "rewards/chosen": 0.4050430456797282, "rewards/margins": 1.8271007696787518, "rewards/rejected": -1.4220577239990235, "step": 4868 }, { "epoch": 0.2580764847746005, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64072832.0, "logits/rejected": 4501077.0, "logps/chosen": -740.34033203125, "logps/rejected": -506.1381022135417, "loss": 0.1954, "rewards/chosen": 0.9659394025802612, "rewards/margins": 2.8251359065373736, "rewards/rejected": -1.8591965039571126, "step": 4869 }, { "epoch": 0.2581294887764026, "grad_norm": 53.0, "kl": 0.6923456192016602, "learning_rate": 5e-07, "logits/chosen": 11830730.0, "logits/rejected": -10804836.0, "logps/chosen": -427.75396728515625, "logps/rejected": -171.28182983398438, "loss": 0.3365, "rewards/chosen": 0.20187950134277344, "rewards/margins": 1.9901012182235718, "rewards/rejected": -1.7882217168807983, "step": 4870 }, { "epoch": 0.25818249277820476, "grad_norm": 64.5, "kl": 1.2223243713378906, "learning_rate": 5e-07, "logits/chosen": -2544510.5714285714, "logits/rejected": 2550143.5, "logps/chosen": -189.02155412946428, "logps/rejected": -20.249338150024414, "loss": 0.4571, "rewards/chosen": 0.3490751470838274, "rewards/margins": 0.31435548993093626, "rewards/rejected": 0.03471965715289116, "step": 4871 }, { "epoch": 0.2582354967800069, "grad_norm": 67.5, "kl": 1.2985115051269531, "learning_rate": 5e-07, "logits/chosen": -25492900.57142857, "logits/rejected": -40349168.0, "logps/chosen": -275.84898158482144, "logps/rejected": -926.8553466796875, "loss": 0.4787, "rewards/chosen": -0.06636192117418561, "rewards/margins": 3.281593544142587, "rewards/rejected": -3.3479554653167725, "step": 4872 }, { "epoch": 0.25828850078180904, "grad_norm": 62.5, "kl": 0.4754180908203125, "learning_rate": 5e-07, "logits/chosen": -40241448.0, "logits/rejected": -31114896.0, "logps/chosen": -562.364990234375, "logps/rejected": -253.74111328125, "loss": 0.2937, "rewards/chosen": 0.346498966217041, "rewards/margins": 2.0023699760437013, "rewards/rejected": -1.6558710098266602, "step": 4873 }, { "epoch": 0.2583415047836112, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31897624.0, "logits/rejected": -34967971.2, "logps/chosen": -261.3402913411458, "logps/rejected": -329.727197265625, "loss": 0.271, "rewards/chosen": 0.2548786203066508, "rewards/margins": 2.1895818750063576, "rewards/rejected": -1.934703254699707, "step": 4874 }, { "epoch": 0.2583945087854133, "grad_norm": 67.0, "kl": 0.15438461303710938, "learning_rate": 5e-07, "logits/chosen": -113531157.33333333, "logits/rejected": -65783016.0, "logps/chosen": -528.2522379557291, "logps/rejected": -121.54759979248047, "loss": 0.4328, "rewards/chosen": 0.1894092559814453, "rewards/margins": 0.8644412755966187, "rewards/rejected": -0.6750320196151733, "step": 4875 }, { "epoch": 0.25844751278721545, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2319971.75, "logits/rejected": -23720712.0, "logps/chosen": -137.96517944335938, "logps/rejected": -335.72802734375, "loss": 0.3193, "rewards/chosen": 0.005351334810256958, "rewards/margins": 1.9562110006809235, "rewards/rejected": -1.9508596658706665, "step": 4876 }, { "epoch": 0.2585005167890176, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5007960.0, "logits/rejected": -11697282.666666666, "logps/chosen": -92.00003814697266, "logps/rejected": -228.8537801106771, "loss": 0.2724, "rewards/chosen": -0.04197120666503906, "rewards/margins": 1.6055045127868652, "rewards/rejected": -1.6474757194519043, "step": 4877 }, { "epoch": 0.2585535207908197, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20358796.0, "logits/rejected": -13872880.0, "logps/chosen": -132.9707794189453, "logps/rejected": -177.08554077148438, "loss": 0.3114, "rewards/chosen": 0.26014918088912964, "rewards/margins": 1.9938074946403503, "rewards/rejected": -1.7336583137512207, "step": 4878 }, { "epoch": 0.25860652479262186, "grad_norm": 46.75, "kl": 0.9749622344970703, "learning_rate": 5e-07, "logits/chosen": -33503546.666666668, "logits/rejected": -14533246.4, "logps/chosen": -260.517822265625, "logps/rejected": -510.157177734375, "loss": 0.2511, "rewards/chosen": 0.6232545375823975, "rewards/margins": 2.425569486618042, "rewards/rejected": -1.8023149490356445, "step": 4879 }, { "epoch": 0.258659528794424, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1610772.0, "logits/rejected": 43402520.0, "logps/chosen": -115.27203369140625, "logps/rejected": -280.44476318359375, "loss": 0.2946, "rewards/chosen": 0.2961382269859314, "rewards/margins": 2.1135846972465515, "rewards/rejected": -1.8174464702606201, "step": 4880 }, { "epoch": 0.25871253279622614, "grad_norm": 53.75, "kl": 2.1343936920166016, "learning_rate": 5e-07, "logits/chosen": 2655298.4, "logits/rejected": -44192984.0, "logps/chosen": -639.3666015625, "logps/rejected": -189.8440144856771, "loss": 0.3799, "rewards/chosen": 0.6318231105804444, "rewards/margins": 1.835141944885254, "rewards/rejected": -1.2033188343048096, "step": 4881 }, { "epoch": 0.2587655367980283, "grad_norm": 42.25, "kl": 0.0755767822265625, "learning_rate": 5e-07, "logits/chosen": -9308762.0, "logits/rejected": -23539206.0, "logps/chosen": -183.5423583984375, "logps/rejected": -435.43353271484375, "loss": 0.2429, "rewards/chosen": 0.4560021758079529, "rewards/margins": 3.270890176296234, "rewards/rejected": -2.8148880004882812, "step": 4882 }, { "epoch": 0.2588185407998304, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28737938.666666668, "logits/rejected": -21200826.0, "logps/chosen": -434.3626302083333, "logps/rejected": -266.72314453125, "loss": 0.3893, "rewards/chosen": 0.20275853077570596, "rewards/margins": 1.8886616031328838, "rewards/rejected": -1.6859030723571777, "step": 4883 }, { "epoch": 0.25887154480163255, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 558659.125, "logits/rejected": -31368461.333333332, "logps/chosen": -85.34393310546875, "logps/rejected": -320.0318196614583, "loss": 0.2187, "rewards/chosen": -0.3031372129917145, "rewards/margins": 2.4096663097540536, "rewards/rejected": -2.712803522745768, "step": 4884 }, { "epoch": 0.2589245488034347, "grad_norm": 66.5, "kl": 0.9235000610351562, "learning_rate": 5e-07, "logits/chosen": -71382112.0, "logits/rejected": -32733549.333333332, "logps/chosen": -388.4244384765625, "logps/rejected": -338.18410237630206, "loss": 0.4233, "rewards/chosen": -0.13715558052062987, "rewards/margins": 1.6795782566070556, "rewards/rejected": -1.8167338371276855, "step": 4885 }, { "epoch": 0.2589775528052368, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21901491.2, "logits/rejected": -10167136.0, "logps/chosen": -277.89580078125, "logps/rejected": -182.77081298828125, "loss": 0.2742, "rewards/chosen": 0.6199357032775878, "rewards/margins": 2.5965064684549968, "rewards/rejected": -1.976570765177409, "step": 4886 }, { "epoch": 0.25903055680703896, "grad_norm": 63.0, "kl": 0.4899749755859375, "learning_rate": 5e-07, "logits/chosen": -27075045.333333332, "logits/rejected": -150426880.0, "logps/chosen": -457.70751953125, "logps/rejected": -227.63861083984375, "loss": 0.4189, "rewards/chosen": 0.17974026997884116, "rewards/margins": 1.601202090581258, "rewards/rejected": -1.421461820602417, "step": 4887 }, { "epoch": 0.25908356080884104, "grad_norm": 34.5, "kl": 0.1029815673828125, "learning_rate": 5e-07, "logits/chosen": 1488795.3, "logits/rejected": -11447597.333333334, "logps/chosen": -339.212158203125, "logps/rejected": -166.79827880859375, "loss": 0.3321, "rewards/chosen": 0.3165379285812378, "rewards/margins": 2.2728124380111696, "rewards/rejected": -1.9562745094299316, "step": 4888 }, { "epoch": 0.2591365648106432, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37552748.0, "logits/rejected": -68309008.0, "logps/chosen": -279.4677734375, "logps/rejected": -391.56829833984375, "loss": 0.3241, "rewards/chosen": -0.13163337111473083, "rewards/margins": 2.2018889486789703, "rewards/rejected": -2.333522319793701, "step": 4889 }, { "epoch": 0.2591895688124453, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20453380.0, "logits/rejected": -20022288.0, "logps/chosen": -268.21038818359375, "logps/rejected": -217.18328857421875, "loss": 0.3423, "rewards/chosen": -0.29637691378593445, "rewards/margins": 2.2080457508563995, "rewards/rejected": -2.504422664642334, "step": 4890 }, { "epoch": 0.25924257281424745, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56301290.666666664, "logits/rejected": -90936880.0, "logps/chosen": -400.3806966145833, "logps/rejected": -711.1834716796875, "loss": 0.294, "rewards/chosen": 0.5660038391749064, "rewards/margins": 4.0766163269678755, "rewards/rejected": -3.5106124877929688, "step": 4891 }, { "epoch": 0.2592955768160496, "grad_norm": 56.75, "kl": 2.688068389892578, "learning_rate": 5e-07, "logits/chosen": -16603664.0, "logits/rejected": -4311059.5, "logps/chosen": -424.923828125, "logps/rejected": -81.01628875732422, "loss": 0.3764, "rewards/chosen": 0.7383233479091099, "rewards/margins": 3.5990823677607944, "rewards/rejected": -2.8607590198516846, "step": 4892 }, { "epoch": 0.2593485808178517, "grad_norm": 55.25, "kl": 1.2520437240600586, "learning_rate": 5e-07, "logits/chosen": -9296110.0, "logits/rejected": -29144228.0, "logps/chosen": -450.8597412109375, "logps/rejected": -430.80999755859375, "loss": 0.3383, "rewards/chosen": 0.5927547812461853, "rewards/margins": 2.1784228682518005, "rewards/rejected": -1.5856680870056152, "step": 4893 }, { "epoch": 0.25940158481965386, "grad_norm": 43.75, "kl": 0.3910560607910156, "learning_rate": 5e-07, "logits/chosen": -12933052.8, "logits/rejected": -9837852.0, "logps/chosen": -224.3452392578125, "logps/rejected": -84.2284444173177, "loss": 0.305, "rewards/chosen": 0.46559486389160154, "rewards/margins": 2.591206169128418, "rewards/rejected": -2.1256113052368164, "step": 4894 }, { "epoch": 0.259454588821456, "grad_norm": 48.25, "kl": 0.10093307495117188, "learning_rate": 5e-07, "logits/chosen": -44537669.333333336, "logits/rejected": -18382300.0, "logps/chosen": -227.273193359375, "logps/rejected": -479.52899169921875, "loss": 0.2903, "rewards/chosen": 0.6254028081893921, "rewards/margins": 2.9937347173690796, "rewards/rejected": -2.3683319091796875, "step": 4895 }, { "epoch": 0.25950759282325814, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17752265.6, "logits/rejected": -28840493.333333332, "logps/chosen": -152.82275390625, "logps/rejected": -384.4230550130208, "loss": 0.3982, "rewards/chosen": -0.11515572071075439, "rewards/margins": 1.434511160850525, "rewards/rejected": -1.5496668815612793, "step": 4896 }, { "epoch": 0.2595605968250603, "grad_norm": 35.75, "kl": 0.35187530517578125, "learning_rate": 5e-07, "logits/chosen": -15563864.0, "logits/rejected": -14541353.6, "logps/chosen": -142.4508260091146, "logps/rejected": -295.851513671875, "loss": 0.3279, "rewards/chosen": -0.06873048345247905, "rewards/margins": 1.68692423303922, "rewards/rejected": -1.7556547164916991, "step": 4897 }, { "epoch": 0.2596136008268624, "grad_norm": 57.25, "kl": 2.002290725708008, "learning_rate": 5e-07, "logits/chosen": -61635904.0, "logits/rejected": -19300940.0, "logps/chosen": -893.0642700195312, "logps/rejected": -248.46795654296875, "loss": 0.2408, "rewards/chosen": 0.9424148201942444, "rewards/margins": 3.3677812218666077, "rewards/rejected": -2.4253664016723633, "step": 4898 }, { "epoch": 0.25966660482866455, "grad_norm": 52.0, "kl": 0.2041168212890625, "learning_rate": 5e-07, "logits/chosen": -15088326.666666666, "logits/rejected": -17172902.4, "logps/chosen": -299.44970703125, "logps/rejected": -202.166455078125, "loss": 0.2754, "rewards/chosen": 0.5489476124445597, "rewards/margins": 2.1852932850519817, "rewards/rejected": -1.636345672607422, "step": 4899 }, { "epoch": 0.2597196088304667, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35458808.0, "logits/rejected": 64569504.0, "logps/chosen": -393.3974202473958, "logps/rejected": -227.85146484375, "loss": 0.3386, "rewards/chosen": -0.38179930051167804, "rewards/margins": 1.3373464743296306, "rewards/rejected": -1.7191457748413086, "step": 4900 }, { "epoch": 0.2597726128322688, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75066792.0, "logits/rejected": -40165749.333333336, "logps/chosen": -200.26394653320312, "logps/rejected": -293.1808675130208, "loss": 0.2038, "rewards/chosen": -0.025924310088157654, "rewards/margins": 2.330799793203672, "rewards/rejected": -2.3567241032918296, "step": 4901 }, { "epoch": 0.25982561683407096, "grad_norm": 53.0, "kl": 0.2159423828125, "learning_rate": 5e-07, "logits/chosen": -83238438.4, "logits/rejected": -1986447.3333333333, "logps/chosen": -288.8734619140625, "logps/rejected": -205.84869384765625, "loss": 0.4185, "rewards/chosen": -0.22615294456481932, "rewards/margins": 1.518778912226359, "rewards/rejected": -1.7449318567911785, "step": 4902 }, { "epoch": 0.2598786208358731, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67889272.0, "logits/rejected": 7795796.666666667, "logps/chosen": -342.08392333984375, "logps/rejected": -173.04654947916666, "loss": 0.2945, "rewards/chosen": 0.4744163453578949, "rewards/margins": 1.6776833633581798, "rewards/rejected": -1.203267018000285, "step": 4903 }, { "epoch": 0.25993162483767523, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1992798.0, "logits/rejected": -20014554.0, "logps/chosen": -248.80816650390625, "logps/rejected": -350.3155517578125, "loss": 0.2771, "rewards/chosen": 0.6377547979354858, "rewards/margins": 2.33130419254303, "rewards/rejected": -1.693549394607544, "step": 4904 }, { "epoch": 0.25998462883947737, "grad_norm": 50.5, "kl": 0.5824031829833984, "learning_rate": 5e-07, "logits/chosen": -25857005.714285713, "logits/rejected": -47650844.0, "logps/chosen": -323.60414341517856, "logps/rejected": -502.60894775390625, "loss": 0.3717, "rewards/chosen": 0.4232089860098703, "rewards/margins": 3.802914891924177, "rewards/rejected": -3.3797059059143066, "step": 4905 }, { "epoch": 0.2600376328412795, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14474567.0, "logits/rejected": -31554058.0, "logps/chosen": -79.37127685546875, "logps/rejected": -478.4215087890625, "loss": 0.2499, "rewards/chosen": 0.3090469241142273, "rewards/margins": 2.9343664050102234, "rewards/rejected": -2.625319480895996, "step": 4906 }, { "epoch": 0.26009063684308165, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1027946.1666666666, "logits/rejected": -35468454.4, "logps/chosen": -178.09344482421875, "logps/rejected": -272.1705322265625, "loss": 0.2513, "rewards/chosen": 0.17824339866638184, "rewards/margins": 2.258254957199097, "rewards/rejected": -2.080011558532715, "step": 4907 }, { "epoch": 0.2601436408448838, "grad_norm": 34.75, "kl": 0.23769855499267578, "learning_rate": 5e-07, "logits/chosen": -3273142.0, "logits/rejected": -19948499.2, "logps/chosen": -130.61409505208334, "logps/rejected": -457.80791015625, "loss": 0.1994, "rewards/chosen": 0.46695494651794434, "rewards/margins": 3.222509527206421, "rewards/rejected": -2.7555545806884765, "step": 4908 }, { "epoch": 0.2601966448466859, "grad_norm": 48.25, "kl": 2.583846092224121, "learning_rate": 5e-07, "logits/chosen": 5904762.0, "logits/rejected": -50224112.0, "logps/chosen": -563.4550170898438, "logps/rejected": -456.52801513671875, "loss": 0.3233, "rewards/chosen": 0.35136690735816956, "rewards/margins": 2.7431727945804596, "rewards/rejected": -2.39180588722229, "step": 4909 }, { "epoch": 0.26024964884848806, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9884224.0, "logits/rejected": -25779620.0, "logps/chosen": -104.77593994140625, "logps/rejected": -254.0145721435547, "loss": 0.2983, "rewards/chosen": -4.5433640480041504e-05, "rewards/margins": 2.6197982877492905, "rewards/rejected": -2.6198437213897705, "step": 4910 }, { "epoch": 0.2603026528502902, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38195432.0, "logits/rejected": -45521946.666666664, "logps/chosen": -174.3668212890625, "logps/rejected": -645.674560546875, "loss": 0.1287, "rewards/chosen": 0.23682785034179688, "rewards/margins": 4.429253578186035, "rewards/rejected": -4.192425727844238, "step": 4911 }, { "epoch": 0.26035565685209233, "grad_norm": 42.0, "kl": 0.7538833618164062, "learning_rate": 5e-07, "logits/chosen": -25494770.0, "logits/rejected": 7230601.333333333, "logps/chosen": -455.96246337890625, "logps/rejected": -136.56120808919272, "loss": 0.2171, "rewards/chosen": 1.262702226638794, "rewards/margins": 2.9385747114817304, "rewards/rejected": -1.6758724848429363, "step": 4912 }, { "epoch": 0.26040866085389447, "grad_norm": 51.5, "kl": 0.2785758972167969, "learning_rate": 5e-07, "logits/chosen": -26925850.666666668, "logits/rejected": -24766097.6, "logps/chosen": -293.8157958984375, "logps/rejected": -304.9196044921875, "loss": 0.2654, "rewards/chosen": 0.4348420699437459, "rewards/margins": 2.113264234860738, "rewards/rejected": -1.6784221649169921, "step": 4913 }, { "epoch": 0.2604616648556966, "grad_norm": 52.25, "kl": 1.1985549926757812, "learning_rate": 5e-07, "logits/chosen": -32171152.0, "logits/rejected": -25073337.6, "logps/chosen": -393.0169677734375, "logps/rejected": -426.49833984375, "loss": 0.264, "rewards/chosen": 0.9364665349324545, "rewards/margins": 2.9908896764119466, "rewards/rejected": -2.054423141479492, "step": 4914 }, { "epoch": 0.26051466885749874, "grad_norm": 54.5, "kl": 0.2539482116699219, "learning_rate": 5e-07, "logits/chosen": -8525532.8, "logits/rejected": -8279891.333333333, "logps/chosen": -240.3616943359375, "logps/rejected": -286.4779866536458, "loss": 0.3564, "rewards/chosen": 0.2232808828353882, "rewards/margins": 1.7638296842575074, "rewards/rejected": -1.5405488014221191, "step": 4915 }, { "epoch": 0.2605676728593009, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47441280.0, "logits/rejected": -448849.75, "logps/chosen": -337.46551513671875, "logps/rejected": -309.983642578125, "loss": 0.3337, "rewards/chosen": 0.3702678382396698, "rewards/margins": 1.5319838225841522, "rewards/rejected": -1.1617159843444824, "step": 4916 }, { "epoch": 0.260620676861103, "grad_norm": 42.0, "kl": 0.2297191619873047, "learning_rate": 5e-07, "logits/chosen": -30138848.0, "logits/rejected": -13487508.0, "logps/chosen": -195.427001953125, "logps/rejected": -228.57827758789062, "loss": 0.3253, "rewards/chosen": 0.24366474151611328, "rewards/margins": 1.7792816162109375, "rewards/rejected": -1.5356168746948242, "step": 4917 }, { "epoch": 0.26067368086290515, "grad_norm": 51.75, "kl": 0.7014427185058594, "learning_rate": 5e-07, "logits/chosen": -47108213.333333336, "logits/rejected": -43970566.4, "logps/chosen": -314.8704020182292, "logps/rejected": -460.864453125, "loss": 0.2686, "rewards/chosen": 0.16308085123697916, "rewards/margins": 2.4286492029825846, "rewards/rejected": -2.2655683517456056, "step": 4918 }, { "epoch": 0.2607266848647073, "grad_norm": 41.75, "kl": 0.6210870742797852, "learning_rate": 5e-07, "logits/chosen": -16374693.0, "logits/rejected": -12703642.0, "logps/chosen": -110.54359436035156, "logps/rejected": -240.97698974609375, "loss": 0.3796, "rewards/chosen": -0.08214139938354492, "rewards/margins": 1.595423698425293, "rewards/rejected": -1.677565097808838, "step": 4919 }, { "epoch": 0.26077968886650943, "grad_norm": 65.0, "kl": 0.1614532470703125, "learning_rate": 5e-07, "logits/chosen": -26557696.0, "logits/rejected": 15211810.0, "logps/chosen": -213.2815399169922, "logps/rejected": -209.6017303466797, "loss": 0.3918, "rewards/chosen": 0.03292074799537659, "rewards/margins": 0.9865316450595856, "rewards/rejected": -0.953610897064209, "step": 4920 }, { "epoch": 0.26083269286831157, "grad_norm": 54.5, "kl": 0.8271408081054688, "learning_rate": 5e-07, "logits/chosen": -83376480.0, "logits/rejected": -23795826.0, "logps/chosen": -252.37445068359375, "logps/rejected": -299.80157470703125, "loss": 0.4166, "rewards/chosen": -0.16335296630859375, "rewards/margins": 1.0365095138549805, "rewards/rejected": -1.1998624801635742, "step": 4921 }, { "epoch": 0.2608856968701137, "grad_norm": 52.0, "kl": 0.6128616333007812, "learning_rate": 5e-07, "logits/chosen": -23553428.8, "logits/rejected": -7915014.666666667, "logps/chosen": -187.811572265625, "logps/rejected": -103.2040506998698, "loss": 0.4222, "rewards/chosen": -0.011687320470809937, "rewards/margins": 1.509760179122289, "rewards/rejected": -1.5214474995930989, "step": 4922 }, { "epoch": 0.26093870087191584, "grad_norm": 48.25, "kl": 0.3071861267089844, "learning_rate": 5e-07, "logits/chosen": -3782640.8, "logits/rejected": -28121109.333333332, "logps/chosen": -143.0615966796875, "logps/rejected": -299.8457845052083, "loss": 0.4516, "rewards/chosen": -0.33182587623596194, "rewards/margins": 1.0463716348012286, "rewards/rejected": -1.3781975110371907, "step": 4923 }, { "epoch": 0.260991704873718, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94970272.0, "logits/rejected": -8262722.285714285, "logps/chosen": -586.93212890625, "logps/rejected": -218.28867885044642, "loss": 0.1916, "rewards/chosen": -0.570483386516571, "rewards/margins": 1.484697026865823, "rewards/rejected": -2.055180413382394, "step": 4924 }, { "epoch": 0.2610447088755201, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75348112.0, "logits/rejected": -29790354.666666668, "logps/chosen": -517.1854858398438, "logps/rejected": -298.977783203125, "loss": 0.2228, "rewards/chosen": 0.6458038091659546, "rewards/margins": 2.4154135783513384, "rewards/rejected": -1.769609769185384, "step": 4925 }, { "epoch": 0.26109771287732225, "grad_norm": 52.75, "kl": 0.15900039672851562, "learning_rate": 5e-07, "logits/chosen": -25032496.0, "logits/rejected": -11776402.0, "logps/chosen": -339.5578918457031, "logps/rejected": -221.97525024414062, "loss": 0.3074, "rewards/chosen": 0.12077417969703674, "rewards/margins": 2.232027441263199, "rewards/rejected": -2.111253261566162, "step": 4926 }, { "epoch": 0.2611507168791244, "grad_norm": 69.0, "kl": 0.18799591064453125, "learning_rate": 5e-07, "logits/chosen": -22345027.2, "logits/rejected": -3342246.3333333335, "logps/chosen": -248.153564453125, "logps/rejected": -316.91404215494794, "loss": 0.2895, "rewards/chosen": 0.7261864185333252, "rewards/margins": 2.206242863337199, "rewards/rejected": -1.4800564448038738, "step": 4927 }, { "epoch": 0.2612037208809265, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33196150.0, "logits/rejected": 2744765.0, "logps/chosen": -378.5932312011719, "logps/rejected": -379.32696533203125, "loss": 0.2855, "rewards/chosen": 0.36683276295661926, "rewards/margins": 2.1576248705387115, "rewards/rejected": -1.7907921075820923, "step": 4928 }, { "epoch": 0.26125672488272866, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8729505.0, "logits/rejected": -12883093.333333334, "logps/chosen": -109.83482360839844, "logps/rejected": -186.44820149739584, "loss": 0.3085, "rewards/chosen": -0.7591550946235657, "rewards/margins": 0.8328018387158711, "rewards/rejected": -1.5919569333394368, "step": 4929 }, { "epoch": 0.2613097288845308, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14421603.0, "logits/rejected": -10377068.0, "logps/chosen": -282.878662109375, "logps/rejected": -516.7457275390625, "loss": 0.3638, "rewards/chosen": 0.03608007729053497, "rewards/margins": 1.6953564435243607, "rewards/rejected": -1.6592763662338257, "step": 4930 }, { "epoch": 0.26136273288633294, "grad_norm": 59.0, "kl": 0.6673364639282227, "learning_rate": 5e-07, "logits/chosen": -27117577.6, "logits/rejected": -22907800.0, "logps/chosen": -339.51806640625, "logps/rejected": -289.2489420572917, "loss": 0.3103, "rewards/chosen": 0.24881196022033691, "rewards/margins": 3.362663825352987, "rewards/rejected": -3.11385186513265, "step": 4931 }, { "epoch": 0.2614157368881351, "grad_norm": 103.0, "kl": 6.587404251098633, "learning_rate": 5e-07, "logits/chosen": -15044022.4, "logits/rejected": -34096677.333333336, "logps/chosen": -925.2126953125, "logps/rejected": -263.98095703125, "loss": 0.351, "rewards/chosen": 1.2500946044921875, "rewards/margins": 2.7755526224772136, "rewards/rejected": -1.5254580179850261, "step": 4932 }, { "epoch": 0.2614687408899372, "grad_norm": 46.5, "kl": 0.5919170379638672, "learning_rate": 5e-07, "logits/chosen": -63120761.6, "logits/rejected": -123570944.0, "logps/chosen": -303.687744140625, "logps/rejected": -296.51662190755206, "loss": 0.2782, "rewards/chosen": 0.7363394260406494, "rewards/margins": 2.6360302130381266, "rewards/rejected": -1.8996907869974773, "step": 4933 }, { "epoch": 0.26152174489173935, "grad_norm": 77.0, "kl": 1.040334701538086, "learning_rate": 5e-07, "logits/chosen": -26490682.666666668, "logits/rejected": 3529768.75, "logps/chosen": -300.1981608072917, "logps/rejected": -62.373573303222656, "loss": 0.5385, "rewards/chosen": -0.1826905608177185, "rewards/margins": 0.11402201652526855, "rewards/rejected": -0.29671257734298706, "step": 4934 }, { "epoch": 0.2615747488935415, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55987604.0, "logits/rejected": -22268346.0, "logps/chosen": -477.70770263671875, "logps/rejected": -144.74159240722656, "loss": 0.3062, "rewards/chosen": 0.2869323790073395, "rewards/margins": 2.121847301721573, "rewards/rejected": -1.8349149227142334, "step": 4935 }, { "epoch": 0.2616277528953436, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41442066.666666664, "logits/rejected": -9786196.8, "logps/chosen": -192.48421223958334, "logps/rejected": -151.65614013671876, "loss": 0.2775, "rewards/chosen": 0.22172242403030396, "rewards/margins": 1.9050087094306947, "rewards/rejected": -1.6832862854003907, "step": 4936 }, { "epoch": 0.26168075689714576, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31237834.0, "logits/rejected": -27897948.0, "logps/chosen": -216.59481811523438, "logps/rejected": -455.9754638671875, "loss": 0.2629, "rewards/chosen": 0.23782110214233398, "rewards/margins": 3.123534679412842, "rewards/rejected": -2.885713577270508, "step": 4937 }, { "epoch": 0.2617337608989479, "grad_norm": 86.0, "kl": 2.574066162109375, "learning_rate": 5e-07, "logits/chosen": -40752970.666666664, "logits/rejected": 54516620.8, "logps/chosen": -1285.862548828125, "logps/rejected": -213.551318359375, "loss": 0.305, "rewards/chosen": 1.3016103108723958, "rewards/margins": 2.320999463399251, "rewards/rejected": -1.0193891525268555, "step": 4938 }, { "epoch": 0.26178676490075, "grad_norm": 86.0, "kl": 0.7069244384765625, "learning_rate": 5e-07, "logits/chosen": -24388490.666666668, "logits/rejected": -57359656.0, "logps/chosen": -387.254150390625, "logps/rejected": -382.9730529785156, "loss": 0.3739, "rewards/chosen": 0.21889607111612955, "rewards/margins": 2.1971302429835, "rewards/rejected": -1.9782341718673706, "step": 4939 }, { "epoch": 0.2618397689025521, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85763848.0, "logits/rejected": -3402225.5, "logps/chosen": -795.20166015625, "logps/rejected": -419.1604309082031, "loss": 0.2572, "rewards/chosen": 0.6284439563751221, "rewards/margins": 2.9563841819763184, "rewards/rejected": -2.3279402256011963, "step": 4940 }, { "epoch": 0.26189277290435425, "grad_norm": 73.0, "kl": 4.26643180847168, "learning_rate": 5e-07, "logits/chosen": -11678795.42857143, "logits/rejected": -144401440.0, "logps/chosen": -358.52968052455356, "logps/rejected": -328.3074951171875, "loss": 0.4175, "rewards/chosen": 0.6824173927307129, "rewards/margins": 2.325059652328491, "rewards/rejected": -1.6426422595977783, "step": 4941 }, { "epoch": 0.2619457769061564, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57397164.0, "logits/rejected": -60517280.0, "logps/chosen": -507.85009765625, "logps/rejected": -541.4259033203125, "loss": 0.1311, "rewards/chosen": 0.7321853637695312, "rewards/margins": 3.7630189259847007, "rewards/rejected": -3.0308335622151694, "step": 4942 }, { "epoch": 0.26199878090795853, "grad_norm": 38.5, "kl": 0.32471466064453125, "learning_rate": 5e-07, "logits/chosen": 7068282.666666667, "logits/rejected": -26013846.4, "logps/chosen": -85.53647867838542, "logps/rejected": -239.5446533203125, "loss": 0.2921, "rewards/chosen": 0.029565433661142986, "rewards/margins": 2.151174930731455, "rewards/rejected": -2.1216094970703123, "step": 4943 }, { "epoch": 0.26205178490976067, "grad_norm": 53.5, "kl": 0.7294139862060547, "learning_rate": 5e-07, "logits/chosen": 7054.8, "logits/rejected": -11520740.0, "logps/chosen": -105.68060302734375, "logps/rejected": -223.02266438802084, "loss": 0.4609, "rewards/chosen": -0.42574119567871094, "rewards/margins": 0.965090274810791, "rewards/rejected": -1.390831470489502, "step": 4944 }, { "epoch": 0.2621047889115628, "grad_norm": 61.75, "kl": 0.024173736572265625, "learning_rate": 5e-07, "logits/chosen": -55054688.0, "logits/rejected": -12003642.666666666, "logps/chosen": -299.2673583984375, "logps/rejected": -368.8833414713542, "loss": 0.3628, "rewards/chosen": 0.04195571541786194, "rewards/margins": 1.8106426457564038, "rewards/rejected": -1.7686869303385417, "step": 4945 }, { "epoch": 0.26215779291336494, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44972240.0, "logits/rejected": -2392821.6666666665, "logps/chosen": -280.9209899902344, "logps/rejected": -146.10392252604166, "loss": 0.2566, "rewards/chosen": 0.8428802490234375, "rewards/margins": 2.0763455231984453, "rewards/rejected": -1.233465274175008, "step": 4946 }, { "epoch": 0.2622107969151671, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28726314.666666668, "logits/rejected": 17972254.4, "logps/chosen": -432.7867024739583, "logps/rejected": -185.13515625, "loss": 0.3473, "rewards/chosen": 0.5494181315104166, "rewards/margins": 1.3254703203837077, "rewards/rejected": -0.776052188873291, "step": 4947 }, { "epoch": 0.2622638009169692, "grad_norm": 55.5, "kl": 0.6490325927734375, "learning_rate": 5e-07, "logits/chosen": -49006672.0, "logits/rejected": -19877308.0, "logps/chosen": -538.58251953125, "logps/rejected": -218.07723999023438, "loss": 0.3854, "rewards/chosen": -0.04634551703929901, "rewards/margins": 1.5647277384996414, "rewards/rejected": -1.6110732555389404, "step": 4948 }, { "epoch": 0.26231680491877135, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20353168.0, "logits/rejected": -7496108.0, "logps/chosen": -102.99314880371094, "logps/rejected": -233.68714904785156, "loss": 0.403, "rewards/chosen": -0.3923566937446594, "rewards/margins": 1.2170557379722595, "rewards/rejected": -1.609412431716919, "step": 4949 }, { "epoch": 0.2623698089205735, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20354173.333333332, "logits/rejected": -5002293.6, "logps/chosen": -173.65157063802084, "logps/rejected": -176.06385498046876, "loss": 0.3386, "rewards/chosen": -0.2611672083536784, "rewards/margins": 1.3815402348836263, "rewards/rejected": -1.6427074432373048, "step": 4950 }, { "epoch": 0.2624228129223756, "grad_norm": 64.5, "kl": 0.3129692077636719, "learning_rate": 5e-07, "logits/chosen": -44758872.0, "logits/rejected": -69339264.0, "logps/chosen": -382.9291687011719, "logps/rejected": -374.90753173828125, "loss": 0.2734, "rewards/chosen": 0.48631781339645386, "rewards/margins": 2.3110427260398865, "rewards/rejected": -1.8247249126434326, "step": 4951 }, { "epoch": 0.26247581692417776, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 175401.921875, "logits/rejected": -15064006.857142856, "logps/chosen": -64.95297241210938, "logps/rejected": -225.96979631696428, "loss": 0.3098, "rewards/chosen": -1.4080898761749268, "rewards/margins": -0.04890206881931847, "rewards/rejected": -1.3591878073556083, "step": 4952 }, { "epoch": 0.2625288209259799, "grad_norm": 59.0, "kl": 0.37876319885253906, "learning_rate": 5e-07, "logits/chosen": -59457061.333333336, "logits/rejected": -5528683.2, "logps/chosen": -399.173583984375, "logps/rejected": -187.82952880859375, "loss": 0.3397, "rewards/chosen": 0.07878013451894124, "rewards/margins": 1.1917834838231405, "rewards/rejected": -1.1130033493041993, "step": 4953 }, { "epoch": 0.26258182492778204, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16051373.333333334, "logits/rejected": -16624675.2, "logps/chosen": -257.85630289713544, "logps/rejected": -192.704150390625, "loss": 0.3054, "rewards/chosen": 0.5612468719482422, "rewards/margins": 1.6712139129638672, "rewards/rejected": -1.109967041015625, "step": 4954 }, { "epoch": 0.2626348289295842, "grad_norm": 36.5, "kl": 0.5529260635375977, "learning_rate": 5e-07, "logits/chosen": 3771507.6666666665, "logits/rejected": -48033862.4, "logps/chosen": -37.488609313964844, "logps/rejected": -246.365673828125, "loss": 0.2613, "rewards/chosen": 0.43350303173065186, "rewards/margins": 2.2587786436080934, "rewards/rejected": -1.8252756118774414, "step": 4955 }, { "epoch": 0.2626878329313863, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1061649.5, "logits/rejected": -29946902.0, "logps/chosen": -57.816070556640625, "logps/rejected": -279.17437744140625, "loss": 0.3837, "rewards/chosen": -0.08529691398143768, "rewards/margins": 1.2099828273057938, "rewards/rejected": -1.2952797412872314, "step": 4956 }, { "epoch": 0.26274083693318845, "grad_norm": 45.25, "kl": 0.4606037139892578, "learning_rate": 5e-07, "logits/chosen": -23347102.0, "logits/rejected": -37455672.0, "logps/chosen": -238.71783447265625, "logps/rejected": -489.5009460449219, "loss": 0.305, "rewards/chosen": -0.1043996810913086, "rewards/margins": 3.2564773559570312, "rewards/rejected": -3.36087703704834, "step": 4957 }, { "epoch": 0.2627938409349906, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1724324.0, "logits/rejected": -10418803.2, "logps/chosen": -311.21225992838544, "logps/rejected": -254.8744384765625, "loss": 0.3332, "rewards/chosen": 0.2555675506591797, "rewards/margins": 1.5126613616943358, "rewards/rejected": -1.2570938110351562, "step": 4958 }, { "epoch": 0.2628468449367927, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13823633.0, "logits/rejected": -42926484.0, "logps/chosen": -141.67774963378906, "logps/rejected": -389.6856384277344, "loss": 0.3983, "rewards/chosen": -0.44377487897872925, "rewards/margins": 0.9936484694480896, "rewards/rejected": -1.4374233484268188, "step": 4959 }, { "epoch": 0.26289984893859486, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19035400.0, "logits/rejected": -25773381.333333332, "logps/chosen": -154.08587646484375, "logps/rejected": -220.55426025390625, "loss": 0.3708, "rewards/chosen": -0.15978864431381226, "rewards/margins": 2.1947439153989157, "rewards/rejected": -2.354532559712728, "step": 4960 }, { "epoch": 0.262952852940397, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25804002.0, "logits/rejected": -27249490.0, "logps/chosen": -181.70758056640625, "logps/rejected": -395.13494873046875, "loss": 0.3299, "rewards/chosen": -0.2464626431465149, "rewards/margins": 2.145333230495453, "rewards/rejected": -2.3917958736419678, "step": 4961 }, { "epoch": 0.26300585694219913, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 34540384.0, "logits/rejected": -22926413.333333332, "logps/chosen": -14.654258728027344, "logps/rejected": -285.2628987630208, "loss": 0.239, "rewards/chosen": 0.7878506779670715, "rewards/margins": 2.423973858356476, "rewards/rejected": -1.6361231803894043, "step": 4962 }, { "epoch": 0.26305886094400127, "grad_norm": 56.5, "kl": 2.427854537963867, "learning_rate": 5e-07, "logits/chosen": -40091400.0, "logits/rejected": -24524828.0, "logps/chosen": -616.9569702148438, "logps/rejected": -181.15383911132812, "loss": 0.2878, "rewards/chosen": 1.169710636138916, "rewards/margins": 2.5343728065490723, "rewards/rejected": -1.3646621704101562, "step": 4963 }, { "epoch": 0.2631118649458034, "grad_norm": 50.5, "kl": 3.309311866760254, "learning_rate": 5e-07, "logits/chosen": -24252601.6, "logits/rejected": -23684026.666666668, "logps/chosen": -569.85029296875, "logps/rejected": -528.947021484375, "loss": 0.2738, "rewards/chosen": 1.3575274467468261, "rewards/margins": 3.6397783279418947, "rewards/rejected": -2.2822508811950684, "step": 4964 }, { "epoch": 0.26316486894760555, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 88007216.0, "logits/rejected": -21476428.0, "logps/chosen": -438.3288879394531, "logps/rejected": -215.8603515625, "loss": 0.2921, "rewards/chosen": 0.3815309703350067, "rewards/margins": 2.20069882273674, "rewards/rejected": -1.8191678524017334, "step": 4965 }, { "epoch": 0.2632178729494077, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50679616.0, "logits/rejected": -27017816.0, "logps/chosen": -251.08486938476562, "logps/rejected": -364.99200439453125, "loss": 0.3094, "rewards/chosen": -0.10378112643957138, "rewards/margins": 2.606282092630863, "rewards/rejected": -2.7100632190704346, "step": 4966 }, { "epoch": 0.2632708769512098, "grad_norm": 44.0, "kl": 0.058368682861328125, "learning_rate": 5e-07, "logits/chosen": -17399486.4, "logits/rejected": -45480421.333333336, "logps/chosen": -225.8388671875, "logps/rejected": -179.67724609375, "loss": 0.3048, "rewards/chosen": 0.3439006328582764, "rewards/margins": 2.745792023340861, "rewards/rejected": -2.4018913904825845, "step": 4967 }, { "epoch": 0.26332388095301196, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2006006.75, "logits/rejected": -21824642.666666668, "logps/chosen": -194.6813201904297, "logps/rejected": -285.7164713541667, "loss": 0.2157, "rewards/chosen": 0.3086814880371094, "rewards/margins": 2.3279167811075845, "rewards/rejected": -2.019235293070475, "step": 4968 }, { "epoch": 0.2633768849548141, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20748418.666666668, "logits/rejected": -527503.5, "logps/chosen": -314.36610921223956, "logps/rejected": -54.39714050292969, "loss": 0.4184, "rewards/chosen": -0.038130814830462136, "rewards/margins": 1.8387528111537297, "rewards/rejected": -1.876883625984192, "step": 4969 }, { "epoch": 0.26342988895661623, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45947296.0, "logits/rejected": -7172041.5, "logps/chosen": -437.95037841796875, "logps/rejected": -208.5198516845703, "loss": 0.3818, "rewards/chosen": -0.07777252793312073, "rewards/margins": 1.4086951911449432, "rewards/rejected": -1.486467719078064, "step": 4970 }, { "epoch": 0.26348289295841837, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37213064.0, "logits/rejected": -13631358.0, "logps/chosen": -199.6173095703125, "logps/rejected": -93.69500732421875, "loss": 0.3629, "rewards/chosen": 0.2597736418247223, "rewards/margins": 1.2329805791378021, "rewards/rejected": -0.9732069373130798, "step": 4971 }, { "epoch": 0.2635358969602205, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68364170.66666667, "logits/rejected": -21160539.2, "logps/chosen": -427.4834391276042, "logps/rejected": -350.0757080078125, "loss": 0.2942, "rewards/chosen": 0.24430441856384277, "rewards/margins": 1.8889312267303466, "rewards/rejected": -1.6446268081665039, "step": 4972 }, { "epoch": 0.26358890096202264, "grad_norm": 51.75, "kl": 1.3741912841796875, "learning_rate": 5e-07, "logits/chosen": -44086659.2, "logits/rejected": -12701973.333333334, "logps/chosen": -368.39208984375, "logps/rejected": -181.2636922200521, "loss": 0.4282, "rewards/chosen": 0.2915025234222412, "rewards/margins": 1.023010778427124, "rewards/rejected": -0.7315082550048828, "step": 4973 }, { "epoch": 0.2636419049638248, "grad_norm": 57.75, "kl": 0.3268318176269531, "learning_rate": 5e-07, "logits/chosen": -32892064.0, "logits/rejected": -33263080.0, "logps/chosen": -536.477099609375, "logps/rejected": -329.24713134765625, "loss": 0.3085, "rewards/chosen": 0.3351738691329956, "rewards/margins": 2.9929383039474486, "rewards/rejected": -2.657764434814453, "step": 4974 }, { "epoch": 0.2636949089656269, "grad_norm": 36.5, "kl": 0.33986663818359375, "learning_rate": 5e-07, "logits/chosen": -53233093.333333336, "logits/rejected": -58066534.4, "logps/chosen": -191.7629191080729, "logps/rejected": -417.016259765625, "loss": 0.3182, "rewards/chosen": -0.024742061893145244, "rewards/margins": 2.045133655269941, "rewards/rejected": -2.069875717163086, "step": 4975 }, { "epoch": 0.26374791296742905, "grad_norm": 28.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -537636.25, "logits/rejected": -3693738.75, "logps/chosen": -36.19458770751953, "logps/rejected": -133.70297241210938, "loss": 0.3006, "rewards/chosen": 0.096518374979496, "rewards/margins": 2.1019404903054237, "rewards/rejected": -2.0054221153259277, "step": 4976 }, { "epoch": 0.2638009169692312, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -117237578.66666667, "logits/rejected": 11652601.6, "logps/chosen": -510.5824381510417, "logps/rejected": -248.685302734375, "loss": 0.265, "rewards/chosen": 0.3649971882502238, "rewards/margins": 2.218710414568583, "rewards/rejected": -1.8537132263183593, "step": 4977 }, { "epoch": 0.26385392097103333, "grad_norm": 55.25, "kl": 0.3299388885498047, "learning_rate": 5e-07, "logits/chosen": -23444808.0, "logits/rejected": -14532891.2, "logps/chosen": -405.7705078125, "logps/rejected": -268.715966796875, "loss": 0.255, "rewards/chosen": 0.5965805451075236, "rewards/margins": 2.2618942658106485, "rewards/rejected": -1.665313720703125, "step": 4978 }, { "epoch": 0.26390692497283547, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32853805.333333332, "logits/rejected": -14363419.2, "logps/chosen": -648.104248046875, "logps/rejected": -188.1119384765625, "loss": 0.3455, "rewards/chosen": -0.1595784624417623, "rewards/margins": 1.1736073056856793, "rewards/rejected": -1.3331857681274415, "step": 4979 }, { "epoch": 0.2639599289746376, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3866807.6, "logits/rejected": -30439349.333333332, "logps/chosen": -76.30950927734375, "logps/rejected": -536.9431966145834, "loss": 0.3353, "rewards/chosen": 0.18447571992874146, "rewards/margins": 2.4937325914700827, "rewards/rejected": -2.3092568715413413, "step": 4980 }, { "epoch": 0.26401293297643974, "grad_norm": 56.0, "kl": 0.2147369384765625, "learning_rate": 5e-07, "logits/chosen": -43545668.0, "logits/rejected": -11889718.0, "logps/chosen": -240.01092529296875, "logps/rejected": -330.73712158203125, "loss": 0.2977, "rewards/chosen": 0.5305595397949219, "rewards/margins": 1.9619216918945312, "rewards/rejected": -1.4313621520996094, "step": 4981 }, { "epoch": 0.2640659369782419, "grad_norm": 43.25, "kl": 0.3267402648925781, "learning_rate": 5e-07, "logits/chosen": -19089588.0, "logits/rejected": -35288604.0, "logps/chosen": -180.07969665527344, "logps/rejected": -434.53607177734375, "loss": 0.2666, "rewards/chosen": 0.7135312557220459, "rewards/margins": 2.6275488138198853, "rewards/rejected": -1.9140175580978394, "step": 4982 }, { "epoch": 0.264118940980044, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9031125.0, "logits/rejected": -54582853.333333336, "logps/chosen": -213.3576202392578, "logps/rejected": -262.7884114583333, "loss": 0.1909, "rewards/chosen": 1.0837839841842651, "rewards/margins": 3.0802431503931684, "rewards/rejected": -1.996459166208903, "step": 4983 }, { "epoch": 0.26417194498184615, "grad_norm": 37.5, "kl": 0.34857845306396484, "learning_rate": 5e-07, "logits/chosen": -48076554.666666664, "logits/rejected": -27298160.0, "logps/chosen": -381.5219319661458, "logps/rejected": -146.4636474609375, "loss": 0.2296, "rewards/chosen": 1.1309784253438313, "rewards/margins": 2.6668121655782064, "rewards/rejected": -1.535833740234375, "step": 4984 }, { "epoch": 0.2642249489836483, "grad_norm": 48.5, "kl": 0.8902015686035156, "learning_rate": 5e-07, "logits/chosen": -26314512.0, "logits/rejected": -15555708.0, "logps/chosen": -247.409228515625, "logps/rejected": -228.27567545572916, "loss": 0.3559, "rewards/chosen": 0.4898808479309082, "rewards/margins": 2.246919600168864, "rewards/rejected": -1.7570387522379558, "step": 4985 }, { "epoch": 0.2642779529854504, "grad_norm": 43.0, "kl": 0.45590972900390625, "learning_rate": 5e-07, "logits/chosen": -11016200.0, "logits/rejected": -12784058.666666666, "logps/chosen": -219.71205139160156, "logps/rejected": -275.93798828125, "loss": 0.2221, "rewards/chosen": 0.464492529630661, "rewards/margins": 2.4810764491558075, "rewards/rejected": -2.0165839195251465, "step": 4986 }, { "epoch": 0.26433095698725256, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5012540.0, "logits/rejected": -6829134.4, "logps/chosen": -55.775797526041664, "logps/rejected": -237.058740234375, "loss": 0.299, "rewards/chosen": -0.16866356134414673, "rewards/margins": 1.6649454474449157, "rewards/rejected": -1.8336090087890624, "step": 4987 }, { "epoch": 0.2643839609890547, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62190485.333333336, "logits/rejected": -18462185.6, "logps/chosen": -537.4791666666666, "logps/rejected": -314.709765625, "loss": 0.2095, "rewards/chosen": 0.5046346187591553, "rewards/margins": 2.8022059917449953, "rewards/rejected": -2.29757137298584, "step": 4988 }, { "epoch": 0.2644369649908568, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4449841.333333333, "logits/rejected": -1984376.6, "logps/chosen": -83.26614888509114, "logps/rejected": -200.99752197265624, "loss": 0.3318, "rewards/chosen": 0.005214569469292958, "rewards/margins": 1.283632252116998, "rewards/rejected": -1.2784176826477052, "step": 4989 }, { "epoch": 0.2644899689926589, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23359604.0, "logits/rejected": -25474740.0, "logps/chosen": -112.62619018554688, "logps/rejected": -321.61834716796875, "loss": 0.2659, "rewards/chosen": 0.7428929805755615, "rewards/margins": 2.3831827640533447, "rewards/rejected": -1.6402897834777832, "step": 4990 }, { "epoch": 0.26454297299446106, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -107231480.0, "logits/rejected": 9574006.0, "logps/chosen": -190.15830993652344, "logps/rejected": -222.5020751953125, "loss": 0.322, "rewards/chosen": -0.1210452988743782, "rewards/margins": 1.1806492532293003, "rewards/rejected": -1.3016945521036785, "step": 4991 }, { "epoch": 0.2645959769962632, "grad_norm": 42.75, "kl": 1.2393684387207031, "learning_rate": 5e-07, "logits/chosen": -37389237.333333336, "logits/rejected": -32454156.8, "logps/chosen": -615.6170247395834, "logps/rejected": -338.66962890625, "loss": 0.2084, "rewards/chosen": 0.9584259192148844, "rewards/margins": 3.18277055422465, "rewards/rejected": -2.2243446350097655, "step": 4992 }, { "epoch": 0.26464898099806533, "grad_norm": 56.75, "kl": 0.23321914672851562, "learning_rate": 5e-07, "logits/chosen": -115264096.0, "logits/rejected": -16550272.0, "logps/chosen": -365.6293131510417, "logps/rejected": -215.169970703125, "loss": 0.2977, "rewards/chosen": -0.048938492933909096, "rewards/margins": 1.6337139805157979, "rewards/rejected": -1.682652473449707, "step": 4993 }, { "epoch": 0.26470198499986747, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15738136.0, "logits/rejected": -12022125.6, "logps/chosen": -243.9872843424479, "logps/rejected": -175.8607177734375, "loss": 0.3682, "rewards/chosen": -0.7526367505391439, "rewards/margins": 0.800400988260905, "rewards/rejected": -1.553037738800049, "step": 4994 }, { "epoch": 0.2647549890016696, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19092744.0, "logits/rejected": -27598610.0, "logps/chosen": -367.1358337402344, "logps/rejected": -382.3609313964844, "loss": 0.2546, "rewards/chosen": 0.4458045959472656, "rewards/margins": 2.7088534832000732, "rewards/rejected": -2.2630488872528076, "step": 4995 }, { "epoch": 0.26480799300347174, "grad_norm": 48.75, "kl": 0.118408203125, "learning_rate": 5e-07, "logits/chosen": 3693527.5, "logits/rejected": -35620861.333333336, "logps/chosen": -211.03851318359375, "logps/rejected": -409.278076171875, "loss": 0.2077, "rewards/chosen": 0.6414467096328735, "rewards/margins": 2.470933397610982, "rewards/rejected": -1.8294866879781086, "step": 4996 }, { "epoch": 0.2648609970052739, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3050234.75, "logits/rejected": -46535492.0, "logps/chosen": -69.20234680175781, "logps/rejected": -292.046630859375, "loss": 0.277, "rewards/chosen": 0.5498597025871277, "rewards/margins": 2.349419057369232, "rewards/rejected": -1.7995593547821045, "step": 4997 }, { "epoch": 0.264914001007076, "grad_norm": 56.25, "kl": 0.24663162231445312, "learning_rate": 5e-07, "logits/chosen": -20156284.0, "logits/rejected": -8820360.0, "logps/chosen": -348.53759765625, "logps/rejected": -311.3177490234375, "loss": 0.277, "rewards/chosen": 0.6510241031646729, "rewards/margins": 2.1297086477279663, "rewards/rejected": -1.4786845445632935, "step": 4998 }, { "epoch": 0.26496700500887815, "grad_norm": 59.5, "kl": 0.05214691162109375, "learning_rate": 5e-07, "logits/chosen": -13210668.0, "logits/rejected": -47079780.0, "logps/chosen": -262.4106038411458, "logps/rejected": -355.613037109375, "loss": 0.3734, "rewards/chosen": 0.30887381235758465, "rewards/margins": 1.9670685927073162, "rewards/rejected": -1.6581947803497314, "step": 4999 }, { "epoch": 0.2650200090106803, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7915980.0, "logits/rejected": -26101229.714285713, "logps/chosen": -162.487548828125, "logps/rejected": -392.34176199776783, "loss": 0.164, "rewards/chosen": 2.6772797107696533, "rewards/margins": 4.4746346814291815, "rewards/rejected": -1.7973549706595284, "step": 5000 }, { "epoch": 0.26507301301248243, "grad_norm": 58.5, "kl": 0.08031463623046875, "learning_rate": 5e-07, "logits/chosen": -9898020.8, "logits/rejected": -40448165.333333336, "logps/chosen": -277.3496337890625, "logps/rejected": -539.3489583333334, "loss": 0.3643, "rewards/chosen": 0.11027344465255737, "rewards/margins": 2.3480432073275246, "rewards/rejected": -2.2377697626749673, "step": 5001 }, { "epoch": 0.26512601701428457, "grad_norm": 30.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16025718.0, "logits/rejected": -24360972.0, "logps/chosen": -91.8835678100586, "logps/rejected": -340.8648681640625, "loss": 0.307, "rewards/chosen": -0.15995541214942932, "rewards/margins": 2.2855769097805023, "rewards/rejected": -2.4455323219299316, "step": 5002 }, { "epoch": 0.2651790210160867, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31346732.8, "logits/rejected": -6587328.0, "logps/chosen": -220.98466796875, "logps/rejected": -246.42987060546875, "loss": 0.2407, "rewards/chosen": 0.665189266204834, "rewards/margins": 3.502288341522217, "rewards/rejected": -2.837099075317383, "step": 5003 }, { "epoch": 0.26523202501788884, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15685694.0, "logits/rejected": -18612560.0, "logps/chosen": -195.23394775390625, "logps/rejected": -315.97943115234375, "loss": 0.3233, "rewards/chosen": 0.0879538506269455, "rewards/margins": 1.871738001704216, "rewards/rejected": -1.7837841510772705, "step": 5004 }, { "epoch": 0.265285029019691, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18468482.666666668, "logits/rejected": -39612707.2, "logps/chosen": -317.8155924479167, "logps/rejected": -276.9518310546875, "loss": 0.3295, "rewards/chosen": 0.2054904898007711, "rewards/margins": 1.4440656622250874, "rewards/rejected": -1.2385751724243164, "step": 5005 }, { "epoch": 0.2653380330214931, "grad_norm": 65.5, "kl": 1.0250186920166016, "learning_rate": 5e-07, "logits/chosen": -8361360.0, "logits/rejected": 10596862.0, "logps/chosen": -604.0716145833334, "logps/rejected": -169.6593780517578, "loss": 0.3032, "rewards/chosen": 0.7370992501576742, "rewards/margins": 2.2902336915334067, "rewards/rejected": -1.5531344413757324, "step": 5006 }, { "epoch": 0.26539103702329525, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36204153.6, "logits/rejected": -39351581.333333336, "logps/chosen": -251.2825927734375, "logps/rejected": -369.2361246744792, "loss": 0.3767, "rewards/chosen": -0.12630654573440553, "rewards/margins": 1.9299117604891458, "rewards/rejected": -2.0562183062235513, "step": 5007 }, { "epoch": 0.2654440410250974, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4717754.5, "logits/rejected": -35098704.0, "logps/chosen": -45.94352722167969, "logps/rejected": -430.0652262369792, "loss": 0.2986, "rewards/chosen": -0.23499912023544312, "rewards/margins": 1.3426337838172913, "rewards/rejected": -1.5776329040527344, "step": 5008 }, { "epoch": 0.2654970450268995, "grad_norm": 60.75, "kl": 0.39905548095703125, "learning_rate": 5e-07, "logits/chosen": -56681509.333333336, "logits/rejected": -15396048.0, "logps/chosen": -430.2805989583333, "logps/rejected": -198.0049560546875, "loss": 0.3253, "rewards/chosen": 0.3300509254137675, "rewards/margins": 1.5633975783983867, "rewards/rejected": -1.233346652984619, "step": 5009 }, { "epoch": 0.26555004902870166, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40143301.333333336, "logits/rejected": 8085652.8, "logps/chosen": -728.6560872395834, "logps/rejected": -327.3031005859375, "loss": 0.2794, "rewards/chosen": 0.4237162272135417, "rewards/margins": 1.8664528528849285, "rewards/rejected": -1.4427366256713867, "step": 5010 }, { "epoch": 0.2656030530305038, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68464293.33333333, "logits/rejected": -49589158.4, "logps/chosen": -444.7773844401042, "logps/rejected": -314.207421875, "loss": 0.2656, "rewards/chosen": 0.3431538740793864, "rewards/margins": 2.041188923517863, "rewards/rejected": -1.6980350494384766, "step": 5011 }, { "epoch": 0.26565605703230594, "grad_norm": 63.5, "kl": 0.23499774932861328, "learning_rate": 5e-07, "logits/chosen": -3862685.3333333335, "logits/rejected": -6159394.0, "logps/chosen": -420.5330810546875, "logps/rejected": -268.3419494628906, "loss": 0.3678, "rewards/chosen": 0.4206518729527791, "rewards/margins": 1.651652415593465, "rewards/rejected": -1.231000542640686, "step": 5012 }, { "epoch": 0.2657090610341081, "grad_norm": 67.5, "kl": 0.8196144104003906, "learning_rate": 5e-07, "logits/chosen": -79795097.6, "logits/rejected": -22977520.0, "logps/chosen": -1266.41796875, "logps/rejected": -232.08780924479166, "loss": 0.2382, "rewards/chosen": 1.1848058700561523, "rewards/margins": 3.7059195836385093, "rewards/rejected": -2.521113713582357, "step": 5013 }, { "epoch": 0.2657620650359102, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65705733.333333336, "logits/rejected": -51222921.6, "logps/chosen": -173.1237996419271, "logps/rejected": -329.3457763671875, "loss": 0.2396, "rewards/chosen": 0.2154674530029297, "rewards/margins": 2.4133983612060548, "rewards/rejected": -2.197930908203125, "step": 5014 }, { "epoch": 0.26581506903771235, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46896312.0, "logits/rejected": -23102864.0, "logps/chosen": -295.8031005859375, "logps/rejected": -261.72243245442706, "loss": 0.225, "rewards/chosen": 0.2042991667985916, "rewards/margins": 2.0080290188392, "rewards/rejected": -1.8037298520406086, "step": 5015 }, { "epoch": 0.2658680730395145, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22295733.333333332, "logits/rejected": -18859492.8, "logps/chosen": -91.50764973958333, "logps/rejected": -285.90888671875, "loss": 0.2268, "rewards/chosen": 0.39947243531545, "rewards/margins": 2.729127128918966, "rewards/rejected": -2.3296546936035156, "step": 5016 }, { "epoch": 0.2659210770413166, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1622258.5, "logits/rejected": -36781949.333333336, "logps/chosen": -110.43851318359376, "logps/rejected": -431.3422037760417, "loss": 0.3626, "rewards/chosen": 0.11134529113769531, "rewards/margins": 1.7904122670491536, "rewards/rejected": -1.6790669759114583, "step": 5017 }, { "epoch": 0.26597408104311876, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2694045.6666666665, "logits/rejected": -22158822.4, "logps/chosen": -57.033610026041664, "logps/rejected": -654.47978515625, "loss": 0.2969, "rewards/chosen": -0.22034219900767008, "rewards/margins": 2.443670709927877, "rewards/rejected": -2.664012908935547, "step": 5018 }, { "epoch": 0.2660270850449209, "grad_norm": 53.25, "kl": 1.4654216766357422, "learning_rate": 5e-07, "logits/chosen": -20578761.333333332, "logits/rejected": -8324218.0, "logps/chosen": -297.23183186848956, "logps/rejected": -114.45612335205078, "loss": 0.4086, "rewards/chosen": 0.3870513439178467, "rewards/margins": 1.3824787139892578, "rewards/rejected": -0.9954273700714111, "step": 5019 }, { "epoch": 0.26608008904672303, "grad_norm": 80.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29947420.0, "logits/rejected": -31651572.0, "logps/chosen": -805.4864501953125, "logps/rejected": -304.0899353027344, "loss": 0.3639, "rewards/chosen": 0.09267730265855789, "rewards/margins": 1.3056983426213264, "rewards/rejected": -1.2130210399627686, "step": 5020 }, { "epoch": 0.26613309304852517, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6542684.0, "logits/rejected": -47569968.0, "logps/chosen": -103.7965316772461, "logps/rejected": -291.62835693359375, "loss": 0.3652, "rewards/chosen": 0.09070128202438354, "rewards/margins": 1.318893849849701, "rewards/rejected": -1.2281925678253174, "step": 5021 }, { "epoch": 0.2661860970503273, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25337685.333333332, "logits/rejected": -19092838.4, "logps/chosen": -196.33504231770834, "logps/rejected": -283.3032470703125, "loss": 0.2778, "rewards/chosen": -0.3177626132965088, "rewards/margins": 2.0887864589691163, "rewards/rejected": -2.406549072265625, "step": 5022 }, { "epoch": 0.26623910105212945, "grad_norm": 42.25, "kl": 0.04730987548828125, "learning_rate": 5e-07, "logits/chosen": -39655213.333333336, "logits/rejected": -17942673.6, "logps/chosen": -514.935791015625, "logps/rejected": -276.017919921875, "loss": 0.1918, "rewards/chosen": 0.757622241973877, "rewards/margins": 3.294537830352783, "rewards/rejected": -2.536915588378906, "step": 5023 }, { "epoch": 0.2662921050539316, "grad_norm": 64.0, "kl": 1.801626205444336, "learning_rate": 5e-07, "logits/chosen": -31921689.6, "logits/rejected": -63574474.666666664, "logps/chosen": -445.596484375, "logps/rejected": -254.07722981770834, "loss": 0.2668, "rewards/chosen": 1.1114618301391601, "rewards/margins": 2.214165957768758, "rewards/rejected": -1.102704127629598, "step": 5024 }, { "epoch": 0.2663451090557337, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34987970.666666664, "logits/rejected": -33248430.0, "logps/chosen": -242.1444295247396, "logps/rejected": -252.6368408203125, "loss": 0.3783, "rewards/chosen": 0.49097434679667157, "rewards/margins": 1.1541899840037029, "rewards/rejected": -0.6632156372070312, "step": 5025 }, { "epoch": 0.26639811305753586, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12903632.0, "logits/rejected": -20174522.666666668, "logps/chosen": -147.6119140625, "logps/rejected": -529.8853352864584, "loss": 0.3739, "rewards/chosen": 0.012884974479675293, "rewards/margins": 2.0446660121281943, "rewards/rejected": -2.031781037648519, "step": 5026 }, { "epoch": 0.266451117059338, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54321728.0, "logits/rejected": -24743388.8, "logps/chosen": -378.725830078125, "logps/rejected": -260.10986328125, "loss": 0.3013, "rewards/chosen": 0.0668322245279948, "rewards/margins": 1.6598847071329754, "rewards/rejected": -1.5930524826049806, "step": 5027 }, { "epoch": 0.26650412106114013, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59573324.8, "logits/rejected": -30835429.333333332, "logps/chosen": -504.666064453125, "logps/rejected": -231.0502726236979, "loss": 0.3366, "rewards/chosen": 0.49227113723754884, "rewards/margins": 2.156312656402588, "rewards/rejected": -1.664041519165039, "step": 5028 }, { "epoch": 0.26655712506294227, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 497989.0, "logits/rejected": -17859920.0, "logps/chosen": -744.2725219726562, "logps/rejected": -471.10577392578125, "loss": 0.2307, "rewards/chosen": 0.6688736081123352, "rewards/margins": 2.842985212802887, "rewards/rejected": -2.1741116046905518, "step": 5029 }, { "epoch": 0.2666101290647444, "grad_norm": 52.75, "kl": 0.39209747314453125, "learning_rate": 5e-07, "logits/chosen": -77027176.0, "logits/rejected": -11538192.0, "logps/chosen": -459.07647705078125, "logps/rejected": -178.43168131510416, "loss": 0.3082, "rewards/chosen": -0.03991546481847763, "rewards/margins": 1.4996982589364052, "rewards/rejected": -1.5396137237548828, "step": 5030 }, { "epoch": 0.26666313306654654, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2950772.3333333335, "logits/rejected": -31860697.6, "logps/chosen": -300.2930094401042, "logps/rejected": -191.11416015625, "loss": 0.3112, "rewards/chosen": -0.2428582509358724, "rewards/margins": 1.4430702845255534, "rewards/rejected": -1.6859285354614257, "step": 5031 }, { "epoch": 0.2667161370683487, "grad_norm": 71.0, "kl": 0.02706146240234375, "learning_rate": 5e-07, "logits/chosen": -44494675.2, "logits/rejected": -23887698.666666668, "logps/chosen": -429.43291015625, "logps/rejected": -361.347412109375, "loss": 0.3222, "rewards/chosen": 0.2691129446029663, "rewards/margins": 2.365878144900004, "rewards/rejected": -2.0967652002970376, "step": 5032 }, { "epoch": 0.2667691410701508, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64321312.0, "logits/rejected": -24838992.0, "logps/chosen": -357.5772705078125, "logps/rejected": -288.44757080078125, "loss": 0.367, "rewards/chosen": -0.14573346078395844, "rewards/margins": 1.5623809248209, "rewards/rejected": -1.7081143856048584, "step": 5033 }, { "epoch": 0.26682214507195295, "grad_norm": 70.5, "kl": 0.5345916748046875, "learning_rate": 5e-07, "logits/chosen": -28219970.666666668, "logits/rejected": 12796240.0, "logps/chosen": -519.903076171875, "logps/rejected": -233.19638061523438, "loss": 0.3307, "rewards/chosen": 0.8782739639282227, "rewards/margins": 1.7460263967514038, "rewards/rejected": -0.8677524328231812, "step": 5034 }, { "epoch": 0.2668751490737551, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25962634.0, "logits/rejected": -23003008.0, "logps/chosen": -215.67080688476562, "logps/rejected": -388.77337646484375, "loss": 0.3511, "rewards/chosen": -0.13553735613822937, "rewards/margins": 1.5379314720630646, "rewards/rejected": -1.673468828201294, "step": 5035 }, { "epoch": 0.26692815307555723, "grad_norm": 56.75, "kl": 2.66754150390625, "learning_rate": 5e-07, "logits/chosen": -19707018.0, "logits/rejected": 447045.75, "logps/chosen": -850.1459350585938, "logps/rejected": -399.9767150878906, "loss": 0.2992, "rewards/chosen": 0.7103714346885681, "rewards/margins": 2.9073380827903748, "rewards/rejected": -2.1969666481018066, "step": 5036 }, { "epoch": 0.26698115707735937, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6038118.666666667, "logits/rejected": -26984953.6, "logps/chosen": -301.5521240234375, "logps/rejected": -343.5514404296875, "loss": 0.276, "rewards/chosen": 0.06244659423828125, "rewards/margins": 2.0204853057861327, "rewards/rejected": -1.9580387115478515, "step": 5037 }, { "epoch": 0.2670341610791615, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43642162.666666664, "logits/rejected": -52251620.0, "logps/chosen": -327.09075927734375, "logps/rejected": -275.95208740234375, "loss": 0.4107, "rewards/chosen": 0.020667900641759235, "rewards/margins": 2.2481421132882438, "rewards/rejected": -2.2274742126464844, "step": 5038 }, { "epoch": 0.26708716508096364, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12060537.0, "logits/rejected": -51702133.333333336, "logps/chosen": -53.13707733154297, "logps/rejected": -440.5367431640625, "loss": 0.2174, "rewards/chosen": -0.15224817395210266, "rewards/margins": 2.7377534409364066, "rewards/rejected": -2.8900016148885093, "step": 5039 }, { "epoch": 0.2671401690827657, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35609829.333333336, "logits/rejected": -25858642.0, "logps/chosen": -216.6647745768229, "logps/rejected": -194.57723999023438, "loss": 0.3675, "rewards/chosen": 0.33317840099334717, "rewards/margins": 1.8116520643234253, "rewards/rejected": -1.4784736633300781, "step": 5040 }, { "epoch": 0.26719317308456786, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5336946.0, "logits/rejected": -16749318.0, "logps/chosen": -291.5489501953125, "logps/rejected": -410.5467529296875, "loss": 0.4007, "rewards/chosen": 0.08537883559862773, "rewards/margins": 1.8569869498411815, "rewards/rejected": -1.7716081142425537, "step": 5041 }, { "epoch": 0.26724617708637, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36026024.0, "logits/rejected": -29584658.285714287, "logps/chosen": -901.9827880859375, "logps/rejected": -393.80126953125, "loss": 0.1602, "rewards/chosen": 1.041436791419983, "rewards/margins": 2.985900793756757, "rewards/rejected": -1.9444640023367745, "step": 5042 }, { "epoch": 0.26729918108817213, "grad_norm": 63.0, "kl": 0.5160388946533203, "learning_rate": 5e-07, "logits/chosen": -29168393.14285714, "logits/rejected": -1666676.625, "logps/chosen": -411.6781529017857, "logps/rejected": -58.07307815551758, "loss": 0.5166, "rewards/chosen": -0.21707068170819963, "rewards/margins": 1.173995750291007, "rewards/rejected": -1.3910664319992065, "step": 5043 }, { "epoch": 0.26735218508997427, "grad_norm": 52.5, "kl": 0.349334716796875, "learning_rate": 5e-07, "logits/chosen": 33257280.0, "logits/rejected": -42215514.666666664, "logps/chosen": -430.67080078125, "logps/rejected": -412.3785400390625, "loss": 0.2705, "rewards/chosen": 0.6727325439453125, "rewards/margins": 2.971217028299967, "rewards/rejected": -2.298484484354655, "step": 5044 }, { "epoch": 0.2674051890917764, "grad_norm": 73.0, "kl": 3.316974639892578, "learning_rate": 5e-07, "logits/chosen": -65453401.6, "logits/rejected": 51162165.333333336, "logps/chosen": -693.03447265625, "logps/rejected": -395.6691080729167, "loss": 0.35, "rewards/chosen": 0.7071178913116455, "rewards/margins": 2.3687593301137286, "rewards/rejected": -1.6616414388020833, "step": 5045 }, { "epoch": 0.26745819309357854, "grad_norm": 44.0, "kl": 1.9979515075683594, "learning_rate": 5e-07, "logits/chosen": -22478400.0, "logits/rejected": -7552459.0, "logps/chosen": -243.72543334960938, "logps/rejected": -126.07769775390625, "loss": 0.3096, "rewards/chosen": 0.43032610416412354, "rewards/margins": 2.211718201637268, "rewards/rejected": -1.7813920974731445, "step": 5046 }, { "epoch": 0.2675111970953807, "grad_norm": 28.25, "kl": 0.2812671661376953, "learning_rate": 5e-07, "logits/chosen": 231465.65625, "logits/rejected": -22426512.0, "logps/chosen": -62.215728759765625, "logps/rejected": -233.68912760416666, "loss": 0.1981, "rewards/chosen": 0.6217814683914185, "rewards/margins": 2.704630970954895, "rewards/rejected": -2.0828495025634766, "step": 5047 }, { "epoch": 0.2675642010971828, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54470880.0, "logits/rejected": -37587680.0, "logps/chosen": -409.3766784667969, "logps/rejected": -180.77684020996094, "loss": 0.2508, "rewards/chosen": 0.3456419110298157, "rewards/margins": 2.997500240802765, "rewards/rejected": -2.651858329772949, "step": 5048 }, { "epoch": 0.26761720509898496, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13335412.0, "logits/rejected": -21008900.0, "logps/chosen": -429.5175476074219, "logps/rejected": -280.9868469238281, "loss": 0.2744, "rewards/chosen": 0.9577018618583679, "rewards/margins": 2.8768232464790344, "rewards/rejected": -1.9191213846206665, "step": 5049 }, { "epoch": 0.2676702091007871, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27103406.0, "logits/rejected": -861335.5, "logps/chosen": -218.3477020263672, "logps/rejected": -454.8507385253906, "loss": 0.299, "rewards/chosen": 0.1960085928440094, "rewards/margins": 2.2563752233982086, "rewards/rejected": -2.060366630554199, "step": 5050 }, { "epoch": 0.26772321310258923, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12940879.0, "logits/rejected": -15356894.0, "logps/chosen": -37.406063079833984, "logps/rejected": -166.3779754638672, "loss": 0.3857, "rewards/chosen": 0.16362150013446808, "rewards/margins": 1.1061185151338577, "rewards/rejected": -0.9424970149993896, "step": 5051 }, { "epoch": 0.26777621710439137, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16910360.0, "logits/rejected": -4637351.0, "logps/chosen": -308.49371337890625, "logps/rejected": -184.00332641601562, "loss": 0.3751, "rewards/chosen": 0.23173431555430093, "rewards/margins": 1.9363574186960857, "rewards/rejected": -1.7046231031417847, "step": 5052 }, { "epoch": 0.2678292211061935, "grad_norm": 48.25, "kl": 0.4902000427246094, "learning_rate": 5e-07, "logits/chosen": -29747594.666666668, "logits/rejected": -30345065.6, "logps/chosen": -336.1286214192708, "logps/rejected": -418.33837890625, "loss": 0.2438, "rewards/chosen": 0.6724995772043864, "rewards/margins": 2.363583294550578, "rewards/rejected": -1.6910837173461915, "step": 5053 }, { "epoch": 0.26788222510799564, "grad_norm": 51.25, "kl": 1.1462440490722656, "learning_rate": 5e-07, "logits/chosen": -38898464.0, "logits/rejected": -15381039.0, "logps/chosen": -306.985107421875, "logps/rejected": -192.86907958984375, "loss": 0.4053, "rewards/chosen": 0.25273871421813965, "rewards/margins": 0.8942639827728271, "rewards/rejected": -0.6415252685546875, "step": 5054 }, { "epoch": 0.2679352291097978, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38262220.0, "logits/rejected": -27571896.0, "logps/chosen": -379.0904846191406, "logps/rejected": -216.2115020751953, "loss": 0.247, "rewards/chosen": 0.5630667209625244, "rewards/margins": 2.633139133453369, "rewards/rejected": -2.0700724124908447, "step": 5055 }, { "epoch": 0.2679882331115999, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32129546.0, "logits/rejected": -70889624.0, "logps/chosen": -246.94918823242188, "logps/rejected": -547.0462646484375, "loss": 0.2847, "rewards/chosen": 0.0629657730460167, "rewards/margins": 2.669419191777706, "rewards/rejected": -2.6064534187316895, "step": 5056 }, { "epoch": 0.26804123711340205, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31112880.0, "logits/rejected": -47748308.0, "logps/chosen": -170.44619750976562, "logps/rejected": -438.4479064941406, "loss": 0.2382, "rewards/chosen": 0.5145937204360962, "rewards/margins": 2.766372799873352, "rewards/rejected": -2.251779079437256, "step": 5057 }, { "epoch": 0.2680942411152042, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9774848.0, "logits/rejected": -4331667.0, "logps/chosen": -323.37237548828125, "logps/rejected": -269.4244079589844, "loss": 0.253, "rewards/chosen": 0.41472700238227844, "rewards/margins": 3.354981690645218, "rewards/rejected": -2.9402546882629395, "step": 5058 }, { "epoch": 0.2681472451170063, "grad_norm": 47.0, "kl": 0.17449951171875, "learning_rate": 5e-07, "logits/chosen": -2039053.3333333333, "logits/rejected": -25179619.2, "logps/chosen": -209.6664021809896, "logps/rejected": -312.76962890625, "loss": 0.3017, "rewards/chosen": -0.020991767446200054, "rewards/margins": 1.630952583750089, "rewards/rejected": -1.651944351196289, "step": 5059 }, { "epoch": 0.26820024911880846, "grad_norm": 54.25, "kl": 1.0961885452270508, "learning_rate": 5e-07, "logits/chosen": -13678105.6, "logits/rejected": -26505538.666666668, "logps/chosen": -211.93115234375, "logps/rejected": -496.8374837239583, "loss": 0.3458, "rewards/chosen": 0.35473155975341797, "rewards/margins": 2.948201815287272, "rewards/rejected": -2.593470255533854, "step": 5060 }, { "epoch": 0.2682532531206106, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1172242.0, "logits/rejected": -23978384.0, "logps/chosen": -34.05958557128906, "logps/rejected": -193.16620744977678, "loss": 0.1787, "rewards/chosen": 0.7692188620567322, "rewards/margins": 2.676769946302686, "rewards/rejected": -1.9075510842459542, "step": 5061 }, { "epoch": 0.26830625712241274, "grad_norm": 41.5, "kl": 0.3112754821777344, "learning_rate": 5e-07, "logits/chosen": 13214000.0, "logits/rejected": -10522012.0, "logps/chosen": -308.2170003255208, "logps/rejected": -134.1086669921875, "loss": 0.2659, "rewards/chosen": 0.7271456718444824, "rewards/margins": 2.1557595252990724, "rewards/rejected": -1.4286138534545898, "step": 5062 }, { "epoch": 0.2683592611242149, "grad_norm": 52.0, "kl": 0.311004638671875, "learning_rate": 5e-07, "logits/chosen": -16063240.0, "logits/rejected": -3374516.0, "logps/chosen": -275.8714192708333, "logps/rejected": -78.36068725585938, "loss": 0.3774, "rewards/chosen": 0.24429086844126383, "rewards/margins": 2.15411905447642, "rewards/rejected": -1.9098281860351562, "step": 5063 }, { "epoch": 0.268412265126017, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12615257.6, "logits/rejected": -12932668.0, "logps/chosen": -264.6705322265625, "logps/rejected": -227.32171630859375, "loss": 0.3392, "rewards/chosen": 0.21657781600952147, "rewards/margins": 2.0544692357381185, "rewards/rejected": -1.837891419728597, "step": 5064 }, { "epoch": 0.26846526912781915, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41888589.333333336, "logits/rejected": -87660672.0, "logps/chosen": -480.6768391927083, "logps/rejected": -547.168212890625, "loss": 0.3361, "rewards/chosen": 0.3513787587483724, "rewards/margins": 2.6007960637410483, "rewards/rejected": -2.249417304992676, "step": 5065 }, { "epoch": 0.2685182731296213, "grad_norm": 43.75, "kl": 0.48633289337158203, "learning_rate": 5e-07, "logits/chosen": -3888818.25, "logits/rejected": -17457022.0, "logps/chosen": -49.662025451660156, "logps/rejected": -259.273193359375, "loss": 0.381, "rewards/chosen": -0.18089351058006287, "rewards/margins": 1.3707118332386017, "rewards/rejected": -1.5516053438186646, "step": 5066 }, { "epoch": 0.2685712771314234, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43640821.333333336, "logits/rejected": -7194118.4, "logps/chosen": -388.1259765625, "logps/rejected": -709.94267578125, "loss": 0.2574, "rewards/chosen": 0.4232472976048787, "rewards/margins": 3.1067748626073204, "rewards/rejected": -2.6835275650024415, "step": 5067 }, { "epoch": 0.26862428113322556, "grad_norm": 53.75, "kl": 0.6794242858886719, "learning_rate": 5e-07, "logits/chosen": -34548120.0, "logps/chosen": -461.2810363769531, "loss": 0.4565, "rewards/chosen": 0.41286414861679077, "step": 5068 }, { "epoch": 0.2686772851350277, "grad_norm": 46.0, "kl": 2.5486536026000977, "learning_rate": 5e-07, "logits/chosen": -26177508.0, "logits/rejected": -28217026.666666668, "logps/chosen": -1070.4315185546875, "logps/rejected": -192.69696044921875, "loss": 0.187, "rewards/chosen": 1.8366714715957642, "rewards/margins": 3.354205409685771, "rewards/rejected": -1.5175339380900066, "step": 5069 }, { "epoch": 0.26873028913682984, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10655542.0, "logits/rejected": -22018986.0, "logps/chosen": -146.31272888183594, "logps/rejected": -562.2409057617188, "loss": 0.2756, "rewards/chosen": 0.3436702787876129, "rewards/margins": 3.380103975534439, "rewards/rejected": -3.036433696746826, "step": 5070 }, { "epoch": 0.268783293138632, "grad_norm": 58.5, "kl": 0.01782989501953125, "learning_rate": 5e-07, "logits/chosen": 11421008.0, "logits/rejected": -2518173.8333333335, "logps/chosen": -328.9700622558594, "logps/rejected": -245.67535400390625, "loss": 0.3256, "rewards/chosen": 0.6792526245117188, "rewards/margins": 1.540943463643392, "rewards/rejected": -0.8616908391316732, "step": 5071 }, { "epoch": 0.2688362971404341, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48897845.333333336, "logits/rejected": -6623887.6, "logps/chosen": -326.501953125, "logps/rejected": -217.414501953125, "loss": 0.3027, "rewards/chosen": 0.31283847490946454, "rewards/margins": 1.8580527464548748, "rewards/rejected": -1.5452142715454102, "step": 5072 }, { "epoch": 0.26888930114223625, "grad_norm": 51.25, "kl": 0.8380851745605469, "learning_rate": 5e-07, "logits/chosen": -84573843.2, "logits/rejected": -68071994.66666667, "logps/chosen": -255.499853515625, "logps/rejected": -101.45839436848958, "loss": 0.3837, "rewards/chosen": 0.008529126644134521, "rewards/margins": 1.5281250675519307, "rewards/rejected": -1.5195959409077961, "step": 5073 }, { "epoch": 0.2689423051440384, "grad_norm": 60.5, "kl": 0.9688692092895508, "learning_rate": 5e-07, "logits/chosen": -79862720.0, "logits/rejected": 9190473.333333334, "logps/chosen": -372.0354736328125, "logps/rejected": -120.80385335286458, "loss": 0.4219, "rewards/chosen": -0.0004474908113479614, "rewards/margins": 1.0450079649686814, "rewards/rejected": -1.0454554557800293, "step": 5074 }, { "epoch": 0.2689953091458405, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18020282.666666668, "logits/rejected": 4399360.4, "logps/chosen": -161.83665974934897, "logps/rejected": -211.4328125, "loss": 0.3145, "rewards/chosen": -0.0578478475411733, "rewards/margins": 1.5841008524099986, "rewards/rejected": -1.6419486999511719, "step": 5075 }, { "epoch": 0.26904831314764266, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36110058.666666664, "logits/rejected": -16197694.4, "logps/chosen": -222.81953938802084, "logps/rejected": -284.0506103515625, "loss": 0.2788, "rewards/chosen": -0.01214967668056488, "rewards/margins": 1.95423826277256, "rewards/rejected": -1.966387939453125, "step": 5076 }, { "epoch": 0.2691013171494448, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31059988.0, "logits/rejected": -7008590.0, "logps/chosen": -436.5326232910156, "logps/rejected": -147.37832641601562, "loss": 0.2975, "rewards/chosen": 0.23533859848976135, "rewards/margins": 2.334106832742691, "rewards/rejected": -2.0987682342529297, "step": 5077 }, { "epoch": 0.26915432115124693, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -3904955.0, "logps/rejected": -418.5440979003906, "loss": 0.1006, "rewards/rejected": -2.5303072929382324, "step": 5078 }, { "epoch": 0.26920732515304907, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29891900.0, "logits/rejected": -39377648.0, "logps/chosen": -323.1205749511719, "logps/rejected": -365.6616617838542, "loss": 0.2127, "rewards/chosen": 0.49579161405563354, "rewards/margins": 2.645894229412079, "rewards/rejected": -2.1501026153564453, "step": 5079 }, { "epoch": 0.2692603291548512, "grad_norm": 53.5, "kl": 0.7567830085754395, "learning_rate": 5e-07, "logits/chosen": -19970050.666666668, "logits/rejected": 2108821.5, "logps/chosen": -340.16514078776044, "logps/rejected": -34.74211120605469, "loss": 0.4282, "rewards/chosen": 0.1468193233013153, "rewards/margins": 1.0016147792339325, "rewards/rejected": -0.8547954559326172, "step": 5080 }, { "epoch": 0.26931333315665335, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22550912.0, "logits/rejected": -29906812.0, "logps/chosen": -422.628662109375, "logps/rejected": -509.6435241699219, "loss": 0.3522, "rewards/chosen": 0.38228126366933185, "rewards/margins": 3.0123571952184043, "rewards/rejected": -2.6300759315490723, "step": 5081 }, { "epoch": 0.2693663371584555, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31002204.0, "logits/rejected": -10920784.0, "logps/chosen": -171.86741638183594, "logps/rejected": -93.31137084960938, "loss": 0.3492, "rewards/chosen": -0.21459683775901794, "rewards/margins": 1.820472627878189, "rewards/rejected": -2.035069465637207, "step": 5082 }, { "epoch": 0.2694193411602576, "grad_norm": 49.25, "kl": 0.5169677734375, "learning_rate": 5e-07, "logits/chosen": -25596708.8, "logits/rejected": -24834954.666666668, "logps/chosen": -226.234814453125, "logps/rejected": -369.586181640625, "loss": 0.3364, "rewards/chosen": 0.14461005926132203, "rewards/margins": 2.5376012841860454, "rewards/rejected": -2.392991224924723, "step": 5083 }, { "epoch": 0.26947234516205976, "grad_norm": 48.0, "kl": 0.02353668212890625, "learning_rate": 5e-07, "logits/chosen": -37033372.8, "logits/rejected": -13239857.333333334, "logps/chosen": -283.537548828125, "logps/rejected": -500.5490315755208, "loss": 0.3052, "rewards/chosen": 0.3749985218048096, "rewards/margins": 3.129787874221802, "rewards/rejected": -2.754789352416992, "step": 5084 }, { "epoch": 0.2695253491638619, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17215330.666666668, "logits/rejected": -48200307.2, "logps/chosen": -417.3966064453125, "logps/rejected": -464.87998046875, "loss": 0.194, "rewards/chosen": 0.8665935198465983, "rewards/margins": 2.9584643999735514, "rewards/rejected": -2.091870880126953, "step": 5085 }, { "epoch": 0.26957835316566403, "grad_norm": 52.0, "kl": 0.7675189971923828, "learning_rate": 5e-07, "logits/chosen": -2934504.25, "logits/rejected": -24634208.0, "logps/chosen": -142.8314666748047, "logps/rejected": -427.1431579589844, "loss": 0.3536, "rewards/chosen": -0.04378237575292587, "rewards/margins": 1.7990768924355507, "rewards/rejected": -1.8428592681884766, "step": 5086 }, { "epoch": 0.26963135716746617, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1408080.0, "logits/rejected": -57723304.0, "logps/chosen": -342.08775111607144, "logps/rejected": -683.64697265625, "loss": 0.4968, "rewards/chosen": -0.18224763870239258, "rewards/margins": 4.717166423797607, "rewards/rejected": -4.8994140625, "step": 5087 }, { "epoch": 0.2696843611692683, "grad_norm": 49.0, "kl": 1.3346977233886719, "learning_rate": 5e-07, "logits/chosen": -32296272.0, "logits/rejected": -48855640.0, "logps/chosen": -355.5965576171875, "logps/rejected": -569.599365234375, "loss": 0.1833, "rewards/chosen": 1.1140923500061035, "rewards/margins": 3.9364078044891357, "rewards/rejected": -2.8223154544830322, "step": 5088 }, { "epoch": 0.26973736517107044, "grad_norm": 67.0, "kl": 1.4538078308105469, "learning_rate": 5e-07, "logits/chosen": 14117296.0, "logits/rejected": -16378796.8, "logps/chosen": -712.316162109375, "logps/rejected": -327.147607421875, "loss": 0.2947, "rewards/chosen": 0.4363505045572917, "rewards/margins": 2.357369295756022, "rewards/rejected": -1.9210187911987304, "step": 5089 }, { "epoch": 0.2697903691728726, "grad_norm": 40.5, "kl": 1.0779781341552734, "learning_rate": 5e-07, "logits/chosen": -26644090.0, "logits/rejected": -45465980.0, "logps/chosen": -196.96885681152344, "logps/rejected": -445.1790771484375, "loss": 0.3123, "rewards/chosen": 0.021690428256988525, "rewards/margins": 2.8556800484657288, "rewards/rejected": -2.8339896202087402, "step": 5090 }, { "epoch": 0.26984337317467466, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56787052.8, "logits/rejected": -48776965.333333336, "logps/chosen": -562.790625, "logps/rejected": -297.9466552734375, "loss": 0.2891, "rewards/chosen": 0.5328581809997559, "rewards/margins": 2.8419373512268065, "rewards/rejected": -2.309079170227051, "step": 5091 }, { "epoch": 0.2698963771764768, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6701106.0, "logits/rejected": -23209896.0, "logps/chosen": -167.04379272460938, "logps/rejected": -320.9602966308594, "loss": 0.3491, "rewards/chosen": -0.18155398964881897, "rewards/margins": 1.71553835272789, "rewards/rejected": -1.897092342376709, "step": 5092 }, { "epoch": 0.26994938117827894, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53765525.333333336, "logits/rejected": -675864.5, "logps/chosen": -314.66457112630206, "logps/rejected": -93.71138916015624, "loss": 0.2038, "rewards/chosen": 0.766462246576945, "rewards/margins": 2.685758129755656, "rewards/rejected": -1.919295883178711, "step": 5093 }, { "epoch": 0.2700023851800811, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29951453.333333332, "logits/rejected": -33778988.8, "logps/chosen": -181.56925455729166, "logps/rejected": -400.3154541015625, "loss": 0.2155, "rewards/chosen": 0.7358555793762207, "rewards/margins": 2.705681324005127, "rewards/rejected": -1.9698257446289062, "step": 5094 }, { "epoch": 0.2700553891818832, "grad_norm": 49.25, "kl": 1.1577692031860352, "learning_rate": 5e-07, "logits/chosen": -2578492.8, "logits/rejected": -13021389.333333334, "logps/chosen": -106.116748046875, "logps/rejected": -310.033203125, "loss": 0.3626, "rewards/chosen": 0.6267357349395752, "rewards/margins": 1.711970853805542, "rewards/rejected": -1.0852351188659668, "step": 5095 }, { "epoch": 0.27010839318368535, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -683188.7, "logits/rejected": -27088933.333333332, "logps/chosen": -121.54522705078125, "logps/rejected": -327.02146402994794, "loss": 0.3661, "rewards/chosen": 0.12090404033660888, "rewards/margins": 1.6607762098312377, "rewards/rejected": -1.539872169494629, "step": 5096 }, { "epoch": 0.2701613971854875, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7254422.4, "logits/rejected": -33208226.666666668, "logps/chosen": -328.853271484375, "logps/rejected": -305.8623453776042, "loss": 0.346, "rewards/chosen": -0.007307893037796021, "rewards/margins": 2.717480136950811, "rewards/rejected": -2.724788029988607, "step": 5097 }, { "epoch": 0.2702144011872896, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29315850.666666668, "logits/rejected": -69271788.8, "logps/chosen": -182.3241170247396, "logps/rejected": -212.437890625, "loss": 0.3464, "rewards/chosen": 0.020142873128255207, "rewards/margins": 1.1715100606282551, "rewards/rejected": -1.1513671875, "step": 5098 }, { "epoch": 0.27026740518909176, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47320632.0, "logits/rejected": -24754830.0, "logps/chosen": -181.9544677734375, "logps/rejected": -281.783203125, "loss": 0.332, "rewards/chosen": 0.09894117712974548, "rewards/margins": 1.6409741342067719, "rewards/rejected": -1.5420329570770264, "step": 5099 }, { "epoch": 0.2703204091908939, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42300360.0, "logits/rejected": -24473040.0, "logps/chosen": -348.9386901855469, "logps/rejected": -292.1136474609375, "loss": 0.1834, "rewards/chosen": 0.43920576572418213, "rewards/margins": 2.5681604941685996, "rewards/rejected": -2.1289547284444175, "step": 5100 }, { "epoch": 0.27037341319269603, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9081344.0, "logits/rejected": -22873002.666666668, "logps/chosen": -365.021728515625, "logps/rejected": -320.0034993489583, "loss": 0.1661, "rewards/chosen": 0.5204734802246094, "rewards/margins": 2.8420475323994956, "rewards/rejected": -2.321574052174886, "step": 5101 }, { "epoch": 0.27042641719449817, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26770358.4, "logits/rejected": -26554648.0, "logps/chosen": -243.55419921875, "logps/rejected": -196.94417317708334, "loss": 0.3318, "rewards/chosen": 0.5272582530975342, "rewards/margins": 1.7598269939422608, "rewards/rejected": -1.2325687408447266, "step": 5102 }, { "epoch": 0.2704794211963003, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2773117.5, "logits/rejected": -44286020.0, "logps/chosen": -113.64808146158855, "logps/rejected": -145.31304931640625, "loss": 0.3647, "rewards/chosen": 0.22914808988571167, "rewards/margins": 2.3284522891044617, "rewards/rejected": -2.09930419921875, "step": 5103 }, { "epoch": 0.27053242519810244, "grad_norm": 55.25, "kl": 0.5794219970703125, "learning_rate": 5e-07, "logits/chosen": -35005605.333333336, "logits/rejected": -38733716.0, "logps/chosen": -597.4033203125, "logps/rejected": -367.31756591796875, "loss": 0.3259, "rewards/chosen": 0.7255520025889078, "rewards/margins": 2.539230744043986, "rewards/rejected": -1.8136787414550781, "step": 5104 }, { "epoch": 0.2705854291999046, "grad_norm": 51.25, "kl": 0.153289794921875, "learning_rate": 5e-07, "logits/chosen": -27255086.0, "logits/rejected": -18773116.0, "logps/chosen": -285.7622375488281, "logps/rejected": -99.84512329101562, "loss": 0.3717, "rewards/chosen": 0.0074371397495269775, "rewards/margins": 1.3179484903812408, "rewards/rejected": -1.3105113506317139, "step": 5105 }, { "epoch": 0.2706384332017067, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41597749.333333336, "logits/rejected": -5788812.4, "logps/chosen": -387.5874430338542, "logps/rejected": -177.5263671875, "loss": 0.2428, "rewards/chosen": 0.7203888098398844, "rewards/margins": 2.340990845362345, "rewards/rejected": -1.6206020355224608, "step": 5106 }, { "epoch": 0.27069143720350886, "grad_norm": 58.25, "kl": 0.9353084564208984, "learning_rate": 5e-07, "logits/chosen": 5882034.666666667, "logits/rejected": -22441334.4, "logps/chosen": -556.9654134114584, "logps/rejected": -165.4173828125, "loss": 0.2224, "rewards/chosen": 1.006051778793335, "rewards/margins": 2.6875617504119873, "rewards/rejected": -1.6815099716186523, "step": 5107 }, { "epoch": 0.270744441205311, "grad_norm": 61.0, "kl": 0.0724334716796875, "learning_rate": 5e-07, "logits/chosen": -48264918.4, "logits/rejected": -38950898.666666664, "logps/chosen": -246.405224609375, "logps/rejected": -554.739990234375, "loss": 0.347, "rewards/chosen": 0.3404407978057861, "rewards/margins": 1.926508156458537, "rewards/rejected": -1.5860673586527507, "step": 5108 }, { "epoch": 0.27079744520711313, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34671248.0, "logits/rejected": -25645102.0, "logps/chosen": -236.98410034179688, "logps/rejected": -642.0223999023438, "loss": 0.2922, "rewards/chosen": 0.07222747802734375, "rewards/margins": 2.270221710205078, "rewards/rejected": -2.1979942321777344, "step": 5109 }, { "epoch": 0.27085044920891527, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40873872.0, "logits/rejected": -13467292.0, "logps/chosen": -283.186279296875, "logps/rejected": -264.3362731933594, "loss": 0.4169, "rewards/chosen": -0.08249463886022568, "rewards/margins": 0.7717963233590126, "rewards/rejected": -0.8542909622192383, "step": 5110 }, { "epoch": 0.2709034532107174, "grad_norm": 52.75, "kl": 2.1586837768554688, "learning_rate": 5e-07, "logits/chosen": -5612164.8, "logits/rejected": -54934416.0, "logps/chosen": -172.228759765625, "logps/rejected": -358.65673828125, "loss": 0.3832, "rewards/chosen": 0.2533968210220337, "rewards/margins": 2.198568018277486, "rewards/rejected": -1.9451711972554524, "step": 5111 }, { "epoch": 0.27095645721251954, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55878228.0, "logits/rejected": -7855896.571428572, "logps/chosen": -405.3582763671875, "logps/rejected": -268.23854282924106, "loss": 0.1598, "rewards/chosen": 1.046453833580017, "rewards/margins": 3.098809463637216, "rewards/rejected": -2.052355630057199, "step": 5112 }, { "epoch": 0.2710094612143217, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14688692.8, "logits/rejected": -17103349.333333332, "logps/chosen": -156.4313720703125, "logps/rejected": -108.2184549967448, "loss": 0.3736, "rewards/chosen": 0.09118732810020447, "rewards/margins": 1.879130055507024, "rewards/rejected": -1.7879427274068196, "step": 5113 }, { "epoch": 0.2710624652161238, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7355889.0, "logits/rejected": -42939812.0, "logps/chosen": -249.3423614501953, "logps/rejected": -385.7609558105469, "loss": 0.2757, "rewards/chosen": 0.3060312271118164, "rewards/margins": 2.4488210678100586, "rewards/rejected": -2.142789840698242, "step": 5114 }, { "epoch": 0.27111546921792595, "grad_norm": 49.0, "kl": 0.9613380432128906, "learning_rate": 5e-07, "logits/chosen": -22362837.333333332, "logits/rejected": 168990.4, "logps/chosen": -386.8413492838542, "logps/rejected": -500.945654296875, "loss": 0.2325, "rewards/chosen": 0.4075022538503011, "rewards/margins": 3.180280574162801, "rewards/rejected": -2.7727783203125, "step": 5115 }, { "epoch": 0.2711684732197281, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42450576.0, "logits/rejected": -12980192.0, "logps/chosen": -550.7127685546875, "logps/rejected": -181.087158203125, "loss": 0.2199, "rewards/chosen": 1.2632874250411987, "rewards/margins": 2.700027482850211, "rewards/rejected": -1.4367400578090124, "step": 5116 }, { "epoch": 0.2712214772215302, "grad_norm": 51.0, "kl": 0.8335514068603516, "learning_rate": 5e-07, "logits/chosen": -10826265.333333334, "logits/rejected": -41483812.0, "logps/chosen": -197.2763671875, "logps/rejected": -219.35009765625, "loss": 0.4252, "rewards/chosen": 0.16207546989123026, "rewards/margins": 1.4113499621550243, "rewards/rejected": -1.249274492263794, "step": 5117 }, { "epoch": 0.27127448122333236, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20100230.666666668, "logits/rejected": -35927686.4, "logps/chosen": -279.229736328125, "logps/rejected": -272.2629638671875, "loss": 0.2715, "rewards/chosen": 0.21371797720591226, "rewards/margins": 2.217958394686381, "rewards/rejected": -2.0042404174804687, "step": 5118 }, { "epoch": 0.2713274852251345, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32524704.0, "logits/rejected": -57881146.666666664, "logps/chosen": -225.541845703125, "logps/rejected": -163.53929646809897, "loss": 0.4544, "rewards/chosen": -0.3711840629577637, "rewards/margins": 1.1126322428385418, "rewards/rejected": -1.4838163057963054, "step": 5119 }, { "epoch": 0.27138048922693664, "grad_norm": 66.0, "kl": 0.8285236358642578, "learning_rate": 5e-07, "logits/chosen": -27148602.666666668, "logits/rejected": 8297536.0, "logps/chosen": -651.6486002604166, "logps/rejected": -493.8883361816406, "loss": 0.3059, "rewards/chosen": 0.6735905806223551, "rewards/margins": 3.052109162012736, "rewards/rejected": -2.378518581390381, "step": 5120 }, { "epoch": 0.2714334932287388, "grad_norm": 52.0, "kl": 1.594569206237793, "learning_rate": 5e-07, "logits/chosen": -7271395.333333333, "logits/rejected": -105066320.0, "logps/chosen": -234.09222412109375, "logps/rejected": -407.85943603515625, "loss": 0.331, "rewards/chosen": 0.6221665541330973, "rewards/margins": 3.07694403330485, "rewards/rejected": -2.454777479171753, "step": 5121 }, { "epoch": 0.2714864972305409, "grad_norm": 69.0, "kl": 3.3906707763671875, "learning_rate": 5e-07, "logits/chosen": -18703400.0, "logps/chosen": -352.4608459472656, "loss": 0.4297, "rewards/chosen": 0.6304988861083984, "step": 5122 }, { "epoch": 0.27153950123234305, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48186796.0, "logits/rejected": 8164210.0, "logps/chosen": -215.0402374267578, "logps/rejected": -138.92385864257812, "loss": 0.42, "rewards/chosen": -0.17159563302993774, "rewards/margins": 0.7202432155609131, "rewards/rejected": -0.8918388485908508, "step": 5123 }, { "epoch": 0.2715925052341452, "grad_norm": 68.5, "kl": 0.5528316497802734, "learning_rate": 5e-07, "logits/chosen": -27869202.666666668, "logits/rejected": -44380308.0, "logps/chosen": -398.0458577473958, "logps/rejected": -290.06390380859375, "loss": 0.3752, "rewards/chosen": 0.3574180603027344, "rewards/margins": 1.708207368850708, "rewards/rejected": -1.3507893085479736, "step": 5124 }, { "epoch": 0.2716455092359473, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1150028.0, "logits/rejected": -5975272.5, "logps/chosen": -429.24871826171875, "logps/rejected": -141.62879943847656, "loss": 0.2493, "rewards/chosen": 0.5644977688789368, "rewards/margins": 2.7860713601112366, "rewards/rejected": -2.2215735912323, "step": 5125 }, { "epoch": 0.27169851323774946, "grad_norm": 47.75, "kl": 0.039429664611816406, "learning_rate": 5e-07, "logits/chosen": -14098569.6, "logits/rejected": -5311802.0, "logps/chosen": -299.37138671875, "logps/rejected": -210.6094767252604, "loss": 0.3269, "rewards/chosen": 0.2993204593658447, "rewards/margins": 2.1115734577178955, "rewards/rejected": -1.8122529983520508, "step": 5126 }, { "epoch": 0.2717515172395516, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1146974.0, "logits/rejected": 18114928.0, "logps/chosen": -220.10287475585938, "logps/rejected": -622.1868896484375, "loss": 0.3741, "rewards/chosen": 0.0688994824886322, "rewards/margins": 2.9202607572078705, "rewards/rejected": -2.8513612747192383, "step": 5127 }, { "epoch": 0.27180452124135374, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38460946.666666664, "logits/rejected": -37073139.2, "logps/chosen": -355.0315755208333, "logps/rejected": -419.7857421875, "loss": 0.2277, "rewards/chosen": -0.0183141032854716, "rewards/margins": 2.8441612919171653, "rewards/rejected": -2.8624753952026367, "step": 5128 }, { "epoch": 0.2718575252431559, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43326777.6, "logits/rejected": -22725384.0, "logps/chosen": -252.7130859375, "logps/rejected": -257.37221272786456, "loss": 0.347, "rewards/chosen": 0.6031984329223633, "rewards/margins": 1.522821013132731, "rewards/rejected": -0.9196225802103678, "step": 5129 }, { "epoch": 0.271910529244958, "grad_norm": 54.0, "kl": 0.7146644592285156, "learning_rate": 5e-07, "logits/chosen": -29368562.666666668, "logits/rejected": -26917798.4, "logps/chosen": -430.983642578125, "logps/rejected": -177.83648681640625, "loss": 0.2744, "rewards/chosen": 0.37236889203389484, "rewards/margins": 2.0438181718190513, "rewards/rejected": -1.6714492797851563, "step": 5130 }, { "epoch": 0.27196353324676015, "grad_norm": 68.5, "kl": 0.5268096923828125, "learning_rate": 5e-07, "logits/chosen": -85847788.8, "logits/rejected": -9037126.666666666, "logps/chosen": -571.345068359375, "logps/rejected": -271.07033284505206, "loss": 0.3547, "rewards/chosen": 0.18867249488830568, "rewards/margins": 1.9679180304209392, "rewards/rejected": -1.7792455355326335, "step": 5131 }, { "epoch": 0.2720165372485623, "grad_norm": 46.25, "kl": 1.2309532165527344, "learning_rate": 5e-07, "logits/chosen": 13089206.0, "logits/rejected": -25361309.333333332, "logps/chosen": -230.905029296875, "logps/rejected": -260.7092692057292, "loss": 0.2719, "rewards/chosen": 0.8780719041824341, "rewards/margins": 2.110137263933818, "rewards/rejected": -1.2320653597513835, "step": 5132 }, { "epoch": 0.2720695412503644, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50291848.0, "logits/rejected": -40353242.666666664, "logps/chosen": -231.16201782226562, "logps/rejected": -514.0815836588541, "loss": 0.158, "rewards/chosen": 0.7340995669364929, "rewards/margins": 3.65631757179896, "rewards/rejected": -2.9222180048624673, "step": 5133 }, { "epoch": 0.27212254525216656, "grad_norm": 66.5, "kl": 3.1938133239746094, "learning_rate": 5e-07, "logits/chosen": -45156988.8, "logits/rejected": -12815053.333333334, "logps/chosen": -610.9537109375, "logps/rejected": -261.0779622395833, "loss": 0.3215, "rewards/chosen": 0.5367672920227051, "rewards/margins": 2.936801242828369, "rewards/rejected": -2.400033950805664, "step": 5134 }, { "epoch": 0.2721755492539687, "grad_norm": 53.25, "kl": 0.18790435791015625, "learning_rate": 5e-07, "logits/chosen": -18591088.0, "logits/rejected": -70592784.0, "logps/chosen": -152.5196075439453, "logps/rejected": -327.8719482421875, "loss": 0.3052, "rewards/chosen": 0.4643573760986328, "rewards/margins": 2.1137571334838867, "rewards/rejected": -1.649399757385254, "step": 5135 }, { "epoch": 0.27222855325577083, "grad_norm": 51.5, "kl": 0.3701591491699219, "learning_rate": 5e-07, "logits/chosen": -33483560.0, "logits/rejected": -1921772.0, "logps/chosen": -197.2008260091146, "logps/rejected": -150.34689331054688, "loss": 0.4042, "rewards/chosen": 0.1359781821568807, "rewards/margins": 1.642417033513387, "rewards/rejected": -1.5064388513565063, "step": 5136 }, { "epoch": 0.27228155725757297, "grad_norm": 70.5, "kl": 3.323261260986328, "learning_rate": 5e-07, "logits/chosen": -23570179.2, "logits/rejected": -28928330.666666668, "logps/chosen": -388.1326904296875, "logps/rejected": -259.4202473958333, "loss": 0.3461, "rewards/chosen": 0.7680445671081543, "rewards/margins": 2.393996175130208, "rewards/rejected": -1.625951608022054, "step": 5137 }, { "epoch": 0.2723345612593751, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10430787.0, "logits/rejected": 42044088.0, "logps/chosen": -240.0495147705078, "logps/rejected": -358.16265869140625, "loss": 0.2997, "rewards/chosen": 0.12743684649467468, "rewards/margins": 2.2716700732707977, "rewards/rejected": -2.144233226776123, "step": 5138 }, { "epoch": 0.27238756526117724, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91679072.0, "logits/rejected": -75819424.0, "logps/chosen": -610.9228515625, "logps/rejected": -195.08357747395834, "loss": 0.3102, "rewards/chosen": 0.07433471828699112, "rewards/margins": 1.2479245041807492, "rewards/rejected": -1.173589785893758, "step": 5139 }, { "epoch": 0.2724405692629794, "grad_norm": 59.25, "kl": 1.3057937622070312, "learning_rate": 5e-07, "logits/chosen": -17325681.333333332, "logits/rejected": -3838907.2, "logps/chosen": -396.1566975911458, "logps/rejected": -242.8112548828125, "loss": 0.2497, "rewards/chosen": 0.5987821420033773, "rewards/margins": 2.7223699410756432, "rewards/rejected": -2.123587799072266, "step": 5140 }, { "epoch": 0.27249357326478146, "grad_norm": 47.75, "kl": 0.21010589599609375, "learning_rate": 5e-07, "logits/chosen": -55408512.0, "logits/rejected": -97456448.0, "logps/chosen": -261.25885009765625, "logps/rejected": -194.84979248046875, "loss": 0.3777, "rewards/chosen": 0.3126911719640096, "rewards/margins": 1.7531636555989583, "rewards/rejected": -1.4404724836349487, "step": 5141 }, { "epoch": 0.2725465772665836, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2343219.3333333335, "logits/rejected": -37459561.6, "logps/chosen": -123.61948649088542, "logps/rejected": -267.09033203125, "loss": 0.319, "rewards/chosen": -0.3438222408294678, "rewards/margins": 1.3803110599517823, "rewards/rejected": -1.72413330078125, "step": 5142 }, { "epoch": 0.27259958126838574, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11711503.0, "logits/rejected": 7321746.0, "logps/chosen": -162.45262145996094, "logps/rejected": -172.98207092285156, "loss": 0.2743, "rewards/chosen": 0.5992305874824524, "rewards/margins": 2.2836782336235046, "rewards/rejected": -1.6844476461410522, "step": 5143 }, { "epoch": 0.2726525852701879, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17604292.0, "logits/rejected": -63254976.0, "logps/chosen": -251.8161417643229, "logps/rejected": -379.89677734375, "loss": 0.2282, "rewards/chosen": 0.5892257293065389, "rewards/margins": 2.84538684686025, "rewards/rejected": -2.256161117553711, "step": 5144 }, { "epoch": 0.27270558927199, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35429205.333333336, "logits/rejected": -44059212.8, "logps/chosen": -314.6033935546875, "logps/rejected": -419.583349609375, "loss": 0.2613, "rewards/chosen": 0.5436864296595255, "rewards/margins": 2.200545064608256, "rewards/rejected": -1.6568586349487304, "step": 5145 }, { "epoch": 0.27275859327379215, "grad_norm": 38.25, "kl": 0.00461578369140625, "learning_rate": 5e-07, "logits/chosen": -5154609.6, "logits/rejected": -38522394.666666664, "logps/chosen": -154.2171630859375, "logps/rejected": -301.5667317708333, "loss": 0.2695, "rewards/chosen": 0.7137864112854004, "rewards/margins": 2.673274008433024, "rewards/rejected": -1.9594875971476238, "step": 5146 }, { "epoch": 0.2728115972755943, "grad_norm": 48.5, "kl": 0.73431396484375, "learning_rate": 5e-07, "logits/chosen": -1136774.25, "logits/rejected": -26817850.0, "logps/chosen": -224.26361083984375, "logps/rejected": -301.2025146484375, "loss": 0.2503, "rewards/chosen": 0.8137763738632202, "rewards/margins": 2.704045057296753, "rewards/rejected": -1.8902686834335327, "step": 5147 }, { "epoch": 0.2728646012773964, "grad_norm": 67.5, "kl": 3.105316162109375, "learning_rate": 5e-07, "logits/chosen": -43438772.0, "logits/rejected": -26200726.0, "logps/chosen": -682.43359375, "logps/rejected": -419.9832763671875, "loss": 0.3068, "rewards/chosen": 0.6826431751251221, "rewards/margins": 2.8003947734832764, "rewards/rejected": -2.1177515983581543, "step": 5148 }, { "epoch": 0.27291760527919856, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13271136.8, "logits/rejected": -11821714.666666666, "logps/chosen": -212.0287109375, "logps/rejected": -277.5618082682292, "loss": 0.3106, "rewards/chosen": 0.19906263351440429, "rewards/margins": 3.3578579902648924, "rewards/rejected": -3.1587953567504883, "step": 5149 }, { "epoch": 0.2729706092810007, "grad_norm": 59.0, "kl": 0.07855987548828125, "learning_rate": 5e-07, "logits/chosen": -30236533.333333332, "logits/rejected": -23530198.4, "logps/chosen": -533.0276285807291, "logps/rejected": -349.5912841796875, "loss": 0.2202, "rewards/chosen": 0.9434611002604166, "rewards/margins": 2.703859583536784, "rewards/rejected": -1.7603984832763673, "step": 5150 }, { "epoch": 0.27302361328280284, "grad_norm": 44.0, "kl": 0.15908336639404297, "learning_rate": 5e-07, "logits/chosen": -15142974.4, "logits/rejected": -26035872.0, "logps/chosen": -183.65699462890626, "logps/rejected": -311.34674072265625, "loss": 0.3303, "rewards/chosen": 0.21654465198516845, "rewards/margins": 2.4813393195470175, "rewards/rejected": -2.264794667561849, "step": 5151 }, { "epoch": 0.27307661728460497, "grad_norm": 66.0, "kl": 0.9201889038085938, "learning_rate": 5e-07, "logits/chosen": -61894848.0, "logits/rejected": 21122172.0, "logps/chosen": -402.6918247767857, "logps/rejected": -188.84429931640625, "loss": 0.4983, "rewards/chosen": 0.08467038188661848, "rewards/margins": 0.23402035449232372, "rewards/rejected": -0.14934997260570526, "step": 5152 }, { "epoch": 0.2731296212864071, "grad_norm": 60.25, "kl": 1.05364990234375, "learning_rate": 5e-07, "logits/chosen": -48110792.0, "logits/rejected": -18218696.0, "logps/chosen": -483.52276611328125, "logps/rejected": -331.9594421386719, "loss": 0.3018, "rewards/chosen": 0.19138489663600922, "rewards/margins": 2.266084536910057, "rewards/rejected": -2.074699640274048, "step": 5153 }, { "epoch": 0.27318262528820925, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13134.75, "logits/rejected": -20898850.666666668, "logps/chosen": -134.0642852783203, "logps/rejected": -374.7650960286458, "loss": 0.2559, "rewards/chosen": 0.33082467317581177, "rewards/margins": 2.3297385970751447, "rewards/rejected": -1.9989139238993328, "step": 5154 }, { "epoch": 0.2732356292900114, "grad_norm": 56.25, "kl": 0.3448619842529297, "learning_rate": 5e-07, "logits/chosen": -50664032.0, "logits/rejected": -48756744.0, "logps/chosen": -393.0045572916667, "logps/rejected": -486.56646728515625, "loss": 0.2867, "rewards/chosen": 0.6121884981791178, "rewards/margins": 3.654789606730143, "rewards/rejected": -3.0426011085510254, "step": 5155 }, { "epoch": 0.2732886332918135, "grad_norm": 51.25, "kl": 0.5740280151367188, "learning_rate": 5e-07, "logits/chosen": -46186892.0, "logits/rejected": -25787372.0, "logps/chosen": -334.916748046875, "logps/rejected": -338.62225341796875, "loss": 0.3274, "rewards/chosen": 0.350461483001709, "rewards/margins": 1.934264898300171, "rewards/rejected": -1.583803415298462, "step": 5156 }, { "epoch": 0.27334163729361566, "grad_norm": 45.5, "kl": 0.21149063110351562, "learning_rate": 5e-07, "logits/chosen": -22094072.0, "logits/rejected": -23237438.0, "logps/chosen": -166.62600708007812, "logps/rejected": -280.80908203125, "loss": 0.3219, "rewards/chosen": 0.30454200506210327, "rewards/margins": 1.8139246106147766, "rewards/rejected": -1.5093826055526733, "step": 5157 }, { "epoch": 0.2733946412954178, "grad_norm": 56.25, "kl": 2.458311080932617, "learning_rate": 5e-07, "logits/chosen": -6100499.2, "logits/rejected": 3890033.6666666665, "logps/chosen": -254.016259765625, "logps/rejected": -271.22422281901044, "loss": 0.343, "rewards/chosen": 0.6625200271606445, "rewards/margins": 2.1824020703633624, "rewards/rejected": -1.519882043202718, "step": 5158 }, { "epoch": 0.27344764529721993, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23544796.0, "logits/rejected": -31702816.0, "logps/chosen": -254.00648498535156, "logps/rejected": -288.61822509765625, "loss": 0.2154, "rewards/chosen": 0.7183563709259033, "rewards/margins": 3.2371747493743896, "rewards/rejected": -2.5188183784484863, "step": 5159 }, { "epoch": 0.27350064929902207, "grad_norm": 57.75, "kl": 3.2941627502441406, "learning_rate": 5e-07, "logits/chosen": -14208734.4, "logits/rejected": -50882090.666666664, "logps/chosen": -547.282373046875, "logps/rejected": -691.1339518229166, "loss": 0.2834, "rewards/chosen": 1.142190933227539, "rewards/margins": 3.848572095235189, "rewards/rejected": -2.70638116200765, "step": 5160 }, { "epoch": 0.2735536533008242, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38838496.0, "logits/rejected": -14259477.333333334, "logps/chosen": -348.4076843261719, "logps/rejected": -160.84856160481772, "loss": 0.2219, "rewards/chosen": 0.11996918171644211, "rewards/margins": 2.0856179917852087, "rewards/rejected": -1.9656488100687664, "step": 5161 }, { "epoch": 0.27360665730262634, "grad_norm": 60.0, "kl": 3.4386186599731445, "learning_rate": 5e-07, "logits/chosen": -32418168.0, "logits/rejected": 2074651.25, "logps/chosen": -540.3704020182291, "logps/rejected": -141.658935546875, "loss": 0.4448, "rewards/chosen": 0.5071926116943359, "rewards/margins": 1.4713947176933289, "rewards/rejected": -0.9642021059989929, "step": 5162 }, { "epoch": 0.2736596613044285, "grad_norm": 41.5, "kl": 0.3591957092285156, "learning_rate": 5e-07, "logits/chosen": 2719747.2, "logits/rejected": -8715424.0, "logps/chosen": -140.222119140625, "logps/rejected": -489.8172200520833, "loss": 0.3758, "rewards/chosen": -0.05623656511306763, "rewards/margins": 1.8702262441317241, "rewards/rejected": -1.9264628092447917, "step": 5163 }, { "epoch": 0.2737126653062306, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2325088.75, "logits/rejected": -4015704.25, "logps/chosen": -90.3291244506836, "logps/rejected": -117.5586166381836, "loss": 0.3412, "rewards/chosen": 0.2298232614994049, "rewards/margins": 1.4478220045566559, "rewards/rejected": -1.217998743057251, "step": 5164 }, { "epoch": 0.27376566930803276, "grad_norm": 49.75, "kl": 0.3668975830078125, "learning_rate": 5e-07, "logits/chosen": -14078850.285714285, "logits/rejected": -100036328.0, "logps/chosen": -195.69930594308036, "logps/rejected": -264.52203369140625, "loss": 0.4298, "rewards/chosen": 0.28744660105024067, "rewards/margins": 1.1301162668636868, "rewards/rejected": -0.842669665813446, "step": 5165 }, { "epoch": 0.2738186733098349, "grad_norm": 59.25, "kl": 0.43672943115234375, "learning_rate": 5e-07, "logits/chosen": -53284880.0, "logits/rejected": -24803850.666666668, "logps/chosen": -282.6678955078125, "logps/rejected": -351.4041748046875, "loss": 0.3716, "rewards/chosen": 0.03632477223873139, "rewards/margins": 1.6774210462967556, "rewards/rejected": -1.6410962740580242, "step": 5166 }, { "epoch": 0.27387167731163703, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32640501.333333332, "logits/rejected": -19475484.0, "logps/chosen": -229.92228190104166, "logps/rejected": -307.3433837890625, "loss": 0.3498, "rewards/chosen": 0.16999773184458414, "rewards/margins": 3.9744253555933633, "rewards/rejected": -3.8044276237487793, "step": 5167 }, { "epoch": 0.27392468131343917, "grad_norm": 52.25, "kl": 0.34577083587646484, "learning_rate": 5e-07, "logits/chosen": -1387991.1666666667, "logits/rejected": -3449369.75, "logps/chosen": -324.87559000651044, "logps/rejected": -293.95355224609375, "loss": 0.3851, "rewards/chosen": 0.2149869998296102, "rewards/margins": 2.1333195765813193, "rewards/rejected": -1.918332576751709, "step": 5168 }, { "epoch": 0.2739776853152413, "grad_norm": 59.0, "kl": 0.057476043701171875, "learning_rate": 5e-07, "logits/chosen": -44695312.0, "logits/rejected": -69231856.0, "logps/chosen": -376.4093831380208, "logps/rejected": -489.44036865234375, "loss": 0.347, "rewards/chosen": 0.2874082128206889, "rewards/margins": 2.961898664633433, "rewards/rejected": -2.674490451812744, "step": 5169 }, { "epoch": 0.27403068931704344, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73353176.0, "logits/rejected": -6426833.5, "logps/chosen": -196.10690307617188, "logps/rejected": -150.8354034423828, "loss": 0.3547, "rewards/chosen": 0.09677819162607193, "rewards/margins": 1.5792909041047096, "rewards/rejected": -1.4825127124786377, "step": 5170 }, { "epoch": 0.2740836933188456, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60211316.0, "logits/rejected": 9219803.0, "logps/chosen": -401.7010498046875, "logps/rejected": -440.08746337890625, "loss": 0.3073, "rewards/chosen": 0.17799873650074005, "rewards/margins": 2.5878271609544754, "rewards/rejected": -2.4098284244537354, "step": 5171 }, { "epoch": 0.2741366973206477, "grad_norm": 58.5, "kl": 1.4265575408935547, "learning_rate": 5e-07, "logits/chosen": -35947526.4, "logits/rejected": -22485490.666666668, "logps/chosen": -288.770361328125, "logps/rejected": -237.973388671875, "loss": 0.3703, "rewards/chosen": 0.31457808017730715, "rewards/margins": 1.7821237484614056, "rewards/rejected": -1.4675456682840984, "step": 5172 }, { "epoch": 0.27418970132244985, "grad_norm": 52.25, "kl": 0.6918563842773438, "learning_rate": 5e-07, "logits/chosen": -23047324.0, "logits/rejected": -24659770.0, "logps/chosen": -307.1009521484375, "logps/rejected": -190.3707275390625, "loss": 0.3108, "rewards/chosen": 0.640669047832489, "rewards/margins": 1.8129846453666687, "rewards/rejected": -1.1723155975341797, "step": 5173 }, { "epoch": 0.274242705324252, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87010352.0, "logits/rejected": -19911908.57142857, "logps/chosen": -160.68048095703125, "logps/rejected": -232.13692801339286, "loss": 0.2323, "rewards/chosen": -0.09903259575366974, "rewards/margins": 1.72907278580325, "rewards/rejected": -1.8281053815569197, "step": 5174 }, { "epoch": 0.2742957093260541, "grad_norm": 69.0, "kl": 0.9046897888183594, "learning_rate": 5e-07, "logits/chosen": -80520778.66666667, "logits/rejected": -67662328.0, "logps/chosen": -438.4629720052083, "logps/rejected": -203.35916137695312, "loss": 0.3728, "rewards/chosen": 0.4804784456888835, "rewards/margins": 1.6801199118296306, "rewards/rejected": -1.199641466140747, "step": 5175 }, { "epoch": 0.27434871332785626, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21474109.714285713, "logits/rejected": -22243356.0, "logps/chosen": -184.32291085379464, "logps/rejected": -291.1886901855469, "loss": 0.5142, "rewards/chosen": -0.30051486832754953, "rewards/margins": 1.5274301256452287, "rewards/rejected": -1.8279449939727783, "step": 5176 }, { "epoch": 0.2744017173296584, "grad_norm": 39.75, "kl": 0.47760486602783203, "learning_rate": 5e-07, "logits/chosen": -9968787.2, "logits/rejected": -44063440.0, "logps/chosen": -102.12372436523438, "logps/rejected": -221.8268025716146, "loss": 0.286, "rewards/chosen": 0.5807719230651855, "rewards/margins": 2.5036090215047198, "rewards/rejected": -1.9228370984395344, "step": 5177 }, { "epoch": 0.27445472133146054, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -253990.875, "logits/rejected": -21194813.333333332, "logps/chosen": -129.8328399658203, "logps/rejected": -300.7880859375, "loss": 0.2024, "rewards/chosen": 0.4075676202774048, "rewards/margins": 2.5544573068618774, "rewards/rejected": -2.1468896865844727, "step": 5178 }, { "epoch": 0.2745077253332627, "grad_norm": 50.25, "kl": 0.758305549621582, "learning_rate": 5e-07, "logits/chosen": -57679350.85714286, "logits/rejected": -26094120.0, "logps/chosen": -223.19670758928572, "logps/rejected": -280.9598388671875, "loss": 0.4051, "rewards/chosen": 0.2732828174318586, "rewards/margins": 3.215223125049046, "rewards/rejected": -2.9419403076171875, "step": 5179 }, { "epoch": 0.2745607293350648, "grad_norm": 60.25, "kl": 0.6735420227050781, "learning_rate": 5e-07, "logits/chosen": -32541558.0, "logits/rejected": -17201644.0, "logps/chosen": -499.56622314453125, "logps/rejected": -219.52001953125, "loss": 0.2982, "rewards/chosen": 0.5269851684570312, "rewards/margins": 2.065763473510742, "rewards/rejected": -1.538778305053711, "step": 5180 }, { "epoch": 0.27461373333686695, "grad_norm": 52.5, "kl": 0.32501220703125, "learning_rate": 5e-07, "logits/chosen": -5414449.6, "logits/rejected": -24804008.0, "logps/chosen": -245.3853271484375, "logps/rejected": -329.57069905598956, "loss": 0.3084, "rewards/chosen": 0.5874827384948731, "rewards/margins": 2.544507630666097, "rewards/rejected": -1.9570248921712239, "step": 5181 }, { "epoch": 0.2746667373386691, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42208746.666666664, "logits/rejected": -52555184.0, "logps/chosen": -89.94586181640625, "logps/rejected": -402.514111328125, "loss": 0.3128, "rewards/chosen": 0.1734999418258667, "rewards/margins": 1.4886746168136598, "rewards/rejected": -1.315174674987793, "step": 5182 }, { "epoch": 0.2747197413404712, "grad_norm": 68.5, "kl": 1.9934005737304688, "learning_rate": 5e-07, "logits/chosen": -48258812.8, "logits/rejected": -17089798.666666668, "logps/chosen": -542.403564453125, "logps/rejected": -293.5890706380208, "loss": 0.3282, "rewards/chosen": 0.7864404678344726, "rewards/margins": 2.412222957611084, "rewards/rejected": -1.6257824897766113, "step": 5183 }, { "epoch": 0.27477274534227336, "grad_norm": 48.75, "kl": 0.19908905029296875, "learning_rate": 5e-07, "logits/chosen": -46649258.666666664, "logits/rejected": 2437274.0, "logps/chosen": -359.086181640625, "logps/rejected": -316.623828125, "loss": 0.3349, "rewards/chosen": 0.556824247042338, "rewards/margins": 1.6592258850733437, "rewards/rejected": -1.1024016380310058, "step": 5184 }, { "epoch": 0.2748257493440755, "grad_norm": 86.5, "kl": 1.47052001953125, "learning_rate": 5e-07, "logits/chosen": 82796928.0, "logits/rejected": -15715508.0, "logps/chosen": -2342.934814453125, "logps/rejected": -196.01566569010416, "loss": 0.1968, "rewards/chosen": 0.7485044598579407, "rewards/margins": 2.9732966224352517, "rewards/rejected": -2.224792162577311, "step": 5185 }, { "epoch": 0.27487875334587764, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19892330.666666668, "logits/rejected": -11998166.4, "logps/chosen": -165.9704386393229, "logps/rejected": -299.033447265625, "loss": 0.2368, "rewards/chosen": 0.4033048152923584, "rewards/margins": 2.3556071758270263, "rewards/rejected": -1.9523023605346679, "step": 5186 }, { "epoch": 0.2749317573476798, "grad_norm": 56.25, "kl": 0.01604461669921875, "learning_rate": 5e-07, "logits/chosen": -58330041.6, "logits/rejected": -1670861.3333333333, "logps/chosen": -391.107958984375, "logps/rejected": -206.67232259114584, "loss": 0.3354, "rewards/chosen": 0.06415696144104004, "rewards/margins": 2.6956344763437907, "rewards/rejected": -2.6314775149027505, "step": 5187 }, { "epoch": 0.2749847613494819, "grad_norm": 53.0, "kl": 1.0770282745361328, "learning_rate": 5e-07, "logits/chosen": -29010457.6, "logits/rejected": -21614449.333333332, "logps/chosen": -223.383837890625, "logps/rejected": -171.5501708984375, "loss": 0.3078, "rewards/chosen": 0.7249484539031983, "rewards/margins": 2.037454334894816, "rewards/rejected": -1.312505880991618, "step": 5188 }, { "epoch": 0.27503776535128405, "grad_norm": 56.25, "kl": 4.628047943115234, "learning_rate": 5e-07, "logits/chosen": -24175725.333333332, "logits/rejected": -37442280.0, "logps/chosen": -419.2421875, "logps/rejected": -633.503173828125, "loss": 0.3642, "rewards/chosen": 0.8597884972890218, "rewards/margins": 2.9097793896993003, "rewards/rejected": -2.0499908924102783, "step": 5189 }, { "epoch": 0.2750907693530862, "grad_norm": 79.0, "kl": 0.4916553497314453, "learning_rate": 5e-07, "logits/chosen": -27712056.0, "logits/rejected": 468850.0, "logps/chosen": -285.6796875, "logps/rejected": -278.4600524902344, "loss": 0.2718, "rewards/chosen": 0.9568922519683838, "rewards/margins": 2.2663553953170776, "rewards/rejected": -1.3094631433486938, "step": 5190 }, { "epoch": 0.2751437733548883, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29789176.0, "logits/rejected": 121956.8, "logps/chosen": -200.2052001953125, "logps/rejected": -424.382177734375, "loss": 0.2935, "rewards/chosen": 0.200413445631663, "rewards/margins": 1.8705544749895733, "rewards/rejected": -1.6701410293579102, "step": 5191 }, { "epoch": 0.2751967773566904, "grad_norm": 47.75, "kl": 0.526153564453125, "learning_rate": 5e-07, "logits/chosen": -32894508.8, "logits/rejected": -15202472.0, "logps/chosen": -191.82242431640626, "logps/rejected": -243.7337646484375, "loss": 0.4077, "rewards/chosen": -0.08775740265846252, "rewards/margins": 1.7158713599046072, "rewards/rejected": -1.8036287625630696, "step": 5192 }, { "epoch": 0.27524978135849254, "grad_norm": 53.75, "kl": 0.8321609497070312, "learning_rate": 5e-07, "logits/chosen": -55599416.0, "logits/rejected": -17398836.0, "logps/chosen": -333.6104431152344, "logps/rejected": -356.9586181640625, "loss": 0.3689, "rewards/chosen": -0.32824259996414185, "rewards/margins": 1.7945770621299744, "rewards/rejected": -2.122819662094116, "step": 5193 }, { "epoch": 0.2753027853602947, "grad_norm": 54.25, "kl": 1.1412582397460938, "learning_rate": 5e-07, "logits/chosen": -15743878.4, "logits/rejected": -22782146.666666668, "logps/chosen": -202.97374267578124, "logps/rejected": -179.08154296875, "loss": 0.4201, "rewards/chosen": 0.25421719551086425, "rewards/margins": 1.0378854751586915, "rewards/rejected": -0.7836682796478271, "step": 5194 }, { "epoch": 0.2753557893620968, "grad_norm": 69.0, "kl": 2.6932945251464844, "learning_rate": 5e-07, "logits/chosen": -41777216.0, "logits/rejected": -20046788.0, "logps/chosen": -430.0082194010417, "logps/rejected": -277.1730041503906, "loss": 0.4242, "rewards/chosen": 0.4886217912038167, "rewards/margins": 1.375030775864919, "rewards/rejected": -0.8864089846611023, "step": 5195 }, { "epoch": 0.27540879336389895, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9466616.666666666, "logits/rejected": -13076526.4, "logps/chosen": -149.13967895507812, "logps/rejected": -224.91083984375, "loss": 0.2973, "rewards/chosen": 0.15042470892270407, "rewards/margins": 1.6647647281487783, "rewards/rejected": -1.5143400192260743, "step": 5196 }, { "epoch": 0.2754617973657011, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62819333.333333336, "logits/rejected": -9707796.8, "logps/chosen": -333.7398274739583, "logps/rejected": -243.0572265625, "loss": 0.2289, "rewards/chosen": 0.7745452721913656, "rewards/margins": 2.8491496880849203, "rewards/rejected": -2.0746044158935546, "step": 5197 }, { "epoch": 0.2755148013675032, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44755264.0, "logits/rejected": -30097472.0, "logps/chosen": -884.4479166666666, "logps/rejected": -301.0107666015625, "loss": 0.2253, "rewards/chosen": 0.8337494532267252, "rewards/margins": 2.7182083765665688, "rewards/rejected": -1.8844589233398437, "step": 5198 }, { "epoch": 0.27556780536930536, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31234205.333333332, "logits/rejected": -43264348.8, "logps/chosen": -416.303466796875, "logps/rejected": -279.068017578125, "loss": 0.3342, "rewards/chosen": 0.036021292209625244, "rewards/margins": 1.3094884514808656, "rewards/rejected": -1.2734671592712403, "step": 5199 }, { "epoch": 0.2756208093711075, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5744540.5, "logits/rejected": -15914253.714285715, "logps/chosen": -8.701812744140625, "logps/rejected": -356.26743861607144, "loss": 0.2072, "rewards/chosen": 0.08071766048669815, "rewards/margins": 1.8406937750322478, "rewards/rejected": -1.7599761145455497, "step": 5200 }, { "epoch": 0.27567381337290964, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9700068.0, "logits/rejected": -23536704.0, "logps/chosen": -87.63686116536458, "logps/rejected": -370.9786865234375, "loss": 0.314, "rewards/chosen": -0.30396831035614014, "rewards/margins": 1.6099114656448363, "rewards/rejected": -1.9138797760009765, "step": 5201 }, { "epoch": 0.2757268173747118, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67828648.0, "logits/rejected": -21334160.0, "logps/chosen": -505.732177734375, "logps/rejected": -283.1048990885417, "loss": 0.2608, "rewards/chosen": 0.271157830953598, "rewards/margins": 2.30777574578921, "rewards/rejected": -2.036617914835612, "step": 5202 }, { "epoch": 0.2757798213765139, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9393168.666666666, "logits/rejected": -3177124.8, "logps/chosen": -163.67974853515625, "logps/rejected": -135.5418701171875, "loss": 0.3167, "rewards/chosen": -0.16914814710617065, "rewards/margins": 1.5676100850105286, "rewards/rejected": -1.7367582321166992, "step": 5203 }, { "epoch": 0.27583282537831605, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48531274.666666664, "logits/rejected": -6171554.4, "logps/chosen": -233.0728556315104, "logps/rejected": -160.732666015625, "loss": 0.3923, "rewards/chosen": -0.021318763494491577, "rewards/margins": 0.8127214968204498, "rewards/rejected": -0.8340402603149414, "step": 5204 }, { "epoch": 0.2758858293801182, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -185518.5, "logits/rejected": -9314693.333333334, "logps/chosen": -59.39839553833008, "logps/rejected": -141.83209228515625, "loss": 0.2981, "rewards/chosen": -0.19879204034805298, "rewards/margins": 1.2803790767987568, "rewards/rejected": -1.4791711171468098, "step": 5205 }, { "epoch": 0.2759388333819203, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94585845.33333333, "logits/rejected": -11292612.8, "logps/chosen": -418.9154459635417, "logps/rejected": -273.04873046875, "loss": 0.3643, "rewards/chosen": -0.05606891711552938, "rewards/margins": 1.043360976378123, "rewards/rejected": -1.0994298934936524, "step": 5206 }, { "epoch": 0.27599183738372246, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66694860.8, "logits/rejected": -14258038.666666666, "logps/chosen": -117.51583251953124, "logps/rejected": -143.18685913085938, "loss": 0.339, "rewards/chosen": 0.17703256607055665, "rewards/margins": 2.0585375785827638, "rewards/rejected": -1.881505012512207, "step": 5207 }, { "epoch": 0.2760448413855246, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3946815.3333333335, "logits/rejected": -5196583.2, "logps/chosen": -122.67018636067708, "logps/rejected": -193.22003173828125, "loss": 0.2726, "rewards/chosen": 0.1213517685731252, "rewards/margins": 2.399890281756719, "rewards/rejected": -2.2785385131835936, "step": 5208 }, { "epoch": 0.27609784538732673, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26825322.0, "logits/rejected": -36972412.0, "logps/chosen": -227.10423278808594, "logps/rejected": -226.0242919921875, "loss": 0.3651, "rewards/chosen": 0.181039497256279, "rewards/margins": 1.367532417178154, "rewards/rejected": -1.186492919921875, "step": 5209 }, { "epoch": 0.27615084938912887, "grad_norm": 55.75, "kl": 0.23046875, "learning_rate": 5e-07, "logits/chosen": -4845560.0, "logits/rejected": -5928005.0, "logps/chosen": -255.25685119628906, "logps/rejected": -411.2147521972656, "loss": 0.3681, "rewards/chosen": -0.16948461532592773, "rewards/margins": 1.573122262954712, "rewards/rejected": -1.7426068782806396, "step": 5210 }, { "epoch": 0.276203853390931, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21851198.4, "logits/rejected": -5183635.333333333, "logps/chosen": -221.98671875, "logps/rejected": -88.33784993489583, "loss": 0.4291, "rewards/chosen": -0.13602697849273682, "rewards/margins": 1.1215752363204956, "rewards/rejected": -1.2576022148132324, "step": 5211 }, { "epoch": 0.27625685739273315, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51338048.0, "logits/rejected": -40703616.0, "logps/chosen": -332.452392578125, "logps/rejected": -413.15625, "loss": 0.3695, "rewards/chosen": 0.07876692712306976, "rewards/margins": 1.56413172185421, "rewards/rejected": -1.4853647947311401, "step": 5212 }, { "epoch": 0.2763098613945353, "grad_norm": 47.0, "kl": 0.3354072570800781, "learning_rate": 5e-07, "logits/chosen": -20023576.0, "logits/rejected": -6489975.5, "logps/chosen": -335.62469482421875, "logps/rejected": -186.76727294921875, "loss": 0.2715, "rewards/chosen": 0.6028828620910645, "rewards/margins": 2.325774073600769, "rewards/rejected": -1.7228912115097046, "step": 5213 }, { "epoch": 0.2763628653963374, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13150102.4, "logits/rejected": -19503522.666666668, "logps/chosen": -278.4277099609375, "logps/rejected": -143.8885498046875, "loss": 0.2937, "rewards/chosen": 0.8074891090393066, "rewards/margins": 2.1985566139221193, "rewards/rejected": -1.3910675048828125, "step": 5214 }, { "epoch": 0.27641586939813956, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34110572.8, "logits/rejected": -15344286.666666666, "logps/chosen": -474.198046875, "logps/rejected": -432.2649739583333, "loss": 0.3233, "rewards/chosen": 0.3935573101043701, "rewards/margins": 2.7277286052703857, "rewards/rejected": -2.3341712951660156, "step": 5215 }, { "epoch": 0.2764688733999417, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18156946.0, "logits/rejected": -34921952.0, "logps/chosen": -361.1380615234375, "logps/rejected": -360.849365234375, "loss": 0.2464, "rewards/chosen": 0.9731106162071228, "rewards/margins": 2.599919378757477, "rewards/rejected": -1.626808762550354, "step": 5216 }, { "epoch": 0.27652187740174383, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22016898.666666668, "logits/rejected": -31131180.8, "logps/chosen": -442.7899576822917, "logps/rejected": -379.9462158203125, "loss": 0.2814, "rewards/chosen": 0.3983917236328125, "rewards/margins": 2.1838710784912108, "rewards/rejected": -1.7854793548583985, "step": 5217 }, { "epoch": 0.27657488140354597, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22239856.0, "logits/rejected": -41387978.666666664, "logps/chosen": -342.0759521484375, "logps/rejected": -378.515380859375, "loss": 0.3052, "rewards/chosen": 0.3416236639022827, "rewards/margins": 3.1002731720606485, "rewards/rejected": -2.7586495081583657, "step": 5218 }, { "epoch": 0.2766278854053481, "grad_norm": 44.5, "kl": 0.6698226928710938, "learning_rate": 5e-07, "logits/chosen": -45170874.666666664, "logits/rejected": -34210361.6, "logps/chosen": -442.3714192708333, "logps/rejected": -219.398974609375, "loss": 0.2395, "rewards/chosen": 0.39534302552541095, "rewards/margins": 2.832381828625997, "rewards/rejected": -2.437038803100586, "step": 5219 }, { "epoch": 0.27668088940715024, "grad_norm": 71.5, "kl": 4.352928161621094, "learning_rate": 5e-07, "logits/chosen": -5887332.8, "logits/rejected": -1938530.3333333333, "logps/chosen": -604.38486328125, "logps/rejected": -232.16414388020834, "loss": 0.3128, "rewards/chosen": 0.999871826171875, "rewards/margins": 2.933042462666829, "rewards/rejected": -1.9331706364949544, "step": 5220 }, { "epoch": 0.2767338934089524, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3176748.75, "logits/rejected": -17374364.0, "logps/chosen": -184.88186645507812, "logps/rejected": -236.15608723958334, "loss": 0.2681, "rewards/chosen": 0.614862859249115, "rewards/margins": 2.0290202498435974, "rewards/rejected": -1.4141573905944824, "step": 5221 }, { "epoch": 0.2767868974107545, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34435072.0, "logits/rejected": -43815616.0, "logps/chosen": -304.96405029296875, "logps/rejected": -239.901611328125, "loss": 0.351, "rewards/chosen": -0.021203413605690002, "rewards/margins": 1.3846250623464584, "rewards/rejected": -1.4058284759521484, "step": 5222 }, { "epoch": 0.27683990141255665, "grad_norm": 54.75, "kl": 1.02569580078125, "learning_rate": 5e-07, "logits/chosen": -8832076.57142857, "logits/rejected": 316196288.0, "logps/chosen": -172.574951171875, "logps/rejected": -902.1761474609375, "loss": 0.3895, "rewards/chosen": 0.40746586663382395, "rewards/margins": 2.7637831483568465, "rewards/rejected": -2.3563172817230225, "step": 5223 }, { "epoch": 0.2768929054143588, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -105678624.0, "logits/rejected": -13536926.4, "logps/chosen": -678.744873046875, "logps/rejected": -361.2942138671875, "loss": 0.2448, "rewards/chosen": 0.39432676633199054, "rewards/margins": 2.458736022313436, "rewards/rejected": -2.0644092559814453, "step": 5224 }, { "epoch": 0.27694590941616093, "grad_norm": 46.5, "kl": 0.5067481994628906, "learning_rate": 5e-07, "logits/chosen": -22545174.0, "logits/rejected": -30275260.0, "logps/chosen": -135.55474853515625, "logps/rejected": -454.39404296875, "loss": 0.3084, "rewards/chosen": -0.09666995704174042, "rewards/margins": 2.8540792018175125, "rewards/rejected": -2.950749158859253, "step": 5225 }, { "epoch": 0.27699891341796307, "grad_norm": 63.75, "kl": 1.8316726684570312, "learning_rate": 5e-07, "logits/chosen": -7614771.333333333, "logits/rejected": -61669320.0, "logps/chosen": -446.969970703125, "logps/rejected": -462.0129089355469, "loss": 0.3706, "rewards/chosen": 0.5887102286020914, "rewards/margins": 2.2727572123209634, "rewards/rejected": -1.684046983718872, "step": 5226 }, { "epoch": 0.2770519174197652, "grad_norm": 72.0, "kl": 1.3426074981689453, "learning_rate": 5e-07, "logits/chosen": -8249286.0, "logits/rejected": -13899600.0, "logps/chosen": -773.5384521484375, "logps/rejected": -134.62423706054688, "loss": 0.3362, "rewards/chosen": 0.5638822317123413, "rewards/margins": 1.5863178968429565, "rewards/rejected": -1.0224356651306152, "step": 5227 }, { "epoch": 0.27710492142156734, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16184802.666666666, "logits/rejected": -9675218.4, "logps/chosen": -124.52581787109375, "logps/rejected": -169.01146240234374, "loss": 0.3653, "rewards/chosen": -0.3007415135701497, "rewards/margins": 1.155229632059733, "rewards/rejected": -1.4559711456298827, "step": 5228 }, { "epoch": 0.2771579254233695, "grad_norm": 50.0, "kl": 0.10258102416992188, "learning_rate": 5e-07, "logits/chosen": -49438032.0, "logits/rejected": -49200744.0, "logps/chosen": -454.88531494140625, "logps/rejected": -445.1062927246094, "loss": 0.2506, "rewards/chosen": 0.21619606018066406, "rewards/margins": 3.4273245334625244, "rewards/rejected": -3.2111284732818604, "step": 5229 }, { "epoch": 0.2772109294251716, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32255769.6, "logits/rejected": -22270656.0, "logps/chosen": -365.3222900390625, "logps/rejected": -132.1153564453125, "loss": 0.301, "rewards/chosen": 0.3603094816207886, "rewards/margins": 2.6245805819829306, "rewards/rejected": -2.264271100362142, "step": 5230 }, { "epoch": 0.27726393342697375, "grad_norm": 70.5, "kl": 2.8163394927978516, "learning_rate": 5e-07, "logits/chosen": -65271030.85714286, "logits/rejected": -35994056.0, "logps/chosen": -477.49400111607144, "logps/rejected": -389.51495361328125, "loss": 0.4419, "rewards/chosen": 0.4978545052664621, "rewards/margins": 1.4145506961005074, "rewards/rejected": -0.9166961908340454, "step": 5231 }, { "epoch": 0.2773169374287759, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40548268.8, "logits/rejected": -45257205.333333336, "logps/chosen": -179.48194580078126, "logps/rejected": -267.4737955729167, "loss": 0.4256, "rewards/chosen": -0.14968631267547608, "rewards/margins": 1.012773060798645, "rewards/rejected": -1.162459373474121, "step": 5232 }, { "epoch": 0.277369941430578, "grad_norm": 95.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37875360.0, "logits/rejected": -17128820.0, "logps/chosen": -412.7759195963542, "logps/rejected": -176.88851928710938, "loss": 0.435, "rewards/chosen": -0.05152283112208048, "rewards/margins": 1.4213844736417134, "rewards/rejected": -1.472907304763794, "step": 5233 }, { "epoch": 0.27742294543238016, "grad_norm": 51.0, "kl": 1.1372299194335938, "learning_rate": 5e-07, "logits/chosen": 16296633.0, "logits/rejected": -28354878.0, "logps/chosen": -396.70001220703125, "logps/rejected": -467.73388671875, "loss": 0.2534, "rewards/chosen": 0.5776236653327942, "rewards/margins": 3.0803382992744446, "rewards/rejected": -2.5027146339416504, "step": 5234 }, { "epoch": 0.2774759494341823, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5088666.0, "logits/rejected": -7629337.0, "logps/chosen": -253.1201934814453, "logps/rejected": -236.06320190429688, "loss": 0.4233, "rewards/chosen": -0.305294394493103, "rewards/margins": 0.6317877173423767, "rewards/rejected": -0.9370821118354797, "step": 5235 }, { "epoch": 0.27752895343598444, "grad_norm": 39.25, "kl": 0.8929176330566406, "learning_rate": 5e-07, "logits/chosen": -19754432.0, "logits/rejected": 581167.375, "logps/chosen": -583.7767944335938, "logps/rejected": -151.99368286132812, "loss": 0.222, "rewards/chosen": 1.133326768875122, "rewards/margins": 2.8061306476593018, "rewards/rejected": -1.6728038787841797, "step": 5236 }, { "epoch": 0.2775819574377866, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4513865.0, "logits/rejected": -42321288.0, "logps/chosen": -78.18701934814453, "logps/rejected": -249.068359375, "loss": 0.3047, "rewards/chosen": 0.4332031011581421, "rewards/margins": 1.9478477239608765, "rewards/rejected": -1.5146446228027344, "step": 5237 }, { "epoch": 0.2776349614395887, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40378608.0, "logits/rejected": -21688446.666666668, "logps/chosen": -268.5556396484375, "logps/rejected": -208.62215169270834, "loss": 0.3065, "rewards/chosen": 0.38007493019104005, "rewards/margins": 2.3115524768829347, "rewards/rejected": -1.9314775466918945, "step": 5238 }, { "epoch": 0.27768796544139085, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63770704.0, "logits/rejected": -20866936.0, "logps/chosen": -466.59637451171875, "logps/rejected": -238.44442749023438, "loss": 0.362, "rewards/chosen": -0.3133419156074524, "rewards/margins": 1.9142994284629822, "rewards/rejected": -2.2276413440704346, "step": 5239 }, { "epoch": 0.277740969443193, "grad_norm": 78.0, "kl": 1.6627273559570312, "learning_rate": 5e-07, "logits/chosen": -20788672.0, "logits/rejected": -33542076.0, "logps/chosen": -697.4176635742188, "logps/rejected": -265.0924987792969, "loss": 0.3855, "rewards/chosen": 0.43452340364456177, "rewards/margins": 1.2127838134765625, "rewards/rejected": -0.7782604098320007, "step": 5240 }, { "epoch": 0.2777939734449951, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45876080.0, "logits/rejected": -34511715.2, "logps/chosen": -368.43798828125, "logps/rejected": -416.464208984375, "loss": 0.2765, "rewards/chosen": -0.026533504327138264, "rewards/margins": 2.3690635720888773, "rewards/rejected": -2.3955970764160157, "step": 5241 }, { "epoch": 0.27784697744679726, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25648892.0, "logits/rejected": -13571986.0, "logps/chosen": -517.7318115234375, "logps/rejected": -412.124267578125, "loss": 0.2118, "rewards/chosen": 0.8570291996002197, "rewards/margins": 3.513192653656006, "rewards/rejected": -2.656163454055786, "step": 5242 }, { "epoch": 0.27789998144859934, "grad_norm": 63.0, "kl": 1.7819318771362305, "learning_rate": 5e-07, "logits/chosen": -9047826.285714285, "logits/rejected": -49911784.0, "logps/chosen": -332.6477748325893, "logps/rejected": -387.778564453125, "loss": 0.3991, "rewards/chosen": 0.4995203358786447, "rewards/margins": 2.040725146021162, "rewards/rejected": -1.541204810142517, "step": 5243 }, { "epoch": 0.2779529854504015, "grad_norm": 64.0, "kl": 3.3073577880859375, "learning_rate": 5e-07, "logits/chosen": -49816566.85714286, "logits/rejected": 12658248.0, "logps/chosen": -480.65011160714283, "logps/rejected": -774.0619506835938, "loss": 0.4354, "rewards/chosen": 0.47515480858939035, "rewards/margins": 2.3925926004137312, "rewards/rejected": -1.9174377918243408, "step": 5244 }, { "epoch": 0.2780059894522036, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32750648.0, "logits/rejected": -20267587.2, "logps/chosen": -692.43994140625, "logps/rejected": -244.40478515625, "loss": 0.3, "rewards/chosen": 0.2141903042793274, "rewards/margins": 1.78254576921463, "rewards/rejected": -1.5683554649353026, "step": 5245 }, { "epoch": 0.27805899345400575, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10307480.0, "logits/rejected": 5903725.0, "logps/chosen": -169.80117797851562, "logps/rejected": -51.17205047607422, "loss": 0.4171, "rewards/chosen": 0.19774174690246582, "rewards/margins": 1.0867323279380798, "rewards/rejected": -0.888990581035614, "step": 5246 }, { "epoch": 0.2781119974558079, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12679664.0, "logits/rejected": 4052933.5, "logps/chosen": -461.8228454589844, "logps/rejected": -121.0260238647461, "loss": 0.374, "rewards/chosen": 0.16815799474716187, "rewards/margins": 1.237934172153473, "rewards/rejected": -1.069776177406311, "step": 5247 }, { "epoch": 0.27816500145761003, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47156552.0, "logits/rejected": -61602048.0, "logps/chosen": -361.258544921875, "logps/rejected": -309.3972574869792, "loss": 0.3095, "rewards/chosen": 0.37462615966796875, "rewards/margins": 1.5529396533966064, "rewards/rejected": -1.1783134937286377, "step": 5248 }, { "epoch": 0.27821800545941217, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11894845.333333334, "logits/rejected": 4216422.4, "logps/chosen": -17.07388178507487, "logps/rejected": -456.6095703125, "loss": 0.2646, "rewards/chosen": 0.15624910593032837, "rewards/margins": 2.5076548218727113, "rewards/rejected": -2.351405715942383, "step": 5249 }, { "epoch": 0.2782710094612143, "grad_norm": 52.0, "kl": 0.7571964263916016, "learning_rate": 5e-07, "logits/chosen": -21388262.666666668, "logits/rejected": -34847811.2, "logps/chosen": -345.2815348307292, "logps/rejected": -247.379248046875, "loss": 0.3263, "rewards/chosen": 0.7273276646931967, "rewards/margins": 1.82436154683431, "rewards/rejected": -1.0970338821411132, "step": 5250 }, { "epoch": 0.27832401346301644, "grad_norm": 47.5, "kl": 0.44460105895996094, "learning_rate": 5e-07, "logits/chosen": -34626418.666666664, "logits/rejected": -17164171.2, "logps/chosen": -286.0582275390625, "logps/rejected": -331.26201171875, "loss": 0.243, "rewards/chosen": 0.17144624392191568, "rewards/margins": 3.185201088587443, "rewards/rejected": -3.0137548446655273, "step": 5251 }, { "epoch": 0.2783770174648186, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9660100.0, "logits/rejected": -21949110.666666668, "logps/chosen": -140.8350341796875, "logps/rejected": -618.6402180989584, "loss": 0.3236, "rewards/chosen": 0.10210174322128296, "rewards/margins": 3.8494076132774353, "rewards/rejected": -3.7473058700561523, "step": 5252 }, { "epoch": 0.2784300214666207, "grad_norm": 29.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1563860.8, "logits/rejected": 1148397.3333333333, "logps/chosen": -38.700128173828126, "logps/rejected": -54.747894287109375, "loss": 0.3736, "rewards/chosen": 0.39623634815216063, "rewards/margins": 1.2395257234573365, "rewards/rejected": -0.8432893753051758, "step": 5253 }, { "epoch": 0.27848302546842285, "grad_norm": 42.75, "kl": 0.3227996826171875, "learning_rate": 5e-07, "logits/chosen": -21964300.8, "logits/rejected": -41950144.0, "logps/chosen": -161.27469482421876, "logps/rejected": -468.2204996744792, "loss": 0.3665, "rewards/chosen": -0.09134637117385865, "rewards/margins": 2.2054652094841005, "rewards/rejected": -2.296811580657959, "step": 5254 }, { "epoch": 0.278536029470225, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2214609.3333333335, "logits/rejected": -19248096.0, "logps/chosen": -132.83866373697916, "logps/rejected": -308.0245361328125, "loss": 0.2642, "rewards/chosen": -0.09058557947476704, "rewards/margins": 2.3786187509695687, "rewards/rejected": -2.469204330444336, "step": 5255 }, { "epoch": 0.2785890334720271, "grad_norm": 66.0, "kl": 0.5967741012573242, "learning_rate": 5e-07, "logits/chosen": -9833425.142857144, "logits/rejected": -6894901.0, "logps/chosen": -232.32425362723214, "logps/rejected": -78.75434875488281, "loss": 0.4798, "rewards/chosen": -0.024618417024612427, "rewards/margins": 1.5142564475536346, "rewards/rejected": -1.538874864578247, "step": 5256 }, { "epoch": 0.27864203747382926, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17443332.0, "logits/rejected": -6842890.4, "logps/chosen": -332.719970703125, "logps/rejected": -355.944091796875, "loss": 0.2555, "rewards/chosen": 0.24647319316864014, "rewards/margins": 2.273493456840515, "rewards/rejected": -2.027020263671875, "step": 5257 }, { "epoch": 0.2786950414756314, "grad_norm": 52.0, "kl": 0.8977699279785156, "learning_rate": 5e-07, "logits/chosen": -54856144.0, "logits/rejected": -39459092.0, "logps/chosen": -517.482666015625, "logps/rejected": -425.52972412109375, "loss": 0.264, "rewards/chosen": 0.39498063921928406, "rewards/margins": 3.3725540339946747, "rewards/rejected": -2.9775733947753906, "step": 5258 }, { "epoch": 0.27874804547743354, "grad_norm": 57.5, "kl": 1.260498046875, "learning_rate": 5e-07, "logits/chosen": -12109986.0, "logits/rejected": -38991164.0, "logps/chosen": -193.3279571533203, "logps/rejected": -255.27984619140625, "loss": 0.3103, "rewards/chosen": 1.004815697669983, "rewards/margins": 1.9212257862091064, "rewards/rejected": -0.9164100885391235, "step": 5259 }, { "epoch": 0.2788010494792357, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4256589.0, "logits/rejected": -27906138.0, "logps/chosen": -344.85113525390625, "logps/rejected": -288.2922058105469, "loss": 0.3209, "rewards/chosen": 0.21031928062438965, "rewards/margins": 1.9906021356582642, "rewards/rejected": -1.7802828550338745, "step": 5260 }, { "epoch": 0.2788540534810378, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14338446.666666666, "logits/rejected": -21713961.6, "logps/chosen": -187.464111328125, "logps/rejected": -278.2650146484375, "loss": 0.209, "rewards/chosen": 0.6729103724161783, "rewards/margins": 2.898084704081217, "rewards/rejected": -2.225174331665039, "step": 5261 }, { "epoch": 0.27890705748283995, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44392612.0, "logits/rejected": -79850544.0, "logps/chosen": -342.72430419921875, "logps/rejected": -330.1624450683594, "loss": 0.2686, "rewards/chosen": 0.6569267511367798, "rewards/margins": 2.6102124452590942, "rewards/rejected": -1.9532856941223145, "step": 5262 }, { "epoch": 0.2789600614846421, "grad_norm": 48.0, "kl": 0.36746788024902344, "learning_rate": 5e-07, "logits/chosen": -422727.71428571426, "logits/rejected": -18264616.0, "logps/chosen": -138.94346400669642, "logps/rejected": -276.24298095703125, "loss": 0.4445, "rewards/chosen": 0.23420815808432444, "rewards/margins": 0.9216898594583784, "rewards/rejected": -0.687481701374054, "step": 5263 }, { "epoch": 0.2790130654864442, "grad_norm": 53.75, "kl": 2.2392501831054688, "learning_rate": 5e-07, "logits/chosen": -22224617.14285714, "logits/rejected": -55806624.0, "logps/chosen": -303.02493722098217, "logps/rejected": -643.118896484375, "loss": 0.3969, "rewards/chosen": 0.5539144788469587, "rewards/margins": 2.3129355226244246, "rewards/rejected": -1.7590210437774658, "step": 5264 }, { "epoch": 0.27906606948824636, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72120080.0, "logits/rejected": -9479424.0, "logps/chosen": -432.2262878417969, "logps/rejected": -124.86583709716797, "loss": 0.359, "rewards/chosen": 0.12977829575538635, "rewards/margins": 1.3928548395633698, "rewards/rejected": -1.2630765438079834, "step": 5265 }, { "epoch": 0.2791190734900485, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41346656.0, "logits/rejected": -343814.7, "logps/chosen": -304.06943766276044, "logps/rejected": -219.5464599609375, "loss": 0.3894, "rewards/chosen": -0.5696777502695719, "rewards/margins": 0.6740217049916585, "rewards/rejected": -1.2436994552612304, "step": 5266 }, { "epoch": 0.27917207749185063, "grad_norm": 57.0, "kl": 0.189178466796875, "learning_rate": 5e-07, "logits/chosen": -52064601.6, "logits/rejected": -5335534.666666667, "logps/chosen": -331.5306640625, "logps/rejected": -97.31215413411458, "loss": 0.3815, "rewards/chosen": 0.16401655673980714, "rewards/margins": 1.5083085616429646, "rewards/rejected": -1.3442920049031575, "step": 5267 }, { "epoch": 0.27922508149365277, "grad_norm": 53.75, "kl": 0.5018310546875, "learning_rate": 5e-07, "logits/chosen": -42615189.333333336, "logits/rejected": -20570854.4, "logps/chosen": -400.010009765625, "logps/rejected": -354.49658203125, "loss": 0.2697, "rewards/chosen": 0.6856145064036051, "rewards/margins": 2.3275120894114174, "rewards/rejected": -1.6418975830078124, "step": 5268 }, { "epoch": 0.2792780854954549, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70917296.0, "logits/rejected": -22583097.6, "logps/chosen": -489.2123209635417, "logps/rejected": -253.47119140625, "loss": 0.285, "rewards/chosen": 0.24806112051010132, "rewards/margins": 1.8161381125450133, "rewards/rejected": -1.568076992034912, "step": 5269 }, { "epoch": 0.27933108949725705, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9471589.0, "logits/rejected": -33197115.42857143, "logps/chosen": -427.9707946777344, "logps/rejected": -305.20884486607144, "loss": 0.1359, "rewards/chosen": 0.9792450070381165, "rewards/margins": 3.274754617895399, "rewards/rejected": -2.2955096108572826, "step": 5270 }, { "epoch": 0.2793840934990592, "grad_norm": 50.25, "kl": 0.2555255889892578, "learning_rate": 5e-07, "logits/chosen": -7951529.0, "logits/rejected": -19539580.0, "logps/chosen": -403.91754150390625, "logps/rejected": -139.37510681152344, "loss": 0.3175, "rewards/chosen": 0.2864452302455902, "rewards/margins": 1.8104390799999237, "rewards/rejected": -1.5239938497543335, "step": 5271 }, { "epoch": 0.2794370975008613, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63066560.0, "logits/rejected": -12715776.0, "logps/chosen": -442.1338297526042, "logps/rejected": -238.365380859375, "loss": 0.3128, "rewards/chosen": -0.17276002963383993, "rewards/margins": 1.4124717513720195, "rewards/rejected": -1.5852317810058594, "step": 5272 }, { "epoch": 0.27949010150266346, "grad_norm": 41.75, "kl": 0.2879199981689453, "learning_rate": 5e-07, "logits/chosen": -32579187.2, "logits/rejected": 768931.9166666666, "logps/chosen": -313.7240966796875, "logps/rejected": -75.5966796875, "loss": 0.3696, "rewards/chosen": 0.23865368366241455, "rewards/margins": 1.7490463495254516, "rewards/rejected": -1.510392665863037, "step": 5273 }, { "epoch": 0.2795431055044656, "grad_norm": 43.5, "kl": 0.140380859375, "learning_rate": 5e-07, "logits/chosen": -35240276.0, "logits/rejected": -29621656.0, "logps/chosen": -289.09820556640625, "logps/rejected": -412.30230712890625, "loss": 0.329, "rewards/chosen": 0.21738222241401672, "rewards/margins": 2.0227552950382233, "rewards/rejected": -1.8053730726242065, "step": 5274 }, { "epoch": 0.27959610950626773, "grad_norm": 48.0, "kl": 1.3331518173217773, "learning_rate": 5e-07, "logits/chosen": 34963.666666666664, "logits/rejected": 84583496.0, "logps/chosen": -151.5074666341146, "logps/rejected": -462.9642333984375, "loss": 0.3577, "rewards/chosen": 0.4927130142847697, "rewards/margins": 2.474225918451945, "rewards/rejected": -1.9815129041671753, "step": 5275 }, { "epoch": 0.27964911350806987, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -110796432.0, "logits/rejected": -24611712.0, "logps/chosen": -432.6125183105469, "logps/rejected": -281.7843540736607, "loss": 0.2222, "rewards/chosen": 0.15863342583179474, "rewards/margins": 1.8610731427158629, "rewards/rejected": -1.7024397168840681, "step": 5276 }, { "epoch": 0.279702117509872, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25487592.0, "logits/rejected": -29562816.0, "logps/chosen": -620.0974934895834, "logps/rejected": -581.41533203125, "loss": 0.1918, "rewards/chosen": 1.044324239095052, "rewards/margins": 3.3673419316609703, "rewards/rejected": -2.323017692565918, "step": 5277 }, { "epoch": 0.27975512151167414, "grad_norm": 55.5, "kl": 0.2204303741455078, "learning_rate": 5e-07, "logits/chosen": -2982673.4, "logits/rejected": -20183769.333333332, "logps/chosen": -200.5002197265625, "logps/rejected": -173.78055826822916, "loss": 0.3267, "rewards/chosen": 0.6853311061859131, "rewards/margins": 1.7631909847259521, "rewards/rejected": -1.077859878540039, "step": 5278 }, { "epoch": 0.2798081255134763, "grad_norm": 52.25, "kl": 0.6837615966796875, "learning_rate": 5e-07, "logits/chosen": -36413989.333333336, "logits/rejected": -15267766.0, "logps/chosen": -235.06681315104166, "logps/rejected": -171.61070251464844, "loss": 0.4497, "rewards/chosen": 0.08356972535451253, "rewards/margins": 1.0038231213887532, "rewards/rejected": -0.9202533960342407, "step": 5279 }, { "epoch": 0.2798611295152784, "grad_norm": 35.75, "kl": 0.39290428161621094, "learning_rate": 5e-07, "logits/chosen": -70743481.6, "logits/rejected": 7627821.333333333, "logps/chosen": -110.106201171875, "logps/rejected": -216.2708536783854, "loss": 0.4548, "rewards/chosen": -0.1779609203338623, "rewards/margins": 1.509553066889445, "rewards/rejected": -1.6875139872233074, "step": 5280 }, { "epoch": 0.27991413351708055, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6846924.5, "logits/rejected": -11761062.0, "logps/chosen": -335.30389404296875, "logps/rejected": -261.1535339355469, "loss": 0.3465, "rewards/chosen": -0.014511100947856903, "rewards/margins": 1.7271314933896065, "rewards/rejected": -1.7416425943374634, "step": 5281 }, { "epoch": 0.2799671375188827, "grad_norm": 124.5, "kl": 0.14320945739746094, "learning_rate": 5e-07, "logits/chosen": -33699532.0, "logits/rejected": -18815956.0, "logps/chosen": -124.4287338256836, "logps/rejected": -260.38433837890625, "loss": 0.2601, "rewards/chosen": 0.42276546359062195, "rewards/margins": 2.900882452726364, "rewards/rejected": -2.478116989135742, "step": 5282 }, { "epoch": 0.28002014152068483, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21878027.2, "logits/rejected": -30510618.666666668, "logps/chosen": -254.9294677734375, "logps/rejected": -259.18798828125, "loss": 0.3449, "rewards/chosen": 0.4086972713470459, "rewards/margins": 1.734997336069743, "rewards/rejected": -1.326300064722697, "step": 5283 }, { "epoch": 0.28007314552248697, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27984772.0, "logits/rejected": -46893500.0, "logps/chosen": -130.2181396484375, "logps/rejected": -480.34857177734375, "loss": 0.3083, "rewards/chosen": 0.23881883919239044, "rewards/margins": 2.331276848912239, "rewards/rejected": -2.0924580097198486, "step": 5284 }, { "epoch": 0.2801261495242891, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7040856.666666667, "logits/rejected": -19197324.0, "logps/chosen": -193.97855631510416, "logps/rejected": -272.2624206542969, "loss": 0.3614, "rewards/chosen": 0.28266841173171997, "rewards/margins": 2.440875828266144, "rewards/rejected": -2.158207416534424, "step": 5285 }, { "epoch": 0.28017915352609124, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19636524.0, "logits/rejected": -16650351.0, "logps/chosen": -219.4876251220703, "logps/rejected": -204.26040649414062, "loss": 0.3941, "rewards/chosen": -0.14995495975017548, "rewards/margins": 1.0403176099061966, "rewards/rejected": -1.190272569656372, "step": 5286 }, { "epoch": 0.2802321575278934, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35709760.0, "logits/rejected": -24029580.0, "logps/chosen": -333.4462483723958, "logps/rejected": -162.7630615234375, "loss": 0.3845, "rewards/chosen": 0.3767828941345215, "rewards/margins": 1.4399412870407104, "rewards/rejected": -1.063158392906189, "step": 5287 }, { "epoch": 0.2802851615296955, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1283098.0, "logits/rejected": -50410696.0, "logps/chosen": -296.7617492675781, "logps/rejected": -324.8533020019531, "loss": 0.2719, "rewards/chosen": 0.3675367534160614, "rewards/margins": 2.448569506406784, "rewards/rejected": -2.0810327529907227, "step": 5288 }, { "epoch": 0.28033816553149765, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12704896.0, "logits/rejected": -32806658.0, "logps/chosen": -206.20065307617188, "logps/rejected": -530.8251953125, "loss": 0.2879, "rewards/chosen": 0.04589977115392685, "rewards/margins": 2.6785499081015587, "rewards/rejected": -2.632650136947632, "step": 5289 }, { "epoch": 0.2803911695332998, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4894615.0, "logits/rejected": -32624436.0, "logps/chosen": -171.4470977783203, "logps/rejected": -443.10589599609375, "loss": 0.3526, "rewards/chosen": -0.2594572901725769, "rewards/margins": 2.0910266041755676, "rewards/rejected": -2.3504838943481445, "step": 5290 }, { "epoch": 0.2804441735351019, "grad_norm": 50.25, "kl": 0.211761474609375, "learning_rate": 5e-07, "logits/chosen": -34605812.0, "logits/rejected": -53379168.0, "logps/chosen": -353.966064453125, "logps/rejected": -428.3084716796875, "loss": 0.1788, "rewards/chosen": 0.5551345944404602, "rewards/margins": 2.9814257423082986, "rewards/rejected": -2.4262911478678384, "step": 5291 }, { "epoch": 0.28049717753690406, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8906470.666666666, "logits/rejected": -28120342.4, "logps/chosen": -224.46929931640625, "logps/rejected": -231.2020751953125, "loss": 0.3428, "rewards/chosen": -0.2209147810935974, "rewards/margins": 1.1709935784339904, "rewards/rejected": -1.3919083595275878, "step": 5292 }, { "epoch": 0.28055018153870614, "grad_norm": 45.0, "kl": 0.5375041961669922, "learning_rate": 5e-07, "logits/chosen": -32106829.333333332, "logits/rejected": -24471073.6, "logps/chosen": -211.4194132486979, "logps/rejected": -270.3791015625, "loss": 0.3208, "rewards/chosen": 0.09819005926450093, "rewards/margins": 1.542174090941747, "rewards/rejected": -1.443984031677246, "step": 5293 }, { "epoch": 0.2806031855405083, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49889061.333333336, "logits/rejected": -846790.4, "logps/chosen": -183.690185546875, "logps/rejected": -374.8337158203125, "loss": 0.2918, "rewards/chosen": -0.391141136487325, "rewards/margins": 2.1597221295038858, "rewards/rejected": -2.550863265991211, "step": 5294 }, { "epoch": 0.2806561895423104, "grad_norm": 44.75, "kl": 0.06365203857421875, "learning_rate": 5e-07, "logits/chosen": -15445084.0, "logits/rejected": -4269434.0, "logps/chosen": -393.8685709635417, "logps/rejected": -252.7671142578125, "loss": 0.2687, "rewards/chosen": 0.7129770914713541, "rewards/margins": 2.098329798380534, "rewards/rejected": -1.3853527069091798, "step": 5295 }, { "epoch": 0.28070919354411256, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42810524.0, "logits/rejected": -10341098.0, "logps/chosen": -349.0433349609375, "logps/rejected": -199.5565388997396, "loss": 0.2769, "rewards/chosen": -0.7073761224746704, "rewards/margins": 1.2727318207422893, "rewards/rejected": -1.9801079432169597, "step": 5296 }, { "epoch": 0.2807621975459147, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47288864.0, "logits/rejected": -32708064.0, "logps/chosen": -98.07217407226562, "logps/rejected": -394.466845703125, "loss": 0.3169, "rewards/chosen": 0.12926839788754782, "rewards/margins": 1.5006747623284658, "rewards/rejected": -1.371406364440918, "step": 5297 }, { "epoch": 0.28081520154771683, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23123278.4, "logits/rejected": -66536469.333333336, "logps/chosen": -194.67344970703124, "logps/rejected": -187.52783203125, "loss": 0.4155, "rewards/chosen": 0.16962554454803466, "rewards/margins": 0.8410958687464396, "rewards/rejected": -0.6714703241984049, "step": 5298 }, { "epoch": 0.28086820554951897, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4009377.6666666665, "logits/rejected": 79065004.8, "logps/chosen": -243.86456298828125, "logps/rejected": -261.9675048828125, "loss": 0.3092, "rewards/chosen": -0.07959499458471934, "rewards/margins": 1.5085252751906713, "rewards/rejected": -1.5881202697753907, "step": 5299 }, { "epoch": 0.2809212095513211, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29250178.666666668, "logits/rejected": -39397328.0, "logps/chosen": -220.30537923177084, "logps/rejected": -417.045703125, "loss": 0.3133, "rewards/chosen": -0.03033599505821864, "rewards/margins": 1.7521492039163906, "rewards/rejected": -1.7824851989746093, "step": 5300 }, { "epoch": 0.28097421355312324, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10640314.0, "logits/rejected": -11769530.0, "logps/chosen": -282.355224609375, "logps/rejected": -302.8429260253906, "loss": 0.2922, "rewards/chosen": 0.16629981994628906, "rewards/margins": 2.207631826400757, "rewards/rejected": -2.0413320064544678, "step": 5301 }, { "epoch": 0.2810272175549254, "grad_norm": 67.0, "kl": 0.11729049682617188, "learning_rate": 5e-07, "logits/chosen": -32655381.333333332, "logits/rejected": -23804980.0, "logps/chosen": -275.8818359375, "logps/rejected": -340.889892578125, "loss": 0.4589, "rewards/chosen": -0.06790785988171895, "rewards/margins": 0.8169931868712107, "rewards/rejected": -0.8849010467529297, "step": 5302 }, { "epoch": 0.2810802215567275, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31463260.8, "logits/rejected": -42425064.0, "logps/chosen": -349.4784423828125, "logps/rejected": -409.2913411458333, "loss": 0.2632, "rewards/chosen": 0.54403657913208, "rewards/margins": 3.2497835477193195, "rewards/rejected": -2.7057469685872397, "step": 5303 }, { "epoch": 0.28113322555852965, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32303160.0, "logits/rejected": -44952456.0, "logps/chosen": -452.1285705566406, "logps/rejected": -361.1058349609375, "loss": 0.2573, "rewards/chosen": 0.5550633072853088, "rewards/margins": 2.4778693318367004, "rewards/rejected": -1.9228060245513916, "step": 5304 }, { "epoch": 0.2811862295603318, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33315196.8, "logits/rejected": -15570013.333333334, "logps/chosen": -303.49892578125, "logps/rejected": -307.46449788411456, "loss": 0.3697, "rewards/chosen": 0.25404465198516846, "rewards/margins": 1.7425504922866821, "rewards/rejected": -1.4885058403015137, "step": 5305 }, { "epoch": 0.28123923356213393, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 27773808.0, "logits/rejected": -6745169.5, "logps/chosen": -539.8236083984375, "logps/rejected": -483.1959533691406, "loss": 0.2879, "rewards/chosen": 0.27277058362960815, "rewards/margins": 2.763678014278412, "rewards/rejected": -2.4909074306488037, "step": 5306 }, { "epoch": 0.28129223756393607, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44930368.0, "logits/rejected": -21388358.0, "logps/chosen": -458.10302734375, "logps/rejected": -465.94268798828125, "loss": 0.2765, "rewards/chosen": 0.27403298020362854, "rewards/margins": 2.824993997812271, "rewards/rejected": -2.5509610176086426, "step": 5307 }, { "epoch": 0.2813452415657382, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8234214.666666667, "logits/rejected": -19160817.6, "logps/chosen": -284.67303466796875, "logps/rejected": -261.5839111328125, "loss": 0.3073, "rewards/chosen": -0.21858318646748862, "rewards/margins": 1.675833813349406, "rewards/rejected": -1.8944169998168945, "step": 5308 }, { "epoch": 0.28139824556754034, "grad_norm": 38.75, "kl": 0.017833709716796875, "learning_rate": 5e-07, "logits/chosen": -22089315.2, "logits/rejected": -32273048.0, "logps/chosen": -141.61212158203125, "logps/rejected": -177.97900390625, "loss": 0.3639, "rewards/chosen": 0.05303030014038086, "rewards/margins": 2.035120932261149, "rewards/rejected": -1.9820906321207683, "step": 5309 }, { "epoch": 0.2814512495693425, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 164306576.0, "logits/rejected": 2475221.3333333335, "logps/chosen": -352.09130859375, "logps/rejected": -277.2912190755208, "loss": 0.2999, "rewards/chosen": -1.0371097326278687, "rewards/margins": 0.7161359389623005, "rewards/rejected": -1.7532456715901692, "step": 5310 }, { "epoch": 0.2815042535711446, "grad_norm": 39.25, "kl": 0.42586517333984375, "learning_rate": 5e-07, "logits/chosen": -3975054.0, "logits/rejected": -40155124.0, "logps/chosen": -251.74221801757812, "logps/rejected": -420.68829345703125, "loss": 0.2903, "rewards/chosen": -0.0034465938806533813, "rewards/margins": 3.108168825507164, "rewards/rejected": -3.1116154193878174, "step": 5311 }, { "epoch": 0.28155725757294675, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18864220.0, "logits/rejected": -11439693.6, "logps/chosen": -309.0182291666667, "logps/rejected": -239.3730224609375, "loss": 0.3337, "rewards/chosen": -0.17278925577799478, "rewards/margins": 1.4372826894124349, "rewards/rejected": -1.6100719451904297, "step": 5312 }, { "epoch": 0.2816102615747489, "grad_norm": 56.25, "kl": 1.7980728149414062, "learning_rate": 5e-07, "logits/chosen": -19203908.0, "logps/chosen": -170.95262145996094, "loss": 0.4636, "rewards/chosen": 0.3368968665599823, "step": 5313 }, { "epoch": 0.281663265576551, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73602448.0, "logits/rejected": -62814968.0, "logps/chosen": -235.00125122070312, "logps/rejected": -348.7162780761719, "loss": 0.3364, "rewards/chosen": 0.039168357849121094, "rewards/margins": 1.8717341423034668, "rewards/rejected": -1.8325657844543457, "step": 5314 }, { "epoch": 0.28171626957835316, "grad_norm": 52.25, "kl": 0.7601790428161621, "learning_rate": 5e-07, "logits/chosen": -7320683.333333333, "logits/rejected": -36414360.0, "logps/chosen": -300.17331949869794, "logps/rejected": -395.62921142578125, "loss": 0.38, "rewards/chosen": 0.24641176064809164, "rewards/margins": 2.697896202405294, "rewards/rejected": -2.451484441757202, "step": 5315 }, { "epoch": 0.2817692735801553, "grad_norm": 44.25, "kl": 0.26013660430908203, "learning_rate": 5e-07, "logits/chosen": -8659528.0, "logits/rejected": -30883736.0, "logps/chosen": -109.786376953125, "logps/rejected": -430.5130208333333, "loss": 0.3535, "rewards/chosen": 0.2415985107421875, "rewards/margins": 1.9664021174112956, "rewards/rejected": -1.7248036066691081, "step": 5316 }, { "epoch": 0.28182227758195744, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5419360.0, "logits/rejected": -58355008.0, "logps/chosen": -345.2633056640625, "logps/rejected": -416.63125, "loss": 0.3011, "rewards/chosen": 0.0874669353167216, "rewards/margins": 2.1817630092302958, "rewards/rejected": -2.0942960739135743, "step": 5317 }, { "epoch": 0.2818752815837596, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14734212.8, "logits/rejected": -39991792.0, "logps/chosen": -247.4675537109375, "logps/rejected": -616.5248209635416, "loss": 0.3065, "rewards/chosen": 0.3052617073059082, "rewards/margins": 3.3946604092915855, "rewards/rejected": -3.0893987019856772, "step": 5318 }, { "epoch": 0.2819282855855617, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72971024.0, "logits/rejected": -7385294.666666667, "logps/chosen": -617.5355224609375, "logps/rejected": -187.2744344075521, "loss": 0.2488, "rewards/chosen": 0.8599578738212585, "rewards/margins": 2.3240490158398943, "rewards/rejected": -1.464091142018636, "step": 5319 }, { "epoch": 0.28198128958736385, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5711235.6, "logits/rejected": -70457760.0, "logps/chosen": -123.336181640625, "logps/rejected": -575.4128824869791, "loss": 0.3066, "rewards/chosen": 0.18506829738616942, "rewards/margins": 3.38052130540212, "rewards/rejected": -3.1954530080159507, "step": 5320 }, { "epoch": 0.282034293589166, "grad_norm": 48.0, "kl": 3.1270523071289062, "learning_rate": 5e-07, "logits/chosen": -32679244.8, "logits/rejected": -22867834.666666668, "logps/chosen": -481.366943359375, "logps/rejected": -207.4838663736979, "loss": 0.3657, "rewards/chosen": 0.8338037490844726, "rewards/margins": 2.149748420715332, "rewards/rejected": -1.3159446716308594, "step": 5321 }, { "epoch": 0.2820872975909681, "grad_norm": 41.0, "kl": 0.20227622985839844, "learning_rate": 5e-07, "logits/chosen": -19925808.0, "logits/rejected": -14677478.666666666, "logps/chosen": -116.74462890625, "logps/rejected": -385.1140950520833, "loss": 0.3701, "rewards/chosen": -0.011225014925003052, "rewards/margins": 1.9921277860800424, "rewards/rejected": -2.0033528010050454, "step": 5322 }, { "epoch": 0.28214030159277026, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18707618.666666668, "logits/rejected": -54301856.0, "logps/chosen": -152.2463175455729, "logps/rejected": -424.685107421875, "loss": 0.1802, "rewards/chosen": 0.8156867027282715, "rewards/margins": 3.1211970329284666, "rewards/rejected": -2.305510330200195, "step": 5323 }, { "epoch": 0.2821933055945724, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25728322.0, "logits/rejected": -43363656.0, "logps/chosen": -109.37657165527344, "logps/rejected": -329.01519775390625, "loss": 0.3407, "rewards/chosen": 0.10634641349315643, "rewards/margins": 1.4553592652082443, "rewards/rejected": -1.349012851715088, "step": 5324 }, { "epoch": 0.28224630959637453, "grad_norm": 52.75, "kl": 0.5507354736328125, "learning_rate": 5e-07, "logits/chosen": -17684154.0, "logits/rejected": -17989616.0, "logps/chosen": -255.55470275878906, "logps/rejected": -275.1444396972656, "loss": 0.2667, "rewards/chosen": 0.7845616936683655, "rewards/margins": 2.541945993900299, "rewards/rejected": -1.7573843002319336, "step": 5325 }, { "epoch": 0.28229931359817667, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9109309.333333334, "logits/rejected": -26181625.6, "logps/chosen": -181.44856770833334, "logps/rejected": -496.66416015625, "loss": 0.2385, "rewards/chosen": 0.35852956771850586, "rewards/margins": 2.3431878089904785, "rewards/rejected": -1.9846582412719727, "step": 5326 }, { "epoch": 0.2823523175999788, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -66381784.0, "logps/rejected": -220.298828125, "loss": 0.1854, "rewards/rejected": -1.864238977432251, "step": 5327 }, { "epoch": 0.28240532160178095, "grad_norm": 67.0, "kl": 1.65631103515625, "learning_rate": 5e-07, "logits/chosen": -19289328.0, "logits/rejected": -42599562.666666664, "logps/chosen": -569.415576171875, "logps/rejected": -407.7351888020833, "loss": 0.2859, "rewards/chosen": 0.8193606376647949, "rewards/margins": 3.097602113087972, "rewards/rejected": -2.2782414754231772, "step": 5328 }, { "epoch": 0.2824583256035831, "grad_norm": 43.5, "kl": 0.6228713989257812, "learning_rate": 5e-07, "logits/chosen": -33342208.0, "logits/rejected": -2684006.6666666665, "logps/chosen": -192.578515625, "logps/rejected": -81.33475748697917, "loss": 0.3777, "rewards/chosen": -0.23217978477478027, "rewards/margins": 3.088386392593384, "rewards/rejected": -3.320566177368164, "step": 5329 }, { "epoch": 0.2825113296053852, "grad_norm": 48.5, "kl": 1.8436412811279297, "learning_rate": 5e-07, "logits/chosen": -22443712.0, "logits/rejected": -45381584.0, "logps/chosen": -329.9010009765625, "logps/rejected": -466.418701171875, "loss": 0.2921, "rewards/chosen": 0.9173952102661133, "rewards/margins": 2.6602917671203614, "rewards/rejected": -1.742896556854248, "step": 5330 }, { "epoch": 0.28256433360718736, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -9803673.0, "logps/rejected": -306.381103515625, "loss": 0.2278, "rewards/rejected": -1.571457862854004, "step": 5331 }, { "epoch": 0.2826173376089895, "grad_norm": 53.5, "kl": 0.9520034790039062, "learning_rate": 5e-07, "logits/chosen": 23955292.0, "logits/rejected": -6726426.666666667, "logps/chosen": -358.2643127441406, "logps/rejected": -258.1190592447917, "loss": 0.2943, "rewards/chosen": 0.12471771985292435, "rewards/margins": 1.407536196211974, "rewards/rejected": -1.2828184763590496, "step": 5332 }, { "epoch": 0.28267034161079163, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57347776.0, "logits/rejected": -20426132.0, "logps/chosen": -259.93878173828125, "logps/rejected": -324.6058654785156, "loss": 0.359, "rewards/chosen": 0.03246593475341797, "rewards/margins": 1.6600022315979004, "rewards/rejected": -1.6275362968444824, "step": 5333 }, { "epoch": 0.28272334561259377, "grad_norm": 58.75, "kl": 1.0949058532714844, "learning_rate": 5e-07, "logits/chosen": -105400234.66666667, "logits/rejected": -45381320.0, "logps/chosen": -362.3675130208333, "logps/rejected": -209.37643432617188, "loss": 0.3513, "rewards/chosen": 0.5462621053059896, "rewards/margins": 1.966527303059896, "rewards/rejected": -1.4202651977539062, "step": 5334 }, { "epoch": 0.2827763496143959, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20205158.0, "logits/rejected": -50952764.0, "logps/chosen": -520.57421875, "logps/rejected": -153.46868896484375, "loss": 0.2407, "rewards/chosen": 1.2081931829452515, "rewards/margins": 2.5619988441467285, "rewards/rejected": -1.353805661201477, "step": 5335 }, { "epoch": 0.28282935361619804, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22395802.666666668, "logits/rejected": -12435784.0, "logps/chosen": -751.5888671875, "logps/rejected": -304.266845703125, "loss": 0.2598, "rewards/chosen": 0.40407737096150714, "rewards/margins": 2.390376313527425, "rewards/rejected": -1.9862989425659179, "step": 5336 }, { "epoch": 0.2828823576180002, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 149966.640625, "logits/rejected": -48235372.0, "logps/chosen": -245.4976806640625, "logps/rejected": -534.1869506835938, "loss": 0.3417, "rewards/chosen": -0.16955628991127014, "rewards/margins": 1.7896822392940521, "rewards/rejected": -1.9592385292053223, "step": 5337 }, { "epoch": 0.2829353616198023, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31308387.2, "logits/rejected": -6178602.666666667, "logps/chosen": -260.363525390625, "logps/rejected": -164.94392903645834, "loss": 0.4076, "rewards/chosen": -0.09008636474609374, "rewards/margins": 1.1806849002838136, "rewards/rejected": -1.2707712650299072, "step": 5338 }, { "epoch": 0.28298836562160445, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38591352.0, "logits/rejected": -7528269.5, "logps/chosen": -108.89673614501953, "logps/rejected": -224.25791931152344, "loss": 0.344, "rewards/chosen": 0.2383471429347992, "rewards/margins": 1.393103688955307, "rewards/rejected": -1.1547565460205078, "step": 5339 }, { "epoch": 0.2830413696234066, "grad_norm": 47.25, "kl": 1.133981704711914, "learning_rate": 5e-07, "logits/chosen": -21012466.0, "logits/rejected": -43485528.0, "logps/chosen": -218.49586486816406, "logps/rejected": -224.0697479248047, "loss": 0.3341, "rewards/chosen": 0.15340644121170044, "rewards/margins": 2.4443944096565247, "rewards/rejected": -2.290987968444824, "step": 5340 }, { "epoch": 0.28309437362520873, "grad_norm": 85.5, "kl": 3.2029953002929688, "learning_rate": 5e-07, "logits/chosen": -36457753.6, "logits/rejected": -13310137.333333334, "logps/chosen": -600.0474609375, "logps/rejected": -211.25311279296875, "loss": 0.3441, "rewards/chosen": 0.7156295776367188, "rewards/margins": 2.350619475046794, "rewards/rejected": -1.634989897410075, "step": 5341 }, { "epoch": 0.28314737762701087, "grad_norm": 61.5, "kl": 0.3789634704589844, "learning_rate": 5e-07, "logits/chosen": -12264907.42857143, "logits/rejected": -60377432.0, "logps/chosen": -398.68282645089283, "logps/rejected": -563.9600830078125, "loss": 0.412, "rewards/chosen": 0.19478113310677664, "rewards/margins": 3.2180844034467424, "rewards/rejected": -3.023303270339966, "step": 5342 }, { "epoch": 0.283200381628813, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12752689.0, "logits/rejected": -63283864.0, "logps/chosen": -148.6193084716797, "logps/rejected": -426.95965576171875, "loss": 0.3329, "rewards/chosen": -0.07011710107326508, "rewards/margins": 2.1888357549905777, "rewards/rejected": -2.2589528560638428, "step": 5343 }, { "epoch": 0.2832533856306151, "grad_norm": 55.0, "kl": 0.06708526611328125, "learning_rate": 5e-07, "logits/chosen": -14214390.666666666, "logits/rejected": -7334764.5, "logps/chosen": -316.671630859375, "logps/rejected": -207.8643798828125, "loss": 0.3383, "rewards/chosen": 0.49833301703135174, "rewards/margins": 2.0393059651056924, "rewards/rejected": -1.5409729480743408, "step": 5344 }, { "epoch": 0.2833063896324172, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36152320.0, "logits/rejected": -31019436.0, "logps/chosen": -250.79916381835938, "logps/rejected": -251.70997619628906, "loss": 0.3227, "rewards/chosen": 0.16154956817626953, "rewards/margins": 1.6681582927703857, "rewards/rejected": -1.5066087245941162, "step": 5345 }, { "epoch": 0.28335939363421936, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24305464.0, "logits/rejected": -78990944.0, "logps/chosen": -156.10221354166666, "logps/rejected": -232.025634765625, "loss": 0.4266, "rewards/chosen": -0.06719406445821126, "rewards/margins": 1.7103604475657146, "rewards/rejected": -1.7775545120239258, "step": 5346 }, { "epoch": 0.2834123976360215, "grad_norm": 55.5, "kl": 1.1330032348632812, "learning_rate": 5e-07, "logits/chosen": -15427862.4, "logits/rejected": -19275392.0, "logps/chosen": -376.2457763671875, "logps/rejected": -343.4549560546875, "loss": 0.3353, "rewards/chosen": 0.39307565689086915, "rewards/margins": 2.349500846862793, "rewards/rejected": -1.9564251899719238, "step": 5347 }, { "epoch": 0.28346540163782363, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3950732.6666666665, "logits/rejected": -18502854.0, "logps/chosen": -169.45598347981772, "logps/rejected": -359.6553955078125, "loss": 0.4073, "rewards/chosen": -0.015923780699570973, "rewards/margins": 2.0642306382457414, "rewards/rejected": -2.0801544189453125, "step": 5348 }, { "epoch": 0.28351840563962577, "grad_norm": 46.25, "kl": 1.3981046676635742, "learning_rate": 5e-07, "logits/chosen": -40643248.0, "logits/rejected": -2343878.75, "logps/chosen": -314.36590576171875, "logps/rejected": -57.513206481933594, "loss": 0.3874, "rewards/chosen": 0.28862589597702026, "rewards/margins": 1.2289475202560425, "rewards/rejected": -0.9403216242790222, "step": 5349 }, { "epoch": 0.2835714096414279, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4091000.5, "logits/rejected": -38126502.85714286, "logps/chosen": -120.49153137207031, "logps/rejected": -357.64195033482144, "loss": 0.27, "rewards/chosen": -0.5114372372627258, "rewards/margins": 0.8493791733469283, "rewards/rejected": -1.360816410609654, "step": 5350 }, { "epoch": 0.28362441364323004, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37619840.0, "logits/rejected": -20262293.333333332, "logps/chosen": -266.7002197265625, "logps/rejected": -476.4318440755208, "loss": 0.3059, "rewards/chosen": 0.3644257068634033, "rewards/margins": 3.0947377363840736, "rewards/rejected": -2.7303120295206704, "step": 5351 }, { "epoch": 0.2836774176450322, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4192328.6666666665, "logits/rejected": -69904851.2, "logps/chosen": -168.0406290690104, "logps/rejected": -217.6239013671875, "loss": 0.3552, "rewards/chosen": -0.07845300436019897, "rewards/margins": 1.0978339791297913, "rewards/rejected": -1.1762869834899903, "step": 5352 }, { "epoch": 0.2837304216468343, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25407080.0, "logits/rejected": -36558044.0, "logps/chosen": -298.1306966145833, "logps/rejected": -444.40478515625, "loss": 0.2598, "rewards/chosen": 0.7573045889536539, "rewards/margins": 4.923651138941447, "rewards/rejected": -4.166346549987793, "step": 5353 }, { "epoch": 0.28378342564863646, "grad_norm": 46.75, "kl": 2.1800155639648438, "learning_rate": 5e-07, "logits/chosen": -15069351.0, "logits/rejected": -29099564.0, "logps/chosen": -321.3033447265625, "logps/rejected": -277.9855041503906, "loss": 0.3545, "rewards/chosen": 0.20962774753570557, "rewards/margins": 2.2201954126358032, "rewards/rejected": -2.0105676651000977, "step": 5354 }, { "epoch": 0.2838364296504386, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35322168.0, "logits/rejected": -22176174.4, "logps/chosen": -426.3530680338542, "logps/rejected": -129.8190185546875, "loss": 0.294, "rewards/chosen": 0.5626276731491089, "rewards/margins": 2.486742091178894, "rewards/rejected": -1.9241144180297851, "step": 5355 }, { "epoch": 0.28388943365224073, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38775674.666666664, "logits/rejected": -63695161.6, "logps/chosen": -666.7067464192709, "logps/rejected": -538.333154296875, "loss": 0.1999, "rewards/chosen": 0.740562915802002, "rewards/margins": 2.9055214881896974, "rewards/rejected": -2.1649585723876954, "step": 5356 }, { "epoch": 0.28394243765404287, "grad_norm": 55.5, "kl": 0.10491275787353516, "learning_rate": 5e-07, "logits/chosen": -8880320.666666666, "logits/rejected": -5840446.0, "logps/chosen": -159.4699910481771, "logps/rejected": -250.57028198242188, "loss": 0.4033, "rewards/chosen": 0.27117005983988446, "rewards/margins": 1.258703629175822, "rewards/rejected": -0.9875335693359375, "step": 5357 }, { "epoch": 0.283995441655845, "grad_norm": 51.5, "kl": 0.15301132202148438, "learning_rate": 5e-07, "logits/chosen": -19377932.8, "logits/rejected": -38064034.666666664, "logps/chosen": -232.8017578125, "logps/rejected": -283.0796712239583, "loss": 0.269, "rewards/chosen": 0.695108699798584, "rewards/margins": 2.6555061022440594, "rewards/rejected": -1.9603974024454753, "step": 5358 }, { "epoch": 0.28404844565764714, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16937326.4, "logits/rejected": -33059944.0, "logps/chosen": -245.3491455078125, "logps/rejected": -472.1085205078125, "loss": 0.2953, "rewards/chosen": 0.3507612466812134, "rewards/margins": 2.967620364824931, "rewards/rejected": -2.6168591181437173, "step": 5359 }, { "epoch": 0.2841014496594493, "grad_norm": 56.0, "kl": 2.171128273010254, "learning_rate": 5e-07, "logits/chosen": -35394754.666666664, "logits/rejected": -24660825.6, "logps/chosen": -555.3306884765625, "logps/rejected": -256.516064453125, "loss": 0.2576, "rewards/chosen": 0.6458304325739542, "rewards/margins": 3.2057860294977822, "rewards/rejected": -2.559955596923828, "step": 5360 }, { "epoch": 0.2841544536612514, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6994888.8, "logits/rejected": -3391037.0, "logps/chosen": -365.68505859375, "logps/rejected": -120.54435221354167, "loss": 0.277, "rewards/chosen": 0.7678619384765625, "rewards/margins": 2.299086825052897, "rewards/rejected": -1.5312248865763347, "step": 5361 }, { "epoch": 0.28420745766305355, "grad_norm": 56.0, "kl": 0.8779487609863281, "learning_rate": 5e-07, "logits/chosen": 834343.35, "logits/rejected": -11206376.0, "logps/chosen": -285.9935791015625, "logps/rejected": -116.79949951171875, "loss": 0.3792, "rewards/chosen": 0.12974411249160767, "rewards/margins": 1.693567971388499, "rewards/rejected": -1.5638238588968914, "step": 5362 }, { "epoch": 0.2842604616648557, "grad_norm": 51.5, "kl": 0.724578857421875, "learning_rate": 5e-07, "logits/chosen": -21677518.666666668, "logits/rejected": -1470051.2, "logps/chosen": -526.9724934895834, "logps/rejected": -390.1405029296875, "loss": 0.2218, "rewards/chosen": 0.5401753584543864, "rewards/margins": 3.0043564955393474, "rewards/rejected": -2.464181137084961, "step": 5363 }, { "epoch": 0.2843134656666578, "grad_norm": 49.5, "kl": 0.3424701690673828, "learning_rate": 5e-07, "logits/chosen": -2570495.75, "logits/rejected": -3482935.75, "logps/chosen": -215.618896484375, "logps/rejected": -89.19486999511719, "loss": 0.3626, "rewards/chosen": 0.4751814007759094, "rewards/margins": 1.3953936696052551, "rewards/rejected": -0.9202122688293457, "step": 5364 }, { "epoch": 0.28436646966845996, "grad_norm": 49.5, "kl": 1.5481853485107422, "learning_rate": 5e-07, "logits/chosen": -5874857.6, "logits/rejected": -24177400.0, "logps/chosen": -227.577587890625, "logps/rejected": -393.6316731770833, "loss": 0.3185, "rewards/chosen": 0.45073719024658204, "rewards/margins": 3.0179562886555993, "rewards/rejected": -2.567219098409017, "step": 5365 }, { "epoch": 0.2844194736702621, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9552719.0, "logits/rejected": -36149316.0, "logps/chosen": -288.47308349609375, "logps/rejected": -299.62188720703125, "loss": 0.3649, "rewards/chosen": -0.04878637194633484, "rewards/margins": 1.651800662279129, "rewards/rejected": -1.7005870342254639, "step": 5366 }, { "epoch": 0.28447247767206424, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38923680.0, "logits/rejected": 2900136.6666666665, "logps/chosen": -260.325634765625, "logps/rejected": -73.29942830403645, "loss": 0.3816, "rewards/chosen": 0.38435194492340086, "rewards/margins": 1.5233412345250446, "rewards/rejected": -1.1389892896016438, "step": 5367 }, { "epoch": 0.2845254816738664, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24249363.2, "logits/rejected": -14936004.0, "logps/chosen": -275.1678466796875, "logps/rejected": -137.9922892252604, "loss": 0.3426, "rewards/chosen": 0.4150658130645752, "rewards/margins": 1.6596550623575848, "rewards/rejected": -1.2445892492930095, "step": 5368 }, { "epoch": 0.2845784856756685, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36270602.666666664, "logits/rejected": -4235529.6, "logps/chosen": -173.25923665364584, "logps/rejected": -141.5293701171875, "loss": 0.2058, "rewards/chosen": 1.3496239980061848, "rewards/margins": 3.215158971150716, "rewards/rejected": -1.8655349731445312, "step": 5369 }, { "epoch": 0.28463148967747065, "grad_norm": 49.5, "kl": 1.2811241149902344, "learning_rate": 5e-07, "logits/chosen": -28601066.666666668, "logits/rejected": -52402472.0, "logps/chosen": -197.48211669921875, "logps/rejected": -398.0433654785156, "loss": 0.4066, "rewards/chosen": 0.14170374472935995, "rewards/margins": 2.3602980573972068, "rewards/rejected": -2.2185943126678467, "step": 5370 }, { "epoch": 0.2846844936792728, "grad_norm": 51.5, "kl": 0.17044448852539062, "learning_rate": 5e-07, "logits/chosen": -35451776.0, "logits/rejected": -11071651.0, "logps/chosen": -381.03265380859375, "logps/rejected": -644.4435424804688, "loss": 0.2537, "rewards/chosen": 0.586675763130188, "rewards/margins": 3.5079604387283325, "rewards/rejected": -2.9212846755981445, "step": 5371 }, { "epoch": 0.2847374976810749, "grad_norm": 58.25, "kl": 1.5372085571289062, "learning_rate": 5e-07, "logits/chosen": -26996848.0, "logits/rejected": -80471290.66666667, "logps/chosen": -472.747802734375, "logps/rejected": -407.1352945963542, "loss": 0.2321, "rewards/chosen": 1.276267147064209, "rewards/margins": 3.417467530568441, "rewards/rejected": -2.141200383504232, "step": 5372 }, { "epoch": 0.28479050168287706, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8537629.0, "logits/rejected": -28397622.0, "logps/chosen": -243.9090118408203, "logps/rejected": -331.0311584472656, "loss": 0.2722, "rewards/chosen": 0.5658142566680908, "rewards/margins": 2.6389994621276855, "rewards/rejected": -2.0731852054595947, "step": 5373 }, { "epoch": 0.2848435056846792, "grad_norm": 52.5, "kl": 0.3320331573486328, "learning_rate": 5e-07, "logits/chosen": -38298448.0, "logits/rejected": -12890984.0, "logps/chosen": -591.76171875, "logps/rejected": -304.79917399088544, "loss": 0.2101, "rewards/chosen": 0.2113800048828125, "rewards/margins": 2.468620777130127, "rewards/rejected": -2.2572407722473145, "step": 5374 }, { "epoch": 0.28489650968648134, "grad_norm": 49.5, "kl": 1.7952957153320312, "learning_rate": 5e-07, "logits/chosen": -11417177.333333334, "logits/rejected": -52010100.0, "logps/chosen": -207.10481770833334, "logps/rejected": -211.95152282714844, "loss": 0.3223, "rewards/chosen": 0.6456648906071981, "rewards/margins": 3.3036872943242392, "rewards/rejected": -2.658022403717041, "step": 5375 }, { "epoch": 0.2849495136882835, "grad_norm": 41.25, "kl": 0.7647628784179688, "learning_rate": 5e-07, "logits/chosen": -41942272.0, "logits/rejected": -20257308.8, "logps/chosen": -247.88289388020834, "logps/rejected": -296.49453125, "loss": 0.2163, "rewards/chosen": 0.633393128712972, "rewards/margins": 2.9652332623799644, "rewards/rejected": -2.3318401336669923, "step": 5376 }, { "epoch": 0.2850025176900856, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3968488.5, "logits/rejected": -7127928.0, "logps/chosen": -120.45536041259766, "logps/rejected": -138.57318115234375, "loss": 0.2518, "rewards/chosen": 1.1690269708633423, "rewards/margins": 2.3786318699518842, "rewards/rejected": -1.2096048990885417, "step": 5377 }, { "epoch": 0.28505552169188775, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1837831.6666666667, "logits/rejected": -26972038.4, "logps/chosen": -160.6416219075521, "logps/rejected": -509.456884765625, "loss": 0.3729, "rewards/chosen": -0.3893692096074422, "rewards/margins": 1.5071544567743937, "rewards/rejected": -1.896523666381836, "step": 5378 }, { "epoch": 0.2851085256936899, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44152968.0, "logits/rejected": -10734772.0, "logps/chosen": -365.5871175130208, "logps/rejected": -412.78369140625, "loss": 0.1914, "rewards/chosen": 0.6201146443684896, "rewards/margins": 3.0502864201863606, "rewards/rejected": -2.430171775817871, "step": 5379 }, { "epoch": 0.285161529695492, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52092120.0, "logits/rejected": -12380756.0, "logps/chosen": -251.94808959960938, "logps/rejected": -392.8400472005208, "loss": 0.249, "rewards/chosen": -0.052996061742305756, "rewards/margins": 2.116246002415816, "rewards/rejected": -2.1692420641581216, "step": 5380 }, { "epoch": 0.28521453369729416, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -100933136.0, "logits/rejected": -6352029.714285715, "logps/chosen": -421.66168212890625, "logps/rejected": -117.79391915457589, "loss": 0.2027, "rewards/chosen": 0.3687896728515625, "rewards/margins": 2.5315579005650113, "rewards/rejected": -2.162768227713449, "step": 5381 }, { "epoch": 0.2852675376990963, "grad_norm": 53.5, "kl": 1.6109352111816406, "learning_rate": 5e-07, "logits/chosen": -12074038.4, "logits/rejected": 25152210.666666668, "logps/chosen": -261.805029296875, "logps/rejected": -300.92653401692706, "loss": 0.3326, "rewards/chosen": 0.7388001918792725, "rewards/margins": 2.0918524901072186, "rewards/rejected": -1.353052298227946, "step": 5382 }, { "epoch": 0.28532054170089843, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25551405.333333332, "logits/rejected": -13081939.2, "logps/chosen": -206.66483561197916, "logps/rejected": -307.2390380859375, "loss": 0.3696, "rewards/chosen": -0.2058953046798706, "rewards/margins": 1.0899952173233032, "rewards/rejected": -1.2958905220031738, "step": 5383 }, { "epoch": 0.28537354570270057, "grad_norm": 60.75, "kl": 0.2594261169433594, "learning_rate": 5e-07, "logits/chosen": -25109186.666666668, "logits/rejected": -20226386.0, "logps/chosen": -444.6599527994792, "logps/rejected": -375.14605712890625, "loss": 0.3003, "rewards/chosen": 0.601000984509786, "rewards/margins": 2.7655946811040244, "rewards/rejected": -2.1645936965942383, "step": 5384 }, { "epoch": 0.2854265497045027, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39515272.0, "logits/rejected": -11057632.0, "logps/chosen": -299.78338623046875, "logps/rejected": -283.22197265625, "loss": 0.2685, "rewards/chosen": 0.006821930408477783, "rewards/margins": 2.065672218799591, "rewards/rejected": -2.0588502883911133, "step": 5385 }, { "epoch": 0.28547955370630484, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31842768.0, "logits/rejected": -38244314.666666664, "logps/chosen": -296.5272216796875, "logps/rejected": -520.9718017578125, "loss": 0.3747, "rewards/chosen": 0.060178828239440915, "rewards/margins": 2.358410906791687, "rewards/rejected": -2.298232078552246, "step": 5386 }, { "epoch": 0.285532557708107, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10777434.666666666, "logits/rejected": -31718323.2, "logps/chosen": -199.4333699544271, "logps/rejected": -206.037744140625, "loss": 0.2931, "rewards/chosen": -0.12557818492253622, "rewards/margins": 1.6648993770281475, "rewards/rejected": -1.7904775619506836, "step": 5387 }, { "epoch": 0.2855855617099091, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28674909.333333332, "logits/rejected": -26104744.0, "logps/chosen": -252.9251708984375, "logps/rejected": -700.1788940429688, "loss": 0.2902, "rewards/chosen": 0.6026957829793295, "rewards/margins": 3.107471783955892, "rewards/rejected": -2.5047760009765625, "step": 5388 }, { "epoch": 0.28563856571171126, "grad_norm": 41.25, "kl": 1.1436691284179688, "learning_rate": 5e-07, "logits/chosen": -47346632.0, "logits/rejected": -53759560.0, "logps/chosen": -158.6853485107422, "logps/rejected": -544.3433837890625, "loss": 0.3376, "rewards/chosen": -0.1466531753540039, "rewards/margins": 2.2845206260681152, "rewards/rejected": -2.431173801422119, "step": 5389 }, { "epoch": 0.2856915697135134, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71526261.33333333, "logits/rejected": -23972448.0, "logps/chosen": -729.9265950520834, "logps/rejected": -443.18330078125, "loss": 0.2417, "rewards/chosen": 0.8490336736043295, "rewards/margins": 2.4002232869466145, "rewards/rejected": -1.5511896133422851, "step": 5390 }, { "epoch": 0.28574457371531553, "grad_norm": 45.75, "kl": 2.3361663818359375, "learning_rate": 5e-07, "logits/chosen": -43633561.6, "logits/rejected": 11746248.0, "logps/chosen": -507.135888671875, "logps/rejected": -125.10750325520833, "loss": 0.3456, "rewards/chosen": 0.6757466793060303, "rewards/margins": 2.175130605697632, "rewards/rejected": -1.4993839263916016, "step": 5391 }, { "epoch": 0.28579757771711767, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4065825.25, "logits/rejected": -6284600.5, "logps/chosen": -162.06504821777344, "logps/rejected": -292.0749206542969, "loss": 0.3191, "rewards/chosen": -0.11894068866968155, "rewards/margins": 2.0930134281516075, "rewards/rejected": -2.211954116821289, "step": 5392 }, { "epoch": 0.2858505817189198, "grad_norm": 97.0, "kl": 5.856529235839844, "learning_rate": 5e-07, "logits/chosen": -29671926.85714286, "logits/rejected": -101647768.0, "logps/chosen": -653.2568359375, "logps/rejected": -531.303955078125, "loss": 0.3985, "rewards/chosen": 0.940443788255964, "rewards/margins": 3.7954304558890204, "rewards/rejected": -2.8549866676330566, "step": 5393 }, { "epoch": 0.28590358572072194, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62264456.0, "logits/rejected": -72054962.28571428, "logps/chosen": -737.4781494140625, "logps/rejected": -247.85907854352678, "loss": 0.21, "rewards/chosen": 0.36956787109375, "rewards/margins": 2.195929391043527, "rewards/rejected": -1.8263615199497767, "step": 5394 }, { "epoch": 0.285956589722524, "grad_norm": 57.0, "kl": 2.287419319152832, "learning_rate": 5e-07, "logits/chosen": -630600.2, "logits/rejected": -10775380.666666666, "logps/chosen": -447.989892578125, "logps/rejected": -240.68780517578125, "loss": 0.317, "rewards/chosen": 0.8746026992797852, "rewards/margins": 2.9966097513834633, "rewards/rejected": -2.1220070521036782, "step": 5395 }, { "epoch": 0.28600959372432616, "grad_norm": 51.0, "kl": 0.8079051971435547, "learning_rate": 5e-07, "logits/chosen": -24499232.0, "logits/rejected": -20847608.0, "logps/chosen": -295.1671142578125, "logps/rejected": -371.01708984375, "loss": 0.3448, "rewards/chosen": 0.13917280733585358, "rewards/margins": 1.9565885215997696, "rewards/rejected": -1.817415714263916, "step": 5396 }, { "epoch": 0.2860625977261283, "grad_norm": 41.75, "kl": 0.153839111328125, "learning_rate": 5e-07, "logits/chosen": -11909131.42857143, "logits/rejected": -10151672.0, "logps/chosen": -144.04453822544642, "logps/rejected": -127.49490356445312, "loss": 0.348, "rewards/chosen": 0.6636079379490444, "rewards/margins": 1.2174928614071439, "rewards/rejected": -0.5538849234580994, "step": 5397 }, { "epoch": 0.28611560172793044, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26942750.0, "logits/rejected": -33170213.333333332, "logps/chosen": -165.75204467773438, "logps/rejected": -417.2227376302083, "loss": 0.2019, "rewards/chosen": 0.4164726138114929, "rewards/margins": 2.602015197277069, "rewards/rejected": -2.185542583465576, "step": 5398 }, { "epoch": 0.2861686057297326, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12896765.333333334, "logits/rejected": 1074108.0, "logps/chosen": -221.97867838541666, "logps/rejected": -98.39979553222656, "loss": 0.4555, "rewards/chosen": 0.0044659872849782305, "rewards/margins": 0.7821877499421438, "rewards/rejected": -0.7777217626571655, "step": 5399 }, { "epoch": 0.2862216097315347, "grad_norm": 62.0, "kl": 0.5390739440917969, "learning_rate": 5e-07, "logits/chosen": -3808931.4285714286, "logits/rejected": -33736576.0, "logps/chosen": -224.64528111049108, "logps/rejected": -428.59063720703125, "loss": 0.4439, "rewards/chosen": 0.1361281360898699, "rewards/margins": 1.705857821873256, "rewards/rejected": -1.5697296857833862, "step": 5400 }, { "epoch": 0.28627461373333685, "grad_norm": 43.75, "kl": 0.10047149658203125, "learning_rate": 5e-07, "logits/chosen": -31752514.666666668, "logits/rejected": 34327923.2, "logps/chosen": -260.90260823567706, "logps/rejected": -512.158203125, "loss": 0.2283, "rewards/chosen": 0.01869545380274455, "rewards/margins": 2.6509090463320413, "rewards/rejected": -2.632213592529297, "step": 5401 }, { "epoch": 0.286327617735139, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44874032.0, "logits/rejected": -11991772.0, "logps/chosen": -253.2954559326172, "logps/rejected": -195.2293497721354, "loss": 0.2713, "rewards/chosen": 0.1883549690246582, "rewards/margins": 1.7092703183492024, "rewards/rejected": -1.5209153493245442, "step": 5402 }, { "epoch": 0.2863806217369411, "grad_norm": 73.5, "kl": 1.4525909423828125, "learning_rate": 5e-07, "logits/chosen": -41626294.85714286, "logits/rejected": 98394192.0, "logps/chosen": -356.93258231026783, "logps/rejected": -689.2398071289062, "loss": 0.3618, "rewards/chosen": 0.5987920761108398, "rewards/margins": 2.7740485668182373, "rewards/rejected": -2.1752564907073975, "step": 5403 }, { "epoch": 0.28643362573874326, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57702184.0, "logits/rejected": -23196891.42857143, "logps/chosen": -127.87480163574219, "logps/rejected": -378.82296316964283, "loss": 0.2396, "rewards/chosen": -0.18788909912109375, "rewards/margins": 1.92049435206822, "rewards/rejected": -2.108383451189314, "step": 5404 }, { "epoch": 0.2864866297405454, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -105537680.0, "logits/rejected": -8074636.0, "logps/chosen": -635.672607421875, "logps/rejected": -220.95808919270834, "loss": 0.3327, "rewards/chosen": 0.4576873779296875, "rewards/margins": 1.5926437377929688, "rewards/rejected": -1.1349563598632812, "step": 5405 }, { "epoch": 0.28653963374234753, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31901456.0, "logits/rejected": -17602614.4, "logps/chosen": -175.04669189453125, "logps/rejected": -266.5378662109375, "loss": 0.3224, "rewards/chosen": 0.41590678691864014, "rewards/margins": 1.6651042699813843, "rewards/rejected": -1.2491974830627441, "step": 5406 }, { "epoch": 0.28659263774414967, "grad_norm": 61.5, "kl": 1.459259033203125, "learning_rate": 5e-07, "logits/chosen": -31456025.6, "logits/rejected": -25411733.333333332, "logps/chosen": -304.247216796875, "logps/rejected": -191.93705240885416, "loss": 0.3542, "rewards/chosen": 0.6073466777801514, "rewards/margins": 1.7706751664479574, "rewards/rejected": -1.163328488667806, "step": 5407 }, { "epoch": 0.2866456417459518, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18660621.714285713, "logits/rejected": -41118516.0, "logps/chosen": -158.45204380580358, "logps/rejected": -348.43255615234375, "loss": 0.5339, "rewards/chosen": -0.40136286190577913, "rewards/margins": 2.013493095125471, "rewards/rejected": -2.41485595703125, "step": 5408 }, { "epoch": 0.28669864574775394, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -98396480.0, "logits/rejected": -85466841.6, "logps/chosen": -326.2855631510417, "logps/rejected": -568.673583984375, "loss": 0.2282, "rewards/chosen": 0.15350308020909628, "rewards/margins": 2.6253767450650534, "rewards/rejected": -2.471873664855957, "step": 5409 }, { "epoch": 0.2867516497495561, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27819696.0, "logits/rejected": -37768696.0, "logps/chosen": -316.0069274902344, "logps/rejected": -96.65055847167969, "loss": 0.2596, "rewards/chosen": 0.6875666975975037, "rewards/margins": 3.0577520728111267, "rewards/rejected": -2.370185375213623, "step": 5410 }, { "epoch": 0.2868046537513582, "grad_norm": 45.0, "kl": 0.2074413299560547, "learning_rate": 5e-07, "logits/chosen": -10634654.4, "logits/rejected": -23516306.666666668, "logps/chosen": -206.1905517578125, "logps/rejected": -238.6661376953125, "loss": 0.3138, "rewards/chosen": 0.5145825386047364, "rewards/margins": 2.1944844563802084, "rewards/rejected": -1.679901917775472, "step": 5411 }, { "epoch": 0.28685765775316036, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70092745.14285715, "logits/rejected": 9668754.0, "logps/chosen": -273.85191127232144, "logps/rejected": -45.7021369934082, "loss": 0.4731, "rewards/chosen": 0.12426517690931048, "rewards/margins": 0.4611971804073879, "rewards/rejected": -0.3369320034980774, "step": 5412 }, { "epoch": 0.2869106617549625, "grad_norm": 50.0, "kl": 0.4935264587402344, "learning_rate": 5e-07, "logits/chosen": -87668576.0, "logits/rejected": -5653090.0, "logps/chosen": -306.47462972005206, "logps/rejected": -132.08560791015626, "loss": 0.3377, "rewards/chosen": 0.11792552471160889, "rewards/margins": 1.661308264732361, "rewards/rejected": -1.543382740020752, "step": 5413 }, { "epoch": 0.28696366575676463, "grad_norm": 42.25, "kl": 1.058013916015625, "learning_rate": 5e-07, "logits/chosen": -37228377.6, "logits/rejected": -32236629.333333332, "logps/chosen": -291.95361328125, "logps/rejected": -440.029296875, "loss": 0.2648, "rewards/chosen": 0.7937524318695068, "rewards/margins": 3.218487024307251, "rewards/rejected": -2.424734592437744, "step": 5414 }, { "epoch": 0.28701666975856677, "grad_norm": 39.0, "kl": 0.54571533203125, "learning_rate": 5e-07, "logits/chosen": -34609408.0, "logits/rejected": -28721432.0, "logps/chosen": -306.237841796875, "logps/rejected": -367.7256673177083, "loss": 0.3098, "rewards/chosen": 0.41593046188354493, "rewards/margins": 2.8979872385660808, "rewards/rejected": -2.4820567766825357, "step": 5415 }, { "epoch": 0.2870696737603689, "grad_norm": 59.25, "kl": 2.735177993774414, "learning_rate": 5e-07, "logits/chosen": -13781687.0, "logits/rejected": -47678152.0, "logps/chosen": -390.22784423828125, "logps/rejected": -323.2122802734375, "loss": 0.2956, "rewards/chosen": 0.524694561958313, "rewards/margins": 2.8600763082504272, "rewards/rejected": -2.3353817462921143, "step": 5416 }, { "epoch": 0.28712267776217104, "grad_norm": 46.5, "kl": 0.7788572311401367, "learning_rate": 5e-07, "logits/chosen": -1539364.75, "logits/rejected": -13453698.0, "logps/chosen": -35.15083312988281, "logps/rejected": -280.81927490234375, "loss": 0.3341, "rewards/chosen": 0.16454191505908966, "rewards/margins": 2.105880096554756, "rewards/rejected": -1.9413381814956665, "step": 5417 }, { "epoch": 0.2871756817639732, "grad_norm": 60.5, "kl": 0.2848243713378906, "learning_rate": 5e-07, "logits/chosen": -3036567.5, "logits/rejected": -50833083.428571425, "logps/chosen": -57.31172180175781, "logps/rejected": -497.97774832589283, "loss": 0.2342, "rewards/chosen": -0.208760067820549, "rewards/margins": 1.4443648776837759, "rewards/rejected": -1.6531249455043249, "step": 5418 }, { "epoch": 0.2872286857657753, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12331128.8, "logits/rejected": -1423081.3333333333, "logps/chosen": -328.757666015625, "logps/rejected": -163.32844034830728, "loss": 0.3892, "rewards/chosen": 0.21218531131744384, "rewards/margins": 1.1065205017725626, "rewards/rejected": -0.8943351904551188, "step": 5419 }, { "epoch": 0.28728168976757745, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17368293.333333332, "logits/rejected": 52720476.0, "logps/chosen": -266.6094970703125, "logps/rejected": -506.389892578125, "loss": 0.3878, "rewards/chosen": 0.16836758454640707, "rewards/margins": 1.840284784634908, "rewards/rejected": -1.671917200088501, "step": 5420 }, { "epoch": 0.2873346937693796, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46549752.0, "logits/rejected": -11572688.0, "logps/chosen": -755.916748046875, "logps/rejected": -219.65962727864584, "loss": 0.2376, "rewards/chosen": 0.6434417366981506, "rewards/margins": 2.13831224044164, "rewards/rejected": -1.4948705037434895, "step": 5421 }, { "epoch": 0.2873876977711817, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38286234.666666664, "logits/rejected": -33731584.0, "logps/chosen": -216.11651611328125, "logps/rejected": -260.037890625, "loss": 0.283, "rewards/chosen": 0.2201684315999349, "rewards/margins": 2.0121087392171226, "rewards/rejected": -1.7919403076171876, "step": 5422 }, { "epoch": 0.28744070177298386, "grad_norm": 41.0, "kl": 0.5082454681396484, "learning_rate": 5e-07, "logits/chosen": 6404996.0, "logits/rejected": -58209664.0, "logps/chosen": -80.76392364501953, "logps/rejected": -403.1258544921875, "loss": 0.2241, "rewards/chosen": 0.5482972860336304, "rewards/margins": 2.4548289378484087, "rewards/rejected": -1.9065316518147786, "step": 5423 }, { "epoch": 0.287493705774786, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -98239232.0, "logits/rejected": -15555608.0, "logps/chosen": -679.4632568359375, "logps/rejected": -336.97837611607144, "loss": 0.2756, "rewards/chosen": -0.018524169921875, "rewards/margins": 1.4682543618338448, "rewards/rejected": -1.4867785317557198, "step": 5424 }, { "epoch": 0.28754670977658814, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33150118.4, "logits/rejected": -35653178.666666664, "logps/chosen": -272.7821044921875, "logps/rejected": -561.572021484375, "loss": 0.3484, "rewards/chosen": 0.04637696743011475, "rewards/margins": 2.319878681500753, "rewards/rejected": -2.273501714070638, "step": 5425 }, { "epoch": 0.2875997137783903, "grad_norm": 65.0, "kl": 0.49798583984375, "learning_rate": 5e-07, "logits/chosen": -3631789.3333333335, "logits/rejected": -20680206.4, "logps/chosen": -610.3033854166666, "logps/rejected": -464.272265625, "loss": 0.2123, "rewards/chosen": 0.9925119876861572, "rewards/margins": 2.87143235206604, "rewards/rejected": -1.8789203643798829, "step": 5426 }, { "epoch": 0.2876527177801924, "grad_norm": 51.0, "kl": 1.3337783813476562, "learning_rate": 5e-07, "logits/chosen": -23340928.0, "logits/rejected": -59082164.0, "logps/chosen": -369.0068359375, "logps/rejected": -246.05201721191406, "loss": 0.3393, "rewards/chosen": 0.6790379683176676, "rewards/margins": 2.363258878389994, "rewards/rejected": -1.6842209100723267, "step": 5427 }, { "epoch": 0.28770572178199455, "grad_norm": 56.0, "kl": 0.37058258056640625, "learning_rate": 5e-07, "logits/chosen": -45550981.333333336, "logits/rejected": 856910.25, "logps/chosen": -280.53379313151044, "logps/rejected": -110.88986206054688, "loss": 0.3994, "rewards/chosen": 0.37042927742004395, "rewards/margins": 1.1498180627822876, "rewards/rejected": -0.7793887853622437, "step": 5428 }, { "epoch": 0.2877587257837967, "grad_norm": 49.0, "kl": 0.31073760986328125, "learning_rate": 5e-07, "logits/chosen": -32070492.0, "logits/rejected": 4486118.0, "logps/chosen": -189.2793426513672, "logps/rejected": -86.8124008178711, "loss": 0.4008, "rewards/chosen": -0.035482585430145264, "rewards/margins": 1.0758034586906433, "rewards/rejected": -1.1112860441207886, "step": 5429 }, { "epoch": 0.2878117297855988, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 68899317.33333333, "logits/rejected": -36445126.4, "logps/chosen": -283.7396240234375, "logps/rejected": -269.76943359375, "loss": 0.2955, "rewards/chosen": -0.17535221576690674, "rewards/margins": 1.8819334745407104, "rewards/rejected": -2.057285690307617, "step": 5430 }, { "epoch": 0.28786473378740096, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52878172.0, "logits/rejected": -18180642.0, "logps/chosen": -232.98764038085938, "logps/rejected": -171.92367553710938, "loss": 0.2946, "rewards/chosen": 0.14635199308395386, "rewards/margins": 2.2719127535820007, "rewards/rejected": -2.125560760498047, "step": 5431 }, { "epoch": 0.2879177377892031, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82844921.6, "logits/rejected": -18209582.666666668, "logps/chosen": -393.1937255859375, "logps/rejected": -294.3225911458333, "loss": 0.3341, "rewards/chosen": 0.11171293258666992, "rewards/margins": 2.5787660280863443, "rewards/rejected": -2.4670530954996743, "step": 5432 }, { "epoch": 0.28797074179100524, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30094253.333333332, "logits/rejected": -28121500.8, "logps/chosen": -396.9938151041667, "logps/rejected": -355.42998046875, "loss": 0.1894, "rewards/chosen": 1.0388755003611247, "rewards/margins": 3.2200799147288004, "rewards/rejected": -2.181204414367676, "step": 5433 }, { "epoch": 0.2880237457928074, "grad_norm": 56.75, "kl": 1.0238265991210938, "learning_rate": 5e-07, "logits/chosen": -33173286.0, "logits/rejected": -8960002.0, "logps/chosen": -471.9802551269531, "logps/rejected": -267.98291015625, "loss": 0.374, "rewards/chosen": 0.15834274888038635, "rewards/margins": 1.3986301720142365, "rewards/rejected": -1.24028742313385, "step": 5434 }, { "epoch": 0.2880767497946095, "grad_norm": 42.0, "kl": 0.46875572204589844, "learning_rate": 5e-07, "logits/chosen": -2236799.8, "logits/rejected": -11319302.666666666, "logps/chosen": -263.4937744140625, "logps/rejected": -150.8176472981771, "loss": 0.331, "rewards/chosen": 0.5363712310791016, "rewards/margins": 1.793179194132487, "rewards/rejected": -1.2568079630533855, "step": 5435 }, { "epoch": 0.28812975379641165, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27884632.0, "logits/rejected": -22514112.0, "logps/chosen": -75.51229095458984, "logps/rejected": -301.37400309244794, "loss": 0.2986, "rewards/chosen": -0.2736763060092926, "rewards/margins": 1.3558228115240734, "rewards/rejected": -1.629499117533366, "step": 5436 }, { "epoch": 0.2881827577982138, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53121460.0, "logits/rejected": -25752014.0, "logps/chosen": -302.6993713378906, "logps/rejected": -407.88909912109375, "loss": 0.2332, "rewards/chosen": 0.35298845171928406, "rewards/margins": 3.78342267870903, "rewards/rejected": -3.430434226989746, "step": 5437 }, { "epoch": 0.2882357618000159, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10066071.0, "logits/rejected": -96172128.0, "logps/chosen": -372.2275390625, "logps/rejected": -253.96453857421875, "loss": 0.306, "rewards/chosen": 0.22152766585350037, "rewards/margins": 2.2348895967006683, "rewards/rejected": -2.013361930847168, "step": 5438 }, { "epoch": 0.28828876580181806, "grad_norm": 68.0, "kl": 0.47884368896484375, "learning_rate": 5e-07, "logits/chosen": -64105676.8, "logits/rejected": -27770834.666666668, "logps/chosen": -628.90458984375, "logps/rejected": -716.1791178385416, "loss": 0.2981, "rewards/chosen": 0.4578222751617432, "rewards/margins": 2.644987694422404, "rewards/rejected": -2.1871654192606607, "step": 5439 }, { "epoch": 0.2883417698036202, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8464854.0, "logits/rejected": -26812344.0, "logps/chosen": -149.53805541992188, "logps/rejected": -197.5211181640625, "loss": 0.3214, "rewards/chosen": 0.6313344637552897, "rewards/margins": 1.6516681353251137, "rewards/rejected": -1.0203336715698241, "step": 5440 }, { "epoch": 0.28839477380542233, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62663920.0, "logits/rejected": 27075216.0, "logps/chosen": -382.2919006347656, "logps/rejected": -479.64398193359375, "loss": 0.2719, "rewards/chosen": 0.4427505433559418, "rewards/margins": 4.193575471639633, "rewards/rejected": -3.7508249282836914, "step": 5441 }, { "epoch": 0.28844777780722447, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6924508.5, "logits/rejected": -46991274.666666664, "logps/chosen": -243.81076049804688, "logps/rejected": -408.3537190755208, "loss": 0.2193, "rewards/chosen": -0.0015485761687159538, "rewards/margins": 2.248595269707342, "rewards/rejected": -2.250143845876058, "step": 5442 }, { "epoch": 0.2885007818090266, "grad_norm": 44.25, "kl": 0.8025493621826172, "learning_rate": 5e-07, "logits/chosen": -45121836.0, "logits/rejected": -34001584.0, "logps/chosen": -453.9506530761719, "logps/rejected": -405.37591552734375, "loss": 0.2463, "rewards/chosen": 0.9145575165748596, "rewards/margins": 2.723121464252472, "rewards/rejected": -1.8085639476776123, "step": 5443 }, { "epoch": 0.28855378581082874, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3077521.6, "logits/rejected": -40938986.666666664, "logps/chosen": -239.477783203125, "logps/rejected": -464.2464192708333, "loss": 0.3475, "rewards/chosen": 0.41796231269836426, "rewards/margins": 1.9072201251983643, "rewards/rejected": -1.4892578125, "step": 5444 }, { "epoch": 0.2886067898126308, "grad_norm": 42.25, "kl": 0.26085662841796875, "learning_rate": 5e-07, "logits/chosen": -10666512.0, "logits/rejected": -33324710.4, "logps/chosen": -182.85115559895834, "logps/rejected": -282.423828125, "loss": 0.3041, "rewards/chosen": 0.019239674011866253, "rewards/margins": 1.792160854736964, "rewards/rejected": -1.7729211807250977, "step": 5445 }, { "epoch": 0.28865979381443296, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3065474.5, "logits/rejected": -29849838.0, "logps/chosen": -183.52407836914062, "logps/rejected": -493.3051452636719, "loss": 0.2382, "rewards/chosen": 0.5172033309936523, "rewards/margins": 3.1013035774230957, "rewards/rejected": -2.5841002464294434, "step": 5446 }, { "epoch": 0.2887127978162351, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5756198.666666667, "logits/rejected": -63489491.2, "logps/chosen": -179.43074544270834, "logps/rejected": -605.99267578125, "loss": 0.2119, "rewards/chosen": 0.33837711811065674, "rewards/margins": 4.079710507392884, "rewards/rejected": -3.7413333892822265, "step": 5447 }, { "epoch": 0.28876580181803724, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34322136.0, "logits/rejected": -45585988.0, "logps/chosen": -298.30572509765625, "logps/rejected": -419.8658752441406, "loss": 0.3307, "rewards/chosen": 0.36153872807820636, "rewards/margins": 3.017848332722982, "rewards/rejected": -2.6563096046447754, "step": 5448 }, { "epoch": 0.2888188058198394, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1739386.5, "logits/rejected": -35967826.666666664, "logps/chosen": -32.96360397338867, "logps/rejected": -592.9253336588541, "loss": 0.211, "rewards/chosen": 0.3359992206096649, "rewards/margins": 2.879264881213506, "rewards/rejected": -2.5432656606038413, "step": 5449 }, { "epoch": 0.2888718098216415, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81651904.0, "logits/rejected": -46265392.0, "logps/chosen": -172.3131866455078, "logps/rejected": -411.0042724609375, "loss": 0.3289, "rewards/chosen": 0.21111497282981873, "rewards/margins": 2.2534380853176117, "rewards/rejected": -2.042323112487793, "step": 5450 }, { "epoch": 0.28892481382344365, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6048597.5, "logits/rejected": -34732341.333333336, "logps/chosen": -87.5150146484375, "logps/rejected": -427.0098470052083, "loss": 0.2698, "rewards/chosen": -0.32236120104789734, "rewards/margins": 1.5937733153502147, "rewards/rejected": -1.916134516398112, "step": 5451 }, { "epoch": 0.2889778178252458, "grad_norm": 53.25, "kl": 3.11313533782959, "learning_rate": 5e-07, "logits/chosen": -14621854.666666666, "logits/rejected": -13632865.0, "logps/chosen": -448.97802734375, "logps/rejected": -361.7712707519531, "loss": 0.3594, "rewards/chosen": 0.7411959171295166, "rewards/margins": 2.561823606491089, "rewards/rejected": -1.8206276893615723, "step": 5452 }, { "epoch": 0.2890308218270479, "grad_norm": 52.5, "kl": 1.4349899291992188, "learning_rate": 5e-07, "logits/chosen": -10162664.0, "logits/rejected": -13498526.0, "logps/chosen": -201.57933044433594, "logps/rejected": -408.8338623046875, "loss": 0.2545, "rewards/chosen": 0.5557026863098145, "rewards/margins": 3.177882194519043, "rewards/rejected": -2.6221795082092285, "step": 5453 }, { "epoch": 0.28908382582885006, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6141028.5, "logits/rejected": -10116661.0, "logps/chosen": -283.5106506347656, "logps/rejected": -250.53030395507812, "loss": 0.2804, "rewards/chosen": 0.4087657034397125, "rewards/margins": 2.4495257437229156, "rewards/rejected": -2.040760040283203, "step": 5454 }, { "epoch": 0.2891368298306522, "grad_norm": 138.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24898412.8, "logits/rejected": -33713005.333333336, "logps/chosen": -373.872314453125, "logps/rejected": -274.19744873046875, "loss": 0.3812, "rewards/chosen": 0.008878028392791748, "rewards/margins": 1.5473618070284527, "rewards/rejected": -1.5384837786356609, "step": 5455 }, { "epoch": 0.28918983383245433, "grad_norm": 58.75, "kl": 0.6615867614746094, "learning_rate": 5e-07, "logits/chosen": -43007532.8, "logits/rejected": -46970789.333333336, "logps/chosen": -341.475, "logps/rejected": -178.99222819010416, "loss": 0.4388, "rewards/chosen": 0.010350185632705688, "rewards/margins": 0.8791337390740713, "rewards/rejected": -0.8687835534413656, "step": 5456 }, { "epoch": 0.28924283783425647, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38588408.0, "logits/rejected": -33980708.0, "logps/chosen": -298.3667907714844, "logps/rejected": -318.8837585449219, "loss": 0.286, "rewards/chosen": 0.3289226293563843, "rewards/margins": 2.6825608015060425, "rewards/rejected": -2.353638172149658, "step": 5457 }, { "epoch": 0.2892958418360586, "grad_norm": 48.25, "kl": 0.6168251037597656, "learning_rate": 5e-07, "logits/chosen": 5253200.0, "logits/rejected": -303188.5, "logps/chosen": -288.74384765625, "logps/rejected": -109.46614583333333, "loss": 0.3503, "rewards/chosen": 0.15422365665435792, "rewards/margins": 2.3914183537165323, "rewards/rejected": -2.2371946970621743, "step": 5458 }, { "epoch": 0.28934884583786075, "grad_norm": 56.0, "kl": 1.6385841369628906, "learning_rate": 5e-07, "logits/chosen": -57630581.333333336, "logits/rejected": -59058918.4, "logps/chosen": -957.5933430989584, "logps/rejected": -295.831640625, "loss": 0.2798, "rewards/chosen": 1.1020264625549316, "rewards/margins": 2.3536032676696776, "rewards/rejected": -1.251576805114746, "step": 5459 }, { "epoch": 0.2894018498396629, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40004728.0, "logits/rejected": 6257570.5, "logps/chosen": -340.7958068847656, "logps/rejected": -215.069580078125, "loss": 0.2711, "rewards/chosen": 0.8606737852096558, "rewards/margins": 2.0684698820114136, "rewards/rejected": -1.2077960968017578, "step": 5460 }, { "epoch": 0.289454853841465, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16743810.666666666, "logits/rejected": -43864454.4, "logps/chosen": -162.8470662434896, "logps/rejected": -546.7431640625, "loss": 0.285, "rewards/chosen": 0.0458150307337443, "rewards/margins": 2.7228314797083537, "rewards/rejected": -2.6770164489746096, "step": 5461 }, { "epoch": 0.28950785784326716, "grad_norm": 64.0, "kl": 0.6468048095703125, "learning_rate": 5e-07, "logits/chosen": -34018601.14285714, "logits/rejected": 26551770.0, "logps/chosen": -413.4867466517857, "logps/rejected": -185.7032928466797, "loss": 0.4538, "rewards/chosen": 0.17037994521004812, "rewards/margins": 1.099273988178798, "rewards/rejected": -0.92889404296875, "step": 5462 }, { "epoch": 0.2895608618450693, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43883446.4, "logits/rejected": -16003664.0, "logps/chosen": -316.057666015625, "logps/rejected": -520.3227945963541, "loss": 0.3583, "rewards/chosen": 0.18547736406326293, "rewards/margins": 1.8021125276883443, "rewards/rejected": -1.6166351636250813, "step": 5463 }, { "epoch": 0.28961386584687143, "grad_norm": 45.75, "kl": 0.2993354797363281, "learning_rate": 5e-07, "logits/chosen": -26569128.0, "logits/rejected": -27671306.0, "logps/chosen": -215.7540283203125, "logps/rejected": -344.0500793457031, "loss": 0.3042, "rewards/chosen": 0.2707352638244629, "rewards/margins": 2.159781336784363, "rewards/rejected": -1.8890460729599, "step": 5464 }, { "epoch": 0.28966686984867357, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19481146.666666668, "logits/rejected": -29350500.0, "logps/chosen": -262.663818359375, "logps/rejected": -446.650390625, "loss": 0.3893, "rewards/chosen": 0.15692255894343057, "rewards/margins": 1.728670100371043, "rewards/rejected": -1.5717475414276123, "step": 5465 }, { "epoch": 0.2897198738504757, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42530701.333333336, "logits/rejected": -33896220.0, "logps/chosen": -265.83461507161456, "logps/rejected": -483.0404357910156, "loss": 0.3977, "rewards/chosen": 0.009840048849582672, "rewards/margins": 2.2357709780335426, "rewards/rejected": -2.22593092918396, "step": 5466 }, { "epoch": 0.28977287785227784, "grad_norm": 35.0, "kl": 0.17490196228027344, "learning_rate": 5e-07, "logits/chosen": -2177709.8, "logits/rejected": -11865129.333333334, "logps/chosen": -200.3518310546875, "logps/rejected": -209.50482177734375, "loss": 0.3423, "rewards/chosen": 0.3302318096160889, "rewards/margins": 2.9369314352671303, "rewards/rejected": -2.6066996256510415, "step": 5467 }, { "epoch": 0.28982588185408, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18884852.0, "logits/rejected": -14558296.0, "logps/chosen": -175.710205078125, "logps/rejected": -348.8463541666667, "loss": 0.1934, "rewards/chosen": 0.6417327523231506, "rewards/margins": 2.578304986159007, "rewards/rejected": -1.9365722338358562, "step": 5468 }, { "epoch": 0.2898788858558821, "grad_norm": 39.75, "kl": 0.35016918182373047, "learning_rate": 5e-07, "logits/chosen": 891274.0, "logits/rejected": -24303979.2, "logps/chosen": -173.26629638671875, "logps/rejected": -367.4456298828125, "loss": 0.2429, "rewards/chosen": 0.3647252321243286, "rewards/margins": 2.425797200202942, "rewards/rejected": -2.0610719680786134, "step": 5469 }, { "epoch": 0.28993188985768426, "grad_norm": 29.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2868886.75, "logits/rejected": 17522398.0, "logps/chosen": -24.62563705444336, "logps/rejected": -137.34530639648438, "loss": 0.2813, "rewards/chosen": 0.6296705007553101, "rewards/margins": 2.027154326438904, "rewards/rejected": -1.3974838256835938, "step": 5470 }, { "epoch": 0.2899848938594864, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26424352.0, "logits/rejected": 44179544.0, "logps/chosen": -455.6207580566406, "logps/rejected": -460.836181640625, "loss": 0.1881, "rewards/chosen": 0.9932823181152344, "rewards/margins": 3.454280376434326, "rewards/rejected": -2.460998058319092, "step": 5471 }, { "epoch": 0.29003789786128853, "grad_norm": 45.5, "kl": 0.5346031188964844, "learning_rate": 5e-07, "logits/chosen": -38878298.666666664, "logits/rejected": -15366188.8, "logps/chosen": -378.6572672526042, "logps/rejected": -125.35738525390624, "loss": 0.3731, "rewards/chosen": 0.06260630488395691, "rewards/margins": 1.0470350921154021, "rewards/rejected": -0.9844287872314453, "step": 5472 }, { "epoch": 0.29009090186309067, "grad_norm": 49.75, "kl": 0.9266204833984375, "learning_rate": 5e-07, "logits/chosen": -41944300.0, "logits/rejected": -37731464.0, "logps/chosen": -319.13177490234375, "logps/rejected": -311.7384338378906, "loss": 0.2817, "rewards/chosen": 0.194634810090065, "rewards/margins": 2.4351798444986343, "rewards/rejected": -2.2405450344085693, "step": 5473 }, { "epoch": 0.2901439058648928, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55396752.0, "logits/rejected": -32834992.0, "logps/chosen": -335.67913818359375, "logps/rejected": -349.02294921875, "loss": 0.2312, "rewards/chosen": 0.5802410244941711, "rewards/margins": 3.1909154057502747, "rewards/rejected": -2.6106743812561035, "step": 5474 }, { "epoch": 0.29019690986669494, "grad_norm": 69.0, "kl": 1.0380973815917969, "learning_rate": 5e-07, "logits/chosen": 2330698.0, "logits/rejected": 21900754.666666668, "logps/chosen": -352.6341796875, "logps/rejected": -385.0797526041667, "loss": 0.2963, "rewards/chosen": 0.9725377082824707, "rewards/margins": 2.2985910892486574, "rewards/rejected": -1.3260533809661865, "step": 5475 }, { "epoch": 0.2902499138684971, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10137485.6, "logits/rejected": -26380538.666666668, "logps/chosen": -209.3741455078125, "logps/rejected": -465.7490234375, "loss": 0.3218, "rewards/chosen": 0.31670894622802737, "rewards/margins": 3.7150091171264648, "rewards/rejected": -3.3983001708984375, "step": 5476 }, { "epoch": 0.2903029178702992, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46791504.0, "logits/rejected": -19076442.0, "logps/chosen": -275.8336181640625, "logps/rejected": -238.49371337890625, "loss": 0.4036, "rewards/chosen": -0.06790351867675781, "rewards/margins": 0.7737586498260498, "rewards/rejected": -0.8416621685028076, "step": 5477 }, { "epoch": 0.29035592187210135, "grad_norm": 46.25, "kl": 1.9204483032226562, "learning_rate": 5e-07, "logits/chosen": -51129302.4, "logits/rejected": -28182330.666666668, "logps/chosen": -985.0181640625, "logps/rejected": -239.76373291015625, "loss": 0.2874, "rewards/chosen": 0.8689910888671875, "rewards/margins": 2.911082458496094, "rewards/rejected": -2.0420913696289062, "step": 5478 }, { "epoch": 0.2904089258739035, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -74726848.0, "logps/rejected": -391.8966064453125, "loss": 0.1319, "rewards/rejected": -2.1005301475524902, "step": 5479 }, { "epoch": 0.2904619298757056, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52979140.0, "logits/rejected": -16428725.333333334, "logps/chosen": -447.99365234375, "logps/rejected": -269.57177734375, "loss": 0.303, "rewards/chosen": 0.19124145805835724, "rewards/margins": 1.4209460765123367, "rewards/rejected": -1.2297046184539795, "step": 5480 }, { "epoch": 0.29051493387750776, "grad_norm": 38.5, "kl": 0.9728889465332031, "learning_rate": 5e-07, "logits/chosen": -18435218.0, "logits/rejected": -50798776.0, "logps/chosen": -162.41624450683594, "logps/rejected": -286.079345703125, "loss": 0.3252, "rewards/chosen": 0.09402203559875488, "rewards/margins": 2.622692823410034, "rewards/rejected": -2.5286707878112793, "step": 5481 }, { "epoch": 0.2905679378793099, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42808147.2, "logits/rejected": -10015877.333333334, "logps/chosen": -133.2275390625, "logps/rejected": -151.1164347330729, "loss": 0.3925, "rewards/chosen": 0.08868011832237244, "rewards/margins": 1.31910436352094, "rewards/rejected": -1.2304242451985676, "step": 5482 }, { "epoch": 0.29062094188111204, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23186234.0, "logits/rejected": -30329546.0, "logps/chosen": -356.768798828125, "logps/rejected": -190.49624633789062, "loss": 0.2991, "rewards/chosen": 0.5387169122695923, "rewards/margins": 1.9392879009246826, "rewards/rejected": -1.4005709886550903, "step": 5483 }, { "epoch": 0.2906739458829142, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25196041.6, "logits/rejected": -23524258.666666668, "logps/chosen": -198.287939453125, "logps/rejected": -208.135986328125, "loss": 0.3362, "rewards/chosen": 0.2291868209838867, "rewards/margins": 2.200539557139079, "rewards/rejected": -1.9713527361551921, "step": 5484 }, { "epoch": 0.2907269498847163, "grad_norm": 56.25, "kl": 0.11546516418457031, "learning_rate": 5e-07, "logits/chosen": -28841602.666666668, "logits/rejected": -14054135.0, "logps/chosen": -307.8853759765625, "logps/rejected": -304.3590087890625, "loss": 0.3717, "rewards/chosen": 0.3129210074742635, "rewards/margins": 1.905681570370992, "rewards/rejected": -1.5927605628967285, "step": 5485 }, { "epoch": 0.29077995388651845, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10137230.4, "logits/rejected": -41422320.0, "logps/chosen": -167.89150390625, "logps/rejected": -423.962158203125, "loss": 0.3516, "rewards/chosen": 0.22072949409484863, "rewards/margins": 2.39365226427714, "rewards/rejected": -2.1729227701822915, "step": 5486 }, { "epoch": 0.2908329578883206, "grad_norm": 56.25, "kl": 0.118316650390625, "learning_rate": 5e-07, "logits/chosen": -63321088.0, "logits/rejected": -11907502.0, "logps/chosen": -375.16204833984375, "logps/rejected": -165.7490997314453, "loss": 0.2743, "rewards/chosen": 0.3857215940952301, "rewards/margins": 2.4806341230869293, "rewards/rejected": -2.094912528991699, "step": 5487 }, { "epoch": 0.2908859618901227, "grad_norm": 54.5, "kl": 0.2764854431152344, "learning_rate": 5e-07, "logits/chosen": -26450750.0, "logits/rejected": -11906370.0, "logps/chosen": -244.18630981445312, "logps/rejected": -238.1529541015625, "loss": 0.3281, "rewards/chosen": 0.20048736035823822, "rewards/margins": 1.8294056504964828, "rewards/rejected": -1.6289182901382446, "step": 5488 }, { "epoch": 0.29093896589192486, "grad_norm": 65.0, "kl": 0.6156730651855469, "learning_rate": 5e-07, "logits/chosen": -35234592.0, "logits/rejected": -10370452.0, "logps/chosen": -424.65771484375, "logps/rejected": -196.83272298177084, "loss": 0.2974, "rewards/chosen": 0.6272073745727539, "rewards/margins": 2.054533576965332, "rewards/rejected": -1.4273262023925781, "step": 5489 }, { "epoch": 0.290991969893727, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31436037.333333332, "logits/rejected": -42656233.6, "logps/chosen": -521.8851725260416, "logps/rejected": -352.646240234375, "loss": 0.2329, "rewards/chosen": 0.04610186815261841, "rewards/margins": 2.8194584965705873, "rewards/rejected": -2.773356628417969, "step": 5490 }, { "epoch": 0.29104497389552914, "grad_norm": 62.0, "kl": 0.5815658569335938, "learning_rate": 5e-07, "logits/chosen": 47222069.333333336, "logits/rejected": -45656518.4, "logps/chosen": -448.498046875, "logps/rejected": -160.97852783203126, "loss": 0.3689, "rewards/chosen": 0.16208191712697348, "rewards/margins": 1.1240336497624714, "rewards/rejected": -0.961951732635498, "step": 5491 }, { "epoch": 0.2910979778973313, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26197576.0, "logits/rejected": -11998156.0, "logps/chosen": -213.7579833984375, "logps/rejected": -292.91798909505206, "loss": 0.2775, "rewards/chosen": 0.6737519264221191, "rewards/margins": 2.6254079500834147, "rewards/rejected": -1.9516560236612956, "step": 5492 }, { "epoch": 0.2911509818991334, "grad_norm": 66.0, "kl": 1.962594985961914, "learning_rate": 5e-07, "logits/chosen": -81246496.0, "logits/rejected": -12885740.0, "logps/chosen": -1018.958251953125, "logps/rejected": -364.581787109375, "loss": 0.2361, "rewards/chosen": 1.5272995233535767, "rewards/margins": 3.553106427192688, "rewards/rejected": -2.0258069038391113, "step": 5493 }, { "epoch": 0.29120398590093555, "grad_norm": 54.5, "kl": 0.1047821044921875, "learning_rate": 5e-07, "logits/chosen": 105596821.33333333, "logits/rejected": -35570073.6, "logps/chosen": -450.4707845052083, "logps/rejected": -323.3658447265625, "loss": 0.2291, "rewards/chosen": 0.32662634054819745, "rewards/margins": 2.673781148592631, "rewards/rejected": -2.3471548080444338, "step": 5494 }, { "epoch": 0.2912569899027377, "grad_norm": 50.5, "kl": 0.8281373977661133, "learning_rate": 5e-07, "logits/chosen": -10107567.0, "logits/rejected": -13728146.0, "logps/chosen": -445.2774658203125, "logps/rejected": -269.4516296386719, "loss": 0.2652, "rewards/chosen": 0.6616287231445312, "rewards/margins": 2.3352761268615723, "rewards/rejected": -1.673647403717041, "step": 5495 }, { "epoch": 0.29130999390453977, "grad_norm": 53.5, "kl": 0.5765590667724609, "learning_rate": 5e-07, "logits/chosen": -31239968.0, "logits/rejected": -20832060.0, "logps/chosen": -301.4478454589844, "logps/rejected": -197.85366821289062, "loss": 0.2997, "rewards/chosen": 0.5379959344863892, "rewards/margins": 2.0702621936798096, "rewards/rejected": -1.5322662591934204, "step": 5496 }, { "epoch": 0.2913629979063419, "grad_norm": 47.25, "kl": 0.242462158203125, "learning_rate": 5e-07, "logits/chosen": -31167928.0, "logits/rejected": -102531544.0, "logps/chosen": -388.61114501953125, "logps/rejected": -182.8555908203125, "loss": 0.2486, "rewards/chosen": 0.47409266233444214, "rewards/margins": 2.6701120734214783, "rewards/rejected": -2.196019411087036, "step": 5497 }, { "epoch": 0.29141600190814404, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33909008.0, "logits/rejected": -23628242.666666668, "logps/chosen": -474.862548828125, "logps/rejected": -233.57318115234375, "loss": 0.2067, "rewards/chosen": 0.41400301456451416, "rewards/margins": 2.3870272239049273, "rewards/rejected": -1.9730242093404133, "step": 5498 }, { "epoch": 0.2914690059099462, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29957746.666666668, "logits/rejected": -28361772.8, "logps/chosen": -390.5743815104167, "logps/rejected": -495.713427734375, "loss": 0.2939, "rewards/chosen": 0.1337514321009318, "rewards/margins": 2.3591058174769084, "rewards/rejected": -2.2253543853759767, "step": 5499 }, { "epoch": 0.2915220099117483, "grad_norm": 86.5, "kl": 0.40019989013671875, "learning_rate": 5e-07, "logits/chosen": -61618880.0, "logits/rejected": -19648788.0, "logps/chosen": -619.2091471354166, "logps/rejected": -163.13458251953125, "loss": 0.3691, "rewards/chosen": 0.3972483476003011, "rewards/margins": 1.7232497533162434, "rewards/rejected": -1.3260014057159424, "step": 5500 }, { "epoch": 0.29157501391355045, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24873578.0, "logits/rejected": -30523300.0, "logps/chosen": -292.2434997558594, "logps/rejected": -338.57147216796875, "loss": 0.2709, "rewards/chosen": 0.5327190160751343, "rewards/margins": 2.52340567111969, "rewards/rejected": -1.9906866550445557, "step": 5501 }, { "epoch": 0.2916280179153526, "grad_norm": 46.25, "kl": 0.03833198547363281, "learning_rate": 5e-07, "logits/chosen": -27761430.0, "logits/rejected": -40688868.0, "logps/chosen": -320.2553405761719, "logps/rejected": -380.6802978515625, "loss": 0.2849, "rewards/chosen": 0.01020926982164383, "rewards/margins": 3.095776505768299, "rewards/rejected": -3.0855672359466553, "step": 5502 }, { "epoch": 0.2916810219171547, "grad_norm": 48.75, "kl": 0.3108406066894531, "learning_rate": 5e-07, "logits/chosen": -17914765.333333332, "logits/rejected": -63365792.0, "logps/chosen": -449.450439453125, "logps/rejected": -629.6297607421875, "loss": 0.324, "rewards/chosen": 0.4159703254699707, "rewards/margins": 4.085228443145752, "rewards/rejected": -3.6692581176757812, "step": 5503 }, { "epoch": 0.29173402591895686, "grad_norm": 56.0, "kl": 1.2707901000976562, "learning_rate": 5e-07, "logits/chosen": -46802531.2, "logits/rejected": -31914517.333333332, "logps/chosen": -288.389990234375, "logps/rejected": -282.0913492838542, "loss": 0.2941, "rewards/chosen": 0.619644021987915, "rewards/margins": 2.708278036117554, "rewards/rejected": -2.0886340141296387, "step": 5504 }, { "epoch": 0.291787029920759, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51494644.0, "logits/rejected": -39440560.0, "logps/chosen": -559.88525390625, "logps/rejected": -689.5239868164062, "loss": 0.2948, "rewards/chosen": 0.12086941301822662, "rewards/margins": 4.011019244790077, "rewards/rejected": -3.8901498317718506, "step": 5505 }, { "epoch": 0.29184003392256114, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -559022.95, "logits/rejected": -10337834.666666666, "logps/chosen": -106.94276123046875, "logps/rejected": -186.44490559895834, "loss": 0.3853, "rewards/chosen": 0.2996324062347412, "rewards/margins": 1.4613079388936359, "rewards/rejected": -1.1616755326588948, "step": 5506 }, { "epoch": 0.2918930379243633, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -147164010.66666666, "logits/rejected": -18479585.6, "logps/chosen": -323.43251546223956, "logps/rejected": -246.668017578125, "loss": 0.3173, "rewards/chosen": -0.2106761932373047, "rewards/margins": 1.6338760375976562, "rewards/rejected": -1.844552230834961, "step": 5507 }, { "epoch": 0.2919460419261654, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34565668.0, "logits/rejected": -21840756.0, "logps/chosen": -194.26797485351562, "logps/rejected": -281.4765625, "loss": 0.3101, "rewards/chosen": 0.05672560632228851, "rewards/margins": 2.6965733617544174, "rewards/rejected": -2.639847755432129, "step": 5508 }, { "epoch": 0.29199904592796755, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -33614724.0, "logps/rejected": -356.1149597167969, "loss": 0.1229, "rewards/rejected": -2.0580191612243652, "step": 5509 }, { "epoch": 0.2920520499297697, "grad_norm": 55.75, "kl": 0.15538787841796875, "learning_rate": 5e-07, "logits/chosen": -23945510.4, "logits/rejected": -22438837.333333332, "logps/chosen": -272.3664306640625, "logps/rejected": -314.5005289713542, "loss": 0.3699, "rewards/chosen": 0.10855531692504883, "rewards/margins": 1.8117713928222656, "rewards/rejected": -1.7032160758972168, "step": 5510 }, { "epoch": 0.2921050539315718, "grad_norm": 46.25, "kl": 0.024808883666992188, "learning_rate": 5e-07, "logits/chosen": -14029024.0, "logits/rejected": -36679244.8, "logps/chosen": -175.74566650390625, "logps/rejected": -364.566162109375, "loss": 0.2907, "rewards/chosen": 0.012681325276692709, "rewards/margins": 1.7263451894124349, "rewards/rejected": -1.7136638641357422, "step": 5511 }, { "epoch": 0.29215805793337396, "grad_norm": 61.0, "kl": 0.387420654296875, "learning_rate": 5e-07, "logits/chosen": 35122421.333333336, "logits/rejected": 12819018.0, "logps/chosen": -280.4959716796875, "logps/rejected": -446.9928894042969, "loss": 0.3419, "rewards/chosen": 0.30987854798634845, "rewards/margins": 2.860170086224874, "rewards/rejected": -2.5502915382385254, "step": 5512 }, { "epoch": 0.2922110619351761, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26955660.8, "logits/rejected": -25899405.333333332, "logps/chosen": -177.5070556640625, "logps/rejected": -171.74591064453125, "loss": 0.3812, "rewards/chosen": 0.05449371337890625, "rewards/margins": 1.584748872121175, "rewards/rejected": -1.5302551587422688, "step": 5513 }, { "epoch": 0.29226406593697823, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11019813.333333334, "logits/rejected": -27790899.2, "logps/chosen": -88.9404296875, "logps/rejected": -431.806298828125, "loss": 0.2763, "rewards/chosen": 0.23647016286849976, "rewards/margins": 2.0568959593772886, "rewards/rejected": -1.820425796508789, "step": 5514 }, { "epoch": 0.29231706993878037, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5554088.666666667, "logits/rejected": -61602009.6, "logps/chosen": -66.09885660807292, "logps/rejected": -342.373779296875, "loss": 0.2912, "rewards/chosen": -0.13185322284698486, "rewards/margins": 2.2504385709762573, "rewards/rejected": -2.382291793823242, "step": 5515 }, { "epoch": 0.2923700739405825, "grad_norm": 55.0, "kl": 1.1466960906982422, "learning_rate": 5e-07, "logits/chosen": -38529850.666666664, "logits/rejected": -16166554.0, "logps/chosen": -313.70115152994794, "logps/rejected": -401.17584228515625, "loss": 0.3431, "rewards/chosen": 0.5223147471745809, "rewards/margins": 2.469842871030172, "rewards/rejected": -1.9475281238555908, "step": 5516 }, { "epoch": 0.29242307794238465, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1217343.25, "logits/rejected": -24701334.85714286, "logps/chosen": -187.4336395263672, "logps/rejected": -381.6881626674107, "loss": 0.1818, "rewards/chosen": -0.8313461542129517, "rewards/margins": 1.5701856102262224, "rewards/rejected": -2.401531764439174, "step": 5517 }, { "epoch": 0.2924760819441868, "grad_norm": 69.5, "kl": 0.37586021423339844, "learning_rate": 5e-07, "logits/chosen": -70861796.57142857, "logits/rejected": -33106328.0, "logps/chosen": -437.5711146763393, "logps/rejected": -257.2174987792969, "loss": 0.4423, "rewards/chosen": 0.13798639604023524, "rewards/margins": 1.7307445747511727, "rewards/rejected": -1.5927581787109375, "step": 5518 }, { "epoch": 0.2925290859459889, "grad_norm": 36.0, "kl": 0.9797697067260742, "learning_rate": 5e-07, "logits/chosen": -10978556.0, "logits/rejected": -12242344.0, "logps/chosen": -300.77154541015625, "logps/rejected": -247.87098693847656, "loss": 0.333, "rewards/chosen": 0.2768227756023407, "rewards/margins": 2.1341496407985687, "rewards/rejected": -1.857326865196228, "step": 5519 }, { "epoch": 0.29258208994779106, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56136104.0, "logits/rejected": -19844888.0, "logps/chosen": -317.58782958984375, "logps/rejected": -146.62802124023438, "loss": 0.2774, "rewards/chosen": 0.6248035430908203, "rewards/margins": 2.29959774017334, "rewards/rejected": -1.6747941970825195, "step": 5520 }, { "epoch": 0.2926350939495932, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27915974.4, "logits/rejected": -44877082.666666664, "logps/chosen": -353.227392578125, "logps/rejected": -552.4306640625, "loss": 0.275, "rewards/chosen": 0.39060163497924805, "rewards/margins": 3.491284211476644, "rewards/rejected": -3.100682576497396, "step": 5521 }, { "epoch": 0.29268809795139533, "grad_norm": 55.25, "kl": 0.5519771575927734, "learning_rate": 5e-07, "logits/chosen": -25607012.8, "logits/rejected": -11528460.0, "logps/chosen": -294.853759765625, "logps/rejected": -170.1519571940104, "loss": 0.3995, "rewards/chosen": 0.4225319862365723, "rewards/margins": 1.0266932090123495, "rewards/rejected": -0.6041612227757772, "step": 5522 }, { "epoch": 0.29274110195319747, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6784355.0, "logits/rejected": 9109475.333333334, "logps/chosen": -217.77227783203125, "logps/rejected": -613.9157307942709, "loss": 0.2007, "rewards/chosen": 0.29740676283836365, "rewards/margins": 3.015756219625473, "rewards/rejected": -2.7183494567871094, "step": 5523 }, { "epoch": 0.2927941059549996, "grad_norm": 94.5, "kl": 0.3716163635253906, "learning_rate": 5e-07, "logits/chosen": -9556182.0, "logps/chosen": -454.861328125, "loss": 0.4602, "rewards/chosen": 0.17737698554992676, "step": 5524 }, { "epoch": 0.29284710995680174, "grad_norm": 32.0, "kl": 0.15495681762695312, "learning_rate": 5e-07, "logits/chosen": -25506224.0, "logits/rejected": -128531123.2, "logps/chosen": -168.29684448242188, "logps/rejected": -353.9408203125, "loss": 0.1606, "rewards/chosen": 0.8909977277119955, "rewards/margins": 3.9577007611592614, "rewards/rejected": -3.066703033447266, "step": 5525 }, { "epoch": 0.2929001139586039, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5515039.333333333, "logits/rejected": -20635273.6, "logps/chosen": -108.18861897786458, "logps/rejected": -150.98857421875, "loss": 0.3023, "rewards/chosen": 0.27318722009658813, "rewards/margins": 1.6489617943763732, "rewards/rejected": -1.3757745742797851, "step": 5526 }, { "epoch": 0.292953117960406, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 85288800.0, "logits/rejected": -17098060.0, "logps/chosen": -621.5744018554688, "logps/rejected": -117.58068084716797, "loss": 0.2604, "rewards/chosen": 0.5997646450996399, "rewards/margins": 2.4254494309425354, "rewards/rejected": -1.8256847858428955, "step": 5527 }, { "epoch": 0.29300612196220815, "grad_norm": 66.0, "kl": 0.60797119140625, "learning_rate": 5e-07, "logits/chosen": -39113906.666666664, "logits/rejected": -44238297.6, "logps/chosen": -472.7203369140625, "logps/rejected": -325.4572509765625, "loss": 0.2993, "rewards/chosen": -0.030205026268959045, "rewards/margins": 1.7435078948736191, "rewards/rejected": -1.7737129211425782, "step": 5528 }, { "epoch": 0.2930591259640103, "grad_norm": 40.25, "kl": 0.0479736328125, "learning_rate": 5e-07, "logits/chosen": -2007025.0, "logits/rejected": -10443070.0, "logps/chosen": -164.65869140625, "logps/rejected": -117.00564575195312, "loss": 0.3358, "rewards/chosen": 0.1552681177854538, "rewards/margins": 1.6553489416837692, "rewards/rejected": -1.5000808238983154, "step": 5529 }, { "epoch": 0.29311212996581243, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27823520.0, "logits/rejected": -5823200.0, "logps/chosen": -342.757470703125, "logps/rejected": -179.8650105794271, "loss": 0.3981, "rewards/chosen": 0.1335902452468872, "rewards/margins": 1.2297643264134723, "rewards/rejected": -1.0961740811665852, "step": 5530 }, { "epoch": 0.29316513396761457, "grad_norm": 48.25, "kl": 0.26226234436035156, "learning_rate": 5e-07, "logits/chosen": -16301131.42857143, "logits/rejected": -70866488.0, "logps/chosen": -350.41476004464283, "logps/rejected": -838.5266723632812, "loss": 0.3079, "rewards/chosen": 0.7294230461120605, "rewards/margins": 4.674241304397583, "rewards/rejected": -3.9448182582855225, "step": 5531 }, { "epoch": 0.2932181379694167, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26435257.6, "logits/rejected": -41391082.666666664, "logps/chosen": -304.6209716796875, "logps/rejected": -298.7371419270833, "loss": 0.243, "rewards/chosen": 0.8418692588806153, "rewards/margins": 3.088483397165934, "rewards/rejected": -2.246614138285319, "step": 5532 }, { "epoch": 0.29327114197121884, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4301476.5, "logits/rejected": -1927072.1666666667, "logps/chosen": -36.25398635864258, "logps/rejected": -359.1846923828125, "loss": 0.2555, "rewards/chosen": -0.18947668373584747, "rewards/margins": 1.8594458550214767, "rewards/rejected": -2.048922538757324, "step": 5533 }, { "epoch": 0.293324145973021, "grad_norm": 54.0, "kl": 0.443206787109375, "learning_rate": 5e-07, "logits/chosen": -24860342.0, "logits/rejected": -9841618.0, "logps/chosen": -283.79150390625, "logps/rejected": -273.1520690917969, "loss": 0.3397, "rewards/chosen": 0.018358618021011353, "rewards/margins": 1.706815391778946, "rewards/rejected": -1.6884567737579346, "step": 5534 }, { "epoch": 0.2933771499748231, "grad_norm": 42.0, "kl": 0.26847076416015625, "learning_rate": 5e-07, "logits/chosen": -59686522.666666664, "logits/rejected": -25806347.2, "logps/chosen": -326.5229899088542, "logps/rejected": -242.70849609375, "loss": 0.2257, "rewards/chosen": 0.8916228612263998, "rewards/margins": 2.6147664388020835, "rewards/rejected": -1.7231435775756836, "step": 5535 }, { "epoch": 0.29343015397662525, "grad_norm": 57.0, "kl": 0.8099365234375, "learning_rate": 5e-07, "logits/chosen": -25305654.85714286, "logits/rejected": -13205582.0, "logps/chosen": -192.11432756696428, "logps/rejected": -99.40478515625, "loss": 0.4697, "rewards/chosen": 0.13162588221686228, "rewards/margins": 0.9247899566377913, "rewards/rejected": -0.793164074420929, "step": 5536 }, { "epoch": 0.2934831579784274, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18084792.0, "logits/rejected": -375091.5, "logps/chosen": -191.4063924153646, "logps/rejected": -123.43415069580078, "loss": 0.3686, "rewards/chosen": 0.20721284548441568, "rewards/margins": 2.4881874720255532, "rewards/rejected": -2.2809746265411377, "step": 5537 }, { "epoch": 0.2935361619802295, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27309964.8, "logits/rejected": -29436650.666666668, "logps/chosen": -388.4709228515625, "logps/rejected": -394.350830078125, "loss": 0.3497, "rewards/chosen": 0.003738558292388916, "rewards/margins": 2.416164871056875, "rewards/rejected": -2.412426312764486, "step": 5538 }, { "epoch": 0.29358916598203166, "grad_norm": 41.75, "kl": 0.061980247497558594, "learning_rate": 5e-07, "logits/chosen": -18344054.4, "logits/rejected": -17473926.666666668, "logps/chosen": -157.477197265625, "logps/rejected": -107.79702758789062, "loss": 0.3413, "rewards/chosen": 0.44319953918457033, "rewards/margins": 2.1761740684509276, "rewards/rejected": -1.7329745292663574, "step": 5539 }, { "epoch": 0.2936421699838338, "grad_norm": 43.75, "kl": 1.5703811645507812, "learning_rate": 5e-07, "logits/chosen": -5481711.0, "logits/rejected": -17098235.2, "logps/chosen": -266.27309163411456, "logps/rejected": -197.6227783203125, "loss": 0.3254, "rewards/chosen": 0.265164852142334, "rewards/margins": 1.7576231956481934, "rewards/rejected": -1.4924583435058594, "step": 5540 }, { "epoch": 0.29369517398563594, "grad_norm": 52.0, "kl": 0.05322074890136719, "learning_rate": 5e-07, "logits/chosen": -31017696.0, "logits/rejected": -25975166.0, "logps/chosen": -215.31778971354166, "logps/rejected": -312.2549743652344, "loss": 0.4648, "rewards/chosen": -0.20460081100463867, "rewards/margins": 1.3374170064926147, "rewards/rejected": -1.5420178174972534, "step": 5541 }, { "epoch": 0.2937481779874381, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 668681.0625, "logits/rejected": -24247134.0, "logps/chosen": -27.22150230407715, "logps/rejected": -289.72845458984375, "loss": 0.3128, "rewards/chosen": 0.23112353682518005, "rewards/margins": 1.923743098974228, "rewards/rejected": -1.6926195621490479, "step": 5542 }, { "epoch": 0.2938011819892402, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3196012.0, "logits/rejected": -29970144.0, "logps/chosen": -50.47259012858073, "logps/rejected": -418.65380859375, "loss": 0.2724, "rewards/chosen": -0.04254512985547384, "rewards/margins": 2.1127210597197212, "rewards/rejected": -2.1552661895751952, "step": 5543 }, { "epoch": 0.29385418599104235, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9298873.0, "logits/rejected": -31958876.0, "logps/chosen": -248.72750854492188, "logps/rejected": -292.4200439453125, "loss": 0.2196, "rewards/chosen": 0.7928045988082886, "rewards/margins": 2.8583353757858276, "rewards/rejected": -2.065530776977539, "step": 5544 }, { "epoch": 0.2939071899928445, "grad_norm": 60.0, "kl": 0.9764728546142578, "learning_rate": 5e-07, "logits/chosen": -35922368.0, "logits/rejected": -49543536.0, "logps/chosen": -366.886328125, "logps/rejected": -189.0020751953125, "loss": 0.3534, "rewards/chosen": 0.1999527096748352, "rewards/margins": 2.1098028302192686, "rewards/rejected": -1.9098501205444336, "step": 5545 }, { "epoch": 0.29396019399464657, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36970592.0, "logits/rejected": -1138010.5, "logps/chosen": -191.9873046875, "logps/rejected": -193.59848022460938, "loss": 0.4382, "rewards/chosen": -0.31958141922950745, "rewards/margins": 0.5730429589748383, "rewards/rejected": -0.8926243782043457, "step": 5546 }, { "epoch": 0.2940131979964487, "grad_norm": 53.0, "kl": 0.7047023773193359, "learning_rate": 5e-07, "logits/chosen": -17114846.4, "logits/rejected": -1132143.1666666667, "logps/chosen": -585.115771484375, "logps/rejected": -79.97137451171875, "loss": 0.3734, "rewards/chosen": 0.4002201557159424, "rewards/margins": 1.5900283654530842, "rewards/rejected": -1.1898082097371419, "step": 5547 }, { "epoch": 0.29406620199825084, "grad_norm": 45.5, "kl": 0.12353515625, "learning_rate": 5e-07, "logits/chosen": -112146768.0, "logits/rejected": -46076212.0, "logps/chosen": -416.7353820800781, "logps/rejected": -799.4036865234375, "loss": 0.2284, "rewards/chosen": 0.373704731464386, "rewards/margins": 3.7649171948432922, "rewards/rejected": -3.3912124633789062, "step": 5548 }, { "epoch": 0.294119206000053, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5508216.0, "logits/rejected": -51665957.333333336, "logps/chosen": -60.87742614746094, "logps/rejected": -426.3735758463542, "loss": 0.2081, "rewards/chosen": 0.10003739595413208, "rewards/margins": 2.281449774901072, "rewards/rejected": -2.18141237894694, "step": 5549 }, { "epoch": 0.2941722100018551, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14651520.0, "logits/rejected": 77081612.8, "logps/chosen": -1049.3763834635417, "logps/rejected": -259.390966796875, "loss": 0.2462, "rewards/chosen": 0.613056500752767, "rewards/margins": 2.5577624638875327, "rewards/rejected": -1.9447059631347656, "step": 5550 }, { "epoch": 0.29422521400365725, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14691028.0, "logits/rejected": -36985811.2, "logps/chosen": -425.8430582682292, "logps/rejected": -300.0610595703125, "loss": 0.1549, "rewards/chosen": 1.1311992009480794, "rewards/margins": 3.601011784871419, "rewards/rejected": -2.46981258392334, "step": 5551 }, { "epoch": 0.2942782180054594, "grad_norm": 45.75, "kl": 0.44527435302734375, "learning_rate": 5e-07, "logits/chosen": -32635513.6, "logits/rejected": -1113274.6666666667, "logps/chosen": -274.5064208984375, "logps/rejected": -64.51010131835938, "loss": 0.4262, "rewards/chosen": 0.012097054719924926, "rewards/margins": 1.0566867331663767, "rewards/rejected": -1.044589678446452, "step": 5552 }, { "epoch": 0.29433122200726153, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17846065.6, "logits/rejected": -27201224.0, "logps/chosen": -148.45673828125, "logps/rejected": -316.0153401692708, "loss": 0.3945, "rewards/chosen": -0.25878019332885743, "rewards/margins": 1.7898821512858072, "rewards/rejected": -2.0486623446146646, "step": 5553 }, { "epoch": 0.29438422600906367, "grad_norm": 44.0, "kl": 1.1583690643310547, "learning_rate": 5e-07, "logits/chosen": -34578768.0, "logits/rejected": -25434558.0, "logps/chosen": -298.25478108723956, "logps/rejected": -260.3680114746094, "loss": 0.3785, "rewards/chosen": 0.4252205689748128, "rewards/margins": 2.3594163258870444, "rewards/rejected": -1.9341957569122314, "step": 5554 }, { "epoch": 0.2944372300108658, "grad_norm": 47.25, "kl": 1.1338615417480469, "learning_rate": 5e-07, "logits/chosen": 13379567.0, "logits/rejected": 8794752.0, "logps/chosen": -217.5893096923828, "logps/rejected": -477.80242919921875, "loss": 0.3201, "rewards/chosen": 0.23846149444580078, "rewards/margins": 2.51351261138916, "rewards/rejected": -2.2750511169433594, "step": 5555 }, { "epoch": 0.29449023401266794, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51066128.0, "logits/rejected": -7254470.285714285, "logps/chosen": -473.7220458984375, "logps/rejected": -186.38959612165178, "loss": 0.2673, "rewards/chosen": -0.05411377176642418, "rewards/margins": 1.160455864987203, "rewards/rejected": -1.2145696367536272, "step": 5556 }, { "epoch": 0.2945432380144701, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26004133.333333332, "logits/rejected": 792263.4, "logps/chosen": -410.9667154947917, "logps/rejected": -298.7924072265625, "loss": 0.3655, "rewards/chosen": 0.2032827933629354, "rewards/margins": 1.3791597922643024, "rewards/rejected": -1.175876998901367, "step": 5557 }, { "epoch": 0.2945962420162722, "grad_norm": 65.0, "kl": 2.9198265075683594, "learning_rate": 5e-07, "logits/chosen": -32900050.285714287, "logits/rejected": -23089988.0, "logps/chosen": -620.3099888392857, "logps/rejected": -234.1229248046875, "loss": 0.4338, "rewards/chosen": 0.4255921500069754, "rewards/margins": 2.098259057317461, "rewards/rejected": -1.6726669073104858, "step": 5558 }, { "epoch": 0.29464924601807435, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3782831.6, "logits/rejected": -15297038.666666666, "logps/chosen": -369.2790771484375, "logps/rejected": -221.00907389322916, "loss": 0.354, "rewards/chosen": 0.387424373626709, "rewards/margins": 1.832303269704183, "rewards/rejected": -1.4448788960774739, "step": 5559 }, { "epoch": 0.2947022500198765, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57633114.666666664, "logits/rejected": -32207420.8, "logps/chosen": -350.1272786458333, "logps/rejected": -354.39912109375, "loss": 0.1769, "rewards/chosen": 0.8390154043833414, "rewards/margins": 3.3813497702280677, "rewards/rejected": -2.5423343658447264, "step": 5560 }, { "epoch": 0.2947552540216786, "grad_norm": 30.875, "kl": 0.702244758605957, "learning_rate": 5e-07, "logits/chosen": 4835574.0, "logits/rejected": -11432211.0, "logps/chosen": -113.19184875488281, "logps/rejected": -164.43618774414062, "loss": 0.2246, "rewards/chosen": 0.791888415813446, "rewards/margins": 3.108829438686371, "rewards/rejected": -2.316941022872925, "step": 5561 }, { "epoch": 0.29480825802348076, "grad_norm": 49.5, "kl": 0.16353988647460938, "learning_rate": 5e-07, "logits/chosen": -24785038.4, "logits/rejected": -25362000.0, "logps/chosen": -271.439453125, "logps/rejected": -420.2119954427083, "loss": 0.3448, "rewards/chosen": -0.034063863754272464, "rewards/margins": 3.2061838626861574, "rewards/rejected": -3.2402477264404297, "step": 5562 }, { "epoch": 0.2948612620252829, "grad_norm": 64.0, "kl": 1.1026382446289062, "learning_rate": 5e-07, "logits/chosen": 878993.9166666666, "logits/rejected": -33989033.6, "logps/chosen": -390.7684733072917, "logps/rejected": -366.210302734375, "loss": 0.299, "rewards/chosen": 0.6604512532552084, "rewards/margins": 2.1405794461568197, "rewards/rejected": -1.4801281929016112, "step": 5563 }, { "epoch": 0.29491426602708504, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55608188.0, "logits/rejected": -15292310.666666666, "logps/chosen": -310.66229248046875, "logps/rejected": -174.9448445638021, "loss": 0.2622, "rewards/chosen": 0.40620920062065125, "rewards/margins": 2.092314610878627, "rewards/rejected": -1.6861054102579753, "step": 5564 }, { "epoch": 0.2949672700288872, "grad_norm": 56.5, "kl": 0.09729385375976562, "learning_rate": 5e-07, "logits/chosen": -43831602.666666664, "logits/rejected": -31190426.0, "logps/chosen": -231.42626953125, "logps/rejected": -455.88775634765625, "loss": 0.3429, "rewards/chosen": 0.28320956230163574, "rewards/margins": 2.9991466999053955, "rewards/rejected": -2.7159371376037598, "step": 5565 }, { "epoch": 0.2950202740306893, "grad_norm": 53.0, "kl": 0.00017547607421875, "learning_rate": 5e-07, "logits/chosen": -24319280.0, "logits/rejected": -30721957.333333332, "logps/chosen": -279.32919921875, "logps/rejected": -377.019775390625, "loss": 0.401, "rewards/chosen": -0.1787794589996338, "rewards/margins": 1.6272182305653888, "rewards/rejected": -1.8059976895650227, "step": 5566 }, { "epoch": 0.29507327803249145, "grad_norm": 57.0, "kl": 0.64501953125, "learning_rate": 5e-07, "logits/chosen": -28044996.0, "logits/rejected": -63324448.0, "logps/chosen": -383.25445556640625, "logps/rejected": -332.4551086425781, "loss": 0.2302, "rewards/chosen": 0.9169219732284546, "rewards/margins": 2.821227192878723, "rewards/rejected": -1.9043052196502686, "step": 5567 }, { "epoch": 0.2951262820342936, "grad_norm": 47.25, "kl": 0.78985595703125, "learning_rate": 5e-07, "logits/chosen": -27347397.333333332, "logits/rejected": -38040579.2, "logps/chosen": -258.6655680338542, "logps/rejected": -253.7087890625, "loss": 0.2845, "rewards/chosen": 0.5111434062321981, "rewards/margins": 1.9650658686955769, "rewards/rejected": -1.4539224624633789, "step": 5568 }, { "epoch": 0.2951792860360957, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3039236.3333333335, "logits/rejected": -35286388.0, "logps/chosen": -117.85445149739583, "logps/rejected": -358.0316467285156, "loss": 0.3956, "rewards/chosen": 0.07890010873476665, "rewards/margins": 2.357467899719874, "rewards/rejected": -2.2785677909851074, "step": 5569 }, { "epoch": 0.29523229003789786, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62449968.0, "logits/rejected": -48802075.428571425, "logps/chosen": -533.078857421875, "logps/rejected": -321.63682338169644, "loss": 0.1849, "rewards/chosen": 1.278649926185608, "rewards/margins": 3.2435538939067294, "rewards/rejected": -1.9649039677211217, "step": 5570 }, { "epoch": 0.2952852940397, "grad_norm": 55.75, "kl": 0.7141532897949219, "learning_rate": 5e-07, "logits/chosen": -47107224.0, "logits/rejected": -5713459.0, "logps/chosen": -271.7451477050781, "logps/rejected": -256.67559814453125, "loss": 0.2518, "rewards/chosen": 0.8302645683288574, "rewards/margins": 2.473048686981201, "rewards/rejected": -1.6427841186523438, "step": 5571 }, { "epoch": 0.29533829804150213, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57686544.0, "logits/rejected": -46128992.0, "logps/chosen": -300.6488342285156, "logps/rejected": -595.8724365234375, "loss": 0.2322, "rewards/chosen": -0.10449638962745667, "rewards/margins": 2.089623381694158, "rewards/rejected": -2.1941197713216147, "step": 5572 }, { "epoch": 0.29539130204330427, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11056417.6, "logits/rejected": 8663865.333333334, "logps/chosen": -159.9933837890625, "logps/rejected": -366.0276692708333, "loss": 0.4365, "rewards/chosen": -0.30502212047576904, "rewards/margins": 1.0519847472508748, "rewards/rejected": -1.3570068677266438, "step": 5573 }, { "epoch": 0.2954443060451064, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1564288.125, "logits/rejected": -3636467.3333333335, "logps/chosen": -50.44990158081055, "logps/rejected": -332.9873046875, "loss": 0.2286, "rewards/chosen": 0.39365845918655396, "rewards/margins": 2.4480751156806946, "rewards/rejected": -2.0544166564941406, "step": 5574 }, { "epoch": 0.29549731004690855, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5794985.333333333, "logits/rejected": -2050282.6, "logps/chosen": -48.67748514811198, "logps/rejected": -153.8711181640625, "loss": 0.24, "rewards/chosen": 0.5821129083633423, "rewards/margins": 2.5572994470596315, "rewards/rejected": -1.975186538696289, "step": 5575 }, { "epoch": 0.2955503140487107, "grad_norm": 70.0, "kl": 0.5977897644042969, "learning_rate": 5e-07, "logits/chosen": 3565874.6666666665, "logits/rejected": 16644731.2, "logps/chosen": -446.313720703125, "logps/rejected": -282.14501953125, "loss": 0.3009, "rewards/chosen": 1.0670939286549885, "rewards/margins": 1.9695422013600665, "rewards/rejected": -0.9024482727050781, "step": 5576 }, { "epoch": 0.2956033180505128, "grad_norm": 53.25, "kl": 0.2906665802001953, "learning_rate": 5e-07, "logits/chosen": -64743836.0, "logits/rejected": -20343562.0, "logps/chosen": -379.1737060546875, "logps/rejected": -284.61480712890625, "loss": 0.2121, "rewards/chosen": 0.6892944574356079, "rewards/margins": 3.3474518060684204, "rewards/rejected": -2.6581573486328125, "step": 5577 }, { "epoch": 0.29565632205231496, "grad_norm": 50.0, "kl": 0.13759994506835938, "learning_rate": 5e-07, "logits/chosen": -30717840.0, "logits/rejected": -24437862.0, "logps/chosen": -299.6514078776042, "logps/rejected": -496.5205078125, "loss": 0.4097, "rewards/chosen": 0.1762178341547648, "rewards/margins": 2.330752889315287, "rewards/rejected": -2.1545350551605225, "step": 5578 }, { "epoch": 0.2957093260541171, "grad_norm": 72.5, "kl": 0.7731227874755859, "learning_rate": 5e-07, "logits/chosen": -38949641.14285714, "logits/rejected": 472093120.0, "logps/chosen": -327.20992606026783, "logps/rejected": -672.3033447265625, "loss": 0.4367, "rewards/chosen": 0.22076972893306188, "rewards/margins": 1.505071486745562, "rewards/rejected": -1.2843017578125, "step": 5579 }, { "epoch": 0.29576233005591923, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7192373.333333333, "logits/rejected": -3693124.0, "logps/chosen": -147.37385050455728, "logps/rejected": -72.915283203125, "loss": 0.422, "rewards/chosen": 0.09470959504445393, "rewards/margins": 1.4383577903111775, "rewards/rejected": -1.3436481952667236, "step": 5580 }, { "epoch": 0.29581533405772137, "grad_norm": 45.25, "kl": 1.4780197143554688, "learning_rate": 5e-07, "logits/chosen": -33357938.0, "logits/rejected": -14027635.0, "logps/chosen": -585.0317993164062, "logps/rejected": -197.31039428710938, "loss": 0.326, "rewards/chosen": 0.7610074281692505, "rewards/margins": 1.887011170387268, "rewards/rejected": -1.1260037422180176, "step": 5581 }, { "epoch": 0.2958683380595235, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46338768.0, "logits/rejected": -45173317.333333336, "logps/chosen": -262.24033203125, "logps/rejected": -406.1527913411458, "loss": 0.4143, "rewards/chosen": -0.5076390266418457, "rewards/margins": 2.397572104136149, "rewards/rejected": -2.9052111307779946, "step": 5582 }, { "epoch": 0.29592134206132564, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 625746.7, "logits/rejected": -53595882.666666664, "logps/chosen": -136.42064208984374, "logps/rejected": -578.8059895833334, "loss": 0.3307, "rewards/chosen": 0.20406057834625244, "rewards/margins": 2.2877537647883095, "rewards/rejected": -2.083693186442057, "step": 5583 }, { "epoch": 0.2959743460631278, "grad_norm": 82.5, "kl": 0.5997581481933594, "learning_rate": 5e-07, "logits/chosen": -36331372.8, "logits/rejected": -20791932.0, "logps/chosen": -425.95322265625, "logps/rejected": -480.05712890625, "loss": 0.2868, "rewards/chosen": 0.6300888061523438, "rewards/margins": 3.285019874572754, "rewards/rejected": -2.65493106842041, "step": 5584 }, { "epoch": 0.2960273500649299, "grad_norm": 65.0, "kl": 2.0863265991210938, "learning_rate": 5e-07, "logits/chosen": -45437616.0, "logits/rejected": -7344078.666666667, "logps/chosen": -449.841552734375, "logps/rejected": -150.93414306640625, "loss": 0.3739, "rewards/chosen": 0.45256776809692384, "rewards/margins": 1.8321075757344563, "rewards/rejected": -1.3795398076375325, "step": 5585 }, { "epoch": 0.29608035406673205, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6795246.666666667, "logits/rejected": -17925616.0, "logps/chosen": -281.4453531901042, "logps/rejected": -173.37353515625, "loss": 0.3156, "rewards/chosen": 0.11932958165804546, "rewards/margins": 1.448318992058436, "rewards/rejected": -1.3289894104003905, "step": 5586 }, { "epoch": 0.2961333580685342, "grad_norm": 29.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7008393.5, "logits/rejected": 4809628.666666667, "logps/chosen": -29.96573829650879, "logps/rejected": -347.9335530598958, "loss": 0.2162, "rewards/chosen": 0.2325187623500824, "rewards/margins": 2.9172591467698417, "rewards/rejected": -2.6847403844197593, "step": 5587 }, { "epoch": 0.29618636207033633, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24138634.666666668, "logits/rejected": 348516.125, "logps/chosen": -198.4254150390625, "logps/rejected": -197.74755859375, "loss": 0.4271, "rewards/chosen": -0.0646431843439738, "rewards/margins": 1.5088672240575154, "rewards/rejected": -1.5735104084014893, "step": 5588 }, { "epoch": 0.29623936607213847, "grad_norm": 47.5, "kl": 0.8154830932617188, "learning_rate": 5e-07, "logits/chosen": -33081565.333333332, "logits/rejected": -1447889.7, "logps/chosen": -492.0182291666667, "logps/rejected": -393.0537109375, "loss": 0.2439, "rewards/chosen": 0.3367764949798584, "rewards/margins": 2.5982484340667726, "rewards/rejected": -2.2614719390869142, "step": 5589 }, { "epoch": 0.2962923700739406, "grad_norm": 66.5, "kl": 3.2046852111816406, "learning_rate": 5e-07, "logits/chosen": -34703648.0, "logits/rejected": 53708100.0, "logps/chosen": -547.6975708007812, "logps/rejected": -343.99261474609375, "loss": 0.3121, "rewards/chosen": 0.8263611197471619, "rewards/margins": 2.6211602091789246, "rewards/rejected": -1.7947990894317627, "step": 5590 }, { "epoch": 0.29634537407574274, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36077128.0, "logits/rejected": -44575656.0, "logps/chosen": -244.08499145507812, "logps/rejected": -552.5062255859375, "loss": 0.2829, "rewards/chosen": -0.03739318996667862, "rewards/margins": 3.246546171605587, "rewards/rejected": -3.2839393615722656, "step": 5591 }, { "epoch": 0.2963983780775449, "grad_norm": 55.75, "kl": 1.0261154174804688, "learning_rate": 5e-07, "logits/chosen": -50237264.0, "logits/rejected": -72749776.0, "logps/chosen": -427.1119384765625, "logps/rejected": -578.8331298828125, "loss": 0.3219, "rewards/chosen": 0.008853903971612453, "rewards/margins": 2.913832894526422, "rewards/rejected": -2.9049789905548096, "step": 5592 }, { "epoch": 0.296451382079347, "grad_norm": 51.25, "kl": 1.152191162109375, "learning_rate": 5e-07, "logits/chosen": -1520265.3333333333, "logits/rejected": -32584524.0, "logps/chosen": -225.5444132486979, "logps/rejected": -1064.3363037109375, "loss": 0.3488, "rewards/chosen": 0.4986472924550374, "rewards/margins": 2.8388007481892905, "rewards/rejected": -2.340153455734253, "step": 5593 }, { "epoch": 0.29650438608114915, "grad_norm": 60.75, "kl": 0.11751937866210938, "learning_rate": 5e-07, "logits/chosen": -23444648.0, "logits/rejected": -61066762.666666664, "logps/chosen": -409.651953125, "logps/rejected": -750.4747721354166, "loss": 0.2449, "rewards/chosen": 0.6746435642242432, "rewards/margins": 4.0403198083241785, "rewards/rejected": -3.365676244099935, "step": 5594 }, { "epoch": 0.2965573900829513, "grad_norm": 48.5, "kl": 0.12456703186035156, "learning_rate": 5e-07, "logits/chosen": -34476436.0, "logits/rejected": -20810512.0, "logps/chosen": -106.09114837646484, "logps/rejected": -325.8592529296875, "loss": 0.3466, "rewards/chosen": -0.1751834899187088, "rewards/margins": 1.9506905525922775, "rewards/rejected": -2.1258740425109863, "step": 5595 }, { "epoch": 0.2966103940847534, "grad_norm": 68.5, "kl": 2.635270118713379, "learning_rate": 5e-07, "logits/chosen": 23003229.333333332, "logits/rejected": -10199322.0, "logps/chosen": -711.9820149739584, "logps/rejected": -220.0511474609375, "loss": 0.3253, "rewards/chosen": 1.1278785864512126, "rewards/margins": 2.2293793360392256, "rewards/rejected": -1.1015007495880127, "step": 5596 }, { "epoch": 0.2966633980865555, "grad_norm": 43.5, "kl": 1.5142154693603516, "learning_rate": 5e-07, "logits/chosen": -37599018.666666664, "logits/rejected": -43584806.4, "logps/chosen": -280.14060465494794, "logps/rejected": -426.746337890625, "loss": 0.294, "rewards/chosen": 0.1263206402460734, "rewards/margins": 2.485353461901347, "rewards/rejected": -2.3590328216552736, "step": 5597 }, { "epoch": 0.29671640208835764, "grad_norm": 50.0, "kl": 0.3775482177734375, "learning_rate": 5e-07, "logits/chosen": -10117775.333333334, "logits/rejected": -3345040.8, "logps/chosen": -349.7488199869792, "logps/rejected": -348.2203857421875, "loss": 0.2412, "rewards/chosen": 0.858863115310669, "rewards/margins": 2.5907427310943603, "rewards/rejected": -1.7318796157836913, "step": 5598 }, { "epoch": 0.2967694060901598, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20717632.0, "logits/rejected": -9176721.333333334, "logps/chosen": -367.5531005859375, "logps/rejected": -394.8428548177083, "loss": 0.2322, "rewards/chosen": 0.18837282061576843, "rewards/margins": 2.211677441994349, "rewards/rejected": -2.0233046213785806, "step": 5599 }, { "epoch": 0.2968224100919619, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37384829.333333336, "logits/rejected": -35137187.2, "logps/chosen": -324.0506184895833, "logps/rejected": -245.85234375, "loss": 0.2501, "rewards/chosen": 0.10017040371894836, "rewards/margins": 2.200722199678421, "rewards/rejected": -2.100551795959473, "step": 5600 }, { "epoch": 0.29687541409376406, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33608336.0, "logits/rejected": -15764760.0, "logps/chosen": -400.4190673828125, "logps/rejected": -287.51165771484375, "loss": 0.3598, "rewards/chosen": 0.22582589089870453, "rewards/margins": 1.3615110963582993, "rewards/rejected": -1.1356852054595947, "step": 5601 }, { "epoch": 0.2969284180955662, "grad_norm": 52.25, "kl": 0.25249195098876953, "learning_rate": 5e-07, "logits/chosen": -22272510.0, "logits/rejected": -18208288.0, "logps/chosen": -168.84169006347656, "logps/rejected": -220.82669067382812, "loss": 0.4411, "rewards/chosen": -0.336347758769989, "rewards/margins": 0.5417730212211609, "rewards/rejected": -0.8781207799911499, "step": 5602 }, { "epoch": 0.29698142209736833, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33641572.0, "logits/rejected": -16195353.333333334, "logps/chosen": -286.2460632324219, "logps/rejected": -332.21435546875, "loss": 0.2433, "rewards/chosen": 0.5209663510322571, "rewards/margins": 2.2529816428820295, "rewards/rejected": -1.7320152918497722, "step": 5603 }, { "epoch": 0.29703442609917047, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25591045.333333332, "logits/rejected": -24888988.0, "logps/chosen": -239.5278523763021, "logps/rejected": -390.81427001953125, "loss": 0.3819, "rewards/chosen": 0.16709665457407633, "rewards/margins": 2.0076897541681924, "rewards/rejected": -1.8405930995941162, "step": 5604 }, { "epoch": 0.2970874301009726, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26371869.333333332, "logits/rejected": 4372011.2, "logps/chosen": -332.6167805989583, "logps/rejected": -237.8317626953125, "loss": 0.2727, "rewards/chosen": 0.5318837563196818, "rewards/margins": 2.046515885988871, "rewards/rejected": -1.5146321296691894, "step": 5605 }, { "epoch": 0.29714043410277474, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29660334.0, "logits/rejected": -23778488.0, "logps/chosen": -169.15098571777344, "logps/rejected": -304.4516296386719, "loss": 0.3975, "rewards/chosen": -0.4792603552341461, "rewards/margins": 1.1041320264339447, "rewards/rejected": -1.5833923816680908, "step": 5606 }, { "epoch": 0.2971934381045769, "grad_norm": 61.0, "kl": 0.13489151000976562, "learning_rate": 5e-07, "logits/chosen": -30423846.4, "logits/rejected": -45346048.0, "logps/chosen": -273.0087646484375, "logps/rejected": -376.3158365885417, "loss": 0.3959, "rewards/chosen": -0.04736037254333496, "rewards/margins": 1.3867257912953694, "rewards/rejected": -1.4340861638387044, "step": 5607 }, { "epoch": 0.297246442106379, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26481152.0, "logits/rejected": -6386590.666666667, "logps/chosen": -245.630517578125, "logps/rejected": -319.4015706380208, "loss": 0.3683, "rewards/chosen": 0.05013386607170105, "rewards/margins": 1.7390333831310272, "rewards/rejected": -1.6888995170593262, "step": 5608 }, { "epoch": 0.29729944610818115, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49141589.333333336, "logits/rejected": -54080659.2, "logps/chosen": -310.26959228515625, "logps/rejected": -449.249072265625, "loss": 0.2943, "rewards/chosen": 0.09721310933430989, "rewards/margins": 1.9543526967366536, "rewards/rejected": -1.8571395874023438, "step": 5609 }, { "epoch": 0.2973524501099833, "grad_norm": 48.5, "kl": 1.0925140380859375, "learning_rate": 5e-07, "logits/chosen": -45250128.0, "logits/rejected": -26568924.8, "logps/chosen": -490.1636149088542, "logps/rejected": -225.895751953125, "loss": 0.2498, "rewards/chosen": 0.9960145155588785, "rewards/margins": 2.530894390741984, "rewards/rejected": -1.5348798751831054, "step": 5610 }, { "epoch": 0.29740545411178543, "grad_norm": 43.75, "kl": 0.5290985107421875, "learning_rate": 5e-07, "logits/chosen": -17610064.0, "logits/rejected": -61250890.666666664, "logps/chosen": -186.9088623046875, "logps/rejected": -222.920654296875, "loss": 0.3091, "rewards/chosen": 0.5953206062316895, "rewards/margins": 2.3786251386006674, "rewards/rejected": -1.7833045323689778, "step": 5611 }, { "epoch": 0.29745845811358756, "grad_norm": 63.0, "kl": 2.6088199615478516, "learning_rate": 5e-07, "logits/chosen": -24093160.0, "logits/rejected": -2482898.5, "logps/chosen": -263.74037679036456, "logps/rejected": -166.69635009765625, "loss": 0.4572, "rewards/chosen": 0.2530288298924764, "rewards/margins": 1.4898051818211873, "rewards/rejected": -1.236776351928711, "step": 5612 }, { "epoch": 0.2975114621153897, "grad_norm": 55.0, "kl": 0.19998455047607422, "learning_rate": 5e-07, "logits/chosen": -19710630.4, "logits/rejected": -46969584.0, "logps/chosen": -173.24420166015625, "logps/rejected": -216.43965657552084, "loss": 0.4016, "rewards/chosen": -0.14299522638320922, "rewards/margins": 1.6643407861391704, "rewards/rejected": -1.8073360125223796, "step": 5613 }, { "epoch": 0.29756446611719184, "grad_norm": 48.25, "kl": 0.29727935791015625, "learning_rate": 5e-07, "logits/chosen": -26124086.4, "logits/rejected": -21770512.0, "logps/chosen": -310.018359375, "logps/rejected": -248.989501953125, "loss": 0.3, "rewards/chosen": 0.38054139614105226, "rewards/margins": 2.6475702047348024, "rewards/rejected": -2.26702880859375, "step": 5614 }, { "epoch": 0.297617470118994, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38930714.666666664, "logits/rejected": -24011470.4, "logps/chosen": -429.5755208333333, "logps/rejected": -544.90634765625, "loss": 0.1735, "rewards/chosen": 0.7203328609466553, "rewards/margins": 3.6792184352874755, "rewards/rejected": -2.9588855743408202, "step": 5615 }, { "epoch": 0.2976704741207961, "grad_norm": 43.5, "kl": 0.039684295654296875, "learning_rate": 5e-07, "logits/chosen": 13279813.333333334, "logits/rejected": -47502723.2, "logps/chosen": -294.0124104817708, "logps/rejected": -384.3580078125, "loss": 0.2425, "rewards/chosen": 0.1479181945323944, "rewards/margins": 2.803953045606613, "rewards/rejected": -2.6560348510742187, "step": 5616 }, { "epoch": 0.29772347812259825, "grad_norm": 45.75, "kl": 1.4658279418945312, "learning_rate": 5e-07, "logits/chosen": 10421942.0, "logits/rejected": -34623904.0, "logps/chosen": -317.6184997558594, "logps/rejected": -340.799072265625, "loss": 0.2967, "rewards/chosen": 0.2535161077976227, "rewards/margins": 2.7697209417819977, "rewards/rejected": -2.516204833984375, "step": 5617 }, { "epoch": 0.2977764821244004, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1884623.25, "logits/rejected": -23253781.333333332, "logps/chosen": -51.40128707885742, "logps/rejected": -201.95719401041666, "loss": 0.2723, "rewards/chosen": -0.23304256796836853, "rewards/margins": 1.500783274571101, "rewards/rejected": -1.7338258425394695, "step": 5618 }, { "epoch": 0.2978294861262025, "grad_norm": 29.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1411387.0, "logits/rejected": -2599390.4, "logps/chosen": -23.342816670735676, "logps/rejected": -144.23692626953124, "loss": 0.3344, "rewards/chosen": -0.049251233537991844, "rewards/margins": 1.3501634011665982, "rewards/rejected": -1.39941463470459, "step": 5619 }, { "epoch": 0.29788249012800466, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7183093.0, "logits/rejected": -50057818.666666664, "logps/chosen": -155.08966064453125, "logps/rejected": -322.0216471354167, "loss": 0.2288, "rewards/chosen": 0.3809501528739929, "rewards/margins": 2.195920805136363, "rewards/rejected": -1.8149706522623699, "step": 5620 }, { "epoch": 0.2979354941298068, "grad_norm": 56.75, "kl": 1.861541748046875, "learning_rate": 5e-07, "logits/chosen": -52044940.8, "logits/rejected": -14678800.0, "logps/chosen": -298.23603515625, "logps/rejected": -303.51572672526044, "loss": 0.4757, "rewards/chosen": 0.027563035488128662, "rewards/margins": 0.6619721452395121, "rewards/rejected": -0.6344091097513834, "step": 5621 }, { "epoch": 0.29798849813160894, "grad_norm": 33.0, "kl": 0.5467715263366699, "learning_rate": 5e-07, "logits/chosen": 687089.25, "logits/rejected": 1582206.75, "logps/chosen": -85.46012369791667, "logps/rejected": -39.78768539428711, "loss": 0.4393, "rewards/chosen": 0.060489535331726074, "rewards/margins": 1.3651825189590454, "rewards/rejected": -1.3046929836273193, "step": 5622 }, { "epoch": 0.2980415021334111, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26174000.0, "logits/rejected": -25314504.0, "logps/chosen": -213.1036376953125, "logps/rejected": -364.1312561035156, "loss": 0.2726, "rewards/chosen": 0.1444132775068283, "rewards/margins": 2.818120762705803, "rewards/rejected": -2.6737074851989746, "step": 5623 }, { "epoch": 0.2980945061352132, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4251135.333333333, "logits/rejected": -903901.8, "logps/chosen": -135.29634602864584, "logps/rejected": -87.51244506835937, "loss": 0.309, "rewards/chosen": -0.01337812344233195, "rewards/margins": 1.592146893342336, "rewards/rejected": -1.605525016784668, "step": 5624 }, { "epoch": 0.29814751013701535, "grad_norm": 42.0, "kl": 0.5466575622558594, "learning_rate": 5e-07, "logits/chosen": -130900309.33333333, "logits/rejected": -16516348.8, "logps/chosen": -238.87032063802084, "logps/rejected": -330.584716796875, "loss": 0.3168, "rewards/chosen": -0.10861358046531677, "rewards/margins": 2.289191061258316, "rewards/rejected": -2.3978046417236327, "step": 5625 }, { "epoch": 0.2982005141388175, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1960644.0, "logits/rejected": -2654521.3333333335, "logps/chosen": -307.4754638671875, "logps/rejected": -218.7808837890625, "loss": 0.2296, "rewards/chosen": -0.2705429196357727, "rewards/margins": 2.059145470460256, "rewards/rejected": -2.329688390096029, "step": 5626 }, { "epoch": 0.2982535181406196, "grad_norm": 47.25, "kl": 1.8918914794921875, "learning_rate": 5e-07, "logits/chosen": 2746771.4, "logits/rejected": -48239456.0, "logps/chosen": -645.339111328125, "logps/rejected": -274.6547444661458, "loss": 0.2706, "rewards/chosen": 0.9620254516601563, "rewards/margins": 2.3711766878763836, "rewards/rejected": -1.4091512362162273, "step": 5627 }, { "epoch": 0.29830652214242176, "grad_norm": 60.5, "kl": 0.6288585662841797, "learning_rate": 5e-07, "logits/chosen": -15605174.0, "logits/rejected": -12108324.0, "logps/chosen": -303.9205017089844, "logps/rejected": -276.6650390625, "loss": 0.254, "rewards/chosen": 0.8527384400367737, "rewards/margins": 2.4850574135780334, "rewards/rejected": -1.6323189735412598, "step": 5628 }, { "epoch": 0.2983595261442239, "grad_norm": 57.5, "kl": 0.5153217315673828, "learning_rate": 5e-07, "logits/chosen": -25449240.0, "logits/rejected": -12485164.0, "logps/chosen": -236.45048828125, "logps/rejected": -211.577880859375, "loss": 0.4335, "rewards/chosen": -0.2574573516845703, "rewards/margins": 1.1737517356872558, "rewards/rejected": -1.4312090873718262, "step": 5629 }, { "epoch": 0.29841253014602603, "grad_norm": 35.0, "kl": 0.3337240219116211, "learning_rate": 5e-07, "logits/chosen": -21884425.333333332, "logits/rejected": -31090476.8, "logps/chosen": -266.97235107421875, "logps/rejected": -353.1728271484375, "loss": 0.2495, "rewards/chosen": 0.22862754265467325, "rewards/margins": 2.5101903478304544, "rewards/rejected": -2.2815628051757812, "step": 5630 }, { "epoch": 0.29846553414782817, "grad_norm": 51.25, "kl": 0.2488250732421875, "learning_rate": 5e-07, "logits/chosen": -36999194.666666664, "logits/rejected": -2444878.25, "logps/chosen": -227.30155436197916, "logps/rejected": -46.157283782958984, "loss": 0.3703, "rewards/chosen": 0.413989782333374, "rewards/margins": 1.5058826208114624, "rewards/rejected": -1.0918928384780884, "step": 5631 }, { "epoch": 0.2985185381496303, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54334896.0, "logits/rejected": 313171.0, "logps/chosen": -252.08694458007812, "logps/rejected": -114.21233367919922, "loss": 0.3465, "rewards/chosen": 0.1327352523803711, "rewards/margins": 1.7707163095474243, "rewards/rejected": -1.6379810571670532, "step": 5632 }, { "epoch": 0.29857154215143245, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13354733.0, "logits/rejected": -7537768.0, "logps/chosen": -116.05056762695312, "logps/rejected": -105.70901489257812, "loss": 0.3605, "rewards/chosen": 0.1889929175376892, "rewards/margins": 1.2088258862495422, "rewards/rejected": -1.019832968711853, "step": 5633 }, { "epoch": 0.2986245461532346, "grad_norm": 53.75, "kl": 0.6794042587280273, "learning_rate": 5e-07, "logits/chosen": -25485306.666666668, "logits/rejected": -76782008.0, "logps/chosen": -307.18975830078125, "logps/rejected": -474.22686767578125, "loss": 0.3901, "rewards/chosen": 0.23577791452407837, "rewards/margins": 2.628235876560211, "rewards/rejected": -2.392457962036133, "step": 5634 }, { "epoch": 0.2986775501550367, "grad_norm": 50.75, "kl": 1.8980789184570312, "learning_rate": 5e-07, "logits/chosen": -29789572.0, "logits/rejected": -6367363.0, "logps/chosen": -594.0005493164062, "logps/rejected": -227.2809295654297, "loss": 0.2941, "rewards/chosen": 0.9589533805847168, "rewards/margins": 1.96974515914917, "rewards/rejected": -1.0107917785644531, "step": 5635 }, { "epoch": 0.29873055415683886, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9179127.333333334, "logits/rejected": -24258131.2, "logps/chosen": -262.83909098307294, "logps/rejected": -181.845947265625, "loss": 0.2463, "rewards/chosen": 0.39169029394785565, "rewards/margins": 2.26191353003184, "rewards/rejected": -1.8702232360839843, "step": 5636 }, { "epoch": 0.298783558158641, "grad_norm": 44.25, "kl": 0.29888153076171875, "learning_rate": 5e-07, "logits/chosen": -3531856.0, "logits/rejected": -23615532.8, "logps/chosen": -349.01171875, "logps/rejected": -79.620458984375, "loss": 0.2618, "rewards/chosen": 0.8583524227142334, "rewards/margins": 2.2820849895477293, "rewards/rejected": -1.4237325668334961, "step": 5637 }, { "epoch": 0.29883656216044313, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56984138.666666664, "logits/rejected": -6032300.0, "logps/chosen": -359.6690673828125, "logps/rejected": -411.20712890625, "loss": 0.263, "rewards/chosen": 0.5162556966145834, "rewards/margins": 1.996092732747396, "rewards/rejected": -1.4798370361328126, "step": 5638 }, { "epoch": 0.29888956616224527, "grad_norm": 69.0, "kl": 2.5121994018554688, "learning_rate": 5e-07, "logits/chosen": -35261644.0, "logits/rejected": -9649928.0, "logps/chosen": -439.07000732421875, "logps/rejected": -264.2270812988281, "loss": 0.3361, "rewards/chosen": 0.6911012530326843, "rewards/margins": 1.8666231036186218, "rewards/rejected": -1.1755218505859375, "step": 5639 }, { "epoch": 0.2989425701640474, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55735946.666666664, "logits/rejected": -23495062.4, "logps/chosen": -571.7626139322916, "logps/rejected": -350.018017578125, "loss": 0.2478, "rewards/chosen": 0.9475382169087728, "rewards/margins": 2.427868874867757, "rewards/rejected": -1.4803306579589843, "step": 5640 }, { "epoch": 0.29899557416584954, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45499136.0, "logits/rejected": -10310852.57142857, "logps/chosen": -685.6194458007812, "logps/rejected": -247.03618512834822, "loss": 0.2275, "rewards/chosen": 0.569549560546875, "rewards/margins": 2.284102439880371, "rewards/rejected": -1.714552879333496, "step": 5641 }, { "epoch": 0.2990485781676517, "grad_norm": 50.0, "kl": 0.0002231597900390625, "learning_rate": 5e-07, "logits/chosen": -14339166.666666666, "logits/rejected": -20068467.2, "logps/chosen": -181.50455729166666, "logps/rejected": -257.953125, "loss": 0.3217, "rewards/chosen": -0.3904833396275838, "rewards/margins": 1.693091146151225, "rewards/rejected": -2.0835744857788088, "step": 5642 }, { "epoch": 0.2991015821694538, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2247391.5, "logits/rejected": -65001152.0, "logps/chosen": -31.46830940246582, "logps/rejected": -363.682373046875, "loss": 0.19, "rewards/chosen": 0.2022031843662262, "rewards/margins": 2.532539346388408, "rewards/rejected": -2.330336162022182, "step": 5643 }, { "epoch": 0.29915458617125595, "grad_norm": 59.75, "kl": 1.2581558227539062, "learning_rate": 5e-07, "logits/chosen": -62258636.8, "logits/rejected": -19486481.333333332, "logps/chosen": -552.01982421875, "logps/rejected": -285.70945231119794, "loss": 0.3091, "rewards/chosen": 1.0566722869873046, "rewards/margins": 1.6196750720342, "rewards/rejected": -0.5630027850468954, "step": 5644 }, { "epoch": 0.2992075901730581, "grad_norm": 62.25, "kl": 0.6219940185546875, "learning_rate": 5e-07, "logits/chosen": -37114668.8, "logits/rejected": -20948662.666666668, "logps/chosen": -385.9211181640625, "logps/rejected": -345.8553466796875, "loss": 0.3095, "rewards/chosen": 0.5564675807952881, "rewards/margins": 2.1719548384348553, "rewards/rejected": -1.6154872576395671, "step": 5645 }, { "epoch": 0.29926059417486023, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6961332.5, "logits/rejected": -28212732.0, "logps/chosen": -246.18069458007812, "logps/rejected": -399.84326171875, "loss": 0.2593, "rewards/chosen": 0.4619814157485962, "rewards/margins": 2.501134753227234, "rewards/rejected": -2.0391533374786377, "step": 5646 }, { "epoch": 0.29931359817666237, "grad_norm": 55.75, "kl": 1.3035850524902344, "learning_rate": 5e-07, "logits/chosen": -14172629.333333334, "logits/rejected": -27731152.0, "logps/chosen": -231.11641438802084, "logps/rejected": -310.973388671875, "loss": 0.4148, "rewards/chosen": 0.3199321428934733, "rewards/margins": 1.506161133448283, "rewards/rejected": -1.1862289905548096, "step": 5647 }, { "epoch": 0.29936660217846445, "grad_norm": 43.0, "kl": 0.4587287902832031, "learning_rate": 5e-07, "logits/chosen": -4312079.666666667, "logits/rejected": 2227.9, "logps/chosen": -169.63521321614584, "logps/rejected": -258.8738525390625, "loss": 0.3148, "rewards/chosen": 0.7974299589792887, "rewards/margins": 2.259474770228068, "rewards/rejected": -1.4620448112487794, "step": 5648 }, { "epoch": 0.2994196061802666, "grad_norm": 58.5, "kl": 0.4669017791748047, "learning_rate": 5e-07, "logits/chosen": -20644750.0, "logits/rejected": -26058304.0, "logps/chosen": -366.0044860839844, "logps/rejected": -291.1095275878906, "loss": 0.2676, "rewards/chosen": 1.1591949462890625, "rewards/margins": 2.1282005310058594, "rewards/rejected": -0.9690055847167969, "step": 5649 }, { "epoch": 0.2994726101820687, "grad_norm": 80.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50597941.333333336, "logits/rejected": -12790978.4, "logps/chosen": -304.3034261067708, "logps/rejected": -542.2478515625, "loss": 0.189, "rewards/chosen": 1.2972254753112793, "rewards/margins": 3.2074976921081544, "rewards/rejected": -1.910272216796875, "step": 5650 }, { "epoch": 0.29952561418387086, "grad_norm": 44.25, "kl": 0.04015350341796875, "learning_rate": 5e-07, "logits/chosen": -63661708.0, "logits/rejected": -8385907.333333333, "logps/chosen": -1197.44140625, "logps/rejected": -264.71856689453125, "loss": 0.1491, "rewards/chosen": 1.7466217279434204, "rewards/margins": 3.7307107845942182, "rewards/rejected": -1.9840890566507976, "step": 5651 }, { "epoch": 0.299578618185673, "grad_norm": 36.75, "kl": 0.05313682556152344, "learning_rate": 5e-07, "logits/chosen": 1141569.0833333333, "logits/rejected": 13180639.0, "logps/chosen": -54.39178975423177, "logps/rejected": -312.7548828125, "loss": 0.4666, "rewards/chosen": -0.3710692326227824, "rewards/margins": 1.636556347211202, "rewards/rejected": -2.0076255798339844, "step": 5652 }, { "epoch": 0.29963162218747513, "grad_norm": 56.0, "kl": 1.6385269165039062, "learning_rate": 5e-07, "logits/chosen": -20613636.0, "logits/rejected": -33288692.0, "logps/chosen": -417.0406494140625, "logps/rejected": -252.73341369628906, "loss": 0.2816, "rewards/chosen": 0.5425155758857727, "rewards/margins": 2.8663564324378967, "rewards/rejected": -2.323840856552124, "step": 5653 }, { "epoch": 0.29968462618927727, "grad_norm": 86.5, "kl": 2.709197998046875, "learning_rate": 5e-07, "logits/chosen": -29897437.333333332, "logits/rejected": -2354765.5, "logps/chosen": -523.6329345703125, "logps/rejected": -137.83447265625, "loss": 0.3102, "rewards/chosen": 0.9531726042429606, "rewards/margins": 2.7017798821131387, "rewards/rejected": -1.7486072778701782, "step": 5654 }, { "epoch": 0.2997376301910794, "grad_norm": 39.75, "kl": 0.36181640625, "learning_rate": 5e-07, "logits/chosen": -36906972.0, "logits/rejected": -22374114.0, "logps/chosen": -373.5064697265625, "logps/rejected": -187.74325561523438, "loss": 0.2283, "rewards/chosen": 0.9860212802886963, "rewards/margins": 2.7409145832061768, "rewards/rejected": -1.7548933029174805, "step": 5655 }, { "epoch": 0.29979063419288154, "grad_norm": 58.25, "kl": 0.10419082641601562, "learning_rate": 5e-07, "logits/chosen": -54520211.2, "logits/rejected": 53744922.666666664, "logps/chosen": -290.2419921875, "logps/rejected": -213.2431437174479, "loss": 0.3885, "rewards/chosen": 0.3318170070648193, "rewards/margins": 1.135531759262085, "rewards/rejected": -0.8037147521972656, "step": 5656 }, { "epoch": 0.2998436381946837, "grad_norm": 45.0, "kl": 0.37361907958984375, "learning_rate": 5e-07, "logits/chosen": 3836714.0, "logits/rejected": -4925955.0, "logps/chosen": -309.84942626953125, "logps/rejected": -100.79269409179688, "loss": 0.222, "rewards/chosen": 0.8792959451675415, "rewards/margins": 3.0192633867263794, "rewards/rejected": -2.139967441558838, "step": 5657 }, { "epoch": 0.2998966421964858, "grad_norm": 61.75, "kl": 1.2223968505859375, "learning_rate": 5e-07, "logits/chosen": -32671480.0, "logits/rejected": -99646240.0, "logps/chosen": -324.2433776855469, "logps/rejected": -339.6761474609375, "loss": 0.3354, "rewards/chosen": 0.5533242225646973, "rewards/margins": 1.8185434341430664, "rewards/rejected": -1.2652192115783691, "step": 5658 }, { "epoch": 0.29994964619828796, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33426300.8, "logits/rejected": -2718392.1666666665, "logps/chosen": -353.3423095703125, "logps/rejected": -52.24542236328125, "loss": 0.3745, "rewards/chosen": 0.21745059490203858, "rewards/margins": 1.4459159930547079, "rewards/rejected": -1.2284653981526692, "step": 5659 }, { "epoch": 0.3000026502000901, "grad_norm": 51.5, "kl": 0.15954208374023438, "learning_rate": 5e-07, "logits/chosen": -17978586.666666668, "logits/rejected": -8370116.8, "logps/chosen": -515.062744140625, "logps/rejected": -163.584130859375, "loss": 0.1931, "rewards/chosen": 0.8187764485677084, "rewards/margins": 3.001201947530111, "rewards/rejected": -2.1824254989624023, "step": 5660 }, { "epoch": 0.30005565420189223, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3540012.6666666665, "logits/rejected": -15135107.2, "logps/chosen": -208.6508585611979, "logps/rejected": -104.1672119140625, "loss": 0.3937, "rewards/chosen": -0.44454185167948407, "rewards/margins": 0.8158826669057209, "rewards/rejected": -1.260424518585205, "step": 5661 }, { "epoch": 0.30010865820369437, "grad_norm": 56.5, "kl": 2.6402645111083984, "learning_rate": 5e-07, "logits/chosen": -58842293.333333336, "logits/rejected": -48635283.2, "logps/chosen": -1003.09423828125, "logps/rejected": -241.249462890625, "loss": 0.2261, "rewards/chosen": 1.2601075172424316, "rewards/margins": 2.819979667663574, "rewards/rejected": -1.5598721504211426, "step": 5662 }, { "epoch": 0.3001616622054965, "grad_norm": 53.25, "kl": 1.9183082580566406, "learning_rate": 5e-07, "logits/chosen": 1662082.0, "logits/rejected": -28167904.0, "logps/chosen": -511.5365397135417, "logps/rejected": -296.8088623046875, "loss": 0.3042, "rewards/chosen": 0.21787744760513306, "rewards/margins": 2.120942747592926, "rewards/rejected": -1.903065299987793, "step": 5663 }, { "epoch": 0.30021466620729864, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33416973.333333332, "logits/rejected": -29672710.4, "logps/chosen": -399.2176513671875, "logps/rejected": -297.50732421875, "loss": 0.2095, "rewards/chosen": 0.8341852823893229, "rewards/margins": 2.713776652018229, "rewards/rejected": -1.8795913696289062, "step": 5664 }, { "epoch": 0.3002676702091008, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12195580.8, "logits/rejected": -25537576.0, "logps/chosen": -175.69755859375, "logps/rejected": -441.622802734375, "loss": 0.3646, "rewards/chosen": 0.006522035598754883, "rewards/margins": 3.1315681298573814, "rewards/rejected": -3.1250460942586265, "step": 5665 }, { "epoch": 0.3003206742109029, "grad_norm": 56.25, "kl": 0.4298057556152344, "learning_rate": 5e-07, "logits/chosen": -5004015.2, "logits/rejected": -8794248.0, "logps/chosen": -380.37333984375, "logps/rejected": -100.88546752929688, "loss": 0.421, "rewards/chosen": 0.07143738269805908, "rewards/margins": 1.033071223894755, "rewards/rejected": -0.9616338411966959, "step": 5666 }, { "epoch": 0.30037367821270505, "grad_norm": 119.0, "kl": 0.0886077880859375, "learning_rate": 5e-07, "logits/chosen": 240130688.0, "logits/rejected": -9976109.0, "logps/chosen": -730.117919921875, "logps/rejected": -319.2275390625, "loss": 0.339, "rewards/chosen": 0.3812536895275116, "rewards/margins": 1.445027083158493, "rewards/rejected": -1.0637733936309814, "step": 5667 }, { "epoch": 0.3004266822145072, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3091991.3333333335, "logits/rejected": -20080336.0, "logps/chosen": -129.69288126627603, "logps/rejected": -374.7580810546875, "loss": 0.2495, "rewards/chosen": 0.3933435281117757, "rewards/margins": 2.4287294228871663, "rewards/rejected": -2.0353858947753904, "step": 5668 }, { "epoch": 0.3004796862163093, "grad_norm": 49.0, "kl": 0.6191825866699219, "learning_rate": 5e-07, "logits/chosen": -6366062.8, "logits/rejected": -114316394.66666667, "logps/chosen": -403.7392578125, "logps/rejected": -251.7401123046875, "loss": 0.2469, "rewards/chosen": 0.8254907608032227, "rewards/margins": 3.264886824289958, "rewards/rejected": -2.439396063486735, "step": 5669 }, { "epoch": 0.30053269021811146, "grad_norm": 60.0, "kl": 0.2268991470336914, "learning_rate": 5e-07, "logits/chosen": -11713769.0, "logps/chosen": -275.6864013671875, "loss": 0.4772, "rewards/chosen": 0.100518137216568, "step": 5670 }, { "epoch": 0.3005856942199136, "grad_norm": 64.5, "kl": 1.0073585510253906, "learning_rate": 5e-07, "logits/chosen": 12303470.4, "logits/rejected": -28782328.0, "logps/chosen": -269.0964111328125, "logps/rejected": -349.4330647786458, "loss": 0.4175, "rewards/chosen": 0.019344520568847657, "rewards/margins": 1.330047082901001, "rewards/rejected": -1.3107025623321533, "step": 5671 }, { "epoch": 0.30063869822171574, "grad_norm": 48.75, "kl": 0.20667076110839844, "learning_rate": 5e-07, "logits/chosen": -32775834.666666668, "logits/rejected": -28594054.4, "logps/chosen": -890.909423828125, "logps/rejected": -255.0254150390625, "loss": 0.1908, "rewards/chosen": 1.377220630645752, "rewards/margins": 3.4243264198303223, "rewards/rejected": -2.0471057891845703, "step": 5672 }, { "epoch": 0.3006917022235179, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 404177.0, "logits/rejected": -34457620.0, "logps/chosen": -68.3676986694336, "logps/rejected": -356.73199462890625, "loss": 0.3812, "rewards/chosen": -0.14916343986988068, "rewards/margins": 1.8205091208219528, "rewards/rejected": -1.9696725606918335, "step": 5673 }, { "epoch": 0.30074470622532, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15763878.0, "logits/rejected": -44856490.666666664, "logps/chosen": -418.02642822265625, "logps/rejected": -491.3323160807292, "loss": 0.2478, "rewards/chosen": 0.04792022705078125, "rewards/margins": 2.474360783894857, "rewards/rejected": -2.4264405568440757, "step": 5674 }, { "epoch": 0.30079771022712215, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45814789.333333336, "logits/rejected": -24532953.6, "logps/chosen": -247.64680989583334, "logps/rejected": -212.06171875, "loss": 0.2584, "rewards/chosen": 0.9404748280843099, "rewards/margins": 2.097283204396566, "rewards/rejected": -1.1568083763122559, "step": 5675 }, { "epoch": 0.3008507142289243, "grad_norm": 34.25, "kl": 0.20960044860839844, "learning_rate": 5e-07, "logits/chosen": -3376068.75, "logits/rejected": -29119336.0, "logps/chosen": -59.07611846923828, "logps/rejected": -370.5212097167969, "loss": 0.2709, "rewards/chosen": 0.3097839057445526, "rewards/margins": 2.5415761172771454, "rewards/rejected": -2.2317922115325928, "step": 5676 }, { "epoch": 0.3009037182307264, "grad_norm": 54.5, "kl": 0.15871810913085938, "learning_rate": 5e-07, "logits/chosen": -30057945.6, "logits/rejected": -16117074.666666666, "logps/chosen": -457.59521484375, "logps/rejected": -134.89898681640625, "loss": 0.3501, "rewards/chosen": 0.38477630615234376, "rewards/margins": 1.6754371643066406, "rewards/rejected": -1.2906608581542969, "step": 5677 }, { "epoch": 0.30095672223252856, "grad_norm": 52.5, "kl": 0.19810771942138672, "learning_rate": 5e-07, "logits/chosen": -20453116.8, "logits/rejected": -42053738.666666664, "logps/chosen": -225.8671875, "logps/rejected": -319.5880126953125, "loss": 0.278, "rewards/chosen": 0.871152687072754, "rewards/margins": 2.2754708607991536, "rewards/rejected": -1.4043181737263997, "step": 5678 }, { "epoch": 0.3010097262343307, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53292980.0, "logits/rejected": -28836900.57142857, "logps/chosen": -551.3687744140625, "logps/rejected": -590.48046875, "loss": 0.1956, "rewards/chosen": 0.2870849668979645, "rewards/margins": 2.183113758053098, "rewards/rejected": -1.896028791155134, "step": 5679 }, { "epoch": 0.30106273023613284, "grad_norm": 59.5, "kl": 0.666046142578125, "learning_rate": 5e-07, "logits/chosen": -55557312.0, "logits/rejected": -50298634.666666664, "logps/chosen": -336.4234375, "logps/rejected": -454.5091959635417, "loss": 0.3939, "rewards/chosen": 0.2305990934371948, "rewards/margins": 1.3106122096379598, "rewards/rejected": -1.080013116200765, "step": 5680 }, { "epoch": 0.301115734237935, "grad_norm": 65.0, "kl": 2.1966781616210938, "learning_rate": 5e-07, "logits/chosen": -65918912.0, "logits/rejected": -24192706.0, "logps/chosen": -568.0846557617188, "logps/rejected": -312.4612731933594, "loss": 0.2784, "rewards/chosen": 0.89385586977005, "rewards/margins": 2.8615227341651917, "rewards/rejected": -1.9676668643951416, "step": 5681 }, { "epoch": 0.3011687382397371, "grad_norm": 46.5, "kl": 0.4379920959472656, "learning_rate": 5e-07, "logits/chosen": -17296246.4, "logits/rejected": -19543881.333333332, "logps/chosen": -151.9750244140625, "logps/rejected": -297.2113850911458, "loss": 0.2614, "rewards/chosen": 0.7233860969543457, "rewards/margins": 2.8146749814351404, "rewards/rejected": -2.0912888844807944, "step": 5682 }, { "epoch": 0.30122174224153925, "grad_norm": 61.75, "kl": 1.7123193740844727, "learning_rate": 5e-07, "logits/chosen": -55648992.0, "logits/rejected": -19014610.0, "logps/chosen": -548.6192626953125, "logps/rejected": -379.7058410644531, "loss": 0.2909, "rewards/chosen": 0.9642165899276733, "rewards/margins": 2.211226463317871, "rewards/rejected": -1.2470098733901978, "step": 5683 }, { "epoch": 0.3012747462433414, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36097421.333333336, "logits/rejected": -36832496.0, "logps/chosen": -143.43995157877603, "logps/rejected": -407.6633544921875, "loss": 0.326, "rewards/chosen": -0.34796468416849774, "rewards/margins": 1.6603196938832598, "rewards/rejected": -2.0082843780517576, "step": 5684 }, { "epoch": 0.3013277502451435, "grad_norm": 49.25, "kl": 2.6498794555664062, "learning_rate": 5e-07, "logits/chosen": 2535625.6, "logits/rejected": -12853950.666666666, "logps/chosen": -405.1427734375, "logps/rejected": -350.7602132161458, "loss": 0.3144, "rewards/chosen": 0.8980524063110351, "rewards/margins": 2.5235166867574055, "rewards/rejected": -1.6254642804463704, "step": 5685 }, { "epoch": 0.30138075424694566, "grad_norm": 53.0, "kl": 0.5597877502441406, "learning_rate": 5e-07, "logits/chosen": -18088400.0, "logits/rejected": -39471776.0, "logps/chosen": -344.8526611328125, "logps/rejected": -209.655712890625, "loss": 0.2737, "rewards/chosen": 1.128116766611735, "rewards/margins": 2.334467856089274, "rewards/rejected": -1.206351089477539, "step": 5686 }, { "epoch": 0.3014337582487478, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5512132.5, "logits/rejected": -35647996.0, "logps/chosen": -132.45870971679688, "logps/rejected": -255.23995971679688, "loss": 0.3274, "rewards/chosen": 0.3295959532260895, "rewards/margins": 1.7092134058475494, "rewards/rejected": -1.37961745262146, "step": 5687 }, { "epoch": 0.30148676225054993, "grad_norm": 53.0, "kl": 0.21802520751953125, "learning_rate": 5e-07, "logits/chosen": -64561397.333333336, "logits/rejected": -1835563.0, "logps/chosen": -357.0327962239583, "logps/rejected": -58.38111572265625, "loss": 0.3727, "rewards/chosen": -0.04125823577245077, "rewards/margins": 1.0733846704165142, "rewards/rejected": -1.1146429061889649, "step": 5688 }, { "epoch": 0.30153976625235207, "grad_norm": 56.5, "kl": 0.17984771728515625, "learning_rate": 5e-07, "logits/chosen": -29394426.666666668, "logits/rejected": 5108756.0, "logps/chosen": -402.5543212890625, "logps/rejected": -554.451171875, "loss": 0.285, "rewards/chosen": 0.8747367858886719, "rewards/margins": 2.6535515785217285, "rewards/rejected": -1.7788147926330566, "step": 5689 }, { "epoch": 0.3015927702541542, "grad_norm": 45.25, "kl": 0.7299642562866211, "learning_rate": 5e-07, "logits/chosen": -7230894.5, "logits/rejected": -25711256.0, "logps/chosen": -122.90090942382812, "logps/rejected": -288.0228271484375, "loss": 0.3132, "rewards/chosen": 0.48515647649765015, "rewards/margins": 2.094441831111908, "rewards/rejected": -1.6092853546142578, "step": 5690 }, { "epoch": 0.30164577425595634, "grad_norm": 42.5, "kl": 0.3955879211425781, "learning_rate": 5e-07, "logits/chosen": -5241717.333333333, "logits/rejected": -39172198.4, "logps/chosen": -227.18634033203125, "logps/rejected": -333.4285400390625, "loss": 0.2697, "rewards/chosen": 0.4624454180399577, "rewards/margins": 2.3690843264261883, "rewards/rejected": -1.9066389083862305, "step": 5691 }, { "epoch": 0.3016987782577585, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7897359.0, "logits/rejected": -14228694.666666666, "logps/chosen": -183.72802734375, "logps/rejected": -451.7274576822917, "loss": 0.2609, "rewards/chosen": -0.02635059505701065, "rewards/margins": 1.6520758296052616, "rewards/rejected": -1.6784264246622722, "step": 5692 }, { "epoch": 0.3017517822595606, "grad_norm": 51.25, "kl": 0.5915012359619141, "learning_rate": 5e-07, "logits/chosen": -35512940.8, "logits/rejected": -95416853.33333333, "logps/chosen": -274.9605224609375, "logps/rejected": -483.0881754557292, "loss": 0.3317, "rewards/chosen": 0.2085343837738037, "rewards/margins": 2.674625857671102, "rewards/rejected": -2.4660914738972983, "step": 5693 }, { "epoch": 0.30180478626136276, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -606381.5, "logits/rejected": -22904872.0, "logps/chosen": -36.95651626586914, "logps/rejected": -376.9829508463542, "loss": 0.2555, "rewards/chosen": 0.21250011026859283, "rewards/margins": 2.2216007063786187, "rewards/rejected": -2.009100596110026, "step": 5694 }, { "epoch": 0.3018577902631649, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17510209.6, "logits/rejected": -24597530.666666668, "logps/chosen": -266.170654296875, "logps/rejected": -349.19921875, "loss": 0.3417, "rewards/chosen": 0.19555191993713378, "rewards/margins": 2.126291799545288, "rewards/rejected": -1.9307398796081543, "step": 5695 }, { "epoch": 0.30191079426496703, "grad_norm": 41.5, "kl": 0.46471595764160156, "learning_rate": 5e-07, "logits/chosen": -17733233.333333332, "logits/rejected": 10157150.4, "logps/chosen": -241.3819376627604, "logps/rejected": -380.732763671875, "loss": 0.3404, "rewards/chosen": -0.12894453605016074, "rewards/margins": 1.5356730024019878, "rewards/rejected": -1.6646175384521484, "step": 5696 }, { "epoch": 0.30196379826676917, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7824434.0, "logits/rejected": -36221564.8, "logps/chosen": -313.9512532552083, "logps/rejected": -418.459814453125, "loss": 0.3056, "rewards/chosen": -0.44497303167978924, "rewards/margins": 1.5568660338719684, "rewards/rejected": -2.0018390655517577, "step": 5697 }, { "epoch": 0.30201680226857125, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8312426.4, "logits/rejected": -32051048.0, "logps/chosen": -181.06934814453126, "logps/rejected": -583.5362141927084, "loss": 0.3493, "rewards/chosen": -0.03037865161895752, "rewards/margins": 2.5837275266647337, "rewards/rejected": -2.6141061782836914, "step": 5698 }, { "epoch": 0.3020698062703734, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57076184.0, "logits/rejected": -19482972.0, "logps/chosen": -268.273193359375, "logps/rejected": -565.2347412109375, "loss": 0.2584, "rewards/chosen": 0.31170880794525146, "rewards/margins": 2.614336371421814, "rewards/rejected": -2.3026275634765625, "step": 5699 }, { "epoch": 0.3021228102721755, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50823321.6, "logits/rejected": -57362133.333333336, "logps/chosen": -352.835595703125, "logps/rejected": -119.13585408528645, "loss": 0.3891, "rewards/chosen": 0.12856833934783934, "rewards/margins": 1.2608687162399292, "rewards/rejected": -1.1323003768920898, "step": 5700 }, { "epoch": 0.30217581427397766, "grad_norm": 59.5, "kl": 3.3279342651367188, "learning_rate": 5e-07, "logits/chosen": -10779728.0, "logits/rejected": -67511061.33333333, "logps/chosen": -490.711669921875, "logps/rejected": -287.76011149088544, "loss": 0.2928, "rewards/chosen": 1.3808448791503907, "rewards/margins": 2.9008413632710774, "rewards/rejected": -1.5199964841206868, "step": 5701 }, { "epoch": 0.3022288182757798, "grad_norm": 41.5, "kl": 0.18834495544433594, "learning_rate": 5e-07, "logits/chosen": -14378073.333333334, "logits/rejected": -1088532.6, "logps/chosen": -320.9617106119792, "logps/rejected": -60.85498046875, "loss": 0.3054, "rewards/chosen": 0.6656460762023926, "rewards/margins": 1.7339975357055664, "rewards/rejected": -1.0683514595031738, "step": 5702 }, { "epoch": 0.30228182227758194, "grad_norm": 57.5, "kl": 1.4076614379882812, "learning_rate": 5e-07, "logits/chosen": -73879888.0, "logits/rejected": -4904381.0, "logps/chosen": -576.2283325195312, "logps/rejected": -139.2069091796875, "loss": 0.4229, "rewards/chosen": 0.4138542413711548, "rewards/margins": 0.9544820785522461, "rewards/rejected": -0.5406278371810913, "step": 5703 }, { "epoch": 0.3023348262793841, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -683867.6, "logits/rejected": -39384877.333333336, "logps/chosen": -192.5159423828125, "logps/rejected": -286.22943115234375, "loss": 0.3415, "rewards/chosen": 0.406998872756958, "rewards/margins": 2.275955279668172, "rewards/rejected": -1.8689564069112141, "step": 5704 }, { "epoch": 0.3023878302811862, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44854716.0, "logits/rejected": -68186112.0, "logps/chosen": -264.7115783691406, "logps/rejected": -324.2060546875, "loss": 0.3115, "rewards/chosen": 0.3202310800552368, "rewards/margins": 1.8959555625915527, "rewards/rejected": -1.575724482536316, "step": 5705 }, { "epoch": 0.30244083428298835, "grad_norm": 49.0, "kl": 1.0737762451171875, "learning_rate": 5e-07, "logits/chosen": 3261213.0, "logits/rejected": -7252471.0, "logps/chosen": -449.33819580078125, "logps/rejected": -208.45240783691406, "loss": 0.2299, "rewards/chosen": 0.7348281741142273, "rewards/margins": 3.6667911410331726, "rewards/rejected": -2.9319629669189453, "step": 5706 }, { "epoch": 0.3024938382847905, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5517719.5, "logits/rejected": -8621018.857142856, "logps/chosen": -19.966983795166016, "logps/rejected": -209.50861467633928, "loss": 0.2499, "rewards/chosen": -0.4330427348613739, "rewards/margins": 1.1965913048812322, "rewards/rejected": -1.6296340397426061, "step": 5707 }, { "epoch": 0.3025468422865926, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17875041.6, "logits/rejected": -46938384.0, "logps/chosen": -252.12177734375, "logps/rejected": -593.5458577473959, "loss": 0.3111, "rewards/chosen": 0.3445852279663086, "rewards/margins": 2.6952798843383787, "rewards/rejected": -2.3506946563720703, "step": 5708 }, { "epoch": 0.30259984628839476, "grad_norm": 51.5, "kl": 1.1672039031982422, "learning_rate": 5e-07, "logits/chosen": -50301576.0, "logits/rejected": 6416085.5, "logps/chosen": -510.99993896484375, "logps/rejected": -243.1946563720703, "loss": 0.3594, "rewards/chosen": 0.3182505965232849, "rewards/margins": 1.7386977076530457, "rewards/rejected": -1.4204471111297607, "step": 5709 }, { "epoch": 0.3026528502901969, "grad_norm": 62.75, "kl": 1.3174972534179688, "learning_rate": 5e-07, "logits/chosen": -16748032.0, "logits/rejected": -17786438.0, "logps/chosen": -274.3389485677083, "logps/rejected": -318.6072998046875, "loss": 0.3933, "rewards/chosen": 0.29188738266626996, "rewards/margins": 2.2266560594240823, "rewards/rejected": -1.9347686767578125, "step": 5710 }, { "epoch": 0.30270585429199903, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50787365.333333336, "logits/rejected": -22611952.0, "logps/chosen": -317.55712890625, "logps/rejected": -287.10068359375, "loss": 0.3184, "rewards/chosen": -0.09837037324905396, "rewards/margins": 1.3510360598564148, "rewards/rejected": -1.4494064331054688, "step": 5711 }, { "epoch": 0.30275885829380117, "grad_norm": 75.5, "kl": 0.5323333740234375, "learning_rate": 5e-07, "logits/chosen": 7486453.333333333, "logits/rejected": -13195341.0, "logps/chosen": -580.1378580729166, "logps/rejected": -201.05050659179688, "loss": 0.3352, "rewards/chosen": 0.3776206970214844, "rewards/margins": 3.3380513191223145, "rewards/rejected": -2.96043062210083, "step": 5712 }, { "epoch": 0.3028118622956033, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -1379711.0, "logps/rejected": -186.1584014892578, "loss": 0.2996, "rewards/rejected": -0.9754316210746765, "step": 5713 }, { "epoch": 0.30286486629740544, "grad_norm": 77.0, "kl": 1.2103729248046875, "learning_rate": 5e-07, "logits/chosen": -23029336.0, "logits/rejected": 7543157.333333333, "logps/chosen": -856.4607421875, "logps/rejected": -252.74737548828125, "loss": 0.2566, "rewards/chosen": 0.7118283271789551, "rewards/margins": 5.339003658294677, "rewards/rejected": -4.627175331115723, "step": 5714 }, { "epoch": 0.3029178702992076, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31346018.0, "logits/rejected": -7642826.5, "logps/chosen": -258.5936279296875, "logps/rejected": -188.9012908935547, "loss": 0.3204, "rewards/chosen": 0.34207114577293396, "rewards/margins": 1.879825085401535, "rewards/rejected": -1.537753939628601, "step": 5715 }, { "epoch": 0.3029708743010097, "grad_norm": 46.5, "kl": 0.1497821807861328, "learning_rate": 5e-07, "logits/chosen": 8689944.0, "logits/rejected": -9912386.0, "logps/chosen": -201.1749725341797, "logps/rejected": -170.93211364746094, "loss": 0.3668, "rewards/chosen": 0.18699751794338226, "rewards/margins": 1.2089754194021225, "rewards/rejected": -1.0219779014587402, "step": 5716 }, { "epoch": 0.30302387830281186, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91665952.0, "logits/rejected": -17514840.0, "logps/chosen": -138.59835815429688, "logps/rejected": -425.46075439453125, "loss": 0.3135, "rewards/chosen": 0.2883760631084442, "rewards/margins": 1.9112559258937836, "rewards/rejected": -1.6228798627853394, "step": 5717 }, { "epoch": 0.303076882304614, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28273923.2, "logits/rejected": -34891664.0, "logps/chosen": -253.473193359375, "logps/rejected": -321.3955078125, "loss": 0.3766, "rewards/chosen": 0.07617889046669006, "rewards/margins": 1.5254346748193104, "rewards/rejected": -1.4492557843526204, "step": 5718 }, { "epoch": 0.30312988630641613, "grad_norm": 61.5, "kl": 1.2638883590698242, "learning_rate": 5e-07, "logits/chosen": -20921444.57142857, "logits/rejected": -24193828.0, "logps/chosen": -205.57667759486608, "logps/rejected": -397.33355712890625, "loss": 0.4801, "rewards/chosen": 0.004510155745915004, "rewards/margins": 2.2133145247186934, "rewards/rejected": -2.2088043689727783, "step": 5719 }, { "epoch": 0.30318289030821827, "grad_norm": 39.25, "kl": 0.22654151916503906, "learning_rate": 5e-07, "logits/chosen": -11845522.0, "logits/rejected": -49415224.0, "logps/chosen": -206.28488159179688, "logps/rejected": -438.82220458984375, "loss": 0.267, "rewards/chosen": 0.18124905228614807, "rewards/margins": 3.1213888227939606, "rewards/rejected": -2.9401397705078125, "step": 5720 }, { "epoch": 0.3032358943100204, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10775878.0, "logits/rejected": -9276920.0, "logps/chosen": -234.05796813964844, "logps/rejected": -343.2236328125, "loss": 0.3128, "rewards/chosen": 0.23248282074928284, "rewards/margins": 2.147902637720108, "rewards/rejected": -1.9154198169708252, "step": 5721 }, { "epoch": 0.30328889831182254, "grad_norm": 30.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -160504.20833333334, "logits/rejected": -26397921.6, "logps/chosen": -67.5263163248698, "logps/rejected": -265.062890625, "loss": 0.2657, "rewards/chosen": -0.03801002105077108, "rewards/margins": 2.0727653543154396, "rewards/rejected": -2.110775375366211, "step": 5722 }, { "epoch": 0.3033419023136247, "grad_norm": 45.25, "kl": 0.3791465759277344, "learning_rate": 5e-07, "logits/chosen": -23809996.8, "logits/rejected": -11470668.0, "logps/chosen": -201.6124755859375, "logps/rejected": -416.729248046875, "loss": 0.3193, "rewards/chosen": 0.43435420989990237, "rewards/margins": 3.0994298299153646, "rewards/rejected": -2.6650756200154624, "step": 5723 }, { "epoch": 0.3033949063154268, "grad_norm": 94.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63041576.0, "logits/rejected": -11331811.0, "logps/chosen": -673.155029296875, "logps/rejected": -239.5666961669922, "loss": 0.2576, "rewards/chosen": 1.013035774230957, "rewards/margins": 2.4726051092147827, "rewards/rejected": -1.4595693349838257, "step": 5724 }, { "epoch": 0.30344791031722895, "grad_norm": 47.5, "kl": 0.79986572265625, "learning_rate": 5e-07, "logits/chosen": -123283912.0, "logits/rejected": -28380750.0, "logps/chosen": -927.0135498046875, "logps/rejected": -257.8568115234375, "loss": 0.3158, "rewards/chosen": 0.6980072855949402, "rewards/margins": 2.7068437933921814, "rewards/rejected": -2.008836507797241, "step": 5725 }, { "epoch": 0.3035009143190311, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5841715.6, "logits/rejected": -27775234.666666668, "logps/chosen": -206.7990234375, "logps/rejected": -301.4112955729167, "loss": 0.3076, "rewards/chosen": 0.32204408645629884, "rewards/margins": 2.718147627512614, "rewards/rejected": -2.396103541056315, "step": 5726 }, { "epoch": 0.3035539183208332, "grad_norm": 41.5, "kl": 2.3003311157226562, "learning_rate": 5e-07, "logits/chosen": -55886276.0, "logits/rejected": -32906472.0, "logps/chosen": -497.71881103515625, "logps/rejected": -429.6683654785156, "loss": 0.2417, "rewards/chosen": 1.0019254684448242, "rewards/margins": 3.3245201110839844, "rewards/rejected": -2.32259464263916, "step": 5727 }, { "epoch": 0.30360692232263536, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29382584.0, "logits/rejected": -31053165.714285713, "logps/chosen": -295.0639343261719, "logps/rejected": -515.3173828125, "loss": 0.1296, "rewards/chosen": 0.8758026361465454, "rewards/margins": 3.6722828149795532, "rewards/rejected": -2.796480178833008, "step": 5728 }, { "epoch": 0.3036599263244375, "grad_norm": 47.5, "kl": 0.21461105346679688, "learning_rate": 5e-07, "logits/chosen": -45262720.0, "logits/rejected": -22645142.0, "logps/chosen": -208.4999237060547, "logps/rejected": -315.2594909667969, "loss": 0.3411, "rewards/chosen": 0.2417728304862976, "rewards/margins": 1.7544236779212952, "rewards/rejected": -1.5126508474349976, "step": 5729 }, { "epoch": 0.30371293032623964, "grad_norm": 52.0, "kl": 0.06637191772460938, "learning_rate": 5e-07, "logits/chosen": -25340970.666666668, "logits/rejected": -25827140.8, "logps/chosen": -152.01175944010416, "logps/rejected": -267.886962890625, "loss": 0.2726, "rewards/chosen": 0.13946889837582907, "rewards/margins": 1.8932717700799306, "rewards/rejected": -1.7538028717041017, "step": 5730 }, { "epoch": 0.3037659343280418, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10898477.333333334, "logits/rejected": -71875244.8, "logps/chosen": -264.9251302083333, "logps/rejected": -331.36015625, "loss": 0.2157, "rewards/chosen": 0.7543250719706217, "rewards/margins": 2.604576841990153, "rewards/rejected": -1.8502517700195313, "step": 5731 }, { "epoch": 0.3038189383298439, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1201194.0, "logits/rejected": -12911143.2, "logps/chosen": -216.8641357421875, "logps/rejected": -367.219140625, "loss": 0.2664, "rewards/chosen": 0.21517244974772134, "rewards/margins": 2.275295321146647, "rewards/rejected": -2.0601228713989257, "step": 5732 }, { "epoch": 0.30387194233164605, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13537516.0, "logits/rejected": -21398398.666666668, "logps/chosen": -45.00934982299805, "logps/rejected": -280.41371663411456, "loss": 0.2864, "rewards/chosen": 0.042647745460271835, "rewards/margins": 1.4112266562879086, "rewards/rejected": -1.3685789108276367, "step": 5733 }, { "epoch": 0.3039249463334482, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34001396.0, "logits/rejected": -31574328.0, "logps/chosen": -278.98809814453125, "logps/rejected": -197.64349365234375, "loss": 0.3479, "rewards/chosen": -0.0751621201634407, "rewards/margins": 1.5292915627360344, "rewards/rejected": -1.604453682899475, "step": 5734 }, { "epoch": 0.3039779503352503, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47883818.666666664, "logits/rejected": -8457642.4, "logps/chosen": -367.4800618489583, "logps/rejected": -264.976708984375, "loss": 0.2692, "rewards/chosen": 0.1891012191772461, "rewards/margins": 2.135563850402832, "rewards/rejected": -1.946462631225586, "step": 5735 }, { "epoch": 0.30403095433705246, "grad_norm": 54.0, "kl": 0.06418991088867188, "learning_rate": 5e-07, "logits/chosen": -31100098.0, "logits/rejected": -42564840.0, "logps/chosen": -272.4051513671875, "logps/rejected": -222.61793518066406, "loss": 0.4074, "rewards/chosen": -0.0722871869802475, "rewards/margins": 1.0063818842172623, "rewards/rejected": -1.0786690711975098, "step": 5736 }, { "epoch": 0.3040839583388546, "grad_norm": 55.75, "kl": 1.0101757049560547, "learning_rate": 5e-07, "logits/chosen": -16328672.0, "logits/rejected": -48811466.666666664, "logps/chosen": -261.2880859375, "logps/rejected": -428.2849934895833, "loss": 0.3055, "rewards/chosen": 0.30838742256164553, "rewards/margins": 3.215396229426066, "rewards/rejected": -2.9070088068644204, "step": 5737 }, { "epoch": 0.30413696234065674, "grad_norm": 52.25, "kl": 0.24498558044433594, "learning_rate": 5e-07, "logits/chosen": -21683657.6, "logits/rejected": -31000938.666666668, "logps/chosen": -317.679541015625, "logps/rejected": -198.029541015625, "loss": 0.3119, "rewards/chosen": 0.5935494899749756, "rewards/margins": 2.0177535216013593, "rewards/rejected": -1.4242040316263835, "step": 5738 }, { "epoch": 0.3041899663424589, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22385546.0, "logits/rejected": -89756480.0, "logps/chosen": -365.94635009765625, "logps/rejected": -379.9772135416667, "loss": 0.1903, "rewards/chosen": 0.18836231529712677, "rewards/margins": 2.4235040694475174, "rewards/rejected": -2.2351417541503906, "step": 5739 }, { "epoch": 0.304242970344261, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11025630.0, "logits/rejected": -56638952.0, "logps/chosen": -146.78179931640625, "logps/rejected": -345.2093811035156, "loss": 0.2867, "rewards/chosen": 0.2633952796459198, "rewards/margins": 2.1838283240795135, "rewards/rejected": -1.9204330444335938, "step": 5740 }, { "epoch": 0.30429597434606315, "grad_norm": 51.0, "kl": 0.04662132263183594, "learning_rate": 5e-07, "logits/chosen": -8895157.6, "logits/rejected": -687850.0, "logps/chosen": -113.69193115234376, "logps/rejected": -137.28865559895834, "loss": 0.4463, "rewards/chosen": -0.1661144256591797, "rewards/margins": 0.7943363825480143, "rewards/rejected": -0.960450808207194, "step": 5741 }, { "epoch": 0.3043489783478653, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33024992.0, "logits/rejected": -80721446.4, "logps/chosen": -583.7147216796875, "logps/rejected": -536.91552734375, "loss": 0.2518, "rewards/chosen": 0.28996262947718304, "rewards/margins": 2.8575962583223977, "rewards/rejected": -2.567633628845215, "step": 5742 }, { "epoch": 0.3044019823496674, "grad_norm": 61.25, "kl": 0.4791603088378906, "learning_rate": 5e-07, "logits/chosen": -96846918.4, "logits/rejected": -24769008.0, "logps/chosen": -367.693310546875, "logps/rejected": -292.2972005208333, "loss": 0.3319, "rewards/chosen": 0.1859430193901062, "rewards/margins": 2.5978511691093447, "rewards/rejected": -2.4119081497192383, "step": 5743 }, { "epoch": 0.30445498635146956, "grad_norm": 58.0, "kl": 1.2772674560546875, "learning_rate": 5e-07, "logits/chosen": -81228304.0, "logits/rejected": -43994944.0, "logps/chosen": -542.6547241210938, "logps/rejected": -176.01963806152344, "loss": 0.3722, "rewards/chosen": 0.1082611009478569, "rewards/margins": 1.6507825776934624, "rewards/rejected": -1.5425214767456055, "step": 5744 }, { "epoch": 0.3045079903532717, "grad_norm": 54.75, "kl": 1.6631050109863281, "learning_rate": 5e-07, "logits/chosen": -29977240.0, "logits/rejected": -37573704.0, "logps/chosen": -570.8211669921875, "logps/rejected": -519.6343994140625, "loss": 0.2816, "rewards/chosen": 0.9414412975311279, "rewards/margins": 3.290147542953491, "rewards/rejected": -2.3487062454223633, "step": 5745 }, { "epoch": 0.30456099435507383, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1624826.0, "logits/rejected": -23867316.8, "logps/chosen": -44.9233144124349, "logps/rejected": -283.32724609375, "loss": 0.3128, "rewards/chosen": -0.017595301071802776, "rewards/margins": 1.6030902763207753, "rewards/rejected": -1.6206855773925781, "step": 5746 }, { "epoch": 0.30461399835687597, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1630382.6, "logits/rejected": -43687877.333333336, "logps/chosen": -77.55110473632813, "logps/rejected": -364.4329427083333, "loss": 0.2924, "rewards/chosen": 0.45600414276123047, "rewards/margins": 3.266450881958008, "rewards/rejected": -2.8104467391967773, "step": 5747 }, { "epoch": 0.3046670023586781, "grad_norm": 43.25, "kl": 0.3397808074951172, "learning_rate": 5e-07, "logits/chosen": -32319760.0, "logits/rejected": -45271488.0, "logps/chosen": -297.10430908203125, "logps/rejected": -512.408203125, "loss": 0.2815, "rewards/chosen": 0.2271738350391388, "rewards/margins": 2.645995169878006, "rewards/rejected": -2.418821334838867, "step": 5748 }, { "epoch": 0.3047200063604802, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61785484.8, "logits/rejected": -1395734.5, "logps/chosen": -205.1506103515625, "logps/rejected": -166.19220987955728, "loss": 0.4184, "rewards/chosen": -0.029826465249061584, "rewards/margins": 1.435355998078982, "rewards/rejected": -1.4651824633280437, "step": 5749 }, { "epoch": 0.3047730103622823, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5541209.5, "logits/rejected": -31057546.666666668, "logps/chosen": -136.79197692871094, "logps/rejected": -524.815185546875, "loss": 0.1677, "rewards/chosen": 0.3097721040248871, "rewards/margins": 3.1192635794480643, "rewards/rejected": -2.8094914754231772, "step": 5750 }, { "epoch": 0.30482601436408446, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92364885.33333333, "logits/rejected": -9126236.8, "logps/chosen": -353.4814453125, "logps/rejected": -437.463525390625, "loss": 0.237, "rewards/chosen": 0.29913942019144696, "rewards/margins": 2.5276264349619546, "rewards/rejected": -2.228487014770508, "step": 5751 }, { "epoch": 0.3048790183658866, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79248288.0, "logits/rejected": -44883149.71428572, "logps/chosen": -198.81866455078125, "logps/rejected": -301.54007393973217, "loss": 0.2725, "rewards/chosen": -0.5840896964073181, "rewards/margins": 0.922450874532972, "rewards/rejected": -1.5065405709402901, "step": 5752 }, { "epoch": 0.30493202236768874, "grad_norm": 58.25, "kl": 0.11419677734375, "learning_rate": 5e-07, "logits/chosen": -68357184.0, "logits/rejected": 9713855.0, "logps/chosen": -226.35137939453125, "logps/rejected": -205.1416015625, "loss": 0.4108, "rewards/chosen": -0.36564990878105164, "rewards/margins": 0.8995236456394196, "rewards/rejected": -1.2651735544204712, "step": 5753 }, { "epoch": 0.3049850263694909, "grad_norm": 48.0, "kl": 2.0970964431762695, "learning_rate": 5e-07, "logits/chosen": -25574552.0, "logits/rejected": -7919466.285714285, "logps/chosen": -1024.8223876953125, "logps/rejected": -284.35498046875, "loss": 0.1464, "rewards/chosen": 2.475329637527466, "rewards/margins": 4.308949163981847, "rewards/rejected": -1.8336195264543806, "step": 5754 }, { "epoch": 0.305038030371293, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20445628.0, "logits/rejected": -6378546.5, "logps/chosen": -192.79705810546875, "logps/rejected": -164.11932373046875, "loss": 0.3608, "rewards/chosen": 0.27972379326820374, "rewards/margins": 1.6373521387577057, "rewards/rejected": -1.357628345489502, "step": 5755 }, { "epoch": 0.30509103437309515, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53358960.0, "logits/rejected": -23593192.0, "logps/chosen": -699.73779296875, "logps/rejected": -435.5825602213542, "loss": 0.1995, "rewards/chosen": 1.1255929470062256, "rewards/margins": 3.0471699237823486, "rewards/rejected": -1.921576976776123, "step": 5756 }, { "epoch": 0.3051440383748973, "grad_norm": 50.0, "kl": 0.8855714797973633, "learning_rate": 5e-07, "logits/chosen": -6552049.142857143, "logits/rejected": -12646251.0, "logps/chosen": -546.8534109933036, "logps/rejected": -198.4170684814453, "loss": 0.3051, "rewards/chosen": 0.9711696760995048, "rewards/margins": 3.0117199080330987, "rewards/rejected": -2.0405502319335938, "step": 5757 }, { "epoch": 0.3051970423766994, "grad_norm": 59.25, "kl": 0.5129737854003906, "learning_rate": 5e-07, "logits/chosen": -51458707.2, "logits/rejected": -52361546.666666664, "logps/chosen": -361.769775390625, "logps/rejected": -330.48191324869794, "loss": 0.444, "rewards/chosen": -0.17318146228790282, "rewards/margins": 1.0114529053370158, "rewards/rejected": -1.1846343676249187, "step": 5758 }, { "epoch": 0.30525004637850156, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7134075.0, "logits/rejected": -37284509.333333336, "logps/chosen": -353.1593017578125, "logps/rejected": -333.62451171875, "loss": 0.1982, "rewards/chosen": 0.9750045537948608, "rewards/margins": 3.1601756811141968, "rewards/rejected": -2.185171127319336, "step": 5759 }, { "epoch": 0.3053030503803037, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48424152.0, "logits/rejected": -3340485.5, "logps/chosen": -429.2343444824219, "logps/rejected": -142.77078247070312, "loss": 0.238, "rewards/chosen": 0.8077003359794617, "rewards/margins": 2.7379884123802185, "rewards/rejected": -1.9302880764007568, "step": 5760 }, { "epoch": 0.30535605438210583, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -187705.0, "logits/rejected": -1595447.5, "logps/chosen": -149.72575887044272, "logps/rejected": -69.65567779541016, "loss": 0.347, "rewards/chosen": 0.26418960094451904, "rewards/margins": 2.9568642377853394, "rewards/rejected": -2.6926746368408203, "step": 5761 }, { "epoch": 0.30540905838390797, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36077285.333333336, "logits/rejected": -34915788.8, "logps/chosen": -310.341064453125, "logps/rejected": -511.18310546875, "loss": 0.216, "rewards/chosen": 0.31257768472035724, "rewards/margins": 2.8406830390294395, "rewards/rejected": -2.528105354309082, "step": 5762 }, { "epoch": 0.3054620623857101, "grad_norm": 71.0, "kl": 1.5813674926757812, "learning_rate": 5e-07, "logits/chosen": -85274992.0, "logits/rejected": -14380638.0, "logps/chosen": -426.1015625, "logps/rejected": -112.05599975585938, "loss": 0.3176, "rewards/chosen": 0.38191473484039307, "rewards/margins": 2.451178193092346, "rewards/rejected": -2.069263458251953, "step": 5763 }, { "epoch": 0.30551506638751225, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5167352.666666667, "logits/rejected": 12415468.0, "logps/chosen": -576.8604736328125, "logps/rejected": -130.386279296875, "loss": 0.2244, "rewards/chosen": 1.1850261688232422, "rewards/margins": 2.8282724380493165, "rewards/rejected": -1.6432462692260743, "step": 5764 }, { "epoch": 0.3055680703893144, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9509023.2, "logits/rejected": -3761239.0, "logps/chosen": -213.6888671875, "logps/rejected": -289.23777262369794, "loss": 0.411, "rewards/chosen": -0.07046074867248535, "rewards/margins": 1.1544349670410157, "rewards/rejected": -1.224895715713501, "step": 5765 }, { "epoch": 0.3056210743911165, "grad_norm": 35.75, "kl": 0.6632843017578125, "learning_rate": 5e-07, "logits/chosen": -16647784.0, "logits/rejected": -26291604.8, "logps/chosen": -168.371826171875, "logps/rejected": -253.48935546875, "loss": 0.2578, "rewards/chosen": 0.3175450960795085, "rewards/margins": 2.579799620310465, "rewards/rejected": -2.262254524230957, "step": 5766 }, { "epoch": 0.30567407839291866, "grad_norm": 53.75, "kl": 0.5999069213867188, "learning_rate": 5e-07, "logits/chosen": -41833744.0, "logits/rejected": -50784536.0, "logps/chosen": -323.9407043457031, "logps/rejected": -292.138916015625, "loss": 0.3715, "rewards/chosen": 0.10380251705646515, "rewards/margins": 1.743146613240242, "rewards/rejected": -1.6393440961837769, "step": 5767 }, { "epoch": 0.3057270823947208, "grad_norm": 55.0, "kl": 1.155487060546875, "learning_rate": 5e-07, "logits/chosen": -29621768.0, "logits/rejected": -2386413.0, "logps/chosen": -336.4056091308594, "logps/rejected": -110.87293243408203, "loss": 0.2858, "rewards/chosen": 0.8787540793418884, "rewards/margins": 2.005184233188629, "rewards/rejected": -1.1264301538467407, "step": 5768 }, { "epoch": 0.30578008639652293, "grad_norm": 41.75, "kl": 0.25432682037353516, "learning_rate": 5e-07, "logits/chosen": -333937.0, "logits/rejected": -18338868.0, "logps/chosen": -162.19598388671875, "logps/rejected": -235.37815856933594, "loss": 0.3343, "rewards/chosen": 0.4050307273864746, "rewards/margins": 1.6024763584136963, "rewards/rejected": -1.1974456310272217, "step": 5769 }, { "epoch": 0.30583309039832507, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19700004.0, "logits/rejected": -25796048.0, "logps/chosen": -304.989990234375, "logps/rejected": -406.97857666015625, "loss": 0.2921, "rewards/chosen": 0.14760665595531464, "rewards/margins": 2.6643507927656174, "rewards/rejected": -2.5167441368103027, "step": 5770 }, { "epoch": 0.3058860944001272, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26341512.0, "logits/rejected": -30054844.0, "logps/chosen": -283.10418701171875, "logps/rejected": -561.4598388671875, "loss": 0.3839, "rewards/chosen": 0.18230567375818887, "rewards/margins": 2.4409116307894387, "rewards/rejected": -2.25860595703125, "step": 5771 }, { "epoch": 0.30593909840192934, "grad_norm": 51.0, "kl": 0.7368850708007812, "learning_rate": 5e-07, "logits/chosen": -51781109.333333336, "logits/rejected": -25146667.2, "logps/chosen": -280.0361735026042, "logps/rejected": -391.3328369140625, "loss": 0.2925, "rewards/chosen": 0.10731518268585205, "rewards/margins": 2.129171109199524, "rewards/rejected": -2.021855926513672, "step": 5772 }, { "epoch": 0.3059921024037315, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 678216.25, "logits/rejected": -31565078.85714286, "logps/chosen": -99.40190124511719, "logps/rejected": -208.39618791852678, "loss": 0.2216, "rewards/chosen": -0.7232574820518494, "rewards/margins": 1.0746006199291773, "rewards/rejected": -1.7978581019810267, "step": 5773 }, { "epoch": 0.3060451064055336, "grad_norm": 41.5, "kl": 1.3092041015625, "learning_rate": 5e-07, "logits/chosen": -25272112.0, "logits/rejected": 147651097.6, "logps/chosen": -312.7524007161458, "logps/rejected": -453.86220703125, "loss": 0.2756, "rewards/chosen": 0.3427240451176961, "rewards/margins": 2.4789594729741418, "rewards/rejected": -2.1362354278564455, "step": 5774 }, { "epoch": 0.30609811040733576, "grad_norm": 53.5, "kl": 0.5921478271484375, "learning_rate": 5e-07, "logits/chosen": -41411840.0, "logits/rejected": -13780684.8, "logps/chosen": -524.078369140625, "logps/rejected": -260.16181640625, "loss": 0.3383, "rewards/chosen": 0.06584066152572632, "rewards/margins": 1.631243646144867, "rewards/rejected": -1.5654029846191406, "step": 5775 }, { "epoch": 0.3061511144091379, "grad_norm": 40.25, "kl": 0.5789775848388672, "learning_rate": 5e-07, "logits/chosen": -11980312.0, "logits/rejected": -42963346.666666664, "logps/chosen": -226.1886474609375, "logps/rejected": -379.2063802083333, "loss": 0.3269, "rewards/chosen": 0.35225648880004884, "rewards/margins": 2.644870154062907, "rewards/rejected": -2.292613665262858, "step": 5776 }, { "epoch": 0.30620411841094003, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18478240.0, "logits/rejected": -76265932.8, "logps/chosen": -163.05471801757812, "logps/rejected": -386.2771484375, "loss": 0.2456, "rewards/chosen": 0.10008348027865092, "rewards/margins": 2.3915721277395883, "rewards/rejected": -2.2914886474609375, "step": 5777 }, { "epoch": 0.30625712241274217, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42636544.0, "logits/rejected": -6605389.5, "logps/chosen": -174.56600952148438, "logps/rejected": -241.2847442626953, "loss": 0.2805, "rewards/chosen": 0.9022471904754639, "rewards/margins": 2.2753078937530518, "rewards/rejected": -1.373060703277588, "step": 5778 }, { "epoch": 0.3063101264145443, "grad_norm": 34.0, "kl": 0.3504667282104492, "learning_rate": 5e-07, "logits/chosen": -6102450.666666667, "logits/rejected": -36248755.2, "logps/chosen": -191.12556966145834, "logps/rejected": -449.524267578125, "loss": 0.2709, "rewards/chosen": 0.2642092704772949, "rewards/margins": 2.266173076629639, "rewards/rejected": -2.001963806152344, "step": 5779 }, { "epoch": 0.30636313041634644, "grad_norm": 43.5, "kl": 0.3220939636230469, "learning_rate": 5e-07, "logits/chosen": -16376848.0, "logits/rejected": 63574368.0, "logps/chosen": -212.77647399902344, "logps/rejected": -263.2276611328125, "loss": 0.3124, "rewards/chosen": 0.059182584285736084, "rewards/margins": 2.034986436367035, "rewards/rejected": -1.9758038520812988, "step": 5780 }, { "epoch": 0.3064161344181486, "grad_norm": 62.5, "kl": 0.7899246215820312, "learning_rate": 5e-07, "logits/chosen": -40859284.0, "logits/rejected": -20449274.0, "logps/chosen": -561.7666625976562, "logps/rejected": -374.8661193847656, "loss": 0.2689, "rewards/chosen": 0.7819352149963379, "rewards/margins": 2.489487886428833, "rewards/rejected": -1.7075526714324951, "step": 5781 }, { "epoch": 0.3064691384199507, "grad_norm": 25.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4551248.0, "logits/rejected": -29520470.85714286, "logps/chosen": -34.31353759765625, "logps/rejected": -326.0777064732143, "loss": 0.1568, "rewards/chosen": 0.21154938638210297, "rewards/margins": 2.8548588433435986, "rewards/rejected": -2.6433094569614957, "step": 5782 }, { "epoch": 0.30652214242175285, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33350410.666666668, "logits/rejected": 30673337.6, "logps/chosen": -211.27484130859375, "logps/rejected": -379.8624755859375, "loss": 0.296, "rewards/chosen": 0.07555986444155376, "rewards/margins": 1.9821423153082531, "rewards/rejected": -1.9065824508666993, "step": 5783 }, { "epoch": 0.306575146423555, "grad_norm": 57.5, "kl": 3.394847869873047, "learning_rate": 5e-07, "logits/chosen": -7186781.333333333, "logits/rejected": -51909988.0, "logps/chosen": -312.2294108072917, "logps/rejected": -155.27496337890625, "loss": 0.4597, "rewards/chosen": 0.2949297825495402, "rewards/margins": 1.8957912127176921, "rewards/rejected": -1.6008614301681519, "step": 5784 }, { "epoch": 0.3066281504253571, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5640326.666666667, "logits/rejected": -12011268.0, "logps/chosen": -87.12195841471355, "logps/rejected": -205.268798828125, "loss": 0.4896, "rewards/chosen": -0.14276559154192606, "rewards/margins": 0.46675090988477075, "rewards/rejected": -0.6095165014266968, "step": 5785 }, { "epoch": 0.30668115442715926, "grad_norm": 53.0, "kl": 1.0368881225585938, "learning_rate": 5e-07, "logits/chosen": -54831737.6, "logits/rejected": -60859418.666666664, "logps/chosen": -413.09833984375, "logps/rejected": -395.858642578125, "loss": 0.3378, "rewards/chosen": 0.23465342521667482, "rewards/margins": 2.524363946914673, "rewards/rejected": -2.289710521697998, "step": 5786 }, { "epoch": 0.3067341584289614, "grad_norm": 43.75, "kl": 0.14882659912109375, "learning_rate": 5e-07, "logits/chosen": -54920344.0, "logits/rejected": -33794532.0, "logps/chosen": -327.6492004394531, "logps/rejected": -293.6524658203125, "loss": 0.2639, "rewards/chosen": 0.3154100775718689, "rewards/margins": 3.7454795241355896, "rewards/rejected": -3.4300694465637207, "step": 5787 }, { "epoch": 0.30678716243076354, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17826234.666666668, "logits/rejected": -28031257.6, "logps/chosen": -418.242431640625, "logps/rejected": -236.565478515625, "loss": 0.2151, "rewards/chosen": 0.7761159737904867, "rewards/margins": 2.799350150426229, "rewards/rejected": -2.023234176635742, "step": 5788 }, { "epoch": 0.3068401664325657, "grad_norm": 51.0, "kl": 1.661376953125, "learning_rate": 5e-07, "logits/chosen": -33700728.0, "logits/rejected": -12208643.0, "logps/chosen": -370.8638916015625, "logps/rejected": -136.36196899414062, "loss": 0.2648, "rewards/chosen": 0.8959599733352661, "rewards/margins": 2.6275895833969116, "rewards/rejected": -1.7316296100616455, "step": 5789 }, { "epoch": 0.3068931704343678, "grad_norm": 48.75, "kl": 0.3526029586791992, "learning_rate": 5e-07, "logits/chosen": -35240393.6, "logits/rejected": -11585586.666666666, "logps/chosen": -148.82779541015626, "logps/rejected": -430.8324381510417, "loss": 0.3392, "rewards/chosen": 0.3001181840896606, "rewards/margins": 2.343247628211975, "rewards/rejected": -2.0431294441223145, "step": 5790 }, { "epoch": 0.30694617443616995, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16442624.0, "logits/rejected": -31721958.4, "logps/chosen": -206.2037556966146, "logps/rejected": -354.9661865234375, "loss": 0.2394, "rewards/chosen": 0.477106769879659, "rewards/margins": 2.4772725184758504, "rewards/rejected": -2.0001657485961912, "step": 5791 }, { "epoch": 0.3069991784379721, "grad_norm": 41.25, "kl": 0.43084716796875, "learning_rate": 5e-07, "logits/chosen": -5374900.0, "logits/rejected": -14248140.8, "logps/chosen": -322.82318115234375, "logps/rejected": -246.814990234375, "loss": 0.2331, "rewards/chosen": 0.4487994909286499, "rewards/margins": 2.6863445043563843, "rewards/rejected": -2.2375450134277344, "step": 5792 }, { "epoch": 0.3070521824397742, "grad_norm": 59.0, "kl": 3.0523452758789062, "learning_rate": 5e-07, "logits/chosen": -36979600.0, "logits/rejected": -7143185.333333333, "logps/chosen": -693.69365234375, "logps/rejected": -123.519775390625, "loss": 0.3446, "rewards/chosen": 0.7829469680786133, "rewards/margins": 2.3785367647806805, "rewards/rejected": -1.5955897967020671, "step": 5793 }, { "epoch": 0.30710518644157636, "grad_norm": 45.5, "kl": 0.14163684844970703, "learning_rate": 5e-07, "logits/chosen": -43611092.0, "logits/rejected": 2543560.75, "logps/chosen": -219.3112030029297, "logps/rejected": -188.6886444091797, "loss": 0.3449, "rewards/chosen": 0.003405943512916565, "rewards/margins": 1.6300265938043594, "rewards/rejected": -1.6266206502914429, "step": 5794 }, { "epoch": 0.3071581904433785, "grad_norm": 51.75, "kl": 2.003373146057129, "learning_rate": 5e-07, "logits/chosen": -38036562.666666664, "logits/rejected": -17473624.0, "logps/chosen": -276.6904703776042, "logps/rejected": -229.3486785888672, "loss": 0.3716, "rewards/chosen": 0.46765267848968506, "rewards/margins": 2.43036949634552, "rewards/rejected": -1.962716817855835, "step": 5795 }, { "epoch": 0.30721119444518064, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16596678.0, "logits/rejected": -17162814.0, "logps/chosen": -212.22796630859375, "logps/rejected": -270.5633850097656, "loss": 0.2378, "rewards/chosen": 1.0684036016464233, "rewards/margins": 2.653396487236023, "rewards/rejected": -1.5849928855895996, "step": 5796 }, { "epoch": 0.3072641984469828, "grad_norm": 54.0, "kl": 0.9844045639038086, "learning_rate": 5e-07, "logits/chosen": 5069626.5, "logits/rejected": -21908808.0, "logps/chosen": -166.7926025390625, "logps/rejected": -408.0140075683594, "loss": 0.2806, "rewards/chosen": 0.2583956718444824, "rewards/margins": 3.397979259490967, "rewards/rejected": -3.1395835876464844, "step": 5797 }, { "epoch": 0.3073172024487849, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3076061.6, "logits/rejected": -33643464.0, "logps/chosen": -179.51536865234374, "logps/rejected": -345.2810465494792, "loss": 0.3069, "rewards/chosen": 0.25544984340667726, "rewards/margins": 3.4387714783350627, "rewards/rejected": -3.1833216349283853, "step": 5798 }, { "epoch": 0.30737020645058705, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1671272.3333333333, "logits/rejected": -9842947.2, "logps/chosen": -174.448974609375, "logps/rejected": -263.5340087890625, "loss": 0.3603, "rewards/chosen": -0.04570872584978739, "rewards/margins": 1.0712001105149587, "rewards/rejected": -1.1169088363647461, "step": 5799 }, { "epoch": 0.30742321045238913, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21939018.0, "logits/rejected": -53187736.0, "logps/chosen": -164.89178466796875, "logps/rejected": -551.256591796875, "loss": 0.2919, "rewards/chosen": 0.3906228840351105, "rewards/margins": 2.334882289171219, "rewards/rejected": -1.9442594051361084, "step": 5800 }, { "epoch": 0.30747621445419127, "grad_norm": 44.5, "kl": 0.23767757415771484, "learning_rate": 5e-07, "logits/chosen": -68835050.66666667, "logits/rejected": -28850508.8, "logps/chosen": -253.4898681640625, "logps/rejected": -204.4737548828125, "loss": 0.2604, "rewards/chosen": 0.27097855011622113, "rewards/margins": 2.2312210122744243, "rewards/rejected": -1.960242462158203, "step": 5801 }, { "epoch": 0.3075292184559934, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23321508.0, "logits/rejected": -108885768.0, "logps/chosen": -274.7793273925781, "logps/rejected": -259.40155029296875, "loss": 0.3167, "rewards/chosen": 0.4107760787010193, "rewards/margins": 1.8336006999015808, "rewards/rejected": -1.4228246212005615, "step": 5802 }, { "epoch": 0.30758222245779554, "grad_norm": 52.5, "kl": 0.429473876953125, "learning_rate": 5e-07, "logits/chosen": -33936056.0, "logits/rejected": -27926179.2, "logps/chosen": -489.6451009114583, "logps/rejected": -155.509375, "loss": 0.2962, "rewards/chosen": 0.07365316152572632, "rewards/margins": 2.1324300169944763, "rewards/rejected": -2.05877685546875, "step": 5803 }, { "epoch": 0.3076352264595977, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2022110.0, "logits/rejected": -32580100.0, "logps/chosen": -239.6346435546875, "logps/rejected": -264.6557922363281, "loss": 0.3464, "rewards/chosen": 0.04578913748264313, "rewards/margins": 1.6241771131753922, "rewards/rejected": -1.578387975692749, "step": 5804 }, { "epoch": 0.3076882304613998, "grad_norm": 43.25, "kl": 1.1472396850585938, "learning_rate": 5e-07, "logits/chosen": -23030018.666666668, "logits/rejected": -26119918.4, "logps/chosen": -212.89762369791666, "logps/rejected": -305.8607421875, "loss": 0.2947, "rewards/chosen": 0.7098593711853027, "rewards/margins": 2.196181392669678, "rewards/rejected": -1.486322021484375, "step": 5805 }, { "epoch": 0.30774123446320195, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41163605.333333336, "logits/rejected": -18603155.2, "logps/chosen": -672.3727213541666, "logps/rejected": -211.5493408203125, "loss": 0.3095, "rewards/chosen": 0.32249754667282104, "rewards/margins": 1.7547510027885438, "rewards/rejected": -1.4322534561157227, "step": 5806 }, { "epoch": 0.3077942384650041, "grad_norm": 48.75, "kl": 0.13258647918701172, "learning_rate": 5e-07, "logits/chosen": -34414016.0, "logits/rejected": 72836789.33333333, "logps/chosen": -221.3919677734375, "logps/rejected": -353.846435546875, "loss": 0.3725, "rewards/chosen": 0.10725438594818115, "rewards/margins": 1.6884692907333374, "rewards/rejected": -1.5812149047851562, "step": 5807 }, { "epoch": 0.3078472424668062, "grad_norm": 39.0, "kl": 0.7275276184082031, "learning_rate": 5e-07, "logits/chosen": -34508928.0, "logits/rejected": -25182654.0, "logps/chosen": -327.02398681640625, "logps/rejected": -286.469482421875, "loss": 0.2342, "rewards/chosen": 1.0299237966537476, "rewards/margins": 3.0830222368240356, "rewards/rejected": -2.053098440170288, "step": 5808 }, { "epoch": 0.30790024646860836, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52085048.0, "logits/rejected": -52211157.333333336, "logps/chosen": -307.364990234375, "logps/rejected": -483.7950439453125, "loss": 0.2366, "rewards/chosen": 0.5236770510673523, "rewards/margins": 2.418504575888316, "rewards/rejected": -1.8948275248209636, "step": 5809 }, { "epoch": 0.3079532504704105, "grad_norm": 59.75, "kl": 0.5468215942382812, "learning_rate": 5e-07, "logits/chosen": -47104101.333333336, "logits/rejected": -6769554.4, "logps/chosen": -537.4195963541666, "logps/rejected": -277.0831787109375, "loss": 0.2447, "rewards/chosen": 0.6732401847839355, "rewards/margins": 2.2081111907958983, "rewards/rejected": -1.5348710060119628, "step": 5810 }, { "epoch": 0.30800625447221264, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65658468.0, "logits/rejected": -33105202.0, "logps/chosen": -350.19976806640625, "logps/rejected": -365.33868408203125, "loss": 0.357, "rewards/chosen": -0.03132552653551102, "rewards/margins": 1.401230864226818, "rewards/rejected": -1.432556390762329, "step": 5811 }, { "epoch": 0.3080592584740148, "grad_norm": 47.75, "kl": 0.3689002990722656, "learning_rate": 5e-07, "logits/chosen": -72494282.66666667, "logits/rejected": -38133356.8, "logps/chosen": -426.4565836588542, "logps/rejected": -453.24228515625, "loss": 0.1995, "rewards/chosen": 0.7286173502604166, "rewards/margins": 2.976967684427897, "rewards/rejected": -2.2483503341674806, "step": 5812 }, { "epoch": 0.3081122624758169, "grad_norm": 60.25, "kl": 1.8067550659179688, "learning_rate": 5e-07, "logits/chosen": -31392003.2, "logits/rejected": -21236309.333333332, "logps/chosen": -553.628173828125, "logps/rejected": -286.6601969401042, "loss": 0.2591, "rewards/chosen": 1.2443280220031738, "rewards/margins": 2.5862089792887373, "rewards/rejected": -1.3418809572855632, "step": 5813 }, { "epoch": 0.30816526647761905, "grad_norm": 58.25, "kl": 0.57489013671875, "learning_rate": 5e-07, "logits/chosen": -54715552.0, "logits/rejected": -19289540.0, "logps/chosen": -351.0543518066406, "logps/rejected": -598.2234700520834, "loss": 0.2321, "rewards/chosen": 0.43934231996536255, "rewards/margins": 2.380633493264516, "rewards/rejected": -1.9412911732991536, "step": 5814 }, { "epoch": 0.3082182704794212, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22384289.6, "logits/rejected": -68898640.0, "logps/chosen": -250.758740234375, "logps/rejected": -552.34716796875, "loss": 0.4197, "rewards/chosen": -0.48335843086242675, "rewards/margins": 2.127468220392863, "rewards/rejected": -2.6108266512552896, "step": 5815 }, { "epoch": 0.3082712744812233, "grad_norm": 49.75, "kl": 0.34395599365234375, "learning_rate": 5e-07, "logits/chosen": -64263673.6, "logits/rejected": -12904172.0, "logps/chosen": -336.864501953125, "logps/rejected": -88.15699259440105, "loss": 0.4236, "rewards/chosen": 0.3011059284210205, "rewards/margins": 0.753971529006958, "rewards/rejected": -0.4528656005859375, "step": 5816 }, { "epoch": 0.30832427848302546, "grad_norm": 63.5, "kl": 0.8352584838867188, "learning_rate": 5e-07, "logits/chosen": -23717579.42857143, "logits/rejected": -13454819.0, "logps/chosen": -255.78651646205358, "logps/rejected": -593.5860595703125, "loss": 0.4029, "rewards/chosen": 0.29130121639796663, "rewards/margins": 4.5716113703591486, "rewards/rejected": -4.280310153961182, "step": 5817 }, { "epoch": 0.3083772824848276, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11444912.0, "logits/rejected": -15971694.0, "logps/chosen": -213.048583984375, "logps/rejected": -275.8020935058594, "loss": 0.3178, "rewards/chosen": 0.48821204900741577, "rewards/margins": 1.6337663531303406, "rewards/rejected": -1.1455543041229248, "step": 5818 }, { "epoch": 0.30843028648662973, "grad_norm": 69.5, "kl": 1.3014297485351562, "learning_rate": 5e-07, "logits/chosen": -31049964.8, "logits/rejected": -10153976.0, "logps/chosen": -498.866845703125, "logps/rejected": -95.8623758951823, "loss": 0.3609, "rewards/chosen": 0.49033942222595217, "rewards/margins": 1.6185076077779135, "rewards/rejected": -1.1281681855519612, "step": 5819 }, { "epoch": 0.30848329048843187, "grad_norm": 49.75, "kl": 0.6071548461914062, "learning_rate": 5e-07, "logits/chosen": -8823420.0, "logits/rejected": -35877360.0, "logps/chosen": -515.42822265625, "logps/rejected": -398.0811360677083, "loss": 0.1652, "rewards/chosen": 0.8995193243026733, "rewards/margins": 3.2418287992477417, "rewards/rejected": -2.3423094749450684, "step": 5820 }, { "epoch": 0.308536294490234, "grad_norm": 64.5, "kl": 1.9131107330322266, "learning_rate": 5e-07, "logits/chosen": -2770159.0, "logits/rejected": 730655.2, "logps/chosen": -848.92919921875, "logps/rejected": -304.23203125, "loss": 0.2377, "rewards/chosen": 1.0222910245259602, "rewards/margins": 2.6654096921284993, "rewards/rejected": -1.643118667602539, "step": 5821 }, { "epoch": 0.30858929849203615, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10682449.0, "logits/rejected": -24048660.0, "logps/chosen": -214.43536376953125, "logps/rejected": -210.99703979492188, "loss": 0.25, "rewards/chosen": 0.4592551589012146, "rewards/margins": 2.9860071539878845, "rewards/rejected": -2.52675199508667, "step": 5822 }, { "epoch": 0.3086423024938383, "grad_norm": 40.5, "kl": 0.9561052322387695, "learning_rate": 5e-07, "logits/chosen": -18587700.8, "logits/rejected": -23798762.666666668, "logps/chosen": -162.3994384765625, "logps/rejected": -284.8138834635417, "loss": 0.3141, "rewards/chosen": 0.4933137893676758, "rewards/margins": 2.9412593841552734, "rewards/rejected": -2.4479455947875977, "step": 5823 }, { "epoch": 0.3086953064956404, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32925948.0, "logits/rejected": -10528608.0, "logps/chosen": -321.4625549316406, "logps/rejected": -271.1868896484375, "loss": 0.256, "rewards/chosen": 0.4890262484550476, "rewards/margins": 3.1212002635002136, "rewards/rejected": -2.632174015045166, "step": 5824 }, { "epoch": 0.30874831049744256, "grad_norm": 89.5, "kl": 2.0801162719726562, "learning_rate": 5e-07, "logits/chosen": -16128077.333333334, "logits/rejected": -38419712.0, "logps/chosen": -827.2845052083334, "logps/rejected": -297.2148681640625, "loss": 0.2529, "rewards/chosen": 1.2302734851837158, "rewards/margins": 2.789251947402954, "rewards/rejected": -1.5589784622192382, "step": 5825 }, { "epoch": 0.3088013144992447, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73000928.0, "logits/rejected": -12084997.714285715, "logps/chosen": -377.1795349121094, "logps/rejected": -183.22002301897322, "loss": 0.3796, "rewards/chosen": -0.3384857177734375, "rewards/margins": 0.40204695292881554, "rewards/rejected": -0.740532670702253, "step": 5826 }, { "epoch": 0.30885431850104683, "grad_norm": 62.5, "kl": 0.6837005615234375, "learning_rate": 5e-07, "logits/chosen": -43452368.0, "logits/rejected": -17955276.0, "logps/chosen": -403.96527099609375, "logps/rejected": -270.3363952636719, "loss": 0.3268, "rewards/chosen": 0.5372230410575867, "rewards/margins": 1.7529072165489197, "rewards/rejected": -1.215684175491333, "step": 5827 }, { "epoch": 0.30890732250284897, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66792576.0, "logits/rejected": -31613274.666666668, "logps/chosen": -479.2020263671875, "logps/rejected": -220.79024251302084, "loss": 0.265, "rewards/chosen": -0.1936432123184204, "rewards/margins": 1.6235433022181194, "rewards/rejected": -1.8171865145365398, "step": 5828 }, { "epoch": 0.3089603265046511, "grad_norm": 26.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18756166.0, "logits/rejected": -18913608.0, "logps/chosen": -64.32717895507812, "logps/rejected": -204.42445373535156, "loss": 0.3892, "rewards/chosen": -0.20283807814121246, "rewards/margins": 1.3941621631383896, "rewards/rejected": -1.597000241279602, "step": 5829 }, { "epoch": 0.30901333050645324, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48056469.333333336, "logits/rejected": -30625334.4, "logps/chosen": -359.734130859375, "logps/rejected": -274.224072265625, "loss": 0.2671, "rewards/chosen": 0.14883575836817423, "rewards/margins": 2.316333011786143, "rewards/rejected": -2.167497253417969, "step": 5830 }, { "epoch": 0.3090663345082554, "grad_norm": 39.75, "kl": 0.7104568481445312, "learning_rate": 5e-07, "logits/chosen": 17753858.666666668, "logits/rejected": -7550314.4, "logps/chosen": -708.5146484375, "logps/rejected": -180.159326171875, "loss": 0.1859, "rewards/chosen": 1.6448322931925456, "rewards/margins": 4.1890410105387375, "rewards/rejected": -2.5442087173461916, "step": 5831 }, { "epoch": 0.3091193385100575, "grad_norm": 50.75, "kl": 1.2362585067749023, "learning_rate": 5e-07, "logits/chosen": -36761120.0, "logits/rejected": 691219.25, "logps/chosen": -268.35630289713544, "logps/rejected": -73.5218505859375, "loss": 0.4672, "rewards/chosen": -0.22670404116312662, "rewards/margins": 2.4973259766896567, "rewards/rejected": -2.724030017852783, "step": 5832 }, { "epoch": 0.30917234251185965, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35615724.0, "logits/rejected": -28824248.0, "logps/chosen": -119.32971954345703, "logps/rejected": -378.96551513671875, "loss": 0.3013, "rewards/chosen": 0.27244314551353455, "rewards/margins": 1.9161510169506073, "rewards/rejected": -1.6437078714370728, "step": 5833 }, { "epoch": 0.3092253465136618, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14361974.0, "logits/rejected": -4597098.0, "logps/chosen": -267.721435546875, "logps/rejected": -243.4791259765625, "loss": 0.2806, "rewards/chosen": 0.6665060520172119, "rewards/margins": 2.2388153076171875, "rewards/rejected": -1.5723092555999756, "step": 5834 }, { "epoch": 0.30927835051546393, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45463112.0, "logits/rejected": -22429248.0, "logps/chosen": -294.74859619140625, "logps/rejected": -337.013427734375, "loss": 0.3368, "rewards/chosen": 0.06236690282821655, "rewards/margins": 1.570917546749115, "rewards/rejected": -1.5085506439208984, "step": 5835 }, { "epoch": 0.30933135451726607, "grad_norm": 59.25, "kl": 1.6539039611816406, "learning_rate": 5e-07, "logits/chosen": -18703993.6, "logits/rejected": -43407608.0, "logps/chosen": -290.3930908203125, "logps/rejected": -311.2454833984375, "loss": 0.3871, "rewards/chosen": 0.3257354974746704, "rewards/margins": 1.588893739382426, "rewards/rejected": -1.2631582419077556, "step": 5836 }, { "epoch": 0.3093843585190682, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63613312.0, "logits/rejected": -6913701.333333333, "logps/chosen": -353.0515625, "logps/rejected": -182.7900390625, "loss": 0.3671, "rewards/chosen": 0.12117515802383423, "rewards/margins": 1.6844121893246968, "rewards/rejected": -1.5632370313008626, "step": 5837 }, { "epoch": 0.30943736252087034, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44280732.8, "logits/rejected": -28490042.666666668, "logps/chosen": -401.18466796875, "logps/rejected": -639.1966145833334, "loss": 0.2327, "rewards/chosen": 0.82584228515625, "rewards/margins": 3.5103922526041664, "rewards/rejected": -2.6845499674479165, "step": 5838 }, { "epoch": 0.3094903665226725, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29801588.0, "logits/rejected": -13753827.0, "logps/chosen": -406.3355712890625, "logps/rejected": -219.59408569335938, "loss": 0.3596, "rewards/chosen": 0.28188323974609375, "rewards/margins": 1.327237844467163, "rewards/rejected": -1.0453546047210693, "step": 5839 }, { "epoch": 0.3095433705244746, "grad_norm": 45.75, "kl": 0.3540802001953125, "learning_rate": 5e-07, "logits/chosen": -62621260.8, "logits/rejected": -51314784.0, "logps/chosen": -348.606884765625, "logps/rejected": -522.78662109375, "loss": 0.2131, "rewards/chosen": 1.1640522956848145, "rewards/margins": 3.5379994392395018, "rewards/rejected": -2.3739471435546875, "step": 5840 }, { "epoch": 0.30959637452627675, "grad_norm": 51.0, "kl": 1.3390369415283203, "learning_rate": 5e-07, "logits/chosen": 1926376.625, "logits/rejected": -70597312.0, "logps/chosen": -421.0212097167969, "logps/rejected": -395.8121337890625, "loss": 0.2305, "rewards/chosen": 0.7226561307907104, "rewards/margins": 3.5336962938308716, "rewards/rejected": -2.811040163040161, "step": 5841 }, { "epoch": 0.3096493785280789, "grad_norm": 51.0, "kl": 0.5281600952148438, "learning_rate": 5e-07, "logits/chosen": -18678160.0, "logits/rejected": -46085141.333333336, "logps/chosen": -226.1240234375, "logps/rejected": -421.34130859375, "loss": 0.1549, "rewards/chosen": 0.8883202075958252, "rewards/margins": 3.4106533527374268, "rewards/rejected": -2.5223331451416016, "step": 5842 }, { "epoch": 0.309702382529881, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17205866.0, "logits/rejected": -23545814.85714286, "logps/chosen": -432.957763671875, "logps/rejected": -296.79541015625, "loss": 0.1986, "rewards/chosen": 0.44061279296875, "rewards/margins": 2.41051823752267, "rewards/rejected": -1.9699054445539201, "step": 5843 }, { "epoch": 0.30975538653168316, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20485570.666666668, "logits/rejected": -19196844.8, "logps/chosen": -284.3274739583333, "logps/rejected": -369.518701171875, "loss": 0.2606, "rewards/chosen": 0.6452441215515137, "rewards/margins": 2.184518909454346, "rewards/rejected": -1.5392747879028321, "step": 5844 }, { "epoch": 0.3098083905334853, "grad_norm": 61.25, "kl": 1.4113197326660156, "learning_rate": 5e-07, "logits/chosen": -8811839.42857143, "logits/rejected": -10087880.0, "logps/chosen": -424.56685965401783, "logps/rejected": -121.49632263183594, "loss": 0.364, "rewards/chosen": 0.6636813027518136, "rewards/margins": 1.9362231833594186, "rewards/rejected": -1.272541880607605, "step": 5845 }, { "epoch": 0.30986139453528744, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22547500.0, "logits/rejected": -10844330.0, "logps/chosen": -506.10443115234375, "logps/rejected": -282.2657165527344, "loss": 0.4082, "rewards/chosen": 0.03999888896942139, "rewards/margins": 0.8792606592178345, "rewards/rejected": -0.8392617702484131, "step": 5846 }, { "epoch": 0.3099143985370896, "grad_norm": 73.0, "kl": 0.19268035888671875, "learning_rate": 5e-07, "logits/chosen": -5836292.0, "logits/rejected": -101602192.0, "logps/chosen": -320.7065022786458, "logps/rejected": -610.0762939453125, "loss": 0.4048, "rewards/chosen": 0.00596269965171814, "rewards/margins": 2.3160947263240814, "rewards/rejected": -2.3101320266723633, "step": 5847 }, { "epoch": 0.3099674025388917, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24294278.4, "logits/rejected": -19772436.0, "logps/chosen": -527.40654296875, "logps/rejected": -395.1524251302083, "loss": 0.2875, "rewards/chosen": 0.5381558418273926, "rewards/margins": 2.6514663378397625, "rewards/rejected": -2.1133104960123696, "step": 5848 }, { "epoch": 0.31002040654069385, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47472024.0, "logits/rejected": -26836004.0, "logps/chosen": -548.635986328125, "logps/rejected": -163.56324768066406, "loss": 0.3212, "rewards/chosen": 0.4749611020088196, "rewards/margins": 1.8049198985099792, "rewards/rejected": -1.3299587965011597, "step": 5849 }, { "epoch": 0.31007341054249593, "grad_norm": 47.75, "kl": 0.4130744934082031, "learning_rate": 5e-07, "logits/chosen": -13158409.0, "logits/rejected": -12811357.333333334, "logps/chosen": -296.22137451171875, "logps/rejected": -234.85648600260416, "loss": 0.2427, "rewards/chosen": 0.6952667236328125, "rewards/margins": 2.312219937642415, "rewards/rejected": -1.6169532140096028, "step": 5850 }, { "epoch": 0.31012641454429807, "grad_norm": 55.5, "kl": 0.3741769790649414, "learning_rate": 5e-07, "logits/chosen": -7866757.333333333, "logits/rejected": -36614412.0, "logps/chosen": -347.2256673177083, "logps/rejected": -373.1298522949219, "loss": 0.3402, "rewards/chosen": 0.47240432103474933, "rewards/margins": 2.4315198262532554, "rewards/rejected": -1.9591155052185059, "step": 5851 }, { "epoch": 0.3101794185461002, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29454821.333333332, "logits/rejected": -35655904.0, "logps/chosen": -188.4986572265625, "logps/rejected": -556.759765625, "loss": 0.4179, "rewards/chosen": -0.18705028295516968, "rewards/margins": 3.4423611760139465, "rewards/rejected": -3.629411458969116, "step": 5852 }, { "epoch": 0.31023242254790234, "grad_norm": 64.5, "kl": 2.3615856170654297, "learning_rate": 5e-07, "logits/chosen": -41754432.0, "logps/chosen": -316.68890380859375, "loss": 0.5333, "rewards/chosen": 0.09813576936721802, "step": 5853 }, { "epoch": 0.3102854265497045, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14019012.0, "logits/rejected": -19375796.0, "logps/chosen": -94.4238993326823, "logps/rejected": -269.6690979003906, "loss": 0.3249, "rewards/chosen": 0.5768105189005533, "rewards/margins": 2.2828988234202066, "rewards/rejected": -1.7060883045196533, "step": 5854 }, { "epoch": 0.3103384305515066, "grad_norm": 59.25, "kl": 0.1992645263671875, "learning_rate": 5e-07, "logits/chosen": -19354316.8, "logits/rejected": -48987893.333333336, "logps/chosen": -386.259326171875, "logps/rejected": -374.6165771484375, "loss": 0.3449, "rewards/chosen": 0.27432005405426024, "rewards/margins": 1.8635114908218384, "rewards/rejected": -1.5891914367675781, "step": 5855 }, { "epoch": 0.31039143455330875, "grad_norm": 44.5, "kl": 0.47069358825683594, "learning_rate": 5e-07, "logits/chosen": -14181497.333333334, "logits/rejected": -6553171.5, "logps/chosen": -180.382080078125, "logps/rejected": -116.4384765625, "loss": 0.3263, "rewards/chosen": 0.627060612042745, "rewards/margins": 2.2244437535603843, "rewards/rejected": -1.5973831415176392, "step": 5856 }, { "epoch": 0.3104444385551109, "grad_norm": 50.5, "kl": 0.8834075927734375, "learning_rate": 5e-07, "logits/chosen": -42575612.0, "logits/rejected": -39689552.0, "logps/chosen": -203.58775329589844, "logps/rejected": -382.4454650878906, "loss": 0.3404, "rewards/chosen": -0.18490485846996307, "rewards/margins": 2.1327192336320877, "rewards/rejected": -2.317624092102051, "step": 5857 }, { "epoch": 0.31049744255691303, "grad_norm": 52.25, "kl": 0.038116455078125, "learning_rate": 5e-07, "logits/chosen": -42523026.666666664, "logits/rejected": -27576420.0, "logps/chosen": -232.33308919270834, "logps/rejected": -595.3244018554688, "loss": 0.3499, "rewards/chosen": 0.23004438479741415, "rewards/margins": 3.003495713075002, "rewards/rejected": -2.773451328277588, "step": 5858 }, { "epoch": 0.31055044655871517, "grad_norm": 48.75, "kl": 0.2754964828491211, "learning_rate": 5e-07, "logits/chosen": -25012462.4, "logits/rejected": -41542301.333333336, "logps/chosen": -258.982666015625, "logps/rejected": -264.3083902994792, "loss": 0.352, "rewards/chosen": 0.24667189121246338, "rewards/margins": 2.1548511743545533, "rewards/rejected": -1.9081792831420898, "step": 5859 }, { "epoch": 0.3106034505605173, "grad_norm": 52.5, "kl": 0.2193136215209961, "learning_rate": 5e-07, "logits/chosen": -50703104.0, "logits/rejected": -4637386.4, "logps/chosen": -384.6617024739583, "logps/rejected": -228.1293701171875, "loss": 0.2834, "rewards/chosen": 0.8290602366129557, "rewards/margins": 2.1022320429484047, "rewards/rejected": -1.2731718063354491, "step": 5860 }, { "epoch": 0.31065645456231944, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18896988.0, "logits/rejected": -60314420.0, "logps/chosen": -170.8185272216797, "logps/rejected": -163.3312530517578, "loss": 0.2964, "rewards/chosen": 0.6663669347763062, "rewards/margins": 2.2616833448410034, "rewards/rejected": -1.5953164100646973, "step": 5861 }, { "epoch": 0.3107094585641216, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1749481.5, "logits/rejected": -18444466.666666668, "logps/chosen": -408.07379150390625, "logps/rejected": -297.3038330078125, "loss": 0.2916, "rewards/chosen": -0.05287017673254013, "rewards/margins": 1.515194830795129, "rewards/rejected": -1.5680650075276692, "step": 5862 }, { "epoch": 0.3107624625659237, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31904565.333333332, "logits/rejected": -19193545.6, "logps/chosen": -178.2723185221354, "logps/rejected": -296.4049560546875, "loss": 0.2087, "rewards/chosen": 0.451513409614563, "rewards/margins": 3.0935818910598756, "rewards/rejected": -2.6420684814453126, "step": 5863 }, { "epoch": 0.31081546656772585, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42728944.0, "logits/rejected": -18718852.0, "logps/chosen": -253.03787231445312, "logps/rejected": -266.94818115234375, "loss": 0.3582, "rewards/chosen": -0.005015749484300613, "rewards/margins": 1.9022716097533703, "rewards/rejected": -1.907287359237671, "step": 5864 }, { "epoch": 0.310868470569528, "grad_norm": 50.0, "kl": 2.3858509063720703, "learning_rate": 5e-07, "logits/chosen": -5984835.2, "logits/rejected": -13187958.666666666, "logps/chosen": -232.244189453125, "logps/rejected": -501.34326171875, "loss": 0.2892, "rewards/chosen": 0.8815466880798339, "rewards/margins": 2.8544493039449055, "rewards/rejected": -1.9729026158650715, "step": 5865 }, { "epoch": 0.3109214745713301, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47185168.0, "logits/rejected": -29245766.0, "logps/chosen": -241.0687255859375, "logps/rejected": -271.56884765625, "loss": 0.2667, "rewards/chosen": 0.4085042476654053, "rewards/margins": 2.7812304496765137, "rewards/rejected": -2.3727262020111084, "step": 5866 }, { "epoch": 0.31097447857313226, "grad_norm": 56.75, "kl": 3.069843292236328, "learning_rate": 5e-07, "logits/chosen": -44300832.0, "logits/rejected": -1871510.75, "logps/chosen": -335.6505940755208, "logps/rejected": -192.94512939453125, "loss": 0.2825, "rewards/chosen": 1.3757322629292805, "rewards/margins": 2.2527906497319536, "rewards/rejected": -0.8770583868026733, "step": 5867 }, { "epoch": 0.3110274825749344, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80939840.0, "logits/rejected": -43442237.333333336, "logps/chosen": -255.11187744140625, "logps/rejected": -395.5522867838542, "loss": 0.1835, "rewards/chosen": 0.8776487112045288, "rewards/margins": 3.008243441581726, "rewards/rejected": -2.1305947303771973, "step": 5868 }, { "epoch": 0.31108048657673654, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53908444.0, "logits/rejected": -54746140.0, "logps/chosen": -328.12237548828125, "logps/rejected": -501.47509765625, "loss": 0.2901, "rewards/chosen": 0.176615372300148, "rewards/margins": 2.8377941995859146, "rewards/rejected": -2.6611788272857666, "step": 5869 }, { "epoch": 0.3111334905785387, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25483716.0, "logits/rejected": -20078304.0, "logps/chosen": -98.09243774414062, "logps/rejected": -267.6011962890625, "loss": 0.2071, "rewards/chosen": 0.06025199592113495, "rewards/margins": 2.301194950938225, "rewards/rejected": -2.24094295501709, "step": 5870 }, { "epoch": 0.3111864945803408, "grad_norm": 59.0, "kl": 0.9851760864257812, "learning_rate": 5e-07, "logits/chosen": 12142860.0, "logits/rejected": -5787458.4, "logps/chosen": -576.2667236328125, "logps/rejected": -83.416650390625, "loss": 0.3504, "rewards/chosen": 0.7300601005554199, "rewards/margins": 1.5308321952819823, "rewards/rejected": -0.8007720947265625, "step": 5871 }, { "epoch": 0.31123949858214295, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39680608.0, "logits/rejected": -20042806.4, "logps/chosen": -207.5880126953125, "logps/rejected": -458.480712890625, "loss": 0.2482, "rewards/chosen": 0.03701259692509969, "rewards/margins": 2.5497441490491233, "rewards/rejected": -2.5127315521240234, "step": 5872 }, { "epoch": 0.3112925025839451, "grad_norm": 39.5, "kl": 0.25174522399902344, "learning_rate": 5e-07, "logits/chosen": -12879629.0, "logits/rejected": -46077592.0, "logps/chosen": -122.06886291503906, "logps/rejected": -319.3349914550781, "loss": 0.2594, "rewards/chosen": 0.1985914707183838, "rewards/margins": 3.3463706970214844, "rewards/rejected": -3.1477792263031006, "step": 5873 }, { "epoch": 0.3113455065857472, "grad_norm": 48.5, "kl": 0.6710615158081055, "learning_rate": 5e-07, "logits/chosen": -57702762.666666664, "logits/rejected": -6061169.6, "logps/chosen": -285.36083984375, "logps/rejected": -348.2960693359375, "loss": 0.2569, "rewards/chosen": 0.8491172790527344, "rewards/margins": 2.4695365905761717, "rewards/rejected": -1.6204193115234375, "step": 5874 }, { "epoch": 0.31139851058754936, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49041576.0, "logits/rejected": -2923740.5, "logps/chosen": -359.5977783203125, "logps/rejected": -199.22763061523438, "loss": 0.2929, "rewards/chosen": 0.5302932858467102, "rewards/margins": 2.1027031540870667, "rewards/rejected": -1.5724098682403564, "step": 5875 }, { "epoch": 0.3114515145893515, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40478173.333333336, "logits/rejected": -21199683.2, "logps/chosen": -498.0684407552083, "logps/rejected": -247.9072998046875, "loss": 0.2861, "rewards/chosen": -0.14849750200907388, "rewards/margins": 1.7607098420461018, "rewards/rejected": -1.9092073440551758, "step": 5876 }, { "epoch": 0.31150451859115363, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1641971.5, "logits/rejected": -21595500.0, "logps/chosen": -110.80364990234375, "logps/rejected": -258.44439697265625, "loss": 0.2942, "rewards/chosen": -0.05372662842273712, "rewards/margins": 2.5084279030561447, "rewards/rejected": -2.562154531478882, "step": 5877 }, { "epoch": 0.31155752259295577, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12996476.0, "logits/rejected": -26066629.333333332, "logps/chosen": -201.9501953125, "logps/rejected": -403.1263020833333, "loss": 0.2861, "rewards/chosen": 0.5877576351165772, "rewards/margins": 2.680162954330444, "rewards/rejected": -2.092405319213867, "step": 5878 }, { "epoch": 0.3116105265947579, "grad_norm": 61.75, "kl": 5.36845588684082, "learning_rate": 5e-07, "logits/chosen": -54544037.333333336, "logits/rejected": -46540112.0, "logps/chosen": -439.3444417317708, "logps/rejected": -223.25274658203125, "loss": 0.3337, "rewards/chosen": 1.1605923970540364, "rewards/margins": 3.5503562291463213, "rewards/rejected": -2.389763832092285, "step": 5879 }, { "epoch": 0.31166353059656005, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31329672.0, "logits/rejected": -43874728.0, "logps/chosen": -422.43963623046875, "logps/rejected": -457.0450744628906, "loss": 0.2581, "rewards/chosen": 0.44836246967315674, "rewards/margins": 2.866989016532898, "rewards/rejected": -2.418626546859741, "step": 5880 }, { "epoch": 0.3117165345983622, "grad_norm": 37.25, "kl": 0.32268714904785156, "learning_rate": 5e-07, "logits/chosen": 6129734.0, "logits/rejected": -4583790.5, "logps/chosen": -193.37742614746094, "logps/rejected": -178.02003479003906, "loss": 0.3056, "rewards/chosen": 0.43221044540405273, "rewards/margins": 2.074375629425049, "rewards/rejected": -1.642165184020996, "step": 5881 }, { "epoch": 0.3117695386001643, "grad_norm": 51.75, "kl": 0.20640087127685547, "learning_rate": 5e-07, "logits/chosen": -65310748.0, "logits/rejected": 12614873.0, "logps/chosen": -303.7159423828125, "logps/rejected": -92.65071868896484, "loss": 0.3599, "rewards/chosen": 0.055742621421813965, "rewards/margins": 1.4707998037338257, "rewards/rejected": -1.4150571823120117, "step": 5882 }, { "epoch": 0.31182254260196646, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13735112.0, "logits/rejected": -27591061.333333332, "logps/chosen": -234.2402801513672, "logps/rejected": -245.76875813802084, "loss": 0.2833, "rewards/chosen": -0.15489844977855682, "rewards/margins": 2.1749733636776605, "rewards/rejected": -2.3298718134562173, "step": 5883 }, { "epoch": 0.3118755466037686, "grad_norm": 47.75, "kl": 0.6092929840087891, "learning_rate": 5e-07, "logits/chosen": -42056484.0, "logits/rejected": -8700987.0, "logps/chosen": -294.6739807128906, "logps/rejected": -143.30715942382812, "loss": 0.3013, "rewards/chosen": 0.5378620624542236, "rewards/margins": 2.122244358062744, "rewards/rejected": -1.5843822956085205, "step": 5884 }, { "epoch": 0.31192855060557073, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1182683.3333333333, "logits/rejected": -3724000.0, "logps/chosen": -471.0030924479167, "logps/rejected": -382.953271484375, "loss": 0.2474, "rewards/chosen": 0.5040690104166666, "rewards/margins": 3.026747385660807, "rewards/rejected": -2.5226783752441406, "step": 5885 }, { "epoch": 0.31198155460737287, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6443184.0, "logits/rejected": -84233864.0, "logps/chosen": -112.99677276611328, "logps/rejected": -314.6767272949219, "loss": 0.3062, "rewards/chosen": 0.06750154495239258, "rewards/margins": 2.2125890254974365, "rewards/rejected": -2.145087480545044, "step": 5886 }, { "epoch": 0.312034558609175, "grad_norm": 48.0, "kl": 0.4339256286621094, "learning_rate": 5e-07, "logits/chosen": -27209365.333333332, "logits/rejected": -39646422.4, "logps/chosen": -249.05257161458334, "logps/rejected": -421.575390625, "loss": 0.2582, "rewards/chosen": 0.047221253315607704, "rewards/margins": 2.734414551655451, "rewards/rejected": -2.6871932983398437, "step": 5887 }, { "epoch": 0.31208756261097714, "grad_norm": 40.25, "kl": 0.012918472290039062, "learning_rate": 5e-07, "logits/chosen": -1776617.0, "logits/rejected": -28245394.666666668, "logps/chosen": -221.52406311035156, "logps/rejected": -339.1785481770833, "loss": 0.2456, "rewards/chosen": 0.4311489164829254, "rewards/margins": 2.4248311420281725, "rewards/rejected": -1.9936822255452473, "step": 5888 }, { "epoch": 0.3121405666127793, "grad_norm": 35.25, "kl": 0.5954971313476562, "learning_rate": 5e-07, "logits/chosen": -50291029.333333336, "logits/rejected": -28646374.4, "logps/chosen": -244.85331217447916, "logps/rejected": -376.1843017578125, "loss": 0.2407, "rewards/chosen": 0.10849140087763469, "rewards/margins": 2.6382643500963847, "rewards/rejected": -2.52977294921875, "step": 5889 }, { "epoch": 0.3121935706145814, "grad_norm": 50.5, "kl": 2.123074531555176, "learning_rate": 5e-07, "logits/chosen": -29453216.0, "logits/rejected": -12871046.0, "logps/chosen": -526.417724609375, "logps/rejected": -188.51771545410156, "loss": 0.2657, "rewards/chosen": 0.9109818935394287, "rewards/margins": 3.2423574924468994, "rewards/rejected": -2.3313755989074707, "step": 5890 }, { "epoch": 0.31224657461638355, "grad_norm": 39.0, "kl": 1.0285377502441406, "learning_rate": 5e-07, "logits/chosen": -5472434.8, "logits/rejected": -9666474.666666666, "logps/chosen": -196.8621337890625, "logps/rejected": -211.9715372721354, "loss": 0.325, "rewards/chosen": 0.3321633577346802, "rewards/margins": 2.822316034634908, "rewards/rejected": -2.490152676900228, "step": 5891 }, { "epoch": 0.3122995786181857, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84667642.66666667, "logits/rejected": -27793337.6, "logps/chosen": -409.6604410807292, "logps/rejected": -229.354833984375, "loss": 0.2664, "rewards/chosen": 0.49935205777486164, "rewards/margins": 2.13004625638326, "rewards/rejected": -1.6306941986083985, "step": 5892 }, { "epoch": 0.31235258261998783, "grad_norm": 53.25, "kl": 1.0284271240234375, "learning_rate": 5e-07, "logits/chosen": -13003012.8, "logits/rejected": -16733408.0, "logps/chosen": -171.607958984375, "logps/rejected": -354.6496988932292, "loss": 0.4023, "rewards/chosen": 0.29348297119140626, "rewards/margins": 1.3156469186147053, "rewards/rejected": -1.022163947423299, "step": 5893 }, { "epoch": 0.31240558662178997, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43186473.6, "logits/rejected": -10374759.333333334, "logps/chosen": -312.4958984375, "logps/rejected": -163.7096150716146, "loss": 0.3818, "rewards/chosen": 0.11384140253067017, "rewards/margins": 1.5137863437334698, "rewards/rejected": -1.3999449412027996, "step": 5894 }, { "epoch": 0.3124585906235921, "grad_norm": 50.75, "kl": 0.7606353759765625, "learning_rate": 5e-07, "logits/chosen": -47287456.0, "logits/rejected": -47328372.0, "logps/chosen": -435.7182922363281, "logps/rejected": -459.0182189941406, "loss": 0.2514, "rewards/chosen": 0.5374832153320312, "rewards/margins": 3.64267635345459, "rewards/rejected": -3.1051931381225586, "step": 5895 }, { "epoch": 0.31251159462539424, "grad_norm": 65.0, "kl": 0.03629493713378906, "learning_rate": 5e-07, "logits/chosen": -5834949.0, "logits/rejected": -23679634.0, "logps/chosen": -439.2301025390625, "logps/rejected": -278.9240417480469, "loss": 0.2737, "rewards/chosen": 0.6425254940986633, "rewards/margins": 2.393633544445038, "rewards/rejected": -1.7511080503463745, "step": 5896 }, { "epoch": 0.3125645986271964, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62072488.0, "logits/rejected": -27338528.0, "logps/chosen": -181.11343383789062, "logps/rejected": -304.6345621744792, "loss": 0.2238, "rewards/chosen": 0.04876232147216797, "rewards/margins": 2.652271588643392, "rewards/rejected": -2.603509267171224, "step": 5897 }, { "epoch": 0.3126176026289985, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22190284.0, "logits/rejected": -27597464.0, "logps/chosen": -103.9603042602539, "logps/rejected": -287.2453206380208, "loss": 0.2169, "rewards/chosen": -0.14788399636745453, "rewards/margins": 2.0157156536976495, "rewards/rejected": -2.163599650065104, "step": 5898 }, { "epoch": 0.31267060663080065, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14496252.8, "logits/rejected": -33028992.0, "logps/chosen": -265.4492431640625, "logps/rejected": -380.3833821614583, "loss": 0.3608, "rewards/chosen": -0.16227446794509887, "rewards/margins": 2.471847426891327, "rewards/rejected": -2.634121894836426, "step": 5899 }, { "epoch": 0.3127236106326028, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52046085.333333336, "logits/rejected": -6684648.8, "logps/chosen": -139.1759236653646, "logps/rejected": -400.071630859375, "loss": 0.2797, "rewards/chosen": -0.07995408276716869, "rewards/margins": 2.314644687374433, "rewards/rejected": -2.3945987701416014, "step": 5900 }, { "epoch": 0.31277661463440487, "grad_norm": 47.0, "kl": 0.16005325317382812, "learning_rate": 5e-07, "logits/chosen": 6035304.0, "logits/rejected": -489213.875, "logps/chosen": -176.07034737723214, "logps/rejected": -69.137451171875, "loss": 0.409, "rewards/chosen": 0.23409276349203928, "rewards/margins": 2.0428101335253035, "rewards/rejected": -1.8087173700332642, "step": 5901 }, { "epoch": 0.312829618636207, "grad_norm": 45.5, "kl": 0.35848140716552734, "learning_rate": 5e-07, "logits/chosen": -94478880.0, "logits/rejected": -723974.0, "logps/chosen": -209.709375, "logps/rejected": -172.01220703125, "loss": 0.3828, "rewards/chosen": 0.31099467277526854, "rewards/margins": 1.2459806283315022, "rewards/rejected": -0.9349859555562338, "step": 5902 }, { "epoch": 0.31288262263800914, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42235504.0, "logits/rejected": -54095128.0, "logps/chosen": -337.40625, "logps/rejected": -371.5435791015625, "loss": 0.3349, "rewards/chosen": -0.04819469153881073, "rewards/margins": 1.9492426663637161, "rewards/rejected": -1.9974373579025269, "step": 5903 }, { "epoch": 0.3129356266398113, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28623246.0, "logits/rejected": -49520392.0, "logps/chosen": -194.67514038085938, "logps/rejected": -434.3638610839844, "loss": 0.3216, "rewards/chosen": -0.18239516019821167, "rewards/margins": 2.43904048204422, "rewards/rejected": -2.6214356422424316, "step": 5904 }, { "epoch": 0.3129886306416134, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31106013.333333332, "logits/rejected": -18431862.4, "logps/chosen": -79.42622884114583, "logps/rejected": -391.6583984375, "loss": 0.2888, "rewards/chosen": 0.025569026668866474, "rewards/margins": 2.0893868138392766, "rewards/rejected": -2.06381778717041, "step": 5905 }, { "epoch": 0.31304163464341556, "grad_norm": 44.0, "kl": 0.5600357055664062, "learning_rate": 5e-07, "logits/chosen": -9357120.0, "logits/rejected": -87147296.0, "logps/chosen": -229.39395141601562, "logps/rejected": -421.71026611328125, "loss": 0.3631, "rewards/chosen": 0.17491602897644043, "rewards/margins": 2.074699640274048, "rewards/rejected": -1.8997836112976074, "step": 5906 }, { "epoch": 0.3130946386452177, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86058661.33333333, "logits/rejected": 9116384.0, "logps/chosen": -199.36214192708334, "logps/rejected": -192.3111572265625, "loss": 0.3491, "rewards/chosen": -0.24133225282033285, "rewards/margins": 1.255655566851298, "rewards/rejected": -1.4969878196716309, "step": 5907 }, { "epoch": 0.31314764264701983, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2808456.6666666665, "logits/rejected": -3161278.4, "logps/chosen": -415.1334228515625, "logps/rejected": -233.0587890625, "loss": 0.3011, "rewards/chosen": 0.28801163037618, "rewards/margins": 1.6505801995595295, "rewards/rejected": -1.3625685691833496, "step": 5908 }, { "epoch": 0.31320064664882197, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3078494.25, "logits/rejected": -46690696.0, "logps/chosen": -308.8175048828125, "logps/rejected": -458.2700500488281, "loss": 0.3238, "rewards/chosen": -0.048037175089120865, "rewards/margins": 2.0804163180291653, "rewards/rejected": -2.128453493118286, "step": 5909 }, { "epoch": 0.3132536506506241, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17007398.666666668, "logits/rejected": -26809104.0, "logps/chosen": -276.24806722005206, "logps/rejected": -499.53896484375, "loss": 0.267, "rewards/chosen": 0.10727691650390625, "rewards/margins": 2.5250228881835937, "rewards/rejected": -2.4177459716796874, "step": 5910 }, { "epoch": 0.31330665465242624, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29706188.0, "logits/rejected": -35185608.0, "logps/chosen": -705.364013671875, "logps/rejected": -420.2978820800781, "loss": 0.2441, "rewards/chosen": 0.9520132541656494, "rewards/margins": 3.2314205169677734, "rewards/rejected": -2.279407262802124, "step": 5911 }, { "epoch": 0.3133596586542284, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65509228.0, "logits/rejected": 850910.75, "logps/chosen": -359.3812255859375, "logps/rejected": -274.7032470703125, "loss": 0.245, "rewards/chosen": 0.3404991030693054, "rewards/margins": 3.244111716747284, "rewards/rejected": -2.9036126136779785, "step": 5912 }, { "epoch": 0.3134126626560305, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26787536.0, "logits/rejected": 18081760.0, "logps/chosen": -577.8492431640625, "logps/rejected": -512.0435791015625, "loss": 0.2451, "rewards/chosen": 0.40246355533599854, "rewards/margins": 3.2252126932144165, "rewards/rejected": -2.822749137878418, "step": 5913 }, { "epoch": 0.31346566665783265, "grad_norm": 46.75, "kl": 1.4257049560546875, "learning_rate": 5e-07, "logits/chosen": -3589099.75, "logits/rejected": -26362326.0, "logps/chosen": -327.94036865234375, "logps/rejected": -273.396240234375, "loss": 0.357, "rewards/chosen": 0.321287602186203, "rewards/margins": 1.4382042586803436, "rewards/rejected": -1.1169166564941406, "step": 5914 }, { "epoch": 0.3135186706596348, "grad_norm": 43.75, "kl": 0.6240882873535156, "learning_rate": 5e-07, "logits/chosen": -27922150.0, "logits/rejected": -22270396.0, "logps/chosen": -264.6828918457031, "logps/rejected": -187.48377990722656, "loss": 0.3483, "rewards/chosen": 0.22379454970359802, "rewards/margins": 1.4715547263622284, "rewards/rejected": -1.2477601766586304, "step": 5915 }, { "epoch": 0.3135716746614369, "grad_norm": 50.0, "kl": 1.2997474670410156, "learning_rate": 5e-07, "logits/chosen": -18441000.0, "logits/rejected": -79812296.0, "logps/chosen": -298.6666782924107, "logps/rejected": -258.0464782714844, "loss": 0.3392, "rewards/chosen": 0.7098759242466518, "rewards/margins": 2.628175837653024, "rewards/rejected": -1.918299913406372, "step": 5916 }, { "epoch": 0.31362467866323906, "grad_norm": 46.25, "kl": 0.126739501953125, "learning_rate": 5e-07, "logits/chosen": -56498112.0, "logits/rejected": -17515858.666666668, "logps/chosen": -873.3931884765625, "logps/rejected": -214.62215169270834, "loss": 0.1864, "rewards/chosen": 2.1552627086639404, "rewards/margins": 3.8621156215667725, "rewards/rejected": -1.706852912902832, "step": 5917 }, { "epoch": 0.3136776826650412, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63295968.0, "logits/rejected": 1862574.0, "logps/chosen": -483.9930013020833, "logps/rejected": -152.2580322265625, "loss": 0.3124, "rewards/chosen": 0.7941672801971436, "rewards/margins": 1.8628791332244874, "rewards/rejected": -1.0687118530273438, "step": 5918 }, { "epoch": 0.31373068666684334, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16496645.0, "logits/rejected": -24641440.0, "logps/chosen": -525.6915283203125, "logps/rejected": -365.6630452473958, "loss": 0.2341, "rewards/chosen": 0.7083106637001038, "rewards/margins": 2.451159695784251, "rewards/rejected": -1.7428490320841472, "step": 5919 }, { "epoch": 0.3137836906686455, "grad_norm": 58.75, "kl": 0.2640361785888672, "learning_rate": 5e-07, "logits/chosen": -20072449.6, "logits/rejected": 1738294.3333333333, "logps/chosen": -352.35283203125, "logps/rejected": -148.98262532552084, "loss": 0.3622, "rewards/chosen": 0.12699005603790284, "rewards/margins": 1.8079888582229615, "rewards/rejected": -1.6809988021850586, "step": 5920 }, { "epoch": 0.3138366946704476, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16369297.0, "logits/rejected": -17210648.0, "logps/chosen": -219.45574951171875, "logps/rejected": -300.84326171875, "loss": 0.2468, "rewards/chosen": 0.3028770983219147, "rewards/margins": 3.3011752665042877, "rewards/rejected": -2.998298168182373, "step": 5921 }, { "epoch": 0.31388969867224975, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25829288.0, "logits/rejected": -8606108.0, "logps/chosen": -156.67095947265625, "logps/rejected": -162.5794219970703, "loss": 0.4333, "rewards/chosen": -0.27427753806114197, "rewards/margins": 0.7287363111972809, "rewards/rejected": -1.0030138492584229, "step": 5922 }, { "epoch": 0.3139427026740519, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43365427.2, "logits/rejected": -15432410.666666666, "logps/chosen": -333.8851318359375, "logps/rejected": -324.4945475260417, "loss": 0.4259, "rewards/chosen": -0.14024155139923095, "rewards/margins": 1.1050023158391316, "rewards/rejected": -1.2452438672383626, "step": 5923 }, { "epoch": 0.313995706675854, "grad_norm": 76.0, "kl": 1.8899612426757812, "learning_rate": 5e-07, "logits/chosen": -24026838.85714286, "logits/rejected": -19793570.0, "logps/chosen": -329.71902901785717, "logps/rejected": -94.06267547607422, "loss": 0.4294, "rewards/chosen": 0.28674711499895367, "rewards/margins": 2.490882294518607, "rewards/rejected": -2.2041351795196533, "step": 5924 }, { "epoch": 0.31404871067765616, "grad_norm": 41.25, "kl": 0.8769521713256836, "learning_rate": 5e-07, "logits/chosen": -18421706.0, "logits/rejected": -11876735.0, "logps/chosen": -356.54180908203125, "logps/rejected": -88.77977752685547, "loss": 0.3125, "rewards/chosen": 0.5951688289642334, "rewards/margins": 1.5383602976799011, "rewards/rejected": -0.9431914687156677, "step": 5925 }, { "epoch": 0.3141017146794583, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40073900.8, "logits/rejected": -31749434.666666668, "logps/chosen": -220.639453125, "logps/rejected": -211.4095458984375, "loss": 0.4472, "rewards/chosen": -0.4340356826782227, "rewards/margins": 0.907489554087321, "rewards/rejected": -1.3415252367655437, "step": 5926 }, { "epoch": 0.31415471868126044, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17913944.0, "logits/rejected": -38522760.0, "logps/chosen": -221.80977376302084, "logps/rejected": -449.0877685546875, "loss": 0.3631, "rewards/chosen": 0.3382692337036133, "rewards/margins": 2.749953031539917, "rewards/rejected": -2.4116837978363037, "step": 5927 }, { "epoch": 0.3142077226830626, "grad_norm": 44.0, "kl": 0.7037582397460938, "learning_rate": 5e-07, "logits/chosen": -49489412.0, "logits/rejected": -24095284.0, "logps/chosen": -394.3443603515625, "logps/rejected": -261.7558288574219, "loss": 0.2571, "rewards/chosen": 0.7837060689926147, "rewards/margins": 2.5213937759399414, "rewards/rejected": -1.7376877069473267, "step": 5928 }, { "epoch": 0.3142607266848647, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43714664.0, "logits/rejected": -50084672.0, "logps/chosen": -315.0870666503906, "logps/rejected": -397.4139404296875, "loss": 0.2248, "rewards/chosen": 0.10595741122961044, "rewards/margins": 2.3612881327668824, "rewards/rejected": -2.255330721537272, "step": 5929 }, { "epoch": 0.31431373068666685, "grad_norm": 47.25, "kl": 0.763458251953125, "learning_rate": 5e-07, "logits/chosen": 10858851.0, "logits/rejected": -5974400.0, "logps/chosen": -123.6110610961914, "logps/rejected": -255.49342346191406, "loss": 0.3696, "rewards/chosen": 0.32060471177101135, "rewards/margins": 1.2090024054050446, "rewards/rejected": -0.8883976936340332, "step": 5930 }, { "epoch": 0.314366734688469, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64835092.0, "logits/rejected": -24827338.666666668, "logps/chosen": -427.1900939941406, "logps/rejected": -358.451416015625, "loss": 0.2761, "rewards/chosen": 0.49763184785842896, "rewards/margins": 1.8453648686408997, "rewards/rejected": -1.3477330207824707, "step": 5931 }, { "epoch": 0.3144197386902711, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28766890.666666668, "logits/rejected": -34727172.0, "logps/chosen": -285.2173258463542, "logps/rejected": -143.44271850585938, "loss": 0.3132, "rewards/chosen": 0.7154279549916586, "rewards/margins": 1.9879384835561118, "rewards/rejected": -1.2725105285644531, "step": 5932 }, { "epoch": 0.31447274269207326, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19046818.666666668, "logits/rejected": -24537702.0, "logps/chosen": -340.7830810546875, "logps/rejected": -586.8091430664062, "loss": 0.3701, "rewards/chosen": 0.28376154104868573, "rewards/margins": 2.543185512224833, "rewards/rejected": -2.2594239711761475, "step": 5933 }, { "epoch": 0.3145257466938754, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32131104.0, "logits/rejected": -75633288.0, "logps/chosen": -238.2160847981771, "logps/rejected": -842.7496948242188, "loss": 0.3621, "rewards/chosen": 0.1369530459245046, "rewards/margins": 3.413161685069402, "rewards/rejected": -3.2762086391448975, "step": 5934 }, { "epoch": 0.31457875069567753, "grad_norm": 62.5, "kl": 0.30477142333984375, "learning_rate": 5e-07, "logits/chosen": -2913652.6666666665, "logits/rejected": -46160672.0, "logps/chosen": -410.6492106119792, "logps/rejected": -443.7787170410156, "loss": 0.3633, "rewards/chosen": 0.2364195187886556, "rewards/margins": 2.5878752072652182, "rewards/rejected": -2.3514556884765625, "step": 5935 }, { "epoch": 0.31463175469747967, "grad_norm": 57.75, "kl": 0.6517882347106934, "learning_rate": 5e-07, "logits/chosen": 39113990.4, "logits/rejected": -25057026.666666668, "logps/chosen": -323.1533203125, "logps/rejected": -281.1559244791667, "loss": 0.355, "rewards/chosen": 0.46715774536132815, "rewards/margins": 1.5779205799102782, "rewards/rejected": -1.1107628345489502, "step": 5936 }, { "epoch": 0.3146847586992818, "grad_norm": 39.0, "kl": 1.2684717178344727, "learning_rate": 5e-07, "logits/chosen": -17864546.0, "logits/rejected": 103910080.0, "logps/chosen": -112.25047302246094, "logps/rejected": -418.40545654296875, "loss": 0.3409, "rewards/chosen": 0.025068514049053192, "rewards/margins": 1.689428560435772, "rewards/rejected": -1.6643600463867188, "step": 5937 }, { "epoch": 0.31473776270108395, "grad_norm": 63.0, "kl": 0.4976844787597656, "learning_rate": 5e-07, "logits/chosen": -29773466.666666668, "logits/rejected": -110723376.0, "logps/chosen": -663.3671061197916, "logps/rejected": -473.8336181640625, "loss": 0.2407, "rewards/chosen": 0.9691955248514811, "rewards/margins": 3.445492426554362, "rewards/rejected": -2.476296901702881, "step": 5938 }, { "epoch": 0.3147907667028861, "grad_norm": 51.25, "kl": 3.728557586669922, "learning_rate": 5e-07, "logits/chosen": -31732470.4, "logits/rejected": -44035320.0, "logps/chosen": -369.5231201171875, "logps/rejected": -400.1457926432292, "loss": 0.3903, "rewards/chosen": 0.2388071298599243, "rewards/margins": 1.933307417233785, "rewards/rejected": -1.6945002873738606, "step": 5939 }, { "epoch": 0.3148437707046882, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86892776.0, "logits/rejected": -66844456.0, "logps/chosen": -514.0068969726562, "logps/rejected": -420.50128173828125, "loss": 0.2219, "rewards/chosen": 0.7687351703643799, "rewards/margins": 3.5494439601898193, "rewards/rejected": -2.7807087898254395, "step": 5940 }, { "epoch": 0.31489677470649036, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7112896.0, "logits/rejected": 2067416.25, "logps/chosen": -155.16899762834822, "logps/rejected": -63.338096618652344, "loss": 0.4719, "rewards/chosen": -0.11582368612289429, "rewards/margins": 2.170387089252472, "rewards/rejected": -2.286210775375366, "step": 5941 }, { "epoch": 0.3149497787082925, "grad_norm": 52.75, "kl": 2.100200653076172, "learning_rate": 5e-07, "logits/chosen": -72386880.0, "logits/rejected": -17684179.2, "logps/chosen": -421.4151611328125, "logps/rejected": -329.1512451171875, "loss": 0.2209, "rewards/chosen": 0.652228832244873, "rewards/margins": 2.677957057952881, "rewards/rejected": -2.025728225708008, "step": 5942 }, { "epoch": 0.31500278271009463, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14697198.0, "logits/rejected": -45417952.0, "logps/chosen": -280.7215881347656, "logps/rejected": -385.913818359375, "loss": 0.3201, "rewards/chosen": 0.19081862270832062, "rewards/margins": 2.513861730694771, "rewards/rejected": -2.32304310798645, "step": 5943 }, { "epoch": 0.31505578671189677, "grad_norm": 57.75, "kl": 0.034458160400390625, "learning_rate": 5e-07, "logits/chosen": 3010785.75, "logits/rejected": -16521888.0, "logps/chosen": -456.06475830078125, "logps/rejected": -305.1488037109375, "loss": 0.297, "rewards/chosen": -0.24015504121780396, "rewards/margins": 1.2255012392997742, "rewards/rejected": -1.4656562805175781, "step": 5944 }, { "epoch": 0.3151087907136989, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71271648.0, "logits/rejected": -22972152.0, "logps/chosen": -480.5406494140625, "logps/rejected": -331.2671813964844, "loss": 0.3314, "rewards/chosen": 0.2617768347263336, "rewards/margins": 1.6802766621112823, "rewards/rejected": -1.4184998273849487, "step": 5945 }, { "epoch": 0.31516179471550104, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7724204.8, "logits/rejected": -11099622.666666666, "logps/chosen": -231.3853515625, "logps/rejected": -162.26732381184897, "loss": 0.3515, "rewards/chosen": 0.0863871693611145, "rewards/margins": 1.9421677708625793, "rewards/rejected": -1.8557806015014648, "step": 5946 }, { "epoch": 0.3152147987173032, "grad_norm": 84.5, "kl": 0.7389678955078125, "learning_rate": 5e-07, "logits/chosen": -10609117.0, "logits/rejected": -20505252.0, "logps/chosen": -186.0491943359375, "logps/rejected": -359.9605712890625, "loss": 0.3167, "rewards/chosen": 0.2179247885942459, "rewards/margins": 1.7985245734453201, "rewards/rejected": -1.5805997848510742, "step": 5947 }, { "epoch": 0.3152678027191053, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15630669.0, "logits/rejected": -47025877.333333336, "logps/chosen": -117.78276062011719, "logps/rejected": -450.4801432291667, "loss": 0.2224, "rewards/chosen": 0.06072578579187393, "rewards/margins": 2.616696613530318, "rewards/rejected": -2.555970827738444, "step": 5948 }, { "epoch": 0.31532080672090745, "grad_norm": 54.25, "kl": 0.04285430908203125, "learning_rate": 5e-07, "logits/chosen": -12624517.333333334, "logits/rejected": -19982072.0, "logps/chosen": -344.3401692708333, "logps/rejected": -419.31982421875, "loss": 0.302, "rewards/chosen": 0.597657839457194, "rewards/margins": 2.5449647108713784, "rewards/rejected": -1.9473068714141846, "step": 5949 }, { "epoch": 0.3153738107227096, "grad_norm": 40.25, "kl": 0.3248176574707031, "learning_rate": 5e-07, "logits/chosen": -5628005.333333333, "logits/rejected": -47191776.0, "logps/chosen": -368.7897135416667, "logps/rejected": -527.3595703125, "loss": 0.2005, "rewards/chosen": 0.35195974508921307, "rewards/margins": 3.5703893105189004, "rewards/rejected": -3.2184295654296875, "step": 5950 }, { "epoch": 0.31542681472451173, "grad_norm": 41.5, "kl": 1.9411163330078125, "learning_rate": 5e-07, "logits/chosen": -3189424.5, "logits/rejected": -19939710.0, "logps/chosen": -205.47222900390625, "logps/rejected": -391.9533386230469, "loss": 0.2744, "rewards/chosen": 0.3654404282569885, "rewards/margins": 2.393127739429474, "rewards/rejected": -2.0276873111724854, "step": 5951 }, { "epoch": 0.3154798187263138, "grad_norm": 44.25, "kl": 1.308624267578125, "learning_rate": 5e-07, "logits/chosen": -53773130.666666664, "logits/rejected": -20010816.0, "logps/chosen": -445.5016682942708, "logps/rejected": -312.260302734375, "loss": 0.2957, "rewards/chosen": 0.18514825900395712, "rewards/margins": 1.854299755891164, "rewards/rejected": -1.669151496887207, "step": 5952 }, { "epoch": 0.31553282272811595, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36877292.8, "logits/rejected": -59611850.666666664, "logps/chosen": -346.13525390625, "logps/rejected": -489.1559244791667, "loss": 0.2859, "rewards/chosen": 0.42861738204956057, "rewards/margins": 3.0486013730367025, "rewards/rejected": -2.619983990987142, "step": 5953 }, { "epoch": 0.3155858267299181, "grad_norm": 46.0, "kl": 0.029764175415039062, "learning_rate": 5e-07, "logits/chosen": -26381434.0, "logits/rejected": -15414496.0, "logps/chosen": -311.927001953125, "logps/rejected": -361.2929280598958, "loss": 0.2773, "rewards/chosen": -0.3322868347167969, "rewards/margins": 2.3662643432617188, "rewards/rejected": -2.6985511779785156, "step": 5954 }, { "epoch": 0.3156388307317202, "grad_norm": 85.0, "kl": 1.3969955444335938, "learning_rate": 5e-07, "logits/chosen": -30196930.666666668, "logits/rejected": -31602860.0, "logps/chosen": -562.8858235677084, "logps/rejected": -319.6585693359375, "loss": 0.3415, "rewards/chosen": 0.5141144196192423, "rewards/margins": 2.891050140062968, "rewards/rejected": -2.3769357204437256, "step": 5955 }, { "epoch": 0.31569183473352236, "grad_norm": 40.0, "kl": 0.05339622497558594, "learning_rate": 5e-07, "logits/chosen": -13722453.0, "logits/rejected": -6291579.0, "logps/chosen": -230.55526733398438, "logps/rejected": -110.6638412475586, "loss": 0.2414, "rewards/chosen": 1.1157090663909912, "rewards/margins": 2.700273036956787, "rewards/rejected": -1.584563970565796, "step": 5956 }, { "epoch": 0.3157448387353245, "grad_norm": 59.25, "kl": 0.48558807373046875, "learning_rate": 5e-07, "logits/chosen": -30763980.8, "logits/rejected": 13837030.666666666, "logps/chosen": -233.0113037109375, "logps/rejected": -179.92742919921875, "loss": 0.3997, "rewards/chosen": 0.2444685459136963, "rewards/margins": 1.1388769308725992, "rewards/rejected": -0.894408384958903, "step": 5957 }, { "epoch": 0.31579784273712663, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20565544.0, "logits/rejected": -16117792.0, "logps/chosen": -180.88482666015625, "logps/rejected": -176.687451171875, "loss": 0.3097, "rewards/chosen": -0.17707488934199014, "rewards/margins": 1.7381465156873066, "rewards/rejected": -1.9152214050292968, "step": 5958 }, { "epoch": 0.31585084673892877, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1682539.5, "logits/rejected": -12948163.0, "logps/chosen": -310.2215270996094, "logps/rejected": -326.3512268066406, "loss": 0.3187, "rewards/chosen": -0.31880247592926025, "rewards/margins": 2.644922137260437, "rewards/rejected": -2.9637246131896973, "step": 5959 }, { "epoch": 0.3159038507407309, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20152924.0, "logits/rejected": -12893537.0, "logps/chosen": -254.3224334716797, "logps/rejected": -325.09564208984375, "loss": 0.2851, "rewards/chosen": 0.4178333282470703, "rewards/margins": 2.567445993423462, "rewards/rejected": -2.1496126651763916, "step": 5960 }, { "epoch": 0.31595685474253304, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33065696.0, "logits/rejected": -41521352.0, "logps/chosen": -271.4641927083333, "logps/rejected": -592.6182861328125, "loss": 0.388, "rewards/chosen": 0.12158705790837605, "rewards/margins": 2.086897869904836, "rewards/rejected": -1.96531081199646, "step": 5961 }, { "epoch": 0.3160098587443352, "grad_norm": 42.75, "kl": 1.2200393676757812, "learning_rate": 5e-07, "logits/chosen": -18401877.333333332, "logits/rejected": -22856416.0, "logps/chosen": -210.557861328125, "logps/rejected": -370.977294921875, "loss": 0.3308, "rewards/chosen": 0.17865041891733804, "rewards/margins": 1.710219136873881, "rewards/rejected": -1.531568717956543, "step": 5962 }, { "epoch": 0.3160628627461373, "grad_norm": 51.0, "kl": 0.42215538024902344, "learning_rate": 5e-07, "logits/chosen": 6054814.666666667, "logits/rejected": -1225379.5, "logps/chosen": -203.95137532552084, "logps/rejected": -487.4911193847656, "loss": 0.3192, "rewards/chosen": 0.5916176637013754, "rewards/margins": 2.3834838469823203, "rewards/rejected": -1.7918661832809448, "step": 5963 }, { "epoch": 0.31611586674793946, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6355073.333333333, "logits/rejected": -1820995.2, "logps/chosen": -131.167236328125, "logps/rejected": -269.6893310546875, "loss": 0.2507, "rewards/chosen": 0.8610970179239908, "rewards/margins": 2.3744488398234047, "rewards/rejected": -1.513351821899414, "step": 5964 }, { "epoch": 0.3161688707497416, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16687476.0, "logits/rejected": -15278514.0, "logps/chosen": -452.459228515625, "logps/rejected": -149.7520751953125, "loss": 0.4255, "rewards/chosen": -0.3993869721889496, "rewards/margins": 0.5846661031246185, "rewards/rejected": -0.9840530753135681, "step": 5965 }, { "epoch": 0.31622187475154373, "grad_norm": 54.75, "kl": 0.9248285293579102, "learning_rate": 5e-07, "logits/chosen": -48035644.0, "logits/rejected": 7334137.0, "logps/chosen": -555.0662841796875, "logps/rejected": -255.4522247314453, "loss": 0.2976, "rewards/chosen": 0.5888440608978271, "rewards/margins": 1.9179723262786865, "rewards/rejected": -1.3291282653808594, "step": 5966 }, { "epoch": 0.31627487875334587, "grad_norm": 42.75, "kl": 1.558445930480957, "learning_rate": 5e-07, "logits/chosen": -21769768.0, "logits/rejected": -11999220.0, "logps/chosen": -186.1916259765625, "logps/rejected": -200.051513671875, "loss": 0.2918, "rewards/chosen": 0.6742980480194092, "rewards/margins": 2.3508838812510175, "rewards/rejected": -1.6765858332316081, "step": 5967 }, { "epoch": 0.316327882755148, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14243881.0, "logits/rejected": -9005596.0, "logps/chosen": -220.30027770996094, "logps/rejected": -571.6161499023438, "loss": 0.2886, "rewards/chosen": 0.15382367372512817, "rewards/margins": 2.7469409108161926, "rewards/rejected": -2.5931172370910645, "step": 5968 }, { "epoch": 0.31638088675695014, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27142579.2, "logits/rejected": 4618393.666666667, "logps/chosen": -306.470361328125, "logps/rejected": -88.66173299153645, "loss": 0.3616, "rewards/chosen": 0.08657974004745483, "rewards/margins": 2.013841251532237, "rewards/rejected": -1.927261511484782, "step": 5969 }, { "epoch": 0.3164338907587523, "grad_norm": 61.25, "kl": 1.2808141708374023, "learning_rate": 5e-07, "logits/chosen": -15562764.8, "logits/rejected": -12560913.333333334, "logps/chosen": -609.615966796875, "logps/rejected": -331.90997314453125, "loss": 0.3235, "rewards/chosen": 0.5140588283538818, "rewards/margins": 2.5096925894419355, "rewards/rejected": -1.9956337610880535, "step": 5970 }, { "epoch": 0.3164868947605544, "grad_norm": 44.0, "kl": 0.4372291564941406, "learning_rate": 5e-07, "logits/chosen": 1491284.1666666667, "logits/rejected": -63205132.8, "logps/chosen": -225.58695475260416, "logps/rejected": -368.126220703125, "loss": 0.2252, "rewards/chosen": 0.5847902297973633, "rewards/margins": 2.5318403244018555, "rewards/rejected": -1.9470500946044922, "step": 5971 }, { "epoch": 0.31653989876235655, "grad_norm": 73.0, "kl": 0.9832954406738281, "learning_rate": 5e-07, "logits/chosen": -44232624.0, "logits/rejected": -43890288.0, "logps/chosen": -541.7383626302084, "logps/rejected": -234.6708984375, "loss": 0.2671, "rewards/chosen": 0.9249011675516764, "rewards/margins": 2.989933411280314, "rewards/rejected": -2.0650322437286377, "step": 5972 }, { "epoch": 0.3165929027641587, "grad_norm": 45.75, "kl": 0.48737144470214844, "learning_rate": 5e-07, "logits/chosen": 2270226.6666666665, "logits/rejected": 347593.2, "logps/chosen": -54.45738728841146, "logps/rejected": -162.82218017578126, "loss": 0.3254, "rewards/chosen": -0.08753311634063721, "rewards/margins": 1.4011062383651733, "rewards/rejected": -1.4886393547058105, "step": 5973 }, { "epoch": 0.3166459067659608, "grad_norm": 47.75, "kl": 0.030462265014648438, "learning_rate": 5e-07, "logits/chosen": -21383313.333333332, "logits/rejected": -37422019.2, "logps/chosen": -75.22968037923177, "logps/rejected": -299.201953125, "loss": 0.3726, "rewards/chosen": -0.27906330426534015, "rewards/margins": 1.1768898169199626, "rewards/rejected": -1.4559531211853027, "step": 5974 }, { "epoch": 0.31669891076776296, "grad_norm": 49.0, "kl": 0.7249832153320312, "learning_rate": 5e-07, "logits/chosen": -49911509.333333336, "logits/rejected": -17190636.8, "logps/chosen": -489.0859781901042, "logps/rejected": -227.38837890625, "loss": 0.2403, "rewards/chosen": 0.6155598958333334, "rewards/margins": 2.5148038228352867, "rewards/rejected": -1.8992439270019532, "step": 5975 }, { "epoch": 0.3167519147695651, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17272051.2, "logits/rejected": -18278401.333333332, "logps/chosen": -590.41611328125, "logps/rejected": -176.39835611979166, "loss": 0.3645, "rewards/chosen": 0.5461649894714355, "rewards/margins": 1.391120433807373, "rewards/rejected": -0.8449554443359375, "step": 5976 }, { "epoch": 0.31680491877136724, "grad_norm": 41.25, "kl": 0.2403888702392578, "learning_rate": 5e-07, "logits/chosen": -7788028.0, "logits/rejected": -41154936.0, "logps/chosen": -216.4726104736328, "logps/rejected": -336.4751281738281, "loss": 0.2951, "rewards/chosen": 0.07215355336666107, "rewards/margins": 2.6230559200048447, "rewards/rejected": -2.5509023666381836, "step": 5977 }, { "epoch": 0.3168579227731694, "grad_norm": 42.25, "kl": 0.00203704833984375, "learning_rate": 5e-07, "logits/chosen": -46673496.0, "logits/rejected": -34102472.0, "logps/chosen": -142.3583221435547, "logps/rejected": -493.3883056640625, "loss": 0.3171, "rewards/chosen": -0.09257060289382935, "rewards/margins": 2.471913516521454, "rewards/rejected": -2.564484119415283, "step": 5978 }, { "epoch": 0.3169109267749715, "grad_norm": 67.0, "kl": 1.5749969482421875, "learning_rate": 5e-07, "logits/chosen": 10642629.333333334, "logits/rejected": -66380412.0, "logps/chosen": -403.7392578125, "logps/rejected": -689.4965209960938, "loss": 0.3284, "rewards/chosen": 0.39485780398050946, "rewards/margins": 4.162713686625163, "rewards/rejected": -3.7678558826446533, "step": 5979 }, { "epoch": 0.31696393077677365, "grad_norm": 48.25, "kl": 0.30275726318359375, "learning_rate": 5e-07, "logits/chosen": -20329365.333333332, "logits/rejected": -10126664.0, "logps/chosen": -222.17789713541666, "logps/rejected": -292.5231628417969, "loss": 0.2869, "rewards/chosen": 0.7140615781148275, "rewards/margins": 3.6620806058247886, "rewards/rejected": -2.948019027709961, "step": 5980 }, { "epoch": 0.3170169347785758, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34123109.333333336, "logits/rejected": -33537638.4, "logps/chosen": -242.3558146158854, "logps/rejected": -470.199072265625, "loss": 0.273, "rewards/chosen": 0.006455043951670329, "rewards/margins": 3.18748646179835, "rewards/rejected": -3.18103141784668, "step": 5981 }, { "epoch": 0.3170699387803779, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30305606.0, "logits/rejected": -34095213.333333336, "logps/chosen": -152.63818359375, "logps/rejected": -343.73291015625, "loss": 0.2368, "rewards/chosen": -0.3402370512485504, "rewards/margins": 1.9191237390041351, "rewards/rejected": -2.2593607902526855, "step": 5982 }, { "epoch": 0.31712294278218006, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23828189.333333332, "logits/rejected": -20874984.0, "logps/chosen": -363.132568359375, "logps/rejected": -296.8050537109375, "loss": 0.4054, "rewards/chosen": 0.2553199728329976, "rewards/margins": 1.6183388431866963, "rewards/rejected": -1.3630188703536987, "step": 5983 }, { "epoch": 0.3171759467839822, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46484485.333333336, "logits/rejected": -62716787.2, "logps/chosen": -386.7978515625, "logps/rejected": -229.2517822265625, "loss": 0.2799, "rewards/chosen": 0.3149007360140483, "rewards/margins": 1.894794801870982, "rewards/rejected": -1.5798940658569336, "step": 5984 }, { "epoch": 0.31722895078578434, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19340172.0, "logits/rejected": -20738957.333333332, "logps/chosen": -457.4951171875, "logps/rejected": -316.7251790364583, "loss": 0.2365, "rewards/chosen": 0.6620651483535767, "rewards/margins": 2.230062445004781, "rewards/rejected": -1.5679972966512044, "step": 5985 }, { "epoch": 0.3172819547875865, "grad_norm": 54.0, "kl": 0.3934288024902344, "learning_rate": 5e-07, "logits/chosen": -11449140.0, "logits/rejected": -47044148.0, "logps/chosen": -607.7239990234375, "logps/rejected": -379.67034912109375, "loss": 0.2482, "rewards/chosen": 0.4885650873184204, "rewards/margins": 2.83502733707428, "rewards/rejected": -2.3464622497558594, "step": 5986 }, { "epoch": 0.3173349587893886, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39930892.0, "logits/rejected": -15696645.0, "logps/chosen": -697.1634521484375, "logps/rejected": -210.94863891601562, "loss": 0.1633, "rewards/chosen": 1.2710334062576294, "rewards/margins": 3.764724373817444, "rewards/rejected": -2.4936909675598145, "step": 5987 }, { "epoch": 0.31738796279119075, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34931308.0, "logits/rejected": -70054608.0, "logps/chosen": -116.48650360107422, "logps/rejected": -253.4698486328125, "loss": 0.2539, "rewards/chosen": -0.2772649824619293, "rewards/margins": 1.5428825318813324, "rewards/rejected": -1.8201475143432617, "step": 5988 }, { "epoch": 0.3174409667929929, "grad_norm": 58.75, "kl": 0.16766071319580078, "learning_rate": 5e-07, "logits/chosen": -17140158.85714286, "logits/rejected": -24580900.0, "logps/chosen": -467.11202566964283, "logps/rejected": -79.38059997558594, "loss": 0.4292, "rewards/chosen": 0.30298004831586567, "rewards/margins": 1.6225143500736783, "rewards/rejected": -1.3195343017578125, "step": 5989 }, { "epoch": 0.317493970794795, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11765213.6, "logits/rejected": -46474565.333333336, "logps/chosen": -145.2220947265625, "logps/rejected": -132.5367431640625, "loss": 0.3468, "rewards/chosen": 0.43068222999572753, "rewards/margins": 1.500198252995809, "rewards/rejected": -1.0695160230000813, "step": 5990 }, { "epoch": 0.31754697479659716, "grad_norm": 36.75, "kl": 0.9656105041503906, "learning_rate": 5e-07, "logits/chosen": -16154822.0, "logits/rejected": -42957516.0, "logps/chosen": -404.594970703125, "logps/rejected": -375.8385009765625, "loss": 0.2934, "rewards/chosen": 0.40904292464256287, "rewards/margins": 2.9105795323848724, "rewards/rejected": -2.5015366077423096, "step": 5991 }, { "epoch": 0.3175999787983993, "grad_norm": 43.75, "kl": 0.32233428955078125, "learning_rate": 5e-07, "logits/chosen": -21841846.0, "logits/rejected": -19420752.0, "logps/chosen": -359.0448913574219, "logps/rejected": -190.72525024414062, "loss": 0.2741, "rewards/chosen": 1.1685020923614502, "rewards/margins": 2.3113142251968384, "rewards/rejected": -1.1428121328353882, "step": 5992 }, { "epoch": 0.31765298280020143, "grad_norm": 39.0, "kl": 1.6333627700805664, "learning_rate": 5e-07, "logits/chosen": -20274492.0, "logits/rejected": -1138151.0, "logps/chosen": -179.23947143554688, "logps/rejected": -399.2010803222656, "loss": 0.3403, "rewards/chosen": 0.4039846360683441, "rewards/margins": 2.2061963975429535, "rewards/rejected": -1.8022117614746094, "step": 5993 }, { "epoch": 0.31770598680200357, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8854221.333333334, "logits/rejected": -24868024.0, "logps/chosen": -238.24947102864584, "logps/rejected": -292.9724365234375, "loss": 0.2907, "rewards/chosen": -0.24646480878194174, "rewards/margins": 1.9461722532908123, "rewards/rejected": -2.192637062072754, "step": 5994 }, { "epoch": 0.3177589908038057, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3788618.0, "logits/rejected": -28767467.42857143, "logps/chosen": -2.676706314086914, "logps/rejected": -184.31747000558036, "loss": 0.2512, "rewards/chosen": 0.058715056627988815, "rewards/margins": 1.7381435659314906, "rewards/rejected": -1.6794285093035017, "step": 5995 }, { "epoch": 0.31781199480560784, "grad_norm": 54.5, "kl": 3.4229888916015625, "learning_rate": 5e-07, "logits/chosen": -6949022.0, "logits/rejected": -4985234.5, "logps/chosen": -546.1741536458334, "logps/rejected": -96.65884399414062, "loss": 0.344, "rewards/chosen": 1.1262993812561035, "rewards/margins": 2.130096435546875, "rewards/rejected": -1.0037970542907715, "step": 5996 }, { "epoch": 0.31786499880741, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7460371.0, "logits/rejected": -8630498.0, "logps/chosen": -276.08880615234375, "logps/rejected": -320.56536865234375, "loss": 0.2047, "rewards/chosen": 0.354765921831131, "rewards/margins": 2.861721227566401, "rewards/rejected": -2.50695530573527, "step": 5997 }, { "epoch": 0.3179180028092121, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36730224.0, "logits/rejected": -4936432.0, "logps/chosen": -182.32185872395834, "logps/rejected": -538.7310546875, "loss": 0.2342, "rewards/chosen": 0.09644591808319092, "rewards/margins": 2.680588412284851, "rewards/rejected": -2.58414249420166, "step": 5998 }, { "epoch": 0.31797100681101426, "grad_norm": 50.75, "kl": 0.08112716674804688, "learning_rate": 5e-07, "logits/chosen": -32914470.4, "logits/rejected": -6353386.0, "logps/chosen": -350.210107421875, "logps/rejected": -202.37591552734375, "loss": 0.3564, "rewards/chosen": 0.4066637992858887, "rewards/margins": 1.43984006245931, "rewards/rejected": -1.0331762631734211, "step": 5999 }, { "epoch": 0.3180240108128164, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7662260.0, "logits/rejected": -21875022.0, "logps/chosen": -213.12220764160156, "logps/rejected": -302.18536376953125, "loss": 0.2879, "rewards/chosen": 0.5241909027099609, "rewards/margins": 2.159082293510437, "rewards/rejected": -1.634891390800476, "step": 6000 }, { "epoch": 0.31807701481461853, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7671084.0, "logits/rejected": -28355078.4, "logps/chosen": -125.42336018880208, "logps/rejected": -262.619287109375, "loss": 0.3053, "rewards/chosen": 0.24784725904464722, "rewards/margins": 1.6394582629203795, "rewards/rejected": -1.3916110038757323, "step": 6001 }, { "epoch": 0.3181300188164206, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23053873.6, "logits/rejected": -101862656.0, "logps/chosen": -127.425341796875, "logps/rejected": -522.4684244791666, "loss": 0.3583, "rewards/chosen": 0.10425791740417481, "rewards/margins": 2.889425293604533, "rewards/rejected": -2.785167376200358, "step": 6002 }, { "epoch": 0.31818302281822275, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6449676.5, "logits/rejected": 52745717.333333336, "logps/chosen": -156.53677368164062, "logps/rejected": -265.7143961588542, "loss": 0.3125, "rewards/chosen": 0.1605907380580902, "rewards/margins": 1.4861221412817638, "rewards/rejected": -1.3255314032236736, "step": 6003 }, { "epoch": 0.3182360268200249, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32117884.0, "logits/rejected": -25628694.0, "logps/chosen": -341.1175842285156, "logps/rejected": -372.40777587890625, "loss": 0.2724, "rewards/chosen": 0.18809165060520172, "rewards/margins": 2.641674891114235, "rewards/rejected": -2.453583240509033, "step": 6004 }, { "epoch": 0.318289030821827, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28653661.714285713, "logits/rejected": 901865.75, "logps/chosen": -141.39619663783483, "logps/rejected": -43.192779541015625, "loss": 0.4475, "rewards/chosen": 0.060295164585113525, "rewards/margins": 1.5497782826423645, "rewards/rejected": -1.489483118057251, "step": 6005 }, { "epoch": 0.31834203482362916, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34740952.0, "logits/rejected": -12642263.0, "logps/chosen": -257.28863525390625, "logps/rejected": -190.36264038085938, "loss": 0.3316, "rewards/chosen": 0.0955204963684082, "rewards/margins": 1.6289501190185547, "rewards/rejected": -1.5334296226501465, "step": 6006 }, { "epoch": 0.3183950388254313, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29287744.0, "logits/rejected": -11774710.4, "logps/chosen": -401.9241536458333, "logps/rejected": -295.3681640625, "loss": 0.3162, "rewards/chosen": 0.0926361083984375, "rewards/margins": 1.5947975158691405, "rewards/rejected": -1.502161407470703, "step": 6007 }, { "epoch": 0.31844804282723344, "grad_norm": 34.75, "kl": 0.13646221160888672, "learning_rate": 5e-07, "logits/chosen": -2847119.6666666665, "logits/rejected": -42824297.6, "logps/chosen": -69.3412577311198, "logps/rejected": -468.98291015625, "loss": 0.2557, "rewards/chosen": -0.2253519892692566, "rewards/margins": 2.3945992588996887, "rewards/rejected": -2.6199512481689453, "step": 6008 }, { "epoch": 0.3185010468290356, "grad_norm": 40.5, "kl": 0.6750335693359375, "learning_rate": 5e-07, "logits/chosen": -33784000.0, "logits/rejected": -17709956.0, "logps/chosen": -189.03086853027344, "logps/rejected": -71.96375274658203, "loss": 0.4177, "rewards/chosen": -0.25522613525390625, "rewards/margins": 0.8182802200317383, "rewards/rejected": -1.0735063552856445, "step": 6009 }, { "epoch": 0.3185540508308377, "grad_norm": 38.75, "kl": 0.9134140014648438, "learning_rate": 5e-07, "logits/chosen": 22454154.666666668, "logits/rejected": -22107072.0, "logps/chosen": -364.9201253255208, "logps/rejected": -364.9011474609375, "loss": 0.2011, "rewards/chosen": 0.4645434220631917, "rewards/margins": 2.733417018254598, "rewards/rejected": -2.268873596191406, "step": 6010 }, { "epoch": 0.31860705483263985, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24454756.0, "logits/rejected": -39077364.0, "logps/chosen": -344.31683349609375, "logps/rejected": -287.27587890625, "loss": 0.2958, "rewards/chosen": 0.3348467946052551, "rewards/margins": 2.293231189250946, "rewards/rejected": -1.958384394645691, "step": 6011 }, { "epoch": 0.318660058834442, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35586585.6, "logits/rejected": -41055106.666666664, "logps/chosen": -248.912109375, "logps/rejected": -186.3906046549479, "loss": 0.3426, "rewards/chosen": -0.07382190227508545, "rewards/margins": 2.9058470328648887, "rewards/rejected": -2.979668935139974, "step": 6012 }, { "epoch": 0.3187130628362441, "grad_norm": 58.25, "kl": 1.5430049896240234, "learning_rate": 5e-07, "logits/chosen": -22372454.4, "logits/rejected": -7645640.0, "logps/chosen": -305.8806640625, "logps/rejected": -249.85502115885416, "loss": 0.3122, "rewards/chosen": 0.5281853675842285, "rewards/margins": 2.725832144419352, "rewards/rejected": -2.1976467768351235, "step": 6013 }, { "epoch": 0.31876606683804626, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13015341.333333334, "logits/rejected": 5944520.0, "logps/chosen": -305.3555501302083, "logps/rejected": -68.23919677734375, "loss": 0.4388, "rewards/chosen": 0.23172072569529215, "rewards/margins": 0.5413829783598582, "rewards/rejected": -0.30966225266456604, "step": 6014 }, { "epoch": 0.3188190708398484, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86395733.33333333, "logits/rejected": -32938560.0, "logps/chosen": -192.00052897135416, "logps/rejected": -290.38681640625, "loss": 0.296, "rewards/chosen": 0.06848602493604024, "rewards/margins": 2.3502468128999077, "rewards/rejected": -2.2817607879638673, "step": 6015 }, { "epoch": 0.31887207484165053, "grad_norm": 38.0, "kl": 0.5413398742675781, "learning_rate": 5e-07, "logits/chosen": -14263830.0, "logits/rejected": -50538920.0, "logps/chosen": -218.58364868164062, "logps/rejected": -453.6954345703125, "loss": 0.2392, "rewards/chosen": 0.42036908864974976, "rewards/margins": 3.408744990825653, "rewards/rejected": -2.9883759021759033, "step": 6016 }, { "epoch": 0.31892507884345267, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5048461.6, "logits/rejected": 27256072.0, "logps/chosen": -311.87783203125, "logps/rejected": -469.8280436197917, "loss": 0.3223, "rewards/chosen": 0.3692401170730591, "rewards/margins": 2.231339764595032, "rewards/rejected": -1.8620996475219727, "step": 6017 }, { "epoch": 0.3189780828452548, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32898016.0, "logits/rejected": -2860557.4, "logps/chosen": -412.2389322916667, "logps/rejected": -161.62435302734374, "loss": 0.303, "rewards/chosen": 0.3605033953984578, "rewards/margins": 1.6195264895757038, "rewards/rejected": -1.259023094177246, "step": 6018 }, { "epoch": 0.31903108684705694, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8152773.0, "logits/rejected": -34447032.0, "logps/chosen": -88.36710357666016, "logps/rejected": -300.4351806640625, "loss": 0.2545, "rewards/chosen": 0.1850845366716385, "rewards/margins": 1.981904223561287, "rewards/rejected": -1.7968196868896484, "step": 6019 }, { "epoch": 0.3190840908488591, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54773350.4, "logits/rejected": -6763928.666666667, "logps/chosen": -498.224365234375, "logps/rejected": -90.75490315755208, "loss": 0.3972, "rewards/chosen": 0.04276795387268066, "rewards/margins": 1.241566292444865, "rewards/rejected": -1.1987983385721843, "step": 6020 }, { "epoch": 0.3191370948506612, "grad_norm": 59.25, "kl": 0.4168853759765625, "learning_rate": 5e-07, "logits/chosen": -13115288.0, "logps/chosen": -189.51144409179688, "loss": 0.4239, "rewards/chosen": 0.36146578192710876, "step": 6021 }, { "epoch": 0.31919009885246336, "grad_norm": 43.5, "kl": 0.06305408477783203, "learning_rate": 5e-07, "logits/chosen": -26185232.0, "logits/rejected": -7094117.0, "logps/chosen": -284.5411376953125, "logps/rejected": -200.462890625, "loss": 0.3224, "rewards/chosen": 0.6918808817863464, "rewards/margins": 1.6791833639144897, "rewards/rejected": -0.9873024821281433, "step": 6022 }, { "epoch": 0.3192431028542655, "grad_norm": 47.75, "kl": 0.5490608215332031, "learning_rate": 5e-07, "logits/chosen": -20549798.666666668, "logits/rejected": -24576756.8, "logps/chosen": -80.58742268880208, "logps/rejected": -300.9766357421875, "loss": 0.2715, "rewards/chosen": 0.046720887223879494, "rewards/margins": 2.4681888590256373, "rewards/rejected": -2.421467971801758, "step": 6023 }, { "epoch": 0.31929610685606763, "grad_norm": 40.75, "kl": 0.7025299072265625, "learning_rate": 5e-07, "logits/chosen": -11935712.0, "logits/rejected": -49236688.0, "logps/chosen": -161.63436889648438, "logps/rejected": -287.5660705566406, "loss": 0.3083, "rewards/chosen": 0.2500210106372833, "rewards/margins": 2.0447883903980255, "rewards/rejected": -1.7947673797607422, "step": 6024 }, { "epoch": 0.31934911085786977, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50452716.8, "logits/rejected": 5589357.333333333, "logps/chosen": -343.60400390625, "logps/rejected": -129.99874877929688, "loss": 0.4361, "rewards/chosen": -0.059719234704971313, "rewards/margins": 0.8244912723700205, "rewards/rejected": -0.8842105070749918, "step": 6025 }, { "epoch": 0.3194021148596719, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19773924.0, "logits/rejected": -28702070.0, "logps/chosen": -290.16986083984375, "logps/rejected": -398.5351867675781, "loss": 0.2961, "rewards/chosen": -0.05507071316242218, "rewards/margins": 2.4819790571928024, "rewards/rejected": -2.5370497703552246, "step": 6026 }, { "epoch": 0.31945511886147404, "grad_norm": 46.25, "kl": 2.680347442626953, "learning_rate": 5e-07, "logits/chosen": -13599525.0, "logits/rejected": -44712744.0, "logps/chosen": -413.9742126464844, "logps/rejected": -611.703125, "loss": 0.2075, "rewards/chosen": 0.9900608062744141, "rewards/margins": 3.8257670402526855, "rewards/rejected": -2.8357062339782715, "step": 6027 }, { "epoch": 0.3195081228632762, "grad_norm": 44.25, "kl": 0.19073104858398438, "learning_rate": 5e-07, "logits/chosen": 4762509.0, "logits/rejected": -23245930.666666668, "logps/chosen": -466.63946533203125, "logps/rejected": -235.2813720703125, "loss": 0.1832, "rewards/chosen": 1.5587280988693237, "rewards/margins": 3.3168081045150757, "rewards/rejected": -1.758080005645752, "step": 6028 }, { "epoch": 0.3195611268650783, "grad_norm": 44.5, "kl": 1.2651824951171875, "learning_rate": 5e-07, "logits/chosen": -13129310.0, "logits/rejected": -7466698.0, "logps/chosen": -197.38992309570312, "logps/rejected": -123.57463073730469, "loss": 0.2918, "rewards/chosen": 0.39982399344444275, "rewards/margins": 1.7589325606822968, "rewards/rejected": -1.359108567237854, "step": 6029 }, { "epoch": 0.31961413086688045, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31677912.0, "logits/rejected": 12683256.0, "logps/chosen": -311.486328125, "logps/rejected": -312.9615478515625, "loss": 0.3577, "rewards/chosen": 0.06436023861169815, "rewards/margins": 1.5527546182274818, "rewards/rejected": -1.4883943796157837, "step": 6030 }, { "epoch": 0.3196671348686826, "grad_norm": 44.75, "kl": 0.6062660217285156, "learning_rate": 5e-07, "logits/chosen": -12936302.666666666, "logits/rejected": -20780224.0, "logps/chosen": -190.632080078125, "logps/rejected": -154.4271728515625, "loss": 0.2977, "rewards/chosen": 0.48975904782613117, "rewards/margins": 1.9697446664174396, "rewards/rejected": -1.4799856185913085, "step": 6031 }, { "epoch": 0.3197201388704847, "grad_norm": 33.25, "kl": 1.2070045471191406, "learning_rate": 5e-07, "logits/chosen": -2454267.3333333335, "logits/rejected": -27978137.6, "logps/chosen": -229.36175537109375, "logps/rejected": -359.1576416015625, "loss": 0.26, "rewards/chosen": 0.8544634977976481, "rewards/margins": 2.57678329149882, "rewards/rejected": -1.7223197937011718, "step": 6032 }, { "epoch": 0.31977314287228686, "grad_norm": 52.25, "kl": 1.0140762329101562, "learning_rate": 5e-07, "logits/chosen": -53971000.0, "logits/rejected": 13746937.0, "logps/chosen": -264.8092346191406, "logps/rejected": -215.95838928222656, "loss": 0.2986, "rewards/chosen": 0.8312975168228149, "rewards/margins": 2.1543660163879395, "rewards/rejected": -1.3230684995651245, "step": 6033 }, { "epoch": 0.319826146874089, "grad_norm": 61.25, "kl": 0.9000587463378906, "learning_rate": 5e-07, "logits/chosen": -44859884.0, "logits/rejected": 3130128.0, "logps/chosen": -160.14027404785156, "logps/rejected": -430.6534118652344, "loss": 0.3488, "rewards/chosen": -0.07294464111328125, "rewards/margins": 1.6189292669296265, "rewards/rejected": -1.6918739080429077, "step": 6034 }, { "epoch": 0.31987915087589114, "grad_norm": 57.0, "kl": 0.21837234497070312, "learning_rate": 5e-07, "logits/chosen": 39292485.333333336, "logits/rejected": -41810736.0, "logps/chosen": -553.5129801432291, "logps/rejected": -170.76145935058594, "loss": 0.3288, "rewards/chosen": 0.6555865208307902, "rewards/margins": 2.437776486078898, "rewards/rejected": -1.782189965248108, "step": 6035 }, { "epoch": 0.3199321548776933, "grad_norm": 60.5, "kl": 0.1126413345336914, "learning_rate": 5e-07, "logits/chosen": -38091356.8, "logits/rejected": -51957210.666666664, "logps/chosen": -256.2555419921875, "logps/rejected": -418.6214599609375, "loss": 0.3825, "rewards/chosen": 0.006632077693939209, "rewards/margins": 2.5592528065045674, "rewards/rejected": -2.5526207288106284, "step": 6036 }, { "epoch": 0.3199851588794954, "grad_norm": 51.75, "kl": 0.7165431976318359, "learning_rate": 5e-07, "logits/chosen": -35111596.8, "logits/rejected": -20522133.333333332, "logps/chosen": -311.770849609375, "logps/rejected": -300.6843668619792, "loss": 0.3909, "rewards/chosen": 0.17061431407928468, "rewards/margins": 1.6252888758977253, "rewards/rejected": -1.4546745618184407, "step": 6037 }, { "epoch": 0.32003816288129755, "grad_norm": 59.25, "kl": 0.5137052536010742, "learning_rate": 5e-07, "logits/chosen": -28610124.8, "logits/rejected": -13145182.666666666, "logps/chosen": -339.13876953125, "logps/rejected": -400.4773763020833, "loss": 0.3022, "rewards/chosen": 0.7681822299957275, "rewards/margins": 1.9300190130869548, "rewards/rejected": -1.1618367830912273, "step": 6038 }, { "epoch": 0.3200911668830997, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24371080.0, "logits/rejected": -5187104.8, "logps/chosen": -179.75201416015625, "logps/rejected": -279.1353759765625, "loss": 0.2537, "rewards/chosen": 0.6131014426549276, "rewards/margins": 2.2729876120885213, "rewards/rejected": -1.6598861694335938, "step": 6039 }, { "epoch": 0.3201441708849018, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71000368.0, "logits/rejected": 106322496.0, "logps/chosen": -306.28452555338544, "logps/rejected": -271.36806640625, "loss": 0.376, "rewards/chosen": -0.02447916567325592, "rewards/margins": 0.8924774497747421, "rewards/rejected": -0.916956615447998, "step": 6040 }, { "epoch": 0.32019717488670396, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37347124.0, "logits/rejected": -6799303.0, "logps/chosen": -584.6138916015625, "logps/rejected": -208.0201873779297, "loss": 0.3057, "rewards/chosen": 0.48643988370895386, "rewards/margins": 2.430152952671051, "rewards/rejected": -1.9437130689620972, "step": 6041 }, { "epoch": 0.3202501788885061, "grad_norm": 52.5, "kl": 0.6371498107910156, "learning_rate": 5e-07, "logits/chosen": -72860480.0, "logits/rejected": -80812842.66666667, "logps/chosen": -313.909765625, "logps/rejected": -484.8074137369792, "loss": 0.34, "rewards/chosen": 0.40048608779907224, "rewards/margins": 2.039393679300944, "rewards/rejected": -1.6389075915018718, "step": 6042 }, { "epoch": 0.32030318289030824, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23158012.0, "logits/rejected": -27699710.0, "logps/chosen": -377.2748107910156, "logps/rejected": -262.81134033203125, "loss": 0.2048, "rewards/chosen": 1.1405479907989502, "rewards/margins": 2.9568533897399902, "rewards/rejected": -1.81630539894104, "step": 6043 }, { "epoch": 0.3203561868921104, "grad_norm": 39.0, "kl": 3.726469039916992, "learning_rate": 5e-07, "logits/chosen": -40879811.2, "logits/rejected": -1688518.8333333333, "logps/chosen": -529.990869140625, "logps/rejected": -85.3967793782552, "loss": 0.4258, "rewards/chosen": 0.7630845069885254, "rewards/margins": 1.527303949991862, "rewards/rejected": -0.7642194430033366, "step": 6044 }, { "epoch": 0.3204091908939125, "grad_norm": 55.5, "kl": 0.5241317749023438, "learning_rate": 5e-07, "logits/chosen": -75003609.6, "logits/rejected": 66661701.333333336, "logps/chosen": -351.6672607421875, "logps/rejected": -438.2600911458333, "loss": 0.2611, "rewards/chosen": 0.5001104831695556, "rewards/margins": 4.005667861302694, "rewards/rejected": -3.505557378133138, "step": 6045 }, { "epoch": 0.32046219489571465, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24224277.333333332, "logits/rejected": -3192830.4, "logps/chosen": -220.1445109049479, "logps/rejected": -141.134619140625, "loss": 0.3063, "rewards/chosen": -0.1648005247116089, "rewards/margins": 1.4898106813430787, "rewards/rejected": -1.6546112060546876, "step": 6046 }, { "epoch": 0.3205151988975168, "grad_norm": 47.25, "kl": 0.3164100646972656, "learning_rate": 5e-07, "logits/chosen": -49701592.0, "logits/rejected": -9891484.666666666, "logps/chosen": -321.8162536621094, "logps/rejected": -235.59989420572916, "loss": 0.1495, "rewards/chosen": 0.8120735287666321, "rewards/margins": 3.0982025265693665, "rewards/rejected": -2.2861289978027344, "step": 6047 }, { "epoch": 0.3205682028993189, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29259274.666666668, "logits/rejected": -26316846.4, "logps/chosen": -353.5924479166667, "logps/rejected": -255.0147705078125, "loss": 0.2688, "rewards/chosen": 0.3462056318918864, "rewards/margins": 2.047276226679484, "rewards/rejected": -1.7010705947875977, "step": 6048 }, { "epoch": 0.32062120690112106, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47574924.0, "logits/rejected": -34235776.0, "logps/chosen": -788.029541015625, "logps/rejected": -269.08221435546875, "loss": 0.2508, "rewards/chosen": 0.47368353605270386, "rewards/margins": 2.7643575072288513, "rewards/rejected": -2.2906739711761475, "step": 6049 }, { "epoch": 0.3206742109029232, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69100144.0, "logits/rejected": 15928346.0, "logps/chosen": -502.01641845703125, "logps/rejected": -207.21719360351562, "loss": 0.3531, "rewards/chosen": 0.11806488037109375, "rewards/margins": 1.7996411323547363, "rewards/rejected": -1.6815762519836426, "step": 6050 }, { "epoch": 0.32072721490472533, "grad_norm": 53.25, "kl": 0.44841766357421875, "learning_rate": 5e-07, "logits/chosen": -26663433.6, "logits/rejected": -70854016.0, "logps/chosen": -458.984326171875, "logps/rejected": -496.9582112630208, "loss": 0.3138, "rewards/chosen": 0.2878873825073242, "rewards/margins": 2.913361358642578, "rewards/rejected": -2.625473976135254, "step": 6051 }, { "epoch": 0.32078021890652747, "grad_norm": 33.5, "kl": 1.0503644943237305, "learning_rate": 5e-07, "logits/chosen": -19686472.0, "logits/rejected": -42415568.0, "logps/chosen": -142.18386840820312, "logps/rejected": -330.4869689941406, "loss": 0.3012, "rewards/chosen": -0.2021968960762024, "rewards/margins": 2.8894347548484802, "rewards/rejected": -3.0916316509246826, "step": 6052 }, { "epoch": 0.32083322290832955, "grad_norm": 59.75, "kl": 2.792713165283203, "learning_rate": 5e-07, "logits/chosen": -19240700.0, "logits/rejected": -2978937.5, "logps/chosen": -301.5525716145833, "logps/rejected": -120.17890930175781, "loss": 0.3862, "rewards/chosen": 0.49688422679901123, "rewards/margins": 2.2176986932754517, "rewards/rejected": -1.7208144664764404, "step": 6053 }, { "epoch": 0.3208862269101317, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8299669.333333333, "logits/rejected": -5475437.5, "logps/chosen": -223.78385416666666, "logps/rejected": -186.09884643554688, "loss": 0.4113, "rewards/chosen": 0.05074283480644226, "rewards/margins": 1.7078447043895721, "rewards/rejected": -1.6571018695831299, "step": 6054 }, { "epoch": 0.3209392309119338, "grad_norm": 50.25, "kl": 1.4132423400878906, "learning_rate": 5e-07, "logits/chosen": -30697977.6, "logits/rejected": -2055325.3333333333, "logps/chosen": -292.661328125, "logps/rejected": -465.6688232421875, "loss": 0.3495, "rewards/chosen": 0.08371360301971435, "rewards/margins": 2.7581098000208537, "rewards/rejected": -2.674396197001139, "step": 6055 }, { "epoch": 0.32099223491373596, "grad_norm": 49.25, "kl": 1.9228935241699219, "learning_rate": 5e-07, "logits/chosen": -43064816.0, "logits/rejected": -23864974.0, "logps/chosen": -359.9185791015625, "logps/rejected": -305.53167724609375, "loss": 0.2832, "rewards/chosen": 0.5027916431427002, "rewards/margins": 2.8161532878875732, "rewards/rejected": -2.313361644744873, "step": 6056 }, { "epoch": 0.3210452389155381, "grad_norm": 42.25, "kl": 1.8980255126953125, "learning_rate": 5e-07, "logits/chosen": -35275545.6, "logits/rejected": -61314149.333333336, "logps/chosen": -308.6837890625, "logps/rejected": -200.66068522135416, "loss": 0.297, "rewards/chosen": 0.6613657474517822, "rewards/margins": 2.6789206663767495, "rewards/rejected": -2.0175549189249673, "step": 6057 }, { "epoch": 0.32109824291734024, "grad_norm": 59.75, "kl": 0.5945587158203125, "learning_rate": 5e-07, "logits/chosen": -14185476.8, "logits/rejected": -11975902.666666666, "logps/chosen": -307.564794921875, "logps/rejected": -307.6935221354167, "loss": 0.3388, "rewards/chosen": 0.32882306575775144, "rewards/margins": 2.0825660785039264, "rewards/rejected": -1.753743012746175, "step": 6058 }, { "epoch": 0.3211512469191424, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21394347.2, "logits/rejected": -9819277.333333334, "logps/chosen": -310.379833984375, "logps/rejected": -138.4164021809896, "loss": 0.3846, "rewards/chosen": 0.1310065507888794, "rewards/margins": 1.4772011359532673, "rewards/rejected": -1.346194585164388, "step": 6059 }, { "epoch": 0.3212042509209445, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38094457.6, "logits/rejected": -38916573.333333336, "logps/chosen": -278.58330078125, "logps/rejected": -235.96464029947916, "loss": 0.3899, "rewards/chosen": -0.12221379280090332, "rewards/margins": 1.495624589920044, "rewards/rejected": -1.6178383827209473, "step": 6060 }, { "epoch": 0.32125725492274665, "grad_norm": 50.25, "kl": 1.2061529159545898, "learning_rate": 5e-07, "logits/chosen": -7430378.4, "logits/rejected": -11077432.0, "logps/chosen": -219.4241455078125, "logps/rejected": -195.4964803059896, "loss": 0.2737, "rewards/chosen": 0.9110700607299804, "rewards/margins": 2.9435275713602698, "rewards/rejected": -2.0324575106302896, "step": 6061 }, { "epoch": 0.3213102589245488, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12988855.0, "logits/rejected": -25379490.0, "logps/chosen": -398.6466064453125, "logps/rejected": -256.0443420410156, "loss": 0.2369, "rewards/chosen": 0.7803200483322144, "rewards/margins": 2.6122708320617676, "rewards/rejected": -1.8319507837295532, "step": 6062 }, { "epoch": 0.3213632629263509, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22773984.0, "logits/rejected": -24666260.8, "logps/chosen": -327.8931477864583, "logps/rejected": -399.16708984375, "loss": 0.2883, "rewards/chosen": -0.3730509678522746, "rewards/margins": 2.1269705851872764, "rewards/rejected": -2.500021553039551, "step": 6063 }, { "epoch": 0.32141626692815306, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12792764.0, "logits/rejected": -13098930.0, "logps/chosen": -297.10595703125, "logps/rejected": -106.9823989868164, "loss": 0.3613, "rewards/chosen": -0.01153092086315155, "rewards/margins": 1.2601862698793411, "rewards/rejected": -1.2717171907424927, "step": 6064 }, { "epoch": 0.3214692709299552, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19921244.8, "logits/rejected": -7891629.333333333, "logps/chosen": -345.8737060546875, "logps/rejected": -266.3468831380208, "loss": 0.1848, "rewards/chosen": 1.2901501655578613, "rewards/margins": 3.4568289120992026, "rewards/rejected": -2.1666787465413413, "step": 6065 }, { "epoch": 0.32152227493175733, "grad_norm": 52.5, "kl": 0.10393905639648438, "learning_rate": 5e-07, "logits/chosen": -23851662.4, "logits/rejected": -68388037.33333333, "logps/chosen": -121.3630126953125, "logps/rejected": -411.6214192708333, "loss": 0.4608, "rewards/chosen": -0.31024537086486814, "rewards/margins": 1.0019315719604491, "rewards/rejected": -1.3121769428253174, "step": 6066 }, { "epoch": 0.32157527893355947, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7707206.0, "logits/rejected": -53636132.0, "logps/chosen": -243.7665252685547, "logps/rejected": -351.0745544433594, "loss": 0.3079, "rewards/chosen": 0.5313066840171814, "rewards/margins": 2.3549044728279114, "rewards/rejected": -1.82359778881073, "step": 6067 }, { "epoch": 0.3216282829353616, "grad_norm": 57.0, "kl": 1.9485530853271484, "learning_rate": 5e-07, "logits/chosen": -16949525.333333332, "logits/rejected": -16429454.0, "logps/chosen": -325.94512939453125, "logps/rejected": -498.28033447265625, "loss": 0.3376, "rewards/chosen": 0.580078919728597, "rewards/margins": 3.408217748006185, "rewards/rejected": -2.828138828277588, "step": 6068 }, { "epoch": 0.32168128693716375, "grad_norm": 54.25, "kl": 1.0252647399902344, "learning_rate": 5e-07, "logits/chosen": -34930650.666666664, "logits/rejected": -44248272.0, "logps/chosen": -354.398193359375, "logps/rejected": -414.22344970703125, "loss": 0.3875, "rewards/chosen": 0.13554482658704123, "rewards/margins": 2.1855922242005668, "rewards/rejected": -2.0500473976135254, "step": 6069 }, { "epoch": 0.3217342909389659, "grad_norm": 45.25, "kl": 0.44879913330078125, "learning_rate": 5e-07, "logits/chosen": -36137480.0, "logits/rejected": -35895824.0, "logps/chosen": -257.7208658854167, "logps/rejected": -342.1517028808594, "loss": 0.4549, "rewards/chosen": -0.3232462406158447, "rewards/margins": 2.1833789348602295, "rewards/rejected": -2.506625175476074, "step": 6070 }, { "epoch": 0.321787294940768, "grad_norm": 72.0, "kl": 0.15021896362304688, "learning_rate": 5e-07, "logits/chosen": -34467110.85714286, "logits/rejected": -117373072.0, "logps/chosen": -362.1599818638393, "logps/rejected": -649.3961181640625, "loss": 0.3661, "rewards/chosen": 0.3585179192679269, "rewards/margins": 4.125095163072858, "rewards/rejected": -3.7665772438049316, "step": 6071 }, { "epoch": 0.32184029894257016, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32355568.0, "logits/rejected": 11105508.0, "logps/chosen": -312.443603515625, "logps/rejected": -227.72039794921875, "loss": 0.3623, "rewards/chosen": 0.33358816305796307, "rewards/margins": 2.0732891956965127, "rewards/rejected": -1.7397010326385498, "step": 6072 }, { "epoch": 0.3218933029443723, "grad_norm": 54.0, "kl": 0.33058929443359375, "learning_rate": 5e-07, "logits/chosen": -55783344.0, "logits/rejected": -28918822.0, "logps/chosen": -176.48573303222656, "logps/rejected": -301.1932678222656, "loss": 0.3196, "rewards/chosen": 0.180781289935112, "rewards/margins": 2.0783261507749557, "rewards/rejected": -1.8975448608398438, "step": 6073 }, { "epoch": 0.32194630694617443, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17388608.0, "logits/rejected": -28276506.0, "logps/chosen": -219.5428466796875, "logps/rejected": -291.3138732910156, "loss": 0.3776, "rewards/chosen": 0.20225600401560465, "rewards/margins": 1.917924443880717, "rewards/rejected": -1.7156684398651123, "step": 6074 }, { "epoch": 0.32199931094797657, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12825952.0, "logits/rejected": -25270714.666666668, "logps/chosen": -247.96456909179688, "logps/rejected": -300.0157063802083, "loss": 0.2567, "rewards/chosen": 0.11164245754480362, "rewards/margins": 1.738959568242232, "rewards/rejected": -1.6273171106974285, "step": 6075 }, { "epoch": 0.3220523149497787, "grad_norm": 37.75, "kl": 1.0578384399414062, "learning_rate": 5e-07, "logits/chosen": -16005009.6, "logits/rejected": -48144800.0, "logps/chosen": -206.7466552734375, "logps/rejected": -543.745849609375, "loss": 0.3216, "rewards/chosen": 0.29161646366119387, "rewards/margins": 3.286033018430074, "rewards/rejected": -2.9944165547688804, "step": 6076 }, { "epoch": 0.32210531895158084, "grad_norm": 45.75, "kl": 1.1821966171264648, "learning_rate": 5e-07, "logits/chosen": -50570108.0, "logits/rejected": 816562.875, "logps/chosen": -293.6562805175781, "logps/rejected": -50.30332946777344, "loss": 0.3651, "rewards/chosen": 0.234200119972229, "rewards/margins": 1.6478383541107178, "rewards/rejected": -1.4136382341384888, "step": 6077 }, { "epoch": 0.322158322953383, "grad_norm": 52.25, "kl": 0.7226295471191406, "learning_rate": 5e-07, "logits/chosen": -10220941.333333334, "logits/rejected": -26932850.0, "logps/chosen": -283.07362874348956, "logps/rejected": -375.79559326171875, "loss": 0.4016, "rewards/chosen": 0.17311634620030722, "rewards/margins": 2.2021111448605857, "rewards/rejected": -2.0289947986602783, "step": 6078 }, { "epoch": 0.3222113269551851, "grad_norm": 53.5, "kl": 1.1109962463378906, "learning_rate": 5e-07, "logits/chosen": -46515584.0, "logits/rejected": -53955993.6, "logps/chosen": -737.990966796875, "logps/rejected": -482.10205078125, "loss": 0.1963, "rewards/chosen": 0.6513305902481079, "rewards/margins": 3.7871910333633423, "rewards/rejected": -3.1358604431152344, "step": 6079 }, { "epoch": 0.32226433095698725, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30170677.333333332, "logits/rejected": -90574688.0, "logps/chosen": -150.93302408854166, "logps/rejected": -157.2498779296875, "loss": 0.4533, "rewards/chosen": 0.07567850748697917, "rewards/margins": 0.6945761243502299, "rewards/rejected": -0.6188976168632507, "step": 6080 }, { "epoch": 0.3223173349587894, "grad_norm": 57.75, "kl": 0.205474853515625, "learning_rate": 5e-07, "logits/chosen": -29519768.0, "logits/rejected": -277065.5, "logps/chosen": -434.2492370605469, "logps/rejected": -397.9523010253906, "loss": 0.3326, "rewards/chosen": 0.24768680334091187, "rewards/margins": 1.54683917760849, "rewards/rejected": -1.2991523742675781, "step": 6081 }, { "epoch": 0.32237033896059153, "grad_norm": 78.0, "kl": 0.30416107177734375, "learning_rate": 5e-07, "logits/chosen": 1262345.0, "logits/rejected": -15512969.0, "logps/chosen": -430.008056640625, "logps/rejected": -329.92718505859375, "loss": 0.416, "rewards/chosen": 0.021751026312510174, "rewards/margins": 1.715815047423045, "rewards/rejected": -1.6940640211105347, "step": 6082 }, { "epoch": 0.32242334296239367, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -480318.8125, "logits/rejected": -32503722.666666668, "logps/chosen": -70.25669860839844, "logps/rejected": -558.1506754557291, "loss": 0.2531, "rewards/chosen": 0.00678291916847229, "rewards/margins": 2.1444815695285797, "rewards/rejected": -2.1376986503601074, "step": 6083 }, { "epoch": 0.3224763469641958, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50985440.0, "logits/rejected": -19952236.8, "logps/chosen": -237.66288248697916, "logps/rejected": -222.52177734375, "loss": 0.3298, "rewards/chosen": -0.19042988618214926, "rewards/margins": 1.4106784741083782, "rewards/rejected": -1.6011083602905274, "step": 6084 }, { "epoch": 0.32252935096599794, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19804233.333333332, "logits/rejected": -22426504.0, "logps/chosen": -183.13297526041666, "logps/rejected": -334.625341796875, "loss": 0.2323, "rewards/chosen": 0.6337970495223999, "rewards/margins": 2.8212058782577514, "rewards/rejected": -2.1874088287353515, "step": 6085 }, { "epoch": 0.3225823549678001, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 41704232.0, "logits/rejected": -36769491.2, "logps/chosen": -353.0163981119792, "logps/rejected": -355.191796875, "loss": 0.3182, "rewards/chosen": -0.5819176832834879, "rewards/margins": 1.35799511273702, "rewards/rejected": -1.9399127960205078, "step": 6086 }, { "epoch": 0.3226353589696022, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1394107.125, "logits/rejected": -16074688.0, "logps/chosen": -370.6015625, "logps/rejected": -346.38828822544644, "loss": 0.2667, "rewards/chosen": -0.7835022211074829, "rewards/margins": 1.03220157963889, "rewards/rejected": -1.8157038007463728, "step": 6087 }, { "epoch": 0.32268836297140435, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6579044.0, "logits/rejected": -14842726.4, "logps/chosen": -180.1016845703125, "logps/rejected": -201.0635986328125, "loss": 0.2046, "rewards/chosen": 1.0791651407877605, "rewards/margins": 2.856648508707682, "rewards/rejected": -1.7774833679199218, "step": 6088 }, { "epoch": 0.3227413669732065, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -22424126.0, "logps/rejected": -302.6298522949219, "loss": 0.2556, "rewards/rejected": -1.4386037588119507, "step": 6089 }, { "epoch": 0.3227943709750086, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33333794.666666668, "logits/rejected": -7124600.0, "logps/chosen": -259.1496175130208, "logps/rejected": -300.2985107421875, "loss": 0.3566, "rewards/chosen": -0.5861395200093588, "rewards/margins": 1.1193294207255047, "rewards/rejected": -1.7054689407348633, "step": 6090 }, { "epoch": 0.32284737497681076, "grad_norm": 53.5, "kl": 1.2809677124023438, "learning_rate": 5e-07, "logits/chosen": -38675366.4, "logits/rejected": -55560064.0, "logps/chosen": -524.680224609375, "logps/rejected": -570.4552815755209, "loss": 0.2731, "rewards/chosen": 0.5533850193023682, "rewards/margins": 4.103895743687948, "rewards/rejected": -3.5505107243855796, "step": 6091 }, { "epoch": 0.3229003789786129, "grad_norm": 48.25, "kl": 0.3165283203125, "learning_rate": 5e-07, "logits/chosen": -558367.2, "logits/rejected": -20780405.333333332, "logps/chosen": -217.9962890625, "logps/rejected": -334.4168701171875, "loss": 0.3466, "rewards/chosen": 0.2868096351623535, "rewards/margins": 1.9787753740946452, "rewards/rejected": -1.6919657389322917, "step": 6092 }, { "epoch": 0.32295338298041504, "grad_norm": 28.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3170083.0, "logits/rejected": -30397533.714285713, "logps/chosen": -34.045249938964844, "logps/rejected": -341.73311941964283, "loss": 0.1939, "rewards/chosen": 0.5782257318496704, "rewards/margins": 2.566906980105809, "rewards/rejected": -1.9886812482561385, "step": 6093 }, { "epoch": 0.3230063869822172, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10328853.333333334, "logits/rejected": -6188934.4, "logps/chosen": -218.310546875, "logps/rejected": -290.051806640625, "loss": 0.2492, "rewards/chosen": 0.6400948762893677, "rewards/margins": 2.3099707841873167, "rewards/rejected": -1.6698759078979493, "step": 6094 }, { "epoch": 0.3230593909840193, "grad_norm": 54.5, "kl": 0.52392578125, "learning_rate": 5e-07, "logits/chosen": -41496867.2, "logits/rejected": -14453090.666666666, "logps/chosen": -426.394384765625, "logps/rejected": -78.21014404296875, "loss": 0.3658, "rewards/chosen": 0.40305342674255373, "rewards/margins": 1.4541516145070394, "rewards/rejected": -1.0510981877644856, "step": 6095 }, { "epoch": 0.32311239498582145, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25855710.0, "logits/rejected": -14456822.666666666, "logps/chosen": -252.44061279296875, "logps/rejected": -234.0595703125, "loss": 0.2052, "rewards/chosen": 0.29303380846977234, "rewards/margins": 2.2669921418031054, "rewards/rejected": -1.9739583333333333, "step": 6096 }, { "epoch": 0.3231653989876236, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15565281.6, "logits/rejected": -35379208.0, "logps/chosen": -253.619482421875, "logps/rejected": -247.070068359375, "loss": 0.318, "rewards/chosen": 0.36243011951446535, "rewards/margins": 2.1806750535964965, "rewards/rejected": -1.8182449340820312, "step": 6097 }, { "epoch": 0.3232184029894257, "grad_norm": 44.75, "kl": 0.2524070739746094, "learning_rate": 5e-07, "logits/chosen": -34894733.333333336, "logits/rejected": -34174867.2, "logps/chosen": -200.94344075520834, "logps/rejected": -416.456298828125, "loss": 0.2305, "rewards/chosen": -0.03253987431526184, "rewards/margins": 2.9886984527111053, "rewards/rejected": -3.021238327026367, "step": 6098 }, { "epoch": 0.32327140699122786, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21716812.0, "logits/rejected": -4546504.0, "logps/chosen": -289.37847900390625, "logps/rejected": -199.9157918294271, "loss": 0.2427, "rewards/chosen": 0.8512749075889587, "rewards/margins": 2.3734702467918396, "rewards/rejected": -1.5221953392028809, "step": 6099 }, { "epoch": 0.32332441099303, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77631784.0, "logits/rejected": -24490050.666666668, "logps/chosen": -409.01617431640625, "logps/rejected": -460.3323160807292, "loss": 0.1927, "rewards/chosen": 0.5432747006416321, "rewards/margins": 3.2590524156888327, "rewards/rejected": -2.7157777150472007, "step": 6100 }, { "epoch": 0.32337741499483214, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23777242.666666668, "logits/rejected": -13275373.6, "logps/chosen": -77.05006917317708, "logps/rejected": -239.3880859375, "loss": 0.2756, "rewards/chosen": 0.26918234427769977, "rewards/margins": 2.0357646425565084, "rewards/rejected": -1.7665822982788086, "step": 6101 }, { "epoch": 0.3234304189966343, "grad_norm": 40.25, "kl": 0.06641960144042969, "learning_rate": 5e-07, "logits/chosen": -46226988.8, "logits/rejected": -12418077.333333334, "logps/chosen": -243.47900390625, "logps/rejected": -304.4274088541667, "loss": 0.2801, "rewards/chosen": 0.8235555648803711, "rewards/margins": 2.409196313222249, "rewards/rejected": -1.5856407483418782, "step": 6102 }, { "epoch": 0.3234834229984364, "grad_norm": 43.25, "kl": 0.7041702270507812, "learning_rate": 5e-07, "logits/chosen": -13540288.0, "logits/rejected": -22101074.666666668, "logps/chosen": -282.432080078125, "logps/rejected": -503.5611165364583, "loss": 0.1801, "rewards/chosen": 1.1545236587524415, "rewards/margins": 3.817096837361654, "rewards/rejected": -2.6625731786092124, "step": 6103 }, { "epoch": 0.3235364270002385, "grad_norm": 51.5, "kl": 0.9368782043457031, "learning_rate": 5e-07, "logits/chosen": -58618656.0, "logits/rejected": -5216748.0, "logps/chosen": -414.5692138671875, "logps/rejected": -141.66357421875, "loss": 0.3335, "rewards/chosen": 0.9510712027549744, "rewards/margins": 1.518908977508545, "rewards/rejected": -0.5678377747535706, "step": 6104 }, { "epoch": 0.32358943100204063, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41123013.333333336, "logits/rejected": -23209160.0, "logps/chosen": -365.4974772135417, "logps/rejected": -312.8321533203125, "loss": 0.3348, "rewards/chosen": 0.48947898546854657, "rewards/margins": 2.0040613810221353, "rewards/rejected": -1.5145823955535889, "step": 6105 }, { "epoch": 0.32364243500384277, "grad_norm": 75.0, "kl": 0.5216903686523438, "learning_rate": 5e-07, "logits/chosen": 24697829.333333332, "logits/rejected": -32003528.0, "logps/chosen": -372.4817301432292, "logps/rejected": -455.7669372558594, "loss": 0.4821, "rewards/chosen": -0.35655975341796875, "rewards/margins": 1.659071445465088, "rewards/rejected": -2.0156311988830566, "step": 6106 }, { "epoch": 0.3236954390056449, "grad_norm": 52.5, "kl": 0.4715728759765625, "learning_rate": 5e-07, "logits/chosen": -23616229.333333332, "logits/rejected": -21844724.0, "logps/chosen": -285.1591796875, "logps/rejected": -94.82970428466797, "loss": 0.473, "rewards/chosen": 0.0555310050646464, "rewards/margins": 0.5470246175924937, "rewards/rejected": -0.4914936125278473, "step": 6107 }, { "epoch": 0.32374844300744704, "grad_norm": 40.0, "kl": 0.30238914489746094, "learning_rate": 5e-07, "logits/chosen": -22338404.0, "logits/rejected": -46610052.0, "logps/chosen": -164.38894653320312, "logps/rejected": -290.77801513671875, "loss": 0.3008, "rewards/chosen": 0.24630293250083923, "rewards/margins": 2.035062164068222, "rewards/rejected": -1.7887592315673828, "step": 6108 }, { "epoch": 0.3238014470092492, "grad_norm": 58.5, "kl": 0.07407760620117188, "learning_rate": 5e-07, "logits/chosen": -62206728.0, "logits/rejected": -14756638.0, "logps/chosen": -554.0447387695312, "logps/rejected": -255.30877685546875, "loss": 0.2873, "rewards/chosen": 0.3931834101676941, "rewards/margins": 2.2547412514686584, "rewards/rejected": -1.8615578413009644, "step": 6109 }, { "epoch": 0.3238544510110513, "grad_norm": 55.75, "kl": 3.5287437438964844, "learning_rate": 5e-07, "logits/chosen": -19444467.2, "logits/rejected": 23721109.333333332, "logps/chosen": -501.638037109375, "logps/rejected": -208.26692708333334, "loss": 0.3258, "rewards/chosen": 0.8865208625793457, "rewards/margins": 2.321093400319417, "rewards/rejected": -1.4345725377400715, "step": 6110 }, { "epoch": 0.32390745501285345, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32991152.0, "logits/rejected": -20095385.6, "logps/chosen": -211.5514933268229, "logps/rejected": -238.49423828125, "loss": 0.3162, "rewards/chosen": 0.12164180477460225, "rewards/margins": 1.8582821627457935, "rewards/rejected": -1.7366403579711913, "step": 6111 }, { "epoch": 0.3239604590146556, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13356198.0, "logits/rejected": -26008498.285714287, "logps/chosen": -75.31058502197266, "logps/rejected": -143.39732142857142, "loss": 0.2555, "rewards/chosen": 0.035147856920957565, "rewards/margins": 1.3297141476401262, "rewards/rejected": -1.2945662907191686, "step": 6112 }, { "epoch": 0.3240134630164577, "grad_norm": 47.0, "kl": 1.1209850311279297, "learning_rate": 5e-07, "logits/chosen": -40850956.0, "logits/rejected": -8169998.0, "logps/chosen": -296.2915954589844, "logps/rejected": -156.69903564453125, "loss": 0.312, "rewards/chosen": 0.11156168580055237, "rewards/margins": 2.2474667131900787, "rewards/rejected": -2.1359050273895264, "step": 6113 }, { "epoch": 0.32406646701825986, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33751104.0, "logits/rejected": -48715818.666666664, "logps/chosen": -97.69053649902344, "logps/rejected": -372.7159830729167, "loss": 0.2433, "rewards/chosen": -0.17737369239330292, "rewards/margins": 2.018170232574145, "rewards/rejected": -2.1955439249674478, "step": 6114 }, { "epoch": 0.324119471020062, "grad_norm": 48.25, "kl": 0.5588893890380859, "learning_rate": 5e-07, "logits/chosen": -76120272.0, "logits/rejected": -42192762.666666664, "logps/chosen": -401.2568664550781, "logps/rejected": -297.4807535807292, "loss": 0.2021, "rewards/chosen": 0.6866241693496704, "rewards/margins": 2.4592264095942182, "rewards/rejected": -1.7726022402445476, "step": 6115 }, { "epoch": 0.32417247502186414, "grad_norm": 58.75, "kl": 1.4105148315429688, "learning_rate": 5e-07, "logits/chosen": -42083449.6, "logits/rejected": -8114141.333333333, "logps/chosen": -478.10302734375, "logps/rejected": -150.6368611653646, "loss": 0.3534, "rewards/chosen": 0.5135721206665039, "rewards/margins": 1.16299520333608, "rewards/rejected": -0.649423082669576, "step": 6116 }, { "epoch": 0.3242254790236663, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -151032074.66666666, "logits/rejected": -29979385.6, "logps/chosen": -410.0974527994792, "logps/rejected": -243.4103271484375, "loss": 0.3313, "rewards/chosen": 0.24509811401367188, "rewards/margins": 1.7351237297058106, "rewards/rejected": -1.4900256156921388, "step": 6117 }, { "epoch": 0.3242784830254684, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20276841.6, "logits/rejected": 26651282.666666668, "logps/chosen": -146.97919921875, "logps/rejected": -121.80494181315105, "loss": 0.4198, "rewards/chosen": 0.04071032702922821, "rewards/margins": 0.889828559756279, "rewards/rejected": -0.8491182327270508, "step": 6118 }, { "epoch": 0.32433148702727055, "grad_norm": 39.0, "kl": 1.0517463684082031, "learning_rate": 5e-07, "logits/chosen": -26340464.0, "logits/rejected": -36857040.0, "logps/chosen": -247.63894653320312, "logps/rejected": -399.1024475097656, "loss": 0.2495, "rewards/chosen": 0.6762632131576538, "rewards/margins": 3.093072533607483, "rewards/rejected": -2.416809320449829, "step": 6119 }, { "epoch": 0.3243844910290727, "grad_norm": 43.25, "kl": 0.4514598846435547, "learning_rate": 5e-07, "logits/chosen": -3338233.0, "logits/rejected": -13767766.666666666, "logps/chosen": -23.18415069580078, "logps/rejected": -303.7005208333333, "loss": 0.3233, "rewards/chosen": -0.21631699800491333, "rewards/margins": 1.2737318873405457, "rewards/rejected": -1.490048885345459, "step": 6120 }, { "epoch": 0.3244374950308748, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25138264.0, "logits/rejected": -32762212.0, "logps/chosen": -179.66416931152344, "logps/rejected": -348.8120422363281, "loss": 0.2582, "rewards/chosen": 0.6318801045417786, "rewards/margins": 2.420031726360321, "rewards/rejected": -1.7881516218185425, "step": 6121 }, { "epoch": 0.32449049903267696, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8035696.0, "logits/rejected": 221262464.0, "logps/chosen": -325.8017985026042, "logps/rejected": -551.175341796875, "loss": 0.2758, "rewards/chosen": 0.4013356367746989, "rewards/margins": 2.234527508417765, "rewards/rejected": -1.8331918716430664, "step": 6122 }, { "epoch": 0.3245435030344791, "grad_norm": 55.75, "kl": 2.1351699829101562, "learning_rate": 5e-07, "logits/chosen": -16908425.6, "logits/rejected": -17823129.333333332, "logps/chosen": -327.1095947265625, "logps/rejected": -194.98046875, "loss": 0.3096, "rewards/chosen": 0.6693543910980224, "rewards/margins": 2.0398012320200603, "rewards/rejected": -1.3704468409220378, "step": 6123 }, { "epoch": 0.32459650703628123, "grad_norm": 54.0, "kl": 0.6384754180908203, "learning_rate": 5e-07, "logits/chosen": -14883018.666666666, "logits/rejected": -39013296.0, "logps/chosen": -216.10074869791666, "logps/rejected": -278.9566345214844, "loss": 0.3863, "rewards/chosen": 0.35753687222798664, "rewards/margins": 1.3250783284505208, "rewards/rejected": -0.9675414562225342, "step": 6124 }, { "epoch": 0.32464951103808337, "grad_norm": 70.0, "kl": 0.6189804077148438, "learning_rate": 5e-07, "logits/chosen": -24017316.0, "logits/rejected": -14528714.0, "logps/chosen": -824.193603515625, "logps/rejected": -286.6264953613281, "loss": 0.237, "rewards/chosen": 1.0644439458847046, "rewards/margins": 2.865056276321411, "rewards/rejected": -1.8006123304367065, "step": 6125 }, { "epoch": 0.3247025150398855, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33988808.0, "logits/rejected": -19383008.0, "logps/chosen": -139.1030731201172, "logps/rejected": -404.1094055175781, "loss": 0.3182, "rewards/chosen": -0.05659661069512367, "rewards/margins": 1.9826532863080502, "rewards/rejected": -2.039249897003174, "step": 6126 }, { "epoch": 0.32475551904168765, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41592952.0, "logits/rejected": -27747600.0, "logps/chosen": -160.43238830566406, "logps/rejected": -375.3265380859375, "loss": 0.2996, "rewards/chosen": 0.13658151030540466, "rewards/margins": 2.5525921285152435, "rewards/rejected": -2.416010618209839, "step": 6127 }, { "epoch": 0.3248085230434898, "grad_norm": 54.25, "kl": 0.15599441528320312, "learning_rate": 5e-07, "logits/chosen": -51091653.333333336, "logits/rejected": -11372553.6, "logps/chosen": -401.9679768880208, "logps/rejected": -101.49840698242187, "loss": 0.4077, "rewards/chosen": 0.21377410491307577, "rewards/margins": 0.8004059751828512, "rewards/rejected": -0.5866318702697754, "step": 6128 }, { "epoch": 0.3248615270452919, "grad_norm": 62.25, "kl": 1.3783416748046875, "learning_rate": 5e-07, "logits/chosen": -103260112.0, "logits/rejected": -33599408.0, "logps/chosen": -595.93017578125, "logps/rejected": -239.59945678710938, "loss": 0.3652, "rewards/chosen": 0.45706862211227417, "rewards/margins": 1.363714575767517, "rewards/rejected": -0.9066459536552429, "step": 6129 }, { "epoch": 0.32491453104709406, "grad_norm": 54.75, "kl": 0.4177360534667969, "learning_rate": 5e-07, "logits/chosen": -55543316.0, "logits/rejected": -7523324.0, "logps/chosen": -636.8591918945312, "logps/rejected": -158.4040069580078, "loss": 0.3252, "rewards/chosen": 0.4688999354839325, "rewards/margins": 1.624218076467514, "rewards/rejected": -1.1553181409835815, "step": 6130 }, { "epoch": 0.3249675350488962, "grad_norm": 45.5, "kl": 1.861358642578125, "learning_rate": 5e-07, "logits/chosen": -8927038.0, "logits/rejected": -10973024.0, "logps/chosen": -294.8893229166667, "logps/rejected": -579.3858642578125, "loss": 0.3374, "rewards/chosen": 0.6378093560536703, "rewards/margins": 2.4232580264409385, "rewards/rejected": -1.785448670387268, "step": 6131 }, { "epoch": 0.32502053905069833, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -160303.0, "logits/rejected": -47622912.0, "logps/chosen": -202.61672973632812, "logps/rejected": -269.34912109375, "loss": 0.3314, "rewards/chosen": 0.18506576120853424, "rewards/margins": 1.5915963798761368, "rewards/rejected": -1.4065306186676025, "step": 6132 }, { "epoch": 0.32507354305250047, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48305104.0, "logits/rejected": -34746956.0, "logps/chosen": -446.04693603515625, "logps/rejected": -428.53497314453125, "loss": 0.3079, "rewards/chosen": 0.4026588499546051, "rewards/margins": 2.5092460215091705, "rewards/rejected": -2.1065871715545654, "step": 6133 }, { "epoch": 0.3251265470543026, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18391257.333333332, "logits/rejected": -22227588.0, "logps/chosen": -337.2364908854167, "logps/rejected": -516.542724609375, "loss": 0.3541, "rewards/chosen": 0.22960142294565836, "rewards/margins": 3.084835092226664, "rewards/rejected": -2.855233669281006, "step": 6134 }, { "epoch": 0.32517955105610474, "grad_norm": 113.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12112772.0, "logits/rejected": 41575094.85714286, "logps/chosen": -394.47662353515625, "logps/rejected": -489.05723353794644, "loss": 0.2995, "rewards/chosen": 0.3573852479457855, "rewards/margins": 1.481487295457295, "rewards/rejected": -1.1241020475115096, "step": 6135 }, { "epoch": 0.3252325550579069, "grad_norm": 41.75, "kl": 1.5156736373901367, "learning_rate": 5e-07, "logits/chosen": -13843600.0, "logits/rejected": -26001692.0, "logps/chosen": -268.26318359375, "logps/rejected": -306.95819091796875, "loss": 0.2957, "rewards/chosen": 0.6114313006401062, "rewards/margins": 2.497508466243744, "rewards/rejected": -1.8860771656036377, "step": 6136 }, { "epoch": 0.325285559059709, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50472186.666666664, "logits/rejected": 3457547.5, "logps/chosen": -284.1527913411458, "logps/rejected": -43.24510192871094, "loss": 0.4071, "rewards/chosen": 0.3428614139556885, "rewards/margins": 0.954033374786377, "rewards/rejected": -0.6111719608306885, "step": 6137 }, { "epoch": 0.32533856306151115, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -427203.25, "logits/rejected": -16405185.333333334, "logps/chosen": -87.39559173583984, "logps/rejected": -309.41135660807294, "loss": 0.1947, "rewards/chosen": 0.6787760853767395, "rewards/margins": 2.946787496407827, "rewards/rejected": -2.2680114110310874, "step": 6138 }, { "epoch": 0.3253915670633133, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7865255.5, "logits/rejected": -35582096.0, "logps/chosen": -242.6686248779297, "logps/rejected": -483.069580078125, "loss": 0.2576, "rewards/chosen": 0.31209975481033325, "rewards/margins": 3.310246765613556, "rewards/rejected": -2.9981470108032227, "step": 6139 }, { "epoch": 0.32544457106511543, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35666480.0, "logits/rejected": 4291656.5, "logps/chosen": -414.7308349609375, "logps/rejected": -367.20086669921875, "loss": 0.325, "rewards/chosen": 0.20663340389728546, "rewards/margins": 1.7087395936250687, "rewards/rejected": -1.5021061897277832, "step": 6140 }, { "epoch": 0.32549757506691757, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4627784.0, "logits/rejected": 12763466.666666666, "logps/chosen": -160.20425415039062, "logps/rejected": -254.75838216145834, "loss": 0.2741, "rewards/chosen": 0.8377875089645386, "rewards/margins": 2.20508070786794, "rewards/rejected": -1.3672931989034016, "step": 6141 }, { "epoch": 0.3255505790687197, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26107732.0, "logits/rejected": -20087184.0, "logps/chosen": -848.7869873046875, "logps/rejected": -197.1910603841146, "loss": 0.3068, "rewards/chosen": 0.39026129245758057, "rewards/margins": 1.481715718905131, "rewards/rejected": -1.0914544264475505, "step": 6142 }, { "epoch": 0.32560358307052184, "grad_norm": 47.0, "kl": 0.1432933807373047, "learning_rate": 5e-07, "logits/chosen": -23615945.6, "logits/rejected": -30730032.0, "logps/chosen": -291.7939208984375, "logps/rejected": -212.26163736979166, "loss": 0.3111, "rewards/chosen": 0.390958309173584, "rewards/margins": 2.381048361460368, "rewards/rejected": -1.990090052286784, "step": 6143 }, { "epoch": 0.325656587072324, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34788080.0, "logits/rejected": -37049900.8, "logps/chosen": -275.3032633463542, "logps/rejected": -280.2042724609375, "loss": 0.2518, "rewards/chosen": 0.3694896697998047, "rewards/margins": 2.3546934127807617, "rewards/rejected": -1.985203742980957, "step": 6144 }, { "epoch": 0.3257095910741261, "grad_norm": 78.5, "kl": 0.5021858215332031, "learning_rate": 5e-07, "logits/chosen": -3334894.0, "logits/rejected": -66567952.0, "logps/chosen": -225.93306477864584, "logps/rejected": -665.7091064453125, "loss": 0.3349, "rewards/chosen": 0.31404372056325275, "rewards/margins": 3.6016597350438437, "rewards/rejected": -3.287616014480591, "step": 6145 }, { "epoch": 0.32576259507592825, "grad_norm": 50.0, "kl": 0.8090972900390625, "learning_rate": 5e-07, "logits/chosen": -4402127.0, "logits/rejected": -15384337.0, "logps/chosen": -166.43161010742188, "logps/rejected": -445.4007263183594, "loss": 0.3478, "rewards/chosen": 0.4156278371810913, "rewards/margins": 1.9296473264694214, "rewards/rejected": -1.51401948928833, "step": 6146 }, { "epoch": 0.3258155990777304, "grad_norm": 44.5, "kl": 0.30719947814941406, "learning_rate": 5e-07, "logits/chosen": 20984960.0, "logits/rejected": -57918645.333333336, "logps/chosen": -386.658447265625, "logps/rejected": -433.9503173828125, "loss": 0.1604, "rewards/chosen": 0.5056877136230469, "rewards/margins": 3.186858812967936, "rewards/rejected": -2.681171099344889, "step": 6147 }, { "epoch": 0.3258686030795325, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53887300.0, "logits/rejected": -38412776.0, "logps/chosen": -209.55999755859375, "logps/rejected": -232.58831787109375, "loss": 0.3434, "rewards/chosen": -0.1363641917705536, "rewards/margins": 2.02633860707283, "rewards/rejected": -2.162702798843384, "step": 6148 }, { "epoch": 0.32592160708133466, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30277986.666666668, "logits/rejected": -17109408.0, "logps/chosen": -273.44012451171875, "logps/rejected": -261.124609375, "loss": 0.3231, "rewards/chosen": 0.041073620319366455, "rewards/margins": 1.4319072842597962, "rewards/rejected": -1.3908336639404297, "step": 6149 }, { "epoch": 0.3259746110831368, "grad_norm": 54.0, "kl": 0.7467575073242188, "learning_rate": 5e-07, "logits/chosen": -15076803.2, "logits/rejected": -7308374.0, "logps/chosen": -236.8072265625, "logps/rejected": -210.0767822265625, "loss": 0.4106, "rewards/chosen": -0.05687501430511475, "rewards/margins": 1.2696587800979615, "rewards/rejected": -1.3265337944030762, "step": 6150 }, { "epoch": 0.32602761508493894, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50186181.333333336, "logits/rejected": -47633580.8, "logps/chosen": -345.6834716796875, "logps/rejected": -468.43603515625, "loss": 0.2307, "rewards/chosen": 0.685359795888265, "rewards/margins": 2.899726136525472, "rewards/rejected": -2.214366340637207, "step": 6151 }, { "epoch": 0.3260806190867411, "grad_norm": 89.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7534791.333333333, "logits/rejected": -20523652.0, "logps/chosen": -224.8804931640625, "logps/rejected": -217.02516174316406, "loss": 0.4109, "rewards/chosen": 0.13061833381652832, "rewards/margins": 1.4430103302001953, "rewards/rejected": -1.312391996383667, "step": 6152 }, { "epoch": 0.3261336230885432, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64227180.8, "logits/rejected": -10941635.333333334, "logps/chosen": -317.6715576171875, "logps/rejected": -223.18721516927084, "loss": 0.3079, "rewards/chosen": 0.573846435546875, "rewards/margins": 2.140295918782552, "rewards/rejected": -1.566449483235677, "step": 6153 }, { "epoch": 0.3261866270903453, "grad_norm": 41.75, "kl": 2.383085250854492, "learning_rate": 5e-07, "logits/chosen": -11242974.666666666, "logits/rejected": -1600062.4, "logps/chosen": -521.9251302083334, "logps/rejected": -285.50146484375, "loss": 0.2217, "rewards/chosen": 0.609286904335022, "rewards/margins": 2.48151113986969, "rewards/rejected": -1.8722242355346679, "step": 6154 }, { "epoch": 0.32623963109214743, "grad_norm": 57.25, "kl": 0.09988784790039062, "learning_rate": 5e-07, "logits/chosen": -44465602.666666664, "logits/rejected": -75107688.0, "logps/chosen": -220.83015950520834, "logps/rejected": -145.32138061523438, "loss": 0.4148, "rewards/chosen": 0.12907747427622476, "rewards/margins": 1.2390711704889934, "rewards/rejected": -1.1099936962127686, "step": 6155 }, { "epoch": 0.32629263509394957, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36634533.333333336, "logits/rejected": -29160169.6, "logps/chosen": -209.69047037760416, "logps/rejected": -288.218701171875, "loss": 0.2863, "rewards/chosen": 0.08540929357210796, "rewards/margins": 2.0564710001150766, "rewards/rejected": -1.9710617065429688, "step": 6156 }, { "epoch": 0.3263456390957517, "grad_norm": 25.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -468198.25, "logits/rejected": -46109589.333333336, "logps/chosen": -105.83438873291016, "logps/rejected": -478.4280598958333, "loss": 0.1335, "rewards/chosen": 1.0809786319732666, "rewards/margins": 4.335496505101522, "rewards/rejected": -3.2545178731282554, "step": 6157 }, { "epoch": 0.32639864309755384, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39373381.333333336, "logits/rejected": -6665632.0, "logps/chosen": -388.1007486979167, "logps/rejected": -255.3614990234375, "loss": 0.2716, "rewards/chosen": 0.5731750329335531, "rewards/margins": 1.923768218358358, "rewards/rejected": -1.3505931854248048, "step": 6158 }, { "epoch": 0.326451647099356, "grad_norm": 57.0, "kl": 0.20543670654296875, "learning_rate": 5e-07, "logits/chosen": -48245664.0, "logits/rejected": -18099899.2, "logps/chosen": -326.0778401692708, "logps/rejected": -264.513134765625, "loss": 0.2883, "rewards/chosen": 0.43843690554300946, "rewards/margins": 1.9193613211313885, "rewards/rejected": -1.480924415588379, "step": 6159 }, { "epoch": 0.3265046511011581, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57529568.0, "logits/rejected": -9343715.2, "logps/chosen": -208.03658040364584, "logps/rejected": -216.93837890625, "loss": 0.3368, "rewards/chosen": -0.3284492492675781, "rewards/margins": 1.6322071075439453, "rewards/rejected": -1.9606563568115234, "step": 6160 }, { "epoch": 0.32655765510296025, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85959584.0, "logits/rejected": -18615288.0, "logps/chosen": -271.7603454589844, "logps/rejected": -248.4129435221354, "loss": 0.2929, "rewards/chosen": 0.10674744099378586, "rewards/margins": 1.4357692922155063, "rewards/rejected": -1.3290218512217205, "step": 6161 }, { "epoch": 0.3266106591047624, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18750462.0, "logits/rejected": -39885556.0, "logps/chosen": -277.07684326171875, "logps/rejected": -432.49029541015625, "loss": 0.2868, "rewards/chosen": 0.09316635131835938, "rewards/margins": 2.6013782024383545, "rewards/rejected": -2.508211851119995, "step": 6162 }, { "epoch": 0.32666366310656453, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65913596.0, "logits/rejected": -38105392.0, "logps/chosen": -720.1378173828125, "logps/rejected": -643.111328125, "loss": 0.22, "rewards/chosen": 0.8506813049316406, "rewards/margins": 3.314649820327759, "rewards/rejected": -2.463968515396118, "step": 6163 }, { "epoch": 0.32671666710836667, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10795000.0, "logits/rejected": -4958783.333333333, "logps/chosen": -126.5989013671875, "logps/rejected": -383.6407877604167, "loss": 0.3371, "rewards/chosen": 0.03779153227806091, "rewards/margins": 2.5049056629339854, "rewards/rejected": -2.4671141306559243, "step": 6164 }, { "epoch": 0.3267696711101688, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24526605.333333332, "logits/rejected": -24225297.6, "logps/chosen": -900.393798828125, "logps/rejected": -247.9935546875, "loss": 0.3161, "rewards/chosen": 0.517175038655599, "rewards/margins": 1.8282668431599935, "rewards/rejected": -1.3110918045043944, "step": 6165 }, { "epoch": 0.32682267511197094, "grad_norm": 102.0, "kl": 1.1400794982910156, "learning_rate": 5e-07, "logits/chosen": 42553504.0, "logits/rejected": -8445704.0, "logps/chosen": -730.0779622395834, "logps/rejected": -147.74655151367188, "loss": 0.396, "rewards/chosen": 0.3865404923756917, "rewards/margins": 1.377969225247701, "rewards/rejected": -0.9914287328720093, "step": 6166 }, { "epoch": 0.3268756791137731, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69730080.0, "logits/rejected": -53287680.0, "logps/chosen": -557.0889282226562, "logps/rejected": -361.8866489955357, "loss": 0.2001, "rewards/chosen": -0.515032947063446, "rewards/margins": 1.536241488797324, "rewards/rejected": -2.05127443586077, "step": 6167 }, { "epoch": 0.3269286831155752, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16743384.0, "logits/rejected": -106369706.66666667, "logps/chosen": -251.906884765625, "logps/rejected": -313.93552652994794, "loss": 0.2584, "rewards/chosen": 0.69773268699646, "rewards/margins": 3.0953288873036704, "rewards/rejected": -2.3975962003072104, "step": 6168 }, { "epoch": 0.32698168711737735, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12101619.0, "logits/rejected": -286866.25, "logps/chosen": -82.72078704833984, "logps/rejected": -373.60906982421875, "loss": 0.3414, "rewards/chosen": -0.1628376990556717, "rewards/margins": 1.9350827187299728, "rewards/rejected": -2.0979204177856445, "step": 6169 }, { "epoch": 0.3270346911191795, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21958946.0, "logits/rejected": -4904898.0, "logps/chosen": -231.453857421875, "logps/rejected": -101.37194061279297, "loss": 0.3277, "rewards/chosen": 0.07600001990795135, "rewards/margins": 1.9454904049634933, "rewards/rejected": -1.869490385055542, "step": 6170 }, { "epoch": 0.3270876951209816, "grad_norm": 51.5, "kl": 2.161569595336914, "learning_rate": 5e-07, "logits/chosen": -16598147.42857143, "logits/rejected": -7766084.0, "logps/chosen": -283.3345947265625, "logps/rejected": -80.14508056640625, "loss": 0.3918, "rewards/chosen": 0.5431748458317348, "rewards/margins": 2.9900682994297574, "rewards/rejected": -2.4468934535980225, "step": 6171 }, { "epoch": 0.32714069912278376, "grad_norm": 46.25, "kl": 0.5602531433105469, "learning_rate": 5e-07, "logits/chosen": -15358373.333333334, "logits/rejected": -65906028.0, "logps/chosen": -152.83006795247397, "logps/rejected": -507.14215087890625, "loss": 0.3705, "rewards/chosen": 0.14196479320526123, "rewards/margins": 2.4739197492599487, "rewards/rejected": -2.3319549560546875, "step": 6172 }, { "epoch": 0.3271937031245859, "grad_norm": 49.75, "kl": 0.2527198791503906, "learning_rate": 5e-07, "logits/chosen": -28959674.0, "logits/rejected": 2395798.5, "logps/chosen": -350.598388671875, "logps/rejected": -138.9275665283203, "loss": 0.2497, "rewards/chosen": 0.5444397330284119, "rewards/margins": 2.8733763098716736, "rewards/rejected": -2.3289365768432617, "step": 6173 }, { "epoch": 0.32724670712638804, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1855860.6666666667, "logits/rejected": -13406419.2, "logps/chosen": -50.63445536295573, "logps/rejected": -128.23916015625, "loss": 0.3411, "rewards/chosen": 0.2498127023379008, "rewards/margins": 1.4944249192873638, "rewards/rejected": -1.244612216949463, "step": 6174 }, { "epoch": 0.3272997111281902, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58607244.0, "logits/rejected": 196468768.0, "logps/chosen": -300.5916748046875, "logps/rejected": -445.4352722167969, "loss": 0.3589, "rewards/chosen": -0.3659622073173523, "rewards/margins": 1.8941193222999573, "rewards/rejected": -2.2600815296173096, "step": 6175 }, { "epoch": 0.3273527151299923, "grad_norm": 37.5, "kl": 0.9589195251464844, "learning_rate": 5e-07, "logits/chosen": -40719512.0, "logits/rejected": -14663161.0, "logps/chosen": -217.150146484375, "logps/rejected": -291.49029541015625, "loss": 0.2695, "rewards/chosen": 0.4243297874927521, "rewards/margins": 2.704295665025711, "rewards/rejected": -2.279965877532959, "step": 6176 }, { "epoch": 0.32740571913179445, "grad_norm": 46.75, "kl": 0.35886573791503906, "learning_rate": 5e-07, "logits/chosen": -15244950.4, "logits/rejected": -2057097.8333333333, "logps/chosen": -311.77705078125, "logps/rejected": -74.41739400227864, "loss": 0.2973, "rewards/chosen": 0.6804475784301758, "rewards/margins": 1.9231576124827068, "rewards/rejected": -1.242710034052531, "step": 6177 }, { "epoch": 0.3274587231335966, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7774044.0, "logits/rejected": -28898267.42857143, "logps/chosen": -285.4274597167969, "logps/rejected": -330.94827706473217, "loss": 0.1107, "rewards/chosen": 1.6531769037246704, "rewards/margins": 4.0564923116139004, "rewards/rejected": -2.40331540788923, "step": 6178 }, { "epoch": 0.3275117271353987, "grad_norm": 47.25, "kl": 2.1531195640563965, "learning_rate": 5e-07, "logits/chosen": -15375980.0, "logits/rejected": -6567012.0, "logps/chosen": -274.59442138671875, "logps/rejected": -329.01397705078125, "loss": 0.3225, "rewards/chosen": 0.5978171825408936, "rewards/margins": 2.025893449783325, "rewards/rejected": -1.4280762672424316, "step": 6179 }, { "epoch": 0.32756473113720086, "grad_norm": 46.0, "kl": 0.16067886352539062, "learning_rate": 5e-07, "logits/chosen": -16799938.285714287, "logits/rejected": -7524641.5, "logps/chosen": -127.78819056919643, "logps/rejected": -102.25499725341797, "loss": 0.4421, "rewards/chosen": 0.0277874299458095, "rewards/margins": 2.959816438811166, "rewards/rejected": -2.9320290088653564, "step": 6180 }, { "epoch": 0.327617735139003, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94202841.6, "logits/rejected": -29683064.0, "logps/chosen": -473.77646484375, "logps/rejected": -233.1708984375, "loss": 0.4283, "rewards/chosen": -0.36348328590393064, "rewards/margins": 1.1981455326080321, "rewards/rejected": -1.561628818511963, "step": 6181 }, { "epoch": 0.32767073914080513, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28488666.666666668, "logits/rejected": 9563141.6, "logps/chosen": -175.6945597330729, "logps/rejected": -581.26005859375, "loss": 0.2246, "rewards/chosen": 0.6027160485585531, "rewards/margins": 2.543823226292928, "rewards/rejected": -1.941107177734375, "step": 6182 }, { "epoch": 0.32772374314260727, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47048552.0, "logits/rejected": -41391656.0, "logps/chosen": -294.90521240234375, "logps/rejected": -421.90179443359375, "loss": 0.268, "rewards/chosen": 0.6666820645332336, "rewards/margins": 2.4950703978538513, "rewards/rejected": -1.8283883333206177, "step": 6183 }, { "epoch": 0.3277767471444094, "grad_norm": 71.0, "kl": 0.19698429107666016, "learning_rate": 5e-07, "logits/chosen": 8060740.0, "logits/rejected": -18188782.0, "logps/chosen": -197.3776092529297, "logps/rejected": -377.2933654785156, "loss": 0.3013, "rewards/chosen": 0.595005214214325, "rewards/margins": 2.2512980103492737, "rewards/rejected": -1.6562927961349487, "step": 6184 }, { "epoch": 0.32782975114621155, "grad_norm": 59.5, "kl": 0.8364448547363281, "learning_rate": 5e-07, "logits/chosen": -24010053.333333332, "logits/rejected": -11455768.0, "logps/chosen": -377.7389322916667, "logps/rejected": -153.47286987304688, "loss": 0.497, "rewards/chosen": -0.09769053260485332, "rewards/margins": 0.5655813713868459, "rewards/rejected": -0.6632719039916992, "step": 6185 }, { "epoch": 0.3278827551480137, "grad_norm": 46.0, "kl": 0.03980255126953125, "learning_rate": 5e-07, "logits/chosen": -2866590.8, "logits/rejected": -18937726.666666668, "logps/chosen": -209.566943359375, "logps/rejected": -241.564697265625, "loss": 0.3173, "rewards/chosen": 0.3950863599777222, "rewards/margins": 2.3295956373214723, "rewards/rejected": -1.93450927734375, "step": 6186 }, { "epoch": 0.3279357591498158, "grad_norm": 42.0, "kl": 0.8541908264160156, "learning_rate": 5e-07, "logits/chosen": -18916786.666666668, "logits/rejected": -28639046.4, "logps/chosen": -211.19755045572916, "logps/rejected": -377.3933837890625, "loss": 0.2206, "rewards/chosen": 0.6231964826583862, "rewards/margins": 2.890166735649109, "rewards/rejected": -2.2669702529907227, "step": 6187 }, { "epoch": 0.32798876315161796, "grad_norm": 37.5, "kl": 0.17192935943603516, "learning_rate": 5e-07, "logits/chosen": -770771.875, "logits/rejected": -62854632.0, "logps/chosen": -117.60356903076172, "logps/rejected": -294.593994140625, "loss": 0.2695, "rewards/chosen": 0.6363805532455444, "rewards/margins": 2.2416670322418213, "rewards/rejected": -1.6052864789962769, "step": 6188 }, { "epoch": 0.3280417671534201, "grad_norm": 62.0, "kl": 1.7153167724609375, "learning_rate": 5e-07, "logits/chosen": -73285920.0, "logits/rejected": -32577200.0, "logps/chosen": -502.4258626302083, "logps/rejected": -325.8968200683594, "loss": 0.3023, "rewards/chosen": 0.8329544067382812, "rewards/margins": 2.9926681518554688, "rewards/rejected": -2.1597137451171875, "step": 6189 }, { "epoch": 0.32809477115522223, "grad_norm": 71.5, "kl": 2.2628402709960938, "learning_rate": 5e-07, "logits/chosen": -11632043.0, "logps/chosen": -486.69976806640625, "loss": 0.47, "rewards/chosen": 0.4415477514266968, "step": 6190 }, { "epoch": 0.32814777515702437, "grad_norm": 53.75, "kl": 0.9351377487182617, "learning_rate": 5e-07, "logits/chosen": -5852560.8, "logits/rejected": -12642428.0, "logps/chosen": -191.557666015625, "logps/rejected": -235.1383056640625, "loss": 0.4402, "rewards/chosen": -0.06141006946563721, "rewards/margins": 1.2781943082809448, "rewards/rejected": -1.339604377746582, "step": 6191 }, { "epoch": 0.3282007791588265, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45654844.0, "logits/rejected": 14468689.0, "logps/chosen": -802.9755249023438, "logps/rejected": -273.06658935546875, "loss": 0.349, "rewards/chosen": 0.47135382890701294, "rewards/margins": 1.5363141894340515, "rewards/rejected": -1.0649603605270386, "step": 6192 }, { "epoch": 0.32825378316062864, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37562861.333333336, "logits/rejected": -16411834.0, "logps/chosen": -246.83404541015625, "logps/rejected": -384.03985595703125, "loss": 0.4994, "rewards/chosen": -0.348163366317749, "rewards/margins": 0.7496546506881714, "rewards/rejected": -1.0978180170059204, "step": 6193 }, { "epoch": 0.3283067871624308, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -100798056.0, "logits/rejected": -27018093.714285713, "logps/chosen": -263.7452697753906, "logps/rejected": -207.62058803013392, "loss": 0.2583, "rewards/chosen": -0.265838623046875, "rewards/margins": 1.2677971976143974, "rewards/rejected": -1.5336358206612724, "step": 6194 }, { "epoch": 0.3283597911642329, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29802296.0, "logits/rejected": -23306496.0, "logps/chosen": -230.02186584472656, "logps/rejected": -293.2430826822917, "loss": 0.2545, "rewards/chosen": -0.25662630796432495, "rewards/margins": 1.743437111377716, "rewards/rejected": -2.000063419342041, "step": 6195 }, { "epoch": 0.32841279516603505, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23397968.0, "logits/rejected": -22786580.8, "logps/chosen": -398.8660481770833, "logps/rejected": -484.56796875, "loss": 0.2862, "rewards/chosen": 0.08973438541094463, "rewards/margins": 2.2031975964705146, "rewards/rejected": -2.11346321105957, "step": 6196 }, { "epoch": 0.3284657991678372, "grad_norm": 60.25, "kl": 2.767292022705078, "learning_rate": 5e-07, "logits/chosen": -17150539.2, "logits/rejected": -16573613.333333334, "logps/chosen": -632.88017578125, "logps/rejected": -164.86622111002603, "loss": 0.2672, "rewards/chosen": 1.0526598930358886, "rewards/margins": 2.2738414446512856, "rewards/rejected": -1.2211815516153972, "step": 6197 }, { "epoch": 0.32851880316963933, "grad_norm": 84.5, "kl": 0.481231689453125, "learning_rate": 5e-07, "logits/chosen": 9085548.0, "logits/rejected": 28274888.0, "logps/chosen": -263.72381591796875, "logps/rejected": -395.894775390625, "loss": 0.3631, "rewards/chosen": 0.27068978548049927, "rewards/margins": 1.4706578850746155, "rewards/rejected": -1.1999680995941162, "step": 6198 }, { "epoch": 0.32857180717144147, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7721500.0, "logits/rejected": -418965.4, "logps/chosen": -90.28518676757812, "logps/rejected": -267.37900390625, "loss": 0.2648, "rewards/chosen": 0.7682398160298666, "rewards/margins": 2.1102420171101888, "rewards/rejected": -1.3420022010803223, "step": 6199 }, { "epoch": 0.3286248111732436, "grad_norm": 43.75, "kl": 0.5600814819335938, "learning_rate": 5e-07, "logits/chosen": -39383690.666666664, "logits/rejected": -58794265.6, "logps/chosen": -329.8976236979167, "logps/rejected": -344.0802001953125, "loss": 0.2229, "rewards/chosen": 0.46015270551045734, "rewards/margins": 2.661529048283895, "rewards/rejected": -2.2013763427734374, "step": 6200 }, { "epoch": 0.32867781517504574, "grad_norm": 45.75, "kl": 0.6295738220214844, "learning_rate": 5e-07, "logits/chosen": -31821416.0, "logits/rejected": -584467.9375, "logps/chosen": -338.271240234375, "logps/rejected": -129.31959533691406, "loss": 0.2434, "rewards/chosen": 0.8368263244628906, "rewards/margins": 2.858414888381958, "rewards/rejected": -2.0215885639190674, "step": 6201 }, { "epoch": 0.3287308191768479, "grad_norm": 46.5, "kl": 0.16382217407226562, "learning_rate": 5e-07, "logits/chosen": 6299324.0, "logits/rejected": 2921602.6666666665, "logps/chosen": -313.9317932128906, "logps/rejected": -379.2848307291667, "loss": 0.2299, "rewards/chosen": 0.7386940121650696, "rewards/margins": 2.5555747151374817, "rewards/rejected": -1.816880702972412, "step": 6202 }, { "epoch": 0.32878382317865, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12939099.2, "logits/rejected": -255618.25, "logps/chosen": -392.918212890625, "logps/rejected": -105.42622884114583, "loss": 0.2887, "rewards/chosen": 0.9030307769775391, "rewards/margins": 2.202272955576579, "rewards/rejected": -1.2992421785990398, "step": 6203 }, { "epoch": 0.32883682718045215, "grad_norm": 55.25, "kl": 2.323535919189453, "learning_rate": 5e-07, "logits/chosen": -35367225.6, "logits/rejected": -24176528.0, "logps/chosen": -337.18076171875, "logps/rejected": -361.0435384114583, "loss": 0.2825, "rewards/chosen": 0.7678929328918457, "rewards/margins": 2.6647400220235187, "rewards/rejected": -1.896847089131673, "step": 6204 }, { "epoch": 0.32888983118225423, "grad_norm": 58.75, "kl": 0.48438072204589844, "learning_rate": 5e-07, "logits/chosen": -57505050.666666664, "logits/rejected": -61378643.2, "logps/chosen": -333.48398844401044, "logps/rejected": -619.748046875, "loss": 0.2293, "rewards/chosen": 0.3880745967229207, "rewards/margins": 3.0894814570744833, "rewards/rejected": -2.7014068603515624, "step": 6205 }, { "epoch": 0.32894283518405637, "grad_norm": 57.25, "kl": 1.7335128784179688, "learning_rate": 5e-07, "logits/chosen": -51169916.8, "logits/rejected": -52727008.0, "logps/chosen": -462.87939453125, "logps/rejected": -482.4882405598958, "loss": 0.3737, "rewards/chosen": 0.12327361106872559, "rewards/margins": 2.195606311162313, "rewards/rejected": -2.0723327000935874, "step": 6206 }, { "epoch": 0.3289958391858585, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1646890.0, "logits/rejected": -44091436.0, "logps/chosen": -94.73827107747395, "logps/rejected": -218.12355041503906, "loss": 0.3615, "rewards/chosen": 0.30828497807184857, "rewards/margins": 2.268561820189158, "rewards/rejected": -1.9602768421173096, "step": 6207 }, { "epoch": 0.32904884318766064, "grad_norm": 46.75, "kl": 0.6501274108886719, "learning_rate": 5e-07, "logits/chosen": -10465972.8, "logits/rejected": -9203440.0, "logps/chosen": -241.2028564453125, "logps/rejected": -173.7395222981771, "loss": 0.3068, "rewards/chosen": 0.30037746429443357, "rewards/margins": 2.898828125, "rewards/rejected": -2.5984506607055664, "step": 6208 }, { "epoch": 0.3291018471894628, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6293114.8, "logits/rejected": -38123920.0, "logps/chosen": -228.999072265625, "logps/rejected": -391.2582194010417, "loss": 0.3943, "rewards/chosen": 0.04048285782337189, "rewards/margins": 1.6976089666287104, "rewards/rejected": -1.6571261088053386, "step": 6209 }, { "epoch": 0.3291548511912649, "grad_norm": 57.25, "kl": 2.3250818252563477, "learning_rate": 5e-07, "logits/chosen": -15419882.0, "logits/rejected": -21399850.0, "logps/chosen": -476.8575744628906, "logps/rejected": -208.3547821044922, "loss": 0.297, "rewards/chosen": 0.6541788578033447, "rewards/margins": 1.9458824396133423, "rewards/rejected": -1.2917035818099976, "step": 6210 }, { "epoch": 0.32920785519306706, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11264597.0, "logits/rejected": -20177010.0, "logps/chosen": -442.0618896484375, "logps/rejected": -248.05677795410156, "loss": 0.236, "rewards/chosen": 0.90016108751297, "rewards/margins": 3.393329918384552, "rewards/rejected": -2.493168830871582, "step": 6211 }, { "epoch": 0.3292608591948692, "grad_norm": 58.25, "kl": 1.0706787109375, "learning_rate": 5e-07, "logits/chosen": -23738478.0, "logits/rejected": -47018920.0, "logps/chosen": -536.0480346679688, "logps/rejected": -214.2917938232422, "loss": 0.2982, "rewards/chosen": 0.7598289251327515, "rewards/margins": 2.0452587604522705, "rewards/rejected": -1.285429835319519, "step": 6212 }, { "epoch": 0.32931386319667133, "grad_norm": 67.5, "kl": 2.638317108154297, "learning_rate": 5e-07, "logits/chosen": -2740487.6, "logits/rejected": -8563748.0, "logps/chosen": -303.2284423828125, "logps/rejected": -196.7633260091146, "loss": 0.3837, "rewards/chosen": 0.6151439666748046, "rewards/margins": 1.3432976722717285, "rewards/rejected": -0.7281537055969238, "step": 6213 }, { "epoch": 0.32936686719847347, "grad_norm": 76.0, "kl": 0.5656585693359375, "learning_rate": 5e-07, "logits/chosen": -30866194.666666668, "logits/rejected": -5527888.0, "logps/chosen": -405.46826171875, "logps/rejected": -104.6382827758789, "loss": 0.3451, "rewards/chosen": 0.9570230642954508, "rewards/margins": 1.085984061161677, "rewards/rejected": -0.1289609968662262, "step": 6214 }, { "epoch": 0.3294198712002756, "grad_norm": 51.75, "kl": 2.7955265045166016, "learning_rate": 5e-07, "logits/chosen": -19027374.4, "logits/rejected": -30148394.666666668, "logps/chosen": -331.8345458984375, "logps/rejected": -536.2985432942709, "loss": 0.349, "rewards/chosen": 0.12922120094299316, "rewards/margins": 4.397327661514282, "rewards/rejected": -4.268106460571289, "step": 6215 }, { "epoch": 0.32947287520207774, "grad_norm": 43.75, "kl": 0.3601799011230469, "learning_rate": 5e-07, "logits/chosen": -51224228.0, "logits/rejected": -29492789.333333332, "logps/chosen": -294.0871887207031, "logps/rejected": -422.8347574869792, "loss": 0.217, "rewards/chosen": -0.17019157111644745, "rewards/margins": 2.289702609181404, "rewards/rejected": -2.4598941802978516, "step": 6216 }, { "epoch": 0.3295258792038799, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20869136.0, "logits/rejected": -32130366.0, "logps/chosen": -390.1065673828125, "logps/rejected": -160.18353271484375, "loss": 0.3545, "rewards/chosen": 0.09264793992042542, "rewards/margins": 1.3197584450244904, "rewards/rejected": -1.227110505104065, "step": 6217 }, { "epoch": 0.329578883205682, "grad_norm": 58.75, "kl": 0.755615234375, "learning_rate": 5e-07, "logits/chosen": -68783112.0, "logits/rejected": -1529692.125, "logps/chosen": -364.54736328125, "logps/rejected": -115.86676788330078, "loss": 0.3038, "rewards/chosen": 0.5331730246543884, "rewards/margins": 1.865086853504181, "rewards/rejected": -1.3319138288497925, "step": 6218 }, { "epoch": 0.32963188720748415, "grad_norm": 46.5, "kl": 0.1632537841796875, "learning_rate": 5e-07, "logits/chosen": -93454160.0, "logits/rejected": -26470300.0, "logps/chosen": -853.2764892578125, "logps/rejected": -418.65716552734375, "loss": 0.237, "rewards/chosen": 0.6711711883544922, "rewards/margins": 3.159212112426758, "rewards/rejected": -2.4880409240722656, "step": 6219 }, { "epoch": 0.3296848912092863, "grad_norm": 57.25, "kl": 1.8426284790039062, "learning_rate": 5e-07, "logits/chosen": -20241200.0, "logits/rejected": -11398511.0, "logps/chosen": -226.57100423177084, "logps/rejected": -266.57989501953125, "loss": 0.3564, "rewards/chosen": 0.6155924399693807, "rewards/margins": 2.6700515349706015, "rewards/rejected": -2.0544590950012207, "step": 6220 }, { "epoch": 0.3297378952110884, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5938972.0, "logits/rejected": -27874792.0, "logps/chosen": -388.89849853515625, "logps/rejected": -435.1124674479167, "loss": 0.1863, "rewards/chosen": 1.6211410760879517, "rewards/margins": 3.2819329500198364, "rewards/rejected": -1.6607918739318848, "step": 6221 }, { "epoch": 0.32979089921289056, "grad_norm": 43.25, "kl": 0.4170818328857422, "learning_rate": 5e-07, "logits/chosen": -2628711.6666666665, "logits/rejected": -38547136.0, "logps/chosen": -200.7791951497396, "logps/rejected": -376.2377014160156, "loss": 0.3272, "rewards/chosen": 0.3595797618230184, "rewards/margins": 3.6004366477330527, "rewards/rejected": -3.240856885910034, "step": 6222 }, { "epoch": 0.3298439032146927, "grad_norm": 62.0, "kl": 0.5984745025634766, "learning_rate": 5e-07, "logits/chosen": -71740213.33333333, "logits/rejected": -26260272.0, "logps/chosen": -1028.9520670572917, "logps/rejected": -262.94501953125, "loss": 0.1951, "rewards/chosen": 1.21480925877889, "rewards/margins": 2.7983057339986166, "rewards/rejected": -1.5834964752197265, "step": 6223 }, { "epoch": 0.32989690721649484, "grad_norm": 57.25, "kl": 0.3274993896484375, "learning_rate": 5e-07, "logits/chosen": -32001189.333333332, "logits/rejected": -65884328.0, "logps/chosen": -424.5960286458333, "logps/rejected": -445.11895751953125, "loss": 0.2823, "rewards/chosen": 0.8308096726735433, "rewards/margins": 2.9654594262441, "rewards/rejected": -2.1346497535705566, "step": 6224 }, { "epoch": 0.329949911218297, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27225760.0, "logits/rejected": -31825968.0, "logps/chosen": -200.4326171875, "logps/rejected": -569.54873046875, "loss": 0.2176, "rewards/chosen": 0.44824806849161786, "rewards/margins": 2.9961633364359535, "rewards/rejected": -2.5479152679443358, "step": 6225 }, { "epoch": 0.3300029152200991, "grad_norm": 57.5, "kl": 0.22251129150390625, "learning_rate": 5e-07, "logits/chosen": -13797859.2, "logits/rejected": -95172448.0, "logps/chosen": -708.596044921875, "logps/rejected": -526.5775960286459, "loss": 0.1904, "rewards/chosen": 1.2051025390625, "rewards/margins": 4.251370366414388, "rewards/rejected": -3.046267827351888, "step": 6226 }, { "epoch": 0.33005591922190125, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42852732.0, "logits/rejected": -38385944.0, "logps/chosen": -256.88702392578125, "logps/rejected": -454.5599670410156, "loss": 0.326, "rewards/chosen": 0.03889084607362747, "rewards/margins": 2.582778461277485, "rewards/rejected": -2.5438876152038574, "step": 6227 }, { "epoch": 0.3301089232237034, "grad_norm": 56.25, "kl": 0.35298919677734375, "learning_rate": 5e-07, "logits/chosen": -17015277.333333332, "logits/rejected": -64393112.0, "logps/chosen": -387.4029947916667, "logps/rejected": -278.24560546875, "loss": 0.3346, "rewards/chosen": 0.7228584289550781, "rewards/margins": 1.7232071161270142, "rewards/rejected": -1.000348687171936, "step": 6228 }, { "epoch": 0.3301619272255055, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27026637.333333332, "logits/rejected": -62250297.6, "logps/chosen": -203.5714111328125, "logps/rejected": -303.8232421875, "loss": 0.2545, "rewards/chosen": 0.1748326818148295, "rewards/margins": 2.283664087454478, "rewards/rejected": -2.1088314056396484, "step": 6229 }, { "epoch": 0.33021493122730766, "grad_norm": 45.5, "kl": 1.3158206939697266, "learning_rate": 5e-07, "logits/chosen": -5302355.333333333, "logits/rejected": -2928070.25, "logps/chosen": -210.05692545572916, "logps/rejected": -119.79682922363281, "loss": 0.409, "rewards/chosen": 0.4347817897796631, "rewards/margins": 1.367746114730835, "rewards/rejected": -0.9329643249511719, "step": 6230 }, { "epoch": 0.3302679352291098, "grad_norm": 53.75, "kl": 1.6993694305419922, "learning_rate": 5e-07, "logits/chosen": -21942305.6, "logits/rejected": -28223770.666666668, "logps/chosen": -433.07080078125, "logps/rejected": -263.1838785807292, "loss": 0.2445, "rewards/chosen": 1.1106788635253906, "rewards/margins": 3.191126696268717, "rewards/rejected": -2.0804478327433267, "step": 6231 }, { "epoch": 0.33032093923091194, "grad_norm": 44.0, "kl": 0.8531455993652344, "learning_rate": 5e-07, "logits/chosen": -23294964.8, "logits/rejected": -46450245.333333336, "logps/chosen": -220.909326171875, "logps/rejected": -380.8577067057292, "loss": 0.2558, "rewards/chosen": 0.7118320465087891, "rewards/margins": 2.9583773612976074, "rewards/rejected": -2.2465453147888184, "step": 6232 }, { "epoch": 0.3303739432327141, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46660828.8, "logits/rejected": -18589696.0, "logps/chosen": -197.2564208984375, "logps/rejected": -393.5337727864583, "loss": 0.3378, "rewards/chosen": 0.3282613754272461, "rewards/margins": 2.1626187960306806, "rewards/rejected": -1.8343574206034343, "step": 6233 }, { "epoch": 0.3304269472345162, "grad_norm": 66.5, "kl": 0.5467166900634766, "learning_rate": 5e-07, "logits/chosen": -15760913.6, "logits/rejected": -24206208.0, "logps/chosen": -223.6818115234375, "logps/rejected": -472.6869303385417, "loss": 0.3631, "rewards/chosen": -0.009884533286094666, "rewards/margins": 1.951780938108762, "rewards/rejected": -1.9616654713948567, "step": 6234 }, { "epoch": 0.33047995123631835, "grad_norm": 56.25, "kl": 0.06417083740234375, "learning_rate": 5e-07, "logits/chosen": 170452848.0, "logits/rejected": -34106136.0, "logps/chosen": -381.5140380859375, "logps/rejected": -524.0433959960938, "loss": 0.2886, "rewards/chosen": 0.4208323359489441, "rewards/margins": 3.04075688123703, "rewards/rejected": -2.619924545288086, "step": 6235 }, { "epoch": 0.3305329552381205, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24061852.0, "logits/rejected": -25610210.0, "logps/chosen": -395.0481262207031, "logps/rejected": -221.09202575683594, "loss": 0.2796, "rewards/chosen": 0.5680221319198608, "rewards/margins": 2.415312170982361, "rewards/rejected": -1.8472900390625, "step": 6236 }, { "epoch": 0.3305859592399226, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 38028746.666666664, "logits/rejected": -35405561.6, "logps/chosen": -433.6088053385417, "logps/rejected": -211.1844482421875, "loss": 0.2936, "rewards/chosen": 0.038025726874669395, "rewards/margins": 1.9812813421090443, "rewards/rejected": -1.943255615234375, "step": 6237 }, { "epoch": 0.33063896324172476, "grad_norm": 51.0, "kl": 0.3327789306640625, "learning_rate": 5e-07, "logits/chosen": -72228192.0, "logits/rejected": -88574960.0, "logps/chosen": -433.0218505859375, "logps/rejected": -539.1795654296875, "loss": 0.2501, "rewards/chosen": 0.43974801898002625, "rewards/margins": 3.3049010932445526, "rewards/rejected": -2.8651530742645264, "step": 6238 }, { "epoch": 0.3306919672435269, "grad_norm": 25.75, "kl": 0.8790626525878906, "learning_rate": 5e-07, "logits/chosen": -56843432.0, "logits/rejected": -5812526.5, "logps/chosen": -89.087158203125, "logps/rejected": -370.21575927734375, "loss": 0.2825, "rewards/chosen": 0.4068313241004944, "rewards/margins": 2.6789727807044983, "rewards/rejected": -2.272141456604004, "step": 6239 }, { "epoch": 0.33074497124532903, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35302362.666666664, "logits/rejected": -16692204.0, "logps/chosen": -303.4451904296875, "logps/rejected": -104.70166015625, "loss": 0.4116, "rewards/chosen": 0.10349263747533162, "rewards/margins": 1.4328227241834004, "rewards/rejected": -1.3293300867080688, "step": 6240 }, { "epoch": 0.33079797524713117, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22520784.0, "logits/rejected": -6725586.0, "logps/chosen": -304.11611328125, "logps/rejected": -136.0120849609375, "loss": 0.4009, "rewards/chosen": 0.40980072021484376, "rewards/margins": 0.8788535912831624, "rewards/rejected": -0.46905287106831867, "step": 6241 }, { "epoch": 0.3308509792489333, "grad_norm": 64.0, "kl": 2.1197586059570312, "learning_rate": 5e-07, "logits/chosen": -27742928.0, "logits/rejected": -13835886.0, "logps/chosen": -350.916259765625, "logps/rejected": -213.25828552246094, "loss": 0.3846, "rewards/chosen": 0.3793976704279582, "rewards/margins": 1.3563298384348552, "rewards/rejected": -0.976932168006897, "step": 6242 }, { "epoch": 0.33090398325073545, "grad_norm": 43.0, "kl": 0.5763969421386719, "learning_rate": 5e-07, "logits/chosen": -47602832.0, "logits/rejected": -42491416.0, "logps/chosen": -314.8336486816406, "logps/rejected": -240.96148681640625, "loss": 0.2628, "rewards/chosen": 0.901650071144104, "rewards/margins": 2.0596667528152466, "rewards/rejected": -1.1580166816711426, "step": 6243 }, { "epoch": 0.3309569872525376, "grad_norm": 44.0, "kl": 0.7025871276855469, "learning_rate": 5e-07, "logits/chosen": 21362010.666666668, "logits/rejected": -22033790.4, "logps/chosen": -121.14865112304688, "logps/rejected": -264.036279296875, "loss": 0.3079, "rewards/chosen": 0.32257986068725586, "rewards/margins": 1.6740492820739745, "rewards/rejected": -1.3514694213867187, "step": 6244 }, { "epoch": 0.3310099912543397, "grad_norm": 44.75, "kl": 0.9369471073150635, "learning_rate": 5e-07, "logits/chosen": -35486288.0, "logits/rejected": -4804854.0, "logps/chosen": -462.937744140625, "logps/rejected": -679.007080078125, "loss": 0.2446, "rewards/chosen": 0.7338953018188477, "rewards/margins": 3.464182138442993, "rewards/rejected": -2.7302868366241455, "step": 6245 }, { "epoch": 0.33106299525614186, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22393360.0, "logits/rejected": -35939702.4, "logps/chosen": -269.4508870442708, "logps/rejected": -409.162939453125, "loss": 0.2432, "rewards/chosen": 0.4524904489517212, "rewards/margins": 2.6607271432876587, "rewards/rejected": -2.2082366943359375, "step": 6246 }, { "epoch": 0.331115999257944, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40588708.0, "logits/rejected": -32441677.333333332, "logps/chosen": -140.6005096435547, "logps/rejected": -187.70072428385416, "loss": 0.2573, "rewards/chosen": -0.8956802487373352, "rewards/margins": 1.2414997617403665, "rewards/rejected": -2.1371800104777017, "step": 6247 }, { "epoch": 0.33116900325974613, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17874210.0, "logits/rejected": -40553482.666666664, "logps/chosen": -270.7996520996094, "logps/rejected": -508.4378662109375, "loss": 0.1446, "rewards/chosen": 0.42321547865867615, "rewards/margins": 3.2960589031378427, "rewards/rejected": -2.8728434244791665, "step": 6248 }, { "epoch": 0.33122200726154827, "grad_norm": 45.0, "kl": 0.05171966552734375, "learning_rate": 5e-07, "logits/chosen": -32689850.0, "logits/rejected": -31955642.0, "logps/chosen": -433.3236083984375, "logps/rejected": -222.96435546875, "loss": 0.3496, "rewards/chosen": -0.20115453004837036, "rewards/margins": 1.6963045001029968, "rewards/rejected": -1.8974590301513672, "step": 6249 }, { "epoch": 0.3312750112633504, "grad_norm": 35.0, "kl": 0.8628883361816406, "learning_rate": 5e-07, "logits/chosen": -16531620.0, "logits/rejected": -18377678.4, "logps/chosen": -343.2908935546875, "logps/rejected": -187.6116455078125, "loss": 0.187, "rewards/chosen": 1.5017646153767903, "rewards/margins": 3.418494733174642, "rewards/rejected": -1.9167301177978515, "step": 6250 }, { "epoch": 0.33132801526515254, "grad_norm": 70.0, "kl": 0.16654586791992188, "learning_rate": 5e-07, "logits/chosen": -32652328.0, "logits/rejected": -25847530.0, "logps/chosen": -476.1793212890625, "logps/rejected": -182.75753784179688, "loss": 0.2901, "rewards/chosen": 0.2790212631225586, "rewards/margins": 2.0933520793914795, "rewards/rejected": -1.814330816268921, "step": 6251 }, { "epoch": 0.3313810192669547, "grad_norm": 38.75, "kl": 0.36188507080078125, "learning_rate": 5e-07, "logits/chosen": -17295352.0, "logits/rejected": -58408204.0, "logps/chosen": -131.51565551757812, "logps/rejected": -355.4954833984375, "loss": 0.3089, "rewards/chosen": 0.037651073187589645, "rewards/margins": 2.0655815713107586, "rewards/rejected": -2.027930498123169, "step": 6252 }, { "epoch": 0.3314340232687568, "grad_norm": 53.0, "kl": 0.5387458801269531, "learning_rate": 5e-07, "logits/chosen": -20191226.0, "logits/rejected": 4934943.5, "logps/chosen": -322.3204345703125, "logps/rejected": -83.04510498046875, "loss": 0.3745, "rewards/chosen": 0.5090608596801758, "rewards/margins": 1.1695232391357422, "rewards/rejected": -0.6604623794555664, "step": 6253 }, { "epoch": 0.33148702727055895, "grad_norm": 54.25, "kl": 0.5067710876464844, "learning_rate": 5e-07, "logits/chosen": -14282288.0, "logits/rejected": -45779493.333333336, "logps/chosen": -258.9523193359375, "logps/rejected": -475.2746988932292, "loss": 0.303, "rewards/chosen": 0.5104739665985107, "rewards/margins": 2.6122637589772544, "rewards/rejected": -2.1017897923787436, "step": 6254 }, { "epoch": 0.3315400312723611, "grad_norm": 61.25, "kl": 0.18573570251464844, "learning_rate": 5e-07, "logits/chosen": -22296490.0, "logits/rejected": -62073984.0, "logps/chosen": -263.73779296875, "logps/rejected": -607.4310913085938, "loss": 0.2321, "rewards/chosen": 0.4851333498954773, "rewards/margins": 3.445483386516571, "rewards/rejected": -2.9603500366210938, "step": 6255 }, { "epoch": 0.3315930352741632, "grad_norm": 45.0, "kl": 1.0452613830566406, "learning_rate": 5e-07, "logits/chosen": 406276032.0, "logits/rejected": -38408896.0, "logps/chosen": -646.5360107421875, "logps/rejected": -315.97564697265625, "loss": 0.2152, "rewards/chosen": 1.4261330366134644, "rewards/margins": 3.505142569541931, "rewards/rejected": -2.079009532928467, "step": 6256 }, { "epoch": 0.3316460392759653, "grad_norm": 42.5, "kl": 0.06224346160888672, "learning_rate": 5e-07, "logits/chosen": -22943488.0, "logits/rejected": -16702372.0, "logps/chosen": -210.7627156575521, "logps/rejected": -475.29736328125, "loss": 0.3454, "rewards/chosen": 0.41346148649851483, "rewards/margins": 2.5466449658075967, "rewards/rejected": -2.133183479309082, "step": 6257 }, { "epoch": 0.33169904327776745, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33167272.0, "logits/rejected": -11437496.8, "logps/chosen": -322.9671630859375, "logps/rejected": -378.019970703125, "loss": 0.2093, "rewards/chosen": 0.7112691402435303, "rewards/margins": 2.751370573043823, "rewards/rejected": -2.040101432800293, "step": 6258 }, { "epoch": 0.3317520472795696, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69140048.0, "logits/rejected": -4044981.4285714286, "logps/chosen": -425.3692626953125, "logps/rejected": -154.32329450334822, "loss": 0.1713, "rewards/chosen": 1.0488312244415283, "rewards/margins": 2.8972585882459367, "rewards/rejected": -1.8484273638044084, "step": 6259 }, { "epoch": 0.3318050512813717, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2396300.75, "logits/rejected": -22395736.0, "logps/chosen": -101.51253509521484, "logps/rejected": -366.9034423828125, "loss": 0.2926, "rewards/chosen": -0.23074017465114594, "rewards/margins": 1.5302089601755142, "rewards/rejected": -1.7609491348266602, "step": 6260 }, { "epoch": 0.33185805528317386, "grad_norm": 53.0, "kl": 0.41233348846435547, "learning_rate": 5e-07, "logits/chosen": -50066284.8, "logits/rejected": -11228981.333333334, "logps/chosen": -258.727978515625, "logps/rejected": -630.6616617838541, "loss": 0.4106, "rewards/chosen": -0.15089042186737062, "rewards/margins": 1.6765143950780232, "rewards/rejected": -1.8274048169453938, "step": 6261 }, { "epoch": 0.331911059284976, "grad_norm": 54.25, "kl": 0.8287620544433594, "learning_rate": 5e-07, "logits/chosen": -24897765.333333332, "logits/rejected": -20422918.4, "logps/chosen": -372.5108235677083, "logps/rejected": -321.05634765625, "loss": 0.2145, "rewards/chosen": 0.8603251775105795, "rewards/margins": 3.089778836568197, "rewards/rejected": -2.2294536590576173, "step": 6262 }, { "epoch": 0.33196406328677813, "grad_norm": 31.875, "kl": 0.0077381134033203125, "learning_rate": 5e-07, "logits/chosen": -27479694.0, "logits/rejected": -72627112.0, "logps/chosen": -114.9427719116211, "logps/rejected": -594.1094360351562, "loss": 0.3091, "rewards/chosen": -0.2766624093055725, "rewards/margins": 2.8553537726402283, "rewards/rejected": -3.132016181945801, "step": 6263 }, { "epoch": 0.33201706728858027, "grad_norm": 50.5, "kl": 2.4552860260009766, "learning_rate": 5e-07, "logits/chosen": -49738624.0, "logits/rejected": 7491511.0, "logps/chosen": -288.3337809244792, "logps/rejected": -172.86795043945312, "loss": 0.4308, "rewards/chosen": 0.5235751072565714, "rewards/margins": 1.09164688984553, "rewards/rejected": -0.5680717825889587, "step": 6264 }, { "epoch": 0.3320700712903824, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5951052.0, "logits/rejected": -43234820.0, "logps/chosen": -533.8193359375, "logps/rejected": -291.25665283203125, "loss": 0.2429, "rewards/chosen": 0.745563805103302, "rewards/margins": 2.6948536038398743, "rewards/rejected": -1.9492897987365723, "step": 6265 }, { "epoch": 0.33212307529218454, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12244100.0, "logits/rejected": -28038976.0, "logps/chosen": -301.7858072916667, "logps/rejected": -233.7315673828125, "loss": 0.4068, "rewards/chosen": 0.05268465975920359, "rewards/margins": 2.113206500808398, "rewards/rejected": -2.0605218410491943, "step": 6266 }, { "epoch": 0.3321760792939867, "grad_norm": 53.0, "kl": 3.3465232849121094, "learning_rate": 5e-07, "logits/chosen": -51498412.8, "logits/rejected": -109801088.0, "logps/chosen": -596.77373046875, "logps/rejected": -319.01953125, "loss": 0.2688, "rewards/chosen": 0.8846683502197266, "rewards/margins": 3.1549644470214844, "rewards/rejected": -2.270296096801758, "step": 6267 }, { "epoch": 0.3322290832957888, "grad_norm": 67.0, "kl": 0.13483428955078125, "learning_rate": 5e-07, "logits/chosen": -46899397.333333336, "logits/rejected": -43034756.0, "logps/chosen": -578.0132649739584, "logps/rejected": -394.23138427734375, "loss": 0.2668, "rewards/chosen": 0.9666046301523844, "rewards/margins": 3.489820877710978, "rewards/rejected": -2.5232162475585938, "step": 6268 }, { "epoch": 0.33228208729759096, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39315472.0, "logits/rejected": -31332064.0, "logps/chosen": -507.3768615722656, "logps/rejected": -400.7336120605469, "loss": 0.2697, "rewards/chosen": 0.3836174011230469, "rewards/margins": 2.571718215942383, "rewards/rejected": -2.188100814819336, "step": 6269 }, { "epoch": 0.3323350912993931, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -27650040.0, "logps/rejected": -357.6257019042969, "loss": 0.1407, "rewards/rejected": -2.1784791946411133, "step": 6270 }, { "epoch": 0.33238809530119523, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15370804.0, "logits/rejected": -44221616.0, "logps/chosen": -285.53387451171875, "logps/rejected": -273.947021484375, "loss": 0.2706, "rewards/chosen": -0.1326141357421875, "rewards/margins": 2.0357398986816406, "rewards/rejected": -2.168354034423828, "step": 6271 }, { "epoch": 0.33244109930299737, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14132376.0, "logits/rejected": -31062528.0, "logps/chosen": -346.0869445800781, "logps/rejected": -453.4879150390625, "loss": 0.1918, "rewards/chosen": 1.3485183715820312, "rewards/margins": 3.039829730987549, "rewards/rejected": -1.6913113594055176, "step": 6272 }, { "epoch": 0.3324941033047995, "grad_norm": 39.75, "kl": 1.5314407348632812, "learning_rate": 5e-07, "logits/chosen": -31106464.0, "logits/rejected": -53491220.0, "logps/chosen": -229.751708984375, "logps/rejected": -481.5703125, "loss": 0.2846, "rewards/chosen": 0.15792015194892883, "rewards/margins": 2.4100854694843292, "rewards/rejected": -2.2521653175354004, "step": 6273 }, { "epoch": 0.33254710730660164, "grad_norm": 53.5, "kl": 0.7833099365234375, "learning_rate": 5e-07, "logits/chosen": -40639267.2, "logits/rejected": -31822141.333333332, "logps/chosen": -239.811767578125, "logps/rejected": -394.1129557291667, "loss": 0.329, "rewards/chosen": 0.40468320846557615, "rewards/margins": 2.9613759676615397, "rewards/rejected": -2.5566927591959634, "step": 6274 }, { "epoch": 0.3326001113084038, "grad_norm": 45.0, "kl": 1.1494903564453125, "learning_rate": 5e-07, "logits/chosen": -18889144.0, "logits/rejected": -530726.375, "logps/chosen": -331.3318176269531, "logps/rejected": -301.84100341796875, "loss": 0.2866, "rewards/chosen": 0.6439580917358398, "rewards/margins": 2.0479636192321777, "rewards/rejected": -1.404005527496338, "step": 6275 }, { "epoch": 0.3326531153102059, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45063524.0, "logits/rejected": -25731868.0, "logps/chosen": -322.88153076171875, "logps/rejected": -376.6856689453125, "loss": 0.2079, "rewards/chosen": 0.7775977849960327, "rewards/margins": 3.5827029943466187, "rewards/rejected": -2.805105209350586, "step": 6276 }, { "epoch": 0.33270611931200805, "grad_norm": 60.5, "kl": 0.6047725677490234, "learning_rate": 5e-07, "logits/chosen": 1620044.8, "logits/rejected": -12981288.0, "logps/chosen": -210.9218017578125, "logps/rejected": -385.360107421875, "loss": 0.4586, "rewards/chosen": 0.03594410419464111, "rewards/margins": 0.6205434401830038, "rewards/rejected": -0.5845993359883627, "step": 6277 }, { "epoch": 0.3327591233138102, "grad_norm": 60.75, "kl": 0.7653255462646484, "learning_rate": 5e-07, "logits/chosen": -10542674.0, "logits/rejected": -10412676.0, "logps/chosen": -252.42166137695312, "logps/rejected": -377.1566467285156, "loss": 0.3436, "rewards/chosen": 0.2170102894306183, "rewards/margins": 1.53436878323555, "rewards/rejected": -1.3173584938049316, "step": 6278 }, { "epoch": 0.3328121273156123, "grad_norm": 49.0, "kl": 1.2814302444458008, "learning_rate": 5e-07, "logits/chosen": -15582505.333333334, "logits/rejected": -31502526.0, "logps/chosen": -276.85479736328125, "logps/rejected": -296.0865783691406, "loss": 0.3744, "rewards/chosen": 0.5219058990478516, "rewards/margins": 2.082633137702942, "rewards/rejected": -1.5607272386550903, "step": 6279 }, { "epoch": 0.33286513131741446, "grad_norm": 44.75, "kl": 0.17329025268554688, "learning_rate": 5e-07, "logits/chosen": -34438234.666666664, "logits/rejected": -17322878.4, "logps/chosen": -319.19781494140625, "logps/rejected": -388.9428955078125, "loss": 0.2423, "rewards/chosen": 0.31790949900945026, "rewards/margins": 2.498688574632009, "rewards/rejected": -2.1807790756225587, "step": 6280 }, { "epoch": 0.3329181353192166, "grad_norm": 50.75, "kl": 1.6364517211914062, "learning_rate": 5e-07, "logits/chosen": -34919832.0, "logits/rejected": -1959548.0, "logps/chosen": -330.3661702473958, "logps/rejected": -83.97967529296875, "loss": 0.3557, "rewards/chosen": 0.417718768119812, "rewards/margins": 1.8580511808395386, "rewards/rejected": -1.4403324127197266, "step": 6281 }, { "epoch": 0.33297113932101874, "grad_norm": 51.25, "kl": 0.4352855682373047, "learning_rate": 5e-07, "logits/chosen": -60863603.2, "logits/rejected": -28813893.333333332, "logps/chosen": -252.779931640625, "logps/rejected": -176.34903971354166, "loss": 0.3192, "rewards/chosen": 0.7193719387054444, "rewards/margins": 1.7671520551045736, "rewards/rejected": -1.0477801163991292, "step": 6282 }, { "epoch": 0.3330241433228209, "grad_norm": 45.75, "kl": 0.7470226287841797, "learning_rate": 5e-07, "logits/chosen": 4981735.0, "logits/rejected": -10777067.42857143, "logps/chosen": -353.4465637207031, "logps/rejected": -305.0590122767857, "loss": 0.1361, "rewards/chosen": 2.05411696434021, "rewards/margins": 4.128672497613089, "rewards/rejected": -2.0745555332728793, "step": 6283 }, { "epoch": 0.333077147324623, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11186160.0, "logits/rejected": -7481634.0, "logps/chosen": -101.41827392578125, "logps/rejected": -103.41351318359375, "loss": 0.4413, "rewards/chosen": -0.28918904066085815, "rewards/margins": 0.6119179129600525, "rewards/rejected": -0.9011069536209106, "step": 6284 }, { "epoch": 0.33313015132642515, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41799728.0, "logits/rejected": -6109749.333333333, "logps/chosen": -262.0030029296875, "logps/rejected": -86.84415690104167, "loss": 0.3468, "rewards/chosen": 0.24874391555786132, "rewards/margins": 2.1658705711364745, "rewards/rejected": -1.9171266555786133, "step": 6285 }, { "epoch": 0.3331831553282273, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39241760.0, "logits/rejected": -12259221.333333334, "logps/chosen": -161.987060546875, "logps/rejected": -349.753662109375, "loss": 0.2479, "rewards/chosen": -0.37490978837013245, "rewards/margins": 1.7347327172756195, "rewards/rejected": -2.109642505645752, "step": 6286 }, { "epoch": 0.3332361593300294, "grad_norm": 47.75, "kl": 1.281381607055664, "learning_rate": 5e-07, "logits/chosen": -33547976.0, "logits/rejected": -19719732.0, "logps/chosen": -228.18043518066406, "logps/rejected": -372.85498046875, "loss": 0.3284, "rewards/chosen": 0.1746927797794342, "rewards/margins": 2.377167373895645, "rewards/rejected": -2.202474594116211, "step": 6287 }, { "epoch": 0.33328916333183156, "grad_norm": 61.5, "kl": 0.1122903823852539, "learning_rate": 5e-07, "logits/chosen": -27420968.0, "logits/rejected": -10541871.2, "logps/chosen": -506.98876953125, "logps/rejected": -205.040087890625, "loss": 0.3045, "rewards/chosen": 0.3403300444285075, "rewards/margins": 1.7337712446848552, "rewards/rejected": -1.3934412002563477, "step": 6288 }, { "epoch": 0.3333421673336337, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45454176.0, "logits/rejected": -39388052.0, "logps/chosen": -155.8766072591146, "logps/rejected": -487.281005859375, "loss": 0.4699, "rewards/chosen": -0.23939446608225504, "rewards/margins": 1.2067588170369465, "rewards/rejected": -1.4461532831192017, "step": 6289 }, { "epoch": 0.33339517133543584, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49809352.0, "logits/rejected": -20916520.0, "logps/chosen": -600.6336059570312, "logps/rejected": -301.7508544921875, "loss": 0.1938, "rewards/chosen": 0.5924744009971619, "rewards/margins": 2.765448033809662, "rewards/rejected": -2.1729736328125, "step": 6290 }, { "epoch": 0.333448175337238, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7658633.333333333, "logits/rejected": -60724496.0, "logps/chosen": -274.4364420572917, "logps/rejected": -700.6597900390625, "loss": 0.3956, "rewards/chosen": 0.04696127772331238, "rewards/margins": 2.255262106657028, "rewards/rejected": -2.208300828933716, "step": 6291 }, { "epoch": 0.3335011793390401, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75200168.0, "logits/rejected": -15016570.666666666, "logps/chosen": -111.67173767089844, "logps/rejected": -270.5664469401042, "loss": 0.2987, "rewards/chosen": -0.0009527206420898438, "rewards/margins": 1.3659367561340332, "rewards/rejected": -1.366889476776123, "step": 6292 }, { "epoch": 0.33355418334084225, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7165044.0, "logits/rejected": -16391595.2, "logps/chosen": -128.57744344075522, "logps/rejected": -219.534375, "loss": 0.2874, "rewards/chosen": 0.33618418375651044, "rewards/margins": 1.8267678896586101, "rewards/rejected": -1.4905837059020997, "step": 6293 }, { "epoch": 0.3336071873426444, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46545916.0, "logits/rejected": -74904458.66666667, "logps/chosen": -228.9003448486328, "logps/rejected": -460.2926432291667, "loss": 0.193, "rewards/chosen": 0.725487470626831, "rewards/margins": 3.103250583012899, "rewards/rejected": -2.377763112386068, "step": 6294 }, { "epoch": 0.3336601913444465, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12976434.666666666, "logits/rejected": 2771431.4, "logps/chosen": -327.501708984375, "logps/rejected": -147.11187744140625, "loss": 0.3586, "rewards/chosen": 0.2678837974866231, "rewards/margins": 1.139001484711965, "rewards/rejected": -0.8711176872253418, "step": 6295 }, { "epoch": 0.33371319534624866, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21831150.666666668, "logits/rejected": -27429504.0, "logps/chosen": -329.0497233072917, "logps/rejected": -450.1844482421875, "loss": 0.2995, "rewards/chosen": 0.6105604569117228, "rewards/margins": 3.2515869537989297, "rewards/rejected": -2.641026496887207, "step": 6296 }, { "epoch": 0.3337661993480508, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46312688.0, "logits/rejected": -46621920.0, "logps/chosen": -281.8974609375, "logps/rejected": -340.331591796875, "loss": 0.2366, "rewards/chosen": 0.26948702335357666, "rewards/margins": 2.4381954431533814, "rewards/rejected": -2.1687084197998048, "step": 6297 }, { "epoch": 0.33381920334985293, "grad_norm": 61.0, "kl": 0.4919471740722656, "learning_rate": 5e-07, "logits/chosen": -31094000.0, "logits/rejected": -26694508.8, "logps/chosen": -500.6072998046875, "logps/rejected": -209.51748046875, "loss": 0.3028, "rewards/chosen": -0.07415262361367543, "rewards/margins": 1.723826731244723, "rewards/rejected": -1.7979793548583984, "step": 6298 }, { "epoch": 0.33387220735165507, "grad_norm": 48.75, "kl": 0.44228363037109375, "learning_rate": 5e-07, "logits/chosen": -65116117.333333336, "logits/rejected": -43438499.2, "logps/chosen": -548.0283203125, "logps/rejected": -276.5654296875, "loss": 0.2819, "rewards/chosen": 0.6472908655802408, "rewards/margins": 1.9213526407877604, "rewards/rejected": -1.2740617752075196, "step": 6299 }, { "epoch": 0.3339252113534572, "grad_norm": 55.0, "kl": 0.157196044921875, "learning_rate": 5e-07, "logits/chosen": -13281232.0, "logits/rejected": -37633176.0, "logps/chosen": -285.0518798828125, "logps/rejected": -270.85699462890625, "loss": 0.3028, "rewards/chosen": -0.4564453065395355, "rewards/margins": 1.1462649405002594, "rewards/rejected": -1.602710247039795, "step": 6300 }, { "epoch": 0.33397821535525934, "grad_norm": 46.25, "kl": 0.7272729873657227, "learning_rate": 5e-07, "logits/chosen": -23579476.0, "logits/rejected": -19702140.0, "logps/chosen": -267.50543212890625, "logps/rejected": -269.7957763671875, "loss": 0.3164, "rewards/chosen": 0.15868797898292542, "rewards/margins": 1.9759973585605621, "rewards/rejected": -1.8173093795776367, "step": 6301 }, { "epoch": 0.3340312193570615, "grad_norm": 55.5, "kl": 1.3001213073730469, "learning_rate": 5e-07, "logits/chosen": 894736.0, "logits/rejected": -47810122.666666664, "logps/chosen": -681.335009765625, "logps/rejected": -447.2642822265625, "loss": 0.2625, "rewards/chosen": 0.8671436309814453, "rewards/margins": 2.940798759460449, "rewards/rejected": -2.073655128479004, "step": 6302 }, { "epoch": 0.3340842233588636, "grad_norm": 54.0, "kl": 0.38193511962890625, "learning_rate": 5e-07, "logits/chosen": -27182592.0, "logits/rejected": -36879948.0, "logps/chosen": -370.16851806640625, "logps/rejected": -462.799072265625, "loss": 0.2991, "rewards/chosen": 0.2718844413757324, "rewards/margins": 2.7984962463378906, "rewards/rejected": -2.526611804962158, "step": 6303 }, { "epoch": 0.33413722736066576, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2869008.6666666665, "logits/rejected": -52923896.0, "logps/chosen": -100.29006958007812, "logps/rejected": -833.19873046875, "loss": 0.4394, "rewards/chosen": -0.2641603946685791, "rewards/margins": 2.802105665206909, "rewards/rejected": -3.0662660598754883, "step": 6304 }, { "epoch": 0.3341902313624679, "grad_norm": 38.75, "kl": 0.6296482086181641, "learning_rate": 5e-07, "logits/chosen": -15785324.0, "logits/rejected": -51674788.0, "logps/chosen": -224.16555786132812, "logps/rejected": -528.29638671875, "loss": 0.1739, "rewards/chosen": 1.0084717273712158, "rewards/margins": 4.1308510303497314, "rewards/rejected": -3.1223793029785156, "step": 6305 }, { "epoch": 0.33424323536427, "grad_norm": 63.0, "kl": 0.07279586791992188, "learning_rate": 5e-07, "logits/chosen": -33495333.333333332, "logits/rejected": -22903571.2, "logps/chosen": -196.54683430989584, "logps/rejected": -175.8409423828125, "loss": 0.2927, "rewards/chosen": 0.01990052064259847, "rewards/margins": 1.667237099011739, "rewards/rejected": -1.6473365783691407, "step": 6306 }, { "epoch": 0.3342962393660721, "grad_norm": 50.0, "kl": 0.019351959228515625, "learning_rate": 5e-07, "logits/chosen": -34614726.4, "logits/rejected": -6318596.0, "logps/chosen": -317.221435546875, "logps/rejected": -139.25750732421875, "loss": 0.3881, "rewards/chosen": 0.14244518280029297, "rewards/margins": 1.4138868808746339, "rewards/rejected": -1.2714416980743408, "step": 6307 }, { "epoch": 0.33434924336787425, "grad_norm": 50.5, "kl": 1.7052345275878906, "learning_rate": 5e-07, "logits/chosen": -17107710.4, "logits/rejected": -8843058.0, "logps/chosen": -532.966015625, "logps/rejected": -290.93572998046875, "loss": 0.3686, "rewards/chosen": 0.3465726852416992, "rewards/margins": 1.6039656480153401, "rewards/rejected": -1.2573929627736409, "step": 6308 }, { "epoch": 0.3344022473696764, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12747501.333333334, "logits/rejected": -29087206.4, "logps/chosen": -101.49644978841145, "logps/rejected": -329.62138671875, "loss": 0.323, "rewards/chosen": -0.10810807347297668, "rewards/margins": 1.5791681945323943, "rewards/rejected": -1.687276268005371, "step": 6309 }, { "epoch": 0.3344552513714785, "grad_norm": 48.25, "kl": 0.1498394012451172, "learning_rate": 5e-07, "logits/chosen": -21430606.666666668, "logits/rejected": 5092867.5, "logps/chosen": -143.6905517578125, "logps/rejected": -79.76976776123047, "loss": 0.4636, "rewards/chosen": -0.004712194204330444, "rewards/margins": 0.718206912279129, "rewards/rejected": -0.7229191064834595, "step": 6310 }, { "epoch": 0.33450825537328066, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1649488.375, "logits/rejected": 3860685.714285714, "logps/chosen": -342.7261962890625, "logps/rejected": -230.53245326450892, "loss": 0.2366, "rewards/chosen": 1.543939232826233, "rewards/margins": 2.8176061596189226, "rewards/rejected": -1.2736669267926897, "step": 6311 }, { "epoch": 0.3345612593750828, "grad_norm": 61.0, "kl": 0.45897674560546875, "learning_rate": 5e-07, "logits/chosen": -16446452.0, "logits/rejected": -21893334.0, "logps/chosen": -344.813232421875, "logps/rejected": -257.07281494140625, "loss": 0.3697, "rewards/chosen": 0.20157788197199503, "rewards/margins": 2.5468804637591043, "rewards/rejected": -2.3453025817871094, "step": 6312 }, { "epoch": 0.33461426337688494, "grad_norm": 51.5, "kl": 0.15871047973632812, "learning_rate": 5e-07, "logits/chosen": -45187203.2, "logits/rejected": -30535834.666666668, "logps/chosen": -258.7396484375, "logps/rejected": -311.72739664713544, "loss": 0.3443, "rewards/chosen": 0.20881190299987792, "rewards/margins": 1.9673520565032958, "rewards/rejected": -1.758540153503418, "step": 6313 }, { "epoch": 0.33466726737868707, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3659167.2, "logits/rejected": -12124501.333333334, "logps/chosen": -39.33903198242187, "logps/rejected": -559.1413167317709, "loss": 0.3759, "rewards/chosen": -0.22636330127716064, "rewards/margins": 2.2575448751449585, "rewards/rejected": -2.483908176422119, "step": 6314 }, { "epoch": 0.3347202713804892, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 170170762.66666666, "logits/rejected": -58272000.0, "logps/chosen": -863.79052734375, "logps/rejected": -298.7062744140625, "loss": 0.2829, "rewards/chosen": 0.655654509862264, "rewards/margins": 2.239762290318807, "rewards/rejected": -1.5841077804565429, "step": 6315 }, { "epoch": 0.33477327538229135, "grad_norm": 45.75, "kl": 0.8564548492431641, "learning_rate": 5e-07, "logits/chosen": -42843522.666666664, "logits/rejected": -26798680.0, "logps/chosen": -345.4389241536458, "logps/rejected": -211.05458984375, "loss": 0.307, "rewards/chosen": 0.35006554921468097, "rewards/margins": 1.5434138615926105, "rewards/rejected": -1.1933483123779296, "step": 6316 }, { "epoch": 0.3348262793840935, "grad_norm": 38.5, "kl": 0.49771690368652344, "learning_rate": 5e-07, "logits/chosen": -20233812.8, "logits/rejected": -20415001.333333332, "logps/chosen": -256.346337890625, "logps/rejected": -206.162109375, "loss": 0.3081, "rewards/chosen": 0.7474283695220947, "rewards/margins": 2.2687518914540608, "rewards/rejected": -1.521323521931966, "step": 6317 }, { "epoch": 0.3348792833858956, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79889493.33333333, "logits/rejected": -5957670.8, "logps/chosen": -137.8369140625, "logps/rejected": -332.36376953125, "loss": 0.3404, "rewards/chosen": 0.09291597207387288, "rewards/margins": 1.7177420218785604, "rewards/rejected": -1.6248260498046876, "step": 6318 }, { "epoch": 0.33493228738769776, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40037044.0, "logits/rejected": -20558268.0, "logps/chosen": -238.3302001953125, "logps/rejected": -131.62513732910156, "loss": 0.3177, "rewards/chosen": 0.2846015989780426, "rewards/margins": 1.7532439529895782, "rewards/rejected": -1.4686423540115356, "step": 6319 }, { "epoch": 0.3349852913894999, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10806717.333333334, "logits/rejected": 834968.0, "logps/chosen": -468.0574544270833, "logps/rejected": -297.9119384765625, "loss": 0.3299, "rewards/chosen": 0.36655529340108234, "rewards/margins": 1.5368927796681722, "rewards/rejected": -1.1703374862670899, "step": 6320 }, { "epoch": 0.33503829539130203, "grad_norm": 64.5, "kl": 0.824310302734375, "learning_rate": 5e-07, "logits/chosen": -77231123.2, "logits/rejected": 5193146.666666667, "logps/chosen": -306.317041015625, "logps/rejected": -223.9649861653646, "loss": 0.4283, "rewards/chosen": -0.07723586559295655, "rewards/margins": 0.9198280890782674, "rewards/rejected": -0.997063954671224, "step": 6321 }, { "epoch": 0.33509129939310417, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1091266.0, "logits/rejected": -19052940.0, "logps/chosen": -341.72332763671875, "logps/rejected": -243.71200561523438, "loss": 0.3831, "rewards/chosen": 0.3123830556869507, "rewards/margins": 1.27879136800766, "rewards/rejected": -0.9664083123207092, "step": 6322 }, { "epoch": 0.3351443033949063, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16468450.0, "logits/rejected": -15830976.0, "logps/chosen": -289.84417724609375, "logps/rejected": -254.7649943033854, "loss": 0.1904, "rewards/chosen": 0.4608810544013977, "rewards/margins": 2.840997835000356, "rewards/rejected": -2.3801167805989585, "step": 6323 }, { "epoch": 0.33519730739670844, "grad_norm": 45.25, "kl": 0.18895339965820312, "learning_rate": 5e-07, "logits/chosen": -26374733.333333332, "logits/rejected": -23389462.4, "logps/chosen": -241.7105712890625, "logps/rejected": -283.5796875, "loss": 0.3201, "rewards/chosen": 0.675915797551473, "rewards/margins": 1.7849865754445395, "rewards/rejected": -1.1090707778930664, "step": 6324 }, { "epoch": 0.3352503113985106, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77644896.0, "logits/rejected": -27296691.2, "logps/chosen": -618.0445556640625, "logps/rejected": -472.52265625, "loss": 0.1963, "rewards/chosen": 0.7439931233723959, "rewards/margins": 3.724271519978841, "rewards/rejected": -2.9802783966064452, "step": 6325 }, { "epoch": 0.3353033154003127, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 46858280.0, "logits/rejected": -41274682.666666664, "logps/chosen": -503.10540771484375, "logps/rejected": -294.3416748046875, "loss": 0.1824, "rewards/chosen": 0.37777405977249146, "rewards/margins": 3.0065142114957175, "rewards/rejected": -2.628740151723226, "step": 6326 }, { "epoch": 0.33535631940211486, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10047082.666666666, "logits/rejected": -8937147.2, "logps/chosen": -285.36850992838544, "logps/rejected": -274.3548583984375, "loss": 0.2293, "rewards/chosen": 0.7529250780741373, "rewards/margins": 2.5562028566996258, "rewards/rejected": -1.8032777786254883, "step": 6327 }, { "epoch": 0.335409323403917, "grad_norm": 50.25, "kl": 1.2909717559814453, "learning_rate": 5e-07, "logits/chosen": -6758750.4, "logits/rejected": -25719586.666666668, "logps/chosen": -321.836669921875, "logps/rejected": -348.2910970052083, "loss": 0.3229, "rewards/chosen": 0.6758484840393066, "rewards/margins": 1.7982978026072185, "rewards/rejected": -1.1224493185679119, "step": 6328 }, { "epoch": 0.33546232740571913, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3610060.5, "logits/rejected": -96912200.0, "logps/chosen": -181.62022399902344, "logps/rejected": -463.475341796875, "loss": 0.3311, "rewards/chosen": -0.07001657783985138, "rewards/margins": 1.9500174969434738, "rewards/rejected": -2.020034074783325, "step": 6329 }, { "epoch": 0.33551533140752127, "grad_norm": 71.0, "kl": 1.3481941223144531, "learning_rate": 5e-07, "logits/chosen": -34105698.28571428, "logits/rejected": -6239509.0, "logps/chosen": -450.90670340401783, "logps/rejected": -136.15768432617188, "loss": 0.4869, "rewards/chosen": 0.15618235724312918, "rewards/margins": 1.170689650944301, "rewards/rejected": -1.0145072937011719, "step": 6330 }, { "epoch": 0.3355683354093234, "grad_norm": 49.75, "kl": 1.2339487075805664, "learning_rate": 5e-07, "logits/chosen": -31357145.6, "logits/rejected": -24321130.666666668, "logps/chosen": -317.254638671875, "logps/rejected": -179.50651041666666, "loss": 0.3498, "rewards/chosen": 0.3002350330352783, "rewards/margins": 1.7457458972930908, "rewards/rejected": -1.4455108642578125, "step": 6331 }, { "epoch": 0.33562133941112554, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10531136.0, "logits/rejected": -21547944.0, "logps/chosen": -111.33592224121094, "logps/rejected": -176.03921508789062, "loss": 0.3629, "rewards/chosen": -0.06468095630407333, "rewards/margins": 1.3459495082497597, "rewards/rejected": -1.410630464553833, "step": 6332 }, { "epoch": 0.3356743434129277, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35530432.0, "logits/rejected": -32101766.0, "logps/chosen": -185.39556884765625, "logps/rejected": -520.4624633789062, "loss": 0.3327, "rewards/chosen": -0.15909796953201294, "rewards/margins": 2.13387268781662, "rewards/rejected": -2.292970657348633, "step": 6333 }, { "epoch": 0.3357273474147298, "grad_norm": 54.5, "kl": 0.5326995849609375, "learning_rate": 5e-07, "logits/chosen": -22233784.0, "logits/rejected": -1881808.0, "logps/chosen": -420.57757568359375, "logps/rejected": -168.51486206054688, "loss": 0.3204, "rewards/chosen": 0.8936690092086792, "rewards/margins": 1.5539376139640808, "rewards/rejected": -0.6602686047554016, "step": 6334 }, { "epoch": 0.33578035141653195, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33693128.0, "logits/rejected": -32857032.0, "logps/chosen": -304.7794494628906, "logps/rejected": -233.54852294921875, "loss": 0.438, "rewards/chosen": -0.3726164996623993, "rewards/margins": 0.5874511301517487, "rewards/rejected": -0.960067629814148, "step": 6335 }, { "epoch": 0.3358333554183341, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6452010.4, "logits/rejected": -67208592.0, "logps/chosen": -228.398974609375, "logps/rejected": -458.560791015625, "loss": 0.2489, "rewards/chosen": 0.744901704788208, "rewards/margins": 3.3244845549265545, "rewards/rejected": -2.579582850138346, "step": 6336 }, { "epoch": 0.3358863594201362, "grad_norm": 92.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54819929.6, "logits/rejected": -61383242.666666664, "logps/chosen": -414.50830078125, "logps/rejected": -174.72135416666666, "loss": 0.3824, "rewards/chosen": -0.16539119482040404, "rewards/margins": 1.8298178633054096, "rewards/rejected": -1.9952090581258137, "step": 6337 }, { "epoch": 0.33593936342193836, "grad_norm": 45.75, "kl": 0.09663105010986328, "learning_rate": 5e-07, "logits/chosen": -44925056.0, "logits/rejected": -29299477.333333332, "logps/chosen": -328.1119689941406, "logps/rejected": -324.17578125, "loss": 0.2277, "rewards/chosen": 0.5315192937850952, "rewards/margins": 2.478025635083516, "rewards/rejected": -1.9465063412984211, "step": 6338 }, { "epoch": 0.3359923674237405, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4440260.5, "logits/rejected": -16728564.0, "logps/chosen": -219.84298706054688, "logps/rejected": -300.010498046875, "loss": 0.3608, "rewards/chosen": -0.20294228196144104, "rewards/margins": 1.76353058218956, "rewards/rejected": -1.966472864151001, "step": 6339 }, { "epoch": 0.33604537142554264, "grad_norm": 50.0, "kl": 0.3067741394042969, "learning_rate": 5e-07, "logits/chosen": -20723241.6, "logits/rejected": -26236402.666666668, "logps/chosen": -619.48818359375, "logps/rejected": -202.6425577799479, "loss": 0.2842, "rewards/chosen": 0.9369259834289551, "rewards/margins": 2.3616642316182452, "rewards/rejected": -1.4247382481892903, "step": 6340 }, { "epoch": 0.3360983754273448, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67826424.0, "logits/rejected": -10819943.333333334, "logps/chosen": -455.9271240234375, "logps/rejected": -221.3800252278646, "loss": 0.3103, "rewards/chosen": 0.01818162202835083, "rewards/margins": 1.5952423214912415, "rewards/rejected": -1.5770606994628906, "step": 6341 }, { "epoch": 0.3361513794291469, "grad_norm": 55.0, "kl": 0.5615043640136719, "learning_rate": 5e-07, "logits/chosen": -12494094.857142856, "logits/rejected": -23776918.0, "logps/chosen": -251.97919573102678, "logps/rejected": -409.2715759277344, "loss": 0.4297, "rewards/chosen": 0.31725566727774485, "rewards/margins": 1.6159892593111311, "rewards/rejected": -1.2987335920333862, "step": 6342 }, { "epoch": 0.33620438343094905, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41058917.333333336, "logits/rejected": -14859582.4, "logps/chosen": -310.5819905598958, "logps/rejected": -428.40654296875, "loss": 0.2221, "rewards/chosen": 0.6241058508555094, "rewards/margins": 2.7704696814219156, "rewards/rejected": -2.1463638305664063, "step": 6343 }, { "epoch": 0.3362573874327512, "grad_norm": 59.25, "kl": 0.13803863525390625, "learning_rate": 5e-07, "logits/chosen": -58567818.666666664, "logits/rejected": 4267070.4, "logps/chosen": -832.9251302083334, "logps/rejected": -360.5858642578125, "loss": 0.3064, "rewards/chosen": 0.24761786063512167, "rewards/margins": 1.7531836946805317, "rewards/rejected": -1.5055658340454101, "step": 6344 }, { "epoch": 0.3363103914345533, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31553622.0, "logits/rejected": -15805292.0, "logps/chosen": -307.0014953613281, "logps/rejected": -244.5969441731771, "loss": 0.2014, "rewards/chosen": 0.8921875357627869, "rewards/margins": 2.7519509991010027, "rewards/rejected": -1.859763463338216, "step": 6345 }, { "epoch": 0.33636339543635546, "grad_norm": 50.75, "kl": 0.08638763427734375, "learning_rate": 5e-07, "logits/chosen": -21699210.0, "logits/rejected": -46740250.666666664, "logps/chosen": -379.1051940917969, "logps/rejected": -296.55600992838544, "loss": 0.253, "rewards/chosen": 0.11494751274585724, "rewards/margins": 1.9343529095252354, "rewards/rejected": -1.8194053967793782, "step": 6346 }, { "epoch": 0.3364163994381576, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2306728.6, "logits/rejected": -19404853.333333332, "logps/chosen": -439.96015625, "logps/rejected": -501.7473958333333, "loss": 0.2726, "rewards/chosen": 0.5391192436218262, "rewards/margins": 3.053231716156006, "rewards/rejected": -2.5141124725341797, "step": 6347 }, { "epoch": 0.33646940343995974, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10670922.0, "logits/rejected": -32020612.0, "logps/chosen": -421.1022135416667, "logps/rejected": -415.7929382324219, "loss": 0.3029, "rewards/chosen": 0.7845245997111002, "rewards/margins": 2.108705202738444, "rewards/rejected": -1.3241806030273438, "step": 6348 }, { "epoch": 0.3365224074417619, "grad_norm": 36.5, "kl": 0.3401374816894531, "learning_rate": 5e-07, "logits/chosen": -10351739.333333334, "logits/rejected": -37857196.8, "logps/chosen": -311.89101155598956, "logps/rejected": -566.15673828125, "loss": 0.2395, "rewards/chosen": 0.9264672597249349, "rewards/margins": 3.8486428578694665, "rewards/rejected": -2.9221755981445314, "step": 6349 }, { "epoch": 0.336575411443564, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45786640.0, "logits/rejected": -33301858.666666668, "logps/chosen": -399.9956787109375, "logps/rejected": -361.1992594401042, "loss": 0.3266, "rewards/chosen": 0.30489578247070315, "rewards/margins": 2.010195795694987, "rewards/rejected": -1.705300013224284, "step": 6350 }, { "epoch": 0.33662841544536615, "grad_norm": 49.0, "kl": 0.563720703125, "learning_rate": 5e-07, "logits/chosen": -47713760.0, "logits/rejected": -67204262.4, "logps/chosen": -384.181640625, "logps/rejected": -336.918505859375, "loss": 0.2015, "rewards/chosen": 1.1252329349517822, "rewards/margins": 2.9298080921173097, "rewards/rejected": -1.8045751571655273, "step": 6351 }, { "epoch": 0.3366814194471683, "grad_norm": 42.75, "kl": 0.5898666381835938, "learning_rate": 5e-07, "logits/chosen": -21646050.666666668, "logits/rejected": -3580112.0, "logps/chosen": -288.92901611328125, "logps/rejected": -272.9490051269531, "loss": 0.3365, "rewards/chosen": 0.4533408085505168, "rewards/margins": 3.2521832386652627, "rewards/rejected": -2.798842430114746, "step": 6352 }, { "epoch": 0.3367344234489704, "grad_norm": 58.0, "kl": 0.7560253143310547, "learning_rate": 5e-07, "logits/chosen": -24709339.42857143, "logits/rejected": 7901928.0, "logps/chosen": -405.15175083705356, "logps/rejected": -192.09048461914062, "loss": 0.4351, "rewards/chosen": 0.17372064931052073, "rewards/margins": 2.3391352210726057, "rewards/rejected": -2.165414571762085, "step": 6353 }, { "epoch": 0.33678742745077256, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42739894.4, "logits/rejected": -32242216.0, "logps/chosen": -244.127001953125, "logps/rejected": -253.3221435546875, "loss": 0.317, "rewards/chosen": 0.39849808216094973, "rewards/margins": 2.310475532213847, "rewards/rejected": -1.9119774500528972, "step": 6354 }, { "epoch": 0.3368404314525747, "grad_norm": 47.25, "kl": 0.05033111572265625, "learning_rate": 5e-07, "logits/chosen": -7135706.666666667, "logits/rejected": -49360633.6, "logps/chosen": -218.2856241861979, "logps/rejected": -425.307080078125, "loss": 0.211, "rewards/chosen": 0.7477267583211263, "rewards/margins": 3.113332303365072, "rewards/rejected": -2.3656055450439455, "step": 6355 }, { "epoch": 0.33689343545437683, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37091320.0, "logits/rejected": -55270704.0, "logps/chosen": -438.679931640625, "logps/rejected": -393.458740234375, "loss": 0.2619, "rewards/chosen": 0.5226508975028992, "rewards/margins": 2.3559646010398865, "rewards/rejected": -1.8333137035369873, "step": 6356 }, { "epoch": 0.3369464394561789, "grad_norm": 32.75, "kl": 0.604823112487793, "learning_rate": 5e-07, "logits/chosen": -4661535.0, "logits/rejected": -23350428.0, "logps/chosen": -71.35382080078125, "logps/rejected": -224.08383178710938, "loss": 0.4017, "rewards/chosen": -0.4347636103630066, "rewards/margins": 1.1017618775367737, "rewards/rejected": -1.5365254878997803, "step": 6357 }, { "epoch": 0.33699944345798105, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -419707.2, "logits/rejected": -8524044.0, "logps/chosen": -359.825146484375, "logps/rejected": -146.83231608072916, "loss": 0.4113, "rewards/chosen": -0.30594696998596194, "rewards/margins": 1.576819594701131, "rewards/rejected": -1.882766564687093, "step": 6358 }, { "epoch": 0.3370524474597832, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41151320.0, "logits/rejected": -12326465.0, "logps/chosen": -311.4048156738281, "logps/rejected": -98.27259826660156, "loss": 0.3385, "rewards/chosen": 0.00619873870164156, "rewards/margins": 1.575082634575665, "rewards/rejected": -1.5688838958740234, "step": 6359 }, { "epoch": 0.3371054514615853, "grad_norm": 50.25, "kl": 0.3473968505859375, "learning_rate": 5e-07, "logits/chosen": -17295698.666666668, "logits/rejected": -112781712.0, "logps/chosen": -382.7917887369792, "logps/rejected": -703.1411743164062, "loss": 0.3323, "rewards/chosen": 0.36218714714050293, "rewards/margins": 3.9036240577697754, "rewards/rejected": -3.5414369106292725, "step": 6360 }, { "epoch": 0.33715845546338746, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70167525.33333333, "logits/rejected": -55005977.6, "logps/chosen": -364.721923828125, "logps/rejected": -526.430322265625, "loss": 0.1905, "rewards/chosen": 0.43614808718363446, "rewards/margins": 3.8566085974375404, "rewards/rejected": -3.420460510253906, "step": 6361 }, { "epoch": 0.3372114594651896, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24599542.4, "logits/rejected": -37451728.0, "logps/chosen": -147.0485595703125, "logps/rejected": -449.8406575520833, "loss": 0.2498, "rewards/chosen": 0.6848005294799805, "rewards/margins": 3.332175636291504, "rewards/rejected": -2.6473751068115234, "step": 6362 }, { "epoch": 0.33726446346699174, "grad_norm": 52.25, "kl": 0.32213544845581055, "learning_rate": 5e-07, "logits/chosen": -40001533.333333336, "logits/rejected": -20064220.0, "logps/chosen": -313.59527587890625, "logps/rejected": -421.36529541015625, "loss": 0.3739, "rewards/chosen": 0.4125062624613444, "rewards/margins": 1.6728371779123943, "rewards/rejected": -1.2603309154510498, "step": 6363 }, { "epoch": 0.3373174674687939, "grad_norm": 53.5, "kl": 0.10975265502929688, "learning_rate": 5e-07, "logits/chosen": 633808.0, "logits/rejected": -40248888.0, "logps/chosen": -383.6694580078125, "logps/rejected": -117.41208902994792, "loss": 0.3458, "rewards/chosen": 0.5061920642852783, "rewards/margins": 1.572031354904175, "rewards/rejected": -1.0658392906188965, "step": 6364 }, { "epoch": 0.337370471470596, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53341248.0, "logits/rejected": -28630437.333333332, "logps/chosen": -475.1374206542969, "logps/rejected": -504.6666666666667, "loss": 0.171, "rewards/chosen": 0.1469421535730362, "rewards/margins": 2.839172378182411, "rewards/rejected": -2.692230224609375, "step": 6365 }, { "epoch": 0.33742347547239815, "grad_norm": 56.0, "kl": 1.8085708618164062, "learning_rate": 5e-07, "logits/chosen": -18047914.666666668, "logits/rejected": -25252412.0, "logps/chosen": -340.6019694010417, "logps/rejected": -236.55740356445312, "loss": 0.2714, "rewards/chosen": 0.8752716382344564, "rewards/margins": 2.9289069970448813, "rewards/rejected": -2.053635358810425, "step": 6366 }, { "epoch": 0.3374764794742003, "grad_norm": 43.0, "kl": 1.5843219757080078, "learning_rate": 5e-07, "logits/chosen": -16461422.0, "logits/rejected": -10429992.0, "logps/chosen": -508.0242919921875, "logps/rejected": -315.9107666015625, "loss": 0.2062, "rewards/chosen": 1.147419810295105, "rewards/margins": 3.08418611685435, "rewards/rejected": -1.9367663065592449, "step": 6367 }, { "epoch": 0.3375294834760024, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20648974.0, "logits/rejected": -26025194.0, "logps/chosen": -206.82638549804688, "logps/rejected": -563.4649658203125, "loss": 0.2852, "rewards/chosen": -0.01329611986875534, "rewards/margins": 3.117582328617573, "rewards/rejected": -3.130878448486328, "step": 6368 }, { "epoch": 0.33758248747780456, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5111101.6, "logits/rejected": -30148573.333333332, "logps/chosen": -268.380859375, "logps/rejected": -208.37239583333334, "loss": 0.3802, "rewards/chosen": -0.10823698043823242, "rewards/margins": 1.6866302172342937, "rewards/rejected": -1.7948671976725261, "step": 6369 }, { "epoch": 0.3376354914796067, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21103924.0, "logits/rejected": -15191030.0, "logps/chosen": -284.7412109375, "logps/rejected": -156.23712158203125, "loss": 0.2928, "rewards/chosen": 0.7112386226654053, "rewards/margins": 1.9151890277862549, "rewards/rejected": -1.2039504051208496, "step": 6370 }, { "epoch": 0.33768849548140883, "grad_norm": 55.0, "kl": 0.5602703094482422, "learning_rate": 5e-07, "logits/chosen": -13900393.6, "logits/rejected": -37249850.666666664, "logps/chosen": -333.475732421875, "logps/rejected": -278.94248453776044, "loss": 0.2615, "rewards/chosen": 0.6819973468780518, "rewards/margins": 2.9151541868845623, "rewards/rejected": -2.2331568400065103, "step": 6371 }, { "epoch": 0.33774149948321097, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40222773.333333336, "logits/rejected": -33772368.0, "logps/chosen": -278.9867757161458, "logps/rejected": -321.092041015625, "loss": 0.3787, "rewards/chosen": 0.16899134715398154, "rewards/margins": 2.394401808579763, "rewards/rejected": -2.2254104614257812, "step": 6372 }, { "epoch": 0.3377945034850131, "grad_norm": 39.25, "kl": 0.9904270172119141, "learning_rate": 5e-07, "logits/chosen": -26062131.2, "logits/rejected": -37148133.333333336, "logps/chosen": -227.926904296875, "logps/rejected": -377.0857747395833, "loss": 0.3123, "rewards/chosen": 0.38733317852020266, "rewards/margins": 2.471797728538513, "rewards/rejected": -2.0844645500183105, "step": 6373 }, { "epoch": 0.33784750748681525, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5823889.5, "logits/rejected": -22871725.714285713, "logps/chosen": -20.542009353637695, "logps/rejected": -346.1499720982143, "loss": 0.1836, "rewards/chosen": -0.0695001631975174, "rewards/margins": 1.9842442891427448, "rewards/rejected": -2.053744452340262, "step": 6374 }, { "epoch": 0.3379005114886174, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31059683.2, "logits/rejected": -25504096.0, "logps/chosen": -278.0130859375, "logps/rejected": -425.3870849609375, "loss": 0.2888, "rewards/chosen": 0.5435585498809814, "rewards/margins": 2.8570843855539954, "rewards/rejected": -2.313525835673014, "step": 6375 }, { "epoch": 0.3379535154904195, "grad_norm": 40.25, "kl": 0.02291107177734375, "learning_rate": 5e-07, "logits/chosen": -30313276.0, "logits/rejected": -9500415.0, "logps/chosen": -215.47842407226562, "logps/rejected": -243.7273712158203, "loss": 0.2466, "rewards/chosen": 0.6889152526855469, "rewards/margins": 2.910244941711426, "rewards/rejected": -2.221329689025879, "step": 6376 }, { "epoch": 0.33800651949222166, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20431348.8, "logits/rejected": -4574824.666666667, "logps/chosen": -226.4959228515625, "logps/rejected": -228.14959716796875, "loss": 0.3621, "rewards/chosen": 0.3551468849182129, "rewards/margins": 1.37759796778361, "rewards/rejected": -1.0224510828653972, "step": 6377 }, { "epoch": 0.3380595234940238, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 116098858.66666667, "logits/rejected": -26245044.8, "logps/chosen": -541.9695638020834, "logps/rejected": -246.5741943359375, "loss": 0.3596, "rewards/chosen": -0.6050577561060587, "rewards/margins": 1.341825064023336, "rewards/rejected": -1.9468828201293946, "step": 6378 }, { "epoch": 0.33811252749582593, "grad_norm": 59.25, "kl": 0.9468526840209961, "learning_rate": 5e-07, "logits/chosen": -129545817.6, "logits/rejected": -29317930.666666668, "logps/chosen": -428.109716796875, "logps/rejected": -288.02488199869794, "loss": 0.3295, "rewards/chosen": 0.5591710090637207, "rewards/margins": 2.0482864061991375, "rewards/rejected": -1.4891153971354167, "step": 6379 }, { "epoch": 0.33816553149762807, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79555272.0, "logits/rejected": -21194154.0, "logps/chosen": -359.0029602050781, "logps/rejected": -292.446044921875, "loss": 0.2678, "rewards/chosen": 0.4528879225254059, "rewards/margins": 2.514387756586075, "rewards/rejected": -2.061499834060669, "step": 6380 }, { "epoch": 0.3382185354994302, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45862534.4, "logits/rejected": -20134562.666666668, "logps/chosen": -544.9044921875, "logps/rejected": -228.53094482421875, "loss": 0.3105, "rewards/chosen": 0.4000434875488281, "rewards/margins": 2.5210278828938804, "rewards/rejected": -2.1209843953450522, "step": 6381 }, { "epoch": 0.33827153950123234, "grad_norm": 47.5, "kl": 1.8906440734863281, "learning_rate": 5e-07, "logits/chosen": -1100748.625, "logits/rejected": -10107960.0, "logps/chosen": -310.24591064453125, "logps/rejected": -263.6764322916667, "loss": 0.2147, "rewards/chosen": 0.9464104175567627, "rewards/margins": 2.581586440404256, "rewards/rejected": -1.6351760228474934, "step": 6382 }, { "epoch": 0.3383245435030345, "grad_norm": 31.75, "kl": 0.038906097412109375, "learning_rate": 5e-07, "logits/chosen": -2480793.6666666665, "logits/rejected": -19727500.8, "logps/chosen": -111.43865966796875, "logps/rejected": -179.2919677734375, "loss": 0.2271, "rewards/chosen": 0.39336133003234863, "rewards/margins": 2.537247323989868, "rewards/rejected": -2.1438859939575194, "step": 6383 }, { "epoch": 0.3383775475048366, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1068766.75, "logits/rejected": -14799678.857142856, "logps/chosen": -30.712160110473633, "logps/rejected": -426.51510184151783, "loss": 0.237, "rewards/chosen": -0.43631991744041443, "rewards/margins": 1.3970133704798562, "rewards/rejected": -1.8333332879202706, "step": 6384 }, { "epoch": 0.33843055150663875, "grad_norm": 53.0, "kl": 2.2773971557617188, "learning_rate": 5e-07, "logits/chosen": -30925701.333333332, "logits/rejected": -17983432.0, "logps/chosen": -280.12005615234375, "logps/rejected": -363.3597717285156, "loss": 0.3796, "rewards/chosen": 0.28928273916244507, "rewards/margins": 2.765530049800873, "rewards/rejected": -2.4762473106384277, "step": 6385 }, { "epoch": 0.3384835555084409, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11811645.333333334, "logits/rejected": -15819424.0, "logps/chosen": -756.80029296875, "logps/rejected": -184.05670166015625, "loss": 0.2793, "rewards/chosen": 0.9416900475819906, "rewards/margins": 2.258975108464559, "rewards/rejected": -1.3172850608825684, "step": 6386 }, { "epoch": 0.33853655951024303, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8412396.0, "logits/rejected": -22990712.0, "logps/chosen": -355.7078043619792, "logps/rejected": -242.1909912109375, "loss": 0.1933, "rewards/chosen": 0.662264903386434, "rewards/margins": 2.962674601872762, "rewards/rejected": -2.300409698486328, "step": 6387 }, { "epoch": 0.33858956351204517, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3181073.6666666665, "logits/rejected": -64801990.4, "logps/chosen": -142.97795613606772, "logps/rejected": -392.899169921875, "loss": 0.2569, "rewards/chosen": -0.04580406347910563, "rewards/margins": 2.1717605193456015, "rewards/rejected": -2.217564582824707, "step": 6388 }, { "epoch": 0.3386425675138473, "grad_norm": 52.0, "kl": 0.7405624389648438, "learning_rate": 5e-07, "logits/chosen": -21861620.0, "logits/rejected": -11135968.0, "logps/chosen": -198.16111755371094, "logps/rejected": -319.1824035644531, "loss": 0.3437, "rewards/chosen": 0.17820186913013458, "rewards/margins": 1.553687408566475, "rewards/rejected": -1.3754855394363403, "step": 6389 }, { "epoch": 0.33869557151564944, "grad_norm": 43.25, "kl": 1.7763938903808594, "learning_rate": 5e-07, "logits/chosen": -25999061.333333332, "logits/rejected": -18504600.0, "logps/chosen": -271.81528727213544, "logps/rejected": -185.640087890625, "loss": 0.3033, "rewards/chosen": 0.8213719526926676, "rewards/margins": 2.035886017481486, "rewards/rejected": -1.2145140647888184, "step": 6390 }, { "epoch": 0.3387485755174516, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12570388.8, "logits/rejected": -43396109.333333336, "logps/chosen": -269.942578125, "logps/rejected": -203.12154134114584, "loss": 0.3867, "rewards/chosen": -0.12759463787078856, "rewards/margins": 1.6993441502253215, "rewards/rejected": -1.82693878809611, "step": 6391 }, { "epoch": 0.3388015795192537, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26474636.0, "logits/rejected": -30947691.42857143, "logps/chosen": -127.83525085449219, "logps/rejected": -452.60951450892856, "loss": 0.1694, "rewards/chosen": 0.02615661732852459, "rewards/margins": 2.556353760883212, "rewards/rejected": -2.5301971435546875, "step": 6392 }, { "epoch": 0.33885458352105585, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16920169.6, "logits/rejected": -643451.25, "logps/chosen": -233.6735595703125, "logps/rejected": -147.12847900390625, "loss": 0.367, "rewards/chosen": 0.30672903060913087, "rewards/margins": 1.594170045852661, "rewards/rejected": -1.2874410152435303, "step": 6393 }, { "epoch": 0.338907587522858, "grad_norm": 44.75, "kl": 0.1723775863647461, "learning_rate": 5e-07, "logits/chosen": 3658727.25, "logits/rejected": -39637277.333333336, "logps/chosen": -408.47509765625, "logps/rejected": -317.3179931640625, "loss": 0.2515, "rewards/chosen": 0.2519628703594208, "rewards/margins": 2.0505996247132616, "rewards/rejected": -1.798636754353841, "step": 6394 }, { "epoch": 0.3389605915246601, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73655120.0, "logits/rejected": -10078966.0, "logps/chosen": -553.9169921875, "logps/rejected": -456.1903076171875, "loss": 0.2288, "rewards/chosen": 0.24702148139476776, "rewards/margins": 2.5738567958275476, "rewards/rejected": -2.32683531443278, "step": 6395 }, { "epoch": 0.33901359552646226, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11254101.0, "logits/rejected": -35722948.0, "logps/chosen": -279.0389099121094, "logps/rejected": -356.1346130371094, "loss": 0.3057, "rewards/chosen": 0.09490565955638885, "rewards/margins": 2.2865107506513596, "rewards/rejected": -2.1916050910949707, "step": 6396 }, { "epoch": 0.3390665995282644, "grad_norm": 119.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8119819.2, "logits/rejected": -24572056.0, "logps/chosen": -746.89140625, "logps/rejected": -182.98514811197916, "loss": 0.4034, "rewards/chosen": -0.06795722246170044, "rewards/margins": 1.7630404035250347, "rewards/rejected": -1.830997625986735, "step": 6397 }, { "epoch": 0.33911960353006654, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11205479.0, "logits/rejected": -20241654.666666668, "logps/chosen": -458.93255615234375, "logps/rejected": -206.16658528645834, "loss": 0.2619, "rewards/chosen": -0.28564608097076416, "rewards/margins": 1.5964932839075725, "rewards/rejected": -1.8821393648783367, "step": 6398 }, { "epoch": 0.3391726075318687, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -30994372.0, "logps/rejected": -214.00416564941406, "loss": 0.2248, "rewards/rejected": -1.434723138809204, "step": 6399 }, { "epoch": 0.3392256115336708, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46135472.0, "logits/rejected": -32913624.0, "logps/chosen": -330.2539367675781, "logps/rejected": -293.9767150878906, "loss": 0.2446, "rewards/chosen": 0.6379438638687134, "rewards/margins": 2.7418776750564575, "rewards/rejected": -2.103933811187744, "step": 6400 }, { "epoch": 0.33927861553547295, "grad_norm": 40.0, "kl": 0.358154296875, "learning_rate": 5e-07, "logits/chosen": -39276548.0, "logits/rejected": -22958444.0, "logps/chosen": -482.956787109375, "logps/rejected": -282.0391845703125, "loss": 0.145, "rewards/chosen": 1.3461878299713135, "rewards/margins": 4.203168630599976, "rewards/rejected": -2.856980800628662, "step": 6401 }, { "epoch": 0.3393316195372751, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7658384.0, "logits/rejected": -18349753.14285714, "logps/chosen": -81.3258056640625, "logps/rejected": -151.07432338169642, "loss": 0.2563, "rewards/chosen": -0.09057464450597763, "rewards/margins": 1.2914852702191897, "rewards/rejected": -1.3820599147251673, "step": 6402 }, { "epoch": 0.3393846235390772, "grad_norm": 46.0, "kl": 0.31293201446533203, "learning_rate": 5e-07, "logits/chosen": -43715065.6, "logits/rejected": -21304149.333333332, "logps/chosen": -266.42294921875, "logps/rejected": -108.76035563151042, "loss": 0.3338, "rewards/chosen": 0.2921242952346802, "rewards/margins": 1.9667121807734174, "rewards/rejected": -1.674587885538737, "step": 6403 }, { "epoch": 0.33943762754087936, "grad_norm": 41.0, "kl": 0.13023757934570312, "learning_rate": 5e-07, "logits/chosen": -23215139.2, "logits/rejected": -13373214.666666666, "logps/chosen": -193.947705078125, "logps/rejected": -235.32462565104166, "loss": 0.3406, "rewards/chosen": 0.47217607498168945, "rewards/margins": 1.8484646479288738, "rewards/rejected": -1.3762885729471843, "step": 6404 }, { "epoch": 0.3394906315426815, "grad_norm": 49.0, "kl": 1.0435066223144531, "learning_rate": 5e-07, "logits/chosen": -18914316.0, "logits/rejected": -13691635.2, "logps/chosen": -73.14334615071614, "logps/rejected": -284.5705322265625, "loss": 0.3267, "rewards/chosen": 0.0907888412475586, "rewards/margins": 1.5664247512817382, "rewards/rejected": -1.4756359100341796, "step": 6405 }, { "epoch": 0.33954363554448364, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25269958.4, "logits/rejected": -36617274.666666664, "logps/chosen": -126.35960693359375, "logps/rejected": -188.47306315104166, "loss": 0.4087, "rewards/chosen": 0.0643460512161255, "rewards/margins": 1.1291742086410523, "rewards/rejected": -1.0648281574249268, "step": 6406 }, { "epoch": 0.3395966395462858, "grad_norm": 42.75, "kl": 0.04921531677246094, "learning_rate": 5e-07, "logits/chosen": -41536584.0, "logits/rejected": -37683689.6, "logps/chosen": -294.2377522786458, "logps/rejected": -216.1553955078125, "loss": 0.2985, "rewards/chosen": 0.2163010835647583, "rewards/margins": 1.7545214891433716, "rewards/rejected": -1.5382204055786133, "step": 6407 }, { "epoch": 0.33964964354808785, "grad_norm": 40.75, "kl": 0.7059049606323242, "learning_rate": 5e-07, "logits/chosen": -2728297.3333333335, "logits/rejected": -146661.6, "logps/chosen": -348.9490559895833, "logps/rejected": -155.4782958984375, "loss": 0.3168, "rewards/chosen": 0.3840107520421346, "rewards/margins": 1.9791173537572224, "rewards/rejected": -1.595106601715088, "step": 6408 }, { "epoch": 0.33970264754989, "grad_norm": 54.0, "kl": 0.7576360702514648, "learning_rate": 5e-07, "logits/chosen": -36244416.0, "logits/rejected": -22900928.0, "logps/chosen": -486.974462890625, "logps/rejected": -179.3671671549479, "loss": 0.3274, "rewards/chosen": 0.6714791297912598, "rewards/margins": 1.6549968560536703, "rewards/rejected": -0.9835177262624105, "step": 6409 }, { "epoch": 0.33975565155169213, "grad_norm": 63.75, "kl": 2.086639404296875, "learning_rate": 5e-07, "logits/chosen": -35732141.71428572, "logits/rejected": 10991452.0, "logps/chosen": -534.7968052455357, "logps/rejected": -213.62728881835938, "loss": 0.4169, "rewards/chosen": 0.5904781477791923, "rewards/margins": 1.3306179302079337, "rewards/rejected": -0.7401397824287415, "step": 6410 }, { "epoch": 0.33980865555349427, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40553109.333333336, "logits/rejected": -4193494.0, "logps/chosen": -403.9621175130208, "logps/rejected": -380.404638671875, "loss": 0.1758, "rewards/chosen": 1.20686141649882, "rewards/margins": 3.7657385985056564, "rewards/rejected": -2.558877182006836, "step": 6411 }, { "epoch": 0.3398616595552964, "grad_norm": 31.75, "kl": 1.7282943725585938, "learning_rate": 5e-07, "logits/chosen": -24219728.0, "logits/rejected": -21996284.0, "logps/chosen": -396.9292805989583, "logps/rejected": -329.134033203125, "loss": 0.3297, "rewards/chosen": 0.8783573309580485, "rewards/margins": 3.5611385504404702, "rewards/rejected": -2.682781219482422, "step": 6412 }, { "epoch": 0.33991466355709854, "grad_norm": 40.5, "kl": 2.3552703857421875, "learning_rate": 5e-07, "logits/chosen": -29550265.6, "logits/rejected": -40676045.333333336, "logps/chosen": -158.46732177734376, "logps/rejected": -533.0037841796875, "loss": 0.3914, "rewards/chosen": -0.17265716791152955, "rewards/margins": 2.733281616369883, "rewards/rejected": -2.9059387842814126, "step": 6413 }, { "epoch": 0.3399676675589007, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22018854.0, "logits/rejected": -29396784.0, "logps/chosen": -889.9865112304688, "logps/rejected": -223.79386393229166, "loss": 0.2902, "rewards/chosen": 1.1753673553466797, "rewards/margins": 2.237610340118408, "rewards/rejected": -1.0622429847717285, "step": 6414 }, { "epoch": 0.3400206715607028, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31688538.0, "logits/rejected": -27259132.0, "logps/chosen": -479.87677001953125, "logps/rejected": -288.38970947265625, "loss": 0.2922, "rewards/chosen": 0.16646374762058258, "rewards/margins": 2.168270006775856, "rewards/rejected": -2.0018062591552734, "step": 6415 }, { "epoch": 0.34007367556250495, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38543424.0, "logits/rejected": -25727884.0, "logps/chosen": -251.57180786132812, "logps/rejected": -314.40380859375, "loss": 0.3708, "rewards/chosen": -0.5339233875274658, "rewards/margins": 2.1300761699676514, "rewards/rejected": -2.663999557495117, "step": 6416 }, { "epoch": 0.3401266795643071, "grad_norm": 63.0, "kl": 0.20638275146484375, "learning_rate": 5e-07, "logits/chosen": -49553034.666666664, "logits/rejected": -33885560.0, "logps/chosen": -381.6343587239583, "logps/rejected": -351.6520080566406, "loss": 0.4201, "rewards/chosen": -0.025778015454610188, "rewards/margins": 1.527324954668681, "rewards/rejected": -1.553102970123291, "step": 6417 }, { "epoch": 0.3401796835661092, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64089352.0, "logits/rejected": -25629416.0, "logps/chosen": -492.0876770019531, "logps/rejected": -266.02634684244794, "loss": 0.2634, "rewards/chosen": 0.9855362176895142, "rewards/margins": 2.1346851587295532, "rewards/rejected": -1.149148941040039, "step": 6418 }, { "epoch": 0.34023268756791136, "grad_norm": 53.75, "kl": 0.48207664489746094, "learning_rate": 5e-07, "logits/chosen": -29036611.2, "logits/rejected": -3949111.3333333335, "logps/chosen": -238.3792236328125, "logps/rejected": -161.28975423177084, "loss": 0.3732, "rewards/chosen": 0.06863601803779602, "rewards/margins": 1.6378107845783234, "rewards/rejected": -1.5691747665405273, "step": 6419 }, { "epoch": 0.3402856915697135, "grad_norm": 42.0, "kl": 0.13574790954589844, "learning_rate": 5e-07, "logits/chosen": -5430912.0, "logits/rejected": -8311698.666666667, "logps/chosen": -479.8736877441406, "logps/rejected": -203.6990763346354, "loss": 0.1893, "rewards/chosen": 1.7853636741638184, "rewards/margins": 3.3602477709452314, "rewards/rejected": -1.5748840967814128, "step": 6420 }, { "epoch": 0.34033869557151564, "grad_norm": 48.75, "kl": 0.7759389877319336, "learning_rate": 5e-07, "logits/chosen": -23010186.0, "logits/rejected": -22955266.0, "logps/chosen": -495.2614440917969, "logps/rejected": -450.035888671875, "loss": 0.2567, "rewards/chosen": 1.021288275718689, "rewards/margins": 2.652895212173462, "rewards/rejected": -1.631606936454773, "step": 6421 }, { "epoch": 0.3403916995733178, "grad_norm": 46.25, "kl": 0.0667877197265625, "learning_rate": 5e-07, "logits/chosen": -22528676.0, "logits/rejected": -33797272.0, "logps/chosen": -382.27728271484375, "logps/rejected": -306.23382568359375, "loss": 0.2687, "rewards/chosen": 0.4954330623149872, "rewards/margins": 2.378042310476303, "rewards/rejected": -1.882609248161316, "step": 6422 }, { "epoch": 0.3404447035751199, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20433248.0, "logits/rejected": -25222373.333333332, "logps/chosen": -211.112744140625, "logps/rejected": -487.613525390625, "loss": 0.3052, "rewards/chosen": 0.41574440002441404, "rewards/margins": 2.5004077911376954, "rewards/rejected": -2.0846633911132812, "step": 6423 }, { "epoch": 0.34049770757692205, "grad_norm": 64.0, "kl": 0.114593505859375, "learning_rate": 5e-07, "logits/chosen": -35473856.0, "logits/rejected": -111160426.66666667, "logps/chosen": -578.0795288085938, "logps/rejected": -269.7710367838542, "loss": 0.2524, "rewards/chosen": 0.5105453729629517, "rewards/margins": 2.0134588479995728, "rewards/rejected": -1.502913475036621, "step": 6424 }, { "epoch": 0.3405507115787242, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17573758.0, "logits/rejected": -17970092.0, "logps/chosen": -371.0274963378906, "logps/rejected": -184.27272033691406, "loss": 0.2905, "rewards/chosen": 0.19275522232055664, "rewards/margins": 2.229097604751587, "rewards/rejected": -2.0363423824310303, "step": 6425 }, { "epoch": 0.3406037155805263, "grad_norm": 67.0, "kl": 0.7660751342773438, "learning_rate": 5e-07, "logits/chosen": -19532009.333333332, "logits/rejected": 13994212.0, "logps/chosen": -362.2179361979167, "logps/rejected": -450.8974304199219, "loss": 0.3174, "rewards/chosen": 0.7396520773569742, "rewards/margins": 2.073486844698588, "rewards/rejected": -1.3338347673416138, "step": 6426 }, { "epoch": 0.34065671958232846, "grad_norm": 30.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 653524.5, "logits/rejected": -19155299.2, "logps/chosen": -72.45160420735677, "logps/rejected": -320.6781005859375, "loss": 0.3026, "rewards/chosen": -0.519346276919047, "rewards/margins": 1.7456630309422811, "rewards/rejected": -2.265009307861328, "step": 6427 }, { "epoch": 0.3407097235841306, "grad_norm": 39.75, "kl": 1.658146858215332, "learning_rate": 5e-07, "logits/chosen": -25785992.0, "logits/rejected": -12260986.0, "logps/chosen": -382.4700927734375, "logps/rejected": -360.9051513671875, "loss": 0.3626, "rewards/chosen": 0.6121490399042765, "rewards/margins": 2.9801559845606485, "rewards/rejected": -2.368006944656372, "step": 6428 }, { "epoch": 0.34076272758593273, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24268208.0, "logits/rejected": -21292192.0, "logps/chosen": -153.08350830078126, "logps/rejected": -290.82240804036456, "loss": 0.3853, "rewards/chosen": 0.13867753744125366, "rewards/margins": 2.0921830534934998, "rewards/rejected": -1.953505516052246, "step": 6429 }, { "epoch": 0.34081573158773487, "grad_norm": 44.5, "kl": 0.3520183563232422, "learning_rate": 5e-07, "logits/chosen": -86574656.0, "logits/rejected": -955282.3333333334, "logps/chosen": -223.7324462890625, "logps/rejected": -45.534515380859375, "loss": 0.4381, "rewards/chosen": -0.1970356822013855, "rewards/margins": 0.8578704476356507, "rewards/rejected": -1.0549061298370361, "step": 6430 }, { "epoch": 0.340868735589537, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24644238.0, "logits/rejected": -6760314.0, "logps/chosen": -118.88447570800781, "logps/rejected": -102.51660919189453, "loss": 0.3143, "rewards/chosen": -0.0038541853427886963, "rewards/margins": 2.035357803106308, "rewards/rejected": -2.0392119884490967, "step": 6431 }, { "epoch": 0.34092173959133915, "grad_norm": 59.25, "kl": 0.08620834350585938, "learning_rate": 5e-07, "logits/chosen": -30939896.0, "logits/rejected": -4542234.0, "logps/chosen": -227.85671997070312, "logps/rejected": -281.83599853515625, "loss": 0.3258, "rewards/chosen": 0.19898968935012817, "rewards/margins": 1.7828896641731262, "rewards/rejected": -1.583899974822998, "step": 6432 }, { "epoch": 0.3409747435931413, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7985912.0, "logits/rejected": -55635384.0, "logps/chosen": -448.341064453125, "logps/rejected": -650.669677734375, "loss": 0.1599, "rewards/chosen": 1.1124305725097656, "rewards/margins": 4.0795159339904785, "rewards/rejected": -2.967085361480713, "step": 6433 }, { "epoch": 0.3410277475949434, "grad_norm": 58.0, "kl": 1.2736587524414062, "learning_rate": 5e-07, "logits/chosen": -73652217.6, "logits/rejected": 17716984.0, "logps/chosen": -404.184130859375, "logps/rejected": -198.38716634114584, "loss": 0.3911, "rewards/chosen": 0.12003326416015625, "rewards/margins": 2.161920706431071, "rewards/rejected": -2.0418874422709146, "step": 6434 }, { "epoch": 0.34108075159674556, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16894565.333333332, "logits/rejected": -14596740.8, "logps/chosen": -338.22088623046875, "logps/rejected": -224.773583984375, "loss": 0.2022, "rewards/chosen": 0.8163798650105795, "rewards/margins": 2.851166089375814, "rewards/rejected": -2.0347862243652344, "step": 6435 }, { "epoch": 0.3411337555985477, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 85855168.0, "logits/rejected": -10687661.333333334, "logps/chosen": -247.14308166503906, "logps/rejected": -886.093994140625, "loss": 0.1548, "rewards/chosen": 0.12063904106616974, "rewards/margins": 3.7875609745581946, "rewards/rejected": -3.666921933492025, "step": 6436 }, { "epoch": 0.34118675960034983, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 120235192.0, "logits/rejected": -27836074.666666668, "logps/chosen": -588.9996337890625, "logps/rejected": -233.8858642578125, "loss": 0.2098, "rewards/chosen": 0.23393812775611877, "rewards/margins": 2.2884911596775055, "rewards/rejected": -2.0545530319213867, "step": 6437 }, { "epoch": 0.34123976360215197, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8637094.0, "logits/rejected": -33170210.0, "logps/chosen": -170.85000610351562, "logps/rejected": -231.77842712402344, "loss": 0.3973, "rewards/chosen": -0.647875189781189, "rewards/margins": 1.0715426206588745, "rewards/rejected": -1.7194178104400635, "step": 6438 }, { "epoch": 0.3412927676039541, "grad_norm": 53.0, "kl": 3.5469436645507812, "learning_rate": 5e-07, "logits/chosen": -30088416.0, "logits/rejected": -35943749.333333336, "logps/chosen": -621.492529296875, "logps/rejected": -365.10986328125, "loss": 0.325, "rewards/chosen": 0.895714282989502, "rewards/margins": 2.7682104110717773, "rewards/rejected": -1.8724961280822754, "step": 6439 }, { "epoch": 0.34134577160575624, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6084317.5, "logits/rejected": -11485742.0, "logps/chosen": -411.4150390625, "logps/rejected": -347.27716064453125, "loss": 0.265, "rewards/chosen": 0.8562682867050171, "rewards/margins": 2.215102791786194, "rewards/rejected": -1.3588345050811768, "step": 6440 }, { "epoch": 0.3413987756075584, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1037484.0, "logits/rejected": -38472082.666666664, "logps/chosen": -368.34503173828125, "logps/rejected": -326.1490885416667, "loss": 0.2243, "rewards/chosen": 0.5721634030342102, "rewards/margins": 2.2692414720853167, "rewards/rejected": -1.6970780690511067, "step": 6441 }, { "epoch": 0.3414517796093605, "grad_norm": 63.25, "kl": 0.49193572998046875, "learning_rate": 5e-07, "logits/chosen": -64174592.0, "logits/rejected": -25341888.0, "logps/chosen": -453.337255859375, "logps/rejected": -265.88267008463544, "loss": 0.3521, "rewards/chosen": 0.2642303466796875, "rewards/margins": 2.5600998560587565, "rewards/rejected": -2.295869509379069, "step": 6442 }, { "epoch": 0.34150478361116265, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28035209.6, "logits/rejected": -38117920.0, "logps/chosen": -320.611962890625, "logps/rejected": -362.1347249348958, "loss": 0.31, "rewards/chosen": 0.1997233510017395, "rewards/margins": 2.7746370116869605, "rewards/rejected": -2.574913660685221, "step": 6443 }, { "epoch": 0.3415577876129648, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31804778.0, "logits/rejected": -13631544.0, "logps/chosen": -161.5500030517578, "logps/rejected": -350.6346842447917, "loss": 0.222, "rewards/chosen": -0.31525155901908875, "rewards/margins": 1.9486677944660187, "rewards/rejected": -2.2639193534851074, "step": 6444 }, { "epoch": 0.34161079161476693, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36711872.0, "logits/rejected": 36031336.0, "logps/chosen": -377.3837890625, "logps/rejected": -463.4315999348958, "loss": 0.26, "rewards/chosen": 0.14366266131401062, "rewards/margins": 1.649688293536504, "rewards/rejected": -1.5060256322224934, "step": 6445 }, { "epoch": 0.34166379561656907, "grad_norm": 48.0, "kl": 2.2869787216186523, "learning_rate": 5e-07, "logits/chosen": 1484583.0, "logits/rejected": -19209101.333333332, "logps/chosen": -331.04130859375, "logps/rejected": -328.48488362630206, "loss": 0.3631, "rewards/chosen": 0.583150053024292, "rewards/margins": 2.1729373772939047, "rewards/rejected": -1.5897873242696126, "step": 6446 }, { "epoch": 0.3417167996183712, "grad_norm": 47.0, "kl": 1.6269168853759766, "learning_rate": 5e-07, "logits/chosen": -42952364.0, "logits/rejected": -6538789.0, "logps/chosen": -512.43212890625, "logps/rejected": -166.6701202392578, "loss": 0.371, "rewards/chosen": -0.02511674165725708, "rewards/margins": 1.6037928462028503, "rewards/rejected": -1.6289095878601074, "step": 6447 }, { "epoch": 0.34176980362017334, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4054112.5, "logits/rejected": -31426246.0, "logps/chosen": -224.2398223876953, "logps/rejected": -309.738037109375, "loss": 0.3167, "rewards/chosen": 0.3378521502017975, "rewards/margins": 2.2323488295078278, "rewards/rejected": -1.8944966793060303, "step": 6448 }, { "epoch": 0.3418228076219755, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2728755.0, "logits/rejected": -41358962.666666664, "logps/chosen": -45.354644775390625, "logps/rejected": -247.0810750325521, "loss": 0.2811, "rewards/chosen": -0.11593036353588104, "rewards/margins": 1.4849906315406163, "rewards/rejected": -1.6009209950764973, "step": 6449 }, { "epoch": 0.3418758116237776, "grad_norm": 47.75, "kl": 0.16344451904296875, "learning_rate": 5e-07, "logits/chosen": -20814236.0, "logits/rejected": -54114372.0, "logps/chosen": -548.5057983398438, "logps/rejected": -453.2481994628906, "loss": 0.2148, "rewards/chosen": 1.066869854927063, "rewards/margins": 3.1538697481155396, "rewards/rejected": -2.0869998931884766, "step": 6450 }, { "epoch": 0.34192881562557975, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30651304.0, "logits/rejected": -1463941.5, "logps/chosen": -533.173583984375, "logps/rejected": -182.67193603515625, "loss": 0.2367, "rewards/chosen": 0.5690643787384033, "rewards/margins": 2.174578269322713, "rewards/rejected": -1.6055138905843098, "step": 6451 }, { "epoch": 0.3419818196273819, "grad_norm": 32.25, "kl": 0.6977634429931641, "learning_rate": 5e-07, "logits/chosen": -12228847.0, "logits/rejected": -26047830.0, "logps/chosen": -191.80113220214844, "logps/rejected": -315.1100158691406, "loss": 0.2166, "rewards/chosen": 1.0394047498703003, "rewards/margins": 3.1813119649887085, "rewards/rejected": -2.141907215118408, "step": 6452 }, { "epoch": 0.342034823629184, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24895302.4, "logits/rejected": -98414570.66666667, "logps/chosen": -280.319384765625, "logps/rejected": -590.9545084635416, "loss": 0.3615, "rewards/chosen": -0.21591124534606934, "rewards/margins": 2.789223019282023, "rewards/rejected": -3.0051342646280923, "step": 6453 }, { "epoch": 0.34208782763098616, "grad_norm": 118.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30034182.4, "logits/rejected": -3241362.0, "logps/chosen": -265.6345458984375, "logps/rejected": -229.89237467447916, "loss": 0.3737, "rewards/chosen": 0.2041339874267578, "rewards/margins": 1.3442513942718506, "rewards/rejected": -1.1401174068450928, "step": 6454 }, { "epoch": 0.3421408316327883, "grad_norm": 46.25, "kl": 0.7313976287841797, "learning_rate": 5e-07, "logits/chosen": -13868450.0, "logits/rejected": -42028032.0, "logps/chosen": -370.9454345703125, "logps/rejected": -314.16961669921875, "loss": 0.2103, "rewards/chosen": 0.34357377886772156, "rewards/margins": 2.4629420141379037, "rewards/rejected": -2.119368235270182, "step": 6455 }, { "epoch": 0.34219383563459044, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1459137.0, "logits/rejected": -35750708.0, "logps/chosen": -183.6839599609375, "logps/rejected": -295.3966369628906, "loss": 0.3244, "rewards/chosen": 0.04884912073612213, "rewards/margins": 2.2663788944482803, "rewards/rejected": -2.217529773712158, "step": 6456 }, { "epoch": 0.3422468396363926, "grad_norm": 57.25, "kl": 0.7375297546386719, "learning_rate": 5e-07, "logits/chosen": -60097909.333333336, "logits/rejected": 833531.2, "logps/chosen": -619.2039388020834, "logps/rejected": -116.02822265625, "loss": 0.3071, "rewards/chosen": 0.6053426265716553, "rewards/margins": 1.6848025798797608, "rewards/rejected": -1.0794599533081055, "step": 6457 }, { "epoch": 0.34229984363819466, "grad_norm": 63.25, "kl": 1.5318617820739746, "learning_rate": 5e-07, "logits/chosen": -25127157.333333332, "logits/rejected": -12944255.0, "logps/chosen": -266.4789225260417, "logps/rejected": -221.2078857421875, "loss": 0.4357, "rewards/chosen": 0.15888544917106628, "rewards/margins": 1.4938742816448212, "rewards/rejected": -1.3349888324737549, "step": 6458 }, { "epoch": 0.3423528476399968, "grad_norm": 52.75, "kl": 0.8391566276550293, "learning_rate": 5e-07, "logits/chosen": -21548510.0, "logits/rejected": -12688496.0, "logps/chosen": -235.5436248779297, "logps/rejected": -79.61154174804688, "loss": 0.3823, "rewards/chosen": 0.14315976202487946, "rewards/margins": 1.1683305650949478, "rewards/rejected": -1.0251708030700684, "step": 6459 }, { "epoch": 0.34240585164179893, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20772436.0, "logits/rejected": -17183312.0, "logps/chosen": -179.35467529296875, "logps/rejected": -244.04415893554688, "loss": 0.3439, "rewards/chosen": 0.22021889686584473, "rewards/margins": 1.5206120014190674, "rewards/rejected": -1.3003931045532227, "step": 6460 }, { "epoch": 0.34245885564360107, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9718084.0, "logits/rejected": -17995993.6, "logps/chosen": -160.28858439127603, "logps/rejected": -584.8748046875, "loss": 0.2609, "rewards/chosen": 0.29481279850006104, "rewards/margins": 2.581862282752991, "rewards/rejected": -2.28704948425293, "step": 6461 }, { "epoch": 0.3425118596454032, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4685884.0, "logits/rejected": 115599669.33333333, "logps/chosen": -38.9924430847168, "logps/rejected": -463.48583984375, "loss": 0.243, "rewards/chosen": -0.2478785514831543, "rewards/margins": 1.856017748514811, "rewards/rejected": -2.1038962999979653, "step": 6462 }, { "epoch": 0.34256486364720534, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13098797.333333334, "logits/rejected": 15831467.2, "logps/chosen": -1014.1094563802084, "logps/rejected": -362.467578125, "loss": 0.2357, "rewards/chosen": 0.9015840689341227, "rewards/margins": 2.826997963587443, "rewards/rejected": -1.9254138946533204, "step": 6463 }, { "epoch": 0.3426178676490075, "grad_norm": 50.5, "kl": 2.0286483764648438, "learning_rate": 5e-07, "logits/chosen": 26195604.0, "logits/rejected": -18304340.0, "logps/chosen": -327.9555358886719, "logps/rejected": -309.9617004394531, "loss": 0.3271, "rewards/chosen": 0.06872253119945526, "rewards/margins": 1.8979099243879318, "rewards/rejected": -1.8291873931884766, "step": 6464 }, { "epoch": 0.3426708716508096, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17831152.0, "logits/rejected": -24724609.6, "logps/chosen": -311.6713460286458, "logps/rejected": -309.27109375, "loss": 0.1492, "rewards/chosen": 1.8082555135091145, "rewards/margins": 3.9440186818440752, "rewards/rejected": -2.135763168334961, "step": 6465 }, { "epoch": 0.34272387565261175, "grad_norm": 54.75, "kl": 0.10784912109375, "learning_rate": 5e-07, "logits/chosen": -25045032.0, "logits/rejected": -25274426.0, "logps/chosen": -318.11859130859375, "logps/rejected": -280.96923828125, "loss": 0.3297, "rewards/chosen": 0.05363254249095917, "rewards/margins": 1.6619111746549606, "rewards/rejected": -1.6082786321640015, "step": 6466 }, { "epoch": 0.3427768796544139, "grad_norm": 58.5, "kl": 0.8646697998046875, "learning_rate": 5e-07, "logits/chosen": -34141664.0, "logits/rejected": 8409332.0, "logps/chosen": -241.2861572265625, "logps/rejected": -281.9461263020833, "loss": 0.3036, "rewards/chosen": 0.46776447296142576, "rewards/margins": 2.3565312385559083, "rewards/rejected": -1.8887667655944824, "step": 6467 }, { "epoch": 0.34282988365621603, "grad_norm": 33.75, "kl": 0.34429359436035156, "learning_rate": 5e-07, "logits/chosen": -9338378.666666666, "logits/rejected": -25111094.4, "logps/chosen": -219.2581787109375, "logps/rejected": -173.1085205078125, "loss": 0.2742, "rewards/chosen": 0.4791406790415446, "rewards/margins": 2.2247480551401773, "rewards/rejected": -1.7456073760986328, "step": 6468 }, { "epoch": 0.34288288765801817, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24061944.0, "logits/rejected": -17917734.85714286, "logps/chosen": -218.78765869140625, "logps/rejected": -248.13950892857142, "loss": 0.238, "rewards/chosen": 0.10259705036878586, "rewards/margins": 1.8243152935590063, "rewards/rejected": -1.7217182431902205, "step": 6469 }, { "epoch": 0.3429358916598203, "grad_norm": 53.5, "kl": 0.1294708251953125, "learning_rate": 5e-07, "logits/chosen": -39299917.333333336, "logits/rejected": 2918099.25, "logps/chosen": -259.86631266276044, "logps/rejected": -60.22148895263672, "loss": 0.4239, "rewards/chosen": -0.00694344441095988, "rewards/margins": 1.6533985336621602, "rewards/rejected": -1.6603419780731201, "step": 6470 }, { "epoch": 0.34298889566162244, "grad_norm": 47.0, "kl": 1.2542591094970703, "learning_rate": 5e-07, "logits/chosen": -41813452.8, "logits/rejected": -57539664.0, "logps/chosen": -236.281787109375, "logps/rejected": -308.73386637369794, "loss": 0.3363, "rewards/chosen": 0.7790330410003662, "rewards/margins": 1.5550526142120362, "rewards/rejected": -0.7760195732116699, "step": 6471 }, { "epoch": 0.3430418996634246, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -26306988.0, "logps/rejected": -291.14813232421875, "loss": 0.197, "rewards/rejected": -1.616681694984436, "step": 6472 }, { "epoch": 0.3430949036652267, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31265264.0, "logits/rejected": -42426484.0, "logps/chosen": -310.76220703125, "logps/rejected": -413.568115234375, "loss": 0.2798, "rewards/chosen": 0.15137705206871033, "rewards/margins": 2.759141057729721, "rewards/rejected": -2.6077640056610107, "step": 6473 }, { "epoch": 0.34314790766702885, "grad_norm": 55.25, "kl": 1.4828338623046875, "learning_rate": 5e-07, "logits/chosen": -11396484.8, "logits/rejected": -17893890.666666668, "logps/chosen": -296.007763671875, "logps/rejected": -471.3800862630208, "loss": 0.3725, "rewards/chosen": -0.030550992488861083, "rewards/margins": 2.1110891938209533, "rewards/rejected": -2.1416401863098145, "step": 6474 }, { "epoch": 0.343200911668831, "grad_norm": 56.25, "kl": 0.16839027404785156, "learning_rate": 5e-07, "logits/chosen": -56018436.0, "logits/rejected": -69228296.0, "logps/chosen": -556.4222412109375, "logps/rejected": -213.66943359375, "loss": 0.2401, "rewards/chosen": 0.9645676612854004, "rewards/margins": 2.4724392890930176, "rewards/rejected": -1.5078716278076172, "step": 6475 }, { "epoch": 0.3432539156706331, "grad_norm": 45.0, "kl": 0.9893054962158203, "learning_rate": 5e-07, "logits/chosen": -25381786.666666668, "logits/rejected": -42381228.8, "logps/chosen": -258.48398844401044, "logps/rejected": -464.748779296875, "loss": 0.2429, "rewards/chosen": 0.8498578071594238, "rewards/margins": 2.6042878150939943, "rewards/rejected": -1.7544300079345703, "step": 6476 }, { "epoch": 0.34330691967243526, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58750117.333333336, "logits/rejected": -16895712.0, "logps/chosen": -455.3600667317708, "logps/rejected": -353.062939453125, "loss": 0.2283, "rewards/chosen": 0.5751443306605021, "rewards/margins": 2.864737073580424, "rewards/rejected": -2.289592742919922, "step": 6477 }, { "epoch": 0.3433599236742374, "grad_norm": 47.75, "kl": 1.0446147918701172, "learning_rate": 5e-07, "logits/chosen": -31931365.333333332, "logits/rejected": -68527897.6, "logps/chosen": -289.6667887369792, "logps/rejected": -462.9177734375, "loss": 0.2161, "rewards/chosen": 1.118714412053426, "rewards/margins": 3.3296336015065506, "rewards/rejected": -2.210919189453125, "step": 6478 }, { "epoch": 0.34341292767603954, "grad_norm": 52.5, "kl": 0.7235641479492188, "learning_rate": 5e-07, "logits/chosen": -21620472.0, "logits/rejected": 1786118.75, "logps/chosen": -464.6525573730469, "logps/rejected": -283.9378356933594, "loss": 0.2875, "rewards/chosen": 0.2736172080039978, "rewards/margins": 2.4891790747642517, "rewards/rejected": -2.215561866760254, "step": 6479 }, { "epoch": 0.3434659316778417, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6004951.333333333, "logits/rejected": -49957779.2, "logps/chosen": -279.9924723307292, "logps/rejected": -250.1596923828125, "loss": 0.2425, "rewards/chosen": 0.8497212727864584, "rewards/margins": 2.6330491383870442, "rewards/rejected": -1.783327865600586, "step": 6480 }, { "epoch": 0.3435189356796438, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 533214.5, "logits/rejected": -33176005.333333332, "logps/chosen": -38.77869415283203, "logps/rejected": -346.6039225260417, "loss": 0.2645, "rewards/chosen": -0.452586829662323, "rewards/margins": 1.5235105951627095, "rewards/rejected": -1.9760974248250325, "step": 6481 }, { "epoch": 0.34357193968144595, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79783200.0, "logits/rejected": -38818965.333333336, "logps/chosen": -353.9015808105469, "logps/rejected": -356.1024576822917, "loss": 0.2742, "rewards/chosen": -0.04481200873851776, "rewards/margins": 1.9364519466956456, "rewards/rejected": -1.9812639554341633, "step": 6482 }, { "epoch": 0.3436249436832481, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9440419.0, "logits/rejected": -22922800.0, "logps/chosen": -198.7359619140625, "logps/rejected": -325.70159912109375, "loss": 0.3112, "rewards/chosen": 0.030906304717063904, "rewards/margins": 2.101733312010765, "rewards/rejected": -2.070827007293701, "step": 6483 }, { "epoch": 0.3436779476850502, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21745764.0, "logits/rejected": -39269337.6, "logps/chosen": -308.63189697265625, "logps/rejected": -177.4474609375, "loss": 0.2759, "rewards/chosen": 0.24552923440933228, "rewards/margins": 1.8234408020973205, "rewards/rejected": -1.5779115676879882, "step": 6484 }, { "epoch": 0.34373095168685236, "grad_norm": 43.0, "kl": 0.134063720703125, "learning_rate": 5e-07, "logits/chosen": 3514432.5, "logits/rejected": 383150.0, "logps/chosen": -20.67461395263672, "logps/rejected": -479.2403157552083, "loss": 0.2104, "rewards/chosen": 0.21584998071193695, "rewards/margins": 2.5697151273489, "rewards/rejected": -2.353865146636963, "step": 6485 }, { "epoch": 0.3437839556886545, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39508016.0, "logits/rejected": -9667826.666666666, "logps/chosen": -360.2322265625, "logps/rejected": -530.72900390625, "loss": 0.2866, "rewards/chosen": 0.5288851737976075, "rewards/margins": 4.04517552057902, "rewards/rejected": -3.5162903467814126, "step": 6486 }, { "epoch": 0.34383695969045663, "grad_norm": 55.25, "kl": 1.049276351928711, "learning_rate": 5e-07, "logits/chosen": -45733748.0, "logits/rejected": 40411196.0, "logps/chosen": -415.380615234375, "logps/rejected": -342.8924560546875, "loss": 0.2723, "rewards/chosen": 0.8063934445381165, "rewards/margins": 2.3172367215156555, "rewards/rejected": -1.510843276977539, "step": 6487 }, { "epoch": 0.34388996369225877, "grad_norm": 70.0, "kl": 0.1971282958984375, "learning_rate": 5e-07, "logits/chosen": 54274188.8, "logits/rejected": -5888193.333333333, "logps/chosen": -328.45224609375, "logps/rejected": -258.85760498046875, "loss": 0.3918, "rewards/chosen": 0.05012218356132507, "rewards/margins": 1.6805867373943328, "rewards/rejected": -1.6304645538330078, "step": 6488 }, { "epoch": 0.3439429676940609, "grad_norm": 58.25, "kl": 0.6167106628417969, "learning_rate": 5e-07, "logits/chosen": 64818924.8, "logits/rejected": 3738347.0, "logps/chosen": -718.28837890625, "logps/rejected": -145.7331746419271, "loss": 0.3001, "rewards/chosen": 0.758273696899414, "rewards/margins": 2.624668534596761, "rewards/rejected": -1.866394837697347, "step": 6489 }, { "epoch": 0.34399597169586305, "grad_norm": 37.75, "kl": 1.800466537475586, "learning_rate": 5e-07, "logits/chosen": -20582220.0, "logits/rejected": -8240362.666666667, "logps/chosen": -300.5140380859375, "logps/rejected": -187.6304931640625, "loss": 0.1947, "rewards/chosen": 0.24960602819919586, "rewards/margins": 2.439776952068011, "rewards/rejected": -2.190170923868815, "step": 6490 }, { "epoch": 0.3440489756976652, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21288764.0, "logits/rejected": -4141774.4, "logps/chosen": -361.3283284505208, "logps/rejected": -527.81875, "loss": 0.1888, "rewards/chosen": 0.2268459995587667, "rewards/margins": 4.236261816819509, "rewards/rejected": -4.009415817260742, "step": 6491 }, { "epoch": 0.3441019796994673, "grad_norm": 52.75, "kl": 1.0357303619384766, "learning_rate": 5e-07, "logits/chosen": -22985713.6, "logits/rejected": 1674770.6666666667, "logps/chosen": -545.025830078125, "logps/rejected": -69.8321533203125, "loss": 0.3423, "rewards/chosen": 0.657597827911377, "rewards/margins": 1.8252256393432618, "rewards/rejected": -1.1676278114318848, "step": 6492 }, { "epoch": 0.34415498370126946, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32716224.0, "logits/rejected": -8467090.0, "logps/chosen": -352.16998291015625, "logps/rejected": -266.8392333984375, "loss": 0.3346, "rewards/chosen": 0.20069275796413422, "rewards/margins": 1.849892720580101, "rewards/rejected": -1.6491999626159668, "step": 6493 }, { "epoch": 0.3442079877030716, "grad_norm": 72.5, "kl": 0.27246856689453125, "learning_rate": 5e-07, "logits/chosen": -44619658.666666664, "logits/rejected": -5651361.6, "logps/chosen": -167.8311971028646, "logps/rejected": -539.23740234375, "loss": 0.2412, "rewards/chosen": 0.2509390115737915, "rewards/margins": 2.8031389474868775, "rewards/rejected": -2.552199935913086, "step": 6494 }, { "epoch": 0.34426099170487373, "grad_norm": 47.5, "kl": 1.3430633544921875, "learning_rate": 5e-07, "logits/chosen": -15530064.0, "logits/rejected": -40701109.333333336, "logps/chosen": -290.45546875, "logps/rejected": -450.0347493489583, "loss": 0.2734, "rewards/chosen": 0.7159601211547851, "rewards/margins": 3.9455559412638346, "rewards/rejected": -3.2295958201090493, "step": 6495 }, { "epoch": 0.34431399570667587, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6477325.0, "logits/rejected": -30489853.333333332, "logps/chosen": -209.1564483642578, "logps/rejected": -516.0890299479166, "loss": 0.2168, "rewards/chosen": 0.004461377859115601, "rewards/margins": 2.338963915904363, "rewards/rejected": -2.3345025380452475, "step": 6496 }, { "epoch": 0.344366999708478, "grad_norm": 69.0, "kl": 0.9925937652587891, "learning_rate": 5e-07, "logits/chosen": -6377903.333333333, "logits/rejected": 299471008.0, "logps/chosen": -253.80633544921875, "logps/rejected": -1115.2952880859375, "loss": 0.3971, "rewards/chosen": 0.2943920095761617, "rewards/margins": 1.6337214907010396, "rewards/rejected": -1.339329481124878, "step": 6497 }, { "epoch": 0.34442000371028014, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5348585.666666667, "logits/rejected": -6015939.2, "logps/chosen": -237.33203125, "logps/rejected": -199.392822265625, "loss": 0.3598, "rewards/chosen": -0.5476475954055786, "rewards/margins": 1.010689616203308, "rewards/rejected": -1.5583372116088867, "step": 6498 }, { "epoch": 0.3444730077120823, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11872865.333333334, "logits/rejected": -27981164.8, "logps/chosen": -86.79918416341145, "logps/rejected": -212.675244140625, "loss": 0.3392, "rewards/chosen": 0.02803770701090495, "rewards/margins": 1.271185843149821, "rewards/rejected": -1.243148136138916, "step": 6499 }, { "epoch": 0.3445260117138844, "grad_norm": 44.75, "kl": 0.2563591003417969, "learning_rate": 5e-07, "logits/chosen": -24668408.0, "logits/rejected": -18886856.0, "logps/chosen": -217.87918090820312, "logps/rejected": -267.1380615234375, "loss": 0.2693, "rewards/chosen": 0.607191801071167, "rewards/margins": 2.664419174194336, "rewards/rejected": -2.057227373123169, "step": 6500 }, { "epoch": 0.34457901571568655, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34837204.0, "logits/rejected": -29018917.333333332, "logps/chosen": -256.4068603515625, "logps/rejected": -183.9453328450521, "loss": 0.2516, "rewards/chosen": -0.29787713289260864, "rewards/margins": 1.6592739621798198, "rewards/rejected": -1.9571510950724285, "step": 6501 }, { "epoch": 0.3446320197174887, "grad_norm": 38.5, "kl": 0.41495418548583984, "learning_rate": 5e-07, "logits/chosen": -7490927.2, "logits/rejected": -8816044.666666666, "logps/chosen": -186.54658203125, "logps/rejected": -80.94390869140625, "loss": 0.4113, "rewards/chosen": 0.13614286184310914, "rewards/margins": 0.9203255693117778, "rewards/rejected": -0.7841827074686686, "step": 6502 }, { "epoch": 0.34468502371929083, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5308848.0, "logits/rejected": -18019540.8, "logps/chosen": -241.6856892903646, "logps/rejected": -232.258251953125, "loss": 0.2226, "rewards/chosen": 0.3444683949152629, "rewards/margins": 2.99197568098704, "rewards/rejected": -2.6475072860717774, "step": 6503 }, { "epoch": 0.34473802772109297, "grad_norm": 63.25, "kl": 0.0286712646484375, "learning_rate": 5e-07, "logits/chosen": -41002464.0, "logits/rejected": -45268596.0, "logps/chosen": -304.4328308105469, "logps/rejected": -320.635986328125, "loss": 0.3198, "rewards/chosen": 0.6981976628303528, "rewards/margins": 1.7131755948066711, "rewards/rejected": -1.0149779319763184, "step": 6504 }, { "epoch": 0.3447910317228951, "grad_norm": 49.25, "kl": 2.34619140625, "learning_rate": 5e-07, "logits/chosen": -68146528.0, "logits/rejected": -36833248.0, "logps/chosen": -240.6774139404297, "logps/rejected": -210.3099365234375, "loss": 0.2621, "rewards/chosen": 0.5807480812072754, "rewards/margins": 2.0135579109191895, "rewards/rejected": -1.432809829711914, "step": 6505 }, { "epoch": 0.34484403572469724, "grad_norm": 55.75, "kl": 2.22653865814209, "learning_rate": 5e-07, "logits/chosen": -20410070.85714286, "logits/rejected": -28521248.0, "logps/chosen": -291.6485072544643, "logps/rejected": -462.6617126464844, "loss": 0.4302, "rewards/chosen": 0.3617897033691406, "rewards/margins": 2.505970001220703, "rewards/rejected": -2.1441802978515625, "step": 6506 }, { "epoch": 0.3448970397264994, "grad_norm": 52.75, "kl": 1.3304443359375, "learning_rate": 5e-07, "logits/chosen": -28962928.0, "logits/rejected": -82026784.0, "logps/chosen": -318.6290283203125, "logps/rejected": -332.34954833984375, "loss": 0.3069, "rewards/chosen": 0.8687712351481119, "rewards/margins": 2.5735105673472085, "rewards/rejected": -1.7047393321990967, "step": 6507 }, { "epoch": 0.3449500437283015, "grad_norm": 67.0, "kl": 0.22721099853515625, "learning_rate": 5e-07, "logits/chosen": -22700394.666666668, "logits/rejected": -19576329.6, "logps/chosen": -488.1819661458333, "logps/rejected": -365.628125, "loss": 0.2546, "rewards/chosen": 0.5757426420847574, "rewards/margins": 2.151610485712687, "rewards/rejected": -1.5758678436279296, "step": 6508 }, { "epoch": 0.3450030477301036, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25069976.0, "logits/rejected": -10851639.0, "logps/chosen": -160.34140014648438, "logps/rejected": -94.95145416259766, "loss": 0.2886, "rewards/chosen": 0.5963378548622131, "rewards/margins": 2.111537277698517, "rewards/rejected": -1.5151994228363037, "step": 6509 }, { "epoch": 0.34505605173190573, "grad_norm": 36.75, "kl": 2.8302602767944336, "learning_rate": 5e-07, "logits/chosen": -49892040.0, "logits/rejected": -39852084.0, "logps/chosen": -523.2372436523438, "logps/rejected": -658.019287109375, "loss": 0.2553, "rewards/chosen": 0.45040208101272583, "rewards/margins": 3.8696449398994446, "rewards/rejected": -3.4192428588867188, "step": 6510 }, { "epoch": 0.34510905573370787, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43976688.0, "logits/rejected": -30431490.666666668, "logps/chosen": -170.69053649902344, "logps/rejected": -438.6200764973958, "loss": 0.2581, "rewards/chosen": -0.2051849365234375, "rewards/margins": 1.7876548767089844, "rewards/rejected": -1.9928398132324219, "step": 6511 }, { "epoch": 0.34516205973551, "grad_norm": 49.75, "kl": 1.4784793853759766, "learning_rate": 5e-07, "logits/chosen": -38726380.8, "logits/rejected": -25930986.666666668, "logps/chosen": -364.6332275390625, "logps/rejected": -89.89242553710938, "loss": 0.3734, "rewards/chosen": 0.3258945941925049, "rewards/margins": 1.5771698951721191, "rewards/rejected": -1.2512753009796143, "step": 6512 }, { "epoch": 0.34521506373731214, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28978352.0, "logits/rejected": -23778920.0, "logps/chosen": -416.8728942871094, "logps/rejected": -203.5987345377604, "loss": 0.1961, "rewards/chosen": 0.3752456605434418, "rewards/margins": 2.406552722056707, "rewards/rejected": -2.031307061513265, "step": 6513 }, { "epoch": 0.3452680677391143, "grad_norm": 53.5, "kl": 0.2790393829345703, "learning_rate": 5e-07, "logits/chosen": 20255630.666666668, "logits/rejected": -54078000.0, "logps/chosen": -311.5445149739583, "logps/rejected": -331.0722351074219, "loss": 0.2908, "rewards/chosen": 0.7035367488861084, "rewards/margins": 3.094606637954712, "rewards/rejected": -2.3910698890686035, "step": 6514 }, { "epoch": 0.3453210717409164, "grad_norm": 56.0, "kl": 1.4729576110839844, "learning_rate": 5e-07, "logits/chosen": -7021099.0, "logits/rejected": -43372076.0, "logps/chosen": -361.1688232421875, "logps/rejected": -547.8178100585938, "loss": 0.2992, "rewards/chosen": 0.32465553283691406, "rewards/margins": 2.8414359092712402, "rewards/rejected": -2.516780376434326, "step": 6515 }, { "epoch": 0.34537407574271856, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28926882.285714287, "logits/rejected": -13715188.0, "logps/chosen": -329.293701171875, "logps/rejected": -301.32781982421875, "loss": 0.4208, "rewards/chosen": 0.12379902601242065, "rewards/margins": 2.888276517391205, "rewards/rejected": -2.764477491378784, "step": 6516 }, { "epoch": 0.3454270797445207, "grad_norm": 45.0, "kl": 1.3194084167480469, "learning_rate": 5e-07, "logits/chosen": -26606940.0, "logits/rejected": -6825572.0, "logps/chosen": -150.15472412109375, "logps/rejected": -82.16789245605469, "loss": 0.3502, "rewards/chosen": 0.308567613363266, "rewards/margins": 1.3837332427501678, "rewards/rejected": -1.0751656293869019, "step": 6517 }, { "epoch": 0.34548008374632283, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 80217264.0, "logits/rejected": -13815386.666666666, "logps/chosen": -392.49945068359375, "logps/rejected": -309.5948079427083, "loss": 0.25, "rewards/chosen": -0.02559966966509819, "rewards/margins": 2.005900510897239, "rewards/rejected": -2.0315001805623374, "step": 6518 }, { "epoch": 0.34553308774812497, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48953704.0, "logits/rejected": -6005960.0, "logps/chosen": -203.28054809570312, "logps/rejected": -221.7176310221354, "loss": 0.3061, "rewards/chosen": 0.35600072145462036, "rewards/margins": 1.5278740525245667, "rewards/rejected": -1.1718733310699463, "step": 6519 }, { "epoch": 0.3455860917499271, "grad_norm": 51.5, "kl": 0.05455207824707031, "learning_rate": 5e-07, "logits/chosen": -40950282.666666664, "logits/rejected": -29134809.6, "logps/chosen": -448.4955240885417, "logps/rejected": -330.5011962890625, "loss": 0.2198, "rewards/chosen": 0.6013733148574829, "rewards/margins": 2.5402227640151978, "rewards/rejected": -1.9388494491577148, "step": 6520 }, { "epoch": 0.34563909575172924, "grad_norm": 38.25, "kl": 0.02820587158203125, "learning_rate": 5e-07, "logits/chosen": -18357070.4, "logits/rejected": -49077072.0, "logps/chosen": -85.80458984375, "logps/rejected": -493.483154296875, "loss": 0.3379, "rewards/chosen": 0.06986125111579895, "rewards/margins": 2.388793784379959, "rewards/rejected": -2.31893253326416, "step": 6521 }, { "epoch": 0.3456920997535314, "grad_norm": 54.25, "kl": 0.5256252288818359, "learning_rate": 5e-07, "logits/chosen": -23377874.666666668, "logits/rejected": -14154603.2, "logps/chosen": -327.4660237630208, "logps/rejected": -285.08291015625, "loss": 0.2543, "rewards/chosen": 0.5808415412902832, "rewards/margins": 2.501119518280029, "rewards/rejected": -1.920277976989746, "step": 6522 }, { "epoch": 0.3457451037553335, "grad_norm": 41.5, "kl": 1.0466804504394531, "learning_rate": 5e-07, "logits/chosen": -9947272.0, "logits/rejected": -31355460.0, "logps/chosen": -100.89654541015625, "logps/rejected": -313.8772277832031, "loss": 0.4275, "rewards/chosen": 0.09570672114690144, "rewards/margins": 1.4322259624799092, "rewards/rejected": -1.3365192413330078, "step": 6523 }, { "epoch": 0.34579810775713565, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86017168.0, "logits/rejected": -45517115.428571425, "logps/chosen": -244.42576599121094, "logps/rejected": -299.37765066964283, "loss": 0.1781, "rewards/chosen": -0.4145767390727997, "rewards/margins": 1.9648635685443878, "rewards/rejected": -2.3794403076171875, "step": 6524 }, { "epoch": 0.3458511117589378, "grad_norm": 46.5, "kl": 1.5245170593261719, "learning_rate": 5e-07, "logits/chosen": -41643168.0, "logits/rejected": -34300248.0, "logps/chosen": -707.612060546875, "logps/rejected": -401.2788391113281, "loss": 0.3014, "rewards/chosen": 0.8335947195688883, "rewards/margins": 3.37672750155131, "rewards/rejected": -2.543132781982422, "step": 6525 }, { "epoch": 0.3459041157607399, "grad_norm": 63.5, "kl": 3.3394126892089844, "learning_rate": 5e-07, "logits/chosen": -6419142.0, "logits/rejected": -21443856.0, "logps/chosen": -397.0947570800781, "logps/rejected": -157.76522827148438, "loss": 0.2913, "rewards/chosen": 0.9807266592979431, "rewards/margins": 1.9224318265914917, "rewards/rejected": -0.9417051672935486, "step": 6526 }, { "epoch": 0.34595711976254206, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31206978.0, "logits/rejected": -8609977.0, "logps/chosen": -267.15740966796875, "logps/rejected": -186.12078857421875, "loss": 0.3623, "rewards/chosen": -0.10619908571243286, "rewards/margins": 1.41567724943161, "rewards/rejected": -1.521876335144043, "step": 6527 }, { "epoch": 0.3460101237643442, "grad_norm": 48.25, "kl": 0.7355384826660156, "learning_rate": 5e-07, "logits/chosen": -8735638.0, "logits/rejected": -25732476.0, "logps/chosen": -326.368408203125, "logps/rejected": -246.4534454345703, "loss": 0.2905, "rewards/chosen": 0.6205047965049744, "rewards/margins": 2.177668511867523, "rewards/rejected": -1.5571637153625488, "step": 6528 }, { "epoch": 0.34606312776614634, "grad_norm": 34.75, "kl": 1.3888664245605469, "learning_rate": 5e-07, "logits/chosen": 57737896.0, "logits/rejected": -29289874.285714287, "logps/chosen": -1987.637939453125, "logps/rejected": -255.27181570870536, "loss": 0.1552, "rewards/chosen": 2.449462890625, "rewards/margins": 4.4873071398053845, "rewards/rejected": -2.037844249180385, "step": 6529 }, { "epoch": 0.3461161317679485, "grad_norm": 42.75, "kl": 0.8747615814208984, "learning_rate": 5e-07, "logits/chosen": -1981306.6666666667, "logits/rejected": -54550080.0, "logps/chosen": -260.7183430989583, "logps/rejected": -406.9068115234375, "loss": 0.2661, "rewards/chosen": 0.03945058584213257, "rewards/margins": 2.402321755886078, "rewards/rejected": -2.3628711700439453, "step": 6530 }, { "epoch": 0.3461691357697506, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41342084.0, "logits/rejected": -69493680.0, "logps/chosen": -281.5399169921875, "logps/rejected": -365.29254150390625, "loss": 0.316, "rewards/chosen": 0.027302496135234833, "rewards/margins": 2.1643731519579887, "rewards/rejected": -2.137070655822754, "step": 6531 }, { "epoch": 0.34622213977155275, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33230611.2, "logits/rejected": -10594213.333333334, "logps/chosen": -165.5430419921875, "logps/rejected": -261.5828450520833, "loss": 0.3924, "rewards/chosen": -0.050168180465698244, "rewards/margins": 1.4069829146067303, "rewards/rejected": -1.4571510950724285, "step": 6532 }, { "epoch": 0.3462751437733549, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29339216.0, "logits/rejected": -13509491.2, "logps/chosen": -226.50703938802084, "logps/rejected": -519.73330078125, "loss": 0.2439, "rewards/chosen": 0.4855852921803792, "rewards/margins": 2.6430804093678795, "rewards/rejected": -2.1574951171875, "step": 6533 }, { "epoch": 0.346328147775157, "grad_norm": 31.5, "kl": 1.4828662872314453, "learning_rate": 5e-07, "logits/chosen": -55599680.0, "logits/rejected": 1648651.75, "logps/chosen": -178.2410125732422, "logps/rejected": -108.50505828857422, "loss": 0.2622, "rewards/chosen": 0.7105719447135925, "rewards/margins": 2.4884642958641052, "rewards/rejected": -1.7778923511505127, "step": 6534 }, { "epoch": 0.34638115177695916, "grad_norm": 44.75, "kl": 0.24092864990234375, "learning_rate": 5e-07, "logits/chosen": -5906889.0, "logits/rejected": -12247686.666666666, "logps/chosen": -177.75006103515625, "logps/rejected": -231.58174641927084, "loss": 0.2447, "rewards/chosen": 0.4981845021247864, "rewards/margins": 2.0066957275072737, "rewards/rejected": -1.508511225382487, "step": 6535 }, { "epoch": 0.3464341557787613, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9925066.0, "logits/rejected": 1091911.2, "logps/chosen": -235.51192220052084, "logps/rejected": -111.16683349609374, "loss": 0.3467, "rewards/chosen": -0.16366068522135416, "rewards/margins": 1.0871768315633137, "rewards/rejected": -1.250837516784668, "step": 6536 }, { "epoch": 0.34648715978056344, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23133126.4, "logits/rejected": -27634541.333333332, "logps/chosen": -290.587255859375, "logps/rejected": -214.3038330078125, "loss": 0.3869, "rewards/chosen": 0.016589049994945527, "rewards/margins": 1.4461807057261467, "rewards/rejected": -1.4295916557312012, "step": 6537 }, { "epoch": 0.3465401637823656, "grad_norm": 46.25, "kl": 1.0152149200439453, "learning_rate": 5e-07, "logits/chosen": -30731241.6, "logits/rejected": -29323586.666666668, "logps/chosen": -235.9572021484375, "logps/rejected": -179.07511393229166, "loss": 0.3041, "rewards/chosen": 0.553404712677002, "rewards/margins": 2.3125291506449384, "rewards/rejected": -1.7591244379679363, "step": 6538 }, { "epoch": 0.3465931677841677, "grad_norm": 51.75, "kl": 1.85772705078125, "learning_rate": 5e-07, "logits/chosen": -12073449.333333334, "logits/rejected": -27426506.0, "logps/chosen": -351.9486897786458, "logps/rejected": -255.3974151611328, "loss": 0.2996, "rewards/chosen": 0.9141985575358073, "rewards/margins": 3.1487147013346353, "rewards/rejected": -2.234516143798828, "step": 6539 }, { "epoch": 0.34664617178596985, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13092556.0, "logits/rejected": 21477.375, "logps/chosen": -364.4781188964844, "logps/rejected": -261.75927734375, "loss": 0.3111, "rewards/chosen": 0.18481045961380005, "rewards/margins": 2.053114116191864, "rewards/rejected": -1.868303656578064, "step": 6540 }, { "epoch": 0.346699175787772, "grad_norm": 45.0, "kl": 1.3969802856445312, "learning_rate": 5e-07, "logits/chosen": 3861824.6666666665, "logits/rejected": 5299425.0, "logps/chosen": -150.23347981770834, "logps/rejected": -154.72164916992188, "loss": 0.4119, "rewards/chosen": 0.3648500045140584, "rewards/margins": 0.8925645550092061, "rewards/rejected": -0.5277145504951477, "step": 6541 }, { "epoch": 0.3467521797895741, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25303741.333333332, "logits/rejected": -10164396.8, "logps/chosen": -264.0996907552083, "logps/rejected": -219.533837890625, "loss": 0.2177, "rewards/chosen": 0.6521240075429281, "rewards/margins": 3.2617963631947835, "rewards/rejected": -2.6096723556518553, "step": 6542 }, { "epoch": 0.34680518379137626, "grad_norm": 74.0, "kl": 0.53338623046875, "learning_rate": 5e-07, "logits/chosen": 2097661.3333333335, "logits/rejected": -73440435.2, "logps/chosen": -1219.6205240885417, "logps/rejected": -360.595458984375, "loss": 0.185, "rewards/chosen": 1.1777435143788655, "rewards/margins": 3.217496665318807, "rewards/rejected": -2.0397531509399416, "step": 6543 }, { "epoch": 0.3468581877931784, "grad_norm": 53.5, "kl": 2.5884666442871094, "learning_rate": 5e-07, "logits/chosen": -25230592.0, "logits/rejected": -32070244.0, "logps/chosen": -276.98919677734375, "logps/rejected": -303.03326416015625, "loss": 0.3538, "rewards/chosen": 0.5094146331151327, "rewards/margins": 2.974503477414449, "rewards/rejected": -2.4650888442993164, "step": 6544 }, { "epoch": 0.34691119179498053, "grad_norm": 58.75, "kl": 1.002131462097168, "learning_rate": 5e-07, "logits/chosen": -24695846.4, "logits/rejected": 16164082.666666666, "logps/chosen": -494.2751953125, "logps/rejected": -172.1091105143229, "loss": 0.2847, "rewards/chosen": 0.9411215782165527, "rewards/margins": 2.6223308245340986, "rewards/rejected": -1.6812092463175456, "step": 6545 }, { "epoch": 0.34696419579678267, "grad_norm": 41.75, "kl": 0.6847190856933594, "learning_rate": 5e-07, "logits/chosen": 12771656.0, "logits/rejected": -1933154.2, "logps/chosen": -878.0419108072916, "logps/rejected": -126.752490234375, "loss": 0.1737, "rewards/chosen": 2.1155573527018228, "rewards/margins": 3.492453638712565, "rewards/rejected": -1.376896286010742, "step": 6546 }, { "epoch": 0.3470171997985848, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33077536.0, "logits/rejected": -12385784.0, "logps/chosen": -309.4107177734375, "logps/rejected": -350.17822265625, "loss": 0.3057, "rewards/chosen": 0.6290235042572021, "rewards/margins": 2.2778796672821047, "rewards/rejected": -1.6488561630249023, "step": 6547 }, { "epoch": 0.34707020380038694, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18166568.0, "logits/rejected": -53678764.8, "logps/chosen": -317.29388427734375, "logps/rejected": -479.4880859375, "loss": 0.3197, "rewards/chosen": -0.5001757542292277, "rewards/margins": 1.7371487696965535, "rewards/rejected": -2.237324523925781, "step": 6548 }, { "epoch": 0.3471232078021891, "grad_norm": 38.25, "kl": 0.27402687072753906, "learning_rate": 5e-07, "logits/chosen": -11772868.0, "logits/rejected": -21435034.0, "logps/chosen": -99.31585693359375, "logps/rejected": -245.66714477539062, "loss": 0.3594, "rewards/chosen": 0.11101260781288147, "rewards/margins": 1.5991726219654083, "rewards/rejected": -1.4881600141525269, "step": 6549 }, { "epoch": 0.3471762118039912, "grad_norm": 30.375, "kl": 0.5575714111328125, "learning_rate": 5e-07, "logits/chosen": -4550263.333333333, "logits/rejected": -24618705.6, "logps/chosen": -110.01895141601562, "logps/rejected": -212.7386474609375, "loss": 0.2684, "rewards/chosen": 0.19943644603093466, "rewards/margins": 2.2350481708844505, "rewards/rejected": -2.0356117248535157, "step": 6550 }, { "epoch": 0.34722921580579336, "grad_norm": 56.25, "kl": 0.15738296508789062, "learning_rate": 5e-07, "logits/chosen": -18929814.4, "logits/rejected": 26873970.666666668, "logps/chosen": -319.551416015625, "logps/rejected": -569.258544921875, "loss": 0.2711, "rewards/chosen": 0.9074684143066406, "rewards/margins": 2.4394610087076822, "rewards/rejected": -1.5319925944010417, "step": 6551 }, { "epoch": 0.3472822198075955, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25870992.0, "logits/rejected": -52535462.4, "logps/chosen": -349.9644775390625, "logps/rejected": -647.26162109375, "loss": 0.2149, "rewards/chosen": 0.47482506434122723, "rewards/margins": 2.86043488184611, "rewards/rejected": -2.385609817504883, "step": 6552 }, { "epoch": 0.34733522380939763, "grad_norm": 34.5, "kl": 0.6485214233398438, "learning_rate": 5e-07, "logits/chosen": -10772169.6, "logits/rejected": -33947693.333333336, "logps/chosen": -225.22685546875, "logps/rejected": -408.2556559244792, "loss": 0.254, "rewards/chosen": 0.5933108329772949, "rewards/margins": 3.7112730344136557, "rewards/rejected": -3.117962201436361, "step": 6553 }, { "epoch": 0.34738822781119977, "grad_norm": 48.0, "kl": 0.16109085083007812, "learning_rate": 5e-07, "logits/chosen": -17070836.0, "logits/rejected": -36493300.0, "logps/chosen": -449.7047119140625, "logps/rejected": -406.0009460449219, "loss": 0.2843, "rewards/chosen": 0.3296428620815277, "rewards/margins": 2.72837170958519, "rewards/rejected": -2.398728847503662, "step": 6554 }, { "epoch": 0.3474412318130019, "grad_norm": 41.25, "kl": 0.1895885467529297, "learning_rate": 5e-07, "logits/chosen": -70791232.0, "logits/rejected": -33876553.14285714, "logps/chosen": -403.37896728515625, "logps/rejected": -492.32198660714283, "loss": 0.145, "rewards/chosen": 0.535626232624054, "rewards/margins": 3.2367662276540483, "rewards/rejected": -2.7011399950299944, "step": 6555 }, { "epoch": 0.34749423581480404, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48363048.0, "logits/rejected": -34323860.0, "logps/chosen": -553.423583984375, "logps/rejected": -412.41900634765625, "loss": 0.2345, "rewards/chosen": 0.5962470769882202, "rewards/margins": 2.951311230659485, "rewards/rejected": -2.3550641536712646, "step": 6556 }, { "epoch": 0.3475472398166062, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10071204.0, "logits/rejected": -21592384.0, "logps/chosen": -64.62449645996094, "logps/rejected": -127.79764229910714, "loss": 0.2575, "rewards/chosen": -0.3949485719203949, "rewards/margins": 1.2132960515362876, "rewards/rejected": -1.6082446234566825, "step": 6557 }, { "epoch": 0.3476002438184083, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84239296.0, "logits/rejected": -28812466.0, "logps/chosen": -360.48126220703125, "logps/rejected": -219.19662475585938, "loss": 0.3388, "rewards/chosen": 0.44947516918182373, "rewards/margins": 1.524292230606079, "rewards/rejected": -1.0748170614242554, "step": 6558 }, { "epoch": 0.3476532478202104, "grad_norm": 42.0, "kl": 0.21503543853759766, "learning_rate": 5e-07, "logits/chosen": -24876180.8, "logits/rejected": -37089416.0, "logps/chosen": -199.34205322265626, "logps/rejected": -782.22314453125, "loss": 0.3668, "rewards/chosen": -0.1762221097946167, "rewards/margins": 3.076412574450175, "rewards/rejected": -3.2526346842447915, "step": 6559 }, { "epoch": 0.34770625182201254, "grad_norm": 48.75, "kl": 1.7639617919921875, "learning_rate": 5e-07, "logits/chosen": -30459114.666666668, "logits/rejected": 390680.875, "logps/chosen": -235.58902994791666, "logps/rejected": -126.35865783691406, "loss": 0.377, "rewards/chosen": 0.47333772977193195, "rewards/margins": 2.2003076871236167, "rewards/rejected": -1.7269699573516846, "step": 6560 }, { "epoch": 0.3477592558238147, "grad_norm": 66.0, "kl": 1.0562496185302734, "learning_rate": 5e-07, "logits/chosen": -28324336.0, "logits/rejected": 996910.25, "logps/chosen": -409.4396158854167, "logps/rejected": -105.73945617675781, "loss": 0.441, "rewards/chosen": 0.34477512041727704, "rewards/margins": 0.4111691663662593, "rewards/rejected": -0.06639404594898224, "step": 6561 }, { "epoch": 0.3478122598256168, "grad_norm": 44.0, "kl": 0.9765357971191406, "learning_rate": 5e-07, "logits/chosen": -41836464.0, "logits/rejected": -63406368.0, "logps/chosen": -316.8494873046875, "logps/rejected": -464.163330078125, "loss": 0.3244, "rewards/chosen": 0.5870054562886556, "rewards/margins": 3.670312245686849, "rewards/rejected": -3.0833067893981934, "step": 6562 }, { "epoch": 0.34786526382741895, "grad_norm": 54.75, "kl": 1.2842273712158203, "learning_rate": 5e-07, "logits/chosen": -52339604.0, "logits/rejected": -15025079.0, "logps/chosen": -266.9995422363281, "logps/rejected": -200.82852172851562, "loss": 0.2886, "rewards/chosen": 0.6442693471908569, "rewards/margins": 2.0393433570861816, "rewards/rejected": -1.3950740098953247, "step": 6563 }, { "epoch": 0.3479182678292211, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12854538.0, "logits/rejected": -2379120.5, "logps/chosen": -170.61907958984375, "logps/rejected": -132.28604125976562, "loss": 0.3964, "rewards/chosen": -0.22610417008399963, "rewards/margins": 1.2918212115764618, "rewards/rejected": -1.5179253816604614, "step": 6564 }, { "epoch": 0.3479712718310232, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -586618.9, "logits/rejected": -13355925.333333334, "logps/chosen": -103.031494140625, "logps/rejected": -362.4617919921875, "loss": 0.3761, "rewards/chosen": 0.16644970178604127, "rewards/margins": 1.875731146335602, "rewards/rejected": -1.7092814445495605, "step": 6565 }, { "epoch": 0.34802427583282536, "grad_norm": 65.0, "kl": 0.5149974822998047, "learning_rate": 5e-07, "logits/chosen": -21003442.666666668, "logits/rejected": -1520420.4, "logps/chosen": -379.5848795572917, "logps/rejected": -183.03692626953125, "loss": 0.3927, "rewards/chosen": 0.053020477294921875, "rewards/margins": 0.8745071411132812, "rewards/rejected": -0.8214866638183593, "step": 6566 }, { "epoch": 0.3480772798346275, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1279104.1, "logits/rejected": -7730684.666666667, "logps/chosen": -219.325341796875, "logps/rejected": -148.8740234375, "loss": 0.4187, "rewards/chosen": -0.17743065357208251, "rewards/margins": 1.4241768280665081, "rewards/rejected": -1.6016074816385906, "step": 6567 }, { "epoch": 0.34813028383642963, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8772037.333333334, "logits/rejected": -35496224.0, "logps/chosen": -405.1060791015625, "logps/rejected": -597.3912109375, "loss": 0.1981, "rewards/chosen": 0.889530340830485, "rewards/margins": 3.45889565149943, "rewards/rejected": -2.569365310668945, "step": 6568 }, { "epoch": 0.34818328783823177, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35125344.0, "logits/rejected": 13807589.333333334, "logps/chosen": -394.01934814453125, "logps/rejected": -322.3652750651042, "loss": 0.1986, "rewards/chosen": 1.0195832252502441, "rewards/margins": 2.73594331741333, "rewards/rejected": -1.716360092163086, "step": 6569 }, { "epoch": 0.3482362918400339, "grad_norm": 49.75, "kl": 0.6325550079345703, "learning_rate": 5e-07, "logits/chosen": -3159109.3333333335, "logits/rejected": -1793839.25, "logps/chosen": -192.7039591471354, "logps/rejected": -46.6809196472168, "loss": 0.3625, "rewards/chosen": 0.5291794538497925, "rewards/margins": 1.4717469215393066, "rewards/rejected": -0.9425674676895142, "step": 6570 }, { "epoch": 0.34828929584183604, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65801636.0, "logits/rejected": -20520362.0, "logps/chosen": -202.3827667236328, "logps/rejected": -239.7823944091797, "loss": 0.313, "rewards/chosen": 0.30985647439956665, "rewards/margins": 2.161807596683502, "rewards/rejected": -1.8519511222839355, "step": 6571 }, { "epoch": 0.3483422998436382, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63305786.666666664, "logits/rejected": -12896586.4, "logps/chosen": -125.46192423502605, "logps/rejected": -227.410009765625, "loss": 0.3835, "rewards/chosen": -0.4196592966715495, "rewards/margins": 0.7163586934407551, "rewards/rejected": -1.1360179901123046, "step": 6572 }, { "epoch": 0.3483953038454403, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26428124.0, "logits/rejected": -9780722.0, "logps/chosen": -112.64823913574219, "logps/rejected": -386.61297607421875, "loss": 0.3856, "rewards/chosen": -0.3522215783596039, "rewards/margins": 1.9630400240421295, "rewards/rejected": -2.3152616024017334, "step": 6573 }, { "epoch": 0.34844830784724246, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24688232.0, "logits/rejected": -24990000.0, "logps/chosen": -228.0732421875, "logps/rejected": -377.2566731770833, "loss": 0.327, "rewards/chosen": 0.15270020961761474, "rewards/margins": 2.8125655889511108, "rewards/rejected": -2.659865379333496, "step": 6574 }, { "epoch": 0.3485013118490446, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 34937472.0, "logits/rejected": -26178714.0, "logps/chosen": -298.40087890625, "logps/rejected": -241.32882690429688, "loss": 0.3243, "rewards/chosen": 0.03392067551612854, "rewards/margins": 1.938379317522049, "rewards/rejected": -1.9044586420059204, "step": 6575 }, { "epoch": 0.34855431585084673, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32013922.0, "logits/rejected": -11879914.0, "logps/chosen": -531.3614501953125, "logps/rejected": -268.44317626953125, "loss": 0.268, "rewards/chosen": 0.9874671697616577, "rewards/margins": 2.4304150342941284, "rewards/rejected": -1.4429478645324707, "step": 6576 }, { "epoch": 0.34860731985264887, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3371415.5, "logits/rejected": -20477958.666666668, "logps/chosen": -77.26347351074219, "logps/rejected": -158.06341552734375, "loss": 0.266, "rewards/chosen": -0.40649646520614624, "rewards/margins": 1.4777583082516987, "rewards/rejected": -1.884254773457845, "step": 6577 }, { "epoch": 0.348660323854451, "grad_norm": 37.75, "kl": 1.2949295043945312, "learning_rate": 5e-07, "logits/chosen": -33182016.0, "logits/rejected": -31077824.0, "logps/chosen": -331.3556722005208, "logps/rejected": -344.452880859375, "loss": 0.1863, "rewards/chosen": 1.0908721288045247, "rewards/margins": 3.1597318013509117, "rewards/rejected": -2.068859672546387, "step": 6578 }, { "epoch": 0.34871332785625314, "grad_norm": 45.0, "kl": 0.70098876953125, "learning_rate": 5e-07, "logits/chosen": -28621555.2, "logits/rejected": -7303660.0, "logps/chosen": -267.86318359375, "logps/rejected": -239.6636962890625, "loss": 0.2843, "rewards/chosen": 0.3915297031402588, "rewards/margins": 2.6441395918528237, "rewards/rejected": -2.252609888712565, "step": 6579 }, { "epoch": 0.3487663318580553, "grad_norm": 55.25, "kl": 1.1207313537597656, "learning_rate": 5e-07, "logits/chosen": -27580085.333333332, "logits/rejected": -23694530.0, "logps/chosen": -357.5762125651042, "logps/rejected": -373.8291931152344, "loss": 0.3642, "rewards/chosen": 0.482397198677063, "rewards/margins": 2.6905120611190796, "rewards/rejected": -2.2081148624420166, "step": 6580 }, { "epoch": 0.3488193358598574, "grad_norm": 44.0, "kl": 0.3501167297363281, "learning_rate": 5e-07, "logits/chosen": -35991226.666666664, "logits/rejected": -50111369.6, "logps/chosen": -331.41530354817706, "logps/rejected": -394.40341796875, "loss": 0.2767, "rewards/chosen": -0.443695068359375, "rewards/margins": 1.8987346649169923, "rewards/rejected": -2.3424297332763673, "step": 6581 }, { "epoch": 0.34887233986165955, "grad_norm": 58.5, "kl": 0.06757068634033203, "learning_rate": 5e-07, "logits/chosen": -20589882.666666668, "logits/rejected": 2898980.5, "logps/chosen": -437.43798828125, "logps/rejected": -612.0579223632812, "loss": 0.2981, "rewards/chosen": 0.7385561466217041, "rewards/margins": 3.9252641201019287, "rewards/rejected": -3.1867079734802246, "step": 6582 }, { "epoch": 0.3489253438634617, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14970798.4, "logits/rejected": -23879640.0, "logps/chosen": -397.008984375, "logps/rejected": -310.82407633463544, "loss": 0.343, "rewards/chosen": 0.4453061580657959, "rewards/margins": 1.724030351638794, "rewards/rejected": -1.278724193572998, "step": 6583 }, { "epoch": 0.3489783478652638, "grad_norm": 52.25, "kl": 2.6967086791992188, "learning_rate": 5e-07, "logits/chosen": -63265184.0, "logits/rejected": -59548320.0, "logps/chosen": -509.941943359375, "logps/rejected": -426.8716634114583, "loss": 0.2971, "rewards/chosen": 0.5536659240722657, "rewards/margins": 2.6542133967081707, "rewards/rejected": -2.100547472635905, "step": 6584 }, { "epoch": 0.34903135186706596, "grad_norm": 55.75, "kl": 0.229034423828125, "learning_rate": 5e-07, "logits/chosen": -49210664.0, "logits/rejected": -28064560.0, "logps/chosen": -563.1509399414062, "logps/rejected": -346.8109537760417, "loss": 0.1987, "rewards/chosen": 1.2636092901229858, "rewards/margins": 3.0298306544621783, "rewards/rejected": -1.7662213643391926, "step": 6585 }, { "epoch": 0.3490843558688681, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42423858.666666664, "logits/rejected": 1807673.6, "logps/chosen": -338.3053385416667, "logps/rejected": -225.390087890625, "loss": 0.2995, "rewards/chosen": 0.13115564982096353, "rewards/margins": 1.7472363154093424, "rewards/rejected": -1.6160806655883788, "step": 6586 }, { "epoch": 0.34913735987067024, "grad_norm": 32.75, "kl": 0.045562744140625, "learning_rate": 5e-07, "logits/chosen": -23236002.666666668, "logits/rejected": -2780067.0, "logps/chosen": -215.209716796875, "logps/rejected": -240.099658203125, "loss": 0.2702, "rewards/chosen": 0.25821711619695026, "rewards/margins": 2.1916395862897238, "rewards/rejected": -1.9334224700927733, "step": 6587 }, { "epoch": 0.3491903638724724, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 156322.8, "logits/rejected": -10390268.666666666, "logps/chosen": -98.80484008789062, "logps/rejected": -228.84102376302084, "loss": 0.3066, "rewards/chosen": 0.5990866184234619, "rewards/margins": 2.0760886033376056, "rewards/rejected": -1.4770019849141438, "step": 6588 }, { "epoch": 0.3492433678742745, "grad_norm": 56.0, "kl": 1.3092193603515625, "learning_rate": 5e-07, "logits/chosen": -33673828.0, "logits/rejected": 24550092.0, "logps/chosen": -250.50979614257812, "logps/rejected": -250.33535766601562, "loss": 0.3449, "rewards/chosen": 0.6510093808174133, "rewards/margins": 1.4517227411270142, "rewards/rejected": -0.8007133603096008, "step": 6589 }, { "epoch": 0.34929637187607665, "grad_norm": 70.0, "kl": 0.115753173828125, "learning_rate": 5e-07, "logits/chosen": -11266937.6, "logits/rejected": -36012424.0, "logps/chosen": -423.266259765625, "logps/rejected": -530.1956380208334, "loss": 0.3236, "rewards/chosen": 0.23880505561828613, "rewards/margins": 3.2677093346913657, "rewards/rejected": -3.0289042790730796, "step": 6590 }, { "epoch": 0.3493493758778788, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18197921.6, "logits/rejected": -18025136.0, "logps/chosen": -149.78880615234374, "logps/rejected": -307.3856608072917, "loss": 0.395, "rewards/chosen": -0.08166701793670654, "rewards/margins": 1.586114017168681, "rewards/rejected": -1.6677810351053874, "step": 6591 }, { "epoch": 0.3494023798796809, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38791952.0, "logits/rejected": -42183956.0, "logps/chosen": -467.4280700683594, "logps/rejected": -350.9473571777344, "loss": 0.2396, "rewards/chosen": 1.2592506408691406, "rewards/margins": 2.726468086242676, "rewards/rejected": -1.4672174453735352, "step": 6592 }, { "epoch": 0.34945538388148306, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17959316.0, "logits/rejected": -21538046.666666668, "logps/chosen": -203.20968627929688, "logps/rejected": -199.2618408203125, "loss": 0.2331, "rewards/chosen": 0.925501823425293, "rewards/margins": 2.4439395268758135, "rewards/rejected": -1.5184377034505208, "step": 6593 }, { "epoch": 0.3495083878832852, "grad_norm": 45.0, "kl": 0.06987380981445312, "learning_rate": 5e-07, "logits/chosen": -25066809.6, "logits/rejected": -27294957.333333332, "logps/chosen": -280.0019775390625, "logps/rejected": -413.0448404947917, "loss": 0.3287, "rewards/chosen": 0.11346038579940795, "rewards/margins": 3.257233146826426, "rewards/rejected": -3.143772761027018, "step": 6594 }, { "epoch": 0.34956139188508734, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44060521.6, "logits/rejected": -22526322.666666668, "logps/chosen": -337.077734375, "logps/rejected": -248.31658935546875, "loss": 0.2508, "rewards/chosen": 0.6355462551116944, "rewards/margins": 3.195865201950073, "rewards/rejected": -2.560318946838379, "step": 6595 }, { "epoch": 0.3496143958868895, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25758336.0, "logits/rejected": -31380174.0, "logps/chosen": -249.11080932617188, "logps/rejected": -381.17669677734375, "loss": 0.2597, "rewards/chosen": 0.49573010206222534, "rewards/margins": 2.922764003276825, "rewards/rejected": -2.4270339012145996, "step": 6596 }, { "epoch": 0.3496673998886916, "grad_norm": 46.5, "kl": 0.10106945037841797, "learning_rate": 5e-07, "logits/chosen": 1277878.2857142857, "logits/rejected": 3028510.0, "logps/chosen": -95.92860630580357, "logps/rejected": -19.76954460144043, "loss": 0.5607, "rewards/chosen": -0.36323400906154085, "rewards/margins": -0.16647974721023012, "rewards/rejected": -0.19675426185131073, "step": 6597 }, { "epoch": 0.34972040389049375, "grad_norm": 44.5, "kl": 0.403961181640625, "learning_rate": 5e-07, "logits/chosen": -26354155.2, "logits/rejected": -5122205.666666667, "logps/chosen": -172.8376708984375, "logps/rejected": -137.6269734700521, "loss": 0.3507, "rewards/chosen": 0.36461672782897947, "rewards/margins": 1.810798978805542, "rewards/rejected": -1.4461822509765625, "step": 6598 }, { "epoch": 0.3497734078922959, "grad_norm": 55.0, "kl": 0.4226226806640625, "learning_rate": 5e-07, "logits/chosen": -53655016.0, "logits/rejected": -30934939.42857143, "logps/chosen": -458.4390869140625, "logps/rejected": -422.027099609375, "loss": 0.1823, "rewards/chosen": 0.17302857339382172, "rewards/margins": 2.1897146041904176, "rewards/rejected": -2.016686030796596, "step": 6599 }, { "epoch": 0.349826411894098, "grad_norm": 66.0, "kl": 1.7032203674316406, "learning_rate": 5e-07, "logits/chosen": -34501785.6, "logits/rejected": -2316009.0, "logps/chosen": -356.86611328125, "logps/rejected": -97.3002217610677, "loss": 0.3952, "rewards/chosen": 0.3596005439758301, "rewards/margins": 2.0025172233581543, "rewards/rejected": -1.6429166793823242, "step": 6600 }, { "epoch": 0.34987941589590016, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38785733.333333336, "logits/rejected": 1817379.75, "logps/chosen": -301.1223958333333, "logps/rejected": -325.85211181640625, "loss": 0.3925, "rewards/chosen": 0.12284139792124431, "rewards/margins": 1.7981846729914348, "rewards/rejected": -1.6753432750701904, "step": 6601 }, { "epoch": 0.3499324198977023, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25084586.666666668, "logits/rejected": -44303654.4, "logps/chosen": -273.005615234375, "logps/rejected": -404.9955078125, "loss": 0.2978, "rewards/chosen": -0.17920583486557007, "rewards/margins": 1.6898089051246643, "rewards/rejected": -1.8690147399902344, "step": 6602 }, { "epoch": 0.34998542389950443, "grad_norm": 47.25, "kl": 0.0467529296875, "learning_rate": 5e-07, "logits/chosen": -54748968.0, "logits/rejected": -20445552.0, "logps/chosen": -555.4016723632812, "logps/rejected": -385.5992431640625, "loss": 0.3203, "rewards/chosen": 0.17242321372032166, "rewards/margins": 2.154627412557602, "rewards/rejected": -1.9822041988372803, "step": 6603 }, { "epoch": 0.35003842790130657, "grad_norm": 48.5, "kl": 0.5003871917724609, "learning_rate": 5e-07, "logits/chosen": -35430328.0, "logits/rejected": -25003512.0, "logps/chosen": -258.4736633300781, "logps/rejected": -750.5146484375, "loss": 0.2926, "rewards/chosen": 0.00958070158958435, "rewards/margins": 2.9817915856838226, "rewards/rejected": -2.9722108840942383, "step": 6604 }, { "epoch": 0.3500914319031087, "grad_norm": 51.0, "kl": 0.7531824111938477, "learning_rate": 5e-07, "logits/chosen": -5440884.666666667, "logits/rejected": -8663172.0, "logps/chosen": -259.65736897786456, "logps/rejected": -42.09040451049805, "loss": 0.3581, "rewards/chosen": 0.3442510763804118, "rewards/margins": 2.5994974772135415, "rewards/rejected": -2.25524640083313, "step": 6605 }, { "epoch": 0.35014443590491084, "grad_norm": 49.25, "kl": 1.353372573852539, "learning_rate": 5e-07, "logits/chosen": -1596314.4, "logits/rejected": -12865069.333333334, "logps/chosen": -239.5386962890625, "logps/rejected": -523.7763671875, "loss": 0.2627, "rewards/chosen": 0.7948275089263916, "rewards/margins": 4.368355798721313, "rewards/rejected": -3.573528289794922, "step": 6606 }, { "epoch": 0.350197439906713, "grad_norm": 30.625, "kl": 1.161956787109375, "learning_rate": 5e-07, "logits/chosen": -5902376.8, "logits/rejected": -27023490.666666668, "logps/chosen": -638.0115234375, "logps/rejected": -569.2666422526041, "loss": 0.1355, "rewards/chosen": 1.965464210510254, "rewards/margins": 5.658310508728027, "rewards/rejected": -3.6928462982177734, "step": 6607 }, { "epoch": 0.3502504439085151, "grad_norm": 42.0, "kl": 1.6419525146484375, "learning_rate": 5e-07, "logits/chosen": -7005107.2, "logits/rejected": -29714165.333333332, "logps/chosen": -209.621875, "logps/rejected": -485.7900390625, "loss": 0.328, "rewards/chosen": 0.48998513221740725, "rewards/margins": 3.444718345006307, "rewards/rejected": -2.9547332127889, "step": 6608 }, { "epoch": 0.35030344791031726, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46660544.0, "logits/rejected": -44033942.4, "logps/chosen": -323.174072265625, "logps/rejected": -497.95234375, "loss": 0.2081, "rewards/chosen": 0.7228736877441406, "rewards/margins": 2.848213768005371, "rewards/rejected": -2.1253400802612306, "step": 6609 }, { "epoch": 0.35035645191211934, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33143038.0, "logits/rejected": -25572212.0, "logps/chosen": -215.34368896484375, "logps/rejected": -356.10943603515625, "loss": 0.3003, "rewards/chosen": 0.17623895406723022, "rewards/margins": 2.2431969046592712, "rewards/rejected": -2.066957950592041, "step": 6610 }, { "epoch": 0.3504094559139215, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7858117.5, "logits/rejected": -40446281.14285714, "logps/chosen": -26.42820167541504, "logps/rejected": -661.6077008928571, "loss": 0.1563, "rewards/chosen": 0.2361549437046051, "rewards/margins": 3.1129353174141476, "rewards/rejected": -2.8767803737095425, "step": 6611 }, { "epoch": 0.3504624599157236, "grad_norm": 39.75, "kl": 0.6899833679199219, "learning_rate": 5e-07, "logits/chosen": -6544920.0, "logits/rejected": -30401290.666666668, "logps/chosen": -225.24014282226562, "logps/rejected": -318.20611572265625, "loss": 0.271, "rewards/chosen": 0.19732666015625, "rewards/margins": 1.7219456036885579, "rewards/rejected": -1.5246189435323079, "step": 6612 }, { "epoch": 0.35051546391752575, "grad_norm": 63.0, "kl": 1.977020263671875, "learning_rate": 5e-07, "logits/chosen": -22562501.333333332, "logits/rejected": -57339872.0, "logps/chosen": -372.7257486979167, "logps/rejected": -357.9502258300781, "loss": 0.3903, "rewards/chosen": 0.47739966710408527, "rewards/margins": 2.5303672154744468, "rewards/rejected": -2.0529675483703613, "step": 6613 }, { "epoch": 0.3505684679193279, "grad_norm": 46.75, "kl": 0.958289623260498, "learning_rate": 5e-07, "logits/chosen": -22285664.0, "logits/rejected": -4604284.0, "logps/chosen": -295.4720764160156, "logps/rejected": -305.6891784667969, "loss": 0.2573, "rewards/chosen": 0.42552486062049866, "rewards/margins": 2.3693725168704987, "rewards/rejected": -1.94384765625, "step": 6614 }, { "epoch": 0.35062147192113, "grad_norm": 44.5, "kl": 0.7582540512084961, "learning_rate": 5e-07, "logits/chosen": 592972.875, "logits/rejected": -42256488.0, "logps/chosen": -183.0921630859375, "logps/rejected": -264.8257649739583, "loss": 0.3004, "rewards/chosen": 0.08500079810619354, "rewards/margins": 1.5214851349592209, "rewards/rejected": -1.4364843368530273, "step": 6615 }, { "epoch": 0.35067447592293216, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10395972.0, "logits/rejected": -4557211.0, "logps/chosen": -159.046875, "logps/rejected": -345.74462890625, "loss": 0.2681, "rewards/chosen": 0.3673882484436035, "rewards/margins": 3.144033432006836, "rewards/rejected": -2.7766451835632324, "step": 6616 }, { "epoch": 0.3507274799247343, "grad_norm": 51.25, "kl": 0.9807682037353516, "learning_rate": 5e-07, "logits/chosen": -32814906.0, "logits/rejected": -21119532.0, "logps/chosen": -344.5673828125, "logps/rejected": -261.6119079589844, "loss": 0.2993, "rewards/chosen": 0.3961290717124939, "rewards/margins": 2.203514277935028, "rewards/rejected": -1.8073852062225342, "step": 6617 }, { "epoch": 0.35078048392653643, "grad_norm": 35.5, "kl": 0.10288238525390625, "learning_rate": 5e-07, "logits/chosen": 969116.6, "logits/rejected": -16196429.333333334, "logps/chosen": -69.06618041992188, "logps/rejected": -265.9099527994792, "loss": 0.3368, "rewards/chosen": 0.33650593757629393, "rewards/margins": 2.00406444867452, "rewards/rejected": -1.6675585110982258, "step": 6618 }, { "epoch": 0.35083348792833857, "grad_norm": 38.5, "kl": 0.2804584503173828, "learning_rate": 5e-07, "logits/chosen": -27577278.0, "logits/rejected": -21406512.0, "logps/chosen": -251.2842559814453, "logps/rejected": -223.62704467773438, "loss": 0.2825, "rewards/chosen": 0.2181997001171112, "rewards/margins": 2.437434643507004, "rewards/rejected": -2.2192349433898926, "step": 6619 }, { "epoch": 0.3508864919301407, "grad_norm": 44.5, "kl": 0.03562164306640625, "learning_rate": 5e-07, "logits/chosen": -40892352.0, "logits/rejected": -19296328.0, "logps/chosen": -265.965576171875, "logps/rejected": -100.62248229980469, "loss": 0.3454, "rewards/chosen": 0.451863169670105, "rewards/margins": 1.3453932404518127, "rewards/rejected": -0.8935300707817078, "step": 6620 }, { "epoch": 0.35093949593194285, "grad_norm": 58.0, "kl": 0.5485553741455078, "learning_rate": 5e-07, "logits/chosen": -35560688.0, "logps/chosen": -471.413818359375, "loss": 0.4192, "rewards/chosen": 0.5274359583854675, "step": 6621 }, { "epoch": 0.350992499933745, "grad_norm": 44.5, "kl": 0.6044406890869141, "learning_rate": 5e-07, "logits/chosen": -13259972.8, "logits/rejected": -27656298.666666668, "logps/chosen": -199.829443359375, "logps/rejected": -246.8319295247396, "loss": 0.3992, "rewards/chosen": -0.008691745996475219, "rewards/margins": 1.7763683418432872, "rewards/rejected": -1.7850600878397624, "step": 6622 }, { "epoch": 0.3510455039355471, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16044916.0, "logits/rejected": -41542696.0, "logps/chosen": -233.7569580078125, "logps/rejected": -530.9693603515625, "loss": 0.3198, "rewards/chosen": 0.05193433165550232, "rewards/margins": 2.5653067529201508, "rewards/rejected": -2.5133724212646484, "step": 6623 }, { "epoch": 0.35109850793734926, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31355154.666666668, "logits/rejected": -25435454.4, "logps/chosen": -303.0804443359375, "logps/rejected": -338.5276123046875, "loss": 0.2735, "rewards/chosen": 0.6509458223978678, "rewards/margins": 2.0871813456217447, "rewards/rejected": -1.436235523223877, "step": 6624 }, { "epoch": 0.3511515119391514, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19829660.0, "logits/rejected": -44917632.0, "logps/chosen": -257.880859375, "logps/rejected": -399.90789794921875, "loss": 0.3145, "rewards/chosen": -0.0158004779368639, "rewards/margins": 2.21764206700027, "rewards/rejected": -2.233442544937134, "step": 6625 }, { "epoch": 0.35120451594095353, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63828416.0, "logits/rejected": -18025608.0, "logps/chosen": -289.31890869140625, "logps/rejected": -159.0135955810547, "loss": 0.4026, "rewards/chosen": 0.046278953552246094, "rewards/margins": 0.8380489349365234, "rewards/rejected": -0.7917699813842773, "step": 6626 }, { "epoch": 0.35125751994275567, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68807024.0, "logits/rejected": -28534382.0, "logps/chosen": -247.8519744873047, "logps/rejected": -380.4761657714844, "loss": 0.2935, "rewards/chosen": 0.4499490261077881, "rewards/margins": 2.630714178085327, "rewards/rejected": -2.180765151977539, "step": 6627 }, { "epoch": 0.3513105239445578, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4040146.5714285714, "logits/rejected": -62981984.0, "logps/chosen": -96.20146833147321, "logps/rejected": -179.48402404785156, "loss": 0.3831, "rewards/chosen": 0.38615165437970844, "rewards/margins": 1.8600698539188931, "rewards/rejected": -1.4739181995391846, "step": 6628 }, { "epoch": 0.35136352794635994, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29776266.666666668, "logits/rejected": 10807789.6, "logps/chosen": -284.4292399088542, "logps/rejected": -197.673974609375, "loss": 0.2289, "rewards/chosen": 0.6422922611236572, "rewards/margins": 2.466483736038208, "rewards/rejected": -1.8241914749145507, "step": 6629 }, { "epoch": 0.3514165319481621, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48250316.8, "logits/rejected": -4764.833333333333, "logps/chosen": -525.75556640625, "logps/rejected": -202.2494099934896, "loss": 0.381, "rewards/chosen": 0.086579430103302, "rewards/margins": 1.4454445322354634, "rewards/rejected": -1.3588651021321614, "step": 6630 }, { "epoch": 0.3514695359499642, "grad_norm": 42.75, "kl": 0.09053802490234375, "learning_rate": 5e-07, "logits/chosen": -33318131.2, "logits/rejected": -24577282.666666668, "logps/chosen": -165.923779296875, "logps/rejected": -629.9503173828125, "loss": 0.3483, "rewards/chosen": -0.13532428741455077, "rewards/margins": 5.304688326517741, "rewards/rejected": -5.440012613932292, "step": 6631 }, { "epoch": 0.35152253995176636, "grad_norm": 39.75, "kl": 2.7867794036865234, "learning_rate": 5e-07, "logits/chosen": -5989769.0, "logits/rejected": -84825808.0, "logps/chosen": -472.1418762207031, "logps/rejected": -362.4278869628906, "loss": 0.2104, "rewards/chosen": 0.9135616421699524, "rewards/margins": 3.469898521900177, "rewards/rejected": -2.5563368797302246, "step": 6632 }, { "epoch": 0.3515755439535685, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28388446.0, "logits/rejected": -21632586.0, "logps/chosen": -278.92083740234375, "logps/rejected": -420.4891662597656, "loss": 0.3, "rewards/chosen": 0.029531285166740417, "rewards/margins": 2.2153292149305344, "rewards/rejected": -2.185797929763794, "step": 6633 }, { "epoch": 0.35162854795537063, "grad_norm": 37.25, "kl": 1.1911888122558594, "learning_rate": 5e-07, "logits/chosen": -13127724.0, "logits/rejected": -34937366.4, "logps/chosen": -197.11263020833334, "logps/rejected": -293.3996826171875, "loss": 0.2918, "rewards/chosen": 0.1581553022066752, "rewards/margins": 2.114519170920054, "rewards/rejected": -1.9563638687133789, "step": 6634 }, { "epoch": 0.35168155195717277, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95740480.0, "logits/rejected": -1793285.0, "logps/chosen": -258.83916015625, "logps/rejected": -126.9947509765625, "loss": 0.4061, "rewards/chosen": 0.24697632789611818, "rewards/margins": 1.0888113498687744, "rewards/rejected": -0.8418350219726562, "step": 6635 }, { "epoch": 0.3517345559589749, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18771176.0, "logits/rejected": -29284258.0, "logps/chosen": -62.86201477050781, "logps/rejected": -195.65086364746094, "loss": 0.5022, "rewards/chosen": -0.2711080114046733, "rewards/margins": 0.5560167630513508, "rewards/rejected": -0.8271247744560242, "step": 6636 }, { "epoch": 0.35178755996077704, "grad_norm": 49.0, "kl": 0.13053131103515625, "learning_rate": 5e-07, "logits/chosen": -40372822.4, "logits/rejected": 4695014.333333333, "logps/chosen": -347.9662841796875, "logps/rejected": -118.58998616536458, "loss": 0.3831, "rewards/chosen": 0.46553940773010255, "rewards/margins": 1.260938835144043, "rewards/rejected": -0.7953994274139404, "step": 6637 }, { "epoch": 0.3518405639625792, "grad_norm": 68.5, "kl": 0.12459564208984375, "learning_rate": 5e-07, "logits/chosen": -86106024.0, "logits/rejected": -22802762.0, "logps/chosen": -477.3194885253906, "logps/rejected": -233.24314880371094, "loss": 0.343, "rewards/chosen": 0.06984862685203552, "rewards/margins": 2.1843594014644623, "rewards/rejected": -2.1145107746124268, "step": 6638 }, { "epoch": 0.3518935679643813, "grad_norm": 44.25, "kl": 0.3306751251220703, "learning_rate": 5e-07, "logits/chosen": -29278202.666666668, "logits/rejected": -67659091.2, "logps/chosen": -224.3547159830729, "logps/rejected": -313.149267578125, "loss": 0.2503, "rewards/chosen": 0.746572732925415, "rewards/margins": 2.2101805210113525, "rewards/rejected": -1.4636077880859375, "step": 6639 }, { "epoch": 0.35194657196618345, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45111398.4, "logits/rejected": -19470709.333333332, "logps/chosen": -426.132568359375, "logps/rejected": -222.3368123372396, "loss": 0.3281, "rewards/chosen": 0.15729377269744874, "rewards/margins": 2.6126459201176964, "rewards/rejected": -2.4553521474202475, "step": 6640 }, { "epoch": 0.3519995759679856, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60846997.333333336, "logits/rejected": -32017027.2, "logps/chosen": -263.4842529296875, "logps/rejected": -202.38157958984374, "loss": 0.3734, "rewards/chosen": -0.29382769266764325, "rewards/margins": 0.8841454823811847, "rewards/rejected": -1.177973175048828, "step": 6641 }, { "epoch": 0.3520525799697877, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20255450.0, "logits/rejected": -23411146.0, "logps/chosen": -285.25775146484375, "logps/rejected": -278.787353515625, "loss": 0.318, "rewards/chosen": 0.0050642043352127075, "rewards/margins": 2.053205206990242, "rewards/rejected": -2.0481410026550293, "step": 6642 }, { "epoch": 0.35210558397158986, "grad_norm": 45.5, "kl": 0.29410743713378906, "learning_rate": 5e-07, "logits/chosen": -38939065.6, "logits/rejected": 1539768.3333333333, "logps/chosen": -385.0711181640625, "logps/rejected": -330.96767171223956, "loss": 0.3472, "rewards/chosen": 0.3146281957626343, "rewards/margins": 1.8688496669133503, "rewards/rejected": -1.554221471150716, "step": 6643 }, { "epoch": 0.352158587973392, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27920682.666666668, "logits/rejected": -28160745.6, "logps/chosen": -385.3147786458333, "logps/rejected": -418.21357421875, "loss": 0.1724, "rewards/chosen": 1.1277618408203125, "rewards/margins": 3.522259330749512, "rewards/rejected": -2.3944974899291993, "step": 6644 }, { "epoch": 0.35221159197519414, "grad_norm": 46.0, "kl": 3.096541404724121, "learning_rate": 5e-07, "logits/chosen": -4475089.5, "logits/rejected": -46378140.0, "logps/chosen": -181.0362548828125, "logps/rejected": -249.72044372558594, "loss": 0.2839, "rewards/chosen": 0.8575908541679382, "rewards/margins": 2.0743778347969055, "rewards/rejected": -1.2167869806289673, "step": 6645 }, { "epoch": 0.3522645959769963, "grad_norm": 53.5, "kl": 0.1799182891845703, "learning_rate": 5e-07, "logits/chosen": -20276836.0, "logits/rejected": -35185264.0, "logps/chosen": -288.33770751953125, "logps/rejected": -217.89413452148438, "loss": 0.2854, "rewards/chosen": 0.5364760160446167, "rewards/margins": 2.3443297147750854, "rewards/rejected": -1.8078536987304688, "step": 6646 }, { "epoch": 0.3523175999787984, "grad_norm": 69.0, "kl": 1.5338897705078125, "learning_rate": 5e-07, "logits/chosen": -24711449.14285714, "logits/rejected": -31361906.0, "logps/chosen": -249.766845703125, "logps/rejected": -303.55401611328125, "loss": 0.4797, "rewards/chosen": 0.22958321230752127, "rewards/margins": 0.7819300379071916, "rewards/rejected": -0.5523468255996704, "step": 6647 }, { "epoch": 0.35237060398060055, "grad_norm": 62.0, "kl": 0.5577468872070312, "learning_rate": 5e-07, "logits/chosen": -46184466.28571428, "logits/rejected": -6025480.5, "logps/chosen": -324.7990025111607, "logps/rejected": -619.0795288085938, "loss": 0.4271, "rewards/chosen": 0.22111845016479492, "rewards/margins": 2.295703411102295, "rewards/rejected": -2.0745849609375, "step": 6648 }, { "epoch": 0.3524236079824027, "grad_norm": 43.5, "kl": 0.71392822265625, "learning_rate": 5e-07, "logits/chosen": -30934854.0, "logits/rejected": -50463128.0, "logps/chosen": -289.2102966308594, "logps/rejected": -290.0879821777344, "loss": 0.2722, "rewards/chosen": 0.768321692943573, "rewards/margins": 2.382805049419403, "rewards/rejected": -1.61448335647583, "step": 6649 }, { "epoch": 0.3524766119842048, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40788202.666666664, "logits/rejected": -37708596.0, "logps/chosen": -371.0154215494792, "logps/rejected": -558.1531372070312, "loss": 0.2908, "rewards/chosen": 0.6287674109141032, "rewards/margins": 4.094306866327922, "rewards/rejected": -3.4655394554138184, "step": 6650 }, { "epoch": 0.35252961598600696, "grad_norm": 55.25, "kl": 1.3332405090332031, "learning_rate": 5e-07, "logits/chosen": -12634423.2, "logits/rejected": -14843674.666666666, "logps/chosen": -217.146337890625, "logps/rejected": -280.35740152994794, "loss": 0.366, "rewards/chosen": 0.36642303466796877, "rewards/margins": 1.6116979757944745, "rewards/rejected": -1.2452749411265056, "step": 6651 }, { "epoch": 0.3525826199878091, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -26103788.0, "logps/rejected": -209.426513671875, "loss": 0.1459, "rewards/rejected": -1.9235223531723022, "step": 6652 }, { "epoch": 0.35263562398961124, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7351531.0, "logits/rejected": -42482373.333333336, "logps/chosen": -120.57123565673828, "logps/rejected": -252.02950032552084, "loss": 0.2436, "rewards/chosen": 0.35174256563186646, "rewards/margins": 2.255064308643341, "rewards/rejected": -1.9033217430114746, "step": 6653 }, { "epoch": 0.3526886279914134, "grad_norm": 68.5, "kl": 0.3537178039550781, "learning_rate": 5e-07, "logits/chosen": 51109956.571428575, "logits/rejected": -42752056.0, "logps/chosen": -440.27280970982144, "logps/rejected": -390.095458984375, "loss": 0.3396, "rewards/chosen": 0.5845851216997419, "rewards/margins": 2.6566921983446394, "rewards/rejected": -2.0721070766448975, "step": 6654 }, { "epoch": 0.3527416319932155, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3452332.3333333335, "logits/rejected": -14978651.2, "logps/chosen": -91.65843709309895, "logps/rejected": -296.5496337890625, "loss": 0.3225, "rewards/chosen": -0.4416985511779785, "rewards/margins": 1.3881461143493652, "rewards/rejected": -1.8298446655273437, "step": 6655 }, { "epoch": 0.35279463599501765, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6557774.5, "logits/rejected": -13570795.0, "logps/chosen": -422.31890869140625, "logps/rejected": -238.917236328125, "loss": 0.2873, "rewards/chosen": 0.48518258333206177, "rewards/margins": 2.1992005705833435, "rewards/rejected": -1.7140179872512817, "step": 6656 }, { "epoch": 0.3528476399968198, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12792150.0, "logits/rejected": -7188721.333333333, "logps/chosen": -90.67021942138672, "logps/rejected": -95.47737630208333, "loss": 0.3071, "rewards/chosen": -0.3194345533847809, "rewards/margins": 1.3864119152228038, "rewards/rejected": -1.7058464686075847, "step": 6657 }, { "epoch": 0.3529006439986219, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38802218.666666664, "logits/rejected": -49271408.0, "logps/chosen": -401.7954915364583, "logps/rejected": -357.88330078125, "loss": 0.373, "rewards/chosen": 0.3016713460286458, "rewards/margins": 2.181212385495504, "rewards/rejected": -1.879541039466858, "step": 6658 }, { "epoch": 0.35295364800042406, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30042995.2, "logits/rejected": -36301216.0, "logps/chosen": -216.571875, "logps/rejected": -265.4097493489583, "loss": 0.397, "rewards/chosen": -0.1013570785522461, "rewards/margins": 1.427234395345052, "rewards/rejected": -1.528591473897298, "step": 6659 }, { "epoch": 0.3530066520022262, "grad_norm": 41.75, "kl": 0.7870025634765625, "learning_rate": 5e-07, "logits/chosen": -44509781.333333336, "logits/rejected": -33372428.8, "logps/chosen": -430.1783854166667, "logps/rejected": -212.568896484375, "loss": 0.242, "rewards/chosen": 1.0967986583709717, "rewards/margins": 2.3401236057281496, "rewards/rejected": -1.2433249473571777, "step": 6660 }, { "epoch": 0.3530596560040283, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49980665.6, "logits/rejected": 531879.8333333334, "logps/chosen": -343.727978515625, "logps/rejected": -76.8219706217448, "loss": 0.3107, "rewards/chosen": 0.8718410491943359, "rewards/margins": 2.355618413289388, "rewards/rejected": -1.483777364095052, "step": 6661 }, { "epoch": 0.3531126600058304, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29879076.0, "logits/rejected": -19906408.0, "logps/chosen": -292.87738037109375, "logps/rejected": -317.748779296875, "loss": 0.285, "rewards/chosen": 0.8889074325561523, "rewards/margins": 2.547318935394287, "rewards/rejected": -1.6584115028381348, "step": 6662 }, { "epoch": 0.35316566400763255, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30124202.0, "logits/rejected": -5460597.0, "logps/chosen": -204.5609588623047, "logps/rejected": -64.906494140625, "loss": 0.3072, "rewards/chosen": 0.4246434271335602, "rewards/margins": 1.906154066324234, "rewards/rejected": -1.4815106391906738, "step": 6663 }, { "epoch": 0.3532186680094347, "grad_norm": 45.5, "kl": 0.7497653961181641, "learning_rate": 5e-07, "logits/chosen": -43536620.0, "logits/rejected": -11818901.0, "logps/chosen": -299.81170654296875, "logps/rejected": -348.281494140625, "loss": 0.3357, "rewards/chosen": 0.36605337262153625, "rewards/margins": 1.6146095097064972, "rewards/rejected": -1.248556137084961, "step": 6664 }, { "epoch": 0.3532716720112368, "grad_norm": 39.75, "kl": 1.423959732055664, "learning_rate": 5e-07, "logits/chosen": -22491988.0, "logits/rejected": -261333.5, "logps/chosen": -216.3011016845703, "logps/rejected": -43.76136779785156, "loss": 0.3664, "rewards/chosen": 0.6815975904464722, "rewards/margins": 1.1396339535713196, "rewards/rejected": -0.4580363631248474, "step": 6665 }, { "epoch": 0.35332467601303896, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53705952.0, "logits/rejected": -25366435.2, "logps/chosen": -316.4569905598958, "logps/rejected": -356.6641357421875, "loss": 0.2818, "rewards/chosen": -0.05788115660349528, "rewards/margins": 2.201054962476095, "rewards/rejected": -2.25893611907959, "step": 6666 }, { "epoch": 0.3533776800148411, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16059900.0, "logits/rejected": -27809272.0, "logps/chosen": -332.89776611328125, "logps/rejected": -392.3448181152344, "loss": 0.2864, "rewards/chosen": 0.33424070477485657, "rewards/margins": 2.591729909181595, "rewards/rejected": -2.2574892044067383, "step": 6667 }, { "epoch": 0.35343068401664324, "grad_norm": 43.25, "kl": 2.6668882369995117, "learning_rate": 5e-07, "logits/chosen": 5843152.0, "logits/rejected": -5731532.0, "logps/chosen": -151.2949462890625, "logps/rejected": -378.439697265625, "loss": 0.4226, "rewards/chosen": 0.12270008325576783, "rewards/margins": 1.7305854439735413, "rewards/rejected": -1.6078853607177734, "step": 6668 }, { "epoch": 0.3534836880184454, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51838024.0, "logits/rejected": -5564166.666666667, "logps/chosen": -382.07373046875, "logps/rejected": -329.46079508463544, "loss": 0.1737, "rewards/chosen": 0.4315849244594574, "rewards/margins": 3.23462383945783, "rewards/rejected": -2.8030389149983725, "step": 6669 }, { "epoch": 0.3535366920202475, "grad_norm": 31.75, "kl": 2.7755889892578125, "learning_rate": 5e-07, "logits/chosen": -28983586.0, "logits/rejected": -39449348.0, "logps/chosen": -492.385009765625, "logps/rejected": -366.72808837890625, "loss": 0.1692, "rewards/chosen": 1.1783095598220825, "rewards/margins": 3.9684520959854126, "rewards/rejected": -2.79014253616333, "step": 6670 }, { "epoch": 0.35358969602204965, "grad_norm": 57.25, "kl": 0.3664073944091797, "learning_rate": 5e-07, "logits/chosen": -27569088.0, "logits/rejected": -18241702.0, "logps/chosen": -679.4393310546875, "logps/rejected": -251.55621337890625, "loss": 0.2129, "rewards/chosen": 0.7533043622970581, "rewards/margins": 3.5998376607894897, "rewards/rejected": -2.8465332984924316, "step": 6671 }, { "epoch": 0.3536427000238518, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41523811.2, "logits/rejected": -79939946.66666667, "logps/chosen": -95.93277587890626, "logps/rejected": -381.4395345052083, "loss": 0.3268, "rewards/chosen": 0.36556644439697267, "rewards/margins": 2.2962149302164714, "rewards/rejected": -1.9306484858194988, "step": 6672 }, { "epoch": 0.3536957040256539, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36514232.0, "logits/rejected": -42822070.4, "logps/chosen": -428.7292887369792, "logps/rejected": -629.4560546875, "loss": 0.2518, "rewards/chosen": 0.10687261819839478, "rewards/margins": 2.5169825196266173, "rewards/rejected": -2.4101099014282226, "step": 6673 }, { "epoch": 0.35374870802745606, "grad_norm": 50.25, "kl": 0.9509201049804688, "learning_rate": 5e-07, "logits/chosen": -28835962.666666668, "logits/rejected": -25962984.0, "logps/chosen": -270.4101155598958, "logps/rejected": -236.2541046142578, "loss": 0.4412, "rewards/chosen": -0.22652210791905722, "rewards/margins": 1.8130615750948589, "rewards/rejected": -2.039583683013916, "step": 6674 }, { "epoch": 0.3538017120292582, "grad_norm": 56.0, "kl": 0.26462554931640625, "learning_rate": 5e-07, "logits/chosen": 4568033.333333333, "logits/rejected": -21064248.0, "logps/chosen": -300.601318359375, "logps/rejected": -215.69122314453125, "loss": 0.3273, "rewards/chosen": 0.5925281842549642, "rewards/margins": 1.9455093940099082, "rewards/rejected": -1.3529812097549438, "step": 6675 }, { "epoch": 0.35385471603106033, "grad_norm": 46.5, "kl": 0.8302001953125, "learning_rate": 5e-07, "logits/chosen": -7777652.0, "logits/rejected": -43741628.0, "logps/chosen": -105.26839447021484, "logps/rejected": -542.516357421875, "loss": 0.3992, "rewards/chosen": -0.1918233036994934, "rewards/margins": 1.71948903799057, "rewards/rejected": -1.9113123416900635, "step": 6676 }, { "epoch": 0.35390772003286247, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51735472.0, "logits/rejected": -33668152.0, "logps/chosen": -281.35467529296875, "logps/rejected": -431.9141031901042, "loss": 0.2595, "rewards/chosen": -0.47827664017677307, "rewards/margins": 2.1309683422247567, "rewards/rejected": -2.60924498240153, "step": 6677 }, { "epoch": 0.3539607240346646, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7563485.333333333, "logits/rejected": -37032742.4, "logps/chosen": -524.2955322265625, "logps/rejected": -315.42822265625, "loss": 0.2928, "rewards/chosen": 0.034275313218434654, "rewards/margins": 1.820578642686208, "rewards/rejected": -1.7863033294677735, "step": 6678 }, { "epoch": 0.35401372803646675, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65632308.0, "logits/rejected": -16511860.57142857, "logps/chosen": -167.4730987548828, "logps/rejected": -314.21962193080356, "loss": 0.1906, "rewards/chosen": -0.4542526304721832, "rewards/margins": 1.691912672349385, "rewards/rejected": -2.146165302821568, "step": 6679 }, { "epoch": 0.3540667320382689, "grad_norm": 58.75, "kl": 0.2691307067871094, "learning_rate": 5e-07, "logits/chosen": -28963386.666666668, "logits/rejected": -25259132.8, "logps/chosen": -296.10699462890625, "logps/rejected": -332.32587890625, "loss": 0.1916, "rewards/chosen": 0.5430939197540283, "rewards/margins": 3.194857454299927, "rewards/rejected": -2.6517635345458985, "step": 6680 }, { "epoch": 0.354119736040071, "grad_norm": 48.5, "kl": 0.3798789978027344, "learning_rate": 5e-07, "logits/chosen": -96595512.0, "logits/rejected": -38678284.0, "logps/chosen": -305.4434814453125, "logps/rejected": -312.1674499511719, "loss": 0.3122, "rewards/chosen": 0.051599107682704926, "rewards/margins": 2.2322572097182274, "rewards/rejected": -2.1806581020355225, "step": 6681 }, { "epoch": 0.35417274004187316, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57057237.333333336, "logits/rejected": -15965779.2, "logps/chosen": -129.55133056640625, "logps/rejected": -432.59169921875, "loss": 0.307, "rewards/chosen": -0.2243230144182841, "rewards/margins": 1.7796154697736102, "rewards/rejected": -2.0039384841918944, "step": 6682 }, { "epoch": 0.3542257440436753, "grad_norm": 44.25, "kl": 0.04045867919921875, "learning_rate": 5e-07, "logits/chosen": -9543899.2, "logits/rejected": -4452515.333333333, "logps/chosen": -125.1228759765625, "logps/rejected": -122.100830078125, "loss": 0.4127, "rewards/chosen": -0.04869734942913055, "rewards/margins": 1.1286294887463253, "rewards/rejected": -1.1773268381754558, "step": 6683 }, { "epoch": 0.35427874804547743, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39364016.0, "logits/rejected": -20127789.333333332, "logps/chosen": -191.99063110351562, "logps/rejected": -251.47774251302084, "loss": 0.2222, "rewards/chosen": -0.41882437467575073, "rewards/margins": 1.9882108171780906, "rewards/rejected": -2.4070351918538413, "step": 6684 }, { "epoch": 0.35433175204727957, "grad_norm": 35.25, "kl": 0.2676267623901367, "learning_rate": 5e-07, "logits/chosen": -19014532.0, "logits/rejected": -47568584.0, "logps/chosen": -572.095458984375, "logps/rejected": -460.6038818359375, "loss": 0.242, "rewards/chosen": 0.4721064567565918, "rewards/margins": 3.1872854232788086, "rewards/rejected": -2.715178966522217, "step": 6685 }, { "epoch": 0.3543847560490817, "grad_norm": 54.0, "kl": 1.3050994873046875, "learning_rate": 5e-07, "logits/chosen": -15752822.666666666, "logits/rejected": -33358838.4, "logps/chosen": -667.20849609375, "logps/rejected": -292.1445068359375, "loss": 0.2248, "rewards/chosen": 1.2658030986785889, "rewards/margins": 3.1067412853240968, "rewards/rejected": -1.840938186645508, "step": 6686 }, { "epoch": 0.35443776005088384, "grad_norm": 52.5, "kl": 2.839400291442871, "learning_rate": 5e-07, "logits/chosen": -62889530.666666664, "logits/rejected": -15614922.0, "logps/chosen": -448.3065999348958, "logps/rejected": -268.6833801269531, "loss": 0.3176, "rewards/chosen": 0.7444227536519369, "rewards/margins": 2.191688378651937, "rewards/rejected": -1.447265625, "step": 6687 }, { "epoch": 0.354490764052686, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 456753.3333333333, "logits/rejected": -21358534.4, "logps/chosen": -453.3803304036458, "logps/rejected": -306.1328369140625, "loss": 0.2077, "rewards/chosen": 0.2640676101048787, "rewards/margins": 3.048693044980367, "rewards/rejected": -2.784625434875488, "step": 6688 }, { "epoch": 0.3545437680544881, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -147797898.66666666, "logits/rejected": -28416684.8, "logps/chosen": -346.9652506510417, "logps/rejected": -157.153076171875, "loss": 0.2191, "rewards/chosen": 0.5872655709584554, "rewards/margins": 2.7003003915150963, "rewards/rejected": -2.1130348205566407, "step": 6689 }, { "epoch": 0.35459677205629025, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25117325.333333332, "logits/rejected": -18166086.4, "logps/chosen": -255.27433268229166, "logps/rejected": -468.1728515625, "loss": 0.2435, "rewards/chosen": 0.17003631591796875, "rewards/margins": 2.5872213363647463, "rewards/rejected": -2.4171850204467775, "step": 6690 }, { "epoch": 0.3546497760580924, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18333763.2, "logits/rejected": -32375888.0, "logps/chosen": -230.5693603515625, "logps/rejected": -263.384033203125, "loss": 0.2572, "rewards/chosen": 0.7378236770629882, "rewards/margins": 3.007641347249349, "rewards/rejected": -2.269817670186361, "step": 6691 }, { "epoch": 0.35470278005989453, "grad_norm": 39.25, "kl": 0.5269737243652344, "learning_rate": 5e-07, "logits/chosen": -23148114.0, "logits/rejected": -29815206.0, "logps/chosen": -197.44564819335938, "logps/rejected": -527.8428344726562, "loss": 0.2464, "rewards/chosen": 0.3918326497077942, "rewards/margins": 3.2615779042243958, "rewards/rejected": -2.8697452545166016, "step": 6692 }, { "epoch": 0.35475578406169667, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -111108128.0, "logits/rejected": -22088934.666666668, "logps/chosen": -459.84686279296875, "logps/rejected": -255.5326944986979, "loss": 0.309, "rewards/chosen": -0.17443999648094177, "rewards/margins": 1.2777022421360016, "rewards/rejected": -1.4521422386169434, "step": 6693 }, { "epoch": 0.3548087880634988, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 741858.0, "logits/rejected": -4772866.8, "logps/chosen": -125.57285563151042, "logps/rejected": -323.2227783203125, "loss": 0.397, "rewards/chosen": -0.32480953137079877, "rewards/margins": 1.019672699769338, "rewards/rejected": -1.3444822311401368, "step": 6694 }, { "epoch": 0.35486179206530094, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14784314.0, "logits/rejected": -28676949.333333332, "logps/chosen": -64.83536529541016, "logps/rejected": -171.27140299479166, "loss": 0.3302, "rewards/chosen": -0.2613140046596527, "rewards/margins": 1.0400820473829906, "rewards/rejected": -1.3013960520426433, "step": 6695 }, { "epoch": 0.3549147960671031, "grad_norm": 37.25, "kl": 0.5544757843017578, "learning_rate": 5e-07, "logits/chosen": 16094638.4, "logits/rejected": -68304256.0, "logps/chosen": -202.25712890625, "logps/rejected": -236.23274739583334, "loss": 0.29, "rewards/chosen": 0.33278589248657225, "rewards/margins": 4.124404875437419, "rewards/rejected": -3.791618982950846, "step": 6696 }, { "epoch": 0.3549678000689052, "grad_norm": 28.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2967961.5, "logits/rejected": -72690825.14285715, "logps/chosen": -52.023658752441406, "logps/rejected": -278.13527134486606, "loss": 0.2236, "rewards/chosen": -1.3945716619491577, "rewards/margins": 0.8032396009990146, "rewards/rejected": -2.1978112629481723, "step": 6697 }, { "epoch": 0.35502080407070735, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1920514.6, "logits/rejected": -2346141.3333333335, "logps/chosen": -125.702978515625, "logps/rejected": -292.6710611979167, "loss": 0.3568, "rewards/chosen": 0.2962423086166382, "rewards/margins": 1.7319478114446003, "rewards/rejected": -1.4357055028279622, "step": 6698 }, { "epoch": 0.3550738080725095, "grad_norm": 52.0, "kl": 0.5011444091796875, "learning_rate": 5e-07, "logits/chosen": -67548120.0, "logits/rejected": -19699958.0, "logps/chosen": -421.4419860839844, "logps/rejected": -262.4290771484375, "loss": 0.2951, "rewards/chosen": 0.2446647584438324, "rewards/margins": 2.483551114797592, "rewards/rejected": -2.2388863563537598, "step": 6699 }, { "epoch": 0.3551268120743116, "grad_norm": 51.75, "kl": 0.5890464782714844, "learning_rate": 5e-07, "logits/chosen": -35306240.0, "logits/rejected": -26201749.333333332, "logps/chosen": -303.482177734375, "logps/rejected": -272.96262613932294, "loss": 0.284, "rewards/chosen": 0.7476742744445801, "rewards/margins": 2.2897205034891766, "rewards/rejected": -1.5420462290445964, "step": 6700 }, { "epoch": 0.35517981607611376, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25915472.0, "logits/rejected": -53596021.333333336, "logps/chosen": -163.0906982421875, "logps/rejected": -387.5729166666667, "loss": 0.2658, "rewards/chosen": -0.13278932869434357, "rewards/margins": 1.6437790840864182, "rewards/rejected": -1.7765684127807617, "step": 6701 }, { "epoch": 0.3552328200779159, "grad_norm": 50.5, "kl": 0.6738815307617188, "learning_rate": 5e-07, "logits/chosen": -43470532.0, "logits/rejected": -1872863.25, "logps/chosen": -445.8834533691406, "logps/rejected": -98.36146545410156, "loss": 0.329, "rewards/chosen": 0.7808067798614502, "rewards/margins": 1.9510183334350586, "rewards/rejected": -1.1702115535736084, "step": 6702 }, { "epoch": 0.35528582407971804, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 701319.8333333334, "logits/rejected": -29977104.0, "logps/chosen": -247.16105143229166, "logps/rejected": -239.7499267578125, "loss": 0.2365, "rewards/chosen": 0.6881345907847086, "rewards/margins": 2.7693262259165445, "rewards/rejected": -2.081191635131836, "step": 6703 }, { "epoch": 0.3553388280815202, "grad_norm": 57.5, "kl": 0.5280723571777344, "learning_rate": 5e-07, "logits/chosen": -11569428.0, "logits/rejected": -15826600.0, "logps/chosen": -520.6536254882812, "logps/rejected": -223.8436737060547, "loss": 0.2203, "rewards/chosen": 1.1268348693847656, "rewards/margins": 2.784850835800171, "rewards/rejected": -1.6580159664154053, "step": 6704 }, { "epoch": 0.3553918320833223, "grad_norm": 69.0, "kl": 0.08637237548828125, "learning_rate": 5e-07, "logits/chosen": -40699046.4, "logits/rejected": 8239600.0, "logps/chosen": -299.86220703125, "logps/rejected": -343.2041015625, "loss": 0.4384, "rewards/chosen": -0.28601105213165284, "rewards/margins": 1.2792447487513225, "rewards/rejected": -1.5652558008829753, "step": 6705 }, { "epoch": 0.35544483608512445, "grad_norm": 48.75, "kl": 0.07806587219238281, "learning_rate": 5e-07, "logits/chosen": -46834592.0, "logits/rejected": -27656921.6, "logps/chosen": -272.7626953125, "logps/rejected": -384.4631591796875, "loss": 0.312, "rewards/chosen": -0.1416473388671875, "rewards/margins": 1.7597929000854493, "rewards/rejected": -1.9014402389526368, "step": 6706 }, { "epoch": 0.3554978400869266, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25709002.0, "logits/rejected": -18129922.0, "logps/chosen": -363.0929260253906, "logps/rejected": -233.93475341796875, "loss": 0.2974, "rewards/chosen": 0.3736729025840759, "rewards/margins": 2.200310170650482, "rewards/rejected": -1.8266372680664062, "step": 6707 }, { "epoch": 0.3555508440887287, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45773610.666666664, "logits/rejected": -208377.6, "logps/chosen": -453.9982096354167, "logps/rejected": -140.36888427734374, "loss": 0.2985, "rewards/chosen": -0.01439616084098816, "rewards/margins": 1.7846251785755158, "rewards/rejected": -1.799021339416504, "step": 6708 }, { "epoch": 0.35560384809053086, "grad_norm": 44.75, "kl": 0.06336212158203125, "learning_rate": 5e-07, "logits/chosen": -13195966.0, "logits/rejected": -59265336.0, "logps/chosen": -241.46702575683594, "logps/rejected": -253.912841796875, "loss": 0.2816, "rewards/chosen": 0.2769942283630371, "rewards/margins": 2.799872398376465, "rewards/rejected": -2.5228781700134277, "step": 6709 }, { "epoch": 0.355656852092333, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63441456.0, "logits/rejected": -4070233.0, "logps/chosen": -318.44561767578125, "logps/rejected": -376.2643127441406, "loss": 0.2849, "rewards/chosen": 0.5485010743141174, "rewards/margins": 2.163968503475189, "rewards/rejected": -1.6154674291610718, "step": 6710 }, { "epoch": 0.3557098560941351, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54581280.0, "logits/rejected": -23348149.333333332, "logps/chosen": -816.03115234375, "logps/rejected": -429.4095865885417, "loss": 0.3311, "rewards/chosen": 0.49616708755493166, "rewards/margins": 3.7262778917948403, "rewards/rejected": -3.2301108042399087, "step": 6711 }, { "epoch": 0.3557628600959372, "grad_norm": 49.25, "kl": 0.6461296081542969, "learning_rate": 5e-07, "logits/chosen": -33377916.0, "logits/rejected": -15642324.57142857, "logps/chosen": -411.9084777832031, "logps/rejected": -253.00003487723214, "loss": 0.2722, "rewards/chosen": -0.0890350341796875, "rewards/margins": 1.4130281720842635, "rewards/rejected": -1.502063206263951, "step": 6712 }, { "epoch": 0.35581586409773935, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28323253.333333332, "logits/rejected": -56351924.0, "logps/chosen": -310.0699055989583, "logps/rejected": -355.371826171875, "loss": 0.4009, "rewards/chosen": -0.026103074351946514, "rewards/margins": 2.5835441996653876, "rewards/rejected": -2.609647274017334, "step": 6713 }, { "epoch": 0.3558688680995415, "grad_norm": 51.0, "kl": 0.14794158935546875, "learning_rate": 5e-07, "logits/chosen": -47170101.333333336, "logits/rejected": -55265000.0, "logps/chosen": -344.8891194661458, "logps/rejected": -405.83221435546875, "loss": 0.4006, "rewards/chosen": 0.004671613375345866, "rewards/margins": 2.929801265398661, "rewards/rejected": -2.9251296520233154, "step": 6714 }, { "epoch": 0.35592187210134363, "grad_norm": 87.5, "kl": 1.250986099243164, "learning_rate": 5e-07, "logits/chosen": -17508784.0, "logits/rejected": -30542989.333333332, "logps/chosen": -958.52578125, "logps/rejected": -116.23356119791667, "loss": 0.2391, "rewards/chosen": 1.6195034027099608, "rewards/margins": 3.3073397318522133, "rewards/rejected": -1.6878363291422527, "step": 6715 }, { "epoch": 0.35597487610314577, "grad_norm": 49.0, "kl": 1.8174114227294922, "learning_rate": 5e-07, "logits/chosen": -31401797.333333332, "logits/rejected": 26433152.0, "logps/chosen": -273.18857828776044, "logps/rejected": -498.622314453125, "loss": 0.4231, "rewards/chosen": 0.3042936722437541, "rewards/margins": 2.4110365311304727, "rewards/rejected": -2.1067428588867188, "step": 6716 }, { "epoch": 0.3560278801049479, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49017152.0, "logits/rejected": -31529683.2, "logps/chosen": -322.22426350911456, "logps/rejected": -409.4517822265625, "loss": 0.2492, "rewards/chosen": 0.22016721963882446, "rewards/margins": 2.4641468644142153, "rewards/rejected": -2.243979644775391, "step": 6717 }, { "epoch": 0.35608088410675004, "grad_norm": 43.75, "kl": 0.623866081237793, "learning_rate": 5e-07, "logits/chosen": -12874321.333333334, "logits/rejected": -25518078.0, "logps/chosen": -258.66473388671875, "logps/rejected": -325.6063232421875, "loss": 0.3306, "rewards/chosen": 0.5558406909306844, "rewards/margins": 2.2058455546696982, "rewards/rejected": -1.6500048637390137, "step": 6718 }, { "epoch": 0.3561338881085522, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33990848.0, "logits/rejected": 13311733.333333334, "logps/chosen": -331.579736328125, "logps/rejected": -393.8757731119792, "loss": 0.3689, "rewards/chosen": 0.08247268199920654, "rewards/margins": 1.745803713798523, "rewards/rejected": -1.6633310317993164, "step": 6719 }, { "epoch": 0.3561868921103543, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4876100.0, "logits/rejected": 390926.85714285716, "logps/chosen": -377.3226318359375, "logps/rejected": -145.8739013671875, "loss": 0.2357, "rewards/chosen": 1.8359222412109375, "rewards/margins": 3.469612802777972, "rewards/rejected": -1.633690561567034, "step": 6720 }, { "epoch": 0.35623989611215645, "grad_norm": 59.75, "kl": 1.115509033203125, "learning_rate": 5e-07, "logits/chosen": -21052962.0, "logits/rejected": -30588058.666666668, "logps/chosen": -177.18380737304688, "logps/rejected": -330.740478515625, "loss": 0.3676, "rewards/chosen": -0.6319765448570251, "rewards/margins": 1.355932931105296, "rewards/rejected": -1.987909475962321, "step": 6721 }, { "epoch": 0.3562929001139586, "grad_norm": 62.5, "kl": 0.2913494110107422, "learning_rate": 5e-07, "logits/chosen": -45322970.666666664, "logits/rejected": -8616298.0, "logps/chosen": -379.9635823567708, "logps/rejected": -115.9107666015625, "loss": 0.3427, "rewards/chosen": 0.39540640513102215, "rewards/margins": 2.445046345392863, "rewards/rejected": -2.049639940261841, "step": 6722 }, { "epoch": 0.3563459041157607, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87254233.6, "logits/rejected": -36118144.0, "logps/chosen": -399.7011474609375, "logps/rejected": -332.56239827473956, "loss": 0.4576, "rewards/chosen": -0.6898214817047119, "rewards/margins": 1.402426322301229, "rewards/rejected": -2.092247804005941, "step": 6723 }, { "epoch": 0.35639890811756286, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26577988.0, "logits/rejected": -21931482.666666668, "logps/chosen": -260.3599853515625, "logps/rejected": -216.56978352864584, "loss": 0.2925, "rewards/chosen": 0.05613727122545242, "rewards/margins": 1.4975525811314583, "rewards/rejected": -1.4414153099060059, "step": 6724 }, { "epoch": 0.356451912119365, "grad_norm": 55.5, "kl": 0.05721282958984375, "learning_rate": 5e-07, "logits/chosen": 37316160.0, "logits/rejected": -31051660.0, "logps/chosen": -458.3052978515625, "logps/rejected": -481.60198974609375, "loss": 0.2798, "rewards/chosen": 0.032287418842315674, "rewards/margins": 3.1835933327674866, "rewards/rejected": -3.151305913925171, "step": 6725 }, { "epoch": 0.35650491612116714, "grad_norm": 55.75, "kl": 0.11050033569335938, "learning_rate": 5e-07, "logits/chosen": -27750096.0, "logits/rejected": -473729.625, "logps/chosen": -414.3575439453125, "logps/rejected": -159.236083984375, "loss": 0.251, "rewards/chosen": 0.7139655947685242, "rewards/margins": 2.4780368208885193, "rewards/rejected": -1.7640712261199951, "step": 6726 }, { "epoch": 0.3565579201229693, "grad_norm": 55.0, "kl": 0.3435478210449219, "learning_rate": 5e-07, "logits/chosen": -65385200.0, "logits/rejected": -14484308.0, "logps/chosen": -401.8741455078125, "logps/rejected": -343.69561767578125, "loss": 0.2962, "rewards/chosen": 0.6104686856269836, "rewards/margins": 2.147311270236969, "rewards/rejected": -1.5368425846099854, "step": 6727 }, { "epoch": 0.3566109241247714, "grad_norm": 47.0, "kl": 1.5084285736083984, "learning_rate": 5e-07, "logits/chosen": -3065365.3333333335, "logits/rejected": 8307025.0, "logps/chosen": -198.98628743489584, "logps/rejected": -291.3764953613281, "loss": 0.3641, "rewards/chosen": 0.39898252487182617, "rewards/margins": 2.355640411376953, "rewards/rejected": -1.956657886505127, "step": 6728 }, { "epoch": 0.35666392812657355, "grad_norm": 59.5, "kl": 0.29129791259765625, "learning_rate": 5e-07, "logits/chosen": -2640292.5, "logits/rejected": -32386184.0, "logps/chosen": -271.79248046875, "logps/rejected": -220.66873168945312, "loss": 0.2787, "rewards/chosen": 0.6062479615211487, "rewards/margins": 2.2038115859031677, "rewards/rejected": -1.597563624382019, "step": 6729 }, { "epoch": 0.3567169321283757, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17896985.6, "logits/rejected": 5119425.0, "logps/chosen": -298.718017578125, "logps/rejected": -115.0947774251302, "loss": 0.3214, "rewards/chosen": 0.21477890014648438, "rewards/margins": 2.7487966219584146, "rewards/rejected": -2.53401772181193, "step": 6730 }, { "epoch": 0.3567699361301778, "grad_norm": 81.0, "kl": 0.2971687316894531, "learning_rate": 5e-07, "logits/chosen": -34666533.333333336, "logits/rejected": -6382654.0, "logps/chosen": -414.08935546875, "logps/rejected": -76.69527435302734, "loss": 0.4074, "rewards/chosen": 0.052487184604008995, "rewards/margins": 1.7498094340165455, "rewards/rejected": -1.6973222494125366, "step": 6731 }, { "epoch": 0.35682294013197996, "grad_norm": 61.5, "kl": 2.0731582641601562, "learning_rate": 5e-07, "logits/chosen": -58837657.6, "logits/rejected": -67687429.33333333, "logps/chosen": -329.4935791015625, "logps/rejected": -473.556640625, "loss": 0.3201, "rewards/chosen": 0.6365750312805176, "rewards/margins": 2.409960397084554, "rewards/rejected": -1.7733853658040364, "step": 6732 }, { "epoch": 0.3568759441337821, "grad_norm": 45.75, "kl": 0.5042362213134766, "learning_rate": 5e-07, "logits/chosen": -18306398.4, "logits/rejected": -31273818.666666668, "logps/chosen": -417.7845703125, "logps/rejected": -184.0770467122396, "loss": 0.2847, "rewards/chosen": 0.711094331741333, "rewards/margins": 2.395651133855184, "rewards/rejected": -1.6845568021138508, "step": 6733 }, { "epoch": 0.35692894813558423, "grad_norm": 41.5, "kl": 0.10825157165527344, "learning_rate": 5e-07, "logits/chosen": -12938230.4, "logits/rejected": -1039705.0833333334, "logps/chosen": -198.68538818359374, "logps/rejected": -72.4336446126302, "loss": 0.3767, "rewards/chosen": 0.09948906898498536, "rewards/margins": 1.8208817005157472, "rewards/rejected": -1.7213926315307617, "step": 6734 }, { "epoch": 0.35698195213738637, "grad_norm": 55.5, "kl": 0.3527870178222656, "learning_rate": 5e-07, "logits/chosen": -97672053.33333333, "logits/rejected": -18889900.8, "logps/chosen": -241.80904134114584, "logps/rejected": -363.74404296875, "loss": 0.2672, "rewards/chosen": 0.03294320901234945, "rewards/margins": 2.261513384183248, "rewards/rejected": -2.2285701751708986, "step": 6735 }, { "epoch": 0.3570349561391885, "grad_norm": 43.25, "kl": 0.9656772613525391, "learning_rate": 5e-07, "logits/chosen": -36757491.2, "logits/rejected": -5374202.0, "logps/chosen": -271.6465087890625, "logps/rejected": -202.0194295247396, "loss": 0.3985, "rewards/chosen": -0.0917548418045044, "rewards/margins": 1.544750126202901, "rewards/rejected": -1.6365049680074055, "step": 6736 }, { "epoch": 0.35708796014099065, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2164973.75, "logits/rejected": -27793693.333333332, "logps/chosen": -69.39593505859375, "logps/rejected": -427.2733561197917, "loss": 0.1885, "rewards/chosen": 0.8613666296005249, "rewards/margins": 3.1017361084620156, "rewards/rejected": -2.2403694788614907, "step": 6737 }, { "epoch": 0.3571409641427928, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29777605.333333332, "logits/rejected": -37873156.0, "logps/chosen": -424.1102701822917, "logps/rejected": -494.07647705078125, "loss": 0.3269, "rewards/chosen": 0.37590229511260986, "rewards/margins": 3.1347028017044067, "rewards/rejected": -2.758800506591797, "step": 6738 }, { "epoch": 0.3571939681445949, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18765060.8, "logits/rejected": -14685173.333333334, "logps/chosen": -321.5136474609375, "logps/rejected": -194.30021158854166, "loss": 0.2919, "rewards/chosen": 0.42839899063110354, "rewards/margins": 2.6969352404276528, "rewards/rejected": -2.2685362497965493, "step": 6739 }, { "epoch": 0.35724697214639706, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2153479.0, "logits/rejected": -2496096.8, "logps/chosen": -167.62945556640625, "logps/rejected": -246.402490234375, "loss": 0.4561, "rewards/chosen": -0.31321195761362713, "rewards/margins": 0.6117856582005818, "rewards/rejected": -0.924997615814209, "step": 6740 }, { "epoch": 0.3572999761481992, "grad_norm": 52.25, "kl": 1.1611785888671875, "learning_rate": 5e-07, "logits/chosen": -49333946.666666664, "logits/rejected": 7227371.2, "logps/chosen": -228.17138671875, "logps/rejected": -202.0635986328125, "loss": 0.325, "rewards/chosen": 0.158142218987147, "rewards/margins": 1.4305079797903697, "rewards/rejected": -1.2723657608032226, "step": 6741 }, { "epoch": 0.35735298015000133, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6200078.0, "logits/rejected": -19013858.0, "logps/chosen": -273.83734130859375, "logps/rejected": -169.9325408935547, "loss": 0.2567, "rewards/chosen": 0.5446819067001343, "rewards/margins": 2.504339814186096, "rewards/rejected": -1.959657907485962, "step": 6742 }, { "epoch": 0.35740598415180347, "grad_norm": 48.75, "kl": 0.09325790405273438, "learning_rate": 5e-07, "logits/chosen": -12961985.333333334, "logits/rejected": -21257816.0, "logps/chosen": -262.6217854817708, "logps/rejected": -89.25975036621094, "loss": 0.3779, "rewards/chosen": 0.1755530039469401, "rewards/margins": 2.4954439798990884, "rewards/rejected": -2.3198909759521484, "step": 6743 }, { "epoch": 0.3574589881536056, "grad_norm": 81.5, "kl": 0.19562530517578125, "learning_rate": 5e-07, "logits/chosen": -38724122.666666664, "logits/rejected": -18289890.0, "logps/chosen": -167.06465657552084, "logps/rejected": -213.7147216796875, "loss": 0.4487, "rewards/chosen": -0.11718101302782695, "rewards/margins": 1.3523891468842824, "rewards/rejected": -1.4695701599121094, "step": 6744 }, { "epoch": 0.35751199215540774, "grad_norm": 57.25, "kl": 0.9997215270996094, "learning_rate": 5e-07, "logits/chosen": -9443097.6, "logits/rejected": -25254026.666666668, "logps/chosen": -120.5575439453125, "logps/rejected": -225.5216064453125, "loss": 0.4259, "rewards/chosen": -0.2592058897018433, "rewards/margins": 1.491894809405009, "rewards/rejected": -1.7511006991068523, "step": 6745 }, { "epoch": 0.3575649961572099, "grad_norm": 48.5, "kl": 0.13823699951171875, "learning_rate": 5e-07, "logits/chosen": -31260736.0, "logits/rejected": -26264042.666666668, "logps/chosen": -384.49482421875, "logps/rejected": -433.7993570963542, "loss": 0.2997, "rewards/chosen": 0.33366239070892334, "rewards/margins": 2.711056431134542, "rewards/rejected": -2.3773940404256186, "step": 6746 }, { "epoch": 0.357618000159012, "grad_norm": 53.5, "kl": 0.11841583251953125, "learning_rate": 5e-07, "logits/chosen": -53048648.0, "logits/rejected": -44706696.0, "logps/chosen": -391.08856201171875, "logps/rejected": -455.7274475097656, "loss": 0.2887, "rewards/chosen": 0.04205970838665962, "rewards/margins": 2.5041990764439106, "rewards/rejected": -2.462139368057251, "step": 6747 }, { "epoch": 0.35767100416081415, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15135855.0, "logits/rejected": -9237815.0, "logps/chosen": -499.28997802734375, "logps/rejected": -224.17982482910156, "loss": 0.2896, "rewards/chosen": 0.4492642283439636, "rewards/margins": 2.0192471146583557, "rewards/rejected": -1.569982886314392, "step": 6748 }, { "epoch": 0.3577240081626163, "grad_norm": 40.5, "kl": 1.6242027282714844, "learning_rate": 5e-07, "logits/chosen": -11684645.0, "logits/rejected": -11304796.0, "logps/chosen": -245.48460388183594, "logps/rejected": -373.34979248046875, "loss": 0.2529, "rewards/chosen": 0.4620501399040222, "rewards/margins": 2.8229803442955017, "rewards/rejected": -2.3609302043914795, "step": 6749 }, { "epoch": 0.35777701216441843, "grad_norm": 39.75, "kl": 1.1846904754638672, "learning_rate": 5e-07, "logits/chosen": -50102664.0, "logits/rejected": -28775076.0, "logps/chosen": -570.9816284179688, "logps/rejected": -218.35482788085938, "loss": 0.2648, "rewards/chosen": 0.9099167585372925, "rewards/margins": 2.9500426054000854, "rewards/rejected": -2.040125846862793, "step": 6750 }, { "epoch": 0.35783001616622057, "grad_norm": 27.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 225629.125, "logits/rejected": -11454701.333333334, "logps/chosen": -49.177406311035156, "logps/rejected": -237.1099853515625, "loss": 0.2518, "rewards/chosen": -0.5192640423774719, "rewards/margins": 1.6954441269238791, "rewards/rejected": -2.214708169301351, "step": 6751 }, { "epoch": 0.3578830201680227, "grad_norm": 45.25, "kl": 0.21442413330078125, "learning_rate": 5e-07, "logits/chosen": -22287336.0, "logits/rejected": -18596505.333333332, "logps/chosen": -318.746142578125, "logps/rejected": -200.4259236653646, "loss": 0.301, "rewards/chosen": 0.6221735000610351, "rewards/margins": 2.5468058904012043, "rewards/rejected": -1.9246323903401692, "step": 6752 }, { "epoch": 0.35793602416982484, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23158413.333333332, "logits/rejected": -39653.0, "logps/chosen": -192.5596923828125, "logps/rejected": -83.37060546875, "loss": 0.4614, "rewards/chosen": -0.0011615753173828125, "rewards/margins": 0.6941380500793457, "rewards/rejected": -0.6952996253967285, "step": 6753 }, { "epoch": 0.357989028171627, "grad_norm": 36.75, "kl": 0.42208385467529297, "learning_rate": 5e-07, "logits/chosen": -29110592.0, "logits/rejected": -17631312.0, "logps/chosen": -309.6823974609375, "logps/rejected": -258.76361083984375, "loss": 0.3333, "rewards/chosen": 0.30944013595581055, "rewards/margins": 2.811292807261149, "rewards/rejected": -2.5018526713053384, "step": 6754 }, { "epoch": 0.3580420321734291, "grad_norm": 42.75, "kl": 0.7754325866699219, "learning_rate": 5e-07, "logits/chosen": -17431712.0, "logits/rejected": 2216444.0, "logps/chosen": -126.91606140136719, "logps/rejected": -305.89013671875, "loss": 0.2605, "rewards/chosen": 0.42767271399497986, "rewards/margins": 2.720357745885849, "rewards/rejected": -2.292685031890869, "step": 6755 }, { "epoch": 0.35809503617523125, "grad_norm": 62.25, "kl": 1.3049802780151367, "learning_rate": 5e-07, "logits/chosen": 32111358.0, "logits/rejected": -9185074.666666666, "logps/chosen": -1466.465576171875, "logps/rejected": -364.5643717447917, "loss": 0.2099, "rewards/chosen": 0.7151573300361633, "rewards/margins": 2.4386884172757464, "rewards/rejected": -1.7235310872395833, "step": 6756 }, { "epoch": 0.3581480401770334, "grad_norm": 41.5, "kl": 0.14299488067626953, "learning_rate": 5e-07, "logits/chosen": -29732618.666666668, "logits/rejected": -19872048.0, "logps/chosen": -157.40494791666666, "logps/rejected": -518.1799926757812, "loss": 0.323, "rewards/chosen": 0.412197748819987, "rewards/margins": 3.5193375746409097, "rewards/rejected": -3.107139825820923, "step": 6757 }, { "epoch": 0.3582010441788355, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3136140.0, "logits/rejected": 45159104.0, "logps/chosen": -679.445361328125, "logps/rejected": -328.3655192057292, "loss": 0.3574, "rewards/chosen": 0.6523069381713867, "rewards/margins": 1.6333313624064127, "rewards/rejected": -0.981024424235026, "step": 6758 }, { "epoch": 0.35825404818063766, "grad_norm": 43.0, "kl": 1.9612846374511719, "learning_rate": 5e-07, "logits/chosen": -6491526.5, "logits/rejected": -16833714.285714287, "logps/chosen": -103.2003173828125, "logps/rejected": -344.1616908482143, "loss": 0.2241, "rewards/chosen": -0.2970077693462372, "rewards/margins": 1.7285398031984056, "rewards/rejected": -2.025547572544643, "step": 6759 }, { "epoch": 0.3583070521824398, "grad_norm": 62.75, "kl": 1.7762222290039062, "learning_rate": 5e-07, "logits/chosen": -18749244.0, "logits/rejected": -31238080.0, "logps/chosen": -426.8805338541667, "logps/rejected": -151.2528839111328, "loss": 0.3244, "rewards/chosen": 0.807222048441569, "rewards/margins": 1.8194398085276284, "rewards/rejected": -1.0122177600860596, "step": 6760 }, { "epoch": 0.35836005618424194, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53916480.0, "logits/rejected": -32268496.0, "logps/chosen": -355.1121826171875, "logps/rejected": -413.3250732421875, "loss": 0.2819, "rewards/chosen": 0.2823413610458374, "rewards/margins": 3.0043927431106567, "rewards/rejected": -2.7220513820648193, "step": 6761 }, { "epoch": 0.358413060186044, "grad_norm": 42.75, "kl": 0.8339805603027344, "learning_rate": 5e-07, "logits/chosen": -29109282.0, "logits/rejected": -32840396.0, "logps/chosen": -312.28338623046875, "logps/rejected": -340.884521484375, "loss": 0.2048, "rewards/chosen": 0.692068874835968, "rewards/margins": 3.705024778842926, "rewards/rejected": -3.012955904006958, "step": 6762 }, { "epoch": 0.35846606418784616, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54836576.0, "logits/rejected": -36893156.571428575, "logps/chosen": -613.6544189453125, "logps/rejected": -540.8855329241071, "loss": 0.1567, "rewards/chosen": -0.03497314453125, "rewards/margins": 2.807581765311105, "rewards/rejected": -2.842554909842355, "step": 6763 }, { "epoch": 0.3585190681896483, "grad_norm": 55.0, "kl": 0.27725982666015625, "learning_rate": 5e-07, "logits/chosen": -45034393.6, "logits/rejected": -11792088.0, "logps/chosen": -428.075390625, "logps/rejected": -296.2452799479167, "loss": 0.3153, "rewards/chosen": 0.3704937696456909, "rewards/margins": 2.490674535433451, "rewards/rejected": -2.1201807657877603, "step": 6764 }, { "epoch": 0.35857207219145043, "grad_norm": 83.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27382528.0, "logits/rejected": -43394520.0, "logps/chosen": -328.8663330078125, "logps/rejected": -444.6444091796875, "loss": 0.2934, "rewards/chosen": 0.6094099283218384, "rewards/margins": 2.3542743921279907, "rewards/rejected": -1.7448644638061523, "step": 6765 }, { "epoch": 0.35862507619325257, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29827817.6, "logits/rejected": -54699312.0, "logps/chosen": -359.6081787109375, "logps/rejected": -508.9225260416667, "loss": 0.3142, "rewards/chosen": 0.1873164176940918, "rewards/margins": 2.934260527292887, "rewards/rejected": -2.7469441095987954, "step": 6766 }, { "epoch": 0.3586780801950547, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25305528.0, "logits/rejected": -82392117.33333333, "logps/chosen": -605.316650390625, "logps/rejected": -323.23883056640625, "loss": 0.24, "rewards/chosen": -0.09050597995519638, "rewards/margins": 1.9736248031258583, "rewards/rejected": -2.0641307830810547, "step": 6767 }, { "epoch": 0.35873108419685684, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2088434.0, "logits/rejected": 4827452.0, "logps/chosen": -127.79338073730469, "logps/rejected": -286.7629699707031, "loss": 0.3373, "rewards/chosen": 0.12643027305603027, "rewards/margins": 1.627335786819458, "rewards/rejected": -1.5009055137634277, "step": 6768 }, { "epoch": 0.358784088198659, "grad_norm": 45.75, "kl": 0.39776134490966797, "learning_rate": 5e-07, "logits/chosen": -22496836.57142857, "logits/rejected": -72031664.0, "logps/chosen": -127.97771344866071, "logps/rejected": -767.634033203125, "loss": 0.4024, "rewards/chosen": 0.3175129549843924, "rewards/margins": 2.119691933904375, "rewards/rejected": -1.802178978919983, "step": 6769 }, { "epoch": 0.3588370922004611, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1560789.375, "logits/rejected": -44627578.666666664, "logps/chosen": -69.22044372558594, "logps/rejected": -386.9532063802083, "loss": 0.2684, "rewards/chosen": -0.3323618173599243, "rewards/margins": 1.428715745608012, "rewards/rejected": -1.7610775629679363, "step": 6770 }, { "epoch": 0.35889009620226325, "grad_norm": 53.25, "kl": 0.00783538818359375, "learning_rate": 5e-07, "logits/chosen": -13826745.0, "logits/rejected": -22635916.0, "logps/chosen": -362.20599365234375, "logps/rejected": -501.2044372558594, "loss": 0.3175, "rewards/chosen": 0.13581961393356323, "rewards/margins": 2.2346836924552917, "rewards/rejected": -2.0988640785217285, "step": 6771 }, { "epoch": 0.3589431002040654, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52986120.0, "logits/rejected": 84839208.0, "logps/chosen": -222.25572204589844, "logps/rejected": -80.10281372070312, "loss": 0.3956, "rewards/chosen": 0.09360437095165253, "rewards/margins": 1.0859387964010239, "rewards/rejected": -0.9923344254493713, "step": 6772 }, { "epoch": 0.35899610420586753, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 42127260.0, "logits/rejected": -71025424.0, "logps/chosen": -1041.40673828125, "logps/rejected": -539.9393920898438, "loss": 0.268, "rewards/chosen": 0.2967330813407898, "rewards/margins": 2.877191722393036, "rewards/rejected": -2.580458641052246, "step": 6773 }, { "epoch": 0.35904910820766966, "grad_norm": 65.5, "kl": 1.9990291595458984, "learning_rate": 5e-07, "logits/chosen": -22141986.666666668, "logits/rejected": -41786804.0, "logps/chosen": -342.2787272135417, "logps/rejected": -160.2270050048828, "loss": 0.382, "rewards/chosen": 0.40533284346262616, "rewards/margins": 1.9646041790644329, "rewards/rejected": -1.5592713356018066, "step": 6774 }, { "epoch": 0.3591021122094718, "grad_norm": 57.5, "kl": 1.8448619842529297, "learning_rate": 5e-07, "logits/chosen": -19494256.0, "logits/rejected": 1946582.75, "logps/chosen": -341.308349609375, "logps/rejected": -23.12042808532715, "loss": 0.4515, "rewards/chosen": 0.41800682885306223, "rewards/margins": 0.550150609442166, "rewards/rejected": -0.1321437805891037, "step": 6775 }, { "epoch": 0.35915511621127394, "grad_norm": 24.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4112762.0, "logits/rejected": -21970498.285714287, "logps/chosen": -62.254783630371094, "logps/rejected": -299.0099400111607, "loss": 0.1526, "rewards/chosen": -0.512499988079071, "rewards/margins": 2.2741859555244446, "rewards/rejected": -2.7866859436035156, "step": 6776 }, { "epoch": 0.3592081202130761, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31837177.6, "logits/rejected": 36187186.666666664, "logps/chosen": -240.56796875, "logps/rejected": -546.4760335286459, "loss": 0.332, "rewards/chosen": 0.19975197315216064, "rewards/margins": 3.0975056886672974, "rewards/rejected": -2.8977537155151367, "step": 6777 }, { "epoch": 0.3592611242148782, "grad_norm": 199.0, "kl": 0.5820198059082031, "learning_rate": 5e-07, "logits/chosen": -49218661.333333336, "logits/rejected": -31979542.4, "logps/chosen": -612.579833984375, "logps/rejected": -433.212255859375, "loss": 0.1814, "rewards/chosen": 0.8021362622578939, "rewards/margins": 3.542515023549398, "rewards/rejected": -2.740378761291504, "step": 6778 }, { "epoch": 0.35931412821668035, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12066996.0, "logits/rejected": -34211696.0, "logps/chosen": -263.84613037109375, "logps/rejected": -252.37782287597656, "loss": 0.2955, "rewards/chosen": 0.4477244019508362, "rewards/margins": 2.3729477524757385, "rewards/rejected": -1.9252233505249023, "step": 6779 }, { "epoch": 0.3593671322184825, "grad_norm": 47.5, "kl": 0.6139297485351562, "learning_rate": 5e-07, "logits/chosen": -9985386.0, "logits/rejected": -28970006.4, "logps/chosen": -107.8298848470052, "logps/rejected": -257.5873779296875, "loss": 0.2521, "rewards/chosen": 0.5301397641499838, "rewards/margins": 2.6640167554219567, "rewards/rejected": -2.133876991271973, "step": 6780 }, { "epoch": 0.3594201362202846, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15534793.333333334, "logits/rejected": -19823379.2, "logps/chosen": -226.05853271484375, "logps/rejected": -335.2169677734375, "loss": 0.3067, "rewards/chosen": 0.23607152700424194, "rewards/margins": 1.7001708388328551, "rewards/rejected": -1.4640993118286132, "step": 6781 }, { "epoch": 0.35947314022208676, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35678740.0, "logits/rejected": -75380784.0, "logps/chosen": -399.32745361328125, "logps/rejected": -400.41864013671875, "loss": 0.3557, "rewards/chosen": 0.1051204651594162, "rewards/margins": 1.6620427817106247, "rewards/rejected": -1.5569223165512085, "step": 6782 }, { "epoch": 0.3595261442238889, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44753888.0, "logits/rejected": -41054304.0, "logps/chosen": -360.56806640625, "logps/rejected": -309.1352945963542, "loss": 0.3042, "rewards/chosen": 0.42128696441650393, "rewards/margins": 2.3103121757507323, "rewards/rejected": -1.8890252113342285, "step": 6783 }, { "epoch": 0.35957914822569104, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23970668.0, "logits/rejected": -16399996.0, "logps/chosen": -363.8438415527344, "logps/rejected": -355.4728190104167, "loss": 0.3341, "rewards/chosen": -0.721722424030304, "rewards/margins": 0.574216862519582, "rewards/rejected": -1.295939286549886, "step": 6784 }, { "epoch": 0.3596321522274932, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5598952.0, "logits/rejected": -43233416.0, "logps/chosen": -263.20849609375, "logps/rejected": -343.48602294921875, "loss": 0.2795, "rewards/chosen": 0.2842281460762024, "rewards/margins": 2.4130225777626038, "rewards/rejected": -2.1287944316864014, "step": 6785 }, { "epoch": 0.3596851562292953, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4385991.0, "logits/rejected": -16475278.666666666, "logps/chosen": -48.816650390625, "logps/rejected": -431.5240071614583, "loss": 0.2082, "rewards/chosen": -0.19360047578811646, "rewards/margins": 2.2981949051221213, "rewards/rejected": -2.491795380910238, "step": 6786 }, { "epoch": 0.35973816023109745, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19540072.0, "logits/rejected": -47303092.0, "logps/chosen": -301.0860188802083, "logps/rejected": -528.0771484375, "loss": 0.367, "rewards/chosen": 0.1810403267542521, "rewards/margins": 2.6599588791529336, "rewards/rejected": -2.4789185523986816, "step": 6787 }, { "epoch": 0.3597911642328996, "grad_norm": 49.75, "kl": 0.35972023010253906, "learning_rate": 5e-07, "logits/chosen": -13518280.0, "logits/rejected": -51551562.666666664, "logps/chosen": -159.3446044921875, "logps/rejected": -184.65323893229166, "loss": 0.4223, "rewards/chosen": -0.03397934436798096, "rewards/margins": 0.9353724241256713, "rewards/rejected": -0.9693517684936523, "step": 6788 }, { "epoch": 0.3598441682347017, "grad_norm": 70.5, "kl": 0.6352787017822266, "learning_rate": 5e-07, "logits/chosen": -33405736.0, "logits/rejected": 7429500.5, "logps/chosen": -419.3291422526042, "logps/rejected": -338.737548828125, "loss": 0.3798, "rewards/chosen": 0.35296106338500977, "rewards/margins": 1.7875515222549438, "rewards/rejected": -1.434590458869934, "step": 6789 }, { "epoch": 0.35989717223650386, "grad_norm": 47.75, "kl": 2.3819665908813477, "learning_rate": 5e-07, "logits/chosen": -18010189.714285713, "logits/rejected": -22684058.0, "logps/chosen": -420.91127232142856, "logps/rejected": -492.4512939453125, "loss": 0.3617, "rewards/chosen": 0.764239856175014, "rewards/margins": 3.9768131460462297, "rewards/rejected": -3.212573289871216, "step": 6790 }, { "epoch": 0.359950176238306, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -890756.25, "logits/rejected": -40377059.2, "logps/chosen": -103.93044026692708, "logps/rejected": -472.7724609375, "loss": 0.2529, "rewards/chosen": 0.38546351591746014, "rewards/margins": 2.6460126797358194, "rewards/rejected": -2.2605491638183595, "step": 6791 }, { "epoch": 0.36000318024010813, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27251027.2, "logits/rejected": -13437882.666666666, "logps/chosen": -476.8146484375, "logps/rejected": -195.2177937825521, "loss": 0.3112, "rewards/chosen": 0.4930454730987549, "rewards/margins": 2.490745687484741, "rewards/rejected": -1.9977002143859863, "step": 6792 }, { "epoch": 0.36005618424191027, "grad_norm": 37.75, "kl": 0.9695529937744141, "learning_rate": 5e-07, "logits/chosen": 10834179.0, "logits/rejected": -28121396.57142857, "logps/chosen": -484.5511169433594, "logps/rejected": -322.62977818080356, "loss": 0.191, "rewards/chosen": 1.4126557111740112, "rewards/margins": 3.2266510725021362, "rewards/rejected": -1.813995361328125, "step": 6793 }, { "epoch": 0.3601091882437124, "grad_norm": 42.75, "kl": 1.6922588348388672, "learning_rate": 5e-07, "logits/chosen": -22522178.666666668, "logits/rejected": -21433876.0, "logps/chosen": -221.81461588541666, "logps/rejected": -168.93861389160156, "loss": 0.463, "rewards/chosen": -0.015276054541269938, "rewards/margins": 1.519114871819814, "rewards/rejected": -1.534390926361084, "step": 6794 }, { "epoch": 0.36016219224551455, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30012892.8, "logits/rejected": -43666533.333333336, "logps/chosen": -204.5302734375, "logps/rejected": -446.8118489583333, "loss": 0.3932, "rewards/chosen": -0.12293144464492797, "rewards/margins": 1.8839625080426536, "rewards/rejected": -2.0068939526875815, "step": 6795 }, { "epoch": 0.3602151962473167, "grad_norm": 44.75, "kl": 0.5348091125488281, "learning_rate": 5e-07, "logits/chosen": 5195308.0, "logits/rejected": -18400827.42857143, "logps/chosen": -730.8890380859375, "logps/rejected": -289.03163364955356, "loss": 0.2188, "rewards/chosen": 0.09847412258386612, "rewards/margins": 1.9450675706778253, "rewards/rejected": -1.8465934480939592, "step": 6796 }, { "epoch": 0.3602682002491188, "grad_norm": 42.75, "kl": 0.08957481384277344, "learning_rate": 5e-07, "logits/chosen": -13835682.666666666, "logits/rejected": -15734382.4, "logps/chosen": -227.6064249674479, "logps/rejected": -201.02996826171875, "loss": 0.2662, "rewards/chosen": 0.5708234310150146, "rewards/margins": 2.2215575695037844, "rewards/rejected": -1.6507341384887695, "step": 6797 }, { "epoch": 0.36032120425092096, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11671892.8, "logits/rejected": -31067330.666666668, "logps/chosen": -100.78427734375, "logps/rejected": -279.49658203125, "loss": 0.3628, "rewards/chosen": 0.04692069292068481, "rewards/margins": 1.9009427189826966, "rewards/rejected": -1.8540220260620117, "step": 6798 }, { "epoch": 0.3603742082527231, "grad_norm": 60.5, "kl": 1.4403667449951172, "learning_rate": 5e-07, "logits/chosen": 17515022.85714286, "logits/rejected": -37448012.0, "logps/chosen": -200.11903599330358, "logps/rejected": -529.2147216796875, "loss": 0.4022, "rewards/chosen": 0.3921780586242676, "rewards/margins": 3.1298489570617676, "rewards/rejected": -2.7376708984375, "step": 6799 }, { "epoch": 0.36042721225452523, "grad_norm": 68.5, "kl": 0.39855289459228516, "learning_rate": 5e-07, "logits/chosen": -9377420.0, "logits/rejected": 6672543.5, "logps/chosen": -554.7432861328125, "logps/rejected": -281.46002197265625, "loss": 0.3035, "rewards/chosen": 0.9756157398223877, "rewards/margins": 1.9394046068191528, "rewards/rejected": -0.9637888669967651, "step": 6800 }, { "epoch": 0.36048021625632737, "grad_norm": 42.5, "kl": 0.11032867431640625, "learning_rate": 5e-07, "logits/chosen": -32574952.0, "logits/rejected": -26527552.0, "logps/chosen": -284.5400085449219, "logps/rejected": -141.26510620117188, "loss": 0.3341, "rewards/chosen": -0.1561807543039322, "rewards/margins": 1.889037236571312, "rewards/rejected": -2.045217990875244, "step": 6801 }, { "epoch": 0.3605332202581295, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71599712.0, "logits/rejected": -25902403.2, "logps/chosen": -171.82682291666666, "logps/rejected": -373.0001953125, "loss": 0.2411, "rewards/chosen": 0.4533925453821818, "rewards/margins": 2.313491384188334, "rewards/rejected": -1.8600988388061523, "step": 6802 }, { "epoch": 0.36058622425993164, "grad_norm": 30.375, "kl": 0.4689979553222656, "learning_rate": 5e-07, "logits/chosen": -412825.5625, "logits/rejected": -26717222.0, "logps/chosen": -61.58368682861328, "logps/rejected": -446.30230712890625, "loss": 0.278, "rewards/chosen": 0.25393134355545044, "rewards/margins": 2.701330244541168, "rewards/rejected": -2.4473989009857178, "step": 6803 }, { "epoch": 0.3606392282617338, "grad_norm": 51.75, "kl": 1.6358909606933594, "learning_rate": 5e-07, "logits/chosen": -25861014.4, "logits/rejected": 15193450.666666666, "logps/chosen": -309.5305908203125, "logps/rejected": -255.68798828125, "loss": 0.2532, "rewards/chosen": 0.8088590621948242, "rewards/margins": 3.1745217959086096, "rewards/rejected": -2.3656627337137857, "step": 6804 }, { "epoch": 0.3606922322635359, "grad_norm": 60.5, "kl": 0.0802459716796875, "learning_rate": 5e-07, "logits/chosen": -20615617.6, "logits/rejected": 3405387.6666666665, "logps/chosen": -366.6600830078125, "logps/rejected": -120.52909342447917, "loss": 0.3948, "rewards/chosen": 0.013718116283416747, "rewards/margins": 1.7067341685295105, "rewards/rejected": -1.6930160522460938, "step": 6805 }, { "epoch": 0.36074523626533805, "grad_norm": 50.25, "kl": 1.6283750534057617, "learning_rate": 5e-07, "logits/chosen": -38175640.0, "logits/rejected": -29421414.4, "logps/chosen": -363.1759847005208, "logps/rejected": -373.34345703125, "loss": 0.2625, "rewards/chosen": 0.49738462766011554, "rewards/margins": 2.4008453210194904, "rewards/rejected": -1.903460693359375, "step": 6806 }, { "epoch": 0.3607982402671402, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32948984.0, "logits/rejected": -32035074.666666668, "logps/chosen": -223.85935974121094, "logps/rejected": -132.34037272135416, "loss": 0.256, "rewards/chosen": 0.7305785417556763, "rewards/margins": 2.5855898459752398, "rewards/rejected": -1.8550113042195637, "step": 6807 }, { "epoch": 0.36085124426894233, "grad_norm": 49.25, "kl": 0.7638740539550781, "learning_rate": 5e-07, "logits/chosen": -26798676.0, "logits/rejected": -13997668.0, "logps/chosen": -206.2123565673828, "logps/rejected": -273.7657165527344, "loss": 0.3266, "rewards/chosen": 0.17704831063747406, "rewards/margins": 1.5787458270788193, "rewards/rejected": -1.4016975164413452, "step": 6808 }, { "epoch": 0.36090424827074447, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37381325.333333336, "logits/rejected": -64213452.8, "logps/chosen": -324.1884358723958, "logps/rejected": -749.637060546875, "loss": 0.1838, "rewards/chosen": 0.3222971558570862, "rewards/margins": 4.036754095554352, "rewards/rejected": -3.7144569396972655, "step": 6809 }, { "epoch": 0.3609572522725466, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61381356.8, "logits/rejected": -38728741.333333336, "logps/chosen": -417.87998046875, "logps/rejected": -408.7579345703125, "loss": 0.3984, "rewards/chosen": -0.2729055881500244, "rewards/margins": 1.9276909033457437, "rewards/rejected": -2.200596491495768, "step": 6810 }, { "epoch": 0.36101025627434874, "grad_norm": 57.25, "kl": 1.6526432037353516, "learning_rate": 5e-07, "logits/chosen": -19431997.333333332, "logits/rejected": 10998090.0, "logps/chosen": -527.4798583984375, "logps/rejected": -538.7617797851562, "loss": 0.2772, "rewards/chosen": 0.9544227917989095, "rewards/margins": 3.2576735814412436, "rewards/rejected": -2.303250789642334, "step": 6811 }, { "epoch": 0.3610632602761509, "grad_norm": 46.75, "kl": 0.16912460327148438, "learning_rate": 5e-07, "logits/chosen": -16759862.666666666, "logits/rejected": -51815792.0, "logps/chosen": -205.95589192708334, "logps/rejected": -665.7076416015625, "loss": 0.3135, "rewards/chosen": 0.5534581343332926, "rewards/margins": 2.827995697657267, "rewards/rejected": -2.2745375633239746, "step": 6812 }, { "epoch": 0.36111626427795296, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49652682.666666664, "logits/rejected": -126362675.2, "logps/chosen": -388.0250244140625, "logps/rejected": -383.988232421875, "loss": 0.2483, "rewards/chosen": 0.19900715351104736, "rewards/margins": 2.4725295305252075, "rewards/rejected": -2.27352237701416, "step": 6813 }, { "epoch": 0.3611692682797551, "grad_norm": 50.0, "kl": 0.5600423812866211, "learning_rate": 5e-07, "logits/chosen": -17400593.333333332, "logits/rejected": -58478048.0, "logps/chosen": -146.52885945638022, "logps/rejected": -356.87802734375, "loss": 0.2671, "rewards/chosen": -0.05764718850453695, "rewards/margins": 2.4323853890101113, "rewards/rejected": -2.4900325775146483, "step": 6814 }, { "epoch": 0.36122227228155723, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46514232.0, "logits/rejected": -29669508.0, "logps/chosen": -273.07781982421875, "logps/rejected": -399.3354797363281, "loss": 0.2324, "rewards/chosen": 0.5137842893600464, "rewards/margins": 3.2839800119400024, "rewards/rejected": -2.770195722579956, "step": 6815 }, { "epoch": 0.36127527628335937, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47307194.666666664, "logits/rejected": -41280950.4, "logps/chosen": -329.480712890625, "logps/rejected": -331.10625, "loss": 0.2497, "rewards/chosen": 0.44718194007873535, "rewards/margins": 2.525282144546509, "rewards/rejected": -2.0781002044677734, "step": 6816 }, { "epoch": 0.3613282802851615, "grad_norm": 78.0, "kl": 0.2021465301513672, "learning_rate": 5e-07, "logits/chosen": -29974120.0, "logits/rejected": 94485328.0, "logps/chosen": -288.7671813964844, "logps/rejected": -517.1071166992188, "loss": 0.3555, "rewards/chosen": 0.021437883377075195, "rewards/margins": 2.0804669857025146, "rewards/rejected": -2.0590291023254395, "step": 6817 }, { "epoch": 0.36138128428696364, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109660501.33333333, "logits/rejected": -30387548.8, "logps/chosen": -340.1754964192708, "logps/rejected": -357.8759765625, "loss": 0.2718, "rewards/chosen": 0.07570393880208333, "rewards/margins": 1.8974578221638996, "rewards/rejected": -1.8217538833618163, "step": 6818 }, { "epoch": 0.3614342882887658, "grad_norm": 47.5, "kl": 0.5416650772094727, "learning_rate": 5e-07, "logits/chosen": -46912544.0, "logits/rejected": -14346304.0, "logps/chosen": -319.5725341796875, "logps/rejected": -113.2502950032552, "loss": 0.3668, "rewards/chosen": 0.37386019229888917, "rewards/margins": 1.5902835448582966, "rewards/rejected": -1.2164233525594075, "step": 6819 }, { "epoch": 0.3614872922905679, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36430962.666666664, "logits/rejected": -16017673.6, "logps/chosen": -263.8065185546875, "logps/rejected": -426.50244140625, "loss": 0.2003, "rewards/chosen": 0.7949357032775879, "rewards/margins": 2.909028720855713, "rewards/rejected": -2.114093017578125, "step": 6820 }, { "epoch": 0.36154029629237006, "grad_norm": 38.0, "kl": 1.2741880416870117, "learning_rate": 5e-07, "logits/chosen": 3195422.0, "logits/rejected": -29286948.0, "logps/chosen": -121.51747131347656, "logps/rejected": -267.8865661621094, "loss": 0.3689, "rewards/chosen": 0.10707403719425201, "rewards/margins": 1.3202173858880997, "rewards/rejected": -1.2131433486938477, "step": 6821 }, { "epoch": 0.3615933002941722, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23558898.666666668, "logits/rejected": -2907346.5, "logps/chosen": -210.462158203125, "logps/rejected": -42.534908294677734, "loss": 0.5288, "rewards/chosen": -0.22799734274546304, "rewards/margins": -0.18211359033981958, "rewards/rejected": -0.04588375240564346, "step": 6822 }, { "epoch": 0.36164630429597433, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77132184.0, "logits/rejected": -9234941.142857144, "logps/chosen": -424.5459289550781, "logps/rejected": -230.74117606026786, "loss": 0.2869, "rewards/chosen": 0.15577088296413422, "rewards/margins": 1.2709642499685287, "rewards/rejected": -1.1151933670043945, "step": 6823 }, { "epoch": 0.36169930829777647, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50316908.0, "logits/rejected": -19939312.0, "logps/chosen": -438.6917419433594, "logps/rejected": -413.0677897135417, "loss": 0.2512, "rewards/chosen": 0.24721984565258026, "rewards/margins": 2.0703283598025637, "rewards/rejected": -1.8231085141499836, "step": 6824 }, { "epoch": 0.3617523122995786, "grad_norm": 45.75, "kl": 0.616431713104248, "learning_rate": 5e-07, "logits/chosen": -16089082.0, "logits/rejected": -29593116.0, "logps/chosen": -379.78790283203125, "logps/rejected": -206.95333862304688, "loss": 0.3742, "rewards/chosen": 0.0692775696516037, "rewards/margins": 1.7974414080381393, "rewards/rejected": -1.7281638383865356, "step": 6825 }, { "epoch": 0.36180531630138074, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10272307.42857143, "logits/rejected": -117842096.0, "logps/chosen": -191.98540387834822, "logps/rejected": -693.8111572265625, "loss": 0.3626, "rewards/chosen": 0.5168343271527972, "rewards/margins": 2.730811425617763, "rewards/rejected": -2.213977098464966, "step": 6826 }, { "epoch": 0.3618583203031829, "grad_norm": 54.5, "kl": 3.019287109375, "learning_rate": 5e-07, "logits/chosen": -23574782.4, "logits/rejected": -38390360.0, "logps/chosen": -662.82158203125, "logps/rejected": -348.9993896484375, "loss": 0.3266, "rewards/chosen": 0.7865736007690429, "rewards/margins": 2.210013580322266, "rewards/rejected": -1.4234399795532227, "step": 6827 }, { "epoch": 0.361911324304985, "grad_norm": 60.25, "kl": 0.6854133605957031, "learning_rate": 5e-07, "logits/chosen": -41064269.333333336, "logits/rejected": -25727180.0, "logps/chosen": -507.0704345703125, "logps/rejected": -347.0126037597656, "loss": 0.3068, "rewards/chosen": 0.8734944661458334, "rewards/margins": 2.0318135817845664, "rewards/rejected": -1.158319115638733, "step": 6828 }, { "epoch": 0.36196432830678715, "grad_norm": 48.0, "kl": 0.04909706115722656, "learning_rate": 5e-07, "logits/chosen": -23951769.6, "logits/rejected": -63933589.333333336, "logps/chosen": -266.9189697265625, "logps/rejected": -251.69453938802084, "loss": 0.303, "rewards/chosen": 0.4459108352661133, "rewards/margins": 2.247934087117513, "rewards/rejected": -1.8020232518513997, "step": 6829 }, { "epoch": 0.3620173323085893, "grad_norm": 53.5, "kl": 0.8635940551757812, "learning_rate": 5e-07, "logits/chosen": -22820186.0, "logits/rejected": -21010758.0, "logps/chosen": -471.5649719238281, "logps/rejected": -222.0039520263672, "loss": 0.2488, "rewards/chosen": 0.930328369140625, "rewards/margins": 2.593888998031616, "rewards/rejected": -1.6635606288909912, "step": 6830 }, { "epoch": 0.3620703363103914, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28353960.0, "logits/rejected": -955955.0625, "logps/chosen": -209.45599365234375, "logps/rejected": -261.9143981933594, "loss": 0.3285, "rewards/chosen": 0.18233534693717957, "rewards/margins": 1.7095583379268646, "rewards/rejected": -1.527222990989685, "step": 6831 }, { "epoch": 0.36212334031219356, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43737820.0, "logits/rejected": -23074266.0, "logps/chosen": -316.79437255859375, "logps/rejected": -548.9512939453125, "loss": 0.2291, "rewards/chosen": 0.736496090888977, "rewards/margins": 3.0847502946853638, "rewards/rejected": -2.3482542037963867, "step": 6832 }, { "epoch": 0.3621763443139957, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38855856.0, "logits/rejected": -48953896.0, "logps/chosen": -343.1260681152344, "logps/rejected": -495.6468811035156, "loss": 0.3057, "rewards/chosen": 0.21781203150749207, "rewards/margins": 2.0741758048534393, "rewards/rejected": -1.8563637733459473, "step": 6833 }, { "epoch": 0.36222934831579784, "grad_norm": 44.75, "kl": 0.9910202026367188, "learning_rate": 5e-07, "logits/chosen": -16531729.333333334, "logits/rejected": -30332964.0, "logps/chosen": -262.75620524088544, "logps/rejected": -488.64154052734375, "loss": 0.3251, "rewards/chosen": 0.7183601061503092, "rewards/margins": 2.2731880346934, "rewards/rejected": -1.5548279285430908, "step": 6834 }, { "epoch": 0.3622823523176, "grad_norm": 50.5, "kl": 0.2800769805908203, "learning_rate": 5e-07, "logits/chosen": -42889974.4, "logits/rejected": -20815752.0, "logps/chosen": -382.3876953125, "logps/rejected": -493.4457194010417, "loss": 0.285, "rewards/chosen": 0.7070404052734375, "rewards/margins": 3.628039296468099, "rewards/rejected": -2.9209988911946616, "step": 6835 }, { "epoch": 0.3623353563194021, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12364028.0, "logits/rejected": -48705052.0, "logps/chosen": -116.87747955322266, "logps/rejected": -443.3406677246094, "loss": 0.3405, "rewards/chosen": -0.08886909484863281, "rewards/margins": 1.9415254592895508, "rewards/rejected": -2.0303945541381836, "step": 6836 }, { "epoch": 0.36238836032120425, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6433736.0, "logits/rejected": -1156827.25, "logps/chosen": -219.63772583007812, "logps/rejected": -305.5718688964844, "loss": 0.3571, "rewards/chosen": 0.10916848480701447, "rewards/margins": 1.5406958609819412, "rewards/rejected": -1.4315273761749268, "step": 6837 }, { "epoch": 0.3624413643230064, "grad_norm": 45.75, "kl": 0.029140472412109375, "learning_rate": 5e-07, "logits/chosen": -32357909.333333332, "logits/rejected": -11535612.0, "logps/chosen": -288.3131917317708, "logps/rejected": -521.138134765625, "loss": 0.2519, "rewards/chosen": 0.21480711301167807, "rewards/margins": 2.509284003575643, "rewards/rejected": -2.2944768905639648, "step": 6838 }, { "epoch": 0.3624943683248085, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4066511.25, "logits/rejected": -12198289.0, "logps/chosen": -123.04969787597656, "logps/rejected": -239.6809539794922, "loss": 0.2821, "rewards/chosen": 0.5450615882873535, "rewards/margins": 2.4306185245513916, "rewards/rejected": -1.885556936264038, "step": 6839 }, { "epoch": 0.36254737232661066, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18386062.666666668, "logits/rejected": -14475684.8, "logps/chosen": -103.506103515625, "logps/rejected": -158.90230712890624, "loss": 0.2666, "rewards/chosen": 0.08069096008936565, "rewards/margins": 2.4590807000796, "rewards/rejected": -2.3783897399902343, "step": 6840 }, { "epoch": 0.3626003763284128, "grad_norm": 39.0, "kl": 0.43368053436279297, "learning_rate": 5e-07, "logits/chosen": 9021435.2, "logits/rejected": -18572337.333333332, "logps/chosen": -164.033984375, "logps/rejected": -337.46718343098956, "loss": 0.3486, "rewards/chosen": 0.26815319061279297, "rewards/margins": 2.674846331278483, "rewards/rejected": -2.40669314066569, "step": 6841 }, { "epoch": 0.36265338033021494, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25194114.0, "logits/rejected": -35418624.0, "logps/chosen": -382.97015380859375, "logps/rejected": -176.12245178222656, "loss": 0.2985, "rewards/chosen": 0.4256591796875, "rewards/margins": 2.027394652366638, "rewards/rejected": -1.6017354726791382, "step": 6842 }, { "epoch": 0.3627063843320171, "grad_norm": 50.0, "kl": 0.34694671630859375, "learning_rate": 5e-07, "logits/chosen": -21099798.4, "logits/rejected": -13026130.666666666, "logps/chosen": -176.46217041015626, "logps/rejected": -180.30084228515625, "loss": 0.3981, "rewards/chosen": -0.12603477239608765, "rewards/margins": 1.9508930484453835, "rewards/rejected": -2.076927820841471, "step": 6843 }, { "epoch": 0.3627593883338192, "grad_norm": 51.75, "kl": 0.29959869384765625, "learning_rate": 5e-07, "logits/chosen": -43552086.4, "logits/rejected": -41193018.666666664, "logps/chosen": -312.32939453125, "logps/rejected": -158.6659138997396, "loss": 0.37, "rewards/chosen": 0.14807742834091187, "rewards/margins": 2.0570317308108015, "rewards/rejected": -1.9089543024698894, "step": 6844 }, { "epoch": 0.36281239233562135, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39312493.333333336, "logits/rejected": -51437177.6, "logps/chosen": -361.6775716145833, "logps/rejected": -335.2624267578125, "loss": 0.2862, "rewards/chosen": 0.3445546627044678, "rewards/margins": 1.7906609058380127, "rewards/rejected": -1.446106243133545, "step": 6845 }, { "epoch": 0.3628653963374235, "grad_norm": 63.25, "kl": 0.34506988525390625, "learning_rate": 5e-07, "logits/chosen": -40904538.666666664, "logits/rejected": -41377664.0, "logps/chosen": -420.9049886067708, "logps/rejected": -694.7332153320312, "loss": 0.2995, "rewards/chosen": 0.6215306520462036, "rewards/margins": 3.476280093193054, "rewards/rejected": -2.8547494411468506, "step": 6846 }, { "epoch": 0.3629184003392256, "grad_norm": 60.0, "kl": 2.7852516174316406, "learning_rate": 5e-07, "logits/chosen": -35131404.8, "logits/rejected": -9084746.0, "logps/chosen": -361.73955078125, "logps/rejected": -315.1748453776042, "loss": 0.3321, "rewards/chosen": 0.47777762413024905, "rewards/margins": 2.7663806120554604, "rewards/rejected": -2.2886029879252114, "step": 6847 }, { "epoch": 0.36297140434102776, "grad_norm": 42.75, "kl": 2.4748916625976562, "learning_rate": 5e-07, "logits/chosen": -61840360.0, "logits/rejected": -23088752.0, "logps/chosen": -407.91168212890625, "logps/rejected": -308.8978576660156, "loss": 0.1927, "rewards/chosen": 1.2788825035095215, "rewards/margins": 3.1896469593048096, "rewards/rejected": -1.910764455795288, "step": 6848 }, { "epoch": 0.3630244083428299, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -471542.9375, "logits/rejected": -12851606.666666666, "logps/chosen": -155.31097412109375, "logps/rejected": -659.8211263020834, "loss": 0.2238, "rewards/chosen": 0.1567508727312088, "rewards/margins": 3.5871374160051346, "rewards/rejected": -3.430386543273926, "step": 6849 }, { "epoch": 0.36307741234463203, "grad_norm": 73.5, "kl": 0.9985427856445312, "learning_rate": 5e-07, "logits/chosen": -31062246.4, "logits/rejected": 1649300.5, "logps/chosen": -403.3294921875, "logps/rejected": -89.91858927408855, "loss": 0.3974, "rewards/chosen": 0.06904296875, "rewards/margins": 1.537001895904541, "rewards/rejected": -1.467958927154541, "step": 6850 }, { "epoch": 0.36313041634643417, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5205618.0, "logits/rejected": 1297124.3333333333, "logps/chosen": -37.928382873535156, "logps/rejected": -178.8428751627604, "loss": 0.2587, "rewards/chosen": -0.2049008458852768, "rewards/margins": 1.7135911534229915, "rewards/rejected": -1.9184919993082683, "step": 6851 }, { "epoch": 0.3631834203482363, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11484722.0, "logits/rejected": -34321876.0, "logps/chosen": -292.1515197753906, "logps/rejected": -416.3161315917969, "loss": 0.2839, "rewards/chosen": 0.34767037630081177, "rewards/margins": 2.5845125317573547, "rewards/rejected": -2.236842155456543, "step": 6852 }, { "epoch": 0.36323642435003844, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29836526.0, "logits/rejected": -10160556.0, "logps/chosen": -399.4300231933594, "logps/rejected": -319.1266784667969, "loss": 0.3281, "rewards/chosen": -0.1705063134431839, "rewards/margins": 2.702913925051689, "rewards/rejected": -2.873420238494873, "step": 6853 }, { "epoch": 0.3632894283518406, "grad_norm": 54.5, "kl": 0.4404716491699219, "learning_rate": 5e-07, "logits/chosen": -33133957.333333332, "logits/rejected": -13046942.4, "logps/chosen": -363.3367513020833, "logps/rejected": -158.5103271484375, "loss": 0.2631, "rewards/chosen": 0.7031768957773844, "rewards/margins": 2.0624326864878335, "rewards/rejected": -1.3592557907104492, "step": 6854 }, { "epoch": 0.3633424323536427, "grad_norm": 60.5, "kl": 1.7641448974609375, "learning_rate": 5e-07, "logits/chosen": -28518710.85714286, "logits/rejected": -18883610.0, "logps/chosen": -330.1988002232143, "logps/rejected": -128.9961700439453, "loss": 0.4464, "rewards/chosen": 0.2738896438053676, "rewards/margins": 1.0531506964138577, "rewards/rejected": -0.77926105260849, "step": 6855 }, { "epoch": 0.36339543635544486, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7506966.0, "logits/rejected": -23735038.0, "logps/chosen": -243.0933837890625, "logps/rejected": -534.5011596679688, "loss": 0.2265, "rewards/chosen": 0.7404113411903381, "rewards/margins": 3.2263675332069397, "rewards/rejected": -2.4859561920166016, "step": 6856 }, { "epoch": 0.363448440357247, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32861570.0, "logits/rejected": -19867627.42857143, "logps/chosen": -452.05718994140625, "logps/rejected": -331.9803989955357, "loss": 0.2352, "rewards/chosen": 0.3491882383823395, "rewards/margins": 1.9954780638217926, "rewards/rejected": -1.6462898254394531, "step": 6857 }, { "epoch": 0.36350144435904913, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24177284.8, "logits/rejected": -36130010.666666664, "logps/chosen": -130.0350830078125, "logps/rejected": -486.1732584635417, "loss": 0.4024, "rewards/chosen": -0.12167311906814575, "rewards/margins": 1.578657615184784, "rewards/rejected": -1.7003307342529297, "step": 6858 }, { "epoch": 0.36355444836085127, "grad_norm": 53.0, "kl": 0.1332855224609375, "learning_rate": 5e-07, "logits/chosen": -24158899.2, "logits/rejected": -3005586.6666666665, "logps/chosen": -204.38297119140626, "logps/rejected": -142.61116536458334, "loss": 0.4513, "rewards/chosen": -0.15841274261474608, "rewards/margins": 1.2375264485677082, "rewards/rejected": -1.3959391911824544, "step": 6859 }, { "epoch": 0.3636074523626534, "grad_norm": 40.5, "kl": 1.0064640045166016, "learning_rate": 5e-07, "logits/chosen": 1757477.75, "logits/rejected": -13194202.0, "logps/chosen": -170.80101013183594, "logps/rejected": -272.5144348144531, "loss": 0.3674, "rewards/chosen": 0.2514530122280121, "rewards/margins": 1.393695443868637, "rewards/rejected": -1.142242431640625, "step": 6860 }, { "epoch": 0.36366045636445554, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70517536.0, "logits/rejected": -13978809.0, "logps/chosen": -473.5670166015625, "logps/rejected": -322.58233642578125, "loss": 0.3282, "rewards/chosen": -0.07162512838840485, "rewards/margins": 2.0258742421865463, "rewards/rejected": -2.097499370574951, "step": 6861 }, { "epoch": 0.3637134603662577, "grad_norm": 42.75, "kl": 1.2530193328857422, "learning_rate": 5e-07, "logits/chosen": -39180328.0, "logits/rejected": -30150854.4, "logps/chosen": -527.0038248697916, "logps/rejected": -275.01044921875, "loss": 0.2359, "rewards/chosen": 0.8717194398244222, "rewards/margins": 2.9710350831349692, "rewards/rejected": -2.099315643310547, "step": 6862 }, { "epoch": 0.36376646436805976, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65845440.0, "logits/rejected": -25105356.0, "logps/chosen": -433.845703125, "logps/rejected": -368.17572021484375, "loss": 0.3306, "rewards/chosen": -0.05606212839484215, "rewards/margins": 2.166127774864435, "rewards/rejected": -2.2221899032592773, "step": 6863 }, { "epoch": 0.3638194683698619, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2646044.5, "logits/rejected": -12885428.57142857, "logps/chosen": -45.29154968261719, "logps/rejected": -210.47004045758928, "loss": 0.2536, "rewards/chosen": -1.0882797241210938, "rewards/margins": 0.530874524797712, "rewards/rejected": -1.6191542489188058, "step": 6864 }, { "epoch": 0.36387247237166404, "grad_norm": 97.0, "kl": 0.8761978149414062, "learning_rate": 5e-07, "logits/chosen": -10380383.0, "logits/rejected": -15006582.0, "logps/chosen": -254.303955078125, "logps/rejected": -281.89239501953125, "loss": 0.2951, "rewards/chosen": 0.4913591742515564, "rewards/margins": 2.1897013783454895, "rewards/rejected": -1.698342204093933, "step": 6865 }, { "epoch": 0.3639254763734662, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33044932.0, "logits/rejected": -11600852.0, "logps/chosen": -298.4862060546875, "logps/rejected": -294.8997802734375, "loss": 0.3055, "rewards/chosen": 0.4723963141441345, "rewards/margins": 1.809307038784027, "rewards/rejected": -1.3369107246398926, "step": 6866 }, { "epoch": 0.3639784803752683, "grad_norm": 49.25, "kl": 1.0162134170532227, "learning_rate": 5e-07, "logits/chosen": -35221993.6, "logits/rejected": -31717656.0, "logps/chosen": -333.8499267578125, "logps/rejected": -184.314208984375, "loss": 0.3384, "rewards/chosen": 0.33751511573791504, "rewards/margins": 2.079736789067586, "rewards/rejected": -1.7422216733296711, "step": 6867 }, { "epoch": 0.36403148437707045, "grad_norm": 82.5, "kl": 0.4060802459716797, "learning_rate": 5e-07, "logits/chosen": -55045280.0, "logits/rejected": -888103.0, "logps/chosen": -346.4908142089844, "logps/rejected": -253.0025177001953, "loss": 0.3854, "rewards/chosen": 0.0948692262172699, "rewards/margins": 1.0450808107852936, "rewards/rejected": -0.9502115845680237, "step": 6868 }, { "epoch": 0.3640844883788726, "grad_norm": 53.75, "kl": 0.5652084350585938, "learning_rate": 5e-07, "logits/chosen": -19277489.333333332, "logits/rejected": -4583138.0, "logps/chosen": -271.87841796875, "logps/rejected": -231.18017578125, "loss": 0.3905, "rewards/chosen": 0.2854718565940857, "rewards/margins": 1.440556824207306, "rewards/rejected": -1.1550849676132202, "step": 6869 }, { "epoch": 0.3641374923806747, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3498722.0, "logits/rejected": -31907386.666666668, "logps/chosen": -87.37671661376953, "logps/rejected": -525.4525146484375, "loss": 0.2201, "rewards/chosen": -0.06267842650413513, "rewards/margins": 2.57582480708758, "rewards/rejected": -2.6385032335917153, "step": 6870 }, { "epoch": 0.36419049638247686, "grad_norm": 47.5, "kl": 0.3272380828857422, "learning_rate": 5e-07, "logits/chosen": 26560496.0, "logits/rejected": 9741720.0, "logps/chosen": -192.780029296875, "logps/rejected": -348.11181640625, "loss": 0.3651, "rewards/chosen": -0.12103780508041381, "rewards/margins": 2.349741772810618, "rewards/rejected": -2.4707795778910318, "step": 6871 }, { "epoch": 0.364243500384279, "grad_norm": 41.5, "kl": 0.3460726737976074, "learning_rate": 5e-07, "logits/chosen": -14281620.0, "logits/rejected": -22976434.0, "logps/chosen": -127.42106628417969, "logps/rejected": -515.0958251953125, "loss": 0.2788, "rewards/chosen": 0.5520638227462769, "rewards/margins": 2.8795915842056274, "rewards/rejected": -2.3275277614593506, "step": 6872 }, { "epoch": 0.36429650438608113, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54951528.0, "logits/rejected": -8019026.0, "logps/chosen": -309.95233154296875, "logps/rejected": -381.0576477050781, "loss": 0.3285, "rewards/chosen": -0.05911140516400337, "rewards/margins": 1.9513108246028423, "rewards/rejected": -2.0104222297668457, "step": 6873 }, { "epoch": 0.36434950838788327, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29943252.0, "logits/rejected": -36461876.0, "logps/chosen": -335.16070556640625, "logps/rejected": -330.597412109375, "loss": 0.2441, "rewards/chosen": 0.7397206425666809, "rewards/margins": 2.6035868525505066, "rewards/rejected": -1.8638662099838257, "step": 6874 }, { "epoch": 0.3644025123896854, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2018129.7142857143, "logits/rejected": -5154376.0, "logps/chosen": -304.3662109375, "logps/rejected": -139.297119140625, "loss": 0.4945, "rewards/chosen": -0.1073096649987357, "rewards/margins": 0.8021996361868722, "rewards/rejected": -0.9095093011856079, "step": 6875 }, { "epoch": 0.36445551639148754, "grad_norm": 70.0, "kl": 1.8385953903198242, "learning_rate": 5e-07, "logits/chosen": -12152349.333333334, "logits/rejected": -12121621.0, "logps/chosen": -223.32194010416666, "logps/rejected": -159.83010864257812, "loss": 0.4311, "rewards/chosen": 0.15011062224706015, "rewards/margins": 1.725857635339101, "rewards/rejected": -1.575747013092041, "step": 6876 }, { "epoch": 0.3645085203932897, "grad_norm": 39.25, "kl": 0.04061698913574219, "learning_rate": 5e-07, "logits/chosen": -20099380.0, "logits/rejected": -23250674.0, "logps/chosen": -156.99264526367188, "logps/rejected": -253.34872436523438, "loss": 0.2934, "rewards/chosen": 0.3859129846096039, "rewards/margins": 2.103353351354599, "rewards/rejected": -1.7174403667449951, "step": 6877 }, { "epoch": 0.3645615243950918, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25512210.285714287, "logits/rejected": -92540856.0, "logps/chosen": -222.48041643415178, "logps/rejected": -265.33514404296875, "loss": 0.4432, "rewards/chosen": 0.12280816691262382, "rewards/margins": 1.2028512443814958, "rewards/rejected": -1.080043077468872, "step": 6878 }, { "epoch": 0.36461452839689396, "grad_norm": 49.25, "kl": 0.8996505737304688, "learning_rate": 5e-07, "logits/chosen": -32923040.0, "logits/rejected": -40351424.0, "logps/chosen": -234.725341796875, "logps/rejected": -503.1810607910156, "loss": 0.2872, "rewards/chosen": 0.11799813061952591, "rewards/margins": 2.594871051609516, "rewards/rejected": -2.4768729209899902, "step": 6879 }, { "epoch": 0.3646675323986961, "grad_norm": 57.0, "kl": 0.9625473022460938, "learning_rate": 5e-07, "logits/chosen": -5005620.0, "logits/rejected": 5931071.0, "logps/chosen": -649.024658203125, "logps/rejected": -218.09701538085938, "loss": 0.2999, "rewards/chosen": 0.6377115249633789, "rewards/margins": 2.1602569818496704, "rewards/rejected": -1.5225454568862915, "step": 6880 }, { "epoch": 0.36472053640049823, "grad_norm": 32.5, "kl": 0.4418792724609375, "learning_rate": 5e-07, "logits/chosen": -21141172.0, "logits/rejected": -27581276.0, "logps/chosen": -213.2210693359375, "logps/rejected": -552.5987548828125, "loss": 0.2291, "rewards/chosen": 0.5680602788925171, "rewards/margins": 4.244372487068176, "rewards/rejected": -3.676312208175659, "step": 6881 }, { "epoch": 0.36477354040230037, "grad_norm": 61.25, "kl": 1.2336511611938477, "learning_rate": 5e-07, "logits/chosen": -12079679.2, "logits/rejected": -24289205.333333332, "logps/chosen": -396.28447265625, "logps/rejected": -280.48891194661456, "loss": 0.3469, "rewards/chosen": 0.5200838088989258, "rewards/margins": 1.8589470227559408, "rewards/rejected": -1.338863213857015, "step": 6882 }, { "epoch": 0.3648265444041025, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4436190.333333333, "logits/rejected": -46014819.2, "logps/chosen": -189.09088134765625, "logps/rejected": -262.6779052734375, "loss": 0.3185, "rewards/chosen": 0.2485195199648539, "rewards/margins": 1.6945050279299418, "rewards/rejected": -1.4459855079650878, "step": 6883 }, { "epoch": 0.36487954840590464, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -654093.5, "logits/rejected": -67270570.66666667, "logps/chosen": -87.08644104003906, "logps/rejected": -279.7375081380208, "loss": 0.2726, "rewards/chosen": -0.1896488219499588, "rewards/margins": 1.4966853111982346, "rewards/rejected": -1.6863341331481934, "step": 6884 }, { "epoch": 0.3649325524077068, "grad_norm": 47.0, "kl": 1.1025161743164062, "learning_rate": 5e-07, "logits/chosen": -10853418.666666666, "logits/rejected": -22691590.4, "logps/chosen": -303.58201090494794, "logps/rejected": -302.7146728515625, "loss": 0.3234, "rewards/chosen": 0.48766255378723145, "rewards/margins": 1.9929973125457763, "rewards/rejected": -1.5053347587585448, "step": 6885 }, { "epoch": 0.3649855564095089, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 578776.1875, "logits/rejected": 130268632.0, "logps/chosen": -107.71328735351562, "logps/rejected": -303.8672790527344, "loss": 0.342, "rewards/chosen": -0.2938114404678345, "rewards/margins": 1.9508856534957886, "rewards/rejected": -2.244697093963623, "step": 6886 }, { "epoch": 0.36503856041131105, "grad_norm": 55.0, "kl": 0.9394187927246094, "learning_rate": 5e-07, "logits/chosen": 26496084.0, "logits/rejected": -10020115.0, "logps/chosen": -332.961669921875, "logps/rejected": -180.6789093017578, "loss": 0.3376, "rewards/chosen": 0.3138432502746582, "rewards/margins": 1.7997153997421265, "rewards/rejected": -1.4858721494674683, "step": 6887 }, { "epoch": 0.3650915644131132, "grad_norm": 101.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27289369.6, "logits/rejected": 2238816.3333333335, "logps/chosen": -489.6908203125, "logps/rejected": -227.18098958333334, "loss": 0.3577, "rewards/chosen": 0.19597045183181763, "rewards/margins": 1.632323976357778, "rewards/rejected": -1.4363535245259602, "step": 6888 }, { "epoch": 0.3651445684149153, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18581898.666666668, "logits/rejected": -8294181.6, "logps/chosen": -107.3164571126302, "logps/rejected": -303.7164306640625, "loss": 0.2661, "rewards/chosen": 0.38872186342875165, "rewards/margins": 2.6801801840464274, "rewards/rejected": -2.291458320617676, "step": 6889 }, { "epoch": 0.36519757241671746, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19036910.0, "logits/rejected": -26327238.85714286, "logps/chosen": -205.27841186523438, "logps/rejected": -348.47572544642856, "loss": 0.1547, "rewards/chosen": 0.131031796336174, "rewards/margins": 2.624831278409277, "rewards/rejected": -2.493799482073103, "step": 6890 }, { "epoch": 0.3652505764185196, "grad_norm": 69.0, "kl": 2.9899559020996094, "learning_rate": 5e-07, "logits/chosen": -49319941.333333336, "logits/rejected": -32273434.0, "logps/chosen": -421.7423909505208, "logps/rejected": -407.9090576171875, "loss": 0.3809, "rewards/chosen": 0.600260337193807, "rewards/margins": 2.301533063252767, "rewards/rejected": -1.70127272605896, "step": 6891 }, { "epoch": 0.36530358042032174, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15104496.0, "logits/rejected": -10445545.6, "logps/chosen": -180.14678955078125, "logps/rejected": -308.8071044921875, "loss": 0.2732, "rewards/chosen": 0.03569183746973673, "rewards/margins": 1.978788952032725, "rewards/rejected": -1.9430971145629883, "step": 6892 }, { "epoch": 0.3653565844221239, "grad_norm": 33.25, "kl": 1.344879150390625, "learning_rate": 5e-07, "logits/chosen": -18654347.2, "logits/rejected": -74518954.66666667, "logps/chosen": -442.070947265625, "logps/rejected": -552.7262369791666, "loss": 0.228, "rewards/chosen": 1.0545364379882813, "rewards/margins": 4.011314646402995, "rewards/rejected": -2.9567782084147134, "step": 6893 }, { "epoch": 0.365409588423926, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5684077.0, "logits/rejected": 6833522.5, "logps/chosen": -163.34542846679688, "logps/rejected": -284.1971435546875, "loss": 0.3658, "rewards/chosen": 0.21874120831489563, "rewards/margins": 1.4624935686588287, "rewards/rejected": -1.243752360343933, "step": 6894 }, { "epoch": 0.36546259242572815, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10907542.0, "logits/rejected": -41637620.0, "logps/chosen": -165.48782348632812, "logps/rejected": -294.23956298828125, "loss": 0.3342, "rewards/chosen": -0.2772088646888733, "rewards/margins": 2.2743443846702576, "rewards/rejected": -2.551553249359131, "step": 6895 }, { "epoch": 0.3655155964275303, "grad_norm": 55.75, "kl": 1.1433677673339844, "learning_rate": 5e-07, "logits/chosen": -23062481.6, "logits/rejected": -9569054.666666666, "logps/chosen": -299.0134033203125, "logps/rejected": -326.69968668619794, "loss": 0.258, "rewards/chosen": 0.7663505554199219, "rewards/margins": 4.290372021993002, "rewards/rejected": -3.5240214665730796, "step": 6896 }, { "epoch": 0.3655686004293324, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8340518.0, "logits/rejected": -19669315.2, "logps/chosen": -128.00435384114584, "logps/rejected": -268.59873046875, "loss": 0.2695, "rewards/chosen": 0.3748578627904256, "rewards/margins": 1.9376679976781208, "rewards/rejected": -1.5628101348876953, "step": 6897 }, { "epoch": 0.36562160443113456, "grad_norm": 54.5, "kl": 1.6050224304199219, "learning_rate": 5e-07, "logits/chosen": -21186556.0, "logits/rejected": -10115077.6, "logps/chosen": -319.23046875, "logps/rejected": -241.825341796875, "loss": 0.3311, "rewards/chosen": 0.25432918469111127, "rewards/margins": 1.7088008681933087, "rewards/rejected": -1.4544716835021974, "step": 6898 }, { "epoch": 0.3656746084329367, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89695765.33333333, "logits/rejected": -32340710.4, "logps/chosen": -392.039794921875, "logps/rejected": -322.3357177734375, "loss": 0.1834, "rewards/chosen": 0.7559995651245117, "rewards/margins": 3.38528995513916, "rewards/rejected": -2.6292903900146483, "step": 6899 }, { "epoch": 0.36572761243473884, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41648930.666666664, "logits/rejected": -17428048.0, "logps/chosen": -264.42645263671875, "logps/rejected": -302.0894287109375, "loss": 0.2593, "rewards/chosen": 0.2185564637184143, "rewards/margins": 2.308355963230133, "rewards/rejected": -2.089799499511719, "step": 6900 }, { "epoch": 0.365780616436541, "grad_norm": 56.75, "kl": 0.10256195068359375, "learning_rate": 5e-07, "logits/chosen": -15669741.0, "logits/rejected": -14163034.0, "logps/chosen": -473.1672668457031, "logps/rejected": -164.4413604736328, "loss": 0.3265, "rewards/chosen": 0.5032323598861694, "rewards/margins": 1.875396728515625, "rewards/rejected": -1.3721643686294556, "step": 6901 }, { "epoch": 0.3658336204383431, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31818364.8, "logits/rejected": -62446581.333333336, "logps/chosen": -251.3298095703125, "logps/rejected": -444.7045491536458, "loss": 0.3445, "rewards/chosen": 0.2104381799697876, "rewards/margins": 2.3660918156305946, "rewards/rejected": -2.155653635660807, "step": 6902 }, { "epoch": 0.36588662444014525, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34596115.2, "logits/rejected": -27784986.666666668, "logps/chosen": -345.12041015625, "logps/rejected": -154.6115926106771, "loss": 0.3405, "rewards/chosen": 0.30100021362304685, "rewards/margins": 2.0817860921223956, "rewards/rejected": -1.7807858784993489, "step": 6903 }, { "epoch": 0.3659396284419474, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7058747.0, "logits/rejected": -17045388.0, "logps/chosen": -436.58935546875, "logps/rejected": -308.587646484375, "loss": 0.2991, "rewards/chosen": 0.00176316499710083, "rewards/margins": 2.8801315426826477, "rewards/rejected": -2.878368377685547, "step": 6904 }, { "epoch": 0.3659926324437495, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23099976.0, "logits/rejected": 7300864.5, "logps/chosen": -303.42694091796875, "logps/rejected": -343.3907165527344, "loss": 0.2374, "rewards/chosen": 0.7470184564590454, "rewards/margins": 3.114472270011902, "rewards/rejected": -2.3674538135528564, "step": 6905 }, { "epoch": 0.36604563644555166, "grad_norm": 41.25, "kl": 0.3324699401855469, "learning_rate": 5e-07, "logits/chosen": -55563888.0, "logits/rejected": -28556040.0, "logps/chosen": -423.0311279296875, "logps/rejected": -522.7665405273438, "loss": 0.1908, "rewards/chosen": 0.8338077068328857, "rewards/margins": 3.936525821685791, "rewards/rejected": -3.1027181148529053, "step": 6906 }, { "epoch": 0.3660986404473538, "grad_norm": 59.75, "kl": 0.0094451904296875, "learning_rate": 5e-07, "logits/chosen": -38019139.2, "logits/rejected": -11360125.333333334, "logps/chosen": -319.834130859375, "logps/rejected": -114.11250813802083, "loss": 0.4104, "rewards/chosen": -0.29023807048797606, "rewards/margins": 1.4787905931472778, "rewards/rejected": -1.769028663635254, "step": 6907 }, { "epoch": 0.36615164444915593, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52313744.0, "logits/rejected": -19810562.666666668, "logps/chosen": -236.2292938232422, "logps/rejected": -143.92765299479166, "loss": 0.2517, "rewards/chosen": 1.0240700244903564, "rewards/margins": 2.3174426555633545, "rewards/rejected": -1.293372631072998, "step": 6908 }, { "epoch": 0.36620464845095807, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34817632.0, "logits/rejected": -24064534.0, "logps/chosen": -201.98196411132812, "logps/rejected": -421.1278076171875, "loss": 0.3665, "rewards/chosen": -0.030564218759536743, "rewards/margins": 1.4611351191997528, "rewards/rejected": -1.4916993379592896, "step": 6909 }, { "epoch": 0.3662576524527602, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22924236.0, "logits/rejected": 1740658.0, "logps/chosen": -330.798583984375, "logps/rejected": -781.011962890625, "loss": 0.2828, "rewards/chosen": 0.21169301867485046, "rewards/margins": 2.263390988111496, "rewards/rejected": -2.0516979694366455, "step": 6910 }, { "epoch": 0.36631065645456234, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44203242.666666664, "logits/rejected": -21687035.2, "logps/chosen": -608.2884521484375, "logps/rejected": -182.1589599609375, "loss": 0.2229, "rewards/chosen": 0.7763651212056478, "rewards/margins": 2.8095493634541833, "rewards/rejected": -2.0331842422485353, "step": 6911 }, { "epoch": 0.3663636604563645, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42984083.2, "logits/rejected": 182687.66666666666, "logps/chosen": -308.1419189453125, "logps/rejected": -681.7841796875, "loss": 0.3698, "rewards/chosen": 0.022005760669708253, "rewards/margins": 2.202172323067983, "rewards/rejected": -2.180166562398275, "step": 6912 }, { "epoch": 0.3664166644581666, "grad_norm": 62.0, "kl": 0.9124984741210938, "learning_rate": 5e-07, "logits/chosen": -32502538.666666668, "logits/rejected": -16898124.0, "logps/chosen": -463.6962890625, "logps/rejected": -218.24534606933594, "loss": 0.2693, "rewards/chosen": 0.781853993733724, "rewards/margins": 3.858379681905111, "rewards/rejected": -3.0765256881713867, "step": 6913 }, { "epoch": 0.3664696684599687, "grad_norm": 37.25, "kl": 0.8580684661865234, "learning_rate": 5e-07, "logits/chosen": -35092848.0, "logits/rejected": -19677363.2, "logps/chosen": -393.8163248697917, "logps/rejected": -272.80498046875, "loss": 0.2035, "rewards/chosen": 0.5365917682647705, "rewards/margins": 3.0577152729034425, "rewards/rejected": -2.521123504638672, "step": 6914 }, { "epoch": 0.36652267246177084, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31112090.0, "logits/rejected": -32156044.0, "logps/chosen": -382.615234375, "logps/rejected": -306.48651123046875, "loss": 0.2441, "rewards/chosen": 0.634344220161438, "rewards/margins": 2.8898123502731323, "rewards/rejected": -2.2554681301116943, "step": 6915 }, { "epoch": 0.366575676463573, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87569056.0, "logits/rejected": -41727130.666666664, "logps/chosen": -484.062646484375, "logps/rejected": -397.3084309895833, "loss": 0.2603, "rewards/chosen": 0.5491793632507325, "rewards/margins": 3.4790976524353026, "rewards/rejected": -2.9299182891845703, "step": 6916 }, { "epoch": 0.3666286804653751, "grad_norm": 56.75, "kl": 0.0350494384765625, "learning_rate": 5e-07, "logits/chosen": -24518590.4, "logits/rejected": -6891662.666666667, "logps/chosen": -236.0815185546875, "logps/rejected": -295.447509765625, "loss": 0.3811, "rewards/chosen": -0.22681069374084473, "rewards/margins": 3.4469614823659263, "rewards/rejected": -3.673772176106771, "step": 6917 }, { "epoch": 0.36668168446717725, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70537317.33333333, "logits/rejected": -42105062.4, "logps/chosen": -159.6311238606771, "logps/rejected": -267.861474609375, "loss": 0.2766, "rewards/chosen": 0.2311378518740336, "rewards/margins": 2.1959342996279396, "rewards/rejected": -1.9647964477539062, "step": 6918 }, { "epoch": 0.3667346884689794, "grad_norm": 51.25, "kl": 0.40729522705078125, "learning_rate": 5e-07, "logits/chosen": -23826102.0, "logits/rejected": -13983539.0, "logps/chosen": -461.02001953125, "logps/rejected": -145.55746459960938, "loss": 0.3217, "rewards/chosen": 0.9061592817306519, "rewards/margins": 1.6227878332138062, "rewards/rejected": -0.7166285514831543, "step": 6919 }, { "epoch": 0.3667876924707815, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44688516.0, "logits/rejected": -35429660.0, "logps/chosen": -303.97320556640625, "logps/rejected": -262.1675720214844, "loss": 0.3146, "rewards/chosen": 0.27352574467658997, "rewards/margins": 1.9112397730350494, "rewards/rejected": -1.6377140283584595, "step": 6920 }, { "epoch": 0.36684069647258366, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21946987.2, "logits/rejected": -94357696.0, "logps/chosen": -173.87098388671876, "logps/rejected": -275.0249430338542, "loss": 0.2913, "rewards/chosen": 0.5951489448547364, "rewards/margins": 3.2699663480122885, "rewards/rejected": -2.6748174031575522, "step": 6921 }, { "epoch": 0.3668937004743858, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 571895.0, "logits/rejected": -23476032.0, "logps/chosen": -146.883056640625, "logps/rejected": -225.60233561197916, "loss": 0.2722, "rewards/chosen": -0.27871522307395935, "rewards/margins": 1.8798142174879708, "rewards/rejected": -2.15852944056193, "step": 6922 }, { "epoch": 0.36694670447618793, "grad_norm": 53.0, "kl": 0.459320068359375, "learning_rate": 5e-07, "logits/chosen": -62489834.666666664, "logits/rejected": -14433574.4, "logps/chosen": -531.7440592447916, "logps/rejected": -95.0493408203125, "loss": 0.3293, "rewards/chosen": 0.7363770008087158, "rewards/margins": 1.5525522708892823, "rewards/rejected": -0.8161752700805665, "step": 6923 }, { "epoch": 0.36699970847799007, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30787477.333333332, "logits/rejected": -15795688.0, "logps/chosen": -280.90216064453125, "logps/rejected": -404.8452880859375, "loss": 0.2222, "rewards/chosen": 0.2562138835589091, "rewards/margins": 2.8833228389422096, "rewards/rejected": -2.6271089553833007, "step": 6924 }, { "epoch": 0.3670527124797922, "grad_norm": 61.25, "kl": 1.0466222763061523, "learning_rate": 5e-07, "logits/chosen": -27825112.0, "logits/rejected": -10924404.0, "logps/chosen": -272.0356852213542, "logps/rejected": -367.0518798828125, "loss": 0.3748, "rewards/chosen": 0.5624918937683105, "rewards/margins": 1.6144938468933105, "rewards/rejected": -1.052001953125, "step": 6925 }, { "epoch": 0.36710571648159435, "grad_norm": 50.75, "kl": 3.9557723999023438, "learning_rate": 5e-07, "logits/chosen": -10092250.666666666, "logits/rejected": -15586528.0, "logps/chosen": -398.7183430989583, "logps/rejected": -300.49627685546875, "loss": 0.2651, "rewards/chosen": 1.1953165531158447, "rewards/margins": 3.4988505840301514, "rewards/rejected": -2.3035340309143066, "step": 6926 }, { "epoch": 0.3671587204833965, "grad_norm": 39.25, "kl": 3.007925033569336, "learning_rate": 5e-07, "logits/chosen": -4455358.0, "logits/rejected": -24447100.0, "logps/chosen": -268.4999694824219, "logps/rejected": -475.0928955078125, "loss": 0.2703, "rewards/chosen": 0.5575241446495056, "rewards/margins": 2.762532889842987, "rewards/rejected": -2.2050087451934814, "step": 6927 }, { "epoch": 0.3672117244851986, "grad_norm": 52.75, "kl": 1.526254653930664, "learning_rate": 5e-07, "logits/chosen": -2363164.0, "logits/rejected": -27864707.2, "logps/chosen": -303.57269287109375, "logps/rejected": -417.09697265625, "loss": 0.2154, "rewards/chosen": 0.7647308508555094, "rewards/margins": 3.0643530050913492, "rewards/rejected": -2.29962215423584, "step": 6928 }, { "epoch": 0.36726472848700076, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17201998.4, "logits/rejected": -35896576.0, "logps/chosen": -227.61181640625, "logps/rejected": -450.7315266927083, "loss": 0.2629, "rewards/chosen": 0.6407864093780518, "rewards/margins": 2.9793638388315835, "rewards/rejected": -2.3385774294535318, "step": 6929 }, { "epoch": 0.3673177324888029, "grad_norm": 43.75, "kl": 2.1045989990234375, "learning_rate": 5e-07, "logits/chosen": -29408089.6, "logits/rejected": -11485389.333333334, "logps/chosen": -275.7867919921875, "logps/rejected": -228.1454874674479, "loss": 0.4433, "rewards/chosen": -0.27450385093688967, "rewards/margins": 1.0785149097442628, "rewards/rejected": -1.3530187606811523, "step": 6930 }, { "epoch": 0.36737073649060503, "grad_norm": 61.0, "kl": 0.7091045379638672, "learning_rate": 5e-07, "logits/chosen": -20626057.6, "logits/rejected": -27966269.333333332, "logps/chosen": -465.889697265625, "logps/rejected": -229.71919759114584, "loss": 0.3193, "rewards/chosen": 0.5287854194641113, "rewards/margins": 2.0788129488627116, "rewards/rejected": -1.5500275293986003, "step": 6931 }, { "epoch": 0.36742374049240717, "grad_norm": 68.0, "kl": 3.138957977294922, "learning_rate": 5e-07, "logits/chosen": -39588637.333333336, "logits/rejected": -27261216.0, "logps/chosen": -643.4410400390625, "logps/rejected": -193.392138671875, "loss": 0.2095, "rewards/chosen": 1.4769455591837566, "rewards/margins": 2.890990320841471, "rewards/rejected": -1.4140447616577148, "step": 6932 }, { "epoch": 0.3674767444942093, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12725438.0, "logits/rejected": -14108678.0, "logps/chosen": -119.08006286621094, "logps/rejected": -113.76788330078125, "loss": 0.3124, "rewards/chosen": 0.032197244465351105, "rewards/margins": 1.9805269315838814, "rewards/rejected": -1.9483296871185303, "step": 6933 }, { "epoch": 0.36752974849601144, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49095808.0, "logits/rejected": -14485976.0, "logps/chosen": -319.0065511067708, "logps/rejected": -168.6674072265625, "loss": 0.2743, "rewards/chosen": 0.5792342821756998, "rewards/margins": 1.9810016314188639, "rewards/rejected": -1.4017673492431642, "step": 6934 }, { "epoch": 0.3675827524978136, "grad_norm": 40.75, "kl": 0.3488044738769531, "learning_rate": 5e-07, "logits/chosen": -42340496.0, "logits/rejected": -33075392.0, "logps/chosen": -263.60369873046875, "logps/rejected": -226.172509765625, "loss": 0.2051, "rewards/chosen": 0.45909321308135986, "rewards/margins": 3.0654927492141724, "rewards/rejected": -2.6063995361328125, "step": 6935 }, { "epoch": 0.3676357564996157, "grad_norm": 46.0, "kl": 3.7369956970214844, "learning_rate": 5e-07, "logits/chosen": -74992192.0, "logits/rejected": -61154821.333333336, "logps/chosen": -556.53623046875, "logps/rejected": -148.88969930013022, "loss": 0.3856, "rewards/chosen": 0.7424938678741455, "rewards/margins": 1.3248058557510376, "rewards/rejected": -0.5823119878768921, "step": 6936 }, { "epoch": 0.36768876050141786, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38611344.0, "logits/rejected": -22158898.0, "logps/chosen": -404.0614013671875, "logps/rejected": -277.8682556152344, "loss": 0.3204, "rewards/chosen": 0.012765690684318542, "rewards/margins": 1.9867301434278488, "rewards/rejected": -1.9739644527435303, "step": 6937 }, { "epoch": 0.36774176450322, "grad_norm": 45.75, "kl": 0.338287353515625, "learning_rate": 5e-07, "logits/chosen": -27027100.0, "logits/rejected": -10645772.0, "logps/chosen": -265.9805603027344, "logps/rejected": -166.28872680664062, "loss": 0.3422, "rewards/chosen": -0.09629020094871521, "rewards/margins": 1.5473792850971222, "rewards/rejected": -1.6436694860458374, "step": 6938 }, { "epoch": 0.36779476850502213, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89556424.0, "logits/rejected": -28307830.0, "logps/chosen": -155.3551025390625, "logps/rejected": -191.16595458984375, "loss": 0.3243, "rewards/chosen": 0.10410423576831818, "rewards/margins": 2.195917323231697, "rewards/rejected": -2.091813087463379, "step": 6939 }, { "epoch": 0.36784777250682427, "grad_norm": 55.25, "kl": 1.1599388122558594, "learning_rate": 5e-07, "logits/chosen": -19729461.333333332, "logits/rejected": -37363640.0, "logps/chosen": -366.9047037760417, "logps/rejected": -335.8980712890625, "loss": 0.3167, "rewards/chosen": 0.826988935470581, "rewards/margins": 2.7782081365585327, "rewards/rejected": -1.9512192010879517, "step": 6940 }, { "epoch": 0.3679007765086264, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -40098960.0, "logps/rejected": -313.20745849609375, "loss": 0.145, "rewards/rejected": -1.9269208908081055, "step": 6941 }, { "epoch": 0.36795378051042854, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36776601.6, "logits/rejected": 1400753.3333333333, "logps/chosen": -490.487890625, "logps/rejected": -347.4193522135417, "loss": 0.2351, "rewards/chosen": 0.7832770347595215, "rewards/margins": 3.107777754465739, "rewards/rejected": -2.3245007197062173, "step": 6942 }, { "epoch": 0.3680067845122307, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15782417.0, "logits/rejected": -4466994.0, "logps/chosen": -220.1022186279297, "logps/rejected": -200.27723693847656, "loss": 0.2916, "rewards/chosen": 0.8243440389633179, "rewards/margins": 2.2422467470169067, "rewards/rejected": -1.4179027080535889, "step": 6943 }, { "epoch": 0.3680597885140328, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32449292.0, "logits/rejected": -11189562.0, "logps/chosen": -322.94287109375, "logps/rejected": -170.92559814453125, "loss": 0.2454, "rewards/chosen": 0.4103359282016754, "rewards/margins": 2.7793258726596832, "rewards/rejected": -2.368989944458008, "step": 6944 }, { "epoch": 0.36811279251583495, "grad_norm": 54.75, "kl": 1.2270317077636719, "learning_rate": 5e-07, "logits/chosen": -32564042.666666668, "logits/rejected": -13260420.0, "logps/chosen": -448.496337890625, "logps/rejected": -200.36544799804688, "loss": 0.3981, "rewards/chosen": 0.4712766806284587, "rewards/margins": 1.5751352707544963, "rewards/rejected": -1.1038585901260376, "step": 6945 }, { "epoch": 0.3681657965176371, "grad_norm": 65.0, "kl": 0.3661518096923828, "learning_rate": 5e-07, "logits/chosen": -42941109.333333336, "logits/rejected": -3007260.5, "logps/chosen": -227.69742838541666, "logps/rejected": -214.56161499023438, "loss": 0.3942, "rewards/chosen": 0.2350545326868693, "rewards/margins": 1.4389005104700725, "rewards/rejected": -1.2038459777832031, "step": 6946 }, { "epoch": 0.3682188005194392, "grad_norm": 49.0, "kl": 1.4207839965820312, "learning_rate": 5e-07, "logits/chosen": -40696612.0, "logits/rejected": -12013385.333333334, "logps/chosen": -876.206787109375, "logps/rejected": -248.82002766927084, "loss": 0.1869, "rewards/chosen": 1.3095802068710327, "rewards/margins": 3.1444391012191772, "rewards/rejected": -1.8348588943481445, "step": 6947 }, { "epoch": 0.36827180452124136, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21340902.666666668, "logits/rejected": -3738139.2, "logps/chosen": -62.492828369140625, "logps/rejected": -372.62763671875, "loss": 0.2356, "rewards/chosen": 0.11580213904380798, "rewards/margins": 2.8669364392757415, "rewards/rejected": -2.7511343002319335, "step": 6948 }, { "epoch": 0.3683248085230435, "grad_norm": 50.0, "kl": 0.3161201477050781, "learning_rate": 5e-07, "logits/chosen": -48324536.0, "logits/rejected": -16518788.0, "logps/chosen": -484.62762451171875, "logps/rejected": -477.67333984375, "loss": 0.244, "rewards/chosen": 0.5815101265907288, "rewards/margins": 2.7810285687446594, "rewards/rejected": -2.1995184421539307, "step": 6949 }, { "epoch": 0.36837781252484564, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30567356.0, "logits/rejected": -19878480.0, "logps/chosen": -267.4527282714844, "logps/rejected": -128.9344940185547, "loss": 0.3399, "rewards/chosen": 0.252902626991272, "rewards/margins": 1.6811243295669556, "rewards/rejected": -1.4282217025756836, "step": 6950 }, { "epoch": 0.3684308165266478, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81609048.0, "logits/rejected": -18616061.714285713, "logps/chosen": -239.70095825195312, "logps/rejected": -377.18788364955356, "loss": 0.1531, "rewards/chosen": -0.34073182940483093, "rewards/margins": 2.2537413282053813, "rewards/rejected": -2.5944731576102122, "step": 6951 }, { "epoch": 0.3684838205284499, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38144984.0, "logits/rejected": -57170438.4, "logps/chosen": -416.76220703125, "logps/rejected": -305.611474609375, "loss": 0.2456, "rewards/chosen": 0.5928944746653239, "rewards/margins": 2.49658195177714, "rewards/rejected": -1.9036874771118164, "step": 6952 }, { "epoch": 0.36853682453025205, "grad_norm": 37.5, "kl": 1.3814544677734375, "learning_rate": 5e-07, "logits/chosen": -67000170.666666664, "logits/rejected": -24819888.0, "logps/chosen": -180.7757568359375, "logps/rejected": -86.86123657226562, "loss": 0.4062, "rewards/chosen": 0.2959978183110555, "rewards/margins": 2.058019836743673, "rewards/rejected": -1.7620220184326172, "step": 6953 }, { "epoch": 0.3685898285320542, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31973125.333333332, "logits/rejected": -16325712.0, "logps/chosen": -261.22279866536456, "logps/rejected": -281.263134765625, "loss": 0.2399, "rewards/chosen": -0.044051105777422585, "rewards/margins": 2.743314044674238, "rewards/rejected": -2.7873651504516603, "step": 6954 }, { "epoch": 0.3686428325338563, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12248949.333333334, "logits/rejected": -8399485.6, "logps/chosen": -143.71739705403647, "logps/rejected": -183.09749755859374, "loss": 0.2315, "rewards/chosen": 0.6426633596420288, "rewards/margins": 2.5185155630111695, "rewards/rejected": -1.8758522033691407, "step": 6955 }, { "epoch": 0.36869583653565846, "grad_norm": 52.5, "kl": 1.458984375, "learning_rate": 5e-07, "logits/chosen": -23599715.2, "logits/rejected": -23897501.333333332, "logps/chosen": -360.104248046875, "logps/rejected": -276.3254801432292, "loss": 0.3166, "rewards/chosen": 0.7419944763183594, "rewards/margins": 2.4784681955973307, "rewards/rejected": -1.7364737192789714, "step": 6956 }, { "epoch": 0.3687488405374606, "grad_norm": 51.0, "kl": 1.085087776184082, "learning_rate": 5e-07, "logits/chosen": -12613588.0, "logits/rejected": -7155661.5, "logps/chosen": -226.08955891927084, "logps/rejected": -208.30776977539062, "loss": 0.3135, "rewards/chosen": 0.5793162186940511, "rewards/margins": 3.528127749760946, "rewards/rejected": -2.9488115310668945, "step": 6957 }, { "epoch": 0.36880184453926274, "grad_norm": 46.75, "kl": 1.4047775268554688, "learning_rate": 5e-07, "logits/chosen": -37558540.8, "logits/rejected": -33931090.666666664, "logps/chosen": -369.15107421875, "logps/rejected": -370.4419759114583, "loss": 0.3026, "rewards/chosen": 0.5800240516662598, "rewards/margins": 2.377209949493408, "rewards/rejected": -1.7971858978271484, "step": 6958 }, { "epoch": 0.3688548485410649, "grad_norm": 73.5, "kl": 0.7798271179199219, "learning_rate": 5e-07, "logits/chosen": -60194784.0, "logits/rejected": -32091328.0, "logps/chosen": -423.2255554199219, "logps/rejected": -346.6337585449219, "loss": 0.265, "rewards/chosen": 0.5114189386367798, "rewards/margins": 2.95427143573761, "rewards/rejected": -2.44285249710083, "step": 6959 }, { "epoch": 0.368907852542867, "grad_norm": 39.25, "kl": 0.7479305267333984, "learning_rate": 5e-07, "logits/chosen": -7528662.666666667, "logits/rejected": -30847244.0, "logps/chosen": -327.87339274088544, "logps/rejected": -242.0105743408203, "loss": 0.3518, "rewards/chosen": 0.5732101599375407, "rewards/margins": 2.6295923391977944, "rewards/rejected": -2.056382179260254, "step": 6960 }, { "epoch": 0.36896085654466915, "grad_norm": 50.5, "kl": 1.0694541931152344, "learning_rate": 5e-07, "logits/chosen": -11025408.0, "logits/rejected": -3518885.1428571427, "logps/chosen": -321.12249755859375, "logps/rejected": -147.80172293526786, "loss": 0.2416, "rewards/chosen": 0.882568359375, "rewards/margins": 2.0193681716918945, "rewards/rejected": -1.1367998123168945, "step": 6961 }, { "epoch": 0.3690138605464713, "grad_norm": 40.0, "kl": 0.9492721557617188, "learning_rate": 5e-07, "logits/chosen": -21665860.0, "logits/rejected": -9952544.666666666, "logps/chosen": -917.3650512695312, "logps/rejected": -242.4102579752604, "loss": 0.1529, "rewards/chosen": 1.4345991611480713, "rewards/margins": 3.478999058405558, "rewards/rejected": -2.044399897257487, "step": 6962 }, { "epoch": 0.3690668645482734, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27934748.8, "logits/rejected": -15320077.333333334, "logps/chosen": -643.32421875, "logps/rejected": -297.7915852864583, "loss": 0.3321, "rewards/chosen": 0.3381669044494629, "rewards/margins": 2.3261973381042482, "rewards/rejected": -1.9880304336547852, "step": 6963 }, { "epoch": 0.36911986855007556, "grad_norm": 53.25, "kl": 0.3791999816894531, "learning_rate": 5e-07, "logits/chosen": -8147859.0, "logits/rejected": 4656226.5, "logps/chosen": -118.47215270996094, "logps/rejected": -160.69573974609375, "loss": 0.4245, "rewards/chosen": 0.02529263310134411, "rewards/margins": 0.7211875896900892, "rewards/rejected": -0.6958949565887451, "step": 6964 }, { "epoch": 0.36917287255187764, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9389722.0, "logits/rejected": -11344634.0, "logps/chosen": -539.484619140625, "logps/rejected": -238.57135009765625, "loss": 0.2201, "rewards/chosen": 0.9714534878730774, "rewards/margins": 3.019865334033966, "rewards/rejected": -2.0484118461608887, "step": 6965 }, { "epoch": 0.3692258765536798, "grad_norm": 53.5, "kl": 0.07570075988769531, "learning_rate": 5e-07, "logits/chosen": -63307370.666666664, "logits/rejected": -27453596.8, "logps/chosen": -306.9300130208333, "logps/rejected": -308.95693359375, "loss": 0.2842, "rewards/chosen": 0.2962443033854167, "rewards/margins": 1.8921291987101239, "rewards/rejected": -1.5958848953247071, "step": 6966 }, { "epoch": 0.3692788805554819, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7951251.5, "logits/rejected": -62100086.85714286, "logps/chosen": -56.98598098754883, "logps/rejected": -531.2292829241071, "loss": 0.1928, "rewards/chosen": -0.20075035095214844, "rewards/margins": 1.7751621518816267, "rewards/rejected": -1.9759125028337752, "step": 6967 }, { "epoch": 0.36933188455728405, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47910937.6, "logits/rejected": -40817085.333333336, "logps/chosen": -364.2154541015625, "logps/rejected": -639.4229329427084, "loss": 0.3105, "rewards/chosen": 0.19175255298614502, "rewards/margins": 2.785558501879374, "rewards/rejected": -2.593805948893229, "step": 6968 }, { "epoch": 0.3693848885590862, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19977482.666666668, "logits/rejected": -31169011.2, "logps/chosen": -431.7626953125, "logps/rejected": -410.10654296875, "loss": 0.2371, "rewards/chosen": 0.13656006256739298, "rewards/margins": 2.857836345831553, "rewards/rejected": -2.72127628326416, "step": 6969 }, { "epoch": 0.3694378925608883, "grad_norm": 51.0, "kl": 0.39455223083496094, "learning_rate": 5e-07, "logits/chosen": -17586224.0, "logits/rejected": -1917440.75, "logps/chosen": -334.64306640625, "logps/rejected": -45.420108795166016, "loss": 0.3376, "rewards/chosen": 0.6955802100045341, "rewards/margins": 2.259186114583697, "rewards/rejected": -1.5636059045791626, "step": 6970 }, { "epoch": 0.36949089656269046, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50008368.0, "logits/rejected": -28055712.0, "logps/chosen": -221.4193115234375, "logps/rejected": -362.00654296875, "loss": 0.2626, "rewards/chosen": 0.09013671676317851, "rewards/margins": 1.984434888760249, "rewards/rejected": -1.8942981719970704, "step": 6971 }, { "epoch": 0.3695439005644926, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22521248.0, "logits/rejected": -13372223.0, "logps/chosen": -254.86331176757812, "logps/rejected": -202.44837951660156, "loss": 0.4353, "rewards/chosen": 0.002391822636127472, "rewards/margins": 0.6687848046422005, "rewards/rejected": -0.666392982006073, "step": 6972 }, { "epoch": 0.36959690456629474, "grad_norm": 74.0, "kl": 1.3609199523925781, "learning_rate": 5e-07, "logits/chosen": -28974882.285714287, "logits/rejected": 6253743.0, "logps/chosen": -279.18118722098217, "logps/rejected": -163.62210083007812, "loss": 0.4395, "rewards/chosen": 0.3665966647011893, "rewards/margins": 0.9700161474091666, "rewards/rejected": -0.6034194827079773, "step": 6973 }, { "epoch": 0.3696499085680969, "grad_norm": 47.75, "kl": 0.15687179565429688, "learning_rate": 5e-07, "logits/chosen": -11660782.0, "logits/rejected": -32575152.0, "logps/chosen": -303.1181335449219, "logps/rejected": -263.0865173339844, "loss": 0.3089, "rewards/chosen": 0.1302105039358139, "rewards/margins": 2.2805951684713364, "rewards/rejected": -2.1503846645355225, "step": 6974 }, { "epoch": 0.369702912569899, "grad_norm": 52.75, "kl": 1.0302906036376953, "learning_rate": 5e-07, "logits/chosen": -9397427.0, "logits/rejected": -14580832.0, "logps/chosen": -254.88671875, "logps/rejected": -223.8910675048828, "loss": 0.3288, "rewards/chosen": 0.658872663974762, "rewards/margins": 1.6592559218406677, "rewards/rejected": -1.0003832578659058, "step": 6975 }, { "epoch": 0.36975591657170115, "grad_norm": 41.25, "kl": 0.9477882385253906, "learning_rate": 5e-07, "logits/chosen": -71501226.66666667, "logits/rejected": -12102174.4, "logps/chosen": -722.338134765625, "logps/rejected": -403.396875, "loss": 0.1514, "rewards/chosen": 1.399580478668213, "rewards/margins": 3.894645595550537, "rewards/rejected": -2.495065116882324, "step": 6976 }, { "epoch": 0.3698089205735033, "grad_norm": 45.25, "kl": 0.19945716857910156, "learning_rate": 5e-07, "logits/chosen": -6859517.5, "logits/rejected": -34811004.0, "logps/chosen": -181.8910369873047, "logps/rejected": -236.3957977294922, "loss": 0.2991, "rewards/chosen": 0.29239195585250854, "rewards/margins": 2.217589318752289, "rewards/rejected": -1.9251973628997803, "step": 6977 }, { "epoch": 0.3698619245753054, "grad_norm": 48.75, "kl": 0.8834095001220703, "learning_rate": 5e-07, "logits/chosen": -58344900.0, "logits/rejected": -145821.125, "logps/chosen": -301.9487609863281, "logps/rejected": -129.03846740722656, "loss": 0.2834, "rewards/chosen": 0.7413539886474609, "rewards/margins": 2.428271532058716, "rewards/rejected": -1.6869175434112549, "step": 6978 }, { "epoch": 0.36991492857710756, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43322272.0, "logits/rejected": -21125124.0, "logps/chosen": -418.712890625, "logps/rejected": -299.5953369140625, "loss": 0.2976, "rewards/chosen": 0.47843475341796876, "rewards/margins": 2.4824522654215495, "rewards/rejected": -2.0040175120035806, "step": 6979 }, { "epoch": 0.3699679325789097, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22152832.0, "logits/rejected": -44843992.0, "logps/chosen": -336.2092692057292, "logps/rejected": -169.89035034179688, "loss": 0.4532, "rewards/chosen": -0.05152917901674906, "rewards/margins": 1.076866736014684, "rewards/rejected": -1.128395915031433, "step": 6980 }, { "epoch": 0.37002093658071183, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23137402.0, "logits/rejected": -59313544.0, "logps/chosen": -341.2485046386719, "logps/rejected": -315.65576171875, "loss": 0.2886, "rewards/chosen": 0.4257662892341614, "rewards/margins": 2.4110410809516907, "rewards/rejected": -1.9852747917175293, "step": 6981 }, { "epoch": 0.37007394058251397, "grad_norm": 43.75, "kl": 0.172943115234375, "learning_rate": 5e-07, "logits/chosen": -47821416.0, "logits/rejected": -22812644.0, "logps/chosen": -350.9690246582031, "logps/rejected": -318.5013122558594, "loss": 0.2715, "rewards/chosen": 0.3703170418739319, "rewards/margins": 2.453937590122223, "rewards/rejected": -2.083620548248291, "step": 6982 }, { "epoch": 0.3701269445843161, "grad_norm": 52.25, "kl": 0.38396453857421875, "learning_rate": 5e-07, "logits/chosen": -23349306.0, "logits/rejected": -16933244.0, "logps/chosen": -297.1097106933594, "logps/rejected": -409.91790771484375, "loss": 0.2817, "rewards/chosen": 0.34945154190063477, "rewards/margins": 2.3512630462646484, "rewards/rejected": -2.0018115043640137, "step": 6983 }, { "epoch": 0.37017994858611825, "grad_norm": 53.0, "kl": 1.589324951171875, "learning_rate": 5e-07, "logits/chosen": -18791560.0, "logits/rejected": -5214700.5, "logps/chosen": -1087.20751953125, "logps/rejected": -157.73092651367188, "loss": 0.2349, "rewards/chosen": 1.4624450206756592, "rewards/margins": 2.983191728591919, "rewards/rejected": -1.5207467079162598, "step": 6984 }, { "epoch": 0.3702329525879204, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19240130.0, "logits/rejected": -11619008.0, "logps/chosen": -182.59365844726562, "logps/rejected": -231.06595865885416, "loss": 0.1917, "rewards/chosen": 0.08848762512207031, "rewards/margins": 2.5284810066223145, "rewards/rejected": -2.439993381500244, "step": 6985 }, { "epoch": 0.3702859565897225, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5648993.142857143, "logits/rejected": -2808892.5, "logps/chosen": -188.36952427455358, "logps/rejected": -175.93350219726562, "loss": 0.4242, "rewards/chosen": 0.1663302183151245, "rewards/margins": 1.7730135917663574, "rewards/rejected": -1.606683373451233, "step": 6986 }, { "epoch": 0.37033896059152466, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32667988.0, "logits/rejected": -5600679.5, "logps/chosen": -248.5010528564453, "logps/rejected": -145.2677001953125, "loss": 0.3976, "rewards/chosen": -0.19535085558891296, "rewards/margins": 1.3736661970615387, "rewards/rejected": -1.5690170526504517, "step": 6987 }, { "epoch": 0.3703919645933268, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51852788.0, "logits/rejected": -20035786.666666668, "logps/chosen": -330.2142333984375, "logps/rejected": -273.1363525390625, "loss": 0.209, "rewards/chosen": -0.253744512796402, "rewards/margins": 2.4540671606858573, "rewards/rejected": -2.7078116734822593, "step": 6988 }, { "epoch": 0.37044496859512893, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20379506.0, "logits/rejected": -19949880.0, "logps/chosen": -263.13848876953125, "logps/rejected": -285.7801513671875, "loss": 0.2806, "rewards/chosen": 0.45000192523002625, "rewards/margins": 2.4840318858623505, "rewards/rejected": -2.034029960632324, "step": 6989 }, { "epoch": 0.37049797259693107, "grad_norm": 44.5, "kl": 2.0248870849609375, "learning_rate": 5e-07, "logits/chosen": -44208413.333333336, "logits/rejected": -31872832.0, "logps/chosen": -1043.113037109375, "logps/rejected": -325.5611572265625, "loss": 0.2463, "rewards/chosen": 1.5889231363932292, "rewards/margins": 3.49899164835612, "rewards/rejected": -1.9100685119628906, "step": 6990 }, { "epoch": 0.3705509765987332, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24492214.4, "logits/rejected": 11553201.333333334, "logps/chosen": -196.3399169921875, "logps/rejected": -73.1499735514323, "loss": 0.4232, "rewards/chosen": 0.15569289922714233, "rewards/margins": 0.8861478209495545, "rewards/rejected": -0.7304549217224121, "step": 6991 }, { "epoch": 0.37060398060053534, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70571445.33333333, "logits/rejected": -34330873.6, "logps/chosen": -290.2730305989583, "logps/rejected": -421.46640625, "loss": 0.2557, "rewards/chosen": -0.09403990705808003, "rewards/margins": 2.601803789536158, "rewards/rejected": -2.6958436965942383, "step": 6992 }, { "epoch": 0.3706569846023375, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55230752.0, "logits/rejected": -6031306.666666667, "logps/chosen": -481.83343505859375, "logps/rejected": -442.7613525390625, "loss": 0.2762, "rewards/chosen": -0.10865937173366547, "rewards/margins": 1.8231761207183201, "rewards/rejected": -1.9318354924519856, "step": 6993 }, { "epoch": 0.3707099886041396, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44919852.0, "logits/rejected": 2125684.75, "logps/chosen": -264.1800842285156, "logps/rejected": -246.6219940185547, "loss": 0.2918, "rewards/chosen": 0.24496307969093323, "rewards/margins": 2.3592521250247955, "rewards/rejected": -2.1142890453338623, "step": 6994 }, { "epoch": 0.37076299260594175, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6953045.0, "logits/rejected": -9055893.0, "logps/chosen": -83.9131851196289, "logps/rejected": -212.6270751953125, "loss": 0.4531, "rewards/chosen": -0.524358868598938, "rewards/margins": 0.4914461374282837, "rewards/rejected": -1.0158050060272217, "step": 6995 }, { "epoch": 0.3708159966077439, "grad_norm": 97.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50619621.333333336, "logits/rejected": -7453947.2, "logps/chosen": -422.8662923177083, "logps/rejected": -338.25556640625, "loss": 0.3124, "rewards/chosen": 0.028421024481455486, "rewards/margins": 1.8945602456728619, "rewards/rejected": -1.8661392211914063, "step": 6996 }, { "epoch": 0.37086900060954603, "grad_norm": 35.25, "kl": 0.22723770141601562, "learning_rate": 5e-07, "logits/chosen": -5840920.8, "logits/rejected": -1331505.5833333333, "logps/chosen": -159.68538818359374, "logps/rejected": -115.17355346679688, "loss": 0.3641, "rewards/chosen": 0.39080333709716797, "rewards/margins": 1.5159849325815837, "rewards/rejected": -1.1251815954844158, "step": 6997 }, { "epoch": 0.37092200461134817, "grad_norm": 44.5, "kl": 0.8125247955322266, "learning_rate": 5e-07, "logits/chosen": -25252686.4, "logits/rejected": -32436336.0, "logps/chosen": -220.2769775390625, "logps/rejected": -500.6358642578125, "loss": 0.3938, "rewards/chosen": 0.21739549636840821, "rewards/margins": 1.6475382169087727, "rewards/rejected": -1.4301427205403645, "step": 6998 }, { "epoch": 0.3709750086131503, "grad_norm": 36.25, "kl": 0.3506031036376953, "learning_rate": 5e-07, "logits/chosen": 5346574.666666667, "logits/rejected": -17386184.0, "logps/chosen": -46.296112060546875, "logps/rejected": -262.16630859375, "loss": 0.3346, "rewards/chosen": 0.1711215376853943, "rewards/margins": 1.5074958205223083, "rewards/rejected": -1.336374282836914, "step": 6999 }, { "epoch": 0.37102801261495244, "grad_norm": 48.5, "kl": 0.6768836975097656, "learning_rate": 5e-07, "logits/chosen": -23589112.0, "logits/rejected": -19677774.0, "logps/chosen": -467.5768229166667, "logps/rejected": -265.92108154296875, "loss": 0.3695, "rewards/chosen": 0.36057337125142414, "rewards/margins": 3.201508124669393, "rewards/rejected": -2.8409347534179688, "step": 7000 }, { "epoch": 0.3710810166167546, "grad_norm": 38.25, "kl": 0.11521720886230469, "learning_rate": 5e-07, "logits/chosen": 1075200.0, "logits/rejected": -16848582.4, "logps/chosen": -189.15840657552084, "logps/rejected": -305.522265625, "loss": 0.2782, "rewards/chosen": 0.1315007507801056, "rewards/margins": 2.2459756195545197, "rewards/rejected": -2.114474868774414, "step": 7001 }, { "epoch": 0.3711340206185567, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26724072.0, "logits/rejected": -11135817.333333334, "logps/chosen": -372.975927734375, "logps/rejected": -178.39493815104166, "loss": 0.4573, "rewards/chosen": -0.3855090618133545, "rewards/margins": 0.8026196479797363, "rewards/rejected": -1.1881287097930908, "step": 7002 }, { "epoch": 0.37118702462035885, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41266842.666666664, "logits/rejected": -10194105.6, "logps/chosen": -172.15633138020834, "logps/rejected": -146.54376220703125, "loss": 0.3015, "rewards/chosen": -0.030875136454900105, "rewards/margins": 1.9606949547926587, "rewards/rejected": -1.9915700912475587, "step": 7003 }, { "epoch": 0.371240028622161, "grad_norm": 80.5, "kl": 0.3751983642578125, "learning_rate": 5e-07, "logits/chosen": -40738898.28571428, "logits/rejected": -45560892.0, "logps/chosen": -354.19234793526783, "logps/rejected": -114.62269592285156, "loss": 0.4075, "rewards/chosen": 0.2848550932747977, "rewards/margins": 2.0270349638802663, "rewards/rejected": -1.7421798706054688, "step": 7004 }, { "epoch": 0.3712930326239631, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10065825.0, "logits/rejected": -49050368.0, "logps/chosen": -304.9126892089844, "logps/rejected": -437.26885986328125, "loss": 0.1808, "rewards/chosen": 0.7002983093261719, "rewards/margins": 4.385247230529785, "rewards/rejected": -3.6849489212036133, "step": 7005 }, { "epoch": 0.37134603662576526, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44104037.333333336, "logits/rejected": 11331299.0, "logps/chosen": -236.18916829427084, "logps/rejected": -447.5606384277344, "loss": 0.3843, "rewards/chosen": 0.2419947385787964, "rewards/margins": 1.7360926866531372, "rewards/rejected": -1.4940979480743408, "step": 7006 }, { "epoch": 0.3713990406275674, "grad_norm": 63.75, "kl": 0.3405914306640625, "learning_rate": 5e-07, "logits/chosen": 14374053.333333334, "logits/rejected": -5035523.0, "logps/chosen": -442.0461018880208, "logps/rejected": -240.8938446044922, "loss": 0.2807, "rewards/chosen": 0.8166842460632324, "rewards/margins": 3.1014277935028076, "rewards/rejected": -2.284743547439575, "step": 7007 }, { "epoch": 0.37145204462936954, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1767067.0, "logits/rejected": -29766520.0, "logps/chosen": -175.99349975585938, "logps/rejected": -632.1900634765625, "loss": 0.2284, "rewards/chosen": 0.7007512450218201, "rewards/margins": 3.5619184374809265, "rewards/rejected": -2.8611671924591064, "step": 7008 }, { "epoch": 0.3715050486311717, "grad_norm": 44.5, "kl": 1.6315078735351562, "learning_rate": 5e-07, "logits/chosen": -11464833.6, "logits/rejected": -23033037.333333332, "logps/chosen": -454.319677734375, "logps/rejected": -275.5329182942708, "loss": 0.2046, "rewards/chosen": 1.3246126174926758, "rewards/margins": 2.8152049382527666, "rewards/rejected": -1.490592320760091, "step": 7009 }, { "epoch": 0.3715580526329738, "grad_norm": 58.0, "kl": 0.9976720809936523, "learning_rate": 5e-07, "logits/chosen": -4123840.0, "logits/rejected": 2844697.5, "logps/chosen": -323.15789794921875, "logps/rejected": -41.29841232299805, "loss": 0.3755, "rewards/chosen": 0.49335257212320965, "rewards/margins": 1.2529209057490032, "rewards/rejected": -0.7595683336257935, "step": 7010 }, { "epoch": 0.37161105663477595, "grad_norm": 84.5, "kl": 3.8500022888183594, "learning_rate": 5e-07, "logits/chosen": 18028732.0, "logps/chosen": -542.155517578125, "loss": 0.4806, "rewards/chosen": 0.5304558873176575, "step": 7011 }, { "epoch": 0.3716640606365781, "grad_norm": 49.25, "kl": 0.08867263793945312, "learning_rate": 5e-07, "logits/chosen": -37782473.6, "logits/rejected": -11033582.666666666, "logps/chosen": -657.3404296875, "logps/rejected": -208.30708821614584, "loss": 0.2847, "rewards/chosen": 0.5301385879516601, "rewards/margins": 3.1173603693644205, "rewards/rejected": -2.5872217814127603, "step": 7012 }, { "epoch": 0.3717170646383802, "grad_norm": 53.75, "kl": 0.6162986755371094, "learning_rate": 5e-07, "logits/chosen": -35827088.0, "logits/rejected": -32984228.0, "logps/chosen": -305.965576171875, "logps/rejected": -245.2124481201172, "loss": 0.3249, "rewards/chosen": 0.06880141794681549, "rewards/margins": 1.7953952699899673, "rewards/rejected": -1.7265938520431519, "step": 7013 }, { "epoch": 0.37177006864018236, "grad_norm": 43.75, "kl": 0.29744911193847656, "learning_rate": 5e-07, "logits/chosen": -27773434.0, "logits/rejected": -33541542.0, "logps/chosen": -242.61248779296875, "logps/rejected": -386.0231018066406, "loss": 0.2877, "rewards/chosen": 0.10229474306106567, "rewards/margins": 2.817838728427887, "rewards/rejected": -2.7155439853668213, "step": 7014 }, { "epoch": 0.37182307264198444, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44910284.8, "logits/rejected": -29317248.0, "logps/chosen": -438.351611328125, "logps/rejected": -257.7604166666667, "loss": 0.3223, "rewards/chosen": 0.22018160820007324, "rewards/margins": 2.4199038982391357, "rewards/rejected": -2.1997222900390625, "step": 7015 }, { "epoch": 0.3718760766437866, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64511216.0, "logits/rejected": 8072427.2, "logps/chosen": -563.4872639973959, "logps/rejected": -148.92745361328124, "loss": 0.3693, "rewards/chosen": -0.11525142192840576, "rewards/margins": 0.9381815195083618, "rewards/rejected": -1.0534329414367676, "step": 7016 }, { "epoch": 0.3719290806455887, "grad_norm": 55.75, "kl": 0.7912979125976562, "learning_rate": 5e-07, "logits/chosen": -17153065.14285714, "logits/rejected": 10261014.0, "logps/chosen": -259.0077427455357, "logps/rejected": -449.50799560546875, "loss": 0.5031, "rewards/chosen": -0.2573972429547991, "rewards/margins": 3.2764834676470076, "rewards/rejected": -3.5338807106018066, "step": 7017 }, { "epoch": 0.37198208464739085, "grad_norm": 48.5, "kl": 0.140594482421875, "learning_rate": 5e-07, "logits/chosen": -17344137.6, "logits/rejected": -12899274.666666666, "logps/chosen": -600.8666015625, "logps/rejected": -201.68975830078125, "loss": 0.2552, "rewards/chosen": 0.913119888305664, "rewards/margins": 3.004449780782064, "rewards/rejected": -2.0913298924764, "step": 7018 }, { "epoch": 0.372035088649193, "grad_norm": 78.5, "kl": 1.3071403503417969, "learning_rate": 5e-07, "logits/chosen": -70483952.0, "logits/rejected": -41860700.0, "logps/chosen": -858.7116088867188, "logps/rejected": -481.6891784667969, "loss": 0.2267, "rewards/chosen": 0.7419143915176392, "rewards/margins": 4.169976592063904, "rewards/rejected": -3.4280622005462646, "step": 7019 }, { "epoch": 0.37208809265099513, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1375008.5, "logits/rejected": -27633198.0, "logps/chosen": -128.36143493652344, "logps/rejected": -355.4045715332031, "loss": 0.3441, "rewards/chosen": 0.09504210203886032, "rewards/margins": 1.6152070686221123, "rewards/rejected": -1.520164966583252, "step": 7020 }, { "epoch": 0.37214109665279727, "grad_norm": 41.5, "kl": 1.0695457458496094, "learning_rate": 5e-07, "logits/chosen": -37961676.0, "logits/rejected": -46220088.0, "logps/chosen": -376.70794677734375, "logps/rejected": -218.37017822265625, "loss": 0.2347, "rewards/chosen": 0.6348875164985657, "rewards/margins": 2.7438254952430725, "rewards/rejected": -2.108937978744507, "step": 7021 }, { "epoch": 0.3721941006545994, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54614922.666666664, "logits/rejected": 21746620.8, "logps/chosen": -174.3583984375, "logps/rejected": -328.4154541015625, "loss": 0.3361, "rewards/chosen": -0.43887921174367267, "rewards/margins": 1.2503312985102337, "rewards/rejected": -1.6892105102539063, "step": 7022 }, { "epoch": 0.37224710465640154, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42580202.666666664, "logits/rejected": -3313398.5, "logps/chosen": -615.4503987630209, "logps/rejected": -74.64359283447266, "loss": 0.2941, "rewards/chosen": 0.9186222553253174, "rewards/margins": 2.1710814237594604, "rewards/rejected": -1.252459168434143, "step": 7023 }, { "epoch": 0.3723001086582037, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42228884.0, "logits/rejected": -1705856.875, "logps/chosen": -363.26971435546875, "logps/rejected": -200.57052612304688, "loss": 0.2787, "rewards/chosen": 0.36510133743286133, "rewards/margins": 2.336042642593384, "rewards/rejected": -1.9709413051605225, "step": 7024 }, { "epoch": 0.3723531126600058, "grad_norm": 46.75, "kl": 1.8804473876953125, "learning_rate": 5e-07, "logits/chosen": -46066992.0, "logits/rejected": -59654456.0, "logps/chosen": -424.5526123046875, "logps/rejected": -276.57867431640625, "loss": 0.271, "rewards/chosen": 0.7766473293304443, "rewards/margins": 2.1665183305740356, "rewards/rejected": -1.3898710012435913, "step": 7025 }, { "epoch": 0.37240611666180795, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21415882.666666668, "logits/rejected": -2271192.8, "logps/chosen": -252.5464070638021, "logps/rejected": -161.0020263671875, "loss": 0.2332, "rewards/chosen": 0.5344189008076986, "rewards/margins": 2.4788746198018394, "rewards/rejected": -1.9444557189941407, "step": 7026 }, { "epoch": 0.3724591206636101, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4540921.0, "logits/rejected": -33835794.28571428, "logps/chosen": -404.16912841796875, "logps/rejected": -356.10756138392856, "loss": 0.187, "rewards/chosen": -0.407583624124527, "rewards/margins": 1.9301416065011705, "rewards/rejected": -2.3377252306256975, "step": 7027 }, { "epoch": 0.3725121246654122, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6072625.2, "logits/rejected": -30023136.0, "logps/chosen": -343.67490234375, "logps/rejected": -567.2301839192709, "loss": 0.3658, "rewards/chosen": 0.053844332695007324, "rewards/margins": 2.7631173531214395, "rewards/rejected": -2.709273020426432, "step": 7028 }, { "epoch": 0.37256512866721436, "grad_norm": 54.25, "kl": 2.268716812133789, "learning_rate": 5e-07, "logits/chosen": -14382692.8, "logits/rejected": -44273482.666666664, "logps/chosen": -329.932080078125, "logps/rejected": -565.97216796875, "loss": 0.2959, "rewards/chosen": 0.5613431453704834, "rewards/margins": 2.5859740098317463, "rewards/rejected": -2.024630864461263, "step": 7029 }, { "epoch": 0.3726181326690165, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10091595.333333334, "logits/rejected": -12805488.0, "logps/chosen": -103.40767415364583, "logps/rejected": -162.355078125, "loss": 0.286, "rewards/chosen": 0.09916571776072185, "rewards/margins": 1.9616182247797649, "rewards/rejected": -1.862452507019043, "step": 7030 }, { "epoch": 0.37267113667081864, "grad_norm": 54.25, "kl": 0.2134990692138672, "learning_rate": 5e-07, "logits/chosen": -64190468.0, "logits/rejected": -12288237.333333334, "logps/chosen": -485.43621826171875, "logps/rejected": -179.57210286458334, "loss": 0.1915, "rewards/chosen": 1.047821044921875, "rewards/margins": 2.896938482920329, "rewards/rejected": -1.8491174379984539, "step": 7031 }, { "epoch": 0.3727241406726208, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29118837.333333332, "logits/rejected": -19096252.8, "logps/chosen": -309.99017333984375, "logps/rejected": -289.212109375, "loss": 0.2683, "rewards/chosen": 0.7572197119394938, "rewards/margins": 2.337379471460978, "rewards/rejected": -1.5801597595214845, "step": 7032 }, { "epoch": 0.3727771446744229, "grad_norm": 40.75, "kl": 0.20102310180664062, "learning_rate": 5e-07, "logits/chosen": -6960622.4, "logits/rejected": -36970741.333333336, "logps/chosen": -190.82525634765625, "logps/rejected": -501.04541015625, "loss": 0.2786, "rewards/chosen": 0.5689323425292969, "rewards/margins": 3.4629587809244793, "rewards/rejected": -2.894026438395182, "step": 7033 }, { "epoch": 0.37283014867622505, "grad_norm": 47.0, "kl": 0.8353996276855469, "learning_rate": 5e-07, "logits/chosen": -18661328.0, "logits/rejected": -4979653.6, "logps/chosen": -445.4058837890625, "logps/rejected": -124.412548828125, "loss": 0.3058, "rewards/chosen": 0.19669719537099203, "rewards/margins": 2.0277602752049764, "rewards/rejected": -1.8310630798339844, "step": 7034 }, { "epoch": 0.3728831526780272, "grad_norm": 65.5, "kl": 1.6596765518188477, "learning_rate": 5e-07, "logits/chosen": -18223265.333333332, "logits/rejected": -38058316.0, "logps/chosen": -387.5611979166667, "logps/rejected": -278.98272705078125, "loss": 0.3975, "rewards/chosen": 0.24727378288904825, "rewards/margins": 2.3369329969088235, "rewards/rejected": -2.0896592140197754, "step": 7035 }, { "epoch": 0.3729361566798293, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25803382.4, "logits/rejected": -55139306.666666664, "logps/chosen": -209.8384033203125, "logps/rejected": -706.5891927083334, "loss": 0.2724, "rewards/chosen": 0.37397217750549316, "rewards/margins": 4.635155916213989, "rewards/rejected": -4.261183738708496, "step": 7036 }, { "epoch": 0.37298916068163146, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41093125.333333336, "logits/rejected": -41386553.6, "logps/chosen": -226.67862955729166, "logps/rejected": -343.164453125, "loss": 0.3042, "rewards/chosen": -0.04870899518330892, "rewards/margins": 1.7239195982615154, "rewards/rejected": -1.7726285934448243, "step": 7037 }, { "epoch": 0.3730421646834336, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 87821424.0, "logits/rejected": -396979.0, "logps/chosen": -416.1397399902344, "logps/rejected": -252.372802734375, "loss": 0.3853, "rewards/chosen": -0.37970906496047974, "rewards/margins": 1.4052539467811584, "rewards/rejected": -1.7849630117416382, "step": 7038 }, { "epoch": 0.37309516868523573, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29954448.0, "logits/rejected": -35461141.333333336, "logps/chosen": -504.61541748046875, "logps/rejected": -307.2444661458333, "loss": 0.2455, "rewards/chosen": 0.21459656953811646, "rewards/margins": 1.980585555235545, "rewards/rejected": -1.7659889856974285, "step": 7039 }, { "epoch": 0.37314817268703787, "grad_norm": 65.5, "kl": 0.317901611328125, "learning_rate": 5e-07, "logits/chosen": -16319246.0, "logits/rejected": 3676154.0, "logps/chosen": -324.5988464355469, "logps/rejected": -231.26260375976562, "loss": 0.3291, "rewards/chosen": 0.40224790573120117, "rewards/margins": 1.9998068809509277, "rewards/rejected": -1.5975589752197266, "step": 7040 }, { "epoch": 0.37320117668884, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5173798.5, "logits/rejected": -29596160.0, "logps/chosen": -26.115324020385742, "logps/rejected": -339.7584533691406, "loss": 0.3689, "rewards/chosen": -0.3671339452266693, "rewards/margins": 1.4413317739963531, "rewards/rejected": -1.8084657192230225, "step": 7041 }, { "epoch": 0.37325418069064215, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37747416.0, "logits/rejected": -43419232.0, "logps/chosen": -213.3463134765625, "logps/rejected": -393.56640625, "loss": 0.2597, "rewards/chosen": 0.2416084259748459, "rewards/margins": 1.7541497200727463, "rewards/rejected": -1.5125412940979004, "step": 7042 }, { "epoch": 0.3733071846924443, "grad_norm": 54.25, "kl": 1.2506675720214844, "learning_rate": 5e-07, "logits/chosen": -13264068.0, "logits/rejected": -76848416.0, "logps/chosen": -339.5750427246094, "logps/rejected": -232.88534545898438, "loss": 0.2749, "rewards/chosen": 0.7568035125732422, "rewards/margins": 2.5955642461776733, "rewards/rejected": -1.8387607336044312, "step": 7043 }, { "epoch": 0.3733601886942464, "grad_norm": 35.5, "kl": 0.017406463623046875, "learning_rate": 5e-07, "logits/chosen": -134605312.0, "logits/rejected": -89488960.0, "logps/chosen": -396.8280436197917, "logps/rejected": -477.35166015625, "loss": 0.1763, "rewards/chosen": 1.2363158861796062, "rewards/margins": 3.7943206469217934, "rewards/rejected": -2.5580047607421874, "step": 7044 }, { "epoch": 0.37341319269604856, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55605040.0, "logits/rejected": -12780065.333333334, "logps/chosen": -211.27613830566406, "logps/rejected": -516.9613444010416, "loss": 0.22, "rewards/chosen": 0.025592505931854248, "rewards/margins": 2.441910445690155, "rewards/rejected": -2.416317939758301, "step": 7045 }, { "epoch": 0.3734661966978507, "grad_norm": 64.0, "kl": 0.6199378967285156, "learning_rate": 5e-07, "logits/chosen": -45243670.4, "logits/rejected": -6039697.333333333, "logps/chosen": -478.6923828125, "logps/rejected": -100.95509847005208, "loss": 0.344, "rewards/chosen": 0.4254270076751709, "rewards/margins": 1.6644694487253826, "rewards/rejected": -1.2390424410502117, "step": 7046 }, { "epoch": 0.37351920069965283, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19330244.0, "logits/rejected": 1737222.0, "logps/chosen": -134.82395935058594, "logps/rejected": -226.937255859375, "loss": 0.3037, "rewards/chosen": 0.4491334855556488, "rewards/margins": 1.9258473813533783, "rewards/rejected": -1.4767138957977295, "step": 7047 }, { "epoch": 0.37357220470145497, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62775466.666666664, "logits/rejected": -35486745.6, "logps/chosen": -297.69708251953125, "logps/rejected": -350.27197265625, "loss": 0.2047, "rewards/chosen": 0.4195886452992757, "rewards/margins": 3.08016947110494, "rewards/rejected": -2.6605808258056642, "step": 7048 }, { "epoch": 0.3736252087032571, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80104106.66666667, "logits/rejected": -22545040.0, "logps/chosen": -561.176025390625, "logps/rejected": -328.0685546875, "loss": 0.316, "rewards/chosen": 0.1803929607073466, "rewards/margins": 2.232974366346995, "rewards/rejected": -2.0525814056396485, "step": 7049 }, { "epoch": 0.37367821270505924, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7992810.0, "logits/rejected": -23612236.8, "logps/chosen": -302.8081868489583, "logps/rejected": -231.8898681640625, "loss": 0.2534, "rewards/chosen": 0.7959233919779459, "rewards/margins": 2.425391165415446, "rewards/rejected": -1.6294677734375, "step": 7050 }, { "epoch": 0.3737312167068614, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4753311.5, "logits/rejected": -42869576.0, "logps/chosen": -46.85579299926758, "logps/rejected": -616.7483520507812, "loss": 0.27, "rewards/chosen": 0.015982285141944885, "rewards/margins": 3.0716992765665054, "rewards/rejected": -3.0557169914245605, "step": 7051 }, { "epoch": 0.3737842207086635, "grad_norm": 52.5, "kl": 0.7262916564941406, "learning_rate": 5e-07, "logits/chosen": -42018445.71428572, "logits/rejected": 3897909.75, "logps/chosen": -188.00251116071428, "logps/rejected": -17.153207778930664, "loss": 0.5245, "rewards/chosen": -0.061591080256870816, "rewards/margins": 0.1899592067514147, "rewards/rejected": -0.2515502870082855, "step": 7052 }, { "epoch": 0.37383722471046565, "grad_norm": 49.75, "kl": 0.0699453353881836, "learning_rate": 5e-07, "logits/chosen": -23568282.666666668, "logits/rejected": -1122193.25, "logps/chosen": -317.0397542317708, "logps/rejected": -39.96957778930664, "loss": 0.2825, "rewards/chosen": 1.240575949350993, "rewards/margins": 1.9781588117281597, "rewards/rejected": -0.7375828623771667, "step": 7053 }, { "epoch": 0.3738902287122678, "grad_norm": 53.5, "kl": 1.0933208465576172, "learning_rate": 5e-07, "logits/chosen": -49936453.333333336, "logits/rejected": 3449133.0, "logps/chosen": -442.6600748697917, "logps/rejected": -366.48687744140625, "loss": 0.3825, "rewards/chosen": 0.15570354461669922, "rewards/margins": 3.1779990196228027, "rewards/rejected": -3.0222954750061035, "step": 7054 }, { "epoch": 0.37394323271406993, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31978740.0, "logits/rejected": -51359344.0, "logps/chosen": -488.0114440917969, "logps/rejected": -340.82427978515625, "loss": 0.3054, "rewards/chosen": 0.3616889715194702, "rewards/margins": 1.9158416986465454, "rewards/rejected": -1.5541527271270752, "step": 7055 }, { "epoch": 0.37399623671587207, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39348252.0, "logits/rejected": 119626672.0, "logps/chosen": -157.9178466796875, "logps/rejected": -393.33465576171875, "loss": 0.258, "rewards/chosen": 0.49049168825149536, "rewards/margins": 2.6498408913612366, "rewards/rejected": -2.159349203109741, "step": 7056 }, { "epoch": 0.3740492407176742, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57977972.0, "logits/rejected": -39061558.85714286, "logps/chosen": -515.2877197265625, "logps/rejected": -556.8840680803571, "loss": 0.106, "rewards/chosen": 0.9682861566543579, "rewards/margins": 3.7362975222723827, "rewards/rejected": -2.7680113656180247, "step": 7057 }, { "epoch": 0.37410224471947634, "grad_norm": 47.75, "kl": 0.5124721527099609, "learning_rate": 5e-07, "logits/chosen": -5905022.4, "logits/rejected": -14116604.0, "logps/chosen": -170.8555419921875, "logps/rejected": -206.5859375, "loss": 0.4218, "rewards/chosen": -0.23565163612365722, "rewards/margins": 1.4358333110809327, "rewards/rejected": -1.6714849472045898, "step": 7058 }, { "epoch": 0.3741552487212785, "grad_norm": 54.0, "kl": 1.1986160278320312, "learning_rate": 5e-07, "logits/chosen": -13721131.0, "logits/rejected": -17795632.0, "logps/chosen": -200.13497924804688, "logps/rejected": -295.36138916015625, "loss": 0.3279, "rewards/chosen": 0.6163836121559143, "rewards/margins": 1.839957058429718, "rewards/rejected": -1.2235734462738037, "step": 7059 }, { "epoch": 0.3742082527230806, "grad_norm": 52.0, "kl": 2.4542465209960938, "learning_rate": 5e-07, "logits/chosen": -34120828.0, "logits/rejected": -24414956.0, "logps/chosen": -405.66839599609375, "logps/rejected": -307.12286376953125, "loss": 0.2644, "rewards/chosen": 0.8315380215644836, "rewards/margins": 2.732488691806793, "rewards/rejected": -1.9009506702423096, "step": 7060 }, { "epoch": 0.37426125672488275, "grad_norm": 54.5, "kl": 0.2662353515625, "learning_rate": 5e-07, "logits/chosen": -15257125.0, "logits/rejected": -73509328.0, "logps/chosen": -163.15255737304688, "logps/rejected": -329.22540283203125, "loss": 0.3963, "rewards/chosen": -0.3624342679977417, "rewards/margins": 1.1087838411331177, "rewards/rejected": -1.4712181091308594, "step": 7061 }, { "epoch": 0.3743142607266849, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1052233.5, "logits/rejected": -38999970.666666664, "logps/chosen": -285.710693359375, "logps/rejected": -416.101806640625, "loss": 0.1194, "rewards/chosen": 0.9119136929512024, "rewards/margins": 3.7719794710477195, "rewards/rejected": -2.860065778096517, "step": 7062 }, { "epoch": 0.374367264728487, "grad_norm": 45.75, "kl": 0.4572906494140625, "learning_rate": 5e-07, "logits/chosen": -29208754.666666668, "logits/rejected": -14370208.0, "logps/chosen": -192.6225382486979, "logps/rejected": -157.6088104248047, "loss": 0.401, "rewards/chosen": 0.16175758838653564, "rewards/margins": 1.5967903137207031, "rewards/rejected": -1.4350327253341675, "step": 7063 }, { "epoch": 0.37442026873028916, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55775404.0, "logits/rejected": -8852594.666666666, "logps/chosen": -303.17926025390625, "logps/rejected": -159.36346435546875, "loss": 0.2688, "rewards/chosen": 1.1312668323516846, "rewards/margins": 2.3887741565704346, "rewards/rejected": -1.25750732421875, "step": 7064 }, { "epoch": 0.3744732727320913, "grad_norm": 63.0, "kl": 1.627640724182129, "learning_rate": 5e-07, "logits/chosen": -37887993.6, "logits/rejected": -21986757.333333332, "logps/chosen": -651.99404296875, "logps/rejected": -294.6772867838542, "loss": 0.3327, "rewards/chosen": 1.0512951850891112, "rewards/margins": 2.0271588802337646, "rewards/rejected": -0.9758636951446533, "step": 7065 }, { "epoch": 0.3745262767338934, "grad_norm": 43.0, "kl": 0.19940757751464844, "learning_rate": 5e-07, "logits/chosen": -15974726.4, "logits/rejected": -4530257.333333333, "logps/chosen": -496.019482421875, "logps/rejected": -234.4996337890625, "loss": 0.2266, "rewards/chosen": 0.9911593437194824, "rewards/margins": 3.5413556734720864, "rewards/rejected": -2.550196329752604, "step": 7066 }, { "epoch": 0.3745792807356955, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36901408.0, "logits/rejected": -45868853.333333336, "logps/chosen": -251.2335205078125, "logps/rejected": -412.4189860026042, "loss": 0.2884, "rewards/chosen": 0.47043638229370116, "rewards/margins": 2.5226926485697425, "rewards/rejected": -2.0522562662760415, "step": 7067 }, { "epoch": 0.37463228473749766, "grad_norm": 60.25, "kl": 0.3827037811279297, "learning_rate": 5e-07, "logits/chosen": -13483676.0, "logits/rejected": -13740264.0, "logps/chosen": -298.9909362792969, "logps/rejected": -439.59906005859375, "loss": 0.2833, "rewards/chosen": 0.3255484700202942, "rewards/margins": 2.7941046357154846, "rewards/rejected": -2.4685561656951904, "step": 7068 }, { "epoch": 0.3746852887392998, "grad_norm": 36.75, "kl": 0.8407974243164062, "learning_rate": 5e-07, "logits/chosen": -19061957.333333332, "logits/rejected": -17245209.6, "logps/chosen": -813.698486328125, "logps/rejected": -148.1197998046875, "loss": 0.3107, "rewards/chosen": 0.6988104184468588, "rewards/margins": 1.736287053426107, "rewards/rejected": -1.0374766349792481, "step": 7069 }, { "epoch": 0.37473829274110193, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77746352.0, "logits/rejected": -21589257.333333332, "logps/chosen": -252.30520629882812, "logps/rejected": -290.2049153645833, "loss": 0.2829, "rewards/chosen": -0.07272262871265411, "rewards/margins": 1.7209677348534267, "rewards/rejected": -1.7936903635660808, "step": 7070 }, { "epoch": 0.37479129674290407, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8294883.2, "logits/rejected": -44961653.333333336, "logps/chosen": -303.456787109375, "logps/rejected": -315.1382649739583, "loss": 0.3195, "rewards/chosen": 0.2974076271057129, "rewards/margins": 2.1417369842529297, "rewards/rejected": -1.8443293571472168, "step": 7071 }, { "epoch": 0.3748443007447062, "grad_norm": 57.25, "kl": 0.7388191223144531, "learning_rate": 5e-07, "logits/chosen": -29847129.6, "logits/rejected": -46618954.666666664, "logps/chosen": -358.966455078125, "logps/rejected": -285.9901123046875, "loss": 0.3308, "rewards/chosen": 0.38778321743011473, "rewards/margins": 2.3102003812789915, "rewards/rejected": -1.922417163848877, "step": 7072 }, { "epoch": 0.37489730474650834, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8494987.0, "logits/rejected": -34467744.0, "logps/chosen": -29.69542121887207, "logps/rejected": -342.046875, "loss": 0.2661, "rewards/chosen": 0.016289710998535156, "rewards/margins": 1.8649476369222004, "rewards/rejected": -1.8486579259236653, "step": 7073 }, { "epoch": 0.3749503087483105, "grad_norm": 51.75, "kl": 0.45047760009765625, "learning_rate": 5e-07, "logits/chosen": -41489648.0, "logits/rejected": -28805514.0, "logps/chosen": -428.15924072265625, "logps/rejected": -225.3917999267578, "loss": 0.2924, "rewards/chosen": 0.12004852294921875, "rewards/margins": 2.366229295730591, "rewards/rejected": -2.246180772781372, "step": 7074 }, { "epoch": 0.3750033127501126, "grad_norm": 48.0, "kl": 1.8759422302246094, "learning_rate": 5e-07, "logits/chosen": -6813393.6, "logits/rejected": -5866744.0, "logps/chosen": -153.81060791015625, "logps/rejected": -122.54629516601562, "loss": 0.3039, "rewards/chosen": 0.6082148075103759, "rewards/margins": 2.547793817520142, "rewards/rejected": -1.9395790100097656, "step": 7075 }, { "epoch": 0.37505631675191475, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17558994.0, "logits/rejected": -29296.5, "logps/chosen": -502.74822998046875, "logps/rejected": -69.92631530761719, "loss": 0.3112, "rewards/chosen": 0.8780258297920227, "rewards/margins": 1.7746230363845825, "rewards/rejected": -0.8965972065925598, "step": 7076 }, { "epoch": 0.3751093207537169, "grad_norm": 60.0, "kl": 1.3338394165039062, "learning_rate": 5e-07, "logits/chosen": -60229176.0, "logits/rejected": -24064244.0, "logps/chosen": -757.0198364257812, "logps/rejected": -226.3690185546875, "loss": 0.2913, "rewards/chosen": 0.7556110620498657, "rewards/margins": 2.008762001991272, "rewards/rejected": -1.2531509399414062, "step": 7077 }, { "epoch": 0.375162324755519, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32237560.0, "logits/rejected": -29307000.0, "logps/chosen": -278.4027099609375, "logps/rejected": -389.1505126953125, "loss": 0.236, "rewards/chosen": 0.3206976056098938, "rewards/margins": 3.4486783146858215, "rewards/rejected": -3.1279807090759277, "step": 7078 }, { "epoch": 0.37521532875732116, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8402851.0, "logits/rejected": -27278272.0, "logps/chosen": -457.20648193359375, "logps/rejected": -202.81131998697916, "loss": 0.1434, "rewards/chosen": 1.5609421730041504, "rewards/margins": 3.669886747996012, "rewards/rejected": -2.108944574991862, "step": 7079 }, { "epoch": 0.3752683327591233, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6148671.5, "logits/rejected": -25637909.333333332, "logps/chosen": -216.93704223632812, "logps/rejected": -283.0099283854167, "loss": 0.2527, "rewards/chosen": 0.5290008783340454, "rewards/margins": 2.2449233929316206, "rewards/rejected": -1.715922514597575, "step": 7080 }, { "epoch": 0.37532133676092544, "grad_norm": 50.25, "kl": 0.3545570373535156, "learning_rate": 5e-07, "logits/chosen": -35146872.0, "logits/rejected": -47900140.8, "logps/chosen": -562.5924886067709, "logps/rejected": -326.744970703125, "loss": 0.1889, "rewards/chosen": 0.8444796403249105, "rewards/margins": 3.330087169011434, "rewards/rejected": -2.4856075286865233, "step": 7081 }, { "epoch": 0.3753743407627276, "grad_norm": 53.5, "kl": 0.46047496795654297, "learning_rate": 5e-07, "logits/chosen": -61476570.666666664, "logits/rejected": -33088188.0, "logps/chosen": -206.93025716145834, "logps/rejected": -329.49462890625, "loss": 0.4115, "rewards/chosen": 0.06359481811523438, "rewards/margins": 2.074535369873047, "rewards/rejected": -2.0109405517578125, "step": 7082 }, { "epoch": 0.3754273447645297, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40315124.0, "logits/rejected": -14854088.0, "logps/chosen": -166.54930114746094, "logps/rejected": -304.1424560546875, "loss": 0.2634, "rewards/chosen": 0.4346572756767273, "rewards/margins": 1.951276957988739, "rewards/rejected": -1.5166196823120117, "step": 7083 }, { "epoch": 0.37548034876633185, "grad_norm": 48.0, "kl": 0.9572219848632812, "learning_rate": 5e-07, "logits/chosen": -39341904.0, "logits/rejected": -14809860.0, "logps/chosen": -548.36025390625, "logps/rejected": -190.9879353841146, "loss": 0.3846, "rewards/chosen": 0.43967227935791015, "rewards/margins": 1.9286452611287435, "rewards/rejected": -1.4889729817708333, "step": 7084 }, { "epoch": 0.375533352768134, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12860928.0, "logits/rejected": 26715664.0, "logps/chosen": -300.99920654296875, "logps/rejected": -615.4967651367188, "loss": 0.2717, "rewards/chosen": 0.5819511413574219, "rewards/margins": 3.419898509979248, "rewards/rejected": -2.837947368621826, "step": 7085 }, { "epoch": 0.3755863567699361, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12500809.6, "logits/rejected": 13555096.0, "logps/chosen": -199.09466552734375, "logps/rejected": -425.0453287760417, "loss": 0.3256, "rewards/chosen": 0.22441282272338867, "rewards/margins": 2.234916591644287, "rewards/rejected": -2.0105037689208984, "step": 7086 }, { "epoch": 0.37563936077173826, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18817402.0, "logits/rejected": -18915482.285714287, "logps/chosen": -369.41357421875, "logps/rejected": -409.868408203125, "loss": 0.1545, "rewards/chosen": 0.5763183832168579, "rewards/margins": 3.3067369971956526, "rewards/rejected": -2.7304186139787947, "step": 7087 }, { "epoch": 0.3756923647735404, "grad_norm": 60.25, "kl": 0.1117095947265625, "learning_rate": 5e-07, "logits/chosen": 5647726.666666667, "logits/rejected": -44044870.4, "logps/chosen": -785.9591471354166, "logps/rejected": -436.55654296875, "loss": 0.2645, "rewards/chosen": 0.7013992468516032, "rewards/margins": 2.330759827295939, "rewards/rejected": -1.6293605804443358, "step": 7088 }, { "epoch": 0.37574536877534254, "grad_norm": 51.75, "kl": 1.4892587661743164, "learning_rate": 5e-07, "logits/chosen": -1196791.0, "logits/rejected": 28558088.0, "logps/chosen": -170.04766845703125, "logps/rejected": -237.31488037109375, "loss": 0.3408, "rewards/chosen": 0.32100993394851685, "rewards/margins": 1.6919499039649963, "rewards/rejected": -1.3709399700164795, "step": 7089 }, { "epoch": 0.3757983727771447, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82216784.0, "logits/rejected": -60886144.0, "logps/chosen": -509.7891845703125, "logps/rejected": -426.1111537388393, "loss": 0.2223, "rewards/chosen": -0.08877868950366974, "rewards/margins": 1.7952851674386434, "rewards/rejected": -1.8840638569423132, "step": 7090 }, { "epoch": 0.3758513767789468, "grad_norm": 41.25, "kl": 2.0422229766845703, "learning_rate": 5e-07, "logits/chosen": -22858485.333333332, "logits/rejected": -36578291.2, "logps/chosen": -287.7950032552083, "logps/rejected": -381.594921875, "loss": 0.1775, "rewards/chosen": 1.2346023718516033, "rewards/margins": 3.619078842798869, "rewards/rejected": -2.3844764709472654, "step": 7091 }, { "epoch": 0.37590438078074895, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21536777.333333332, "logits/rejected": -4579545.6, "logps/chosen": -346.7917887369792, "logps/rejected": -392.71123046875, "loss": 0.2548, "rewards/chosen": 0.40293161074320477, "rewards/margins": 2.092321793238322, "rewards/rejected": -1.6893901824951172, "step": 7092 }, { "epoch": 0.3759573847825511, "grad_norm": 42.5, "kl": 0.28730201721191406, "learning_rate": 5e-07, "logits/chosen": -46375221.333333336, "logits/rejected": -20185148.8, "logps/chosen": -416.51416015625, "logps/rejected": -139.3279052734375, "loss": 0.22, "rewards/chosen": 1.1681539217631023, "rewards/margins": 2.640689055124919, "rewards/rejected": -1.4725351333618164, "step": 7093 }, { "epoch": 0.3760103887843532, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8314913.0, "logits/rejected": -45802808.0, "logps/chosen": -196.51568603515625, "logps/rejected": -294.06463623046875, "loss": 0.2885, "rewards/chosen": 0.39962145686149597, "rewards/margins": 2.131755441427231, "rewards/rejected": -1.7321339845657349, "step": 7094 }, { "epoch": 0.37606339278615536, "grad_norm": 49.25, "kl": 2.604602813720703, "learning_rate": 5e-07, "logits/chosen": -14545661.333333334, "logits/rejected": -8318932.0, "logps/chosen": -1210.943603515625, "logps/rejected": -160.8822021484375, "loss": 0.1989, "rewards/chosen": 1.6822225252787273, "rewards/margins": 3.5534358660380043, "rewards/rejected": -1.8712133407592773, "step": 7095 }, { "epoch": 0.3761163967879575, "grad_norm": 44.75, "kl": 0.6164407730102539, "learning_rate": 5e-07, "logits/chosen": -12935798.4, "logits/rejected": -48042752.0, "logps/chosen": -223.0071044921875, "logps/rejected": -267.970458984375, "loss": 0.3483, "rewards/chosen": 0.15168700218200684, "rewards/margins": 2.262399689356486, "rewards/rejected": -2.110712687174479, "step": 7096 }, { "epoch": 0.37616940078975963, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23187926.0, "logits/rejected": 28297704.0, "logps/chosen": -252.34637451171875, "logps/rejected": -215.4181111653646, "loss": 0.2674, "rewards/chosen": 1.0944221019744873, "rewards/margins": 2.2668425242106123, "rewards/rejected": -1.1724204222361247, "step": 7097 }, { "epoch": 0.37622240479156177, "grad_norm": 55.25, "kl": 0.5829963684082031, "learning_rate": 5e-07, "logits/chosen": -31462483.2, "logits/rejected": -37371768.0, "logps/chosen": -385.43916015625, "logps/rejected": -388.8410237630208, "loss": 0.3174, "rewards/chosen": 0.3836972236633301, "rewards/margins": 2.4246823310852053, "rewards/rejected": -2.040985107421875, "step": 7098 }, { "epoch": 0.3762754087933639, "grad_norm": 50.25, "kl": 1.730015754699707, "learning_rate": 5e-07, "logits/chosen": -42588458.666666664, "logits/rejected": -25700772.0, "logps/chosen": -203.4664103190104, "logps/rejected": -332.3643798828125, "loss": 0.4287, "rewards/chosen": 0.13635998964309692, "rewards/margins": 2.02119117975235, "rewards/rejected": -1.884831190109253, "step": 7099 }, { "epoch": 0.37632841279516605, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66251626.666666664, "logits/rejected": -23822707.2, "logps/chosen": -480.4613444010417, "logps/rejected": -248.321142578125, "loss": 0.2099, "rewards/chosen": 0.8399231433868408, "rewards/margins": 2.669669008255005, "rewards/rejected": -1.8297458648681642, "step": 7100 }, { "epoch": 0.3763814167969682, "grad_norm": 52.75, "kl": 0.8396682739257812, "learning_rate": 5e-07, "logits/chosen": -36751125.333333336, "logits/rejected": -4205778.0, "logps/chosen": -189.1385498046875, "logps/rejected": -293.8266906738281, "loss": 0.4123, "rewards/chosen": 0.3466043472290039, "rewards/margins": 0.8873990774154663, "rewards/rejected": -0.5407947301864624, "step": 7101 }, { "epoch": 0.3764344207987703, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40518659.2, "logits/rejected": -1708510.6666666667, "logps/chosen": -297.1866455078125, "logps/rejected": -155.80023193359375, "loss": 0.3361, "rewards/chosen": 0.2096405267715454, "rewards/margins": 2.3528506199518837, "rewards/rejected": -2.1432100931803384, "step": 7102 }, { "epoch": 0.37648742480057246, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39502792.0, "logits/rejected": -31381754.0, "logps/chosen": -291.9553629557292, "logps/rejected": -327.52099609375, "loss": 0.3633, "rewards/chosen": 0.4658936659495036, "rewards/margins": 1.6069531838099163, "rewards/rejected": -1.1410595178604126, "step": 7103 }, { "epoch": 0.3765404288023746, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -29636288.0, "logps/rejected": -395.42144775390625, "loss": 0.1585, "rewards/rejected": -2.060242176055908, "step": 7104 }, { "epoch": 0.37659343280417673, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -255637.1875, "logits/rejected": 5177534.857142857, "logps/chosen": -102.21733856201172, "logps/rejected": -237.40359933035714, "loss": 0.22, "rewards/chosen": -0.5862503051757812, "rewards/margins": 1.3405451093401228, "rewards/rejected": -1.926795414515904, "step": 7105 }, { "epoch": 0.37664643680597887, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2670503.0, "logits/rejected": -27325878.85714286, "logps/chosen": -3.364116668701172, "logps/rejected": -379.14432198660717, "loss": 0.1647, "rewards/chosen": 0.09259586781263351, "rewards/margins": 2.3696111451302255, "rewards/rejected": -2.277015277317592, "step": 7106 }, { "epoch": 0.376699440807781, "grad_norm": 47.25, "kl": 0.22345256805419922, "learning_rate": 5e-07, "logits/chosen": -32406419.2, "logits/rejected": -37192253.333333336, "logps/chosen": -221.9298583984375, "logps/rejected": -448.4417317708333, "loss": 0.3066, "rewards/chosen": 0.365678596496582, "rewards/margins": 2.942233530680338, "rewards/rejected": -2.5765549341837564, "step": 7107 }, { "epoch": 0.37675244480958314, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10778179.0, "logits/rejected": -23788948.57142857, "logps/chosen": -817.6107177734375, "logps/rejected": -274.83262416294644, "loss": 0.182, "rewards/chosen": -0.614459216594696, "rewards/margins": 1.63317962203707, "rewards/rejected": -2.247638838631766, "step": 7108 }, { "epoch": 0.3768054488113853, "grad_norm": 48.25, "kl": 0.00395965576171875, "learning_rate": 5e-07, "logits/chosen": -29264780.8, "logits/rejected": -33635133.333333336, "logps/chosen": -412.476806640625, "logps/rejected": -210.5190633138021, "loss": 0.3452, "rewards/chosen": 0.36785428524017333, "rewards/margins": 2.1898298343022664, "rewards/rejected": -1.821975549062093, "step": 7109 }, { "epoch": 0.3768584528131874, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31652568.0, "logits/rejected": 40897840.0, "logps/chosen": -282.225341796875, "logps/rejected": -83.97052001953125, "loss": 0.3438, "rewards/chosen": 0.6340929667154948, "rewards/margins": 1.747280995051066, "rewards/rejected": -1.1131880283355713, "step": 7110 }, { "epoch": 0.37691145681498955, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35484048.0, "logits/rejected": -48475080.0, "logps/chosen": -204.31997680664062, "logps/rejected": -507.4125671386719, "loss": 0.282, "rewards/chosen": 0.13117671012878418, "rewards/margins": 2.6639151573181152, "rewards/rejected": -2.532738447189331, "step": 7111 }, { "epoch": 0.3769644608167917, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1536074.0, "logits/rejected": -3160398.8, "logps/chosen": -283.0901692708333, "logps/rejected": -272.5979736328125, "loss": 0.2203, "rewards/chosen": 0.9159990946451823, "rewards/margins": 2.585206667582194, "rewards/rejected": -1.6692075729370117, "step": 7112 }, { "epoch": 0.37701746481859383, "grad_norm": 45.0, "kl": 0.6799278259277344, "learning_rate": 5e-07, "logits/chosen": 13304658.0, "logits/rejected": -16895022.666666668, "logps/chosen": -372.8057861328125, "logps/rejected": -190.68790690104166, "loss": 0.2387, "rewards/chosen": 0.5040062069892883, "rewards/margins": 2.848828454812368, "rewards/rejected": -2.3448222478230796, "step": 7113 }, { "epoch": 0.37707046882039597, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12610370.0, "logits/rejected": -24480378.0, "logps/chosen": -135.42864990234375, "logps/rejected": -333.76654052734375, "loss": 0.2488, "rewards/chosen": 0.40269413590431213, "rewards/margins": 3.1178230345249176, "rewards/rejected": -2.7151288986206055, "step": 7114 }, { "epoch": 0.3771234728221981, "grad_norm": 51.0, "kl": 0.16268348693847656, "learning_rate": 5e-07, "logits/chosen": -22718134.4, "logits/rejected": -39561570.666666664, "logps/chosen": -550.361376953125, "logps/rejected": -387.406494140625, "loss": 0.3017, "rewards/chosen": 0.6420784950256347, "rewards/margins": 2.735999361673991, "rewards/rejected": -2.093920866648356, "step": 7115 }, { "epoch": 0.37717647682400024, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39476781.333333336, "logits/rejected": -9542917.6, "logps/chosen": -712.8094075520834, "logps/rejected": -146.0544189453125, "loss": 0.2445, "rewards/chosen": 2.0315237045288086, "rewards/margins": 3.1849943161010743, "rewards/rejected": -1.1534706115722657, "step": 7116 }, { "epoch": 0.3772294808258023, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13993400.0, "logits/rejected": -22785882.666666668, "logps/chosen": -579.2968139648438, "logps/rejected": -308.8809407552083, "loss": 0.1935, "rewards/chosen": 0.3656478822231293, "rewards/margins": 2.5599539379278817, "rewards/rejected": -2.1943060557047525, "step": 7117 }, { "epoch": 0.37728248482760446, "grad_norm": 47.25, "kl": 1.1897192001342773, "learning_rate": 5e-07, "logits/chosen": 1253356.1666666667, "logits/rejected": -136094000.0, "logps/chosen": -150.3498331705729, "logps/rejected": -448.494384765625, "loss": 0.4475, "rewards/chosen": -0.16403871774673462, "rewards/margins": 2.0566033720970154, "rewards/rejected": -2.22064208984375, "step": 7118 }, { "epoch": 0.3773354888294066, "grad_norm": 54.0, "kl": 0.10638427734375, "learning_rate": 5e-07, "logits/chosen": -17170156.8, "logits/rejected": -10512429.333333334, "logps/chosen": -381.2657470703125, "logps/rejected": -235.8166707356771, "loss": 0.3488, "rewards/chosen": 0.42433938980102537, "rewards/margins": 1.6837996800740558, "rewards/rejected": -1.2594602902730305, "step": 7119 }, { "epoch": 0.37738849283120873, "grad_norm": 49.75, "kl": 0.8831663131713867, "learning_rate": 5e-07, "logits/chosen": 8502184.0, "logits/rejected": -25846000.0, "logps/chosen": -197.95613606770834, "logps/rejected": -265.04638671875, "loss": 0.3299, "rewards/chosen": -0.1535560687383016, "rewards/margins": 1.5764221111933392, "rewards/rejected": -1.7299781799316407, "step": 7120 }, { "epoch": 0.37744149683301087, "grad_norm": 68.0, "kl": 0.39612579345703125, "learning_rate": 5e-07, "logits/chosen": -4684370.285714285, "logits/rejected": -45309016.0, "logps/chosen": -224.8497314453125, "logps/rejected": -397.4770202636719, "loss": 0.4557, "rewards/chosen": 0.15505446706499373, "rewards/margins": 0.8169654778071812, "rewards/rejected": -0.6619110107421875, "step": 7121 }, { "epoch": 0.377494500834813, "grad_norm": 41.75, "kl": 1.2939352989196777, "learning_rate": 5e-07, "logits/chosen": -59949332.0, "logits/rejected": -10622194.285714285, "logps/chosen": -429.49798583984375, "logps/rejected": -342.063232421875, "loss": 0.1961, "rewards/chosen": 1.896826148033142, "rewards/margins": 3.6641016857964654, "rewards/rejected": -1.7672755377633231, "step": 7122 }, { "epoch": 0.37754750483661514, "grad_norm": 52.5, "kl": 3.172365188598633, "learning_rate": 5e-07, "logits/chosen": 19183488.0, "logits/rejected": -30497232.0, "logps/chosen": -1070.400390625, "logps/rejected": -275.952099609375, "loss": 0.1835, "rewards/chosen": 1.4781824747721355, "rewards/margins": 3.7820662180582687, "rewards/rejected": -2.303883743286133, "step": 7123 }, { "epoch": 0.3776005088384173, "grad_norm": 54.75, "kl": 0.4457874298095703, "learning_rate": 5e-07, "logits/chosen": 3272395.8, "logits/rejected": -8255793.333333333, "logps/chosen": -128.98380126953126, "logps/rejected": -155.7259318033854, "loss": 0.3671, "rewards/chosen": 0.20652637481689454, "rewards/margins": 1.70007381439209, "rewards/rejected": -1.4935474395751953, "step": 7124 }, { "epoch": 0.3776535128402194, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14476216.0, "logits/rejected": -45346268.8, "logps/chosen": -536.0540771484375, "logps/rejected": -317.7425537109375, "loss": 0.239, "rewards/chosen": 0.40395204226175946, "rewards/margins": 2.5713239828745524, "rewards/rejected": -2.167371940612793, "step": 7125 }, { "epoch": 0.37770651684202156, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63423056.0, "logits/rejected": -16137181.0, "logps/chosen": -494.228271484375, "logps/rejected": -216.13511657714844, "loss": 0.3416, "rewards/chosen": 0.48230592409769696, "rewards/margins": 2.720362583796183, "rewards/rejected": -2.2380566596984863, "step": 7126 }, { "epoch": 0.3777595208438237, "grad_norm": 58.0, "kl": 0.26275634765625, "learning_rate": 5e-07, "logits/chosen": -84520960.0, "logits/rejected": -13083252.0, "logps/chosen": -350.63818359375, "logps/rejected": -159.7430216471354, "loss": 0.3777, "rewards/chosen": 0.06789917349815369, "rewards/margins": 1.7633900105953217, "rewards/rejected": -1.695490837097168, "step": 7127 }, { "epoch": 0.37781252484562583, "grad_norm": 57.75, "kl": 0.8296928405761719, "learning_rate": 5e-07, "logits/chosen": -8308659.428571428, "logits/rejected": 657698560.0, "logps/chosen": -474.69363839285717, "logps/rejected": -499.09832763671875, "loss": 0.3092, "rewards/chosen": 0.8037325995309013, "rewards/margins": 4.031308378492083, "rewards/rejected": -3.2275757789611816, "step": 7128 }, { "epoch": 0.37786552884742797, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -35716504.0, "logps/rejected": -218.59774780273438, "loss": 0.1769, "rewards/rejected": -1.7950791120529175, "step": 7129 }, { "epoch": 0.3779185328492301, "grad_norm": 43.5, "kl": 1.994863510131836, "learning_rate": 5e-07, "logits/chosen": -11458920.0, "logits/rejected": 5034838.5, "logps/chosen": -322.14231363932294, "logps/rejected": -274.8357849121094, "loss": 0.2471, "rewards/chosen": 1.3002630869547527, "rewards/margins": 4.256894985834758, "rewards/rejected": -2.956631898880005, "step": 7130 }, { "epoch": 0.37797153685103224, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6453696.5, "logits/rejected": -1796523.0, "logps/chosen": -90.32952880859375, "logps/rejected": -123.78107452392578, "loss": 0.4152, "rewards/chosen": -0.28535592555999756, "rewards/margins": 0.8332819938659668, "rewards/rejected": -1.1186379194259644, "step": 7131 }, { "epoch": 0.3780245408528344, "grad_norm": 52.75, "kl": 0.8808670043945312, "learning_rate": 5e-07, "logits/chosen": -29402227.2, "logits/rejected": -21266729.333333332, "logps/chosen": -384.2651611328125, "logps/rejected": -221.99251302083334, "loss": 0.3474, "rewards/chosen": 0.35600929260253905, "rewards/margins": 2.148504066467285, "rewards/rejected": -1.792494773864746, "step": 7132 }, { "epoch": 0.3780775448546365, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44881848.0, "logits/rejected": 14137203.0, "logps/chosen": -223.37191772460938, "logps/rejected": -482.9254455566406, "loss": 0.2726, "rewards/chosen": 0.3265068233013153, "rewards/margins": 2.347852438688278, "rewards/rejected": -2.021345615386963, "step": 7133 }, { "epoch": 0.37813054885643865, "grad_norm": 91.0, "kl": 0.6023578643798828, "learning_rate": 5e-07, "logits/chosen": -14691475.0, "logits/rejected": -27201302.0, "logps/chosen": -238.53907775878906, "logps/rejected": -273.88079833984375, "loss": 0.3167, "rewards/chosen": 0.5639622211456299, "rewards/margins": 1.894600749015808, "rewards/rejected": -1.3306385278701782, "step": 7134 }, { "epoch": 0.3781835528582408, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37191396.0, "logits/rejected": -8772529.333333334, "logps/chosen": -266.3748779296875, "logps/rejected": -225.27545166015625, "loss": 0.2798, "rewards/chosen": -0.1563607156276703, "rewards/margins": 1.8843125402927399, "rewards/rejected": -2.04067325592041, "step": 7135 }, { "epoch": 0.3782365568600429, "grad_norm": 49.75, "kl": 2.7481231689453125, "learning_rate": 5e-07, "logits/chosen": -18394384.0, "logits/rejected": -17367062.0, "logps/chosen": -524.11962890625, "logps/rejected": -150.85061645507812, "loss": 0.3046, "rewards/chosen": 0.9378677606582642, "rewards/margins": 2.5490405559539795, "rewards/rejected": -1.6111727952957153, "step": 7136 }, { "epoch": 0.37828956086184506, "grad_norm": 43.75, "kl": 0.08017349243164062, "learning_rate": 5e-07, "logits/chosen": -13631040.0, "logits/rejected": -23507218.0, "logps/chosen": -69.10617065429688, "logps/rejected": -294.4682312011719, "loss": 0.3489, "rewards/chosen": -0.13961483538150787, "rewards/margins": 1.6609451621770859, "rewards/rejected": -1.8005599975585938, "step": 7137 }, { "epoch": 0.3783425648636472, "grad_norm": 66.0, "kl": 1.515634536743164, "learning_rate": 5e-07, "logits/chosen": -16507795.2, "logits/rejected": -24938824.0, "logps/chosen": -309.17939453125, "logps/rejected": -200.1287841796875, "loss": 0.3517, "rewards/chosen": 0.4769193172454834, "rewards/margins": 1.5517074108123778, "rewards/rejected": -1.0747880935668945, "step": 7138 }, { "epoch": 0.37839556886544934, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6133512.8, "logits/rejected": -5029621.333333333, "logps/chosen": -192.9770751953125, "logps/rejected": -203.97237141927084, "loss": 0.3915, "rewards/chosen": 0.20230729579925538, "rewards/margins": 1.194548519452413, "rewards/rejected": -0.9922412236531576, "step": 7139 }, { "epoch": 0.3784485728672515, "grad_norm": 54.5, "kl": 0.3003549575805664, "learning_rate": 5e-07, "logits/chosen": -52116252.8, "logits/rejected": -116710.66666666667, "logps/chosen": -528.38662109375, "logps/rejected": -218.73486328125, "loss": 0.3385, "rewards/chosen": 0.44884357452392576, "rewards/margins": 1.7253968874613443, "rewards/rejected": -1.2765533129374187, "step": 7140 }, { "epoch": 0.3785015768690536, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41274352.0, "logits/rejected": -36883152.0, "logps/chosen": -285.2920837402344, "logps/rejected": -253.53367614746094, "loss": 0.2623, "rewards/chosen": 0.28640326857566833, "rewards/margins": 2.738208144903183, "rewards/rejected": -2.4518048763275146, "step": 7141 }, { "epoch": 0.37855458087085575, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56853930.666666664, "logits/rejected": -18111180.8, "logps/chosen": -319.4896240234375, "logps/rejected": -365.509765625, "loss": 0.2819, "rewards/chosen": 0.14355340600013733, "rewards/margins": 1.825487381219864, "rewards/rejected": -1.6819339752197267, "step": 7142 }, { "epoch": 0.3786075848726579, "grad_norm": 44.0, "kl": 0.9926338195800781, "learning_rate": 5e-07, "logits/chosen": -8905324.0, "logits/rejected": -69143744.0, "logps/chosen": -262.0420227050781, "logps/rejected": -413.67620849609375, "loss": 0.2439, "rewards/chosen": 0.7610024809837341, "rewards/margins": 3.1618451476097107, "rewards/rejected": -2.4008426666259766, "step": 7143 }, { "epoch": 0.37866058887446, "grad_norm": 38.75, "kl": 0.4064483642578125, "learning_rate": 5e-07, "logits/chosen": -9534527.2, "logits/rejected": -55709386.666666664, "logps/chosen": -203.41083984375, "logps/rejected": -456.0974934895833, "loss": 0.2903, "rewards/chosen": 0.42688302993774413, "rewards/margins": 3.138578383127848, "rewards/rejected": -2.711695353190104, "step": 7144 }, { "epoch": 0.37871359287626216, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89469800.0, "logits/rejected": -29265881.14285714, "logps/chosen": -733.3240966796875, "logps/rejected": -523.12158203125, "loss": 0.1367, "rewards/chosen": 0.5648437738418579, "rewards/margins": 3.03183274609702, "rewards/rejected": -2.466988972255162, "step": 7145 }, { "epoch": 0.3787665968780643, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70262272.0, "logits/rejected": -24794602.666666668, "logps/chosen": -285.92529296875, "logps/rejected": -526.2340901692709, "loss": 0.1867, "rewards/chosen": 0.3867843747138977, "rewards/margins": 3.118237316608429, "rewards/rejected": -2.7314529418945312, "step": 7146 }, { "epoch": 0.37881960087986644, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41369110.4, "logits/rejected": -74226037.33333333, "logps/chosen": -234.1396240234375, "logps/rejected": -651.168212890625, "loss": 0.321, "rewards/chosen": 0.25028038024902344, "rewards/margins": 2.3621864318847656, "rewards/rejected": -2.111906051635742, "step": 7147 }, { "epoch": 0.3788726048816686, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -122398.66666666667, "logits/rejected": -12125675.2, "logps/chosen": -77.02058919270833, "logps/rejected": -156.008544921875, "loss": 0.3344, "rewards/chosen": -0.019434866805871327, "rewards/margins": 1.6068956996003787, "rewards/rejected": -1.62633056640625, "step": 7148 }, { "epoch": 0.3789256088834707, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14165978.666666666, "logits/rejected": -12909726.4, "logps/chosen": -37.65337371826172, "logps/rejected": -172.9706787109375, "loss": 0.2905, "rewards/chosen": -0.04833132525285085, "rewards/margins": 1.8401109049717586, "rewards/rejected": -1.8884422302246093, "step": 7149 }, { "epoch": 0.37897861288527285, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1103528.8, "logits/rejected": 14052572.0, "logps/chosen": -324.0722412109375, "logps/rejected": -122.65175374348958, "loss": 0.436, "rewards/chosen": -0.0637357234954834, "rewards/margins": 1.245720624923706, "rewards/rejected": -1.3094563484191895, "step": 7150 }, { "epoch": 0.379031616887075, "grad_norm": 63.75, "kl": 1.1819305419921875, "learning_rate": 5e-07, "logits/chosen": -30654352.0, "logits/rejected": -9186897.333333334, "logps/chosen": -532.4326171875, "logps/rejected": -193.0800577799479, "loss": 0.3501, "rewards/chosen": 0.2784054517745972, "rewards/margins": 2.3024094343185424, "rewards/rejected": -2.0240039825439453, "step": 7151 }, { "epoch": 0.3790846208888771, "grad_norm": 54.25, "kl": 1.0872478485107422, "learning_rate": 5e-07, "logits/chosen": -31330160.0, "logits/rejected": -30896852.0, "logps/chosen": -767.5728352864584, "logps/rejected": -371.3038330078125, "loss": 0.3087, "rewards/chosen": 0.8502318064371744, "rewards/margins": 3.437774578730265, "rewards/rejected": -2.587542772293091, "step": 7152 }, { "epoch": 0.37913762489067926, "grad_norm": 52.25, "kl": 1.0515151023864746, "learning_rate": 5e-07, "logits/chosen": -62662428.0, "logits/rejected": -16322308.0, "logps/chosen": -291.9326171875, "logps/rejected": -134.6228790283203, "loss": 0.3939, "rewards/chosen": -0.09261494129896164, "rewards/margins": 1.349271960556507, "rewards/rejected": -1.4418869018554688, "step": 7153 }, { "epoch": 0.3791906288924814, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36351552.0, "logits/rejected": -61289928.0, "logps/chosen": -272.455322265625, "logps/rejected": -593.5086669921875, "loss": 0.2342, "rewards/chosen": 0.6057816743850708, "rewards/margins": 3.6121217012405396, "rewards/rejected": -3.0063400268554688, "step": 7154 }, { "epoch": 0.37924363289428353, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28192809.6, "logits/rejected": 44760336.0, "logps/chosen": -425.51708984375, "logps/rejected": -431.5361735026042, "loss": 0.2879, "rewards/chosen": 0.6860867500305176, "rewards/margins": 2.1519328117370606, "rewards/rejected": -1.465846061706543, "step": 7155 }, { "epoch": 0.37929663689608567, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53943276.0, "logits/rejected": 125282250.66666667, "logps/chosen": -285.89373779296875, "logps/rejected": -481.6298828125, "loss": 0.2477, "rewards/chosen": -0.5269630551338196, "rewards/margins": 1.9176472226778665, "rewards/rejected": -2.444610277811686, "step": 7156 }, { "epoch": 0.3793496408978878, "grad_norm": 51.75, "kl": 1.5425357818603516, "learning_rate": 5e-07, "logits/chosen": -45890128.0, "logits/rejected": -32653214.0, "logps/chosen": -321.0562744140625, "logps/rejected": -264.8338623046875, "loss": 0.3035, "rewards/chosen": 0.5986289381980896, "rewards/margins": 2.3998172879219055, "rewards/rejected": -1.801188349723816, "step": 7157 }, { "epoch": 0.37940264489968994, "grad_norm": 40.25, "kl": 0.24587249755859375, "learning_rate": 5e-07, "logits/chosen": -34190944.0, "logits/rejected": -13602372.8, "logps/chosen": -326.13511149088544, "logps/rejected": -356.1025634765625, "loss": 0.3201, "rewards/chosen": -0.500076174736023, "rewards/margins": 2.073591923713684, "rewards/rejected": -2.573668098449707, "step": 7158 }, { "epoch": 0.3794556489014921, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35434220.0, "logits/rejected": -20604114.0, "logps/chosen": -278.5594787597656, "logps/rejected": -307.58038330078125, "loss": 0.3177, "rewards/chosen": 0.4236951768398285, "rewards/margins": 2.028377741575241, "rewards/rejected": -1.6046825647354126, "step": 7159 }, { "epoch": 0.3795086529032942, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40362880.0, "logits/rejected": -24616090.666666668, "logps/chosen": -181.26133728027344, "logps/rejected": -295.37453206380206, "loss": 0.2688, "rewards/chosen": -0.6377216577529907, "rewards/margins": 1.4127556880315146, "rewards/rejected": -2.0504773457845054, "step": 7160 }, { "epoch": 0.37956165690509636, "grad_norm": 55.25, "kl": 0.4575843811035156, "learning_rate": 5e-07, "logits/chosen": -1161572.4, "logits/rejected": -28453442.666666668, "logps/chosen": -367.80712890625, "logps/rejected": -463.3885091145833, "loss": 0.2968, "rewards/chosen": 0.4451117038726807, "rewards/margins": 2.7292994976043703, "rewards/rejected": -2.2841877937316895, "step": 7161 }, { "epoch": 0.3796146609068985, "grad_norm": 66.5, "kl": 0.115692138671875, "learning_rate": 5e-07, "logits/chosen": -12947168.0, "logits/rejected": -18942950.666666668, "logps/chosen": -333.603271484375, "logps/rejected": -204.06488037109375, "loss": 0.3233, "rewards/chosen": 0.42484893798828127, "rewards/margins": 2.285477066040039, "rewards/rejected": -1.8606281280517578, "step": 7162 }, { "epoch": 0.37966766490870063, "grad_norm": 45.25, "kl": 1.0269660949707031, "learning_rate": 5e-07, "logits/chosen": -11717515.2, "logits/rejected": -26408314.666666668, "logps/chosen": -249.459716796875, "logps/rejected": -380.6704915364583, "loss": 0.2842, "rewards/chosen": 0.648088788986206, "rewards/margins": 3.194496488571167, "rewards/rejected": -2.546407699584961, "step": 7163 }, { "epoch": 0.37972066891050277, "grad_norm": 34.75, "kl": 0.5517339706420898, "learning_rate": 5e-07, "logits/chosen": -4717798.666666667, "logits/rejected": -42348249.6, "logps/chosen": -68.65749613444011, "logps/rejected": -330.94921875, "loss": 0.2215, "rewards/chosen": 1.0989782015482585, "rewards/margins": 2.917486540476481, "rewards/rejected": -1.8185083389282226, "step": 7164 }, { "epoch": 0.3797736729123049, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80540890.66666667, "logits/rejected": -15666988.8, "logps/chosen": -300.52280680338544, "logps/rejected": -476.544921875, "loss": 0.2325, "rewards/chosen": 0.17095082998275757, "rewards/margins": 2.9286609053611756, "rewards/rejected": -2.757710075378418, "step": 7165 }, { "epoch": 0.37982667691410704, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39855027.2, "logits/rejected": -76282181.33333333, "logps/chosen": -338.208837890625, "logps/rejected": -288.7432861328125, "loss": 0.2921, "rewards/chosen": 0.4908896446228027, "rewards/margins": 2.8870806058247886, "rewards/rejected": -2.396190961201986, "step": 7166 }, { "epoch": 0.3798796809159091, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52038416.0, "logits/rejected": -68364754.28571428, "logps/chosen": -144.1964111328125, "logps/rejected": -358.11178152901783, "loss": 0.1542, "rewards/chosen": 1.5688523054122925, "rewards/margins": 3.5000962018966675, "rewards/rejected": -1.931243896484375, "step": 7167 }, { "epoch": 0.37993268491771126, "grad_norm": 57.0, "kl": 0.1686725616455078, "learning_rate": 5e-07, "logits/chosen": -47807830.4, "logits/rejected": -30289720.0, "logps/chosen": -295.29931640625, "logps/rejected": -300.8307291666667, "loss": 0.3848, "rewards/chosen": 0.013928759098052978, "rewards/margins": 1.5745204567909241, "rewards/rejected": -1.560591697692871, "step": 7168 }, { "epoch": 0.3799856889195134, "grad_norm": 49.25, "kl": 0.8265304565429688, "learning_rate": 5e-07, "logits/chosen": 2535441.6, "logits/rejected": -18759238.666666668, "logps/chosen": -261.83330078125, "logps/rejected": -237.1925048828125, "loss": 0.3288, "rewards/chosen": 0.5067861557006836, "rewards/margins": 1.8379654566446941, "rewards/rejected": -1.3311793009440105, "step": 7169 }, { "epoch": 0.38003869292131554, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12830269.333333334, "logits/rejected": -12126447.2, "logps/chosen": -153.6535847981771, "logps/rejected": -298.848583984375, "loss": 0.2204, "rewards/chosen": 0.1744823455810547, "rewards/margins": 2.7451709747314452, "rewards/rejected": -2.5706886291503905, "step": 7170 }, { "epoch": 0.3800916969231177, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41443506.666666664, "logits/rejected": -37171830.4, "logps/chosen": -390.83154296875, "logps/rejected": -440.661865234375, "loss": 0.2146, "rewards/chosen": 0.32871299982070923, "rewards/margins": 3.0593753457069397, "rewards/rejected": -2.7306623458862305, "step": 7171 }, { "epoch": 0.3801447009249198, "grad_norm": 55.25, "kl": 0.638218879699707, "learning_rate": 5e-07, "logits/chosen": 26141518.4, "logits/rejected": -16823413.333333332, "logps/chosen": -328.51015625, "logps/rejected": -193.4267374674479, "loss": 0.3903, "rewards/chosen": 0.13978312015533448, "rewards/margins": 1.398549771308899, "rewards/rejected": -1.2587666511535645, "step": 7172 }, { "epoch": 0.38019770492672195, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13254571.0, "logits/rejected": -49134896.0, "logps/chosen": -215.950927734375, "logps/rejected": -225.94283040364584, "loss": 0.2578, "rewards/chosen": 0.3357810974121094, "rewards/margins": 2.0891062418619795, "rewards/rejected": -1.7533251444498699, "step": 7173 }, { "epoch": 0.3802507089285241, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11260410.666666666, "logits/rejected": -12240345.6, "logps/chosen": -564.4784342447916, "logps/rejected": -463.68486328125, "loss": 0.3028, "rewards/chosen": -0.11699015895525615, "rewards/margins": 1.7720580359299978, "rewards/rejected": -1.889048194885254, "step": 7174 }, { "epoch": 0.3803037129303262, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62081276.0, "logits/rejected": -38480352.0, "logps/chosen": -399.430908203125, "logps/rejected": -452.9214680989583, "loss": 0.1953, "rewards/chosen": 0.06729508936405182, "rewards/margins": 2.7534402360518775, "rewards/rejected": -2.6861451466878257, "step": 7175 }, { "epoch": 0.38035671693212836, "grad_norm": 26.625, "kl": 1.1255970001220703, "learning_rate": 5e-07, "logits/chosen": -3732103.0, "logits/rejected": -7169374.5, "logps/chosen": -190.68881225585938, "logps/rejected": -126.69200134277344, "loss": 0.2241, "rewards/chosen": 0.8330521583557129, "rewards/margins": 4.129551649093628, "rewards/rejected": -3.296499490737915, "step": 7176 }, { "epoch": 0.3804097209339305, "grad_norm": 60.25, "kl": 1.7687435150146484, "learning_rate": 5e-07, "logits/chosen": -25611580.8, "logits/rejected": -8960177.333333334, "logps/chosen": -469.061669921875, "logps/rejected": -232.9004109700521, "loss": 0.3679, "rewards/chosen": 0.6947117328643799, "rewards/margins": 2.3702839692433675, "rewards/rejected": -1.6755722363789876, "step": 7177 }, { "epoch": 0.38046272493573263, "grad_norm": 54.5, "kl": 0.8718376159667969, "learning_rate": 5e-07, "logits/chosen": -15833931.2, "logits/rejected": -30613221.333333332, "logps/chosen": -437.18740234375, "logps/rejected": -418.667724609375, "loss": 0.2808, "rewards/chosen": 0.668862771987915, "rewards/margins": 2.6980415503183997, "rewards/rejected": -2.029178778330485, "step": 7178 }, { "epoch": 0.38051572893753477, "grad_norm": 35.0, "kl": 0.4206695556640625, "learning_rate": 5e-07, "logits/chosen": 639556.6875, "logits/rejected": 2526734.6666666665, "logps/chosen": -175.2274627685547, "logps/rejected": -196.05428059895834, "loss": 0.2254, "rewards/chosen": 0.7588703632354736, "rewards/margins": 2.6281104882558184, "rewards/rejected": -1.869240125020345, "step": 7179 }, { "epoch": 0.3805687329393369, "grad_norm": 47.25, "kl": 1.1599369049072266, "learning_rate": 5e-07, "logits/chosen": -33093256.0, "logits/rejected": -10455920.0, "logps/chosen": -278.8746337890625, "logps/rejected": -161.65550231933594, "loss": 0.4165, "rewards/chosen": -0.046778857707977295, "rewards/margins": 0.8110044598579407, "rewards/rejected": -0.857783317565918, "step": 7180 }, { "epoch": 0.38062173694113904, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10203827.0, "logits/rejected": -6281543.5, "logps/chosen": -266.0585021972656, "logps/rejected": -396.6904296875, "loss": 0.3154, "rewards/chosen": 0.02477887272834778, "rewards/margins": 2.1178775131702423, "rewards/rejected": -2.0930986404418945, "step": 7181 }, { "epoch": 0.3806747409429412, "grad_norm": 50.25, "kl": 0.033061981201171875, "learning_rate": 5e-07, "logits/chosen": -41123340.0, "logits/rejected": 16743277.0, "logps/chosen": -372.68603515625, "logps/rejected": -177.20083618164062, "loss": 0.322, "rewards/chosen": 0.24830837547779083, "rewards/margins": 1.756015494465828, "rewards/rejected": -1.507707118988037, "step": 7182 }, { "epoch": 0.3807277449447433, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48392684.0, "logits/rejected": -43177504.0, "logps/chosen": -164.24855041503906, "logps/rejected": -387.0395202636719, "loss": 0.3148, "rewards/chosen": 0.06725121289491653, "rewards/margins": 2.4805767610669136, "rewards/rejected": -2.413325548171997, "step": 7183 }, { "epoch": 0.38078074894654546, "grad_norm": 43.25, "kl": 0.1385784149169922, "learning_rate": 5e-07, "logits/chosen": -95352312.0, "logits/rejected": -10638842.0, "logps/chosen": -287.59344482421875, "logps/rejected": -171.4262898763021, "loss": 0.2485, "rewards/chosen": 0.5302322506904602, "rewards/margins": 2.024279733498891, "rewards/rejected": -1.494047482808431, "step": 7184 }, { "epoch": 0.3808337529483476, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51253152.0, "logits/rejected": -27936688.0, "logps/chosen": -347.51702880859375, "logps/rejected": -239.08425903320312, "loss": 0.2852, "rewards/chosen": 0.3226369321346283, "rewards/margins": 2.2459965646266937, "rewards/rejected": -1.9233596324920654, "step": 7185 }, { "epoch": 0.38088675695014973, "grad_norm": 44.25, "kl": 0.7543392181396484, "learning_rate": 5e-07, "logits/chosen": -14022416.0, "logits/rejected": -36842176.0, "logps/chosen": -365.4612223307292, "logps/rejected": -405.30029296875, "loss": 0.2805, "rewards/chosen": 0.8950255711873373, "rewards/margins": 3.908119042714437, "rewards/rejected": -3.0130934715270996, "step": 7186 }, { "epoch": 0.38093976095195187, "grad_norm": 43.75, "kl": 0.14061737060546875, "learning_rate": 5e-07, "logits/chosen": -14094959.0, "logits/rejected": -32626864.0, "logps/chosen": -164.8565673828125, "logps/rejected": -225.08499145507812, "loss": 0.3971, "rewards/chosen": -0.16843317449092865, "rewards/margins": 1.7855836302042007, "rewards/rejected": -1.9540168046951294, "step": 7187 }, { "epoch": 0.380992764953754, "grad_norm": 42.0, "kl": 1.4819869995117188, "learning_rate": 5e-07, "logits/chosen": -38848646.4, "logits/rejected": -30434576.0, "logps/chosen": -195.36397705078124, "logps/rejected": -235.28971354166666, "loss": 0.3616, "rewards/chosen": 0.33078579902648925, "rewards/margins": 2.080840794245402, "rewards/rejected": -1.7500549952189128, "step": 7188 }, { "epoch": 0.38104576895555614, "grad_norm": 58.0, "kl": 0.1388721466064453, "learning_rate": 5e-07, "logits/chosen": -34762288.0, "logits/rejected": -10692870.666666666, "logps/chosen": -254.74189453125, "logps/rejected": -163.86041259765625, "loss": 0.3744, "rewards/chosen": -0.050112712383270266, "rewards/margins": 2.0487826784451806, "rewards/rejected": -2.0988953908284507, "step": 7189 }, { "epoch": 0.3810987729573583, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18648446.4, "logits/rejected": -86072080.0, "logps/chosen": -257.8423583984375, "logps/rejected": -171.6028849283854, "loss": 0.3071, "rewards/chosen": 0.9051189422607422, "rewards/margins": 1.8543952306111655, "rewards/rejected": -0.9492762883504232, "step": 7190 }, { "epoch": 0.3811517769591604, "grad_norm": 49.5, "kl": 0.05164337158203125, "learning_rate": 5e-07, "logits/chosen": -26321062.0, "logits/rejected": -15326764.0, "logps/chosen": -342.340087890625, "logps/rejected": -348.15185546875, "loss": 0.2083, "rewards/chosen": 0.7515146732330322, "rewards/margins": 3.395634651184082, "rewards/rejected": -2.64411997795105, "step": 7191 }, { "epoch": 0.38120478096096255, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19835192.0, "logits/rejected": -15524401.142857144, "logps/chosen": -415.2978820800781, "logps/rejected": -279.2216099330357, "loss": 0.1948, "rewards/chosen": -0.7592712640762329, "rewards/margins": 1.580041970525469, "rewards/rejected": -2.339313234601702, "step": 7192 }, { "epoch": 0.3812577849627647, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76570840.0, "logits/rejected": -2712232.0, "logps/chosen": -179.06724548339844, "logps/rejected": -379.2782389322917, "loss": 0.2107, "rewards/chosen": 0.49062538146972656, "rewards/margins": 2.4065860112508135, "rewards/rejected": -1.9159606297810872, "step": 7193 }, { "epoch": 0.3813107889645668, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15258664.0, "logits/rejected": -46223608.0, "logps/chosen": -318.07977294921875, "logps/rejected": -192.70571899414062, "loss": 0.3724, "rewards/chosen": 0.5384516716003418, "rewards/margins": 1.1428044438362122, "rewards/rejected": -0.6043527722358704, "step": 7194 }, { "epoch": 0.38136379296636896, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22657800.0, "logits/rejected": -38995052.8, "logps/chosen": -175.53767903645834, "logps/rejected": -553.61328125, "loss": 0.1768, "rewards/chosen": 0.7273743947347006, "rewards/margins": 3.9867673238118493, "rewards/rejected": -3.2593929290771486, "step": 7195 }, { "epoch": 0.3814167969681711, "grad_norm": 66.5, "kl": 2.991649627685547, "learning_rate": 5e-07, "logits/chosen": -60219142.4, "logits/rejected": -72280032.0, "logps/chosen": -364.314306640625, "logps/rejected": -283.10654703776044, "loss": 0.2414, "rewards/chosen": 1.2085845947265625, "rewards/margins": 4.650030263264974, "rewards/rejected": -3.4414456685384116, "step": 7196 }, { "epoch": 0.38146980096997324, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8073067.0, "logits/rejected": -1558443.875, "logps/chosen": -242.43264770507812, "logps/rejected": -111.24755096435547, "loss": 0.3233, "rewards/chosen": 0.17704924941062927, "rewards/margins": 1.9466443955898285, "rewards/rejected": -1.7695951461791992, "step": 7197 }, { "epoch": 0.3815228049717754, "grad_norm": 43.25, "kl": 0.2824735641479492, "learning_rate": 5e-07, "logits/chosen": -13734616.0, "logits/rejected": 916864.125, "logps/chosen": -138.1298828125, "logps/rejected": -291.5635070800781, "loss": 0.3908, "rewards/chosen": 0.4516083598136902, "rewards/margins": 1.1447004675865173, "rewards/rejected": -0.6930921077728271, "step": 7198 }, { "epoch": 0.3815758089735775, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11257828.0, "logits/rejected": -19522868.8, "logps/chosen": -443.2373046875, "logps/rejected": -316.314990234375, "loss": 0.1906, "rewards/chosen": 1.0751078923543294, "rewards/margins": 3.0511105855305987, "rewards/rejected": -1.9760026931762695, "step": 7199 }, { "epoch": 0.38162881297537965, "grad_norm": 42.75, "kl": 0.43546295166015625, "learning_rate": 5e-07, "logits/chosen": -4414239.333333333, "logits/rejected": -29411590.4, "logps/chosen": -255.89640299479166, "logps/rejected": -410.166357421875, "loss": 0.1973, "rewards/chosen": 1.3227318127950032, "rewards/margins": 3.1107685407002768, "rewards/rejected": -1.7880367279052733, "step": 7200 }, { "epoch": 0.3816818169771818, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11536276.0, "logits/rejected": 10649427.0, "logps/chosen": -311.0316467285156, "logps/rejected": -204.60931396484375, "loss": 0.4118, "rewards/chosen": -0.24175255000591278, "rewards/margins": 0.8250268250703812, "rewards/rejected": -1.066779375076294, "step": 7201 }, { "epoch": 0.3817348209789839, "grad_norm": 60.75, "kl": 0.11443805694580078, "learning_rate": 5e-07, "logits/chosen": 5299274.0, "logits/rejected": -23764844.0, "logps/chosen": -260.8465881347656, "logps/rejected": -319.11474609375, "loss": 0.27, "rewards/chosen": 0.49145975708961487, "rewards/margins": 2.7076565325260162, "rewards/rejected": -2.2161967754364014, "step": 7202 }, { "epoch": 0.38178782498078606, "grad_norm": 61.75, "kl": 0.799224853515625, "learning_rate": 5e-07, "logits/chosen": -46736086.85714286, "logits/rejected": -4503680.5, "logps/chosen": -253.45486886160714, "logps/rejected": -153.82696533203125, "loss": 0.4065, "rewards/chosen": 0.37172698974609375, "rewards/margins": 1.598052978515625, "rewards/rejected": -1.2263259887695312, "step": 7203 }, { "epoch": 0.3818408289825882, "grad_norm": 74.0, "kl": 0.9782333374023438, "learning_rate": 5e-07, "logits/chosen": -73373517.71428572, "logits/rejected": -45234808.0, "logps/chosen": -352.1064453125, "logps/rejected": -158.34652709960938, "loss": 0.416, "rewards/chosen": 0.33428989137922016, "rewards/margins": 1.7497592994144986, "rewards/rejected": -1.4154694080352783, "step": 7204 }, { "epoch": 0.38189383298439034, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71116656.0, "logits/rejected": -40552808.0, "logps/chosen": -276.34368896484375, "logps/rejected": -620.861328125, "loss": 0.2839, "rewards/chosen": -0.03974342346191406, "rewards/margins": 3.371838092803955, "rewards/rejected": -3.411581516265869, "step": 7205 }, { "epoch": 0.3819468369861925, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59788800.0, "logits/rejected": -24040854.4, "logps/chosen": -332.1876627604167, "logps/rejected": -315.1361328125, "loss": 0.2278, "rewards/chosen": 0.08597056070963542, "rewards/margins": 2.705800120035807, "rewards/rejected": -2.619829559326172, "step": 7206 }, { "epoch": 0.3819998409879946, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12714145.333333334, "logits/rejected": -10123440.0, "logps/chosen": -122.56783040364583, "logps/rejected": -174.940234375, "loss": 0.3714, "rewards/chosen": -0.2271448771158854, "rewards/margins": 0.9907996495564778, "rewards/rejected": -1.2179445266723632, "step": 7207 }, { "epoch": 0.38205284498979675, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43175280.0, "logits/rejected": -4500474.0, "logps/chosen": -449.6793518066406, "logps/rejected": -146.2552693684896, "loss": 0.1887, "rewards/chosen": 1.4290130138397217, "rewards/margins": 3.1310993035634356, "rewards/rejected": -1.7020862897237141, "step": 7208 }, { "epoch": 0.3821058489915989, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15621277.333333334, "logits/rejected": 13446064.0, "logps/chosen": -369.99560546875, "logps/rejected": -207.3386474609375, "loss": 0.3241, "rewards/chosen": 0.9561833540598551, "rewards/margins": 1.737218681971232, "rewards/rejected": -0.781035327911377, "step": 7209 }, { "epoch": 0.382158852993401, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29726229.333333332, "logits/rejected": -19499289.6, "logps/chosen": -310.197265625, "logps/rejected": -349.4732421875, "loss": 0.2489, "rewards/chosen": 0.21346656481424967, "rewards/margins": 2.523388973871867, "rewards/rejected": -2.3099224090576174, "step": 7210 }, { "epoch": 0.38221185699520316, "grad_norm": 61.75, "kl": 0.6211776733398438, "learning_rate": 5e-07, "logits/chosen": -19569964.0, "logits/rejected": -28203882.0, "logps/chosen": -492.28704833984375, "logps/rejected": -278.38287353515625, "loss": 0.2708, "rewards/chosen": 0.882917046546936, "rewards/margins": 2.1977862119674683, "rewards/rejected": -1.3148691654205322, "step": 7211 }, { "epoch": 0.3822648609970053, "grad_norm": 50.75, "kl": 1.9405059814453125, "learning_rate": 5e-07, "logits/chosen": -52424858.666666664, "logits/rejected": -1831699.625, "logps/chosen": -359.2730305989583, "logps/rejected": -81.55570983886719, "loss": 0.3353, "rewards/chosen": 0.6868014335632324, "rewards/margins": 1.988810658454895, "rewards/rejected": -1.3020092248916626, "step": 7212 }, { "epoch": 0.38231786499880743, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53120748.0, "logits/rejected": -23046696.0, "logps/chosen": -535.3219604492188, "logps/rejected": -325.1268717447917, "loss": 0.2442, "rewards/chosen": 0.6045501828193665, "rewards/margins": 2.5607758959134417, "rewards/rejected": -1.9562257130940754, "step": 7213 }, { "epoch": 0.38237086900060957, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64810732.0, "logits/rejected": -25639294.0, "logps/chosen": -311.5343017578125, "logps/rejected": -368.0668029785156, "loss": 0.3025, "rewards/chosen": 0.13188964128494263, "rewards/margins": 2.0482295155525208, "rewards/rejected": -1.9163398742675781, "step": 7214 }, { "epoch": 0.3824238730024117, "grad_norm": 36.25, "kl": 0.8019571304321289, "learning_rate": 5e-07, "logits/chosen": -21667580.0, "logits/rejected": -19342369.333333332, "logps/chosen": -266.6415100097656, "logps/rejected": -288.61216227213544, "loss": 0.1504, "rewards/chosen": 1.2476768493652344, "rewards/margins": 3.404661019643148, "rewards/rejected": -2.1569841702779136, "step": 7215 }, { "epoch": 0.38247687700421384, "grad_norm": 53.5, "kl": 1.4591636657714844, "learning_rate": 5e-07, "logits/chosen": -1577348.4, "logits/rejected": -114507360.0, "logps/chosen": -220.6337890625, "logps/rejected": -301.931396484375, "loss": 0.31, "rewards/chosen": 0.4089798450469971, "rewards/margins": 3.22958353360494, "rewards/rejected": -2.820603688557943, "step": 7216 }, { "epoch": 0.382529881006016, "grad_norm": 55.75, "kl": 0.9054279327392578, "learning_rate": 5e-07, "logits/chosen": 1501860.0, "logits/rejected": -28835424.0, "logps/chosen": -338.7106119791667, "logps/rejected": -189.4785400390625, "loss": 0.3224, "rewards/chosen": 0.03537089625994364, "rewards/margins": 1.57294566432635, "rewards/rejected": -1.5375747680664062, "step": 7217 }, { "epoch": 0.38258288500781806, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13424965.0, "logits/rejected": -31339440.0, "logps/chosen": -120.56666564941406, "logps/rejected": -433.3607584635417, "loss": 0.2202, "rewards/chosen": -0.6106422543525696, "rewards/margins": 2.1500547925631204, "rewards/rejected": -2.76069704691569, "step": 7218 }, { "epoch": 0.3826358890096202, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19546664.0, "logits/rejected": -33832948.0, "logps/chosen": -154.27520751953125, "logps/rejected": -269.6365661621094, "loss": 0.3374, "rewards/chosen": 0.3102973898251851, "rewards/margins": 2.969264646371206, "rewards/rejected": -2.6589672565460205, "step": 7219 }, { "epoch": 0.38268889301142234, "grad_norm": 50.25, "kl": 1.6688270568847656, "learning_rate": 5e-07, "logits/chosen": -53789112.0, "logits/rejected": -3260395.5, "logps/chosen": -533.5945434570312, "logps/rejected": -144.1028289794922, "loss": 0.3002, "rewards/chosen": 1.13780677318573, "rewards/margins": 2.348847270011902, "rewards/rejected": -1.2110404968261719, "step": 7220 }, { "epoch": 0.3827418970132245, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39895240.0, "logits/rejected": -23139436.0, "logps/chosen": -195.40005493164062, "logps/rejected": -304.2392272949219, "loss": 0.4137, "rewards/chosen": -0.1299007534980774, "rewards/margins": 0.7524176239967346, "rewards/rejected": -0.882318377494812, "step": 7221 }, { "epoch": 0.3827949010150266, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58005104.0, "logits/rejected": -24421962.666666668, "logps/chosen": -324.3454284667969, "logps/rejected": -183.77958170572916, "loss": 0.3246, "rewards/chosen": -0.2510772943496704, "rewards/margins": 1.0837024450302124, "rewards/rejected": -1.3347797393798828, "step": 7222 }, { "epoch": 0.38284790501682875, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2171064.3333333335, "logits/rejected": -13471486.4, "logps/chosen": -279.33026123046875, "logps/rejected": -170.71031494140624, "loss": 0.3369, "rewards/chosen": -0.2441067099571228, "rewards/margins": 1.4785497307777404, "rewards/rejected": -1.7226564407348632, "step": 7223 }, { "epoch": 0.3829009090186309, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5605835.0, "logits/rejected": -29052021.333333332, "logps/chosen": -70.46939086914062, "logps/rejected": -321.1901041666667, "loss": 0.2354, "rewards/chosen": 0.02333841472864151, "rewards/margins": 2.0178619399666786, "rewards/rejected": -1.994523525238037, "step": 7224 }, { "epoch": 0.382953913020433, "grad_norm": 70.5, "kl": 3.7075271606445312, "learning_rate": 5e-07, "logits/chosen": -29868614.4, "logits/rejected": -9057628.0, "logps/chosen": -561.09140625, "logps/rejected": -71.38840230305989, "loss": 0.295, "rewards/chosen": 1.1748565673828124, "rewards/margins": 2.5542137463887533, "rewards/rejected": -1.3793571790059407, "step": 7225 }, { "epoch": 0.38300691702223516, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40012524.8, "logits/rejected": -42960962.666666664, "logps/chosen": -305.000927734375, "logps/rejected": -352.1764729817708, "loss": 0.3431, "rewards/chosen": 0.1090274453163147, "rewards/margins": 2.361495554447174, "rewards/rejected": -2.2524681091308594, "step": 7226 }, { "epoch": 0.3830599210240373, "grad_norm": 48.75, "kl": 0.6378269195556641, "learning_rate": 5e-07, "logits/chosen": -58109088.0, "logits/rejected": -5482110.4, "logps/chosen": -197.04508463541666, "logps/rejected": -198.91107177734375, "loss": 0.2275, "rewards/chosen": 1.2901802062988281, "rewards/margins": 2.7559179306030273, "rewards/rejected": -1.4657377243041991, "step": 7227 }, { "epoch": 0.38311292502583943, "grad_norm": 46.75, "kl": 2.253842353820801, "learning_rate": 5e-07, "logits/chosen": -5262933.333333333, "logits/rejected": 54450060.8, "logps/chosen": -165.89452107747397, "logps/rejected": -468.5982421875, "loss": 0.2956, "rewards/chosen": 0.4398257335027059, "rewards/margins": 2.2646432002385457, "rewards/rejected": -1.8248174667358399, "step": 7228 }, { "epoch": 0.38316592902764157, "grad_norm": 41.25, "kl": 0.5773220062255859, "learning_rate": 5e-07, "logits/chosen": -19983388.8, "logits/rejected": -67418389.33333333, "logps/chosen": -162.57183837890625, "logps/rejected": -361.4860432942708, "loss": 0.3275, "rewards/chosen": 0.20485930442810057, "rewards/margins": 2.798640934626261, "rewards/rejected": -2.5937816301981607, "step": 7229 }, { "epoch": 0.3832189330294437, "grad_norm": 61.5, "kl": 0.0119171142578125, "learning_rate": 5e-07, "logits/chosen": -24293426.666666668, "logits/rejected": -38865984.0, "logps/chosen": -389.7320963541667, "logps/rejected": -543.0883178710938, "loss": 0.3424, "rewards/chosen": 0.3063087463378906, "rewards/margins": 2.891677141189575, "rewards/rejected": -2.5853683948516846, "step": 7230 }, { "epoch": 0.38327193703124585, "grad_norm": 40.0, "kl": 0.9906635284423828, "learning_rate": 5e-07, "logits/chosen": -9969159.333333334, "logits/rejected": -61574438.4, "logps/chosen": -308.80108642578125, "logps/rejected": -302.6816650390625, "loss": 0.2949, "rewards/chosen": 0.5576067765553793, "rewards/margins": 2.0777275880177815, "rewards/rejected": -1.5201208114624023, "step": 7231 }, { "epoch": 0.383324941033048, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32503892.0, "logits/rejected": -634680.0, "logps/chosen": -259.2071533203125, "logps/rejected": -231.78353881835938, "loss": 0.3402, "rewards/chosen": -0.15240821242332458, "rewards/margins": 1.6213316023349762, "rewards/rejected": -1.7737398147583008, "step": 7232 }, { "epoch": 0.3833779450348501, "grad_norm": 49.75, "kl": 1.4117984771728516, "learning_rate": 5e-07, "logits/chosen": -7876837.0, "logits/rejected": -36678808.0, "logps/chosen": -404.89739990234375, "logps/rejected": -296.1504211425781, "loss": 0.3092, "rewards/chosen": 0.35682758688926697, "rewards/margins": 2.4088657796382904, "rewards/rejected": -2.0520381927490234, "step": 7233 }, { "epoch": 0.38343094903665226, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20970221.333333332, "logits/rejected": -11663179.2, "logps/chosen": -256.5833333333333, "logps/rejected": -263.22529296875, "loss": 0.2908, "rewards/chosen": 0.3101041118303935, "rewards/margins": 2.0135627071062725, "rewards/rejected": -1.703458595275879, "step": 7234 }, { "epoch": 0.3834839530384544, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23743486.0, "logits/rejected": -11948450.666666666, "logps/chosen": -471.86053466796875, "logps/rejected": -149.6225789388021, "loss": 0.2481, "rewards/chosen": 0.7078232169151306, "rewards/margins": 2.1665330926577253, "rewards/rejected": -1.4587098757425945, "step": 7235 }, { "epoch": 0.38353695704025653, "grad_norm": 41.25, "kl": 0.3876686096191406, "learning_rate": 5e-07, "logits/chosen": -18121512.0, "logits/rejected": -19726379.2, "logps/chosen": -337.39015706380206, "logps/rejected": -202.0763916015625, "loss": 0.2814, "rewards/chosen": 0.35156071186065674, "rewards/margins": 2.501446270942688, "rewards/rejected": -2.149885559082031, "step": 7236 }, { "epoch": 0.38358996104205867, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 36272400.0, "logits/rejected": -48491336.0, "logps/chosen": -168.9537811279297, "logps/rejected": -455.97418212890625, "loss": 0.2845, "rewards/chosen": -0.03252272680401802, "rewards/margins": 3.1497904770076275, "rewards/rejected": -3.1823132038116455, "step": 7237 }, { "epoch": 0.3836429650438608, "grad_norm": 44.0, "kl": 0.29598522186279297, "learning_rate": 5e-07, "logits/chosen": -43271176.0, "logits/rejected": -13327068.0, "logps/chosen": -397.08111572265625, "logps/rejected": -576.7428792317709, "loss": 0.1705, "rewards/chosen": 0.9003387093544006, "rewards/margins": 3.7059611280759177, "rewards/rejected": -2.805622418721517, "step": 7238 }, { "epoch": 0.38369596904566294, "grad_norm": 41.5, "kl": 0.12379837036132812, "learning_rate": 5e-07, "logits/chosen": -50620169.6, "logits/rejected": -51755450.666666664, "logps/chosen": -222.747509765625, "logps/rejected": -560.1496988932291, "loss": 0.2794, "rewards/chosen": 0.4553820610046387, "rewards/margins": 2.9360666592915856, "rewards/rejected": -2.4806845982869468, "step": 7239 }, { "epoch": 0.3837489730474651, "grad_norm": 64.5, "kl": 0.6685333251953125, "learning_rate": 5e-07, "logits/chosen": -30739781.333333332, "logits/rejected": -27999368.0, "logps/chosen": -504.9159342447917, "logps/rejected": -179.18212890625, "loss": 0.3414, "rewards/chosen": 0.530591090520223, "rewards/margins": 2.0752960046132407, "rewards/rejected": -1.5447049140930176, "step": 7240 }, { "epoch": 0.3838019770492672, "grad_norm": 61.75, "kl": 0.029205322265625, "learning_rate": 5e-07, "logits/chosen": -12402712.0, "logits/rejected": -5276295.6, "logps/chosen": -234.23763020833334, "logps/rejected": -136.2467529296875, "loss": 0.2953, "rewards/chosen": 0.33165283997853595, "rewards/margins": 1.7237244685490924, "rewards/rejected": -1.3920716285705566, "step": 7241 }, { "epoch": 0.38385498105106935, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20853526.4, "logits/rejected": -13441113.333333334, "logps/chosen": -249.6182373046875, "logps/rejected": -180.4300537109375, "loss": 0.3536, "rewards/chosen": 0.039548957347869874, "rewards/margins": 2.269358559449514, "rewards/rejected": -2.229809602101644, "step": 7242 }, { "epoch": 0.3839079850528715, "grad_norm": 83.5, "kl": 0.7454605102539062, "learning_rate": 5e-07, "logits/chosen": -22081825.6, "logits/rejected": 3163388.0, "logps/chosen": -311.1240966796875, "logps/rejected": -228.1551513671875, "loss": 0.3464, "rewards/chosen": 0.37607688903808595, "rewards/margins": 1.6173688888549804, "rewards/rejected": -1.2412919998168945, "step": 7243 }, { "epoch": 0.38396098905467363, "grad_norm": 49.0, "kl": 0.5136170387268066, "learning_rate": 5e-07, "logits/chosen": -15974989.333333334, "logits/rejected": -16101868.8, "logps/chosen": -308.7598063151042, "logps/rejected": -190.53115234375, "loss": 0.3071, "rewards/chosen": 0.9009488423665365, "rewards/margins": 1.7612334569295247, "rewards/rejected": -0.8602846145629883, "step": 7244 }, { "epoch": 0.38401399305647577, "grad_norm": 55.5, "kl": 0.20009613037109375, "learning_rate": 5e-07, "logits/chosen": -26298410.666666668, "logits/rejected": -43656556.8, "logps/chosen": -656.774658203125, "logps/rejected": -449.4798828125, "loss": 0.2102, "rewards/chosen": 0.472747802734375, "rewards/margins": 2.9329059600830076, "rewards/rejected": -2.4601581573486326, "step": 7245 }, { "epoch": 0.3840669970582779, "grad_norm": 48.5, "kl": 0.11622047424316406, "learning_rate": 5e-07, "logits/chosen": -43968204.8, "logits/rejected": 24682037.333333332, "logps/chosen": -205.247705078125, "logps/rejected": -372.2270914713542, "loss": 0.3859, "rewards/chosen": -0.29732980728149416, "rewards/margins": 3.53282683690389, "rewards/rejected": -3.8301566441853843, "step": 7246 }, { "epoch": 0.38412000106008004, "grad_norm": 56.25, "kl": 0.4722576141357422, "learning_rate": 5e-07, "logits/chosen": -8597056.0, "logits/rejected": -69497544.0, "logps/chosen": -202.40704345703125, "logps/rejected": -389.23590087890625, "loss": 0.2758, "rewards/chosen": 0.7314751942952474, "rewards/margins": 3.02463706334432, "rewards/rejected": -2.2931618690490723, "step": 7247 }, { "epoch": 0.3841730050618822, "grad_norm": 72.5, "kl": 2.4057083129882812, "learning_rate": 5e-07, "logits/chosen": 2618656.0, "logits/rejected": -154912224.0, "logps/chosen": -549.9899088541666, "logps/rejected": -530.1572265625, "loss": 0.3607, "rewards/chosen": 0.5686566829681396, "rewards/margins": 3.066723346710205, "rewards/rejected": -2.4980666637420654, "step": 7248 }, { "epoch": 0.3842260090636843, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6040153.0, "logits/rejected": -26275648.0, "logps/chosen": -49.824790954589844, "logps/rejected": -310.01548549107144, "loss": 0.2124, "rewards/chosen": -0.4746131896972656, "rewards/margins": 1.5494206292288646, "rewards/rejected": -2.02403381892613, "step": 7249 }, { "epoch": 0.38427901306548645, "grad_norm": 53.75, "kl": 0.20302581787109375, "learning_rate": 5e-07, "logits/chosen": -42296533.333333336, "logits/rejected": -18219144.0, "logps/chosen": -338.1909993489583, "logps/rejected": -378.4146423339844, "loss": 0.3036, "rewards/chosen": 0.5206586917241415, "rewards/margins": 3.481239994366964, "rewards/rejected": -2.9605813026428223, "step": 7250 }, { "epoch": 0.3843320170672886, "grad_norm": 47.75, "kl": 0.4193458557128906, "learning_rate": 5e-07, "logits/chosen": -23865968.0, "logits/rejected": -27204717.333333332, "logps/chosen": -265.24599609375, "logps/rejected": -383.3467203776042, "loss": 0.3007, "rewards/chosen": 0.3436920166015625, "rewards/margins": 3.010696919759115, "rewards/rejected": -2.6670049031575522, "step": 7251 }, { "epoch": 0.3843850210690907, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16611424.0, "logits/rejected": -18080474.666666668, "logps/chosen": -232.500634765625, "logps/rejected": -138.93633015950522, "loss": 0.3465, "rewards/chosen": 0.3280048131942749, "rewards/margins": 2.2314751704533893, "rewards/rejected": -1.9034703572591145, "step": 7252 }, { "epoch": 0.38443802507089286, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -636127.0, "logits/rejected": 6027302.0, "logps/chosen": -181.37374877929688, "logps/rejected": -394.7421875, "loss": 0.3231, "rewards/chosen": 0.253334641456604, "rewards/margins": 2.0620148181915283, "rewards/rejected": -1.8086801767349243, "step": 7253 }, { "epoch": 0.384491029072695, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13153560.0, "logits/rejected": -18395058.666666668, "logps/chosen": -220.689501953125, "logps/rejected": -303.1141357421875, "loss": 0.361, "rewards/chosen": 0.264342737197876, "rewards/margins": 1.4905585765838623, "rewards/rejected": -1.2262158393859863, "step": 7254 }, { "epoch": 0.38454403307449714, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13270940.0, "logits/rejected": -40758360.0, "logps/chosen": -244.69595336914062, "logps/rejected": -275.7461751302083, "loss": 0.2092, "rewards/chosen": 1.0382888317108154, "rewards/margins": 2.987887779871623, "rewards/rejected": -1.9495989481608074, "step": 7255 }, { "epoch": 0.3845970370762993, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25165284.0, "logits/rejected": -31918084.0, "logps/chosen": -658.6149291992188, "logps/rejected": -237.8731689453125, "loss": 0.3305, "rewards/chosen": 0.5379406213760376, "rewards/margins": 1.9468984603881836, "rewards/rejected": -1.408957839012146, "step": 7256 }, { "epoch": 0.3846500410781014, "grad_norm": 47.25, "kl": 1.4118614196777344, "learning_rate": 5e-07, "logits/chosen": -36498472.0, "logits/rejected": -13885580.0, "logps/chosen": -303.8453674316406, "logps/rejected": -431.9434509277344, "loss": 0.2675, "rewards/chosen": 0.46936875581741333, "rewards/margins": 3.82248193025589, "rewards/rejected": -3.3531131744384766, "step": 7257 }, { "epoch": 0.38470304507990355, "grad_norm": 43.0, "kl": 0.3262062072753906, "learning_rate": 5e-07, "logits/chosen": -29883184.0, "logits/rejected": -2694483.3333333335, "logps/chosen": -148.77362060546875, "logps/rejected": -75.51383463541667, "loss": 0.4049, "rewards/chosen": -0.0919040322303772, "rewards/margins": 1.627582045396169, "rewards/rejected": -1.7194860776265461, "step": 7258 }, { "epoch": 0.3847560490817057, "grad_norm": 43.75, "kl": 0.4349021911621094, "learning_rate": 5e-07, "logits/chosen": -11381553.0, "logits/rejected": -25388888.0, "logps/chosen": -249.91522216796875, "logps/rejected": -248.17111206054688, "loss": 0.2914, "rewards/chosen": 0.5189804434776306, "rewards/margins": 2.340007483959198, "rewards/rejected": -1.8210270404815674, "step": 7259 }, { "epoch": 0.3848090530835078, "grad_norm": 83.5, "kl": 2.6932945251464844, "learning_rate": 5e-07, "logits/chosen": -34615122.666666664, "logits/rejected": 3571569.5, "logps/chosen": -313.52931722005206, "logps/rejected": -198.73118591308594, "loss": 0.4184, "rewards/chosen": 0.5851218303044637, "rewards/margins": 1.0716635783513389, "rewards/rejected": -0.486541748046875, "step": 7260 }, { "epoch": 0.38486205708530996, "grad_norm": 70.0, "kl": 0.7464847564697266, "learning_rate": 5e-07, "logits/chosen": -68325418.66666667, "logits/rejected": -20594548.0, "logps/chosen": -369.8132731119792, "logps/rejected": -151.52200317382812, "loss": 0.3124, "rewards/chosen": 0.6709063847859701, "rewards/margins": 2.1051886876424155, "rewards/rejected": -1.4342823028564453, "step": 7261 }, { "epoch": 0.3849150610871121, "grad_norm": 59.25, "kl": 3.416107177734375, "learning_rate": 5e-07, "logits/chosen": -34886352.0, "logits/rejected": -48116618.666666664, "logps/chosen": -428.67880859375, "logps/rejected": -861.8380533854166, "loss": 0.2703, "rewards/chosen": 1.2032288551330566, "rewards/margins": 4.76053196589152, "rewards/rejected": -3.5573031107584634, "step": 7262 }, { "epoch": 0.38496806508891424, "grad_norm": 42.75, "kl": 0.8042640686035156, "learning_rate": 5e-07, "logits/chosen": -15452039.0, "logits/rejected": -18802012.0, "logps/chosen": -149.05551147460938, "logps/rejected": -409.58807373046875, "loss": 0.3104, "rewards/chosen": 0.07092609256505966, "rewards/margins": 2.431814096868038, "rewards/rejected": -2.3608880043029785, "step": 7263 }, { "epoch": 0.3850210690907164, "grad_norm": 53.25, "kl": 1.1757268905639648, "learning_rate": 5e-07, "logits/chosen": -2039232.3333333333, "logits/rejected": 3244276.0, "logps/chosen": -266.0889892578125, "logps/rejected": -233.80899047851562, "loss": 0.4133, "rewards/chosen": 0.44578059514363605, "rewards/margins": 1.374495764573415, "rewards/rejected": -0.928715169429779, "step": 7264 }, { "epoch": 0.3850740730925185, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7241081.0, "logits/rejected": -10959912.0, "logps/chosen": -109.0239486694336, "logps/rejected": -184.5206756591797, "loss": 0.4111, "rewards/chosen": -0.14899098873138428, "rewards/margins": 0.7998099327087402, "rewards/rejected": -0.9488009214401245, "step": 7265 }, { "epoch": 0.38512707709432065, "grad_norm": 56.75, "kl": 1.8867244720458984, "learning_rate": 5e-07, "logits/chosen": -35824362.666666664, "logits/rejected": -8013881.6, "logps/chosen": -434.0952962239583, "logps/rejected": -318.164306640625, "loss": 0.1858, "rewards/chosen": 0.7530639966328939, "rewards/margins": 3.3890100797017415, "rewards/rejected": -2.6359460830688475, "step": 7266 }, { "epoch": 0.3851800810961228, "grad_norm": 53.25, "kl": 0.5016555786132812, "learning_rate": 5e-07, "logits/chosen": -27720288.0, "logits/rejected": -28799929.6, "logps/chosen": -453.2761637369792, "logps/rejected": -251.4195068359375, "loss": 0.2276, "rewards/chosen": 0.7822652657826742, "rewards/margins": 2.6658105691274008, "rewards/rejected": -1.8835453033447265, "step": 7267 }, { "epoch": 0.3852330850979249, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51152115.2, "logits/rejected": -25246898.666666668, "logps/chosen": -318.023388671875, "logps/rejected": -139.18598429361978, "loss": 0.428, "rewards/chosen": -0.18662850856781005, "rewards/margins": 1.2393229722976684, "rewards/rejected": -1.4259514808654785, "step": 7268 }, { "epoch": 0.385286089099727, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1641338.0, "logits/rejected": -45317450.666666664, "logps/chosen": -105.6297119140625, "logps/rejected": -366.34716796875, "loss": 0.4, "rewards/chosen": -0.20431337356567383, "rewards/margins": 1.5407423019409179, "rewards/rejected": -1.7450556755065918, "step": 7269 }, { "epoch": 0.38533909310152914, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14206672.0, "logits/rejected": -52479720.0, "logps/chosen": -401.9423522949219, "logps/rejected": -433.14520263671875, "loss": 0.2337, "rewards/chosen": 1.0156351327896118, "rewards/margins": 3.5510283708572388, "rewards/rejected": -2.535393238067627, "step": 7270 }, { "epoch": 0.3853920971033313, "grad_norm": 53.75, "kl": 0.6218338012695312, "learning_rate": 5e-07, "logits/chosen": -7322656.666666667, "logits/rejected": -86698864.0, "logps/chosen": -444.8288981119792, "logps/rejected": -855.8980712890625, "loss": 0.3098, "rewards/chosen": 0.5800677140553793, "rewards/margins": 3.826759656270345, "rewards/rejected": -3.246691942214966, "step": 7271 }, { "epoch": 0.3854451011051334, "grad_norm": 66.0, "kl": 2.8400068283081055, "learning_rate": 5e-07, "logits/chosen": -25994181.333333332, "logits/rejected": 23778742.0, "logps/chosen": -516.6109212239584, "logps/rejected": -657.4559936523438, "loss": 0.4116, "rewards/chosen": 0.3782879114151001, "rewards/margins": 3.711811661720276, "rewards/rejected": -3.333523750305176, "step": 7272 }, { "epoch": 0.38549810510693555, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44223354.666666664, "logits/rejected": -21622539.2, "logps/chosen": -422.947509765625, "logps/rejected": -262.8913818359375, "loss": 0.2481, "rewards/chosen": 0.1943819522857666, "rewards/margins": 2.3346683979034424, "rewards/rejected": -2.140286445617676, "step": 7273 }, { "epoch": 0.3855511091087377, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30341604.0, "logits/rejected": -28075776.0, "logps/chosen": -450.7318420410156, "logps/rejected": -451.2710876464844, "loss": 0.3606, "rewards/chosen": -0.44962078332901, "rewards/margins": 1.8330053687095642, "rewards/rejected": -2.282626152038574, "step": 7274 }, { "epoch": 0.3856041131105398, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12173358.0, "logits/rejected": -10785042.0, "logps/chosen": -213.9696807861328, "logps/rejected": -476.55999755859375, "loss": 0.2261, "rewards/chosen": 0.4067218601703644, "rewards/margins": 3.7830927670001984, "rewards/rejected": -3.376370906829834, "step": 7275 }, { "epoch": 0.38565711711234196, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31931357.333333332, "logits/rejected": -57278169.6, "logps/chosen": -693.4679361979166, "logps/rejected": -587.90390625, "loss": 0.2362, "rewards/chosen": 0.00960083802541097, "rewards/margins": 2.9421960910161338, "rewards/rejected": -2.9325952529907227, "step": 7276 }, { "epoch": 0.3857101211141441, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14801959.0, "logits/rejected": -21762830.666666668, "logps/chosen": -227.92601013183594, "logps/rejected": -258.1007080078125, "loss": 0.2453, "rewards/chosen": -0.5899391174316406, "rewards/margins": 1.9512240091959634, "rewards/rejected": -2.541163126627604, "step": 7277 }, { "epoch": 0.38576312511594624, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35289644.8, "logits/rejected": -50504368.0, "logps/chosen": -335.3087646484375, "logps/rejected": -388.4683430989583, "loss": 0.251, "rewards/chosen": 0.6022399425506592, "rewards/margins": 3.460825618108114, "rewards/rejected": -2.8585856755574546, "step": 7278 }, { "epoch": 0.3858161291177484, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42703577.6, "logits/rejected": 37796936.0, "logps/chosen": -370.7085693359375, "logps/rejected": -302.27760823567706, "loss": 0.4237, "rewards/chosen": -0.08195667266845703, "rewards/margins": 1.1674251874287922, "rewards/rejected": -1.2493818600972493, "step": 7279 }, { "epoch": 0.3858691331195505, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38812277.333333336, "logits/rejected": -30121536.0, "logps/chosen": -333.49676513671875, "logps/rejected": -459.34033203125, "loss": 0.2155, "rewards/chosen": 0.2802485227584839, "rewards/margins": 2.8609686613082888, "rewards/rejected": -2.580720138549805, "step": 7280 }, { "epoch": 0.38592213712135265, "grad_norm": 38.75, "kl": 1.5167655944824219, "learning_rate": 5e-07, "logits/chosen": -10145390.0, "logits/rejected": -30862916.0, "logps/chosen": -250.4369354248047, "logps/rejected": -373.6510925292969, "loss": 0.2659, "rewards/chosen": 0.7948031425476074, "rewards/margins": 2.6331303119659424, "rewards/rejected": -1.838327169418335, "step": 7281 }, { "epoch": 0.3859751411231548, "grad_norm": 44.5, "kl": 0.04477691650390625, "learning_rate": 5e-07, "logits/chosen": -45185304.0, "logits/rejected": -52545148.0, "logps/chosen": -225.3238525390625, "logps/rejected": -282.54962158203125, "loss": 0.283, "rewards/chosen": 0.4325326085090637, "rewards/margins": 2.2934630513191223, "rewards/rejected": -1.8609304428100586, "step": 7282 }, { "epoch": 0.3860281451249569, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38421904.0, "logits/rejected": -32232612.0, "logps/chosen": -232.79339599609375, "logps/rejected": -271.2061767578125, "loss": 0.3381, "rewards/chosen": 0.42448476950327557, "rewards/margins": 2.4689917961756387, "rewards/rejected": -2.0445070266723633, "step": 7283 }, { "epoch": 0.38608114912675906, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40096936.0, "logits/rejected": -3899009.5, "logps/chosen": -296.0919189453125, "logps/rejected": -202.115478515625, "loss": 0.3243, "rewards/chosen": -0.012102499604225159, "rewards/margins": 1.8325357288122177, "rewards/rejected": -1.8446382284164429, "step": 7284 }, { "epoch": 0.3861341531285612, "grad_norm": 27.625, "kl": 0.24875640869140625, "learning_rate": 5e-07, "logits/chosen": -22424016.0, "logits/rejected": -19350640.0, "logps/chosen": -866.2067057291666, "logps/rejected": -281.6318115234375, "loss": 0.0983, "rewards/chosen": 2.467031160990397, "rewards/margins": 5.0305988947550455, "rewards/rejected": -2.5635677337646485, "step": 7285 }, { "epoch": 0.38618715713036333, "grad_norm": 35.25, "kl": 0.7775039672851562, "learning_rate": 5e-07, "logits/chosen": -10300158.0, "logits/rejected": 4276470.5, "logps/chosen": -124.99325561523438, "logps/rejected": -251.7751007080078, "loss": 0.2613, "rewards/chosen": 0.32971397042274475, "rewards/margins": 3.67313352227211, "rewards/rejected": -3.3434195518493652, "step": 7286 }, { "epoch": 0.38624016113216547, "grad_norm": 45.75, "kl": 0.12221717834472656, "learning_rate": 5e-07, "logits/chosen": -26078264.0, "logits/rejected": -34837113.6, "logps/chosen": -105.57918294270833, "logps/rejected": -389.6984375, "loss": 0.3574, "rewards/chosen": -0.5702331860860189, "rewards/margins": 0.9488353411356608, "rewards/rejected": -1.5190685272216797, "step": 7287 }, { "epoch": 0.3862931651339676, "grad_norm": 56.75, "kl": 1.624481201171875, "learning_rate": 5e-07, "logits/chosen": -58382048.0, "logits/rejected": -73050904.0, "logps/chosen": -521.4325358072916, "logps/rejected": -204.01881408691406, "loss": 0.3723, "rewards/chosen": 0.33474159240722656, "rewards/margins": 1.758388876914978, "rewards/rejected": -1.4236472845077515, "step": 7288 }, { "epoch": 0.38634616913576975, "grad_norm": 40.25, "kl": 0.6683807373046875, "learning_rate": 5e-07, "logits/chosen": 7401180.0, "logits/rejected": -4491430.4, "logps/chosen": -115.34004720052083, "logps/rejected": -283.16416015625, "loss": 0.2954, "rewards/chosen": -0.1743756135304769, "rewards/margins": 2.044792095820109, "rewards/rejected": -2.219167709350586, "step": 7289 }, { "epoch": 0.3863991731375719, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18259542.666666668, "logits/rejected": -17789033.6, "logps/chosen": -269.1118977864583, "logps/rejected": -229.6646240234375, "loss": 0.178, "rewards/chosen": 1.0474092960357666, "rewards/margins": 3.011652421951294, "rewards/rejected": -1.9642431259155273, "step": 7290 }, { "epoch": 0.386452177139374, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26810752.0, "logits/rejected": -23370874.666666668, "logps/chosen": -181.08201904296874, "logps/rejected": -111.49245198567708, "loss": 0.3829, "rewards/chosen": 0.13141379356384278, "rewards/margins": 1.3313845157623292, "rewards/rejected": -1.1999707221984863, "step": 7291 }, { "epoch": 0.38650518114117616, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17093294.0, "logits/rejected": 8191480.0, "logps/chosen": -345.53131103515625, "logps/rejected": -434.7626139322917, "loss": 0.1679, "rewards/chosen": 0.8890838623046875, "rewards/margins": 2.944917837778727, "rewards/rejected": -2.0558339754740396, "step": 7292 }, { "epoch": 0.3865581851429783, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86071984.0, "logits/rejected": -6759980.5, "logps/chosen": -370.08575439453125, "logps/rejected": -261.334716796875, "loss": 0.2684, "rewards/chosen": 0.4594796299934387, "rewards/margins": 2.3962374329566956, "rewards/rejected": -1.9367578029632568, "step": 7293 }, { "epoch": 0.38661118914478043, "grad_norm": 68.0, "kl": 1.3396987915039062, "learning_rate": 5e-07, "logits/chosen": -32836744.0, "logits/rejected": -37686184.0, "logps/chosen": -591.870361328125, "logps/rejected": -201.27499389648438, "loss": 0.2698, "rewards/chosen": 0.9211288690567017, "rewards/margins": 2.7161762714385986, "rewards/rejected": -1.795047402381897, "step": 7294 }, { "epoch": 0.38666419314658257, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15651603.2, "logits/rejected": -47342682.666666664, "logps/chosen": -269.743798828125, "logps/rejected": -241.9944051106771, "loss": 0.384, "rewards/chosen": 0.09335800409317016, "rewards/margins": 2.0080737709999084, "rewards/rejected": -1.9147157669067383, "step": 7295 }, { "epoch": 0.3867171971483847, "grad_norm": 61.5, "kl": 0.9525775909423828, "learning_rate": 5e-07, "logits/chosen": -65820948.0, "logits/rejected": -27626236.0, "logps/chosen": -632.1290893554688, "logps/rejected": -401.7460632324219, "loss": 0.3051, "rewards/chosen": 0.39752197265625, "rewards/margins": 2.708599090576172, "rewards/rejected": -2.311077117919922, "step": 7296 }, { "epoch": 0.38677020115018684, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39737034.666666664, "logits/rejected": -17292450.0, "logps/chosen": -135.98116048177084, "logps/rejected": -195.67254638671875, "loss": 0.4193, "rewards/chosen": -0.11262296636899312, "rewards/margins": 2.1109271148840585, "rewards/rejected": -2.2235500812530518, "step": 7297 }, { "epoch": 0.386823205151989, "grad_norm": 39.25, "kl": 0.2511758804321289, "learning_rate": 5e-07, "logits/chosen": -998283.1666666666, "logits/rejected": -42542710.4, "logps/chosen": -63.68171691894531, "logps/rejected": -498.414013671875, "loss": 0.204, "rewards/chosen": 0.694892962773641, "rewards/margins": 3.3936625321706138, "rewards/rejected": -2.6987695693969727, "step": 7298 }, { "epoch": 0.3868762091537911, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24804834.666666668, "logits/rejected": -47200368.0, "logps/chosen": -133.20435587565103, "logps/rejected": -530.03701171875, "loss": 0.2226, "rewards/chosen": 0.26416174570719403, "rewards/margins": 2.9400213877360026, "rewards/rejected": -2.6758596420288088, "step": 7299 }, { "epoch": 0.38692921315559325, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66742912.0, "logits/rejected": 417826.3, "logps/chosen": -413.0718994140625, "logps/rejected": -215.9441162109375, "loss": 0.2721, "rewards/chosen": 0.5865794022878011, "rewards/margins": 2.053415282567342, "rewards/rejected": -1.466835880279541, "step": 7300 }, { "epoch": 0.3869822171573954, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11514289.6, "logits/rejected": -13073254.666666666, "logps/chosen": -136.9086669921875, "logps/rejected": -112.55682373046875, "loss": 0.3933, "rewards/chosen": -0.24532482624053956, "rewards/margins": 1.943113907178243, "rewards/rejected": -2.1884387334187827, "step": 7301 }, { "epoch": 0.38703522115919753, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64817514.666666664, "logits/rejected": -18423334.0, "logps/chosen": -427.41650390625, "logps/rejected": -355.6802673339844, "loss": 0.4371, "rewards/chosen": 0.03170108546813329, "rewards/margins": 1.1736520503958066, "rewards/rejected": -1.1419509649276733, "step": 7302 }, { "epoch": 0.38708822516099967, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57157813.333333336, "logits/rejected": -24407374.4, "logps/chosen": -413.2056884765625, "logps/rejected": -345.2354736328125, "loss": 0.2705, "rewards/chosen": 0.17948404947916666, "rewards/margins": 2.5128472010294596, "rewards/rejected": -2.333363151550293, "step": 7303 }, { "epoch": 0.3871412291628018, "grad_norm": 51.25, "kl": 0.31453418731689453, "learning_rate": 5e-07, "logits/chosen": -20483402.0, "logits/rejected": -31526644.0, "logps/chosen": -230.43072509765625, "logps/rejected": -252.63095092773438, "loss": 0.3538, "rewards/chosen": -0.021875720471143723, "rewards/margins": 1.636967796832323, "rewards/rejected": -1.6588435173034668, "step": 7304 }, { "epoch": 0.38719423316460394, "grad_norm": 44.75, "kl": 0.4795989990234375, "learning_rate": 5e-07, "logits/chosen": -24742629.333333332, "logits/rejected": -6852043.5, "logps/chosen": -412.7421468098958, "logps/rejected": -164.34442138671875, "loss": 0.3416, "rewards/chosen": 0.46501970291137695, "rewards/margins": 2.3857381343841553, "rewards/rejected": -1.9207184314727783, "step": 7305 }, { "epoch": 0.3872472371664061, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41743984.0, "logits/rejected": 49429552.0, "logps/chosen": -341.48095703125, "logps/rejected": -361.8965250651042, "loss": 0.3499, "rewards/chosen": -0.8743316531181335, "rewards/margins": 0.7751962145169575, "rewards/rejected": -1.649527867635091, "step": 7306 }, { "epoch": 0.3873002411682082, "grad_norm": 54.5, "kl": 0.8930597305297852, "learning_rate": 5e-07, "logits/chosen": -12270172.8, "logits/rejected": 11514432.0, "logps/chosen": -170.1782470703125, "logps/rejected": -195.4605712890625, "loss": 0.4407, "rewards/chosen": -0.16024407148361205, "rewards/margins": 1.068167046705882, "rewards/rejected": -1.228411118189494, "step": 7307 }, { "epoch": 0.38735324517001035, "grad_norm": 26.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47150851.2, "logits/rejected": -13867946.666666666, "logps/chosen": -60.744775390625, "logps/rejected": -387.8876953125, "loss": 0.3385, "rewards/chosen": -0.04355678558349609, "rewards/margins": 3.23952267964681, "rewards/rejected": -3.283079465230306, "step": 7308 }, { "epoch": 0.3874062491718125, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35782756.0, "logits/rejected": -10136180.0, "logps/chosen": -287.4488220214844, "logps/rejected": -329.2930501302083, "loss": 0.2097, "rewards/chosen": 0.5387174487113953, "rewards/margins": 2.624290645122528, "rewards/rejected": -2.085573196411133, "step": 7309 }, { "epoch": 0.3874592531736146, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71673464.0, "logits/rejected": -31083653.333333332, "logps/chosen": -471.962646484375, "logps/rejected": -501.2505696614583, "loss": 0.1988, "rewards/chosen": 0.6983643174171448, "rewards/margins": 3.1038230856259665, "rewards/rejected": -2.4054587682088218, "step": 7310 }, { "epoch": 0.38751225717541676, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12335556.0, "logits/rejected": -12618548.0, "logps/chosen": -262.64276123046875, "logps/rejected": -295.20965576171875, "loss": 0.321, "rewards/chosen": 0.0926118791103363, "rewards/margins": 2.1886182725429535, "rewards/rejected": -2.096006393432617, "step": 7311 }, { "epoch": 0.3875652611772189, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1360419.0, "logits/rejected": -35309444.0, "logps/chosen": -76.92242431640625, "logps/rejected": -493.9582214355469, "loss": 0.342, "rewards/chosen": -0.32013288140296936, "rewards/margins": 1.9692910611629486, "rewards/rejected": -2.289423942565918, "step": 7312 }, { "epoch": 0.38761826517902104, "grad_norm": 73.0, "kl": 0.05759429931640625, "learning_rate": 5e-07, "logits/chosen": -45597408.0, "logits/rejected": -23949637.333333332, "logps/chosen": -612.2100830078125, "logps/rejected": -397.7311197916667, "loss": 0.2556, "rewards/chosen": 0.7363067865371704, "rewards/margins": 2.0991159677505493, "rewards/rejected": -1.362809181213379, "step": 7313 }, { "epoch": 0.3876712691808232, "grad_norm": 47.25, "kl": 0.1461353302001953, "learning_rate": 5e-07, "logits/chosen": 3948083.3333333335, "logits/rejected": -23189460.8, "logps/chosen": -207.23404947916666, "logps/rejected": -314.4873046875, "loss": 0.2851, "rewards/chosen": -0.21828182538350424, "rewards/margins": 1.8943280378977458, "rewards/rejected": -2.11260986328125, "step": 7314 }, { "epoch": 0.3877242731826253, "grad_norm": 44.75, "kl": 1.682459831237793, "learning_rate": 5e-07, "logits/chosen": -40913593.6, "logits/rejected": -3980383.3333333335, "logps/chosen": -228.0101318359375, "logps/rejected": -647.2120768229166, "loss": 0.3637, "rewards/chosen": 0.05842078924179077, "rewards/margins": 2.6948119521141054, "rewards/rejected": -2.6363911628723145, "step": 7315 }, { "epoch": 0.38777727718442745, "grad_norm": 54.25, "kl": 0.7234649658203125, "learning_rate": 5e-07, "logits/chosen": -6923865.5, "logits/rejected": -23414524.0, "logps/chosen": -377.5881042480469, "logps/rejected": -176.881103515625, "loss": 0.3107, "rewards/chosen": 0.464883416891098, "rewards/margins": 1.9596173465251923, "rewards/rejected": -1.4947339296340942, "step": 7316 }, { "epoch": 0.3878302811862296, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3348009.3333333335, "logits/rejected": -48960328.0, "logps/chosen": -253.17679850260416, "logps/rejected": -328.04705810546875, "loss": 0.3778, "rewards/chosen": 0.222374161084493, "rewards/margins": 2.00778075059255, "rewards/rejected": -1.7854065895080566, "step": 7317 }, { "epoch": 0.3878832851880317, "grad_norm": 57.75, "kl": 1.499155044555664, "learning_rate": 5e-07, "logits/chosen": -19607641.14285714, "logits/rejected": -14045926.0, "logps/chosen": -183.73416573660714, "logps/rejected": -85.09095764160156, "loss": 0.4477, "rewards/chosen": 0.2844935825892857, "rewards/margins": 1.397735970360892, "rewards/rejected": -1.1132423877716064, "step": 7318 }, { "epoch": 0.3879362891898338, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32267956.0, "logits/rejected": -36533712.0, "logps/chosen": -202.6279754638672, "logps/rejected": -251.2705535888672, "loss": 0.2576, "rewards/chosen": 0.418612003326416, "rewards/margins": 2.629976272583008, "rewards/rejected": -2.211364269256592, "step": 7319 }, { "epoch": 0.38798929319163594, "grad_norm": 58.25, "kl": 1.2759113311767578, "learning_rate": 5e-07, "logits/chosen": -10206780.0, "logits/rejected": -5472355.333333333, "logps/chosen": -385.8951904296875, "logps/rejected": -155.5951131184896, "loss": 0.3026, "rewards/chosen": 1.2655454635620118, "rewards/margins": 1.872486686706543, "rewards/rejected": -0.6069412231445312, "step": 7320 }, { "epoch": 0.3880422971934381, "grad_norm": 47.5, "kl": 0.00397491455078125, "learning_rate": 5e-07, "logits/chosen": -29088332.0, "logits/rejected": -42007428.0, "logps/chosen": -761.7176513671875, "logps/rejected": -240.3325653076172, "loss": 0.2946, "rewards/chosen": 0.4946935772895813, "rewards/margins": 2.7000368237495422, "rewards/rejected": -2.205343246459961, "step": 7321 }, { "epoch": 0.3880953011952402, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5451317.0, "logits/rejected": -41660150.85714286, "logps/chosen": -801.0557861328125, "logps/rejected": -481.12130301339283, "loss": 0.1817, "rewards/chosen": -0.03109130822122097, "rewards/margins": 2.0375694278627634, "rewards/rejected": -2.0686607360839844, "step": 7322 }, { "epoch": 0.38814830519704235, "grad_norm": 55.5, "kl": 1.212381362915039, "learning_rate": 5e-07, "logits/chosen": -27949013.333333332, "logits/rejected": 9652527.0, "logps/chosen": -412.6952311197917, "logps/rejected": -318.3453063964844, "loss": 0.3174, "rewards/chosen": 0.7392868200937907, "rewards/margins": 3.034260908762614, "rewards/rejected": -2.2949740886688232, "step": 7323 }, { "epoch": 0.3882013091988445, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24237525.333333332, "logits/rejected": -11842660.8, "logps/chosen": -330.059326171875, "logps/rejected": -308.937646484375, "loss": 0.2516, "rewards/chosen": 0.6603597005208334, "rewards/margins": 2.2164748509724936, "rewards/rejected": -1.5561151504516602, "step": 7324 }, { "epoch": 0.38825431320064663, "grad_norm": 52.0, "kl": 2.4744529724121094, "learning_rate": 5e-07, "logits/chosen": -21705185.333333332, "logits/rejected": -10968754.0, "logps/chosen": -221.8152872721354, "logps/rejected": -196.05641174316406, "loss": 0.4142, "rewards/chosen": 0.4226293961207072, "rewards/margins": 2.3594535986582437, "rewards/rejected": -1.9368242025375366, "step": 7325 }, { "epoch": 0.38830731720244877, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22980894.4, "logits/rejected": -26957157.333333332, "logps/chosen": -339.868017578125, "logps/rejected": -428.4599609375, "loss": 0.2531, "rewards/chosen": 0.6414570808410645, "rewards/margins": 3.574519952138265, "rewards/rejected": -2.9330628712972007, "step": 7326 }, { "epoch": 0.3883603212042509, "grad_norm": 48.25, "kl": 0.02384471893310547, "learning_rate": 5e-07, "logits/chosen": -24178548.0, "logits/rejected": 1725904.5, "logps/chosen": -417.0009460449219, "logps/rejected": -118.35962677001953, "loss": 0.3146, "rewards/chosen": 0.47842323780059814, "rewards/margins": 1.7850419282913208, "rewards/rejected": -1.3066186904907227, "step": 7327 }, { "epoch": 0.38841332520605304, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22728550.0, "logits/rejected": -19578098.0, "logps/chosen": -181.7045440673828, "logps/rejected": -227.21826171875, "loss": 0.3067, "rewards/chosen": 0.5059575438499451, "rewards/margins": 2.0112319588661194, "rewards/rejected": -1.5052744150161743, "step": 7328 }, { "epoch": 0.3884663292078552, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25856979.2, "logits/rejected": -25168389.333333332, "logps/chosen": -53.89683227539062, "logps/rejected": -197.1065673828125, "loss": 0.4249, "rewards/chosen": -0.17179374694824218, "rewards/margins": 1.0854252497355144, "rewards/rejected": -1.2572189966837566, "step": 7329 }, { "epoch": 0.3885193332096573, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48594568.0, "logits/rejected": -1646205.25, "logps/chosen": -310.4916687011719, "logps/rejected": -130.9530792236328, "loss": 0.4075, "rewards/chosen": -0.09734020382165909, "rewards/margins": 0.939339779317379, "rewards/rejected": -1.036679983139038, "step": 7330 }, { "epoch": 0.38857233721145945, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31768144.0, "logits/rejected": 11978874.666666666, "logps/chosen": -435.80615234375, "logps/rejected": -225.2157185872396, "loss": 0.3563, "rewards/chosen": 0.25327272415161134, "rewards/margins": 1.7998881657918293, "rewards/rejected": -1.546615441640218, "step": 7331 }, { "epoch": 0.3886253412132616, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24046578.0, "logits/rejected": -14600418.666666666, "logps/chosen": -309.71649169921875, "logps/rejected": -191.44854736328125, "loss": 0.2255, "rewards/chosen": -0.420388787984848, "rewards/margins": 1.9520157873630524, "rewards/rejected": -2.3724045753479004, "step": 7332 }, { "epoch": 0.3886783452150637, "grad_norm": 86.0, "kl": 0.024227142333984375, "learning_rate": 5e-07, "logits/chosen": -13641920.0, "logits/rejected": -4374579.0, "logps/chosen": -417.782470703125, "logps/rejected": -67.81144714355469, "loss": 0.338, "rewards/chosen": 0.4640848636627197, "rewards/margins": 2.193570852279663, "rewards/rejected": -1.7294859886169434, "step": 7333 }, { "epoch": 0.38873134921686586, "grad_norm": 45.5, "kl": 0.16239356994628906, "learning_rate": 5e-07, "logits/chosen": -47957312.0, "logits/rejected": -20467876.0, "logps/chosen": -313.1341247558594, "logps/rejected": -395.3942565917969, "loss": 0.2277, "rewards/chosen": 0.7854647636413574, "rewards/margins": 3.0060107707977295, "rewards/rejected": -2.220546007156372, "step": 7334 }, { "epoch": 0.388784353218668, "grad_norm": 77.5, "kl": 2.341938018798828, "learning_rate": 5e-07, "logits/chosen": -55277152.0, "logits/rejected": -19288389.333333332, "logps/chosen": -426.0083984375, "logps/rejected": -258.8048909505208, "loss": 0.3084, "rewards/chosen": 0.8828134536743164, "rewards/margins": 2.896670341491699, "rewards/rejected": -2.013856887817383, "step": 7335 }, { "epoch": 0.38883735722047014, "grad_norm": 59.0, "kl": 0.5286216735839844, "learning_rate": 5e-07, "logits/chosen": -58097448.0, "logits/rejected": -42448560.0, "logps/chosen": -459.11322021484375, "logps/rejected": -354.44488525390625, "loss": 0.3361, "rewards/chosen": 0.04198533669114113, "rewards/margins": 1.904918733984232, "rewards/rejected": -1.8629333972930908, "step": 7336 }, { "epoch": 0.3888903612222723, "grad_norm": 60.75, "kl": 2.6076297760009766, "learning_rate": 5e-07, "logits/chosen": -76385657.6, "logits/rejected": -17998394.666666668, "logps/chosen": -320.599951171875, "logps/rejected": -540.3457845052084, "loss": 0.3, "rewards/chosen": 0.7165488243103028, "rewards/margins": 3.88630158106486, "rewards/rejected": -3.169752756754557, "step": 7337 }, { "epoch": 0.3889433652240744, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1227317.0, "logits/rejected": -3252832.8571428573, "logps/chosen": -21.110567092895508, "logps/rejected": -358.0286342075893, "loss": 0.2085, "rewards/chosen": 0.7477924227714539, "rewards/margins": 2.492742036070142, "rewards/rejected": -1.7449496132986886, "step": 7338 }, { "epoch": 0.38899636922587655, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85989834.66666667, "logits/rejected": -32233523.2, "logps/chosen": -134.77730305989584, "logps/rejected": -635.695703125, "loss": 0.3084, "rewards/chosen": -0.0038733165711164474, "rewards/margins": 1.9557326633483172, "rewards/rejected": -1.9596059799194336, "step": 7339 }, { "epoch": 0.3890493732276787, "grad_norm": 66.0, "kl": 0.2679462432861328, "learning_rate": 5e-07, "logits/chosen": -26004027.42857143, "logits/rejected": -52840764.0, "logps/chosen": -228.05001395089286, "logps/rejected": -298.84405517578125, "loss": 0.478, "rewards/chosen": -0.06726178526878357, "rewards/margins": 1.5907460749149323, "rewards/rejected": -1.6580078601837158, "step": 7340 }, { "epoch": 0.3891023772294808, "grad_norm": 52.25, "kl": 0.5469436645507812, "learning_rate": 5e-07, "logits/chosen": -33772992.0, "logits/rejected": -63918496.0, "logps/chosen": -374.7928466796875, "logps/rejected": -358.59552001953125, "loss": 0.3755, "rewards/chosen": 0.3148807883262634, "rewards/margins": 1.9709225296974182, "rewards/rejected": -1.6560417413711548, "step": 7341 }, { "epoch": 0.38915538123128296, "grad_norm": 27.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96948.40625, "logits/rejected": -25278157.333333332, "logps/chosen": -46.06632614135742, "logps/rejected": -491.7793782552083, "loss": 0.1944, "rewards/chosen": 0.042771533131599426, "rewards/margins": 3.030685300628344, "rewards/rejected": -2.9879137674967446, "step": 7342 }, { "epoch": 0.3892083852330851, "grad_norm": 54.75, "kl": 0.3517417907714844, "learning_rate": 5e-07, "logits/chosen": -20679808.0, "logits/rejected": -21433172.0, "logps/chosen": -203.1053466796875, "logps/rejected": -237.10507202148438, "loss": 0.3506, "rewards/chosen": -0.17530861496925354, "rewards/margins": 2.04292568564415, "rewards/rejected": -2.2182343006134033, "step": 7343 }, { "epoch": 0.38926138923488723, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6358917.333333333, "logits/rejected": -72191488.0, "logps/chosen": -467.6981608072917, "logps/rejected": -384.3747314453125, "loss": 0.2476, "rewards/chosen": 0.8574114640553793, "rewards/margins": 2.675922664006551, "rewards/rejected": -1.8185111999511718, "step": 7344 }, { "epoch": 0.38931439323668937, "grad_norm": 52.5, "kl": 2.1471214294433594, "learning_rate": 5e-07, "logits/chosen": -37013676.8, "logits/rejected": -1041136.6666666666, "logps/chosen": -336.445654296875, "logps/rejected": -189.82975260416666, "loss": 0.3559, "rewards/chosen": 0.41676912307739256, "rewards/margins": 2.4253860473632813, "rewards/rejected": -2.0086169242858887, "step": 7345 }, { "epoch": 0.3893673972384915, "grad_norm": 43.0, "kl": 0.06175041198730469, "learning_rate": 5e-07, "logits/chosen": -38789884.8, "logits/rejected": -18017400.0, "logps/chosen": -620.314306640625, "logps/rejected": -150.767822265625, "loss": 0.304, "rewards/chosen": 0.6983633041381836, "rewards/margins": 2.571492513020833, "rewards/rejected": -1.8731292088826497, "step": 7346 }, { "epoch": 0.38942040124029365, "grad_norm": 55.5, "kl": 0.958343505859375, "learning_rate": 5e-07, "logits/chosen": -55098352.0, "logits/rejected": -31395986.0, "logps/chosen": -371.6611328125, "logps/rejected": -239.12832641601562, "loss": 0.4039, "rewards/chosen": 0.4974210858345032, "rewards/margins": 1.0975778698921204, "rewards/rejected": -0.6001567840576172, "step": 7347 }, { "epoch": 0.3894734052420958, "grad_norm": 60.25, "kl": 0.616875171661377, "learning_rate": 5e-07, "logits/chosen": -40823392.0, "logits/rejected": -8083232.5, "logps/chosen": -422.618896484375, "logps/rejected": -171.23809814453125, "loss": 0.3511, "rewards/chosen": 0.297260046005249, "rewards/margins": 1.3188745975494385, "rewards/rejected": -1.0216145515441895, "step": 7348 }, { "epoch": 0.3895264092438979, "grad_norm": 35.75, "kl": 0.016155242919921875, "learning_rate": 5e-07, "logits/chosen": -76176328.0, "logits/rejected": -2438721.5, "logps/chosen": -189.0251922607422, "logps/rejected": -239.154052734375, "loss": 0.3067, "rewards/chosen": 0.15585723519325256, "rewards/margins": 2.1567613184452057, "rewards/rejected": -2.000904083251953, "step": 7349 }, { "epoch": 0.38957941324570006, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25257904.0, "logits/rejected": -9612429.0, "logps/chosen": -293.248779296875, "logps/rejected": -159.5355224609375, "loss": 0.186, "rewards/chosen": 1.1879452466964722, "rewards/margins": 3.55508291721344, "rewards/rejected": -2.3671376705169678, "step": 7350 }, { "epoch": 0.3896324172475022, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28264556.8, "logits/rejected": -47527930.666666664, "logps/chosen": -358.9054931640625, "logps/rejected": -371.126220703125, "loss": 0.3712, "rewards/chosen": -0.23365583419799804, "rewards/margins": 2.7968833605448404, "rewards/rejected": -3.0305391947428384, "step": 7351 }, { "epoch": 0.38968542124930433, "grad_norm": 46.0, "kl": 1.4215850830078125, "learning_rate": 5e-07, "logits/chosen": -10962986.666666666, "logits/rejected": -20024032.0, "logps/chosen": -559.1236979166666, "logps/rejected": -612.8251953125, "loss": 0.1595, "rewards/chosen": 1.3163899580637615, "rewards/margins": 4.174678723017375, "rewards/rejected": -2.8582887649536133, "step": 7352 }, { "epoch": 0.38973842525110647, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 52999541.333333336, "logits/rejected": -39665376.0, "logps/chosen": -165.73495483398438, "logps/rejected": -497.1958984375, "loss": 0.2369, "rewards/chosen": 0.4571677049001058, "rewards/margins": 2.841878398259481, "rewards/rejected": -2.384710693359375, "step": 7353 }, { "epoch": 0.3897914292529086, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4496022.0, "logits/rejected": -3201953.5, "logps/chosen": -403.8935241699219, "logps/rejected": -123.45051574707031, "loss": 0.2319, "rewards/chosen": 0.8629348278045654, "rewards/margins": 2.695877432823181, "rewards/rejected": -1.8329426050186157, "step": 7354 }, { "epoch": 0.38984443325471074, "grad_norm": 48.0, "kl": 0.716156005859375, "learning_rate": 5e-07, "logits/chosen": -15593482.0, "logits/rejected": -18799134.0, "logps/chosen": -200.81494140625, "logps/rejected": -368.5838317871094, "loss": 0.2637, "rewards/chosen": 0.5097636580467224, "rewards/margins": 2.72363418340683, "rewards/rejected": -2.2138705253601074, "step": 7355 }, { "epoch": 0.3898974372565129, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42639334.4, "logits/rejected": -17480165.333333332, "logps/chosen": -129.62041015625, "logps/rejected": -289.7447509765625, "loss": 0.3697, "rewards/chosen": -0.06831777095794678, "rewards/margins": 2.367069363594055, "rewards/rejected": -2.435387134552002, "step": 7356 }, { "epoch": 0.389950441258315, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40237632.0, "logits/rejected": -24937685.333333332, "logps/chosen": -414.309619140625, "logps/rejected": -291.3338623046875, "loss": 0.3432, "rewards/chosen": 0.24792819023132323, "rewards/margins": 1.8351205666859944, "rewards/rejected": -1.5871923764546711, "step": 7357 }, { "epoch": 0.39000344526011715, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11636128.0, "logits/rejected": -21984310.0, "logps/chosen": -258.4834899902344, "logps/rejected": -143.55189514160156, "loss": 0.3741, "rewards/chosen": -0.16450195014476776, "rewards/margins": 1.2394418269395828, "rewards/rejected": -1.4039437770843506, "step": 7358 }, { "epoch": 0.3900564492619193, "grad_norm": 67.5, "kl": 2.1122283935546875, "learning_rate": 5e-07, "logits/chosen": -49565753.6, "logits/rejected": -18811825.333333332, "logps/chosen": -316.1346435546875, "logps/rejected": -312.2554931640625, "loss": 0.3144, "rewards/chosen": 0.9311388015747071, "rewards/margins": 2.3079304377237957, "rewards/rejected": -1.3767916361490886, "step": 7359 }, { "epoch": 0.39010945326372143, "grad_norm": 51.0, "kl": 0.4538726806640625, "learning_rate": 5e-07, "logits/chosen": -10059795.0, "logits/rejected": -17772308.0, "logps/chosen": -572.347412109375, "logps/rejected": -220.2015380859375, "loss": 0.2555, "rewards/chosen": 0.7883957028388977, "rewards/margins": 2.725489914417267, "rewards/rejected": -1.9370942115783691, "step": 7360 }, { "epoch": 0.39016245726552357, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11796886.0, "logits/rejected": -16400088.0, "logps/chosen": -219.29248046875, "logps/rejected": -211.33487955729166, "loss": 0.2583, "rewards/chosen": -0.2493888884782791, "rewards/margins": 1.670226698120435, "rewards/rejected": -1.9196155865987141, "step": 7361 }, { "epoch": 0.3902154612673257, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25217008.0, "logits/rejected": -31504416.0, "logps/chosen": -319.561767578125, "logps/rejected": -437.81231689453125, "loss": 0.2614, "rewards/chosen": 0.37205374240875244, "rewards/margins": 2.5806440114974976, "rewards/rejected": -2.208590269088745, "step": 7362 }, { "epoch": 0.39026846526912784, "grad_norm": 51.75, "kl": 0.4095935821533203, "learning_rate": 5e-07, "logits/chosen": -27141536.0, "logits/rejected": -185935568.0, "logps/chosen": -286.59503173828125, "logps/rejected": -199.4096221923828, "loss": 0.4189, "rewards/chosen": -0.07852948705355327, "rewards/margins": 2.443499435981115, "rewards/rejected": -2.522028923034668, "step": 7363 }, { "epoch": 0.39032146927093, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15985190.666666666, "logits/rejected": -9777340.8, "logps/chosen": -549.49755859375, "logps/rejected": -222.786572265625, "loss": 0.2582, "rewards/chosen": 0.8498611450195312, "rewards/margins": 2.4063848495483398, "rewards/rejected": -1.5565237045288085, "step": 7364 }, { "epoch": 0.3903744732727321, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23202661.333333332, "logits/rejected": -4601142.4, "logps/chosen": -228.5753377278646, "logps/rejected": -80.56902465820312, "loss": 0.3373, "rewards/chosen": -0.07863165438175201, "rewards/margins": 1.246981081366539, "rewards/rejected": -1.325612735748291, "step": 7365 }, { "epoch": 0.39042747727453425, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15934938.666666666, "logits/rejected": 27414482.0, "logps/chosen": -257.8529866536458, "logps/rejected": -309.8449401855469, "loss": 0.2949, "rewards/chosen": 0.9791064262390137, "rewards/margins": 1.9965938329696655, "rewards/rejected": -1.0174874067306519, "step": 7366 }, { "epoch": 0.3904804812763364, "grad_norm": 54.75, "kl": 0.369232177734375, "learning_rate": 5e-07, "logits/chosen": -36621238.85714286, "logits/rejected": -16115546.0, "logps/chosen": -284.24183872767856, "logps/rejected": -468.37298583984375, "loss": 0.3897, "rewards/chosen": 0.4159998212541853, "rewards/margins": 2.110445669719151, "rewards/rejected": -1.6944458484649658, "step": 7367 }, { "epoch": 0.3905334852781385, "grad_norm": 45.75, "kl": 0.44862937927246094, "learning_rate": 5e-07, "logits/chosen": -26416747.2, "logits/rejected": -16068482.666666666, "logps/chosen": -167.79638671875, "logps/rejected": -444.636962890625, "loss": 0.3732, "rewards/chosen": 0.18163025379180908, "rewards/margins": 2.54193905989329, "rewards/rejected": -2.360308806101481, "step": 7368 }, { "epoch": 0.39058648927994066, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30021886.0, "logits/rejected": -40508669.333333336, "logps/chosen": -290.28619384765625, "logps/rejected": -292.6473795572917, "loss": 0.1956, "rewards/chosen": 0.8498923778533936, "rewards/margins": 2.637421687444051, "rewards/rejected": -1.7875293095906575, "step": 7369 }, { "epoch": 0.39063949328174274, "grad_norm": 70.0, "kl": 0.531707763671875, "learning_rate": 5e-07, "logits/chosen": -28199090.285714287, "logits/rejected": -809404.375, "logps/chosen": -410.7276088169643, "logps/rejected": -75.0400390625, "loss": 0.384, "rewards/chosen": 0.35794568061828613, "rewards/margins": 3.414921283721924, "rewards/rejected": -3.0569756031036377, "step": 7370 }, { "epoch": 0.3906924972835449, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29447650.0, "logits/rejected": -31611872.0, "logps/chosen": -195.7381591796875, "logps/rejected": -526.0293579101562, "loss": 0.3502, "rewards/chosen": -0.24163293838500977, "rewards/margins": 2.0083305835723877, "rewards/rejected": -2.2499635219573975, "step": 7371 }, { "epoch": 0.390745501285347, "grad_norm": 51.0, "kl": 1.1792373657226562, "learning_rate": 5e-07, "logits/chosen": -3652374.0, "logits/rejected": -25830582.0, "logps/chosen": -765.9246826171875, "logps/rejected": -128.44061279296875, "loss": 0.3675, "rewards/chosen": 0.23655283451080322, "rewards/margins": 1.4933396577835083, "rewards/rejected": -1.256786823272705, "step": 7372 }, { "epoch": 0.39079850528714916, "grad_norm": 55.25, "kl": 1.1599950790405273, "learning_rate": 5e-07, "logits/chosen": -19337736.0, "logits/rejected": 10068888.8, "logps/chosen": -241.97672526041666, "logps/rejected": -185.6215576171875, "loss": 0.3638, "rewards/chosen": 0.5808757543563843, "rewards/margins": 1.390597128868103, "rewards/rejected": -0.8097213745117188, "step": 7373 }, { "epoch": 0.3908515092889513, "grad_norm": 40.25, "kl": 0.0972890853881836, "learning_rate": 5e-07, "logits/chosen": -36693762.666666664, "logits/rejected": -25546934.4, "logps/chosen": -580.7156575520834, "logps/rejected": -208.147509765625, "loss": 0.2003, "rewards/chosen": 1.2534431616465251, "rewards/margins": 3.534210220972697, "rewards/rejected": -2.280767059326172, "step": 7374 }, { "epoch": 0.39090451329075343, "grad_norm": 49.5, "kl": 0.2212371826171875, "learning_rate": 5e-07, "logits/chosen": 15425446.666666666, "logits/rejected": -4155571.2, "logps/chosen": -265.10052490234375, "logps/rejected": -359.0286865234375, "loss": 0.3091, "rewards/chosen": 0.39274438222249347, "rewards/margins": 1.971579869588216, "rewards/rejected": -1.5788354873657227, "step": 7375 }, { "epoch": 0.39095751729255557, "grad_norm": 41.0, "kl": 0.7799549102783203, "learning_rate": 5e-07, "logits/chosen": -24291324.8, "logits/rejected": -137736.33333333334, "logps/chosen": -164.6054443359375, "logps/rejected": -146.73933919270834, "loss": 0.3648, "rewards/chosen": 0.3457324028015137, "rewards/margins": 1.5633895715077717, "rewards/rejected": -1.217657168706258, "step": 7376 }, { "epoch": 0.3910105212943577, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5496105.0, "logits/rejected": -37272214.85714286, "logps/chosen": -31.75393295288086, "logps/rejected": -215.98751395089286, "loss": 0.21, "rewards/chosen": 0.17103195190429688, "rewards/margins": 1.9688480922154017, "rewards/rejected": -1.7978161403111048, "step": 7377 }, { "epoch": 0.39106352529615984, "grad_norm": 73.0, "kl": 0.13514328002929688, "learning_rate": 5e-07, "logits/chosen": -49293226.666666664, "logits/rejected": -14645928.0, "logps/chosen": -255.5428670247396, "logps/rejected": -108.0646743774414, "loss": 0.4635, "rewards/chosen": -0.10752209027608235, "rewards/margins": 0.9356805483500162, "rewards/rejected": -1.0432026386260986, "step": 7378 }, { "epoch": 0.391116529297962, "grad_norm": 56.25, "kl": 0.4693870544433594, "learning_rate": 5e-07, "logits/chosen": 22316668.8, "logits/rejected": -14373741.333333334, "logps/chosen": -229.4418701171875, "logps/rejected": -278.0657145182292, "loss": 0.322, "rewards/chosen": 0.5699869632720947, "rewards/margins": 2.6136283715566, "rewards/rejected": -2.0436414082845054, "step": 7379 }, { "epoch": 0.3911695332997641, "grad_norm": 46.25, "kl": 0.030879974365234375, "learning_rate": 5e-07, "logits/chosen": 3932500.6666666665, "logits/rejected": -27627372.8, "logps/chosen": -263.96824137369794, "logps/rejected": -393.2180908203125, "loss": 0.2847, "rewards/chosen": 0.49664855003356934, "rewards/margins": 2.5971959590911866, "rewards/rejected": -2.1005474090576173, "step": 7380 }, { "epoch": 0.39122253730156625, "grad_norm": 51.0, "kl": 0.07237625122070312, "learning_rate": 5e-07, "logits/chosen": -9893952.0, "logits/rejected": -33516156.8, "logps/chosen": -486.3260904947917, "logps/rejected": -501.61279296875, "loss": 0.2241, "rewards/chosen": 0.7435893217722574, "rewards/margins": 2.8308331648508704, "rewards/rejected": -2.087243843078613, "step": 7381 }, { "epoch": 0.3912755413033684, "grad_norm": 58.0, "kl": 0.1447601318359375, "learning_rate": 5e-07, "logits/chosen": -10481842.0, "logits/rejected": 395194.5625, "logps/chosen": -312.90826416015625, "logps/rejected": -123.0052719116211, "loss": 0.3738, "rewards/chosen": 0.01522979885339737, "rewards/margins": 1.5072022452950478, "rewards/rejected": -1.4919724464416504, "step": 7382 }, { "epoch": 0.3913285453051705, "grad_norm": 56.5, "kl": 3.0915069580078125, "learning_rate": 5e-07, "logits/chosen": -51049657.6, "logits/rejected": -52202485.333333336, "logps/chosen": -725.0568359375, "logps/rejected": -496.264404296875, "loss": 0.2452, "rewards/chosen": 1.3648396492004395, "rewards/margins": 3.758761501312256, "rewards/rejected": -2.3939218521118164, "step": 7383 }, { "epoch": 0.39138154930697266, "grad_norm": 57.5, "kl": 0.49074554443359375, "learning_rate": 5e-07, "logits/chosen": -46955318.4, "logits/rejected": -5656252.0, "logps/chosen": -425.374169921875, "logps/rejected": -233.094482421875, "loss": 0.3561, "rewards/chosen": 0.4767059326171875, "rewards/margins": 1.832836691538493, "rewards/rejected": -1.3561307589213054, "step": 7384 }, { "epoch": 0.3914345533087748, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34831145.6, "logits/rejected": -46323493.333333336, "logps/chosen": -313.92216796875, "logps/rejected": -622.3553873697916, "loss": 0.2707, "rewards/chosen": 0.6533349514007568, "rewards/margins": 2.8739830811818443, "rewards/rejected": -2.2206481297810874, "step": 7385 }, { "epoch": 0.39148755731057694, "grad_norm": 81.5, "kl": 0.22327423095703125, "learning_rate": 5e-07, "logits/chosen": -35131259.428571425, "logits/rejected": -118991832.0, "logps/chosen": -348.2440708705357, "logps/rejected": -225.408447265625, "loss": 0.4072, "rewards/chosen": 0.41669041769845144, "rewards/margins": 1.0172519768987383, "rewards/rejected": -0.6005615592002869, "step": 7386 }, { "epoch": 0.3915405613123791, "grad_norm": 58.25, "kl": 0.7543907165527344, "learning_rate": 5e-07, "logits/chosen": -16856776.0, "logits/rejected": -32131048.0, "logps/chosen": -335.9250895182292, "logps/rejected": -213.915283203125, "loss": 0.3372, "rewards/chosen": 0.6388738950093588, "rewards/margins": 2.1796396573384604, "rewards/rejected": -1.5407657623291016, "step": 7387 }, { "epoch": 0.3915935653141812, "grad_norm": 46.0, "kl": 0.5239906311035156, "learning_rate": 5e-07, "logits/chosen": -2999119.6666666665, "logits/rejected": -9931307.0, "logps/chosen": -172.84427897135416, "logps/rejected": -219.9337615966797, "loss": 0.3633, "rewards/chosen": 0.5254433552424113, "rewards/margins": 1.8436959187189736, "rewards/rejected": -1.3182525634765625, "step": 7388 }, { "epoch": 0.39164656931598335, "grad_norm": 34.5, "kl": 0.10946464538574219, "learning_rate": 5e-07, "logits/chosen": -8518701.0, "logits/rejected": -25778232.0, "logps/chosen": -169.6182861328125, "logps/rejected": -292.6104736328125, "loss": 0.269, "rewards/chosen": 0.6894587278366089, "rewards/margins": 2.609460234642029, "rewards/rejected": -1.92000150680542, "step": 7389 }, { "epoch": 0.3916995733177855, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12179558.4, "logits/rejected": -132524565.33333333, "logps/chosen": -179.945068359375, "logps/rejected": -192.28338623046875, "loss": 0.3481, "rewards/chosen": 0.017765653133392335, "rewards/margins": 2.2870286027590434, "rewards/rejected": -2.269262949625651, "step": 7390 }, { "epoch": 0.3917525773195876, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55756837.333333336, "logits/rejected": -12888297.6, "logps/chosen": -352.3125406901042, "logps/rejected": -304.9149658203125, "loss": 0.2903, "rewards/chosen": 0.2357355753580729, "rewards/margins": 1.9937939325968426, "rewards/rejected": -1.7580583572387696, "step": 7391 }, { "epoch": 0.39180558132138976, "grad_norm": 66.5, "kl": 2.160454750061035, "learning_rate": 5e-07, "logits/chosen": -22920464.0, "logits/rejected": 6187658.5, "logps/chosen": -301.4710286458333, "logps/rejected": -56.190650939941406, "loss": 0.2851, "rewards/chosen": 1.0728249549865723, "rewards/margins": 2.523548722267151, "rewards/rejected": -1.4507237672805786, "step": 7392 }, { "epoch": 0.3918585853231919, "grad_norm": 45.5, "kl": 0.31960487365722656, "learning_rate": 5e-07, "logits/chosen": -19096602.666666668, "logits/rejected": -18097776.0, "logps/chosen": -213.05828857421875, "logps/rejected": -153.17430114746094, "loss": 0.377, "rewards/chosen": 0.18889721234639487, "rewards/margins": 2.542208989461263, "rewards/rejected": -2.353311777114868, "step": 7393 }, { "epoch": 0.39191158932499404, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39728880.0, "logits/rejected": -31378662.4, "logps/chosen": -171.88360595703125, "logps/rejected": -238.572509765625, "loss": 0.304, "rewards/chosen": 0.20242208242416382, "rewards/margins": 1.6702521681785583, "rewards/rejected": -1.4678300857543944, "step": 7394 }, { "epoch": 0.3919645933267962, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35318296.0, "logits/rejected": -42995144.0, "logps/chosen": -247.82028198242188, "logps/rejected": -429.7159830729167, "loss": 0.2169, "rewards/chosen": -0.13887101411819458, "rewards/margins": 2.336023191610972, "rewards/rejected": -2.4748942057291665, "step": 7395 }, { "epoch": 0.3920175973285983, "grad_norm": 42.25, "kl": 0.14812088012695312, "learning_rate": 5e-07, "logits/chosen": -11258993.6, "logits/rejected": -29896608.0, "logps/chosen": -150.324462890625, "logps/rejected": -481.459716796875, "loss": 0.3154, "rewards/chosen": 0.3022942304611206, "rewards/margins": 3.1030751625696817, "rewards/rejected": -2.800780932108561, "step": 7396 }, { "epoch": 0.39207060133040045, "grad_norm": 41.75, "kl": 0.9608526229858398, "learning_rate": 5e-07, "logits/chosen": -9656524.0, "logits/rejected": -6694372.666666667, "logps/chosen": -76.81318359375, "logps/rejected": -155.40433756510416, "loss": 0.3805, "rewards/chosen": 0.19191566705703736, "rewards/margins": 1.9759961724281312, "rewards/rejected": -1.7840805053710938, "step": 7397 }, { "epoch": 0.3921236053322026, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54945146.666666664, "logits/rejected": -9183054.4, "logps/chosen": -224.9238077799479, "logps/rejected": -192.65908203125, "loss": 0.3459, "rewards/chosen": 0.19191455841064453, "rewards/margins": 1.2447525978088378, "rewards/rejected": -1.0528380393981933, "step": 7398 }, { "epoch": 0.3921766093340047, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40736010.666666664, "logits/rejected": -641392.0, "logps/chosen": -652.423095703125, "logps/rejected": -386.389013671875, "loss": 0.2171, "rewards/chosen": 0.9379720687866211, "rewards/margins": 2.859510803222656, "rewards/rejected": -1.921538734436035, "step": 7399 }, { "epoch": 0.39222961333580686, "grad_norm": 51.25, "kl": 0.47718143463134766, "learning_rate": 5e-07, "logits/chosen": -91188224.0, "logits/rejected": -10251716.0, "logps/chosen": -240.4424285888672, "logps/rejected": -385.09869384765625, "loss": 0.3313, "rewards/chosen": 0.5010805130004883, "rewards/margins": 1.7133897542953491, "rewards/rejected": -1.2123092412948608, "step": 7400 }, { "epoch": 0.392282617337609, "grad_norm": 57.25, "kl": 0.8724765777587891, "learning_rate": 5e-07, "logits/chosen": -28297040.0, "logits/rejected": -14067960.0, "logps/chosen": -408.7076416015625, "logps/rejected": -327.0530029296875, "loss": 0.2452, "rewards/chosen": 1.124519904454549, "rewards/margins": 2.377151187260946, "rewards/rejected": -1.2526312828063966, "step": 7401 }, { "epoch": 0.39233562133941113, "grad_norm": 52.0, "kl": 0.4741325378417969, "learning_rate": 5e-07, "logits/chosen": -21803320.0, "logits/rejected": -33692784.0, "logps/chosen": -260.05971272786456, "logps/rejected": -500.03009033203125, "loss": 0.3645, "rewards/chosen": 0.3187401493390401, "rewards/margins": 2.4033775528271994, "rewards/rejected": -2.084637403488159, "step": 7402 }, { "epoch": 0.39238862534121327, "grad_norm": 67.5, "kl": 2.95941162109375, "learning_rate": 5e-07, "logits/chosen": -83663632.0, "logits/rejected": -43150707.2, "logps/chosen": -582.6470947265625, "logps/rejected": -393.2671875, "loss": 0.352, "rewards/chosen": 0.2635182738304138, "rewards/margins": 1.8736356139183044, "rewards/rejected": -1.6101173400878905, "step": 7403 }, { "epoch": 0.3924416293430154, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36153460.0, "logits/rejected": -4241185.0, "logps/chosen": -444.62127685546875, "logps/rejected": -309.05165608723956, "loss": 0.1376, "rewards/chosen": 1.570297360420227, "rewards/margins": 3.9617797136306763, "rewards/rejected": -2.391482353210449, "step": 7404 }, { "epoch": 0.39249463334481755, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101895984.0, "logits/rejected": -27642186.666666668, "logps/chosen": -388.337158203125, "logps/rejected": -219.810302734375, "loss": 0.3141, "rewards/chosen": -0.49774932861328125, "rewards/margins": 1.2289695739746094, "rewards/rejected": -1.7267189025878906, "step": 7405 }, { "epoch": 0.3925476373466197, "grad_norm": 58.25, "kl": 0.20251941680908203, "learning_rate": 5e-07, "logits/chosen": -27812752.0, "logits/rejected": -37323514.666666664, "logps/chosen": -325.5067443847656, "logps/rejected": -218.7879638671875, "loss": 0.3345, "rewards/chosen": 0.19991150498390198, "rewards/margins": 1.1622319916884103, "rewards/rejected": -0.9623204867045084, "step": 7406 }, { "epoch": 0.3926006413484218, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69500552.0, "logits/rejected": -26297584.0, "logps/chosen": -441.2813415527344, "logps/rejected": -247.1466064453125, "loss": 0.2062, "rewards/chosen": 0.7834747433662415, "rewards/margins": 2.6402834057807922, "rewards/rejected": -1.8568086624145508, "step": 7407 }, { "epoch": 0.39265364535022396, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40639504.0, "logits/rejected": -34078192.0, "logps/chosen": -182.1382853190104, "logps/rejected": -472.4579772949219, "loss": 0.4263, "rewards/chosen": -0.14567740758260092, "rewards/margins": 3.387462059656779, "rewards/rejected": -3.53313946723938, "step": 7408 }, { "epoch": 0.3927066493520261, "grad_norm": 41.5, "kl": 1.0446205139160156, "learning_rate": 5e-07, "logits/chosen": -61359664.0, "logits/rejected": -50715976.0, "logps/chosen": -658.3638916015625, "logps/rejected": -619.554443359375, "loss": 0.2643, "rewards/chosen": 0.4052574634552002, "rewards/margins": 3.2214417457580566, "rewards/rejected": -2.8161842823028564, "step": 7409 }, { "epoch": 0.39275965335382823, "grad_norm": 40.0, "kl": 0.9103164672851562, "learning_rate": 5e-07, "logits/chosen": -11216205.333333334, "logits/rejected": -42461939.2, "logps/chosen": -802.5519205729166, "logps/rejected": -345.7617919921875, "loss": 0.237, "rewards/chosen": 0.7786677678426107, "rewards/margins": 2.9056023915608726, "rewards/rejected": -2.126934623718262, "step": 7410 }, { "epoch": 0.39281265735563037, "grad_norm": 60.5, "kl": 0.5647239685058594, "learning_rate": 5e-07, "logits/chosen": -44070444.8, "logits/rejected": -3746488.6666666665, "logps/chosen": -285.740625, "logps/rejected": -173.99955240885416, "loss": 0.4476, "rewards/chosen": -0.12123535871505738, "rewards/margins": 0.9855475544929504, "rewards/rejected": -1.1067829132080078, "step": 7411 }, { "epoch": 0.3928656613574325, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25436094.4, "logits/rejected": -31244157.333333332, "logps/chosen": -293.02138671875, "logps/rejected": -326.3104654947917, "loss": 0.3128, "rewards/chosen": 0.31642348766326905, "rewards/margins": 2.295210274060567, "rewards/rejected": -1.978786786397298, "step": 7412 }, { "epoch": 0.39291866535923464, "grad_norm": 43.5, "kl": 0.4886608123779297, "learning_rate": 5e-07, "logits/chosen": -17909248.0, "logits/rejected": -29493198.0, "logps/chosen": -161.17532348632812, "logps/rejected": -311.643798828125, "loss": 0.3438, "rewards/chosen": 0.17010149359703064, "rewards/margins": 1.6484709680080414, "rewards/rejected": -1.4783694744110107, "step": 7413 }, { "epoch": 0.3929716693610368, "grad_norm": 55.5, "kl": 1.5389518737792969, "learning_rate": 5e-07, "logits/chosen": -99526144.0, "logits/rejected": -23791172.0, "logps/chosen": -364.5119934082031, "logps/rejected": -345.9440002441406, "loss": 0.2052, "rewards/chosen": 1.1938042640686035, "rewards/margins": 3.983612537384033, "rewards/rejected": -2.7898082733154297, "step": 7414 }, { "epoch": 0.3930246733628389, "grad_norm": 74.5, "kl": 0.1326923370361328, "learning_rate": 5e-07, "logits/chosen": -42469612.0, "logits/rejected": -560164.3333333334, "logps/chosen": -229.3406524658203, "logps/rejected": -341.9916178385417, "loss": 0.2559, "rewards/chosen": 0.1524784117937088, "rewards/margins": 2.138283923268318, "rewards/rejected": -1.9858055114746094, "step": 7415 }, { "epoch": 0.39307767736464105, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55552856.0, "logits/rejected": 22334170.0, "logps/chosen": -173.079345703125, "logps/rejected": -267.95159912109375, "loss": 0.3729, "rewards/chosen": -0.26910480856895447, "rewards/margins": 1.4520796835422516, "rewards/rejected": -1.721184492111206, "step": 7416 }, { "epoch": 0.3931306813664432, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22377908.0, "logits/rejected": -19290436.0, "logps/chosen": -271.6641845703125, "logps/rejected": -492.89801025390625, "loss": 0.2545, "rewards/chosen": 0.775411069393158, "rewards/margins": 2.855174958705902, "rewards/rejected": -2.079763889312744, "step": 7417 }, { "epoch": 0.39318368536824533, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31338936.0, "logits/rejected": -39478928.0, "logps/chosen": -273.1121520996094, "logps/rejected": -457.33551025390625, "loss": 0.3602, "rewards/chosen": 0.1591058224439621, "rewards/margins": 1.4971518963575363, "rewards/rejected": -1.3380460739135742, "step": 7418 }, { "epoch": 0.39323668937004747, "grad_norm": 59.5, "kl": 3.78009033203125, "learning_rate": 5e-07, "logits/chosen": -61280272.0, "logits/rejected": -25310092.0, "logps/chosen": -1002.307373046875, "logps/rejected": -159.8641815185547, "loss": 0.2832, "rewards/chosen": 1.5259780883789062, "rewards/margins": 3.458234429359436, "rewards/rejected": -1.9322563409805298, "step": 7419 }, { "epoch": 0.39328969337184955, "grad_norm": 58.5, "kl": 1.2255706787109375, "learning_rate": 5e-07, "logits/chosen": -12149196.0, "logits/rejected": -50844736.0, "logps/chosen": -222.8774210611979, "logps/rejected": -479.2720947265625, "loss": 0.3924, "rewards/chosen": 0.24810938040415445, "rewards/margins": 2.0578827063242593, "rewards/rejected": -1.809773325920105, "step": 7420 }, { "epoch": 0.3933426973736517, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42787520.0, "logits/rejected": -55630566.4, "logps/chosen": -304.7950439453125, "logps/rejected": -301.6683349609375, "loss": 0.2489, "rewards/chosen": 0.3474576473236084, "rewards/margins": 2.2184682369232176, "rewards/rejected": -1.8710105895996094, "step": 7421 }, { "epoch": 0.3933957013754538, "grad_norm": 62.75, "kl": 2.0264205932617188, "learning_rate": 5e-07, "logits/chosen": -19014862.0, "logits/rejected": -14987462.0, "logps/chosen": -670.9520874023438, "logps/rejected": -395.6423034667969, "loss": 0.2054, "rewards/chosen": 1.0989990234375, "rewards/margins": 3.3888511657714844, "rewards/rejected": -2.2898521423339844, "step": 7422 }, { "epoch": 0.39344870537725596, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1500970.0, "logits/rejected": -15130834.666666666, "logps/chosen": -85.01873016357422, "logps/rejected": -156.2651163736979, "loss": 0.1992, "rewards/chosen": 0.785147488117218, "rewards/margins": 2.5480232040087385, "rewards/rejected": -1.7628757158915203, "step": 7423 }, { "epoch": 0.3935017093790581, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3170378.6666666665, "logits/rejected": -1452515.6, "logps/chosen": -148.56920369466147, "logps/rejected": -178.1240234375, "loss": 0.3318, "rewards/chosen": 0.0660655399163564, "rewards/margins": 1.321731509764989, "rewards/rejected": -1.2556659698486328, "step": 7424 }, { "epoch": 0.39355471338086023, "grad_norm": 45.5, "kl": 0.38829708099365234, "learning_rate": 5e-07, "logits/chosen": -20181144.0, "logits/rejected": -21229650.0, "logps/chosen": -215.45216369628906, "logps/rejected": -206.4807891845703, "loss": 0.3385, "rewards/chosen": 0.44232863187789917, "rewards/margins": 1.7222867608070374, "rewards/rejected": -1.2799581289291382, "step": 7425 }, { "epoch": 0.39360771738266237, "grad_norm": 68.5, "kl": 0.8442802429199219, "learning_rate": 5e-07, "logits/chosen": -5369924.0, "logits/rejected": -21676825.6, "logps/chosen": -157.55907185872397, "logps/rejected": -267.713818359375, "loss": 0.1653, "rewards/chosen": 0.9083833694458008, "rewards/margins": 3.8904401779174806, "rewards/rejected": -2.98205680847168, "step": 7426 }, { "epoch": 0.3936607213844645, "grad_norm": 49.5, "kl": 0.4102668762207031, "learning_rate": 5e-07, "logits/chosen": -17032512.0, "logits/rejected": -136338864.0, "logps/chosen": -309.4294128417969, "logps/rejected": -528.5281372070312, "loss": 0.2079, "rewards/chosen": 0.8155216574668884, "rewards/margins": 3.4266719222068787, "rewards/rejected": -2.6111502647399902, "step": 7427 }, { "epoch": 0.39371372538626664, "grad_norm": 44.25, "kl": 0.0383000373840332, "learning_rate": 5e-07, "logits/chosen": 6228488.0, "logits/rejected": -21140284.0, "logps/chosen": -184.15768432617188, "logps/rejected": -365.4347229003906, "loss": 0.2891, "rewards/chosen": 0.19514517486095428, "rewards/margins": 2.3614688366651535, "rewards/rejected": -2.166323661804199, "step": 7428 }, { "epoch": 0.3937667293880688, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21593916.0, "logits/rejected": -7385741.5, "logps/chosen": -277.2788391113281, "logps/rejected": -121.70918273925781, "loss": 0.2755, "rewards/chosen": 0.4101444482803345, "rewards/margins": 2.6195002794265747, "rewards/rejected": -2.2093558311462402, "step": 7429 }, { "epoch": 0.3938197333898709, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15073820.8, "logits/rejected": -37369200.0, "logps/chosen": -243.892724609375, "logps/rejected": -286.44748942057294, "loss": 0.3627, "rewards/chosen": 0.054388117790222165, "rewards/margins": 1.9378640572230021, "rewards/rejected": -1.88347593943278, "step": 7430 }, { "epoch": 0.39387273739167306, "grad_norm": 41.0, "kl": 0.6522073745727539, "learning_rate": 5e-07, "logits/chosen": -57195744.0, "logits/rejected": -32553894.4, "logps/chosen": -304.86814371744794, "logps/rejected": -374.896337890625, "loss": 0.3252, "rewards/chosen": 0.26443735758463544, "rewards/margins": 1.6998345057169597, "rewards/rejected": -1.4353971481323242, "step": 7431 }, { "epoch": 0.3939257413934752, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35693228.0, "logits/rejected": -35195861.333333336, "logps/chosen": -399.3283386230469, "logps/rejected": -349.3871256510417, "loss": 0.1874, "rewards/chosen": 0.8789147138595581, "rewards/margins": 2.9081352949142456, "rewards/rejected": -2.0292205810546875, "step": 7432 }, { "epoch": 0.39397874539527733, "grad_norm": 85.5, "kl": 0.4126129150390625, "learning_rate": 5e-07, "logits/chosen": -73543450.66666667, "logits/rejected": 25181232.0, "logps/chosen": -821.0013834635416, "logps/rejected": -359.578955078125, "loss": 0.2185, "rewards/chosen": 0.6041320562362671, "rewards/margins": 2.6654889822006225, "rewards/rejected": -2.0613569259643554, "step": 7433 }, { "epoch": 0.39403174939707947, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46433539.2, "logits/rejected": -48828277.333333336, "logps/chosen": -402.9515869140625, "logps/rejected": -176.426025390625, "loss": 0.273, "rewards/chosen": 0.8021627426147461, "rewards/margins": 2.272286033630371, "rewards/rejected": -1.470123291015625, "step": 7434 }, { "epoch": 0.3940847533988816, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65573296.0, "logits/rejected": -32308355.2, "logps/chosen": -389.4330240885417, "logps/rejected": -190.5017578125, "loss": 0.309, "rewards/chosen": 0.03556925058364868, "rewards/margins": 1.535724127292633, "rewards/rejected": -1.5001548767089843, "step": 7435 }, { "epoch": 0.39413775740068374, "grad_norm": 56.5, "kl": 0.03359222412109375, "learning_rate": 5e-07, "logits/chosen": -33535946.666666668, "logits/rejected": -28634540.8, "logps/chosen": -444.4068196614583, "logps/rejected": -286.427880859375, "loss": 0.1844, "rewards/chosen": 1.1236592928568523, "rewards/margins": 3.0479657808939615, "rewards/rejected": -1.9243064880371095, "step": 7436 }, { "epoch": 0.3941907614024859, "grad_norm": 60.5, "kl": 1.2535934448242188, "learning_rate": 5e-07, "logits/chosen": -40343369.6, "logits/rejected": -30425784.0, "logps/chosen": -264.6396240234375, "logps/rejected": -224.48992919921875, "loss": 0.4449, "rewards/chosen": -0.05641971230506897, "rewards/margins": 1.3877717435359955, "rewards/rejected": -1.4441914558410645, "step": 7437 }, { "epoch": 0.394243765404288, "grad_norm": 53.5, "kl": 0.3070335388183594, "learning_rate": 5e-07, "logits/chosen": -11045078.0, "logits/rejected": -26286918.0, "logps/chosen": -261.0513000488281, "logps/rejected": -416.3379211425781, "loss": 0.2298, "rewards/chosen": 0.8375002145767212, "rewards/margins": 2.773362159729004, "rewards/rejected": -1.9358619451522827, "step": 7438 }, { "epoch": 0.39429676940609015, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21865542.666666668, "logits/rejected": -27234256.0, "logps/chosen": -144.00118001302084, "logps/rejected": -550.8816528320312, "loss": 0.3494, "rewards/chosen": 0.2261433800061544, "rewards/margins": 3.3825431068738303, "rewards/rejected": -3.156399726867676, "step": 7439 }, { "epoch": 0.3943497734078923, "grad_norm": 57.0, "kl": 1.3238754272460938, "learning_rate": 5e-07, "logits/chosen": -33001696.0, "logits/rejected": -37083832.0, "logps/chosen": -407.0958658854167, "logps/rejected": -474.88555908203125, "loss": 0.4057, "rewards/chosen": 0.27609721819559735, "rewards/margins": 1.9663635889689128, "rewards/rejected": -1.6902663707733154, "step": 7440 }, { "epoch": 0.3944027774096944, "grad_norm": 64.5, "kl": 2.081817626953125, "learning_rate": 5e-07, "logits/chosen": -44531493.333333336, "logits/rejected": -29874880.0, "logps/chosen": -400.237548828125, "logps/rejected": -326.7552795410156, "loss": 0.4227, "rewards/chosen": 0.3864646355311076, "rewards/margins": 2.325551907221476, "rewards/rejected": -1.9390872716903687, "step": 7441 }, { "epoch": 0.39445578141149656, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73751456.0, "logits/rejected": -23336064.0, "logps/chosen": -388.5982666015625, "logps/rejected": -250.0338134765625, "loss": 0.3201, "rewards/chosen": 0.10941429436206818, "rewards/margins": 2.007400706410408, "rewards/rejected": -1.8979864120483398, "step": 7442 }, { "epoch": 0.3945087854132987, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37320220.0, "logits/rejected": -16816078.0, "logps/chosen": -253.80209350585938, "logps/rejected": -315.0391845703125, "loss": 0.3446, "rewards/chosen": -0.27485156059265137, "rewards/margins": 2.1312978267669678, "rewards/rejected": -2.406149387359619, "step": 7443 }, { "epoch": 0.39456178941510084, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5094106.5, "logits/rejected": -6939978.285714285, "logps/chosen": -55.461151123046875, "logps/rejected": -175.15628487723214, "loss": 0.2262, "rewards/chosen": 0.2435939759016037, "rewards/margins": 1.8969209641218185, "rewards/rejected": -1.6533269882202148, "step": 7444 }, { "epoch": 0.394614793416903, "grad_norm": 43.25, "kl": 0.06734657287597656, "learning_rate": 5e-07, "logits/chosen": -7000607.0, "logits/rejected": -13400720.0, "logps/chosen": -534.8181762695312, "logps/rejected": -188.79083251953125, "loss": 0.227, "rewards/chosen": 1.3520007133483887, "rewards/margins": 3.1232779026031494, "rewards/rejected": -1.7712771892547607, "step": 7445 }, { "epoch": 0.3946677974187051, "grad_norm": 59.25, "kl": 0.255279541015625, "learning_rate": 5e-07, "logits/chosen": -11898305.0, "logits/rejected": -20494792.0, "logps/chosen": -464.5173034667969, "logps/rejected": -202.79039001464844, "loss": 0.3086, "rewards/chosen": 0.17115822434425354, "rewards/margins": 2.0287093222141266, "rewards/rejected": -1.857551097869873, "step": 7446 }, { "epoch": 0.39472080142050725, "grad_norm": 58.75, "kl": 0.7751197814941406, "learning_rate": 5e-07, "logits/chosen": -30673270.4, "logits/rejected": -4513532.0, "logps/chosen": -367.180126953125, "logps/rejected": -252.93023681640625, "loss": 0.3273, "rewards/chosen": 0.32625365257263184, "rewards/margins": 3.1427298386891684, "rewards/rejected": -2.8164761861165366, "step": 7447 }, { "epoch": 0.3947738054223094, "grad_norm": 61.25, "kl": 1.0531902313232422, "learning_rate": 5e-07, "logits/chosen": -23391126.4, "logits/rejected": -12531482.666666666, "logps/chosen": -277.8970458984375, "logps/rejected": -109.92613728841145, "loss": 0.4368, "rewards/chosen": 0.11915664672851563, "rewards/margins": 1.3006219546000164, "rewards/rejected": -1.1814653078715007, "step": 7448 }, { "epoch": 0.3948268094241115, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17064034.0, "logits/rejected": -52246344.0, "logps/chosen": -478.3873291015625, "logps/rejected": -468.83013916015625, "loss": 0.288, "rewards/chosen": 0.04096469283103943, "rewards/margins": 2.8812728822231293, "rewards/rejected": -2.84030818939209, "step": 7449 }, { "epoch": 0.39487981342591366, "grad_norm": 39.0, "kl": 1.6325187683105469, "learning_rate": 5e-07, "logits/chosen": -24536786.0, "logits/rejected": -45969116.0, "logps/chosen": -669.3638916015625, "logps/rejected": -320.2903747558594, "loss": 0.2252, "rewards/chosen": 1.4873619079589844, "rewards/margins": 4.394440174102783, "rewards/rejected": -2.907078266143799, "step": 7450 }, { "epoch": 0.3949328174277158, "grad_norm": 47.0, "kl": 0.1017913818359375, "learning_rate": 5e-07, "logits/chosen": -4828108.333333333, "logits/rejected": -59381868.8, "logps/chosen": -235.60892740885416, "logps/rejected": -458.1025390625, "loss": 0.2706, "rewards/chosen": 0.18879820903142294, "rewards/margins": 2.1221871097882588, "rewards/rejected": -1.933388900756836, "step": 7451 }, { "epoch": 0.39498582142951794, "grad_norm": 47.0, "kl": 0.7371721267700195, "learning_rate": 5e-07, "logits/chosen": -2703758.5, "logits/rejected": -6878666.0, "logps/chosen": -214.64747619628906, "logps/rejected": -114.52117919921875, "loss": 0.3828, "rewards/chosen": -0.03525933623313904, "rewards/margins": 1.3296622335910797, "rewards/rejected": -1.3649215698242188, "step": 7452 }, { "epoch": 0.3950388254313201, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8412609.0, "logits/rejected": -10178168.0, "logps/chosen": -390.70855712890625, "logps/rejected": -134.65065002441406, "loss": 0.2801, "rewards/chosen": 0.2589634954929352, "rewards/margins": 2.423734098672867, "rewards/rejected": -2.1647706031799316, "step": 7453 }, { "epoch": 0.3950918294331222, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8317121.6, "logits/rejected": -1989568.6666666667, "logps/chosen": -307.775927734375, "logps/rejected": -148.59253946940103, "loss": 0.264, "rewards/chosen": 0.5274576187133789, "rewards/margins": 3.2479064305623373, "rewards/rejected": -2.7204488118489585, "step": 7454 }, { "epoch": 0.39514483343492435, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69562480.0, "logits/rejected": -24591973.333333332, "logps/chosen": -461.82293701171875, "logps/rejected": -506.1185302734375, "loss": 0.2032, "rewards/chosen": 0.46557924151420593, "rewards/margins": 3.087553550799688, "rewards/rejected": -2.621974309285482, "step": 7455 }, { "epoch": 0.3951978374367265, "grad_norm": 48.25, "kl": 3.1474742889404297, "learning_rate": 5e-07, "logits/chosen": -27207334.0, "logits/rejected": -15828816.0, "logps/chosen": -670.8085327148438, "logps/rejected": -92.98753356933594, "loss": 0.242, "rewards/chosen": 1.2192273139953613, "rewards/margins": 2.7275363206863403, "rewards/rejected": -1.508309006690979, "step": 7456 }, { "epoch": 0.3952508414385286, "grad_norm": 57.25, "kl": 0.391265869140625, "learning_rate": 5e-07, "logits/chosen": -62465504.0, "logits/rejected": -66057360.0, "logps/chosen": -421.6858317057292, "logps/rejected": -429.82757568359375, "loss": 0.2983, "rewards/chosen": 0.7214581171671549, "rewards/margins": 2.552585999170939, "rewards/rejected": -1.8311278820037842, "step": 7457 }, { "epoch": 0.39530384544033076, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33260642.0, "logits/rejected": -15972312.0, "logps/chosen": -129.54513549804688, "logps/rejected": -425.8337809244792, "loss": 0.2124, "rewards/chosen": 0.20527181029319763, "rewards/margins": 3.1816095610459647, "rewards/rejected": -2.976337750752767, "step": 7458 }, { "epoch": 0.3953568494421329, "grad_norm": 46.25, "kl": 0.7776565551757812, "learning_rate": 5e-07, "logits/chosen": -21241140.0, "logits/rejected": 65638948.0, "logps/chosen": -300.8702392578125, "logps/rejected": -246.81321716308594, "loss": 0.2853, "rewards/chosen": 0.37737998366355896, "rewards/margins": 2.481980413198471, "rewards/rejected": -2.104600429534912, "step": 7459 }, { "epoch": 0.39540985344393503, "grad_norm": 80.5, "kl": 1.1142330169677734, "learning_rate": 5e-07, "logits/chosen": -47996540.0, "logits/rejected": -8679032.0, "logps/chosen": -241.88229370117188, "logps/rejected": -264.3261413574219, "loss": 0.352, "rewards/chosen": 0.3896912634372711, "rewards/margins": 1.8396977484226227, "rewards/rejected": -1.4500064849853516, "step": 7460 }, { "epoch": 0.39546285744573717, "grad_norm": 62.25, "kl": 4.012109756469727, "learning_rate": 5e-07, "logits/chosen": 46483308.0, "logits/rejected": -5349245.0, "logps/chosen": -1157.511474609375, "logps/rejected": -274.6220703125, "loss": 0.2952, "rewards/chosen": 1.2241482734680176, "rewards/margins": 3.1811327934265137, "rewards/rejected": -1.956984519958496, "step": 7461 }, { "epoch": 0.3955158614475393, "grad_norm": 48.75, "kl": 0.6769886016845703, "learning_rate": 5e-07, "logits/chosen": 43134832.0, "logits/rejected": -24479738.0, "logps/chosen": -180.7042236328125, "logps/rejected": -169.26834106445312, "loss": 0.3237, "rewards/chosen": 0.29524821043014526, "rewards/margins": 1.8063201308250427, "rewards/rejected": -1.5110719203948975, "step": 7462 }, { "epoch": 0.39556886544934144, "grad_norm": 44.5, "kl": 0.738677978515625, "learning_rate": 5e-07, "logits/chosen": 587913.55, "logits/rejected": -18245030.666666668, "logps/chosen": -156.5731201171875, "logps/rejected": -207.37748209635416, "loss": 0.3239, "rewards/chosen": 0.3976346254348755, "rewards/margins": 2.4541859229405723, "rewards/rejected": -2.0565512975056968, "step": 7463 }, { "epoch": 0.3956218694511436, "grad_norm": 63.75, "kl": 1.3495254516601562, "learning_rate": 5e-07, "logits/chosen": -8963311.0, "logits/rejected": -33947872.0, "logps/chosen": -493.55523681640625, "logps/rejected": -271.27410888671875, "loss": 0.2893, "rewards/chosen": 0.5061394572257996, "rewards/margins": 2.1289647221565247, "rewards/rejected": -1.622825264930725, "step": 7464 }, { "epoch": 0.3956748734529457, "grad_norm": 34.25, "kl": 0.2049541473388672, "learning_rate": 5e-07, "logits/chosen": -1463555.25, "logits/rejected": -11662017.0, "logps/chosen": -102.96013641357422, "logps/rejected": -220.35507202148438, "loss": 0.3135, "rewards/chosen": 0.379956990480423, "rewards/margins": 1.9525037109851837, "rewards/rejected": -1.5725467205047607, "step": 7465 }, { "epoch": 0.39572787745474786, "grad_norm": 53.5, "kl": 3.265226364135742, "learning_rate": 5e-07, "logits/chosen": -28057602.666666668, "logits/rejected": -36789888.0, "logps/chosen": -348.2312418619792, "logps/rejected": -676.8095092773438, "loss": 0.3678, "rewards/chosen": 0.6031184593836466, "rewards/margins": 3.180334130922953, "rewards/rejected": -2.5772156715393066, "step": 7466 }, { "epoch": 0.39578088145655, "grad_norm": 52.5, "kl": 0.213714599609375, "learning_rate": 5e-07, "logits/chosen": -50542997.333333336, "logits/rejected": -42479801.6, "logps/chosen": -494.1510416666667, "logps/rejected": -266.505126953125, "loss": 0.2596, "rewards/chosen": 0.38666486740112305, "rewards/margins": 2.188452434539795, "rewards/rejected": -1.8017875671386718, "step": 7467 }, { "epoch": 0.39583388545835213, "grad_norm": 37.0, "kl": 0.5918149948120117, "learning_rate": 5e-07, "logits/chosen": -28904230.0, "logits/rejected": -6280226.0, "logps/chosen": -363.8982238769531, "logps/rejected": -143.7476806640625, "loss": 0.2035, "rewards/chosen": 0.8874174952507019, "rewards/margins": 3.482866942882538, "rewards/rejected": -2.595449447631836, "step": 7468 }, { "epoch": 0.39588688946015427, "grad_norm": 69.5, "kl": 4.649087905883789, "learning_rate": 5e-07, "logits/chosen": -46969008.0, "logits/rejected": -24144760.0, "logps/chosen": -435.7056070963542, "logps/rejected": -505.92572021484375, "loss": 0.3395, "rewards/chosen": 0.9231866200764974, "rewards/margins": 3.9590447743733725, "rewards/rejected": -3.035858154296875, "step": 7469 }, { "epoch": 0.3959398934619564, "grad_norm": 70.5, "kl": 1.7260818481445312, "learning_rate": 5e-07, "logits/chosen": -50221036.8, "logits/rejected": -25879658.666666668, "logps/chosen": -690.5607421875, "logps/rejected": -202.4608154296875, "loss": 0.3611, "rewards/chosen": 0.6830919742584228, "rewards/margins": 1.9267314434051515, "rewards/rejected": -1.2436394691467285, "step": 7470 }, { "epoch": 0.3959928974637585, "grad_norm": 57.0, "kl": 0.2551708221435547, "learning_rate": 5e-07, "logits/chosen": -36441184.0, "logits/rejected": -25734222.0, "logps/chosen": -346.5313720703125, "logps/rejected": -130.89390563964844, "loss": 0.4077, "rewards/chosen": 0.3117040197054545, "rewards/margins": 1.0999658505121868, "rewards/rejected": -0.7882618308067322, "step": 7471 }, { "epoch": 0.3960459014655606, "grad_norm": 53.0, "kl": 0.2859516143798828, "learning_rate": 5e-07, "logits/chosen": -35414776.0, "logits/rejected": -31349196.0, "logps/chosen": -233.42218017578125, "logps/rejected": -322.24658203125, "loss": 0.3534, "rewards/chosen": 0.07512876391410828, "rewards/margins": 1.5452482402324677, "rewards/rejected": -1.4701194763183594, "step": 7472 }, { "epoch": 0.39609890546736276, "grad_norm": 59.25, "kl": 1.3950214385986328, "learning_rate": 5e-07, "logits/chosen": -18343320.0, "logits/rejected": -46534602.666666664, "logps/chosen": -230.1690185546875, "logps/rejected": -394.1565755208333, "loss": 0.3741, "rewards/chosen": 0.2507109880447388, "rewards/margins": 2.0634457508722943, "rewards/rejected": -1.8127347628275554, "step": 7473 }, { "epoch": 0.3961519094691649, "grad_norm": 55.75, "kl": 0.11336135864257812, "learning_rate": 5e-07, "logits/chosen": 187466.8, "logits/rejected": -15318754.666666666, "logps/chosen": -148.415234375, "logps/rejected": -336.73887125651044, "loss": 0.3614, "rewards/chosen": 0.12058956623077392, "rewards/margins": 2.233947745958964, "rewards/rejected": -2.11335817972819, "step": 7474 }, { "epoch": 0.39620491347096704, "grad_norm": 42.75, "kl": 0.09407329559326172, "learning_rate": 5e-07, "logits/chosen": -71160448.0, "logits/rejected": -9266565.6, "logps/chosen": -464.2897542317708, "logps/rejected": -285.3259765625, "loss": 0.2619, "rewards/chosen": 0.10848236083984375, "rewards/margins": 2.249593734741211, "rewards/rejected": -2.141111373901367, "step": 7475 }, { "epoch": 0.39625791747276917, "grad_norm": 76.0, "kl": 5.168718338012695, "learning_rate": 5e-07, "logits/chosen": -31498697.14285714, "logits/rejected": -21418904.0, "logps/chosen": -617.7117047991071, "logps/rejected": -194.87734985351562, "loss": 0.385, "rewards/chosen": 1.0478134836469377, "rewards/margins": 1.945417889526912, "rewards/rejected": -0.8976044058799744, "step": 7476 }, { "epoch": 0.3963109214745713, "grad_norm": 71.5, "kl": 2.046356201171875, "learning_rate": 5e-07, "logits/chosen": -36075994.666666664, "logits/rejected": -35368115.2, "logps/chosen": -1071.1560872395833, "logps/rejected": -128.830224609375, "loss": 0.2832, "rewards/chosen": 0.6606079339981079, "rewards/margins": 1.9780321359634399, "rewards/rejected": -1.317424201965332, "step": 7477 }, { "epoch": 0.39636392547637345, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3127070.5, "logits/rejected": -28391948.0, "logps/chosen": -94.16694641113281, "logps/rejected": -184.323974609375, "loss": 0.4025, "rewards/chosen": -0.6542359590530396, "rewards/margins": 1.2729476690292358, "rewards/rejected": -1.9271836280822754, "step": 7478 }, { "epoch": 0.3964169294781756, "grad_norm": 65.0, "kl": 5.874179840087891, "learning_rate": 5e-07, "logits/chosen": -12402481.333333334, "logits/rejected": -29161276.0, "logps/chosen": -455.8310546875, "logps/rejected": -158.2677459716797, "loss": 0.2613, "rewards/chosen": 1.73537015914917, "rewards/margins": 3.410101294517517, "rewards/rejected": -1.6747311353683472, "step": 7479 }, { "epoch": 0.3964699334799777, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63860917.333333336, "logits/rejected": -8676294.4, "logps/chosen": -227.9668172200521, "logps/rejected": -212.312109375, "loss": 0.2489, "rewards/chosen": 0.6190782388051351, "rewards/margins": 2.303090270360311, "rewards/rejected": -1.6840120315551759, "step": 7480 }, { "epoch": 0.39652293748177986, "grad_norm": 43.75, "kl": 0.4466438293457031, "learning_rate": 5e-07, "logits/chosen": -14942296.0, "logits/rejected": -10320805.6, "logps/chosen": -254.095458984375, "logps/rejected": -225.044970703125, "loss": 0.2434, "rewards/chosen": 0.8042942682902018, "rewards/margins": 2.516659228006999, "rewards/rejected": -1.712364959716797, "step": 7481 }, { "epoch": 0.396575941483582, "grad_norm": 33.5, "kl": 0.29766082763671875, "learning_rate": 5e-07, "logits/chosen": -9635998.0, "logits/rejected": -2749733.75, "logps/chosen": -102.96800231933594, "logps/rejected": -68.39754486083984, "loss": 0.3543, "rewards/chosen": -0.04986211657524109, "rewards/margins": 1.52977254986763, "rewards/rejected": -1.579634666442871, "step": 7482 }, { "epoch": 0.39662894548538413, "grad_norm": 59.25, "kl": 0.0547332763671875, "learning_rate": 5e-07, "logits/chosen": 14301588.0, "logits/rejected": 531052.8, "logps/chosen": -304.20249430338544, "logps/rejected": -251.87138671875, "loss": 0.3298, "rewards/chosen": -0.09922452767690022, "rewards/margins": 1.2828421036402384, "rewards/rejected": -1.3820666313171386, "step": 7483 }, { "epoch": 0.39668194948718627, "grad_norm": 45.0, "kl": 0.4070854187011719, "learning_rate": 5e-07, "logits/chosen": -7715535.2, "logits/rejected": -33554018.666666668, "logps/chosen": -145.9337890625, "logps/rejected": -181.05765787760416, "loss": 0.4109, "rewards/chosen": -0.0954376220703125, "rewards/margins": 1.6241632143656413, "rewards/rejected": -1.7196008364359539, "step": 7484 }, { "epoch": 0.3967349534889884, "grad_norm": 69.5, "kl": 1.9492273330688477, "learning_rate": 5e-07, "logits/chosen": -28572389.333333332, "logits/rejected": -3662719.0, "logps/chosen": -462.2853597005208, "logps/rejected": -154.9468536376953, "loss": 0.4148, "rewards/chosen": 0.3331323067347209, "rewards/margins": 1.8899282614390056, "rewards/rejected": -1.5567959547042847, "step": 7485 }, { "epoch": 0.39678795749079054, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26772946.666666668, "logits/rejected": -3693879.2, "logps/chosen": -287.1731770833333, "logps/rejected": -283.8146484375, "loss": 0.3669, "rewards/chosen": -0.7069152990976969, "rewards/margins": 0.9004993279774983, "rewards/rejected": -1.6074146270751952, "step": 7486 }, { "epoch": 0.3968409614925927, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -346936.4375, "logits/rejected": -36738454.85714286, "logps/chosen": -30.358566284179688, "logps/rejected": -416.28125, "loss": 0.2426, "rewards/chosen": -0.2998192012310028, "rewards/margins": 1.7790856318814412, "rewards/rejected": -2.078904833112444, "step": 7487 }, { "epoch": 0.3968939654943948, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17401742.0, "logits/rejected": -23386568.0, "logps/chosen": -174.19778442382812, "logps/rejected": -282.75091552734375, "loss": 0.2966, "rewards/chosen": 0.3020763695240021, "rewards/margins": 2.111947685480118, "rewards/rejected": -1.8098713159561157, "step": 7488 }, { "epoch": 0.39694696949619696, "grad_norm": 78.0, "kl": 4.015449523925781, "learning_rate": 5e-07, "logits/chosen": -34331709.333333336, "logits/rejected": -17402822.0, "logps/chosen": -722.4918619791666, "logps/rejected": -69.26245880126953, "loss": 0.3634, "rewards/chosen": 0.9011483192443848, "rewards/margins": 1.7585361003875732, "rewards/rejected": -0.8573877811431885, "step": 7489 }, { "epoch": 0.3969999734979991, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20800204.0, "logits/rejected": -21085358.4, "logps/chosen": -79.26236470540364, "logps/rejected": -416.029833984375, "loss": 0.248, "rewards/chosen": 0.05438753962516785, "rewards/margins": 2.6503985106945036, "rewards/rejected": -2.5960109710693358, "step": 7490 }, { "epoch": 0.39705297749980123, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30272626.666666668, "logits/rejected": -17983211.2, "logps/chosen": -339.8828531901042, "logps/rejected": -230.3406494140625, "loss": 0.2149, "rewards/chosen": 0.8258336385091146, "rewards/margins": 2.8815198262532555, "rewards/rejected": -2.055686187744141, "step": 7491 }, { "epoch": 0.39710598150160337, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21217618.666666668, "logits/rejected": -11420376.0, "logps/chosen": -361.0006510416667, "logps/rejected": -184.66461181640625, "loss": 0.2406, "rewards/chosen": 0.4976593255996704, "rewards/margins": 2.5492704629898073, "rewards/rejected": -2.051611137390137, "step": 7492 }, { "epoch": 0.3971589855034055, "grad_norm": 46.75, "kl": 2.678666114807129, "learning_rate": 5e-07, "logits/chosen": -13729438.0, "logits/rejected": 44752832.0, "logps/chosen": -328.3053894042969, "logps/rejected": -327.3289794921875, "loss": 0.2666, "rewards/chosen": 0.8736335635185242, "rewards/margins": 3.1679758429527283, "rewards/rejected": -2.294342279434204, "step": 7493 }, { "epoch": 0.39721198950520764, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8353745.6, "logits/rejected": -9171135.333333334, "logps/chosen": -194.75703125, "logps/rejected": -151.23250325520834, "loss": 0.3116, "rewards/chosen": 0.5641017436981202, "rewards/margins": 1.8938595136006673, "rewards/rejected": -1.3297577699025471, "step": 7494 }, { "epoch": 0.3972649935070098, "grad_norm": 151.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -12076880.0, "logps/rejected": -220.74188232421875, "loss": 0.1539, "rewards/rejected": -2.089338541030884, "step": 7495 }, { "epoch": 0.3973179975088119, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18757830.0, "logits/rejected": -7625918.666666667, "logps/chosen": -122.46088409423828, "logps/rejected": -164.91939290364584, "loss": 0.2114, "rewards/chosen": 0.31514739990234375, "rewards/margins": 2.2468546231587725, "rewards/rejected": -1.931707223256429, "step": 7496 }, { "epoch": 0.39737100151061405, "grad_norm": 50.5, "kl": 2.6260986328125, "learning_rate": 5e-07, "logits/chosen": -71736122.66666667, "logits/rejected": -17968512.0, "logps/chosen": -964.2291666666666, "logps/rejected": -356.6375732421875, "loss": 0.1632, "rewards/chosen": 1.3456390698750813, "rewards/margins": 3.6706985791524254, "rewards/rejected": -2.325059509277344, "step": 7497 }, { "epoch": 0.3974240055124162, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9484711.0, "logits/rejected": -35003360.0, "logps/chosen": -173.5245361328125, "logps/rejected": -469.30389404296875, "loss": 0.2262, "rewards/chosen": 0.47905194759368896, "rewards/margins": 4.702268958091736, "rewards/rejected": -4.223217010498047, "step": 7498 }, { "epoch": 0.3974770095142183, "grad_norm": 57.75, "kl": 0.8378067016601562, "learning_rate": 5e-07, "logits/chosen": -38652656.0, "logits/rejected": -38079797.333333336, "logps/chosen": -294.199267578125, "logps/rejected": -361.4364827473958, "loss": 0.3441, "rewards/chosen": 0.428812837600708, "rewards/margins": 1.8377046744028727, "rewards/rejected": -1.4088918368021648, "step": 7499 }, { "epoch": 0.39753001351602046, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20467224.0, "logits/rejected": -7788092.0, "logps/chosen": -163.99899291992188, "logps/rejected": -120.9932373046875, "loss": 0.2272, "rewards/chosen": 0.4671601454416911, "rewards/margins": 2.586777035395304, "rewards/rejected": -2.119616889953613, "step": 7500 }, { "epoch": 0.3975830175178226, "grad_norm": 47.75, "kl": 1.719451904296875, "learning_rate": 5e-07, "logits/chosen": -12893100.8, "logits/rejected": -1036233.3333333334, "logps/chosen": -233.566259765625, "logps/rejected": -82.8386942545573, "loss": 0.3343, "rewards/chosen": 0.6893166065216064, "rewards/margins": 2.1050194263458253, "rewards/rejected": -1.4157028198242188, "step": 7501 }, { "epoch": 0.39763602151962474, "grad_norm": 41.5, "kl": 0.07631874084472656, "learning_rate": 5e-07, "logits/chosen": -38703622.4, "logits/rejected": -10028928.0, "logps/chosen": -198.66929931640624, "logps/rejected": -132.25748697916666, "loss": 0.3982, "rewards/chosen": -0.02598726749420166, "rewards/margins": 1.2393643458684285, "rewards/rejected": -1.2653516133626301, "step": 7502 }, { "epoch": 0.3976890255214269, "grad_norm": 54.5, "kl": 0.02625274658203125, "learning_rate": 5e-07, "logits/chosen": -25881896.0, "logits/rejected": 6441976.0, "logps/chosen": -357.1754557291667, "logps/rejected": -288.7826171875, "loss": 0.2578, "rewards/chosen": 0.35372571150461835, "rewards/margins": 2.17591237227122, "rewards/rejected": -1.8221866607666015, "step": 7503 }, { "epoch": 0.397742029523229, "grad_norm": 47.0, "kl": 1.8326644897460938, "learning_rate": 5e-07, "logits/chosen": -102694464.0, "logits/rejected": -44193808.0, "logps/chosen": -194.5562286376953, "logps/rejected": -592.5169677734375, "loss": 0.3408, "rewards/chosen": 0.1718454360961914, "rewards/margins": 2.1175642013549805, "rewards/rejected": -1.945718765258789, "step": 7504 }, { "epoch": 0.39779503352503115, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9112130.0, "logits/rejected": -2417271.6666666665, "logps/chosen": -208.4552764892578, "logps/rejected": -207.93084716796875, "loss": 0.2475, "rewards/chosen": 0.6862087845802307, "rewards/margins": 2.263175070285797, "rewards/rejected": -1.5769662857055664, "step": 7505 }, { "epoch": 0.3978480375268333, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31241116.0, "logits/rejected": -6755309.333333333, "logps/chosen": -214.960693359375, "logps/rejected": -210.4080607096354, "loss": 0.2826, "rewards/chosen": -0.00044269859790802, "rewards/margins": 1.5322584758202236, "rewards/rejected": -1.5327011744181316, "step": 7506 }, { "epoch": 0.3979010415286354, "grad_norm": 55.25, "kl": 0.4801521301269531, "learning_rate": 5e-07, "logits/chosen": -23162990.4, "logits/rejected": 13607032.0, "logps/chosen": -270.551123046875, "logps/rejected": -138.42573038736978, "loss": 0.396, "rewards/chosen": 0.5323896884918213, "rewards/margins": 0.8771235227584839, "rewards/rejected": -0.3447338342666626, "step": 7507 }, { "epoch": 0.39795404553043756, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13241045.0, "logits/rejected": -14268570.666666666, "logps/chosen": -272.9722900390625, "logps/rejected": -290.89955647786456, "loss": 0.1769, "rewards/chosen": 1.2931389808654785, "rewards/margins": 3.1064600944519043, "rewards/rejected": -1.8133211135864258, "step": 7508 }, { "epoch": 0.3980070495322397, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11745161.333333334, "logits/rejected": -38733737.6, "logps/chosen": -46.810821533203125, "logps/rejected": -429.64619140625, "loss": 0.2338, "rewards/chosen": 0.3581657409667969, "rewards/margins": 2.5726692199707033, "rewards/rejected": -2.2145034790039064, "step": 7509 }, { "epoch": 0.39806005353404184, "grad_norm": 46.25, "kl": 1.1627750396728516, "learning_rate": 5e-07, "logits/chosen": -24176302.0, "logits/rejected": -14014494.0, "logps/chosen": -204.6255340576172, "logps/rejected": -318.0587158203125, "loss": 0.3015, "rewards/chosen": 0.8450480103492737, "rewards/margins": 2.136654555797577, "rewards/rejected": -1.2916065454483032, "step": 7510 }, { "epoch": 0.398113057535844, "grad_norm": 41.5, "kl": 0.698150634765625, "learning_rate": 5e-07, "logits/chosen": -54417268.0, "logits/rejected": -49025036.0, "logps/chosen": -427.1233825683594, "logps/rejected": -478.77874755859375, "loss": 0.1684, "rewards/chosen": 1.342756748199463, "rewards/margins": 4.60936975479126, "rewards/rejected": -3.266613006591797, "step": 7511 }, { "epoch": 0.3981660615376461, "grad_norm": 51.25, "kl": 0.7772674560546875, "learning_rate": 5e-07, "logits/chosen": -25868896.0, "logits/rejected": -6191436.4, "logps/chosen": -334.231689453125, "logps/rejected": -207.87333984375, "loss": 0.313, "rewards/chosen": 0.3156093756357829, "rewards/margins": 1.8167397658030193, "rewards/rejected": -1.5011303901672364, "step": 7512 }, { "epoch": 0.39821906553944825, "grad_norm": 48.5, "kl": 0.649017333984375, "learning_rate": 5e-07, "logits/chosen": -15072547.2, "logits/rejected": -74112208.0, "logps/chosen": -274.433056640625, "logps/rejected": -452.8123372395833, "loss": 0.2809, "rewards/chosen": 0.7686243534088135, "rewards/margins": 2.9160323619842528, "rewards/rejected": -2.1474080085754395, "step": 7513 }, { "epoch": 0.3982720695412504, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -488868.0, "logits/rejected": -18757458.0, "logps/chosen": -65.1080093383789, "logps/rejected": -365.7424621582031, "loss": 0.2785, "rewards/chosen": 0.47515127062797546, "rewards/margins": 2.472964972257614, "rewards/rejected": -1.9978137016296387, "step": 7514 }, { "epoch": 0.3983250735430525, "grad_norm": 38.25, "kl": 0.046428680419921875, "learning_rate": 5e-07, "logits/chosen": 1080067.0, "logits/rejected": -20300016.0, "logps/chosen": -426.2423909505208, "logps/rejected": -166.24349365234374, "loss": 0.1449, "rewards/chosen": 1.7749797503153484, "rewards/margins": 3.7093434969584145, "rewards/rejected": -1.9343637466430663, "step": 7515 }, { "epoch": 0.39837807754485466, "grad_norm": 56.0, "kl": 0.11200141906738281, "learning_rate": 5e-07, "logits/chosen": -1625523.2, "logits/rejected": -2652052.3333333335, "logps/chosen": -381.3271484375, "logps/rejected": -168.81330362955728, "loss": 0.3435, "rewards/chosen": 0.23967506885528564, "rewards/margins": 1.9484664837519328, "rewards/rejected": -1.7087914148966472, "step": 7516 }, { "epoch": 0.3984310815466568, "grad_norm": 61.5, "kl": 1.009918212890625, "learning_rate": 5e-07, "logits/chosen": -48885744.0, "logits/rejected": -1107218.5, "logps/chosen": -397.1174621582031, "logps/rejected": -126.55250549316406, "loss": 0.2906, "rewards/chosen": 0.588086724281311, "rewards/margins": 2.4666959047317505, "rewards/rejected": -1.8786091804504395, "step": 7517 }, { "epoch": 0.39848408554845893, "grad_norm": 46.75, "kl": 0.08999156951904297, "learning_rate": 5e-07, "logits/chosen": -10210778.0, "logits/rejected": -28976298.666666668, "logps/chosen": -498.43170166015625, "logps/rejected": -387.549072265625, "loss": 0.1452, "rewards/chosen": 0.9748344421386719, "rewards/margins": 3.272077719370524, "rewards/rejected": -2.297243277231852, "step": 7518 }, { "epoch": 0.39853708955026107, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10276578.0, "logits/rejected": -7292434.0, "logps/chosen": -285.14495849609375, "logps/rejected": -126.38906860351562, "loss": 0.2832, "rewards/chosen": 0.6111335754394531, "rewards/margins": 2.1808714866638184, "rewards/rejected": -1.5697379112243652, "step": 7519 }, { "epoch": 0.3985900935520632, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34472534.4, "logits/rejected": -37675770.666666664, "logps/chosen": -293.8533203125, "logps/rejected": -459.718505859375, "loss": 0.3062, "rewards/chosen": 0.38614401817321775, "rewards/margins": 2.4378827571868897, "rewards/rejected": -2.051738739013672, "step": 7520 }, { "epoch": 0.39864309755386534, "grad_norm": 71.0, "kl": 2.136455535888672, "learning_rate": 5e-07, "logits/chosen": -42806602.666666664, "logits/rejected": -12958760.0, "logps/chosen": -233.65730794270834, "logps/rejected": -306.4099426269531, "loss": 0.3769, "rewards/chosen": 0.6933335463205973, "rewards/margins": 2.1586773792902627, "rewards/rejected": -1.4653438329696655, "step": 7521 }, { "epoch": 0.3986961015556674, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23990588.0, "logits/rejected": -18842382.0, "logps/chosen": -300.2496337890625, "logps/rejected": -168.4010467529297, "loss": 0.3124, "rewards/chosen": 0.5363361239433289, "rewards/margins": 1.9014939665794373, "rewards/rejected": -1.3651578426361084, "step": 7522 }, { "epoch": 0.39874910555746956, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42669516.0, "logits/rejected": -23097536.0, "logps/chosen": -318.9193420410156, "logps/rejected": -601.615234375, "loss": 0.349, "rewards/chosen": -0.34481602907180786, "rewards/margins": 2.646580398082733, "rewards/rejected": -2.991396427154541, "step": 7523 }, { "epoch": 0.3988021095592717, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18464398.0, "logits/rejected": -21544768.0, "logps/chosen": -209.10328674316406, "logps/rejected": -442.3838704427083, "loss": 0.2067, "rewards/chosen": 0.32957905530929565, "rewards/margins": 3.1628354589144387, "rewards/rejected": -2.833256403605143, "step": 7524 }, { "epoch": 0.39885511356107384, "grad_norm": 55.5, "kl": 2.146402359008789, "learning_rate": 5e-07, "logits/chosen": -32582840.0, "logits/rejected": -36205584.0, "logps/chosen": -425.9226379394531, "logps/rejected": -556.9690551757812, "loss": 0.2516, "rewards/chosen": 0.674238920211792, "rewards/margins": 3.8816535472869873, "rewards/rejected": -3.2074146270751953, "step": 7525 }, { "epoch": 0.398908117562876, "grad_norm": 38.5, "kl": 1.4156875610351562, "learning_rate": 5e-07, "logits/chosen": -21060089.6, "logits/rejected": -18757116.0, "logps/chosen": -191.2255126953125, "logps/rejected": -253.5594482421875, "loss": 0.4023, "rewards/chosen": -0.13222427368164064, "rewards/margins": 2.355493672688802, "rewards/rejected": -2.487717946370443, "step": 7526 }, { "epoch": 0.3989611215646781, "grad_norm": 65.0, "kl": 3.201984405517578, "learning_rate": 5e-07, "logits/chosen": -42073961.14285714, "logits/rejected": 8240172.0, "logps/chosen": -357.56612723214283, "logps/rejected": -74.43460845947266, "loss": 0.4199, "rewards/chosen": 0.6833602360316685, "rewards/margins": 1.0483611694404056, "rewards/rejected": -0.3650009334087372, "step": 7527 }, { "epoch": 0.39901412556648025, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33512464.0, "logits/rejected": -22864366.4, "logps/chosen": -397.277587890625, "logps/rejected": -370.4939453125, "loss": 0.1926, "rewards/chosen": 1.1009124914805095, "rewards/margins": 3.0643024603525797, "rewards/rejected": -1.9633899688720704, "step": 7528 }, { "epoch": 0.3990671295682824, "grad_norm": 43.75, "kl": 1.1047143936157227, "learning_rate": 5e-07, "logits/chosen": -12047148.0, "logits/rejected": -33033170.0, "logps/chosen": -228.346435546875, "logps/rejected": -315.2062072753906, "loss": 0.2391, "rewards/chosen": 0.650617241859436, "rewards/margins": 3.141271948814392, "rewards/rejected": -2.490654706954956, "step": 7529 }, { "epoch": 0.3991201335700845, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9601181.333333334, "logits/rejected": -24782374.4, "logps/chosen": -112.94004313151042, "logps/rejected": -293.69501953125, "loss": 0.2402, "rewards/chosen": 0.7738888263702393, "rewards/margins": 2.38213849067688, "rewards/rejected": -1.6082496643066406, "step": 7530 }, { "epoch": 0.39917313757188666, "grad_norm": 49.25, "kl": 1.3033618927001953, "learning_rate": 5e-07, "logits/chosen": -20280806.0, "logits/rejected": -15923618.0, "logps/chosen": -332.36590576171875, "logps/rejected": -138.56793212890625, "loss": 0.239, "rewards/chosen": 0.888344943523407, "rewards/margins": 3.0231353640556335, "rewards/rejected": -2.1347904205322266, "step": 7531 }, { "epoch": 0.3992261415736888, "grad_norm": 53.75, "kl": 0.6012306213378906, "learning_rate": 5e-07, "logits/chosen": -59266112.0, "logits/rejected": -28299916.0, "logps/chosen": -290.64312744140625, "logps/rejected": -196.25277709960938, "loss": 0.3824, "rewards/chosen": 0.09796619415283203, "rewards/margins": 1.4276137351989746, "rewards/rejected": -1.3296475410461426, "step": 7532 }, { "epoch": 0.39927914557549093, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11018162.0, "logits/rejected": -17774680.0, "logps/chosen": -312.5263366699219, "logps/rejected": -183.4775390625, "loss": 0.1137, "rewards/chosen": 2.182016134262085, "rewards/margins": 4.328711543764387, "rewards/rejected": -2.146695409502302, "step": 7533 }, { "epoch": 0.39933214957729307, "grad_norm": 56.25, "kl": 2.2843456268310547, "learning_rate": 5e-07, "logits/chosen": 3530887.2, "logits/rejected": -38081941.333333336, "logps/chosen": -179.06959228515626, "logps/rejected": -239.18318684895834, "loss": 0.3705, "rewards/chosen": 0.5190125465393066, "rewards/margins": 1.7913971900939942, "rewards/rejected": -1.2723846435546875, "step": 7534 }, { "epoch": 0.3993851535790952, "grad_norm": 48.5, "kl": 0.7816371917724609, "learning_rate": 5e-07, "logits/chosen": -27386576.0, "logits/rejected": -22094972.0, "logps/chosen": -406.32977294921875, "logps/rejected": -345.9312744140625, "loss": 0.2802, "rewards/chosen": 0.6544777154922485, "rewards/margins": 2.5933645963668823, "rewards/rejected": -1.9388868808746338, "step": 7535 }, { "epoch": 0.39943815758089735, "grad_norm": 54.5, "kl": 0.5054779052734375, "learning_rate": 5e-07, "logits/chosen": -6720884.0, "logits/rejected": -13185977.0, "logps/chosen": -230.3636474609375, "logps/rejected": -144.3421630859375, "loss": 0.4233, "rewards/chosen": 0.08039347330729167, "rewards/margins": 1.2864398161570232, "rewards/rejected": -1.2060463428497314, "step": 7536 }, { "epoch": 0.3994911615826995, "grad_norm": 65.0, "kl": 1.4355621337890625, "learning_rate": 5e-07, "logits/chosen": 176688.57142857142, "logits/rejected": -8803814.0, "logps/chosen": -508.20717075892856, "logps/rejected": -282.3309020996094, "loss": 0.2907, "rewards/chosen": 0.9613430159432548, "rewards/margins": 3.734588350568499, "rewards/rejected": -2.773245334625244, "step": 7537 }, { "epoch": 0.3995441655845016, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3903930.3333333335, "logits/rejected": -12156331.0, "logps/chosen": -443.172119140625, "logps/rejected": -448.59246826171875, "loss": 0.42, "rewards/chosen": 0.034337458511193596, "rewards/margins": 2.3008539055784545, "rewards/rejected": -2.2665164470672607, "step": 7538 }, { "epoch": 0.39959716958630376, "grad_norm": 48.0, "kl": 0.17085647583007812, "learning_rate": 5e-07, "logits/chosen": -9181917.333333334, "logits/rejected": -28768006.4, "logps/chosen": -113.75765991210938, "logps/rejected": -412.52138671875, "loss": 0.2482, "rewards/chosen": 0.3956894079844157, "rewards/margins": 2.439468018213908, "rewards/rejected": -2.0437786102294924, "step": 7539 }, { "epoch": 0.3996501735881059, "grad_norm": 68.0, "kl": 0.22882843017578125, "learning_rate": 5e-07, "logits/chosen": -13406959.0, "logits/rejected": -26178284.0, "logps/chosen": -524.6920776367188, "logps/rejected": -290.16168212890625, "loss": 0.3104, "rewards/chosen": 0.2100597470998764, "rewards/margins": 2.3011098951101303, "rewards/rejected": -2.091050148010254, "step": 7540 }, { "epoch": 0.39970317758990803, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7549705.333333333, "logits/rejected": -26009484.0, "logps/chosen": -97.16769409179688, "logps/rejected": -298.68359375, "loss": 0.4795, "rewards/chosen": -0.10126914580663045, "rewards/margins": 0.6796650687853495, "rewards/rejected": -0.78093421459198, "step": 7541 }, { "epoch": 0.39975618159171017, "grad_norm": 55.25, "kl": 0.21477508544921875, "learning_rate": 5e-07, "logits/chosen": -39833810.666666664, "logits/rejected": 82889120.0, "logps/chosen": -253.92974853515625, "logps/rejected": -548.449951171875, "loss": 0.4017, "rewards/chosen": -0.02651418497165044, "rewards/margins": 2.6284876937667527, "rewards/rejected": -2.6550018787384033, "step": 7542 }, { "epoch": 0.3998091855935123, "grad_norm": 47.25, "kl": 0.34346771240234375, "learning_rate": 5e-07, "logits/chosen": -6216071.0, "logits/rejected": -6655605.5, "logps/chosen": -223.04074096679688, "logps/rejected": -196.94525146484375, "loss": 0.3502, "rewards/chosen": -0.02439965307712555, "rewards/margins": 1.8202428668737411, "rewards/rejected": -1.8446425199508667, "step": 7543 }, { "epoch": 0.39986218959531444, "grad_norm": 41.0, "kl": 0.7953033447265625, "learning_rate": 5e-07, "logits/chosen": -50726032.0, "logits/rejected": -27928509.333333332, "logps/chosen": -459.217529296875, "logps/rejected": -198.1222941080729, "loss": 0.2186, "rewards/chosen": 1.9324843883514404, "rewards/margins": 3.2590387662251787, "rewards/rejected": -1.3265543778737385, "step": 7544 }, { "epoch": 0.3999151935971166, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6574499.333333333, "logits/rejected": -10139460.8, "logps/chosen": -187.09004720052084, "logps/rejected": -214.0394775390625, "loss": 0.2683, "rewards/chosen": -0.23205262422561646, "rewards/margins": 2.285127246379852, "rewards/rejected": -2.5171798706054687, "step": 7545 }, { "epoch": 0.3999681975989187, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29159728.0, "logits/rejected": -46333704.0, "logps/chosen": -200.33026123046875, "logps/rejected": -329.30023193359375, "loss": 0.3056, "rewards/chosen": 0.11742133647203445, "rewards/margins": 2.4066411927342415, "rewards/rejected": -2.289219856262207, "step": 7546 }, { "epoch": 0.40002120160072085, "grad_norm": 52.25, "kl": 0.5721778869628906, "learning_rate": 5e-07, "logits/chosen": -26485561.6, "logits/rejected": -92320181.33333333, "logps/chosen": -267.7788818359375, "logps/rejected": -107.64996337890625, "loss": 0.3833, "rewards/chosen": 0.5087778091430664, "rewards/margins": 1.8975876490275065, "rewards/rejected": -1.3888098398844402, "step": 7547 }, { "epoch": 0.400074205602523, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 398641.8125, "logits/rejected": -14776975.0, "logps/chosen": -181.05857849121094, "logps/rejected": -294.3096923828125, "loss": 0.3111, "rewards/chosen": 0.19987407326698303, "rewards/margins": 1.8921131193637848, "rewards/rejected": -1.6922390460968018, "step": 7548 }, { "epoch": 0.40012720960432513, "grad_norm": 44.5, "kl": 0.054210662841796875, "learning_rate": 5e-07, "logits/chosen": -39818064.0, "logits/rejected": -25065334.0, "logps/chosen": -302.7266845703125, "logps/rejected": -345.22357177734375, "loss": 0.2094, "rewards/chosen": 0.5694054365158081, "rewards/margins": 3.514740824699402, "rewards/rejected": -2.9453353881835938, "step": 7549 }, { "epoch": 0.40018021360612727, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16179143.0, "logits/rejected": -37076172.0, "logps/chosen": -209.56185913085938, "logps/rejected": -559.365234375, "loss": 0.2447, "rewards/chosen": 0.32831019163131714, "rewards/margins": 3.5262526869773865, "rewards/rejected": -3.1979424953460693, "step": 7550 }, { "epoch": 0.4002332176079294, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65145802.666666664, "logits/rejected": -3386075.2, "logps/chosen": -362.0864664713542, "logps/rejected": -276.30302734375, "loss": 0.2994, "rewards/chosen": -0.04475275178750356, "rewards/margins": 2.2515176186958947, "rewards/rejected": -2.2962703704833984, "step": 7551 }, { "epoch": 0.40028622160973154, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42477216.0, "logits/rejected": 159161920.0, "logps/chosen": -404.75750732421875, "logps/rejected": -357.2724304199219, "loss": 0.2903, "rewards/chosen": 0.40176621079444885, "rewards/margins": 2.2585434019565582, "rewards/rejected": -1.8567771911621094, "step": 7552 }, { "epoch": 0.4003392256115337, "grad_norm": 39.75, "kl": 1.1135377883911133, "learning_rate": 5e-07, "logits/chosen": -48337600.0, "logits/rejected": -17451489.6, "logps/chosen": -486.9346516927083, "logps/rejected": -152.7008056640625, "loss": 0.2996, "rewards/chosen": 0.6216089725494385, "rewards/margins": 2.0564167499542236, "rewards/rejected": -1.4348077774047852, "step": 7553 }, { "epoch": 0.4003922296133358, "grad_norm": 59.5, "kl": 2.8137283325195312, "learning_rate": 5e-07, "logits/chosen": -10630019.42857143, "logits/rejected": 382513.28125, "logps/chosen": -204.41366141183036, "logps/rejected": -114.42964172363281, "loss": 0.4421, "rewards/chosen": 0.5102768284933907, "rewards/margins": 1.1435402376311166, "rewards/rejected": -0.6332634091377258, "step": 7554 }, { "epoch": 0.40044523361513795, "grad_norm": 82.0, "kl": 3.9603328704833984, "learning_rate": 5e-07, "logits/chosen": -18700386.285714287, "logits/rejected": -8345403.0, "logps/chosen": -634.8073381696429, "logps/rejected": -104.82118225097656, "loss": 0.3636, "rewards/chosen": 0.9034191540309361, "rewards/margins": 3.018108742577689, "rewards/rejected": -2.114689588546753, "step": 7555 }, { "epoch": 0.4004982376169401, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19412372.0, "logits/rejected": -16096906.666666666, "logps/chosen": -266.8199157714844, "logps/rejected": -296.2896728515625, "loss": 0.239, "rewards/chosen": 0.17259369790554047, "rewards/margins": 2.374809051553408, "rewards/rejected": -2.2022153536478677, "step": 7556 }, { "epoch": 0.4005512416187422, "grad_norm": 29.5, "kl": 0.7443199157714844, "learning_rate": 5e-07, "logits/chosen": -9796735.333333334, "logits/rejected": -21485443.2, "logps/chosen": -123.5238749186198, "logps/rejected": -521.97119140625, "loss": 0.2156, "rewards/chosen": 1.309915542602539, "rewards/margins": 3.878023147583008, "rewards/rejected": -2.5681076049804688, "step": 7557 }, { "epoch": 0.40060424562054436, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25640992.0, "logits/rejected": -9896795.2, "logps/chosen": -371.1011555989583, "logps/rejected": -241.0682373046875, "loss": 0.2251, "rewards/chosen": 0.9666921297709147, "rewards/margins": 2.5653173128763833, "rewards/rejected": -1.5986251831054688, "step": 7558 }, { "epoch": 0.4006572496223465, "grad_norm": 74.5, "kl": 0.912811279296875, "learning_rate": 5e-07, "logits/chosen": 73522342.4, "logits/rejected": -32228090.666666668, "logps/chosen": -355.3534912109375, "logps/rejected": -328.7833658854167, "loss": 0.3693, "rewards/chosen": 0.09760544300079346, "rewards/margins": 2.086600677172343, "rewards/rejected": -1.9889952341715496, "step": 7559 }, { "epoch": 0.40071025362414864, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18442932.0, "logits/rejected": -37301428.0, "logps/chosen": -266.5030517578125, "logps/rejected": -529.1271362304688, "loss": 0.3029, "rewards/chosen": 0.3360264003276825, "rewards/margins": 2.073743551969528, "rewards/rejected": -1.7377171516418457, "step": 7560 }, { "epoch": 0.4007632576259508, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34120678.4, "logits/rejected": -48904426.666666664, "logps/chosen": -336.3361328125, "logps/rejected": -192.55546061197916, "loss": 0.3808, "rewards/chosen": -0.14803102016448974, "rewards/margins": 1.8583754460016886, "rewards/rejected": -2.0064064661661782, "step": 7561 }, { "epoch": 0.4008162616277529, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36719816.0, "logits/rejected": -11003586.0, "logps/chosen": -622.84814453125, "logps/rejected": -433.0924987792969, "loss": 0.1961, "rewards/chosen": 0.8394957780838013, "rewards/margins": 3.834912896156311, "rewards/rejected": -2.9954171180725098, "step": 7562 }, { "epoch": 0.40086926562955505, "grad_norm": 53.25, "kl": 0.7148361206054688, "learning_rate": 5e-07, "logits/chosen": -57724454.4, "logits/rejected": -35252658.666666664, "logps/chosen": -343.857958984375, "logps/rejected": -440.1787516276042, "loss": 0.3512, "rewards/chosen": 0.04890228509902954, "rewards/margins": 3.1611147840817773, "rewards/rejected": -3.1122124989827475, "step": 7563 }, { "epoch": 0.4009222696313572, "grad_norm": 33.5, "kl": 1.8734960556030273, "learning_rate": 5e-07, "logits/chosen": -1544994.75, "logits/rejected": -64769348.0, "logps/chosen": -149.65809631347656, "logps/rejected": -391.14984130859375, "loss": 0.2996, "rewards/chosen": 0.4710787534713745, "rewards/margins": 3.028040051460266, "rewards/rejected": -2.5569612979888916, "step": 7564 }, { "epoch": 0.4009752736331593, "grad_norm": 51.5, "kl": 1.0277824401855469, "learning_rate": 5e-07, "logits/chosen": -37493256.0, "logits/rejected": -12071575.0, "logps/chosen": -460.0129699707031, "logps/rejected": -176.6682891845703, "loss": 0.2746, "rewards/chosen": 0.6500478982925415, "rewards/margins": 2.4164446592330933, "rewards/rejected": -1.7663967609405518, "step": 7565 }, { "epoch": 0.40102827763496146, "grad_norm": 57.25, "kl": 4.138513565063477, "learning_rate": 5e-07, "logits/chosen": -3682929.714285714, "logits/rejected": 199437696.0, "logps/chosen": -416.23182896205356, "logps/rejected": -1502.1434326171875, "loss": 0.4007, "rewards/chosen": 0.761775153023856, "rewards/margins": 7.766596930367606, "rewards/rejected": -7.00482177734375, "step": 7566 }, { "epoch": 0.4010812816367636, "grad_norm": 53.5, "kl": 0.14585113525390625, "learning_rate": 5e-07, "logits/chosen": -15571371.0, "logits/rejected": -22463484.0, "logps/chosen": -473.03936767578125, "logps/rejected": -269.4175109863281, "loss": 0.3128, "rewards/chosen": 0.6244128346443176, "rewards/margins": 1.7454680800437927, "rewards/rejected": -1.121055245399475, "step": 7567 }, { "epoch": 0.40113428563856574, "grad_norm": 52.75, "kl": 0.4283781051635742, "learning_rate": 5e-07, "logits/chosen": -40664806.4, "logits/rejected": 12602272.0, "logps/chosen": -325.231396484375, "logps/rejected": -169.55817667643228, "loss": 0.3303, "rewards/chosen": 0.5101637840270996, "rewards/margins": 1.9672997792561848, "rewards/rejected": -1.4571359952290852, "step": 7568 }, { "epoch": 0.4011872896403679, "grad_norm": 47.25, "kl": 0.5416707992553711, "learning_rate": 5e-07, "logits/chosen": -20952585.333333332, "logits/rejected": -1402429.6, "logps/chosen": -254.4588623046875, "logps/rejected": -124.04241943359375, "loss": 0.2501, "rewards/chosen": 0.9324512481689453, "rewards/margins": 2.255359077453613, "rewards/rejected": -1.3229078292846679, "step": 7569 }, { "epoch": 0.40124029364217, "grad_norm": 45.5, "kl": 1.0621576309204102, "learning_rate": 5e-07, "logits/chosen": -15299289.6, "logits/rejected": -5028724.666666667, "logps/chosen": -307.0626953125, "logps/rejected": -216.81512451171875, "loss": 0.2407, "rewards/chosen": 1.170252513885498, "rewards/margins": 3.1381192525227863, "rewards/rejected": -1.9678667386372883, "step": 7570 }, { "epoch": 0.40129329764397215, "grad_norm": 48.5, "kl": 0.3494987487792969, "learning_rate": 5e-07, "logits/chosen": -34769200.0, "logits/rejected": -44268512.0, "logps/chosen": -236.8370361328125, "logps/rejected": -363.363623046875, "loss": 0.239, "rewards/chosen": 0.30859212080637616, "rewards/margins": 2.7003016392389934, "rewards/rejected": -2.3917095184326174, "step": 7571 }, { "epoch": 0.40134630164577423, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34868594.666666664, "logits/rejected": -36667468.8, "logps/chosen": -150.60466512044272, "logps/rejected": -321.801123046875, "loss": 0.313, "rewards/chosen": 0.032030234734217324, "rewards/margins": 1.8724610666433972, "rewards/rejected": -1.8404308319091798, "step": 7572 }, { "epoch": 0.40139930564757637, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23287588.0, "logits/rejected": -54845544.0, "logps/chosen": -238.49998474121094, "logps/rejected": -673.0533447265625, "loss": 0.2677, "rewards/chosen": 0.3488832414150238, "rewards/margins": 3.442706674337387, "rewards/rejected": -3.0938234329223633, "step": 7573 }, { "epoch": 0.4014523096493785, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26060211.2, "logits/rejected": -179208.66666666666, "logps/chosen": -201.79075927734374, "logps/rejected": -402.4166259765625, "loss": 0.3781, "rewards/chosen": 0.03514076471328735, "rewards/margins": 2.243854137261709, "rewards/rejected": -2.2087133725484214, "step": 7574 }, { "epoch": 0.40150531365118064, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30852691.2, "logits/rejected": -36895752.0, "logps/chosen": -266.6167236328125, "logps/rejected": -278.28883870442706, "loss": 0.3549, "rewards/chosen": 0.14339781999588014, "rewards/margins": 1.9975724339485168, "rewards/rejected": -1.8541746139526367, "step": 7575 }, { "epoch": 0.4015583176529828, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52901200.0, "logits/rejected": -2168754.5, "logps/chosen": -246.0740763346354, "logps/rejected": -68.97991180419922, "loss": 0.5228, "rewards/chosen": -0.6300857464472452, "rewards/margins": 1.3693649371465049, "rewards/rejected": -1.99945068359375, "step": 7576 }, { "epoch": 0.4016113216547849, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2686348.3333333335, "logits/rejected": -3956264.8, "logps/chosen": -54.24267578125, "logps/rejected": -221.709521484375, "loss": 0.2317, "rewards/chosen": 0.5842831532160441, "rewards/margins": 2.6289784351984657, "rewards/rejected": -2.044695281982422, "step": 7577 }, { "epoch": 0.40166432565658705, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39065308.0, "logits/rejected": -49288552.0, "logps/chosen": -357.6148986816406, "logps/rejected": -401.59930419921875, "loss": 0.3103, "rewards/chosen": -0.16236525774002075, "rewards/margins": 2.261879861354828, "rewards/rejected": -2.4242451190948486, "step": 7578 }, { "epoch": 0.4017173296583892, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63264280.0, "logits/rejected": -20937200.0, "logps/chosen": -353.1760559082031, "logps/rejected": -274.576904296875, "loss": 0.3505, "rewards/chosen": -0.2641868591308594, "rewards/margins": 1.700955867767334, "rewards/rejected": -1.9651427268981934, "step": 7579 }, { "epoch": 0.4017703336601913, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19388329.333333332, "logits/rejected": -14204598.4, "logps/chosen": -321.5187581380208, "logps/rejected": -255.785107421875, "loss": 0.2183, "rewards/chosen": 0.48155518372853595, "rewards/margins": 3.2911434253056844, "rewards/rejected": -2.8095882415771483, "step": 7580 }, { "epoch": 0.40182333766199346, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31718430.0, "logits/rejected": -41946880.0, "logps/chosen": -452.35455322265625, "logps/rejected": -270.28912353515625, "loss": 0.2952, "rewards/chosen": 0.534466028213501, "rewards/margins": 2.330742359161377, "rewards/rejected": -1.796276330947876, "step": 7581 }, { "epoch": 0.4018763416637956, "grad_norm": 63.75, "kl": 1.5073127746582031, "learning_rate": 5e-07, "logits/chosen": -32712288.0, "logits/rejected": -27805456.0, "logps/chosen": -397.3765869140625, "logps/rejected": -490.9383951822917, "loss": 0.3271, "rewards/chosen": 0.4254733085632324, "rewards/margins": 2.8674918174743653, "rewards/rejected": -2.442018508911133, "step": 7582 }, { "epoch": 0.40192934566559774, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72762864.0, "logits/rejected": -47759036.8, "logps/chosen": -407.5559488932292, "logps/rejected": -340.939111328125, "loss": 0.2559, "rewards/chosen": 0.2426300048828125, "rewards/margins": 2.096999931335449, "rewards/rejected": -1.8543699264526368, "step": 7583 }, { "epoch": 0.4019823496673999, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1021715.375, "logits/rejected": -34835992.0, "logps/chosen": -85.70590209960938, "logps/rejected": -491.0506896972656, "loss": 0.2937, "rewards/chosen": 0.03383169323205948, "rewards/margins": 2.5314775481820107, "rewards/rejected": -2.497645854949951, "step": 7584 }, { "epoch": 0.402035353669202, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84330480.0, "logits/rejected": -27666240.0, "logps/chosen": -397.7928161621094, "logps/rejected": -493.5839538574219, "loss": 0.23, "rewards/chosen": 0.7509614825248718, "rewards/margins": 3.3153781294822693, "rewards/rejected": -2.5644166469573975, "step": 7585 }, { "epoch": 0.40208835767100415, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55690960.0, "logits/rejected": -37593657.6, "logps/chosen": -257.83009847005206, "logps/rejected": -336.2878173828125, "loss": 0.2116, "rewards/chosen": 0.2533424496650696, "rewards/margins": 3.1539390683174133, "rewards/rejected": -2.9005966186523438, "step": 7586 }, { "epoch": 0.4021413616728063, "grad_norm": 35.5, "kl": 0.8593130111694336, "learning_rate": 5e-07, "logits/chosen": -6880447.0, "logits/rejected": -16050140.0, "logps/chosen": -41.04742431640625, "logps/rejected": -196.55226135253906, "loss": 0.29, "rewards/chosen": 0.5264070630073547, "rewards/margins": 2.2783153653144836, "rewards/rejected": -1.751908302307129, "step": 7587 }, { "epoch": 0.4021943656746084, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36035192.0, "logits/rejected": -37959276.8, "logps/chosen": -343.0531005859375, "logps/rejected": -417.874853515625, "loss": 0.3042, "rewards/chosen": -0.2720759113629659, "rewards/margins": 1.495374612013499, "rewards/rejected": -1.7674505233764648, "step": 7588 }, { "epoch": 0.40224736967641056, "grad_norm": 28.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19511454.0, "logits/rejected": -51037248.0, "logps/chosen": -46.057098388671875, "logps/rejected": -576.9864095052084, "loss": 0.164, "rewards/chosen": 0.6513930559158325, "rewards/margins": 3.786904454231262, "rewards/rejected": -3.1355113983154297, "step": 7589 }, { "epoch": 0.4023003736782127, "grad_norm": 42.75, "kl": 0.2513141632080078, "learning_rate": 5e-07, "logits/chosen": -16464192.0, "logits/rejected": -13341284.0, "logps/chosen": -206.1798095703125, "logps/rejected": -344.0860188802083, "loss": 0.3228, "rewards/chosen": 0.4251760482788086, "rewards/margins": 2.2435382843017577, "rewards/rejected": -1.8183622360229492, "step": 7590 }, { "epoch": 0.40235337768001483, "grad_norm": 53.0, "kl": 1.641535758972168, "learning_rate": 5e-07, "logits/chosen": -1784729.6666666667, "logits/rejected": 13871024.0, "logps/chosen": -215.65055338541666, "logps/rejected": -292.7448425292969, "loss": 0.3846, "rewards/chosen": 0.3316047390302022, "rewards/margins": 2.6616525848706565, "rewards/rejected": -2.330047845840454, "step": 7591 }, { "epoch": 0.40240638168181697, "grad_norm": 44.0, "kl": 0.5328903198242188, "learning_rate": 5e-07, "logits/chosen": -35612808.0, "logits/rejected": -16474092.8, "logps/chosen": -155.44941202799478, "logps/rejected": -290.5496337890625, "loss": 0.3047, "rewards/chosen": 0.39688082536061603, "rewards/margins": 2.170207699139913, "rewards/rejected": -1.7733268737792969, "step": 7592 }, { "epoch": 0.4024593856836191, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18133653.333333332, "logits/rejected": -36481312.0, "logps/chosen": -234.89483642578125, "logps/rejected": -333.1458984375, "loss": 0.2462, "rewards/chosen": 0.3349427382151286, "rewards/margins": 2.211435238520304, "rewards/rejected": -1.8764925003051758, "step": 7593 }, { "epoch": 0.40251238968542125, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9066592.0, "logits/rejected": -7026993.6, "logps/chosen": -339.58249918619794, "logps/rejected": -293.5230224609375, "loss": 0.2606, "rewards/chosen": 0.4565571943918864, "rewards/margins": 2.2540848890940346, "rewards/rejected": -1.7975276947021483, "step": 7594 }, { "epoch": 0.4025653936872234, "grad_norm": 35.75, "kl": 0.3885812759399414, "learning_rate": 5e-07, "logits/chosen": -5890964.0, "logits/rejected": -79277504.0, "logps/chosen": -152.2638916015625, "logps/rejected": -756.42724609375, "loss": 0.2499, "rewards/chosen": 0.5908148765563965, "rewards/margins": 5.362555154164632, "rewards/rejected": -4.771740277608235, "step": 7595 }, { "epoch": 0.4026183976890255, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31293892.0, "logits/rejected": 10404170.0, "logps/chosen": -276.299072265625, "logps/rejected": -577.8363037109375, "loss": 0.3171, "rewards/chosen": 0.2762394845485687, "rewards/margins": 2.0802155435085297, "rewards/rejected": -1.803976058959961, "step": 7596 }, { "epoch": 0.40267140169082766, "grad_norm": 48.25, "kl": 0.8326015472412109, "learning_rate": 5e-07, "logits/chosen": -18901632.0, "logits/rejected": -28326122.0, "logps/chosen": -228.13297526041666, "logps/rejected": -263.2611083984375, "loss": 0.4525, "rewards/chosen": -0.2309881647427877, "rewards/margins": 2.433964113394419, "rewards/rejected": -2.664952278137207, "step": 7597 }, { "epoch": 0.4027244056926298, "grad_norm": 59.25, "kl": 0.051128387451171875, "learning_rate": 5e-07, "logits/chosen": 137926880.0, "logits/rejected": -37309216.0, "logps/chosen": -411.5945739746094, "logps/rejected": -465.81744384765625, "loss": 0.3137, "rewards/chosen": 0.3447464108467102, "rewards/margins": 2.0674200654029846, "rewards/rejected": -1.7226736545562744, "step": 7598 }, { "epoch": 0.40277740969443193, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17542195.2, "logits/rejected": -78504133.33333333, "logps/chosen": -213.225732421875, "logps/rejected": -296.6636555989583, "loss": 0.32, "rewards/chosen": 0.5672564506530762, "rewards/margins": 2.6413334210713706, "rewards/rejected": -2.0740769704182944, "step": 7599 }, { "epoch": 0.40283041369623407, "grad_norm": 34.5, "kl": 0.12714004516601562, "learning_rate": 5e-07, "logits/chosen": -79153328.0, "logits/rejected": -19184356.8, "logps/chosen": -242.89141845703125, "logps/rejected": -183.27294921875, "loss": 0.2603, "rewards/chosen": 0.14344712098439535, "rewards/margins": 2.846657188733419, "rewards/rejected": -2.7032100677490236, "step": 7600 }, { "epoch": 0.4028834176980362, "grad_norm": 51.25, "kl": 0.7190742492675781, "learning_rate": 5e-07, "logits/chosen": -40905160.0, "logits/rejected": -15885548.0, "logps/chosen": -422.59228515625, "logps/rejected": -284.44671630859375, "loss": 0.2683, "rewards/chosen": 0.4163587987422943, "rewards/margins": 2.5061746537685394, "rewards/rejected": -2.089815855026245, "step": 7601 }, { "epoch": 0.40293642169983834, "grad_norm": 48.0, "kl": 0.8169498443603516, "learning_rate": 5e-07, "logits/chosen": -48937424.0, "logits/rejected": 14652.375, "logps/chosen": -295.40869140625, "logps/rejected": -80.61614227294922, "loss": 0.3503, "rewards/chosen": 0.02097301185131073, "rewards/margins": 1.7006768435239792, "rewards/rejected": -1.6797038316726685, "step": 7602 }, { "epoch": 0.4029894257016405, "grad_norm": 55.0, "kl": 1.0128517150878906, "learning_rate": 5e-07, "logits/chosen": -38464394.666666664, "logits/rejected": -22432924.0, "logps/chosen": -362.6122233072917, "logps/rejected": -224.45831298828125, "loss": 0.4287, "rewards/chosen": 0.12293535470962524, "rewards/margins": 1.5236807465553284, "rewards/rejected": -1.4007453918457031, "step": 7603 }, { "epoch": 0.4030424297034426, "grad_norm": 54.75, "kl": 0.2527351379394531, "learning_rate": 5e-07, "logits/chosen": -26332584.0, "logits/rejected": -11616265.6, "logps/chosen": -483.8273111979167, "logps/rejected": -350.6535400390625, "loss": 0.2799, "rewards/chosen": 0.5522176424662272, "rewards/margins": 2.2612659136454263, "rewards/rejected": -1.7090482711791992, "step": 7604 }, { "epoch": 0.40309543370524475, "grad_norm": 54.0, "kl": 0.11007881164550781, "learning_rate": 5e-07, "logits/chosen": -11249766.0, "logits/rejected": -13931730.666666666, "logps/chosen": -66.90568542480469, "logps/rejected": -220.82840983072916, "loss": 0.3102, "rewards/chosen": 0.20984573662281036, "rewards/margins": 1.3517605811357498, "rewards/rejected": -1.1419148445129395, "step": 7605 }, { "epoch": 0.4031484377070469, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29944534.0, "logits/rejected": -8096083.0, "logps/chosen": -304.9493408203125, "logps/rejected": -248.6601104736328, "loss": 0.3268, "rewards/chosen": 0.4248037338256836, "rewards/margins": 2.540825843811035, "rewards/rejected": -2.1160221099853516, "step": 7606 }, { "epoch": 0.40320144170884903, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58910917.333333336, "logits/rejected": -23901340.8, "logps/chosen": -581.3646647135416, "logps/rejected": -296.3828857421875, "loss": 0.2009, "rewards/chosen": 0.6952698230743408, "rewards/margins": 3.5224693775177003, "rewards/rejected": -2.8271995544433595, "step": 7607 }, { "epoch": 0.40325444571065117, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50481520.0, "logits/rejected": -14456883.2, "logps/chosen": -187.00911458333334, "logps/rejected": -216.14658203125, "loss": 0.2902, "rewards/chosen": 0.12849122285842896, "rewards/margins": 1.845116627216339, "rewards/rejected": -1.7166254043579101, "step": 7608 }, { "epoch": 0.4033074497124533, "grad_norm": 38.25, "kl": 0.09832000732421875, "learning_rate": 5e-07, "logits/chosen": -30631122.666666668, "logits/rejected": -18217776.0, "logps/chosen": -713.550537109375, "logps/rejected": -95.61153564453124, "loss": 0.2721, "rewards/chosen": 0.5434927543004354, "rewards/margins": 2.2972046454747517, "rewards/rejected": -1.7537118911743164, "step": 7609 }, { "epoch": 0.40336045371425544, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67977184.0, "logits/rejected": -40697068.0, "logps/chosen": -285.19384765625, "logps/rejected": -423.72198486328125, "loss": 0.247, "rewards/chosen": 0.3603193461894989, "rewards/margins": 3.0432526767253876, "rewards/rejected": -2.6829333305358887, "step": 7610 }, { "epoch": 0.4034134577160576, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72909568.0, "logits/rejected": 1143061.5, "logps/chosen": -165.65171813964844, "logps/rejected": -76.39046478271484, "loss": 0.3458, "rewards/chosen": 0.3928443193435669, "rewards/margins": 1.5103567838668823, "rewards/rejected": -1.1175124645233154, "step": 7611 }, { "epoch": 0.4034664617178597, "grad_norm": 66.0, "kl": 0.0178375244140625, "learning_rate": 5e-07, "logits/chosen": -50269941.333333336, "logits/rejected": -16636270.0, "logps/chosen": -219.83638509114584, "logps/rejected": -399.12139892578125, "loss": 0.3259, "rewards/chosen": 0.4487394491831462, "rewards/margins": 2.646716038386027, "rewards/rejected": -2.197976589202881, "step": 7612 }, { "epoch": 0.40351946571966185, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7064829.5, "logits/rejected": -36411028.0, "logps/chosen": -362.0155944824219, "logps/rejected": -347.4751281738281, "loss": 0.267, "rewards/chosen": 0.7053922414779663, "rewards/margins": 2.3401044607162476, "rewards/rejected": -1.6347122192382812, "step": 7613 }, { "epoch": 0.403572469721464, "grad_norm": 50.75, "kl": 0.8563213348388672, "learning_rate": 5e-07, "logits/chosen": -56565132.8, "logits/rejected": -69012597.33333333, "logps/chosen": -399.4548095703125, "logps/rejected": -440.1918131510417, "loss": 0.3759, "rewards/chosen": 0.1671597957611084, "rewards/margins": 2.3536578019460044, "rewards/rejected": -2.186498006184896, "step": 7614 }, { "epoch": 0.4036254737232661, "grad_norm": 40.5, "kl": 0.6790256500244141, "learning_rate": 5e-07, "logits/chosen": 9955640.0, "logits/rejected": -4877560.8, "logps/chosen": -671.6432698567709, "logps/rejected": -414.00927734375, "loss": 0.2443, "rewards/chosen": 0.5432716210683187, "rewards/margins": 3.2130836327870687, "rewards/rejected": -2.66981201171875, "step": 7615 }, { "epoch": 0.40367847772506826, "grad_norm": 51.5, "kl": 0.0556488037109375, "learning_rate": 5e-07, "logits/chosen": -29217616.0, "logits/rejected": -2144055.3333333335, "logps/chosen": -229.9958984375, "logps/rejected": -250.13590494791666, "loss": 0.335, "rewards/chosen": 0.31437547206878663, "rewards/margins": 2.4028594414393107, "rewards/rejected": -2.088483969370524, "step": 7616 }, { "epoch": 0.4037314817268704, "grad_norm": 33.25, "kl": 0.3879070281982422, "learning_rate": 5e-07, "logits/chosen": -12448009.0, "logits/rejected": -38002936.0, "logps/chosen": -203.2242889404297, "logps/rejected": -443.156494140625, "loss": 0.2647, "rewards/chosen": 0.16373436152935028, "rewards/margins": 3.8032092303037643, "rewards/rejected": -3.639474868774414, "step": 7617 }, { "epoch": 0.40378448572867254, "grad_norm": 92.0, "kl": 1.1806755065917969, "learning_rate": 5e-07, "logits/chosen": -46711526.4, "logits/rejected": -70647296.0, "logps/chosen": -341.8122802734375, "logps/rejected": -380.063720703125, "loss": 0.3971, "rewards/chosen": -0.0342803955078125, "rewards/margins": 2.2895228703816732, "rewards/rejected": -2.323803265889486, "step": 7618 }, { "epoch": 0.4038374897304747, "grad_norm": 62.25, "kl": 0.5940999984741211, "learning_rate": 5e-07, "logits/chosen": -34085475.2, "logits/rejected": -6367016.666666667, "logps/chosen": -377.4789306640625, "logps/rejected": -114.3510030110677, "loss": 0.3782, "rewards/chosen": 0.24087717533111572, "rewards/margins": 1.4913044214248656, "rewards/rejected": -1.25042724609375, "step": 7619 }, { "epoch": 0.4038904937322768, "grad_norm": 56.5, "kl": 0.563690185546875, "learning_rate": 5e-07, "logits/chosen": -25564324.0, "logits/rejected": -25973348.0, "logps/chosen": -373.1860656738281, "logps/rejected": -424.17095947265625, "loss": 0.2778, "rewards/chosen": 0.5954940915107727, "rewards/margins": 2.3058770298957825, "rewards/rejected": -1.7103829383850098, "step": 7620 }, { "epoch": 0.40394349773407895, "grad_norm": 59.5, "kl": 3.226451873779297, "learning_rate": 5e-07, "logits/chosen": -6603382.666666667, "logits/rejected": -10817303.0, "logps/chosen": -459.2429606119792, "logps/rejected": -123.23336029052734, "loss": 0.4265, "rewards/chosen": 0.47055208683013916, "rewards/margins": 2.4195587635040283, "rewards/rejected": -1.9490066766738892, "step": 7621 }, { "epoch": 0.4039965017358811, "grad_norm": 33.0, "kl": 0.1741476058959961, "learning_rate": 5e-07, "logits/chosen": -23310598.0, "logits/rejected": -13358268.0, "logps/chosen": -225.02516174316406, "logps/rejected": -187.72422790527344, "loss": 0.2736, "rewards/chosen": 0.48889219760894775, "rewards/margins": 2.7341943979263306, "rewards/rejected": -2.245302200317383, "step": 7622 }, { "epoch": 0.40404950573768317, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -125922328.0, "logits/rejected": -18347058.0, "logps/chosen": -206.02053833007812, "logps/rejected": -364.8150329589844, "loss": 0.2956, "rewards/chosen": 0.14125680923461914, "rewards/margins": 2.210786819458008, "rewards/rejected": -2.0695300102233887, "step": 7623 }, { "epoch": 0.4041025097394853, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18007278.0, "logits/rejected": 14458376.0, "logps/chosen": -433.7948913574219, "logps/rejected": -228.39632161458334, "loss": 0.2521, "rewards/chosen": 0.8501923084259033, "rewards/margins": 2.4501335620880127, "rewards/rejected": -1.5999412536621094, "step": 7624 }, { "epoch": 0.40415551374128744, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28905752.0, "logits/rejected": -33374108.8, "logps/chosen": -374.2372639973958, "logps/rejected": -513.16748046875, "loss": 0.2464, "rewards/chosen": 0.38065898418426514, "rewards/margins": 2.7043347120285035, "rewards/rejected": -2.3236757278442384, "step": 7625 }, { "epoch": 0.4042085177430896, "grad_norm": 50.25, "kl": 0.7269935607910156, "learning_rate": 5e-07, "logits/chosen": -16424902.4, "logits/rejected": -21642424.0, "logps/chosen": -290.120361328125, "logps/rejected": -611.5504150390625, "loss": 0.2808, "rewards/chosen": 0.49080758094787597, "rewards/margins": 4.2421284198760985, "rewards/rejected": -3.7513208389282227, "step": 7626 }, { "epoch": 0.4042615217448917, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -203637088.0, "logits/rejected": -49797157.333333336, "logps/chosen": -173.5758056640625, "logps/rejected": -251.834716796875, "loss": 0.2231, "rewards/chosen": 0.6547393798828125, "rewards/margins": 2.397022883097331, "rewards/rejected": -1.7422835032145183, "step": 7627 }, { "epoch": 0.40431452574669385, "grad_norm": 58.75, "kl": 0.25263214111328125, "learning_rate": 5e-07, "logits/chosen": -59483685.333333336, "logits/rejected": -69177264.0, "logps/chosen": -481.711181640625, "logps/rejected": -601.3247680664062, "loss": 0.2946, "rewards/chosen": 0.5630597273508707, "rewards/margins": 4.324522177378337, "rewards/rejected": -3.761462450027466, "step": 7628 }, { "epoch": 0.404367529748496, "grad_norm": 54.0, "kl": 0.08147811889648438, "learning_rate": 5e-07, "logits/chosen": -66194216.0, "logits/rejected": -58678728.0, "logps/chosen": -371.83782958984375, "logps/rejected": -234.81971740722656, "loss": 0.2692, "rewards/chosen": 0.5809982419013977, "rewards/margins": 2.130931079387665, "rewards/rejected": -1.549932837486267, "step": 7629 }, { "epoch": 0.40442053375029813, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26086122.666666668, "logits/rejected": -7301634.4, "logps/chosen": -125.16226196289062, "logps/rejected": -135.432470703125, "loss": 0.3415, "rewards/chosen": -0.10467783610026042, "rewards/margins": 1.5935691197713215, "rewards/rejected": -1.698246955871582, "step": 7630 }, { "epoch": 0.40447353775210027, "grad_norm": 47.5, "kl": 1.2707509994506836, "learning_rate": 5e-07, "logits/chosen": 2467556.8, "logits/rejected": -7374952.0, "logps/chosen": -589.31162109375, "logps/rejected": -137.8014933268229, "loss": 0.2835, "rewards/chosen": 0.965472412109375, "rewards/margins": 2.5329373995463054, "rewards/rejected": -1.5674649874369304, "step": 7631 }, { "epoch": 0.4045265417539024, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12127575.0, "logits/rejected": -52247914.666666664, "logps/chosen": -341.8370361328125, "logps/rejected": -584.2902018229166, "loss": 0.2487, "rewards/chosen": 0.390310674905777, "rewards/margins": 2.282200088103612, "rewards/rejected": -1.8918894131978352, "step": 7632 }, { "epoch": 0.40457954575570454, "grad_norm": 38.25, "kl": 0.3210268020629883, "learning_rate": 5e-07, "logits/chosen": -11786183.2, "logits/rejected": -29351501.333333332, "logps/chosen": -129.8048095703125, "logps/rejected": -413.0210367838542, "loss": 0.3008, "rewards/chosen": 0.3964717388153076, "rewards/margins": 3.3542986710866294, "rewards/rejected": -2.9578269322713218, "step": 7633 }, { "epoch": 0.4046325497575067, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1699193.8333333333, "logits/rejected": -16974969.6, "logps/chosen": -46.87358601888021, "logps/rejected": -156.3644775390625, "loss": 0.2842, "rewards/chosen": 0.2496714194615682, "rewards/margins": 1.8575382788976034, "rewards/rejected": -1.6078668594360352, "step": 7634 }, { "epoch": 0.4046855537593088, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11748624.0, "logits/rejected": -24297316.0, "logps/chosen": -682.4303588867188, "logps/rejected": -381.44671630859375, "loss": 0.2675, "rewards/chosen": 0.8729593753814697, "rewards/margins": 2.8154115676879883, "rewards/rejected": -1.9424521923065186, "step": 7635 }, { "epoch": 0.40473855776111095, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9386897.333333334, "logits/rejected": -10057988.0, "logps/chosen": -349.9854736328125, "logps/rejected": -157.61900634765624, "loss": 0.1915, "rewards/chosen": 0.976875384648641, "rewards/margins": 2.8260104020436607, "rewards/rejected": -1.8491350173950196, "step": 7636 }, { "epoch": 0.4047915617629131, "grad_norm": 43.75, "kl": 0.4167203903198242, "learning_rate": 5e-07, "logits/chosen": -3198871.3333333335, "logits/rejected": -31887142.4, "logps/chosen": -176.8005167643229, "logps/rejected": -274.4869873046875, "loss": 0.257, "rewards/chosen": 0.46827979882558185, "rewards/margins": 2.3972665389378864, "rewards/rejected": -1.9289867401123046, "step": 7637 }, { "epoch": 0.4048445657647152, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49870568.0, "logits/rejected": -61576616.0, "logps/chosen": -427.453369140625, "logps/rejected": -176.50120544433594, "loss": 0.2763, "rewards/chosen": 0.640411376953125, "rewards/margins": 2.2108031511306763, "rewards/rejected": -1.5703917741775513, "step": 7638 }, { "epoch": 0.40489756976651736, "grad_norm": 74.0, "kl": 0.09004592895507812, "learning_rate": 5e-07, "logits/chosen": 27380470.4, "logits/rejected": -3916451.3333333335, "logps/chosen": -393.808740234375, "logps/rejected": -129.93977864583334, "loss": 0.4548, "rewards/chosen": -0.1701647996902466, "rewards/margins": 0.7213037252426148, "rewards/rejected": -0.8914685249328613, "step": 7639 }, { "epoch": 0.4049505737683195, "grad_norm": 43.0, "kl": 0.5596675872802734, "learning_rate": 5e-07, "logits/chosen": -51803780.0, "logits/rejected": -16928886.85714286, "logps/chosen": -554.549072265625, "logps/rejected": -217.968017578125, "loss": 0.2333, "rewards/chosen": 1.441064476966858, "rewards/margins": 2.8525271245411465, "rewards/rejected": -1.4114626475742884, "step": 7640 }, { "epoch": 0.40500357777012164, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66776538.666666664, "logits/rejected": -24210548.8, "logps/chosen": -646.6495361328125, "logps/rejected": -359.184033203125, "loss": 0.276, "rewards/chosen": 0.6297312577565511, "rewards/margins": 2.321542247136434, "rewards/rejected": -1.6918109893798827, "step": 7641 }, { "epoch": 0.4050565817719238, "grad_norm": 61.75, "kl": 1.3116416931152344, "learning_rate": 5e-07, "logits/chosen": -26914872.0, "logits/rejected": -39908148.0, "logps/chosen": -335.74578857421875, "logps/rejected": -782.999267578125, "loss": 0.2804, "rewards/chosen": 0.6042159795761108, "rewards/margins": 4.201890349388123, "rewards/rejected": -3.5976743698120117, "step": 7642 }, { "epoch": 0.4051095857737259, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34897288.0, "logits/rejected": -5884373.0, "logps/chosen": -243.39524841308594, "logps/rejected": -400.7833251953125, "loss": 0.3203, "rewards/chosen": -0.12167578190565109, "rewards/margins": 2.4551965668797493, "rewards/rejected": -2.5768723487854004, "step": 7643 }, { "epoch": 0.40516258977552805, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9755701.6, "logits/rejected": -43280125.333333336, "logps/chosen": -319.2656005859375, "logps/rejected": -546.4842529296875, "loss": 0.2819, "rewards/chosen": 0.29673430919647215, "rewards/margins": 4.063049133618673, "rewards/rejected": -3.7663148244222007, "step": 7644 }, { "epoch": 0.4052155937773302, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8094468.0, "logits/rejected": -29344969.6, "logps/chosen": -403.3583984375, "logps/rejected": -477.308984375, "loss": 0.1854, "rewards/chosen": 0.7404092152913412, "rewards/margins": 3.5101964314778646, "rewards/rejected": -2.7697872161865233, "step": 7645 }, { "epoch": 0.4052685977791323, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69441125.33333333, "logits/rejected": -26189024.0, "logps/chosen": -257.84238688151044, "logps/rejected": -445.68037109375, "loss": 0.2558, "rewards/chosen": 0.5273313124974569, "rewards/margins": 2.3121821959813436, "rewards/rejected": -1.7848508834838868, "step": 7646 }, { "epoch": 0.40532160178093446, "grad_norm": 53.75, "kl": 1.16595458984375, "learning_rate": 5e-07, "logits/chosen": -25820214.4, "logits/rejected": -18121258.666666668, "logps/chosen": -258.242333984375, "logps/rejected": -117.37038167317708, "loss": 0.4484, "rewards/chosen": -0.06659364700317383, "rewards/margins": 1.13723889986674, "rewards/rejected": -1.2038325468699138, "step": 7647 }, { "epoch": 0.4053746057827366, "grad_norm": 50.75, "kl": 1.0248966217041016, "learning_rate": 5e-07, "logits/chosen": -39633352.0, "logits/rejected": -40621862.4, "logps/chosen": -728.6961263020834, "logps/rejected": -294.3997802734375, "loss": 0.2331, "rewards/chosen": 1.3621023495992024, "rewards/margins": 2.701761754353841, "rewards/rejected": -1.3396594047546386, "step": 7648 }, { "epoch": 0.40542760978453873, "grad_norm": 63.5, "kl": 0.47919464111328125, "learning_rate": 5e-07, "logits/chosen": -8561738.666666666, "logits/rejected": -23347424.0, "logps/chosen": -426.7261962890625, "logps/rejected": -136.63321533203126, "loss": 0.3806, "rewards/chosen": -0.09337133169174194, "rewards/margins": 1.27798935174942, "rewards/rejected": -1.371360683441162, "step": 7649 }, { "epoch": 0.40548061378634087, "grad_norm": 45.0, "kl": 0.7008628845214844, "learning_rate": 5e-07, "logits/chosen": -27420613.333333332, "logits/rejected": -29129318.4, "logps/chosen": -225.5962931315104, "logps/rejected": -207.5554931640625, "loss": 0.2649, "rewards/chosen": 0.5762837727864584, "rewards/margins": 2.7516722997029626, "rewards/rejected": -2.175388526916504, "step": 7650 }, { "epoch": 0.405533617788143, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22409308.8, "logits/rejected": -35084160.0, "logps/chosen": -258.4183837890625, "logps/rejected": -547.8517252604166, "loss": 0.3035, "rewards/chosen": 0.2792570352554321, "rewards/margins": 3.091643230120341, "rewards/rejected": -2.8123861948649087, "step": 7651 }, { "epoch": 0.40558662178994515, "grad_norm": 65.5, "kl": 1.0200614929199219, "learning_rate": 5e-07, "logits/chosen": -54020315.428571425, "logits/rejected": 4184567.5, "logps/chosen": -376.61729213169644, "logps/rejected": -55.847930908203125, "loss": 0.3339, "rewards/chosen": 0.7472991262163434, "rewards/margins": 2.3415153537477766, "rewards/rejected": -1.594216227531433, "step": 7652 }, { "epoch": 0.4056396257917473, "grad_norm": 48.75, "kl": 0.38231658935546875, "learning_rate": 5e-07, "logits/chosen": -24634686.0, "logits/rejected": -42462328.0, "logps/chosen": -362.2086181640625, "logps/rejected": -388.8040466308594, "loss": 0.2147, "rewards/chosen": 1.0209909677505493, "rewards/margins": 3.7353533506393433, "rewards/rejected": -2.714362382888794, "step": 7653 }, { "epoch": 0.4056926297935494, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29922282.0, "logits/rejected": -29049904.0, "logps/chosen": -289.89520263671875, "logps/rejected": -216.0040283203125, "loss": 0.3136, "rewards/chosen": -0.05176249146461487, "rewards/margins": 2.1375266015529633, "rewards/rejected": -2.189289093017578, "step": 7654 }, { "epoch": 0.40574563379535156, "grad_norm": 47.25, "kl": 0.2596778869628906, "learning_rate": 5e-07, "logits/chosen": -39124409.6, "logits/rejected": -8687529.333333334, "logps/chosen": -317.8844482421875, "logps/rejected": -377.4738362630208, "loss": 0.1923, "rewards/chosen": 1.0982505798339843, "rewards/margins": 4.245332590738932, "rewards/rejected": -3.1470820109049478, "step": 7655 }, { "epoch": 0.4057986377971537, "grad_norm": 49.25, "kl": 0.619450569152832, "learning_rate": 5e-07, "logits/chosen": -32183614.0, "logits/rejected": -27243486.0, "logps/chosen": -371.67510986328125, "logps/rejected": -296.5310363769531, "loss": 0.2637, "rewards/chosen": 0.8324024677276611, "rewards/margins": 2.5671589374542236, "rewards/rejected": -1.7347564697265625, "step": 7656 }, { "epoch": 0.40585164179895583, "grad_norm": 40.0, "kl": 0.21135520935058594, "learning_rate": 5e-07, "logits/chosen": -8728729.0, "logits/rejected": -30681772.0, "logps/chosen": -152.46522521972656, "logps/rejected": -279.2871398925781, "loss": 0.3051, "rewards/chosen": 0.1513546109199524, "rewards/margins": 2.515846073627472, "rewards/rejected": -2.3644914627075195, "step": 7657 }, { "epoch": 0.40590464580075797, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25843581.333333332, "logits/rejected": -25643776.0, "logps/chosen": -209.96065266927084, "logps/rejected": -301.320654296875, "loss": 0.3257, "rewards/chosen": -0.3863866329193115, "rewards/margins": 1.545603609085083, "rewards/rejected": -1.9319902420043946, "step": 7658 }, { "epoch": 0.4059576498025601, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15266302.0, "logits/rejected": -28200741.333333332, "logps/chosen": -230.0736083984375, "logps/rejected": -287.0888264973958, "loss": 0.2557, "rewards/chosen": -0.2487192153930664, "rewards/margins": 1.82878049214681, "rewards/rejected": -2.0774997075398765, "step": 7659 }, { "epoch": 0.40601065380436224, "grad_norm": 57.5, "kl": 2.6497764587402344, "learning_rate": 5e-07, "logits/chosen": -106957760.0, "logits/rejected": 2540336.25, "logps/chosen": -312.8296712239583, "logps/rejected": -95.8064193725586, "loss": 0.2556, "rewards/chosen": 1.231622854868571, "rewards/margins": 4.550719420115153, "rewards/rejected": -3.319096565246582, "step": 7660 }, { "epoch": 0.4060636578061644, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55397080.0, "logits/rejected": -41132940.0, "logps/chosen": -407.05499267578125, "logps/rejected": -302.09698486328125, "loss": 0.2735, "rewards/chosen": 0.4914604425430298, "rewards/margins": 2.567374110221863, "rewards/rejected": -2.075913667678833, "step": 7661 }, { "epoch": 0.4061166618079665, "grad_norm": 73.5, "kl": 2.6927871704101562, "learning_rate": 5e-07, "logits/chosen": -49349510.4, "logits/rejected": -45684032.0, "logps/chosen": -646.801025390625, "logps/rejected": -465.4303385416667, "loss": 0.369, "rewards/chosen": 0.4840348243713379, "rewards/margins": 2.952893543243408, "rewards/rejected": -2.4688587188720703, "step": 7662 }, { "epoch": 0.40616966580976865, "grad_norm": 37.0, "kl": 0.025511741638183594, "learning_rate": 5e-07, "logits/chosen": 4983946.5, "logits/rejected": -12992410.0, "logps/chosen": -81.14689636230469, "logps/rejected": -323.2052001953125, "loss": 0.3535, "rewards/chosen": 0.34332820773124695, "rewards/margins": 1.4165713489055634, "rewards/rejected": -1.0732431411743164, "step": 7663 }, { "epoch": 0.4062226698115708, "grad_norm": 60.0, "kl": 1.3789253234863281, "learning_rate": 5e-07, "logits/chosen": -26541990.4, "logits/rejected": 15470733.333333334, "logps/chosen": -351.081396484375, "logps/rejected": -330.4707845052083, "loss": 0.3502, "rewards/chosen": 0.3619140625, "rewards/margins": 2.2812166849772137, "rewards/rejected": -1.9193026224772136, "step": 7664 }, { "epoch": 0.40627567381337293, "grad_norm": 55.0, "kl": 0.140106201171875, "learning_rate": 5e-07, "logits/chosen": -36813830.4, "logits/rejected": -13531202.666666666, "logps/chosen": -307.2369140625, "logps/rejected": -147.57275390625, "loss": 0.3784, "rewards/chosen": 0.18306884765625, "rewards/margins": 1.378231700261434, "rewards/rejected": -1.1951628526051838, "step": 7665 }, { "epoch": 0.40632867781517507, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30792396.0, "logits/rejected": -21269722.666666668, "logps/chosen": -159.48521423339844, "logps/rejected": -248.113037109375, "loss": 0.2081, "rewards/chosen": 0.3535846769809723, "rewards/margins": 2.3344304462273913, "rewards/rejected": -1.9808457692464192, "step": 7666 }, { "epoch": 0.4063816818169772, "grad_norm": 38.0, "kl": 0.22778701782226562, "learning_rate": 5e-07, "logits/chosen": -16292897.333333334, "logits/rejected": -24004678.4, "logps/chosen": -231.7335001627604, "logps/rejected": -254.9214599609375, "loss": 0.2361, "rewards/chosen": 0.8161489168802897, "rewards/margins": 2.535460058848063, "rewards/rejected": -1.7193111419677733, "step": 7667 }, { "epoch": 0.40643468581877934, "grad_norm": 49.0, "kl": 1.9353218078613281, "learning_rate": 5e-07, "logits/chosen": -12892170.0, "logits/rejected": -10153759.0, "logps/chosen": -218.45077514648438, "logps/rejected": -511.4536437988281, "loss": 0.2886, "rewards/chosen": 0.7326268553733826, "rewards/margins": 2.665433347225189, "rewards/rejected": -1.9328064918518066, "step": 7668 }, { "epoch": 0.4064876898205815, "grad_norm": 56.0, "kl": 0.02315521240234375, "learning_rate": 5e-07, "logits/chosen": -15327017.333333334, "logits/rejected": -56004320.0, "logps/chosen": -254.99560546875, "logps/rejected": -586.4134521484375, "loss": 0.3919, "rewards/chosen": 0.016680022080739338, "rewards/margins": 2.9417076309521994, "rewards/rejected": -2.92502760887146, "step": 7669 }, { "epoch": 0.4065406938223836, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67615568.0, "logits/rejected": -44788870.4, "logps/chosen": -549.28369140625, "logps/rejected": -565.833740234375, "loss": 0.2498, "rewards/chosen": 0.12067663669586182, "rewards/margins": 3.6858930826187133, "rewards/rejected": -3.5652164459228515, "step": 7670 }, { "epoch": 0.40659369782418575, "grad_norm": 59.5, "kl": 2.379100799560547, "learning_rate": 5e-07, "logits/chosen": -38293856.0, "logits/rejected": -8187984.8, "logps/chosen": -303.1343587239583, "logps/rejected": -204.4118896484375, "loss": 0.3178, "rewards/chosen": 0.8676860332489014, "rewards/margins": 2.120510721206665, "rewards/rejected": -1.2528246879577636, "step": 7671 }, { "epoch": 0.4066467018259879, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49498346.666666664, "logits/rejected": -3208431.2, "logps/chosen": -513.4351806640625, "logps/rejected": -270.5865234375, "loss": 0.2888, "rewards/chosen": 0.3142079909642537, "rewards/margins": 2.008701475461324, "rewards/rejected": -1.6944934844970703, "step": 7672 }, { "epoch": 0.40669970582779, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29086253.333333332, "logits/rejected": -12187470.4, "logps/chosen": -176.96907552083334, "logps/rejected": -197.4760986328125, "loss": 0.3097, "rewards/chosen": 0.019926642378171284, "rewards/margins": 1.6241262425978977, "rewards/rejected": -1.6041996002197265, "step": 7673 }, { "epoch": 0.4067527098295921, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9292925.6, "logits/rejected": -35795909.333333336, "logps/chosen": -229.0466796875, "logps/rejected": -423.4466145833333, "loss": 0.374, "rewards/chosen": -0.17639598846435547, "rewards/margins": 2.3881407419840492, "rewards/rejected": -2.564536730448405, "step": 7674 }, { "epoch": 0.40680571383139424, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7996499.0, "logits/rejected": -10083096.0, "logps/chosen": -137.3372802734375, "logps/rejected": -253.87355041503906, "loss": 0.3042, "rewards/chosen": 0.5190449357032776, "rewards/margins": 1.9410046935081482, "rewards/rejected": -1.4219597578048706, "step": 7675 }, { "epoch": 0.4068587178331964, "grad_norm": 75.5, "kl": 1.299337387084961, "learning_rate": 5e-07, "logits/chosen": -53769493.333333336, "logits/rejected": -12719805.0, "logps/chosen": -328.5288492838542, "logps/rejected": -356.3067626953125, "loss": 0.436, "rewards/chosen": 0.0829724669456482, "rewards/margins": 1.9388821721076965, "rewards/rejected": -1.8559097051620483, "step": 7676 }, { "epoch": 0.4069117218349985, "grad_norm": 47.75, "kl": 0.195068359375, "learning_rate": 5e-07, "logits/chosen": -49471680.0, "logits/rejected": -37072176.0, "logps/chosen": -439.43096923828125, "logps/rejected": -347.1902669270833, "loss": 0.1883, "rewards/chosen": 0.6803848147392273, "rewards/margins": 2.8042605916659036, "rewards/rejected": -2.1238757769266763, "step": 7677 }, { "epoch": 0.40696472583680066, "grad_norm": 35.5, "kl": 0.7923069000244141, "learning_rate": 5e-07, "logits/chosen": 6593754.0, "logits/rejected": 2077578.625, "logps/chosen": -147.84774780273438, "logps/rejected": -192.18409729003906, "loss": 0.2381, "rewards/chosen": 0.92137610912323, "rewards/margins": 3.0714398622512817, "rewards/rejected": -2.1500637531280518, "step": 7678 }, { "epoch": 0.4070177298386028, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4049490.5714285714, "logits/rejected": -7614054.0, "logps/chosen": -170.11167689732142, "logps/rejected": -196.06179809570312, "loss": 0.4545, "rewards/chosen": 0.033257282205990384, "rewards/margins": 1.5538155202354704, "rewards/rejected": -1.52055823802948, "step": 7679 }, { "epoch": 0.40707073384040493, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101527472.0, "logits/rejected": -49223146.666666664, "logps/chosen": -739.982421875, "logps/rejected": -516.8760172526041, "loss": 0.1938, "rewards/chosen": 0.8546386957168579, "rewards/margins": 3.0943932135899863, "rewards/rejected": -2.2397545178731284, "step": 7680 }, { "epoch": 0.40712373784220707, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27368646.4, "logits/rejected": -52105568.0, "logps/chosen": -215.3823974609375, "logps/rejected": -439.8431396484375, "loss": 0.3237, "rewards/chosen": 0.24112434387207032, "rewards/margins": 3.2123526255289714, "rewards/rejected": -2.971228281656901, "step": 7681 }, { "epoch": 0.4071767418440092, "grad_norm": 32.25, "kl": 1.3513355255126953, "learning_rate": 5e-07, "logits/chosen": -12475840.0, "logits/rejected": -7828770.5, "logps/chosen": -110.42047882080078, "logps/rejected": -109.85374450683594, "loss": 0.3511, "rewards/chosen": -0.2590448558330536, "rewards/margins": 2.5592143833637238, "rewards/rejected": -2.8182592391967773, "step": 7682 }, { "epoch": 0.40722974584581134, "grad_norm": 49.5, "kl": 0.41302490234375, "learning_rate": 5e-07, "logits/chosen": -57030852.0, "logits/rejected": -5700367.0, "logps/chosen": -280.2445068359375, "logps/rejected": -228.4077606201172, "loss": 0.3941, "rewards/chosen": 0.04078508913516998, "rewards/margins": 1.2111531645059586, "rewards/rejected": -1.1703680753707886, "step": 7683 }, { "epoch": 0.4072827498476135, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5818434.0, "logits/rejected": -31849493.333333332, "logps/chosen": -576.16943359375, "logps/rejected": -415.55712890625, "loss": 0.2069, "rewards/chosen": 1.203774094581604, "rewards/margins": 3.6058822870254517, "rewards/rejected": -2.4021081924438477, "step": 7684 }, { "epoch": 0.4073357538494156, "grad_norm": 56.5, "kl": 1.7726001739501953, "learning_rate": 5e-07, "logits/chosen": -10858240.0, "logits/rejected": -28562454.0, "logps/chosen": -215.4527791341146, "logps/rejected": -139.46176147460938, "loss": 0.3779, "rewards/chosen": 0.43110565344492596, "rewards/margins": 2.740014592806498, "rewards/rejected": -2.3089089393615723, "step": 7685 }, { "epoch": 0.40738875785121775, "grad_norm": 50.5, "kl": 1.2122459411621094, "learning_rate": 5e-07, "logits/chosen": -18511276.0, "logits/rejected": -28380168.0, "logps/chosen": -494.24664306640625, "logps/rejected": -300.7582702636719, "loss": 0.3068, "rewards/chosen": 0.45554327964782715, "rewards/margins": 2.3898861408233643, "rewards/rejected": -1.934342861175537, "step": 7686 }, { "epoch": 0.4074417618530199, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60469336.0, "logits/rejected": 2412715.3333333335, "logps/chosen": -265.082275390625, "logps/rejected": -330.0861002604167, "loss": 0.1748, "rewards/chosen": 0.7613548040390015, "rewards/margins": 3.050143917401632, "rewards/rejected": -2.2887891133626304, "step": 7687 }, { "epoch": 0.407494765854822, "grad_norm": 55.0, "kl": 1.9511795043945312, "learning_rate": 5e-07, "logits/chosen": -4664560.666666667, "logits/rejected": -15543696.0, "logps/chosen": -825.9396158854166, "logps/rejected": -157.04267578125, "loss": 0.292, "rewards/chosen": 1.7952992121378581, "rewards/margins": 2.6463633219401044, "rewards/rejected": -0.8510641098022461, "step": 7688 }, { "epoch": 0.40754776985662416, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23859770.666666668, "logits/rejected": -23046787.2, "logps/chosen": -389.9083251953125, "logps/rejected": -233.990283203125, "loss": 0.2754, "rewards/chosen": 0.2925226887067159, "rewards/margins": 2.454045554002126, "rewards/rejected": -2.16152286529541, "step": 7689 }, { "epoch": 0.4076007738584263, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38533192.0, "logits/rejected": -11488165.0, "logps/chosen": -382.6893310546875, "logps/rejected": -254.59707641601562, "loss": 0.2955, "rewards/chosen": 0.05313795804977417, "rewards/margins": 2.51068776845932, "rewards/rejected": -2.457549810409546, "step": 7690 }, { "epoch": 0.40765377786022844, "grad_norm": 49.25, "kl": 0.9642543792724609, "learning_rate": 5e-07, "logits/chosen": -27617392.0, "logits/rejected": -28029940.0, "logps/chosen": -401.2165832519531, "logps/rejected": -163.22134399414062, "loss": 0.3163, "rewards/chosen": 0.4867975115776062, "rewards/margins": 1.9741780161857605, "rewards/rejected": -1.4873805046081543, "step": 7691 }, { "epoch": 0.4077067818620306, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22776157.333333332, "logits/rejected": -15559648.0, "logps/chosen": -327.1492919921875, "logps/rejected": -287.9847412109375, "loss": 0.2332, "rewards/chosen": 0.26165111859639484, "rewards/margins": 2.531822474797567, "rewards/rejected": -2.270171356201172, "step": 7692 }, { "epoch": 0.4077597858638327, "grad_norm": 54.25, "kl": 0.5314960479736328, "learning_rate": 5e-07, "logits/chosen": -18111506.285714287, "logits/rejected": -74802272.0, "logps/chosen": -241.3751220703125, "logps/rejected": -544.1971435546875, "loss": 0.3791, "rewards/chosen": 0.3980306216648647, "rewards/margins": 3.2388387748173306, "rewards/rejected": -2.840808153152466, "step": 7693 }, { "epoch": 0.40781278986563485, "grad_norm": 36.25, "kl": 0.3903923034667969, "learning_rate": 5e-07, "logits/chosen": -9341368.666666666, "logits/rejected": -41510208.0, "logps/chosen": -165.685791015625, "logps/rejected": -202.8438720703125, "loss": 0.2944, "rewards/chosen": 0.810526450475057, "rewards/margins": 2.352195223172506, "rewards/rejected": -1.5416687726974487, "step": 7694 }, { "epoch": 0.407865793867437, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17341646.666666668, "logits/rejected": -18571612.0, "logps/chosen": -396.2039388020833, "logps/rejected": -203.2675323486328, "loss": 0.3204, "rewards/chosen": 0.7126251856486002, "rewards/margins": 1.9966186682383218, "rewards/rejected": -1.2839934825897217, "step": 7695 }, { "epoch": 0.4079187978692391, "grad_norm": 41.0, "kl": 0.1848134994506836, "learning_rate": 5e-07, "logits/chosen": -11939757.0, "logits/rejected": -19142814.0, "logps/chosen": -133.27806091308594, "logps/rejected": -321.5439147949219, "loss": 0.3373, "rewards/chosen": 0.2086416631937027, "rewards/margins": 1.6687063127756119, "rewards/rejected": -1.4600646495819092, "step": 7696 }, { "epoch": 0.40797180187104126, "grad_norm": 65.5, "kl": 3.2247848510742188, "learning_rate": 5e-07, "logits/chosen": -42813523.2, "logits/rejected": -43706274.666666664, "logps/chosen": -668.673291015625, "logps/rejected": -487.8368326822917, "loss": 0.3373, "rewards/chosen": 0.7830282211303711, "rewards/margins": 3.0290669759114586, "rewards/rejected": -2.2460387547810874, "step": 7697 }, { "epoch": 0.4080248058728434, "grad_norm": 154.0, "kl": 2.3419151306152344, "learning_rate": 5e-07, "logits/chosen": -26575884.8, "logits/rejected": -36093602.666666664, "logps/chosen": -512.99462890625, "logps/rejected": -469.7005208333333, "loss": 0.323, "rewards/chosen": 1.0558500289916992, "rewards/margins": 2.409989833831787, "rewards/rejected": -1.354139804840088, "step": 7698 }, { "epoch": 0.40807780987464554, "grad_norm": 46.0, "kl": 0.17948341369628906, "learning_rate": 5e-07, "logits/chosen": -22877568.0, "logits/rejected": 13302931.0, "logps/chosen": -297.46417236328125, "logps/rejected": -158.4449462890625, "loss": 0.357, "rewards/chosen": 0.2531376779079437, "rewards/margins": 1.5187662541866302, "rewards/rejected": -1.2656285762786865, "step": 7699 }, { "epoch": 0.4081308138764477, "grad_norm": 36.75, "kl": 0.33138275146484375, "learning_rate": 5e-07, "logits/chosen": -17663296.0, "logits/rejected": -31427177.6, "logps/chosen": -221.0533650716146, "logps/rejected": -260.6842529296875, "loss": 0.2492, "rewards/chosen": 0.2035637100537618, "rewards/margins": 2.632288380463918, "rewards/rejected": -2.428724670410156, "step": 7700 }, { "epoch": 0.4081838178782498, "grad_norm": 55.25, "kl": 1.9068527221679688, "learning_rate": 5e-07, "logits/chosen": -12916784.0, "logits/rejected": 10789035.0, "logps/chosen": -489.4412841796875, "logps/rejected": -94.18627166748047, "loss": 0.3255, "rewards/chosen": 0.784349282582601, "rewards/margins": 2.787081321080526, "rewards/rejected": -2.002732038497925, "step": 7701 }, { "epoch": 0.40823682188005195, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11436382.4, "logits/rejected": 9972302.0, "logps/chosen": -186.3360107421875, "logps/rejected": -353.576416015625, "loss": 0.35, "rewards/chosen": 0.14501137733459474, "rewards/margins": 2.211366446812948, "rewards/rejected": -2.066355069478353, "step": 7702 }, { "epoch": 0.4082898258818541, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6425640.0, "logits/rejected": -20471976.0, "logps/chosen": -138.258349609375, "logps/rejected": -394.3986002604167, "loss": 0.3128, "rewards/chosen": 0.42335205078125, "rewards/margins": 2.135501194000244, "rewards/rejected": -1.7121491432189941, "step": 7703 }, { "epoch": 0.4083428298836562, "grad_norm": 57.0, "kl": 1.1070671081542969, "learning_rate": 5e-07, "logits/chosen": -62453344.0, "logits/rejected": -18510709.333333332, "logps/chosen": -376.10478515625, "logps/rejected": -389.2723795572917, "loss": 0.4356, "rewards/chosen": -0.21874496936798096, "rewards/margins": 1.6553029457728068, "rewards/rejected": -1.8740479151407878, "step": 7704 }, { "epoch": 0.40839583388545836, "grad_norm": 49.25, "kl": 1.3068981170654297, "learning_rate": 5e-07, "logits/chosen": -36684248.0, "logits/rejected": -23992018.0, "logps/chosen": -333.226806640625, "logps/rejected": -169.6278076171875, "loss": 0.3294, "rewards/chosen": 0.28268662095069885, "rewards/margins": 2.130993276834488, "rewards/rejected": -1.848306655883789, "step": 7705 }, { "epoch": 0.4084488378872605, "grad_norm": 91.5, "kl": 0.9195556640625, "learning_rate": 5e-07, "logits/chosen": -6506851.333333333, "logits/rejected": -27914136.0, "logps/chosen": -472.7476399739583, "logps/rejected": -866.6943969726562, "loss": 0.3632, "rewards/chosen": 0.3747828006744385, "rewards/margins": 3.278785467147827, "rewards/rejected": -2.9040026664733887, "step": 7706 }, { "epoch": 0.40850184188906263, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28558640.0, "logits/rejected": -26630884.0, "logps/chosen": -279.24330647786456, "logps/rejected": -65.85365295410156, "loss": 0.4049, "rewards/chosen": 0.0876677135626475, "rewards/margins": 1.8310601810614269, "rewards/rejected": -1.7433924674987793, "step": 7707 }, { "epoch": 0.40855484589086477, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66240458.666666664, "logits/rejected": -2444169.0, "logps/chosen": -363.8334147135417, "logps/rejected": -87.65238037109376, "loss": 0.3273, "rewards/chosen": 0.3088623285293579, "rewards/margins": 1.6051372766494751, "rewards/rejected": -1.2962749481201172, "step": 7708 }, { "epoch": 0.4086078498926669, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32087786.666666668, "logits/rejected": -2221867.4, "logps/chosen": -182.24641927083334, "logps/rejected": -263.1122314453125, "loss": 0.2824, "rewards/chosen": 0.05031471451123556, "rewards/margins": 2.000796510775884, "rewards/rejected": -1.9504817962646483, "step": 7709 }, { "epoch": 0.40866085389446904, "grad_norm": 70.5, "kl": 3.3220367431640625, "learning_rate": 5e-07, "logits/chosen": 19567497.333333332, "logits/rejected": -30619646.0, "logps/chosen": -368.8141276041667, "logps/rejected": -252.48789978027344, "loss": 0.4627, "rewards/chosen": 0.28218309084574383, "rewards/margins": 2.1586517492930093, "rewards/rejected": -1.8764686584472656, "step": 7710 }, { "epoch": 0.4087138578962712, "grad_norm": 64.5, "kl": 0.4026784896850586, "learning_rate": 5e-07, "logits/chosen": -9202648.0, "logits/rejected": -28828549.333333332, "logps/chosen": -399.669091796875, "logps/rejected": -413.531982421875, "loss": 0.3298, "rewards/chosen": 0.3144932746887207, "rewards/margins": 2.713762378692627, "rewards/rejected": -2.3992691040039062, "step": 7711 }, { "epoch": 0.4087668618980733, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38703340.0, "logits/rejected": -32773200.0, "logps/chosen": -182.0378875732422, "logps/rejected": -284.2269694010417, "loss": 0.3216, "rewards/chosen": -0.3288409411907196, "rewards/margins": 0.9258862634499867, "rewards/rejected": -1.2547272046407063, "step": 7712 }, { "epoch": 0.40881986589987546, "grad_norm": 29.5, "kl": 0.36391448974609375, "learning_rate": 5e-07, "logits/chosen": 781166.5, "logits/rejected": -31759134.0, "logps/chosen": -84.46367645263672, "logps/rejected": -361.3687438964844, "loss": 0.2686, "rewards/chosen": 0.2779536545276642, "rewards/margins": 2.8659572899341583, "rewards/rejected": -2.588003635406494, "step": 7713 }, { "epoch": 0.4088728699016776, "grad_norm": 43.5, "kl": 0.013576507568359375, "learning_rate": 5e-07, "logits/chosen": -37593144.0, "logits/rejected": -12343406.666666666, "logps/chosen": -429.06170654296875, "logps/rejected": -223.5699666341146, "loss": 0.2573, "rewards/chosen": 1.0088363885879517, "rewards/margins": 2.2867648998896284, "rewards/rejected": -1.2779285113016765, "step": 7714 }, { "epoch": 0.40892587390347973, "grad_norm": 38.5, "kl": 0.6202125549316406, "learning_rate": 5e-07, "logits/chosen": -12076012.0, "logits/rejected": -33557632.0, "logps/chosen": -292.080078125, "logps/rejected": -281.6058349609375, "loss": 0.1998, "rewards/chosen": 1.4882700443267822, "rewards/margins": 3.463152050971985, "rewards/rejected": -1.9748820066452026, "step": 7715 }, { "epoch": 0.40897887790528187, "grad_norm": 45.5, "kl": 0.03040313720703125, "learning_rate": 5e-07, "logits/chosen": -22833700.0, "logits/rejected": 15435367.0, "logps/chosen": -323.9622802734375, "logps/rejected": -208.68402099609375, "loss": 0.3221, "rewards/chosen": 0.2886369526386261, "rewards/margins": 1.7470205128192902, "rewards/rejected": -1.458383560180664, "step": 7716 }, { "epoch": 0.409031881907084, "grad_norm": 66.5, "kl": 1.9984664916992188, "learning_rate": 5e-07, "logits/chosen": 1340299.1666666667, "logits/rejected": -32058822.4, "logps/chosen": -643.9954833984375, "logps/rejected": -412.684033203125, "loss": 0.258, "rewards/chosen": 0.7145039240519205, "rewards/margins": 2.729605547587077, "rewards/rejected": -2.0151016235351564, "step": 7717 }, { "epoch": 0.40908488590888614, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17643357.333333332, "logits/rejected": -7024167.2, "logps/chosen": -131.3394775390625, "logps/rejected": -468.71298828125, "loss": 0.302, "rewards/chosen": 0.2579474051793416, "rewards/margins": 1.843907602628072, "rewards/rejected": -1.5859601974487305, "step": 7718 }, { "epoch": 0.4091378899106883, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46781728.0, "logits/rejected": -37237592.0, "logps/chosen": -435.0226135253906, "logps/rejected": -453.80743408203125, "loss": 0.2962, "rewards/chosen": 0.27177128195762634, "rewards/margins": 2.2494040429592133, "rewards/rejected": -1.977632761001587, "step": 7719 }, { "epoch": 0.4091908939124904, "grad_norm": 41.5, "kl": 0.7190380096435547, "learning_rate": 5e-07, "logits/chosen": -19299929.333333332, "logits/rejected": -34826361.6, "logps/chosen": -168.8895060221354, "logps/rejected": -450.796337890625, "loss": 0.2829, "rewards/chosen": 0.03466058770815531, "rewards/margins": 2.3564306835333504, "rewards/rejected": -2.3217700958251952, "step": 7720 }, { "epoch": 0.40924389791429255, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1647532.75, "logits/rejected": -19669529.333333332, "logps/chosen": -40.96319580078125, "logps/rejected": -265.444580078125, "loss": 0.2411, "rewards/chosen": 0.16883113980293274, "rewards/margins": 1.98241991798083, "rewards/rejected": -1.8135887781778972, "step": 7721 }, { "epoch": 0.4092969019160947, "grad_norm": 49.5, "kl": 0.8816547393798828, "learning_rate": 5e-07, "logits/chosen": -23005466.666666668, "logits/rejected": -14916425.6, "logps/chosen": -251.51961263020834, "logps/rejected": -219.922265625, "loss": 0.3296, "rewards/chosen": 0.1134913166364034, "rewards/margins": 1.6671960552533467, "rewards/rejected": -1.5537047386169434, "step": 7722 }, { "epoch": 0.40934990591789683, "grad_norm": 45.5, "kl": 0.6829872131347656, "learning_rate": 5e-07, "logits/chosen": -34107942.4, "logits/rejected": -21271221.333333332, "logps/chosen": -225.481689453125, "logps/rejected": -106.5421651204427, "loss": 0.3723, "rewards/chosen": 0.219518780708313, "rewards/margins": 1.7145784298578899, "rewards/rejected": -1.495059649149577, "step": 7723 }, { "epoch": 0.4094029099196989, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8852247.0, "logits/rejected": -27286344.0, "logps/chosen": -51.538795471191406, "logps/rejected": -254.20511881510416, "loss": 0.3086, "rewards/chosen": 0.528437614440918, "rewards/margins": 1.512753963470459, "rewards/rejected": -0.984316349029541, "step": 7724 }, { "epoch": 0.40945591392150105, "grad_norm": 63.0, "kl": 4.036014556884766, "learning_rate": 5e-07, "logits/chosen": -86184960.0, "logits/rejected": -52745952.0, "logps/chosen": -954.2179361979166, "logps/rejected": -350.6577880859375, "loss": 0.2249, "rewards/chosen": 1.4784677823384602, "rewards/margins": 4.048127587636312, "rewards/rejected": -2.5696598052978517, "step": 7725 }, { "epoch": 0.4095089179233032, "grad_norm": 51.0, "kl": 0.6376266479492188, "learning_rate": 5e-07, "logits/chosen": -50504757.333333336, "logits/rejected": -40139436.8, "logps/chosen": -343.1815185546875, "logps/rejected": -294.519873046875, "loss": 0.3579, "rewards/chosen": -0.47336451212565106, "rewards/margins": 1.3691090901692708, "rewards/rejected": -1.842473602294922, "step": 7726 }, { "epoch": 0.4095619219251053, "grad_norm": 52.75, "kl": 2.329029083251953, "learning_rate": 5e-07, "logits/chosen": -23585232.0, "logits/rejected": -28371278.0, "logps/chosen": -245.47012765066964, "logps/rejected": -737.9349365234375, "loss": 0.4457, "rewards/chosen": 0.26822311537606375, "rewards/margins": 4.701615640095302, "rewards/rejected": -4.433392524719238, "step": 7727 }, { "epoch": 0.40961492592690746, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23676828.0, "logps/rejected": -348.60260009765625, "loss": 0.1544, "rewards/rejected": -2.138923168182373, "step": 7728 }, { "epoch": 0.4096679299287096, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33527157.333333332, "logits/rejected": -7079829.6, "logps/chosen": -303.37241617838544, "logps/rejected": -348.5025634765625, "loss": 0.32, "rewards/chosen": 0.0471511979897817, "rewards/margins": 1.5422668596108753, "rewards/rejected": -1.4951156616210937, "step": 7729 }, { "epoch": 0.40972093393051173, "grad_norm": 43.5, "kl": 0.7231006622314453, "learning_rate": 5e-07, "logits/chosen": -13933908.0, "logits/rejected": -35115108.0, "logps/chosen": -252.03958129882812, "logps/rejected": -392.065185546875, "loss": 0.2326, "rewards/chosen": 0.9378301501274109, "rewards/margins": 2.9217540621757507, "rewards/rejected": -1.9839239120483398, "step": 7730 }, { "epoch": 0.40977393793231387, "grad_norm": 39.25, "kl": 0.07101058959960938, "learning_rate": 5e-07, "logits/chosen": -27135621.333333332, "logits/rejected": -27620140.8, "logps/chosen": -124.29249064127605, "logps/rejected": -431.237158203125, "loss": 0.2931, "rewards/chosen": -0.14040749271710715, "rewards/margins": 1.8757277230421703, "rewards/rejected": -2.0161352157592773, "step": 7731 }, { "epoch": 0.409826941934116, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23559683.2, "logits/rejected": -16083360.0, "logps/chosen": -111.5057861328125, "logps/rejected": -296.4546305338542, "loss": 0.3869, "rewards/chosen": 0.05012646913528442, "rewards/margins": 1.6930383801460267, "rewards/rejected": -1.6429119110107422, "step": 7732 }, { "epoch": 0.40987994593591814, "grad_norm": 52.5, "kl": 1.3744354248046875, "learning_rate": 5e-07, "logits/chosen": -16401377.333333334, "logits/rejected": -17456692.8, "logps/chosen": -605.2396647135416, "logps/rejected": -389.4650390625, "loss": 0.1651, "rewards/chosen": 1.3391059239705403, "rewards/margins": 3.8757758458455402, "rewards/rejected": -2.536669921875, "step": 7733 }, { "epoch": 0.4099329499377203, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9256966.0, "logits/rejected": -19472100.0, "logps/chosen": -219.22203063964844, "logps/rejected": -651.7904052734375, "loss": 0.3485, "rewards/chosen": -0.1675388365983963, "rewards/margins": 1.8303640335798264, "rewards/rejected": -1.9979028701782227, "step": 7734 }, { "epoch": 0.4099859539395224, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58919924.0, "logits/rejected": -49457512.0, "logps/chosen": -318.1612548828125, "logps/rejected": -424.2072448730469, "loss": 0.2455, "rewards/chosen": 0.3287550210952759, "rewards/margins": 2.9737104177474976, "rewards/rejected": -2.6449553966522217, "step": 7735 }, { "epoch": 0.41003895794132456, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22980586.666666668, "logits/rejected": -56989593.6, "logps/chosen": -213.08048502604166, "logps/rejected": -509.0654296875, "loss": 0.2692, "rewards/chosen": 0.01418775071700414, "rewards/margins": 2.5879487370451293, "rewards/rejected": -2.573760986328125, "step": 7736 }, { "epoch": 0.4100919619431267, "grad_norm": 70.5, "kl": 0.8523197174072266, "learning_rate": 5e-07, "logits/chosen": -11798998.4, "logits/rejected": -28082714.666666668, "logps/chosen": -483.2205078125, "logps/rejected": -365.7264811197917, "loss": 0.288, "rewards/chosen": 0.51300950050354, "rewards/margins": 3.131047360102335, "rewards/rejected": -2.6180378595987954, "step": 7737 }, { "epoch": 0.41014496594492883, "grad_norm": 55.0, "kl": 0.1651315689086914, "learning_rate": 5e-07, "logits/chosen": -18319297.14285714, "logits/rejected": 6190476.0, "logps/chosen": -113.29164341517857, "logps/rejected": -182.7333221435547, "loss": 0.4303, "rewards/chosen": 0.11494636535644531, "rewards/margins": 2.2519474029541016, "rewards/rejected": -2.1370010375976562, "step": 7738 }, { "epoch": 0.41019796994673097, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56056346.666666664, "logits/rejected": -15381113.6, "logps/chosen": -384.873779296875, "logps/rejected": -213.5684814453125, "loss": 0.1581, "rewards/chosen": 1.3015529314676921, "rewards/margins": 3.3584397951761886, "rewards/rejected": -2.0568868637084963, "step": 7739 }, { "epoch": 0.4102509739485331, "grad_norm": 63.5, "kl": 3.4354305267333984, "learning_rate": 5e-07, "logits/chosen": 6249612.8, "logits/rejected": -51768592.0, "logps/chosen": -621.11787109375, "logps/rejected": -708.3924153645834, "loss": 0.3477, "rewards/chosen": 0.5962806224822998, "rewards/margins": 4.53118535677592, "rewards/rejected": -3.9349047342936196, "step": 7740 }, { "epoch": 0.41030397795033524, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11269202.0, "logits/rejected": 2071229.5, "logps/chosen": -139.41905212402344, "logps/rejected": -221.739013671875, "loss": 0.2612, "rewards/chosen": -0.21345862746238708, "rewards/margins": 1.4994884232680004, "rewards/rejected": -1.7129470507303874, "step": 7741 }, { "epoch": 0.4103569819521374, "grad_norm": 38.25, "kl": 1.0280685424804688, "learning_rate": 5e-07, "logits/chosen": -26904248.0, "logits/rejected": -23001936.0, "logps/chosen": -221.0129597981771, "logps/rejected": -271.2866455078125, "loss": 0.2429, "rewards/chosen": 0.7141997019449869, "rewards/margins": 2.9096441904703774, "rewards/rejected": -2.1954444885253905, "step": 7742 }, { "epoch": 0.4104099859539395, "grad_norm": 63.0, "kl": 0.6098594665527344, "learning_rate": 5e-07, "logits/chosen": -17898426.0, "logits/rejected": 23280272.0, "logps/chosen": -485.607177734375, "logps/rejected": -217.21661376953125, "loss": 0.2602, "rewards/chosen": 0.9679134488105774, "rewards/margins": 2.509228765964508, "rewards/rejected": -1.5413153171539307, "step": 7743 }, { "epoch": 0.41046298995574165, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11771009.0, "logits/rejected": -12231950.666666666, "logps/chosen": -41.89680480957031, "logps/rejected": -327.6430257161458, "loss": 0.3024, "rewards/chosen": -0.2539060413837433, "rewards/margins": 1.220661054054896, "rewards/rejected": -1.4745670954386394, "step": 7744 }, { "epoch": 0.4105159939575438, "grad_norm": 59.25, "kl": 0.6914291381835938, "learning_rate": 5e-07, "logits/chosen": -50247936.0, "logits/rejected": -7820702.5, "logps/chosen": -321.9820861816406, "logps/rejected": -213.14938354492188, "loss": 0.3696, "rewards/chosen": 0.3149700164794922, "rewards/margins": 1.2913593649864197, "rewards/rejected": -0.9763893485069275, "step": 7745 }, { "epoch": 0.4105689979593459, "grad_norm": 43.75, "kl": 0.4571342468261719, "learning_rate": 5e-07, "logits/chosen": 12043584.0, "logits/rejected": 7301928.0, "logps/chosen": -122.68315887451172, "logps/rejected": -436.1527099609375, "loss": 0.353, "rewards/chosen": -0.13276739418506622, "rewards/margins": 1.8067152947187424, "rewards/rejected": -1.9394826889038086, "step": 7746 }, { "epoch": 0.41062200196114806, "grad_norm": 55.0, "kl": 0.05160331726074219, "learning_rate": 5e-07, "logits/chosen": -2727546.0, "logits/rejected": -52581176.0, "logps/chosen": -156.3541259765625, "logps/rejected": -264.1063537597656, "loss": 0.4486, "rewards/chosen": -0.07418644924958546, "rewards/margins": 1.1518655965725582, "rewards/rejected": -1.2260520458221436, "step": 7747 }, { "epoch": 0.4106750059629502, "grad_norm": 58.0, "kl": 0.6801700592041016, "learning_rate": 5e-07, "logits/chosen": 6746311.2, "logits/rejected": -36340522.666666664, "logps/chosen": -238.791064453125, "logps/rejected": -341.402099609375, "loss": 0.3901, "rewards/chosen": -0.08996756076812744, "rewards/margins": 1.93591738541921, "rewards/rejected": -2.0258849461873374, "step": 7748 }, { "epoch": 0.41072800996475234, "grad_norm": 54.25, "kl": 0.12081050872802734, "learning_rate": 5e-07, "logits/chosen": -19169760.0, "logits/rejected": -30657282.666666668, "logps/chosen": -325.7284423828125, "logps/rejected": -298.6808268229167, "loss": 0.3092, "rewards/chosen": 0.4307108879089355, "rewards/margins": 2.3515896161397296, "rewards/rejected": -1.9208787282307942, "step": 7749 }, { "epoch": 0.4107810139665545, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25856314.666666668, "logits/rejected": -6684962.4, "logps/chosen": -408.3811848958333, "logps/rejected": -219.60341796875, "loss": 0.2882, "rewards/chosen": 0.04650929570198059, "rewards/margins": 2.8323701202869414, "rewards/rejected": -2.7858608245849608, "step": 7750 }, { "epoch": 0.4108340179683566, "grad_norm": 43.5, "kl": 2.6821136474609375, "learning_rate": 5e-07, "logits/chosen": -24759424.0, "logits/rejected": -20241298.666666668, "logps/chosen": -254.254150390625, "logps/rejected": -340.11159261067706, "loss": 0.2744, "rewards/chosen": 0.8804155349731445, "rewards/margins": 3.983223533630371, "rewards/rejected": -3.1028079986572266, "step": 7751 }, { "epoch": 0.41088702197015875, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35277737.6, "logits/rejected": -17792205.333333332, "logps/chosen": -318.316748046875, "logps/rejected": -363.4185384114583, "loss": 0.3356, "rewards/chosen": 0.40195069313049314, "rewards/margins": 3.1064852237701417, "rewards/rejected": -2.7045345306396484, "step": 7752 }, { "epoch": 0.4109400259719609, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6639814.666666667, "logits/rejected": -62442732.8, "logps/chosen": -261.62725830078125, "logps/rejected": -368.078759765625, "loss": 0.2835, "rewards/chosen": 0.2195399006207784, "rewards/margins": 1.95934130748113, "rewards/rejected": -1.7398014068603516, "step": 7753 }, { "epoch": 0.410993029973763, "grad_norm": 56.5, "kl": 0.064117431640625, "learning_rate": 5e-07, "logits/chosen": -1931473.3333333333, "logits/rejected": -1777016.0, "logps/chosen": -395.5262044270833, "logps/rejected": -102.97008056640625, "loss": 0.3062, "rewards/chosen": 0.2732045650482178, "rewards/margins": 1.651186227798462, "rewards/rejected": -1.3779816627502441, "step": 7754 }, { "epoch": 0.41104603397556516, "grad_norm": 45.0, "kl": 0.37988853454589844, "learning_rate": 5e-07, "logits/chosen": -10684582.0, "logits/rejected": 3964114.8, "logps/chosen": -235.19486490885416, "logps/rejected": -246.836181640625, "loss": 0.2844, "rewards/chosen": 0.2553037603696187, "rewards/margins": 1.8985414465268453, "rewards/rejected": -1.6432376861572267, "step": 7755 }, { "epoch": 0.4110990379773673, "grad_norm": 47.25, "kl": 0.2869606018066406, "learning_rate": 5e-07, "logits/chosen": -18506712.0, "logits/rejected": -43460665.6, "logps/chosen": -411.16796875, "logps/rejected": -616.55185546875, "loss": 0.1955, "rewards/chosen": 0.2781687577565511, "rewards/margins": 3.5129168351491296, "rewards/rejected": -3.2347480773925783, "step": 7756 }, { "epoch": 0.41115204197916944, "grad_norm": 61.25, "kl": 0.71002197265625, "learning_rate": 5e-07, "logits/chosen": -12037246.4, "logits/rejected": -9220222.0, "logps/chosen": -271.689208984375, "logps/rejected": -391.33203125, "loss": 0.3156, "rewards/chosen": 0.6333394050598145, "rewards/margins": 2.0185338656107588, "rewards/rejected": -1.385194460550944, "step": 7757 }, { "epoch": 0.4112050459809716, "grad_norm": 50.0, "kl": 1.2789039611816406, "learning_rate": 5e-07, "logits/chosen": -76881552.0, "logits/rejected": -6437146.5, "logps/chosen": -313.81085205078125, "logps/rejected": -367.180908203125, "loss": 0.286, "rewards/chosen": 0.3656059205532074, "rewards/margins": 3.419097512960434, "rewards/rejected": -3.0534915924072266, "step": 7758 }, { "epoch": 0.4112580499827737, "grad_norm": 52.0, "kl": 0.6470813751220703, "learning_rate": 5e-07, "logits/chosen": 5370692.0, "logits/rejected": -8774051.0, "logps/chosen": -268.56435139973956, "logps/rejected": -131.31390380859375, "loss": 0.3215, "rewards/chosen": 0.5880964199701945, "rewards/margins": 2.6861775318781533, "rewards/rejected": -2.098081111907959, "step": 7759 }, { "epoch": 0.41131105398457585, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5518811.333333333, "logits/rejected": -29051136.0, "logps/chosen": -242.48262532552084, "logps/rejected": -338.6525634765625, "loss": 0.2574, "rewards/chosen": 0.7952860196431478, "rewards/margins": 2.237497647603353, "rewards/rejected": -1.442211627960205, "step": 7760 }, { "epoch": 0.411364057986378, "grad_norm": 44.0, "kl": 0.5445632934570312, "learning_rate": 5e-07, "logits/chosen": -20134152.0, "logits/rejected": -37668752.0, "logps/chosen": -315.4208984375, "logps/rejected": -498.5690104166667, "loss": 0.273, "rewards/chosen": 0.882356834411621, "rewards/margins": 2.841242027282715, "rewards/rejected": -1.9588851928710938, "step": 7761 }, { "epoch": 0.4114170619881801, "grad_norm": 54.0, "kl": 0.0189361572265625, "learning_rate": 5e-07, "logits/chosen": -32417440.0, "logits/rejected": -11434188.8, "logps/chosen": -267.19573974609375, "logps/rejected": -187.98380126953126, "loss": 0.353, "rewards/chosen": 0.1424968640009562, "rewards/margins": 1.2063259998957316, "rewards/rejected": -1.0638291358947753, "step": 7762 }, { "epoch": 0.41147006598998226, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96931456.0, "logits/rejected": -17588032.0, "logps/chosen": -516.1292724609375, "logps/rejected": -214.06195068359375, "loss": 0.2094, "rewards/chosen": 0.6435562372207642, "rewards/margins": 2.4347123702367144, "rewards/rejected": -1.7911561330159504, "step": 7763 }, { "epoch": 0.4115230699917844, "grad_norm": 45.5, "kl": 0.4127540588378906, "learning_rate": 5e-07, "logits/chosen": -6722293.333333333, "logits/rejected": -26141379.2, "logps/chosen": -165.4876505533854, "logps/rejected": -474.785888671875, "loss": 0.2952, "rewards/chosen": 0.6075191100438436, "rewards/margins": 2.9259290297826133, "rewards/rejected": -2.3184099197387695, "step": 7764 }, { "epoch": 0.41157607399358653, "grad_norm": 49.75, "kl": 0.6602973937988281, "learning_rate": 5e-07, "logits/chosen": -18780636.0, "logits/rejected": -25116156.0, "logps/chosen": -241.70199584960938, "logps/rejected": -223.5686798095703, "loss": 0.241, "rewards/chosen": 0.536108672618866, "rewards/margins": 3.1517624258995056, "rewards/rejected": -2.6156537532806396, "step": 7765 }, { "epoch": 0.41162907799538867, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19949912.0, "logits/rejected": -15570938.666666666, "logps/chosen": -332.6277587890625, "logps/rejected": -161.4808349609375, "loss": 0.3778, "rewards/chosen": 0.10724564790725707, "rewards/margins": 1.5239379048347472, "rewards/rejected": -1.4166922569274902, "step": 7766 }, { "epoch": 0.4116820819971908, "grad_norm": 106.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49576952.0, "logits/rejected": -26903094.85714286, "logps/chosen": -743.130615234375, "logps/rejected": -287.5904017857143, "loss": 0.17, "rewards/chosen": 1.0966309309005737, "rewards/margins": 2.9392117261886597, "rewards/rejected": -1.842580795288086, "step": 7767 }, { "epoch": 0.41173508599899294, "grad_norm": 51.25, "kl": 0.722442626953125, "learning_rate": 5e-07, "logits/chosen": -52938252.8, "logits/rejected": 1741796.0, "logps/chosen": -222.148046875, "logps/rejected": -67.2787373860677, "loss": 0.3436, "rewards/chosen": 0.2952445983886719, "rewards/margins": 2.2696068127950033, "rewards/rejected": -1.9743622144063313, "step": 7768 }, { "epoch": 0.4117880900007951, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35927556.0, "logits/rejected": -23479290.666666668, "logps/chosen": -242.97970581054688, "logps/rejected": -219.6002197265625, "loss": 0.2304, "rewards/chosen": -0.17702941596508026, "rewards/margins": 2.077731485168139, "rewards/rejected": -2.2547609011332193, "step": 7769 }, { "epoch": 0.4118410940025972, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16789846.0, "logits/rejected": -27881592.0, "logps/chosen": -607.046630859375, "logps/rejected": -286.880126953125, "loss": 0.2594, "rewards/chosen": 0.6183914542198181, "rewards/margins": 2.663683831691742, "rewards/rejected": -2.045292377471924, "step": 7770 }, { "epoch": 0.41189409800439936, "grad_norm": 64.5, "kl": 0.24907779693603516, "learning_rate": 5e-07, "logits/chosen": -28932825.6, "logits/rejected": 15051757.333333334, "logps/chosen": -333.436669921875, "logps/rejected": -206.4852498372396, "loss": 0.3655, "rewards/chosen": 0.2807185411453247, "rewards/margins": 1.5300384918848673, "rewards/rejected": -1.2493199507395427, "step": 7771 }, { "epoch": 0.4119471020062015, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15850397.333333334, "logits/rejected": -23546032.0, "logps/chosen": -243.3435262044271, "logps/rejected": -494.88173828125, "loss": 0.1462, "rewards/chosen": 1.2305199305216472, "rewards/margins": 4.091034952799479, "rewards/rejected": -2.860515022277832, "step": 7772 }, { "epoch": 0.41200010600800363, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1826166.0, "logits/rejected": -19311876.0, "logps/chosen": -382.6788635253906, "logps/rejected": -493.38372802734375, "loss": 0.3337, "rewards/chosen": -0.021496586501598358, "rewards/margins": 2.086367316544056, "rewards/rejected": -2.1078639030456543, "step": 7773 }, { "epoch": 0.41205311000980577, "grad_norm": 34.0, "kl": 0.037677764892578125, "learning_rate": 5e-07, "logits/chosen": -4696647.0, "logits/rejected": -30937360.0, "logps/chosen": -167.82020568847656, "logps/rejected": -465.4888509114583, "loss": 0.1539, "rewards/chosen": 0.833861768245697, "rewards/margins": 3.440725107987722, "rewards/rejected": -2.606863339742025, "step": 7774 }, { "epoch": 0.41210611401160785, "grad_norm": 41.75, "kl": 0.2270355224609375, "learning_rate": 5e-07, "logits/chosen": 1583343.0, "logits/rejected": -40356344.0, "logps/chosen": -259.94140625, "logps/rejected": -479.22735595703125, "loss": 0.2601, "rewards/chosen": 0.8902397155761719, "rewards/margins": 2.3146904706954956, "rewards/rejected": -1.4244507551193237, "step": 7775 }, { "epoch": 0.41215911801341, "grad_norm": 46.0, "kl": 2.4223899841308594, "learning_rate": 5e-07, "logits/chosen": -4587235.5, "logits/rejected": -18526482.666666668, "logps/chosen": -934.7803344726562, "logps/rejected": -392.05908203125, "loss": 0.2123, "rewards/chosen": 1.4157689809799194, "rewards/margins": 3.3495175441106158, "rewards/rejected": -1.9337485631306965, "step": 7776 }, { "epoch": 0.4122121220152121, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29693718.0, "logits/rejected": -54734040.0, "logps/chosen": -365.0416564941406, "logps/rejected": -456.2009582519531, "loss": 0.2505, "rewards/chosen": 0.34909653663635254, "rewards/margins": 2.910531520843506, "rewards/rejected": -2.5614349842071533, "step": 7777 }, { "epoch": 0.41226512601701426, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84976200.0, "logits/rejected": -34404888.0, "logps/chosen": -458.0057067871094, "logps/rejected": -475.1185302734375, "loss": 0.2423, "rewards/chosen": 0.7384712100028992, "rewards/margins": 2.771603763103485, "rewards/rejected": -2.033132553100586, "step": 7778 }, { "epoch": 0.4123181300188164, "grad_norm": 69.0, "kl": 0.51806640625, "learning_rate": 5e-07, "logits/chosen": -98355882.66666667, "logits/rejected": -51785160.0, "logps/chosen": -537.508056640625, "logps/rejected": -429.8521423339844, "loss": 0.3157, "rewards/chosen": 0.5370854536692301, "rewards/margins": 3.3036035696665444, "rewards/rejected": -2.7665181159973145, "step": 7779 }, { "epoch": 0.41237113402061853, "grad_norm": 46.0, "kl": 0.7648096084594727, "learning_rate": 5e-07, "logits/chosen": -27089928.0, "logits/rejected": -30935676.8, "logps/chosen": -324.5761311848958, "logps/rejected": -221.723876953125, "loss": 0.3081, "rewards/chosen": 0.18215179443359375, "rewards/margins": 1.7687767028808594, "rewards/rejected": -1.5866249084472657, "step": 7780 }, { "epoch": 0.41242413802242067, "grad_norm": 50.5, "kl": 1.3092422485351562, "learning_rate": 5e-07, "logits/chosen": -47166024.0, "logits/rejected": -39543728.0, "logps/chosen": -353.2650451660156, "logps/rejected": -524.0784912109375, "loss": 0.1895, "rewards/chosen": 1.070185661315918, "rewards/margins": 4.553320646286011, "rewards/rejected": -3.4831349849700928, "step": 7781 }, { "epoch": 0.4124771420242228, "grad_norm": 67.5, "kl": 1.492919921875, "learning_rate": 5e-07, "logits/chosen": -28072220.8, "logits/rejected": -7603825.333333333, "logps/chosen": -677.3818359375, "logps/rejected": -126.1704813639323, "loss": 0.2608, "rewards/chosen": 1.1881009101867677, "rewards/margins": 2.4655113379160563, "rewards/rejected": -1.2774104277292888, "step": 7782 }, { "epoch": 0.41253014602602495, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17284420.0, "logits/rejected": 2061171.6666666667, "logps/chosen": -403.4979248046875, "logps/rejected": -185.92081705729166, "loss": 0.255, "rewards/chosen": 0.183685302734375, "rewards/margins": 1.8460278511047363, "rewards/rejected": -1.6623425483703613, "step": 7783 }, { "epoch": 0.4125831500278271, "grad_norm": 52.0, "kl": 1.2766685485839844, "learning_rate": 5e-07, "logits/chosen": -34277385.6, "logits/rejected": -20355448.0, "logps/chosen": -254.73759765625, "logps/rejected": -218.0080362955729, "loss": 0.3501, "rewards/chosen": 0.5615556716918946, "rewards/margins": 1.9526606877644856, "rewards/rejected": -1.391105016072591, "step": 7784 }, { "epoch": 0.4126361540296292, "grad_norm": 49.75, "kl": 1.045259952545166, "learning_rate": 5e-07, "logits/chosen": -33872341.333333336, "logits/rejected": -726651.25, "logps/chosen": -288.5734049479167, "logps/rejected": -102.13221740722656, "loss": 0.2949, "rewards/chosen": 0.6520011822382609, "rewards/margins": 4.113943139712016, "rewards/rejected": -3.461941957473755, "step": 7785 }, { "epoch": 0.41268915803143136, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14499970.666666666, "logits/rejected": -29063302.4, "logps/chosen": -433.8751627604167, "logps/rejected": -464.140185546875, "loss": 0.2514, "rewards/chosen": -0.09047496318817139, "rewards/margins": 2.4400384187698365, "rewards/rejected": -2.530513381958008, "step": 7786 }, { "epoch": 0.4127421620332335, "grad_norm": 44.0, "kl": 0.6794090270996094, "learning_rate": 5e-07, "logits/chosen": -115628.6875, "logits/rejected": -3681810.75, "logps/chosen": -235.62510681152344, "logps/rejected": -101.87915802001953, "loss": 0.3132, "rewards/chosen": 0.36831724643707275, "rewards/margins": 2.0056122541427612, "rewards/rejected": -1.6372950077056885, "step": 7787 }, { "epoch": 0.41279516603503563, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8563032.57142857, "logits/rejected": -13459027.0, "logps/chosen": -180.98179408482142, "logps/rejected": -121.52177429199219, "loss": 0.4047, "rewards/chosen": 0.3763523783002581, "rewards/margins": 1.2286740030561174, "rewards/rejected": -0.8523216247558594, "step": 7788 }, { "epoch": 0.41284817003683777, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8175616.5, "logits/rejected": -18616686.0, "logps/chosen": -155.48681640625, "logps/rejected": -404.7369384765625, "loss": 0.3629, "rewards/chosen": -0.19262677431106567, "rewards/margins": 1.435409963130951, "rewards/rejected": -1.6280367374420166, "step": 7789 }, { "epoch": 0.4129011740386399, "grad_norm": 37.5, "kl": 0.22042274475097656, "learning_rate": 5e-07, "logits/chosen": -44314216.0, "logits/rejected": -27958756.0, "logps/chosen": -205.69281005859375, "logps/rejected": -241.57484436035156, "loss": 0.3121, "rewards/chosen": 0.2961057424545288, "rewards/margins": 2.4150794744491577, "rewards/rejected": -2.118973731994629, "step": 7790 }, { "epoch": 0.41295417804044204, "grad_norm": 50.5, "kl": 0.9794578552246094, "learning_rate": 5e-07, "logits/chosen": -48983628.0, "logits/rejected": 384393.0, "logps/chosen": -477.5242919921875, "logps/rejected": -487.2752685546875, "loss": 0.2637, "rewards/chosen": 0.5183943510055542, "rewards/margins": 2.9082571268081665, "rewards/rejected": -2.3898627758026123, "step": 7791 }, { "epoch": 0.4130071820422442, "grad_norm": 44.0, "kl": 0.6121902465820312, "learning_rate": 5e-07, "logits/chosen": -67468922.66666667, "logits/rejected": -11135269.6, "logps/chosen": -197.8194580078125, "logps/rejected": -258.267919921875, "loss": 0.3113, "rewards/chosen": -0.14518781503041586, "rewards/margins": 2.2565215508143104, "rewards/rejected": -2.4017093658447264, "step": 7792 }, { "epoch": 0.4130601860440463, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64384549.333333336, "logits/rejected": -34716953.6, "logps/chosen": -432.6991373697917, "logps/rejected": -329.651025390625, "loss": 0.2305, "rewards/chosen": 0.31998393932978314, "rewards/margins": 2.503181723753611, "rewards/rejected": -2.183197784423828, "step": 7793 }, { "epoch": 0.41311319004584846, "grad_norm": 63.25, "kl": 1.5450420379638672, "learning_rate": 5e-07, "logits/chosen": -2932663.2, "logits/rejected": 8296205.333333333, "logps/chosen": -571.709716796875, "logps/rejected": -304.7936604817708, "loss": 0.274, "rewards/chosen": 0.7003512382507324, "rewards/margins": 2.633634090423584, "rewards/rejected": -1.9332828521728516, "step": 7794 }, { "epoch": 0.4131661940476506, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50735043.2, "logits/rejected": 54500357.333333336, "logps/chosen": -304.4662841796875, "logps/rejected": -156.13712565104166, "loss": 0.3557, "rewards/chosen": 0.1864662289619446, "rewards/margins": 1.6933469891548156, "rewards/rejected": -1.506880760192871, "step": 7795 }, { "epoch": 0.41321919804945273, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14670296.0, "logits/rejected": -14366884.0, "logps/chosen": -535.819580078125, "logps/rejected": -202.18973795572916, "loss": 0.2231, "rewards/chosen": -0.029934704303741455, "rewards/margins": 2.1269906560579934, "rewards/rejected": -2.156925360361735, "step": 7796 }, { "epoch": 0.41327220205125487, "grad_norm": 47.75, "kl": 1.2635307312011719, "learning_rate": 5e-07, "logits/chosen": -10588603.333333334, "logits/rejected": -53485276.0, "logps/chosen": -230.51790364583334, "logps/rejected": -959.1368408203125, "loss": 0.3563, "rewards/chosen": 0.43237515290578205, "rewards/margins": 5.3138115008672075, "rewards/rejected": -4.881436347961426, "step": 7797 }, { "epoch": 0.413325206053057, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47749557.333333336, "logits/rejected": -34152227.2, "logps/chosen": -352.7953287760417, "logps/rejected": -384.8313720703125, "loss": 0.2464, "rewards/chosen": 0.5443573395411173, "rewards/margins": 2.612492791811625, "rewards/rejected": -2.068135452270508, "step": 7798 }, { "epoch": 0.41337821005485914, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36753204.0, "logits/rejected": 14278082.0, "logps/chosen": -451.31036376953125, "logps/rejected": -512.6353149414062, "loss": 0.3127, "rewards/chosen": -0.20205307006835938, "rewards/margins": 2.2404356002807617, "rewards/rejected": -2.442488670349121, "step": 7799 }, { "epoch": 0.4134312140566613, "grad_norm": 49.25, "kl": 0.47802734375, "learning_rate": 5e-07, "logits/chosen": -22494046.4, "logits/rejected": -42486634.666666664, "logps/chosen": -434.54150390625, "logps/rejected": -411.9827880859375, "loss": 0.2824, "rewards/chosen": 0.5524438858032227, "rewards/margins": 2.831612173716227, "rewards/rejected": -2.2791682879130044, "step": 7800 }, { "epoch": 0.4134842180584634, "grad_norm": 40.0, "kl": 0.19696044921875, "learning_rate": 5e-07, "logits/chosen": -25350714.666666668, "logits/rejected": -87153260.8, "logps/chosen": -235.0113321940104, "logps/rejected": -207.7362548828125, "loss": 0.3379, "rewards/chosen": -0.47081200281778973, "rewards/margins": 1.5542913754781085, "rewards/rejected": -2.0251033782958983, "step": 7801 }, { "epoch": 0.41353722206026555, "grad_norm": 60.75, "kl": 2.473001480102539, "learning_rate": 5e-07, "logits/chosen": -37828636.0, "logits/rejected": -9846626.0, "logps/chosen": -888.8129272460938, "logps/rejected": -308.7433776855469, "loss": 0.1985, "rewards/chosen": 1.4191818237304688, "rewards/margins": 3.8924145698547363, "rewards/rejected": -2.4732327461242676, "step": 7802 }, { "epoch": 0.4135902260620677, "grad_norm": 53.25, "kl": 2.207855224609375, "learning_rate": 5e-07, "logits/chosen": -48653472.0, "logits/rejected": -55907852.8, "logps/chosen": -685.6897786458334, "logps/rejected": -305.53046875, "loss": 0.3208, "rewards/chosen": 0.5870340665181478, "rewards/margins": 2.62814048131307, "rewards/rejected": -2.041106414794922, "step": 7803 }, { "epoch": 0.4136432300638698, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11647652.8, "logits/rejected": -15199474.666666666, "logps/chosen": -294.0350830078125, "logps/rejected": -367.1665445963542, "loss": 0.3795, "rewards/chosen": 0.16689786911010743, "rewards/margins": 1.9608778317769369, "rewards/rejected": -1.7939799626668294, "step": 7804 }, { "epoch": 0.41369623406567196, "grad_norm": 40.5, "kl": 1.8304786682128906, "learning_rate": 5e-07, "logits/chosen": -57850560.0, "logits/rejected": -4195184.666666667, "logps/chosen": -356.5739990234375, "logps/rejected": -146.49254353841147, "loss": 0.2663, "rewards/chosen": 1.0450345039367677, "rewards/margins": 2.706628259023031, "rewards/rejected": -1.661593755086263, "step": 7805 }, { "epoch": 0.4137492380674741, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40195988.0, "logits/rejected": -1162555.375, "logps/chosen": -372.7189636230469, "logps/rejected": -104.08409118652344, "loss": 0.2604, "rewards/chosen": 0.4336719214916229, "rewards/margins": 2.56306192278862, "rewards/rejected": -2.129390001296997, "step": 7806 }, { "epoch": 0.41380224206927624, "grad_norm": 56.5, "kl": 0.5294780731201172, "learning_rate": 5e-07, "logits/chosen": -40644662.4, "logits/rejected": -15619194.666666666, "logps/chosen": -285.76884765625, "logps/rejected": -248.07710774739584, "loss": 0.3801, "rewards/chosen": 0.3456049680709839, "rewards/margins": 1.67041912873586, "rewards/rejected": -1.3248141606648762, "step": 7807 }, { "epoch": 0.4138552460710784, "grad_norm": 48.75, "kl": 0.5831384658813477, "learning_rate": 5e-07, "logits/chosen": -26466140.8, "logits/rejected": -19833046.666666668, "logps/chosen": -284.9868408203125, "logps/rejected": -670.5009765625, "loss": 0.261, "rewards/chosen": 0.6273704051971436, "rewards/margins": 3.8345421632130945, "rewards/rejected": -3.2071717580159507, "step": 7808 }, { "epoch": 0.4139082500728805, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31022387.2, "logits/rejected": -21907725.333333332, "logps/chosen": -226.574462890625, "logps/rejected": -499.4395751953125, "loss": 0.4078, "rewards/chosen": -0.2413410186767578, "rewards/margins": 2.0292176564534508, "rewards/rejected": -2.2705586751302085, "step": 7809 }, { "epoch": 0.41396125407468265, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 535252.3125, "logits/rejected": -59557888.0, "logps/chosen": -62.859466552734375, "logps/rejected": -420.03515625, "loss": 0.1325, "rewards/chosen": 0.11276169121265411, "rewards/margins": 2.796931188021387, "rewards/rejected": -2.684169496808733, "step": 7810 }, { "epoch": 0.4140142580764848, "grad_norm": 65.5, "kl": 0.11361312866210938, "learning_rate": 5e-07, "logits/chosen": 2929607.0, "logits/rejected": -12731596.0, "logps/chosen": -759.8331298828125, "logps/rejected": -254.23350524902344, "loss": 0.2678, "rewards/chosen": 0.8308532238006592, "rewards/margins": 2.43792724609375, "rewards/rejected": -1.6070740222930908, "step": 7811 }, { "epoch": 0.4140672620782869, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -18099708.0, "logps/rejected": -278.88677978515625, "loss": 0.1338, "rewards/rejected": -2.2697391510009766, "step": 7812 }, { "epoch": 0.41412026608008906, "grad_norm": 62.5, "kl": 3.2491836547851562, "learning_rate": 5e-07, "logits/chosen": -19317100.0, "logits/rejected": -68014984.0, "logps/chosen": -427.8642985026042, "logps/rejected": -400.5216064453125, "loss": 0.3283, "rewards/chosen": 0.8687171141306559, "rewards/margins": 3.183104674021403, "rewards/rejected": -2.314387559890747, "step": 7813 }, { "epoch": 0.4141732700818912, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16930406.0, "logits/rejected": -51664600.0, "logps/chosen": -187.60011291503906, "logps/rejected": -356.5332336425781, "loss": 0.2366, "rewards/chosen": 0.5531681776046753, "rewards/margins": 3.1481207609176636, "rewards/rejected": -2.5949525833129883, "step": 7814 }, { "epoch": 0.41422627408369334, "grad_norm": 43.75, "kl": 0.3173370361328125, "learning_rate": 5e-07, "logits/chosen": -20943292.8, "logits/rejected": -64407205.333333336, "logps/chosen": -310.22685546875, "logps/rejected": -586.8693440755209, "loss": 0.2787, "rewards/chosen": 0.41466665267944336, "rewards/margins": 4.251617908477783, "rewards/rejected": -3.83695125579834, "step": 7815 }, { "epoch": 0.4142792780854955, "grad_norm": 43.0, "kl": 1.2148361206054688, "learning_rate": 5e-07, "logits/chosen": -14276060.0, "logits/rejected": -30626374.0, "logps/chosen": -247.53811645507812, "logps/rejected": -221.2718505859375, "loss": 0.3004, "rewards/chosen": 0.5005894899368286, "rewards/margins": 2.0609978437423706, "rewards/rejected": -1.560408353805542, "step": 7816 }, { "epoch": 0.4143322820872976, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52071573.333333336, "logits/rejected": -45936019.2, "logps/chosen": -326.0203043619792, "logps/rejected": -309.325244140625, "loss": 0.2383, "rewards/chosen": 0.4589940309524536, "rewards/margins": 2.7029195070266723, "rewards/rejected": -2.2439254760742187, "step": 7817 }, { "epoch": 0.41438528608909975, "grad_norm": 62.25, "kl": 2.788177490234375, "learning_rate": 5e-07, "logits/chosen": 1675265.6, "logits/rejected": -8851202.666666666, "logps/chosen": -334.518310546875, "logps/rejected": -390.4911295572917, "loss": 0.3306, "rewards/chosen": 0.7229094505310059, "rewards/margins": 2.626815636952718, "rewards/rejected": -1.9039061864217122, "step": 7818 }, { "epoch": 0.4144382900909019, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15876944.0, "logits/rejected": -196743.625, "logps/chosen": -97.52169799804688, "logps/rejected": -117.07294464111328, "loss": 0.3876, "rewards/chosen": -0.17739886045455933, "rewards/margins": 1.6151962876319885, "rewards/rejected": -1.7925951480865479, "step": 7819 }, { "epoch": 0.414491294092704, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11229276.0, "logits/rejected": -29664518.0, "logps/chosen": -306.86614990234375, "logps/rejected": -392.698486328125, "loss": 0.2699, "rewards/chosen": 0.4213012456893921, "rewards/margins": 2.502146601676941, "rewards/rejected": -2.080845355987549, "step": 7820 }, { "epoch": 0.41454429809450616, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14488110.666666666, "logits/rejected": -35091180.8, "logps/chosen": -221.70113118489584, "logps/rejected": -297.722119140625, "loss": 0.2051, "rewards/chosen": 0.8434083461761475, "rewards/margins": 2.8618247509002686, "rewards/rejected": -2.018416404724121, "step": 7821 }, { "epoch": 0.4145973020963083, "grad_norm": 66.5, "kl": 1.7233810424804688, "learning_rate": 5e-07, "logits/chosen": -18031362.666666668, "logits/rejected": -19675632.0, "logps/chosen": -336.49375406901044, "logps/rejected": -192.2384033203125, "loss": 0.4147, "rewards/chosen": 0.3412185509999593, "rewards/margins": 1.693895657857259, "rewards/rejected": -1.3526771068572998, "step": 7822 }, { "epoch": 0.41465030609811043, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40910201.6, "logits/rejected": -74672917.33333333, "logps/chosen": -218.421923828125, "logps/rejected": -669.534912109375, "loss": 0.339, "rewards/chosen": -0.07303135395050049, "rewards/margins": 3.157907239596049, "rewards/rejected": -3.2309385935465493, "step": 7823 }, { "epoch": 0.41470331009991257, "grad_norm": 81.0, "kl": 1.8852310180664062, "learning_rate": 5e-07, "logits/chosen": -12094523.0, "logits/rejected": -23652496.0, "logps/chosen": -317.17095947265625, "logps/rejected": -298.9218444824219, "loss": 0.3744, "rewards/chosen": 0.16937895119190216, "rewards/margins": 1.6896807700395584, "rewards/rejected": -1.5203018188476562, "step": 7824 }, { "epoch": 0.4147563141017147, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72549712.0, "logits/rejected": -37144536.0, "logps/chosen": -327.9534606933594, "logps/rejected": -236.94334411621094, "loss": 0.2879, "rewards/chosen": 0.6389849185943604, "rewards/margins": 2.1794285774230957, "rewards/rejected": -1.5404436588287354, "step": 7825 }, { "epoch": 0.4148093181035168, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12914730.0, "logits/rejected": -30708944.0, "logps/chosen": -254.46151733398438, "logps/rejected": -454.3519287109375, "loss": 0.1668, "rewards/chosen": 0.09782104194164276, "rewards/margins": 2.9639324794212976, "rewards/rejected": -2.866111437479655, "step": 7826 }, { "epoch": 0.4148623221053189, "grad_norm": 35.75, "kl": 0.02710723876953125, "learning_rate": 5e-07, "logits/chosen": -6208587.5, "logits/rejected": -46918816.0, "logps/chosen": -281.9343566894531, "logps/rejected": -545.5863037109375, "loss": 0.1868, "rewards/chosen": 1.0897691249847412, "rewards/margins": 4.444031000137329, "rewards/rejected": -3.354261875152588, "step": 7827 }, { "epoch": 0.41491532610712106, "grad_norm": 45.75, "kl": 1.0102005004882812, "learning_rate": 5e-07, "logits/chosen": -69870472.0, "logits/rejected": -26528310.0, "logps/chosen": -213.13644409179688, "logps/rejected": -336.0213623046875, "loss": 0.3311, "rewards/chosen": 0.07106761634349823, "rewards/margins": 2.1523108929395676, "rewards/rejected": -2.0812432765960693, "step": 7828 }, { "epoch": 0.4149683301089232, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53218032.0, "logits/rejected": -14781330.0, "logps/chosen": -499.8182067871094, "logps/rejected": -288.1219177246094, "loss": 0.2968, "rewards/chosen": 0.22453120350837708, "rewards/margins": 2.2749631702899933, "rewards/rejected": -2.050431966781616, "step": 7829 }, { "epoch": 0.41502133411072534, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76881344.0, "logits/rejected": -20639620.57142857, "logps/chosen": -132.9011688232422, "logps/rejected": -157.79312569754464, "loss": 0.2597, "rewards/chosen": -0.189616397023201, "rewards/margins": 1.3348290004900523, "rewards/rejected": -1.5244453975132533, "step": 7830 }, { "epoch": 0.4150743381125275, "grad_norm": 73.5, "kl": 1.8905868530273438, "learning_rate": 5e-07, "logits/chosen": -67296627.2, "logits/rejected": -79319573.33333333, "logps/chosen": -648.27626953125, "logps/rejected": -344.4413248697917, "loss": 0.2425, "rewards/chosen": 1.1349084854125977, "rewards/margins": 3.416076151529948, "rewards/rejected": -2.28116766611735, "step": 7831 }, { "epoch": 0.4151273421143296, "grad_norm": 56.25, "kl": 1.50634765625, "learning_rate": 5e-07, "logits/chosen": -33155954.285714287, "logits/rejected": -4986118.0, "logps/chosen": -247.33780343191964, "logps/rejected": -109.17002868652344, "loss": 0.4795, "rewards/chosen": 0.1667512825557164, "rewards/margins": 0.8666292003222874, "rewards/rejected": -0.699877917766571, "step": 7832 }, { "epoch": 0.41518034611613175, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45225596.0, "logits/rejected": -25422235.42857143, "logps/chosen": -313.0274658203125, "logps/rejected": -236.10006277901786, "loss": 0.2024, "rewards/chosen": 0.173858642578125, "rewards/margins": 2.034554209027972, "rewards/rejected": -1.8606955664498466, "step": 7833 }, { "epoch": 0.4152333501179339, "grad_norm": 55.25, "kl": 3.0578155517578125, "learning_rate": 5e-07, "logits/chosen": -8141461.333333333, "logits/rejected": -38308168.0, "logps/chosen": -145.03641764322916, "logps/rejected": -596.648193359375, "loss": 0.4055, "rewards/chosen": 0.3605462312698364, "rewards/margins": 2.964936375617981, "rewards/rejected": -2.6043901443481445, "step": 7834 }, { "epoch": 0.415286354119736, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51919168.0, "logits/rejected": -15928153.333333334, "logps/chosen": -177.42019653320312, "logps/rejected": -218.25787353515625, "loss": 0.2535, "rewards/chosen": 0.23364488780498505, "rewards/margins": 2.3042457153399787, "rewards/rejected": -2.0706008275349936, "step": 7835 }, { "epoch": 0.41533935812153816, "grad_norm": 39.0, "kl": 0.6124935150146484, "learning_rate": 5e-07, "logits/chosen": -27138768.0, "logits/rejected": -29847698.666666668, "logps/chosen": -381.7777099609375, "logps/rejected": -441.8870849609375, "loss": 0.1644, "rewards/chosen": 0.5672714710235596, "rewards/margins": 3.068862199783325, "rewards/rejected": -2.5015907287597656, "step": 7836 }, { "epoch": 0.4153923621233403, "grad_norm": 51.0, "kl": 1.6199111938476562, "learning_rate": 5e-07, "logits/chosen": -7460085.0, "logits/rejected": -31237260.0, "logps/chosen": -298.03277587890625, "logps/rejected": -345.0509033203125, "loss": 0.2238, "rewards/chosen": 0.8947539925575256, "rewards/margins": 3.27227121591568, "rewards/rejected": -2.3775172233581543, "step": 7837 }, { "epoch": 0.41544536612514243, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78313984.0, "logits/rejected": -24499356.8, "logps/chosen": -369.2746988932292, "logps/rejected": -365.3064208984375, "loss": 0.2522, "rewards/chosen": 0.39245303471883136, "rewards/margins": 2.5482017834981283, "rewards/rejected": -2.155748748779297, "step": 7838 }, { "epoch": 0.41549837012694457, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34192024.0, "logits/rejected": -40602628.0, "logps/chosen": -394.30364990234375, "logps/rejected": -419.31170654296875, "loss": 0.2816, "rewards/chosen": 0.3563808500766754, "rewards/margins": 2.168537527322769, "rewards/rejected": -1.8121566772460938, "step": 7839 }, { "epoch": 0.4155513741287467, "grad_norm": 43.25, "kl": 0.4644918441772461, "learning_rate": 5e-07, "logits/chosen": 12960885.0, "logits/rejected": -58664088.0, "logps/chosen": -213.15472412109375, "logps/rejected": -435.6670227050781, "loss": 0.2546, "rewards/chosen": 0.30768483877182007, "rewards/margins": 3.109709680080414, "rewards/rejected": -2.8020248413085938, "step": 7840 }, { "epoch": 0.41560437813054885, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8099432.0, "logits/rejected": -37924537.6, "logps/chosen": -288.54530843098956, "logps/rejected": -496.123779296875, "loss": 0.2821, "rewards/chosen": 0.21237029631932577, "rewards/margins": 1.9741546591122945, "rewards/rejected": -1.7617843627929688, "step": 7841 }, { "epoch": 0.415657382132351, "grad_norm": 62.25, "kl": 1.1293411254882812, "learning_rate": 5e-07, "logits/chosen": -342493.6, "logits/rejected": -17492722.666666668, "logps/chosen": -298.211572265625, "logps/rejected": -255.1772664388021, "loss": 0.3781, "rewards/chosen": 0.11815477609634399, "rewards/margins": 2.4346551219622294, "rewards/rejected": -2.3165003458658853, "step": 7842 }, { "epoch": 0.4157103861341531, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19899112.0, "logits/rejected": -69147440.0, "logps/chosen": -240.66978454589844, "logps/rejected": -535.1871337890625, "loss": 0.2908, "rewards/chosen": 0.09488677978515625, "rewards/margins": 2.5127878189086914, "rewards/rejected": -2.417901039123535, "step": 7843 }, { "epoch": 0.41576339013595526, "grad_norm": 74.0, "kl": 3.046280860900879, "learning_rate": 5e-07, "logits/chosen": -57490924.0, "logps/chosen": -483.7882995605469, "loss": 0.478, "rewards/chosen": 0.40754538774490356, "step": 7844 }, { "epoch": 0.4158163941377574, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44829752.0, "logits/rejected": -21441066.0, "logps/chosen": -395.76776123046875, "logps/rejected": -320.3888854980469, "loss": 0.1862, "rewards/chosen": 1.7071248292922974, "rewards/margins": 3.386981248855591, "rewards/rejected": -1.6798564195632935, "step": 7845 }, { "epoch": 0.41586939813955953, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45995856.0, "logits/rejected": -26373625.14285714, "logps/chosen": -429.2498474121094, "logps/rejected": -228.42616489955358, "loss": 0.2725, "rewards/chosen": 0.23999939858913422, "rewards/margins": 1.5830864587000437, "rewards/rejected": -1.3430870601109095, "step": 7846 }, { "epoch": 0.41592240214136167, "grad_norm": 58.0, "kl": 0.2781639099121094, "learning_rate": 5e-07, "logits/chosen": -26843786.666666668, "logits/rejected": -14990428.8, "logps/chosen": -286.29103597005206, "logps/rejected": -286.7828369140625, "loss": 0.28, "rewards/chosen": 0.6159711281458536, "rewards/margins": 2.226614324251811, "rewards/rejected": -1.610643196105957, "step": 7847 }, { "epoch": 0.4159754061431638, "grad_norm": 97.0, "kl": 7.362922668457031, "learning_rate": 5e-07, "logits/chosen": -21895265.333333332, "logits/rejected": -22036148.8, "logps/chosen": -786.3514811197916, "logps/rejected": -339.701953125, "loss": 0.1944, "rewards/chosen": 2.721644083658854, "rewards/margins": 4.922690455118815, "rewards/rejected": -2.201046371459961, "step": 7848 }, { "epoch": 0.41602841014496594, "grad_norm": 40.25, "kl": 0.028045654296875, "learning_rate": 5e-07, "logits/chosen": -10582042.0, "logits/rejected": -8264411.0, "logps/chosen": -343.6122741699219, "logps/rejected": -241.06231689453125, "loss": 0.2697, "rewards/chosen": 0.4846051335334778, "rewards/margins": 2.432760179042816, "rewards/rejected": -1.9481550455093384, "step": 7849 }, { "epoch": 0.4160814141467681, "grad_norm": 37.75, "kl": 0.00826263427734375, "learning_rate": 5e-07, "logits/chosen": -25540714.666666668, "logits/rejected": -26348035.2, "logps/chosen": -286.4166259765625, "logps/rejected": -188.2008056640625, "loss": 0.2525, "rewards/chosen": 0.47488756974538165, "rewards/margins": 2.78038874467214, "rewards/rejected": -2.305501174926758, "step": 7850 }, { "epoch": 0.4161344181485702, "grad_norm": 50.25, "kl": 1.3009910583496094, "learning_rate": 5e-07, "logits/chosen": -9809956.8, "logits/rejected": -38339538.666666664, "logps/chosen": -152.76566162109376, "logps/rejected": -198.83683268229166, "loss": 0.3375, "rewards/chosen": 0.6650343894958496, "rewards/margins": 2.0524612426757813, "rewards/rejected": -1.3874268531799316, "step": 7851 }, { "epoch": 0.41618742215037235, "grad_norm": 41.25, "kl": 0.34047508239746094, "learning_rate": 5e-07, "logits/chosen": -14796556.8, "logits/rejected": -48055861.333333336, "logps/chosen": -242.63173828125, "logps/rejected": -798.0445963541666, "loss": 0.2496, "rewards/chosen": 0.5278166770935059, "rewards/margins": 4.787584209442139, "rewards/rejected": -4.259767532348633, "step": 7852 }, { "epoch": 0.4162404261521745, "grad_norm": 47.25, "kl": 0.6368064880371094, "learning_rate": 5e-07, "logits/chosen": -12120348.0, "logits/rejected": -36958665.6, "logps/chosen": -218.3250732421875, "logps/rejected": -316.6692626953125, "loss": 0.2402, "rewards/chosen": 0.48707250754038495, "rewards/margins": 2.797824994723002, "rewards/rejected": -2.3107524871826173, "step": 7853 }, { "epoch": 0.41629343015397663, "grad_norm": 43.5, "kl": 0.9848709106445312, "learning_rate": 5e-07, "logits/chosen": -53807520.0, "logits/rejected": 19283677.714285713, "logps/chosen": -399.7983093261719, "logps/rejected": -309.02821568080356, "loss": 0.1799, "rewards/chosen": 0.5928985476493835, "rewards/margins": 2.6922493406704495, "rewards/rejected": -2.099350793021066, "step": 7854 }, { "epoch": 0.41634643415577877, "grad_norm": 57.75, "kl": 0.3983898162841797, "learning_rate": 5e-07, "logits/chosen": -26862988.8, "logits/rejected": -42338040.0, "logps/chosen": -243.2694580078125, "logps/rejected": -362.2505696614583, "loss": 0.3513, "rewards/chosen": 0.33658084869384763, "rewards/margins": 1.8395114898681642, "rewards/rejected": -1.5029306411743164, "step": 7855 }, { "epoch": 0.4163994381575809, "grad_norm": 46.0, "kl": 0.5636730194091797, "learning_rate": 5e-07, "logits/chosen": 608203.75, "logits/rejected": -19821014.0, "logps/chosen": -138.0172882080078, "logps/rejected": -237.70899963378906, "loss": 0.3416, "rewards/chosen": 0.540228009223938, "rewards/margins": 1.6586908102035522, "rewards/rejected": -1.1184628009796143, "step": 7856 }, { "epoch": 0.41645244215938304, "grad_norm": 53.0, "kl": 1.80950927734375, "learning_rate": 5e-07, "logits/chosen": -25917476.57142857, "logits/rejected": -86150512.0, "logps/chosen": -249.50034877232142, "logps/rejected": -581.000244140625, "loss": 0.389, "rewards/chosen": 0.5343301296234131, "rewards/margins": 3.0179667472839355, "rewards/rejected": -2.4836366176605225, "step": 7857 }, { "epoch": 0.4165054461611852, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43306600.0, "logits/rejected": 3943484.75, "logps/chosen": -222.38252766927084, "logps/rejected": -314.9274597167969, "loss": 0.4154, "rewards/chosen": -0.026096567511558533, "rewards/margins": 1.9533159881830215, "rewards/rejected": -1.97941255569458, "step": 7858 }, { "epoch": 0.4165584501629873, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58955448.0, "logits/rejected": 5653196.0, "logps/chosen": -447.2256774902344, "logps/rejected": -280.53756277901783, "loss": 0.2998, "rewards/chosen": 0.17685852944850922, "rewards/margins": 1.7693570362670081, "rewards/rejected": -1.592498506818499, "step": 7859 }, { "epoch": 0.41661145416478945, "grad_norm": 62.5, "kl": 1.580186367034912, "learning_rate": 5e-07, "logits/chosen": -10010244.0, "logits/rejected": 11088134.0, "logps/chosen": -633.7073364257812, "logps/rejected": -506.61041259765625, "loss": 0.2813, "rewards/chosen": 0.5666582584381104, "rewards/margins": 2.819082021713257, "rewards/rejected": -2.2524237632751465, "step": 7860 }, { "epoch": 0.4166644581665916, "grad_norm": 48.5, "kl": 0.49140167236328125, "learning_rate": 5e-07, "logits/chosen": -53581144.0, "logits/rejected": -15383287.0, "logps/chosen": -655.8739013671875, "logps/rejected": -327.308837890625, "loss": 0.2013, "rewards/chosen": 1.1314910650253296, "rewards/margins": 4.231071591377258, "rewards/rejected": -3.0995805263519287, "step": 7861 }, { "epoch": 0.4167174621683937, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2651921.0, "logits/rejected": 85988304.0, "logps/chosen": -188.10340881347656, "logps/rejected": -853.427978515625, "loss": 0.2132, "rewards/chosen": 0.8720945119857788, "rewards/margins": 3.88640558719635, "rewards/rejected": -3.0143110752105713, "step": 7862 }, { "epoch": 0.41677046617019586, "grad_norm": 41.75, "kl": 0.192230224609375, "learning_rate": 5e-07, "logits/chosen": -18066462.0, "logits/rejected": -4032964.0, "logps/chosen": -123.79484558105469, "logps/rejected": -377.69268798828125, "loss": 0.3166, "rewards/chosen": 0.057250939309597015, "rewards/margins": 2.5101203545928, "rewards/rejected": -2.452869415283203, "step": 7863 }, { "epoch": 0.416823470171998, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52102224.0, "logits/rejected": 6543531.333333333, "logps/chosen": -576.2362060546875, "logps/rejected": -467.3597819010417, "loss": 0.2217, "rewards/chosen": 0.41286012530326843, "rewards/margins": 2.417096028725306, "rewards/rejected": -2.0042359034220376, "step": 7864 }, { "epoch": 0.41687647417380014, "grad_norm": 58.25, "kl": 0.7386054992675781, "learning_rate": 5e-07, "logits/chosen": -39649496.0, "logits/rejected": -31022918.0, "logps/chosen": -282.70196533203125, "logps/rejected": -150.42724609375, "loss": 0.3311, "rewards/chosen": 0.5091392397880554, "rewards/margins": 1.5609849095344543, "rewards/rejected": -1.051845669746399, "step": 7865 }, { "epoch": 0.4169294781756023, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26806694.4, "logits/rejected": -29115312.0, "logps/chosen": -338.98408203125, "logps/rejected": -430.31494140625, "loss": 0.2079, "rewards/chosen": 0.9824430465698242, "rewards/margins": 3.8430298487345373, "rewards/rejected": -2.8605868021647134, "step": 7866 }, { "epoch": 0.4169824821774044, "grad_norm": 30.0, "kl": 0.06345653533935547, "learning_rate": 5e-07, "logits/chosen": -15528334.4, "logits/rejected": -31780704.0, "logps/chosen": -118.667236328125, "logps/rejected": -502.8175455729167, "loss": 0.2839, "rewards/chosen": 0.2834902763366699, "rewards/margins": 4.2412030855814615, "rewards/rejected": -3.9577128092447915, "step": 7867 }, { "epoch": 0.41703548617920655, "grad_norm": 59.5, "kl": 2.1657943725585938, "learning_rate": 5e-07, "logits/chosen": 28063882.666666668, "logits/rejected": 46754732.8, "logps/chosen": -718.5406087239584, "logps/rejected": -499.88583984375, "loss": 0.1891, "rewards/chosen": 1.6483869552612305, "rewards/margins": 3.615246391296387, "rewards/rejected": -1.9668594360351563, "step": 7868 }, { "epoch": 0.4170884901810087, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46187763.2, "logits/rejected": -19163450.666666668, "logps/chosen": -251.5916259765625, "logps/rejected": -211.8946736653646, "loss": 0.3597, "rewards/chosen": 0.18280563354492188, "rewards/margins": 1.625477409362793, "rewards/rejected": -1.442671775817871, "step": 7869 }, { "epoch": 0.4171414941828108, "grad_norm": 51.0, "kl": 1.3295021057128906, "learning_rate": 5e-07, "logits/chosen": -14667814.0, "logits/rejected": -7065418.0, "logps/chosen": -263.90203857421875, "logps/rejected": -200.82818603515625, "loss": 0.2459, "rewards/chosen": 0.3888297975063324, "rewards/margins": 3.238130897283554, "rewards/rejected": -2.8493010997772217, "step": 7870 }, { "epoch": 0.41719449818461296, "grad_norm": 44.0, "kl": 1.0827770233154297, "learning_rate": 5e-07, "logits/chosen": -54466892.0, "logits/rejected": -52025064.0, "logps/chosen": -321.71832275390625, "logps/rejected": -316.3900146484375, "loss": 0.2724, "rewards/chosen": 0.6722812652587891, "rewards/margins": 3.121025800704956, "rewards/rejected": -2.448744535446167, "step": 7871 }, { "epoch": 0.4172475021864151, "grad_norm": 46.75, "kl": 0.6011466979980469, "learning_rate": 5e-07, "logits/chosen": -24180645.333333332, "logits/rejected": -12669000.8, "logps/chosen": -165.07689412434897, "logps/rejected": -359.6144775390625, "loss": 0.2842, "rewards/chosen": 0.47327224413553876, "rewards/margins": 2.0637653509775795, "rewards/rejected": -1.590493106842041, "step": 7872 }, { "epoch": 0.41730050618821724, "grad_norm": 47.25, "kl": 0.3718128204345703, "learning_rate": 5e-07, "logits/chosen": -35842744.0, "logits/rejected": -25319898.666666668, "logps/chosen": -225.90626525878906, "logps/rejected": -369.94775390625, "loss": 0.2357, "rewards/chosen": 0.3914993405342102, "rewards/margins": 2.2094958424568176, "rewards/rejected": -1.8179965019226074, "step": 7873 }, { "epoch": 0.41735351019001937, "grad_norm": 60.0, "kl": 2.5962905883789062, "learning_rate": 5e-07, "logits/chosen": -23576210.666666668, "logits/rejected": -37412009.6, "logps/chosen": -372.2159830729167, "logps/rejected": -412.803515625, "loss": 0.2536, "rewards/chosen": 0.5909330050150553, "rewards/margins": 2.9361772219340003, "rewards/rejected": -2.345244216918945, "step": 7874 }, { "epoch": 0.4174065141918215, "grad_norm": 50.75, "kl": 0.6237869262695312, "learning_rate": 5e-07, "logits/chosen": -38676240.0, "logits/rejected": -50932344.0, "logps/chosen": -435.1671142578125, "logps/rejected": -450.054443359375, "loss": 0.3153, "rewards/chosen": 0.09305763244628906, "rewards/margins": 2.396742582321167, "rewards/rejected": -2.303684949874878, "step": 7875 }, { "epoch": 0.4174595181936236, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27178165.333333332, "logits/rejected": -2599682.4, "logps/chosen": -214.2432657877604, "logps/rejected": -196.5864013671875, "loss": 0.2723, "rewards/chosen": 0.046118736267089844, "rewards/margins": 2.136882019042969, "rewards/rejected": -2.090763282775879, "step": 7876 }, { "epoch": 0.41751252219542573, "grad_norm": 58.5, "kl": 2.746744155883789, "learning_rate": 5e-07, "logits/chosen": -41507213.333333336, "logits/rejected": -10228096.0, "logps/chosen": -406.76611328125, "logps/rejected": -390.48956298828125, "loss": 0.3897, "rewards/chosen": 0.7700831095377604, "rewards/margins": 1.4349299470583596, "rewards/rejected": -0.6648468375205994, "step": 7877 }, { "epoch": 0.41756552619722787, "grad_norm": 52.25, "kl": 2.230905532836914, "learning_rate": 5e-07, "logits/chosen": -23467052.0, "logits/rejected": -45221000.0, "logps/chosen": -523.0582885742188, "logps/rejected": -213.7091827392578, "loss": 0.227, "rewards/chosen": 1.4493913650512695, "rewards/margins": 3.4853415489196777, "rewards/rejected": -2.035950183868408, "step": 7878 }, { "epoch": 0.41761853019903, "grad_norm": 40.75, "kl": 1.5745849609375, "learning_rate": 5e-07, "logits/chosen": 3420740.0, "logits/rejected": -28443971.2, "logps/chosen": -615.3069254557291, "logps/rejected": -154.49656982421874, "loss": 0.156, "rewards/chosen": 1.5519084930419922, "rewards/margins": 3.6260509490966797, "rewards/rejected": -2.0741424560546875, "step": 7879 }, { "epoch": 0.41767153420083214, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14251812.0, "logits/rejected": -19507760.0, "logps/chosen": -210.89462280273438, "logps/rejected": -486.6375325520833, "loss": 0.1713, "rewards/chosen": 0.05863647907972336, "rewards/margins": 3.1452294712265334, "rewards/rejected": -3.08659299214681, "step": 7880 }, { "epoch": 0.4177245382026343, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19535594.666666668, "logits/rejected": -6398632.0, "logps/chosen": -402.1440836588542, "logps/rejected": -448.29833984375, "loss": 0.2114, "rewards/chosen": 0.9238287607828776, "rewards/margins": 3.46459108988444, "rewards/rejected": -2.5407623291015624, "step": 7881 }, { "epoch": 0.4177775422044364, "grad_norm": 50.25, "kl": 1.4936513900756836, "learning_rate": 5e-07, "logits/chosen": -34470208.0, "logits/rejected": -27541456.0, "logps/chosen": -270.3889973958333, "logps/rejected": -469.0796813964844, "loss": 0.3868, "rewards/chosen": 0.23003816604614258, "rewards/margins": 2.992042064666748, "rewards/rejected": -2.7620038986206055, "step": 7882 }, { "epoch": 0.41783054620623855, "grad_norm": 48.75, "kl": 0.6766786575317383, "learning_rate": 5e-07, "logits/chosen": -12438106.285714285, "logits/rejected": -32415016.0, "logps/chosen": -118.70272391183036, "logps/rejected": -118.01811981201172, "loss": 0.4511, "rewards/chosen": 0.2037546464375087, "rewards/margins": 0.9228327104023525, "rewards/rejected": -0.7190780639648438, "step": 7883 }, { "epoch": 0.4178835502080407, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9084181.0, "logits/rejected": -27207930.666666668, "logps/chosen": -73.62889099121094, "logps/rejected": -242.59930419921875, "loss": 0.217, "rewards/chosen": 0.30405616760253906, "rewards/margins": 2.152129014333089, "rewards/rejected": -1.84807284673055, "step": 7884 }, { "epoch": 0.4179365542098428, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57698064.0, "logits/rejected": -51284160.0, "logps/chosen": -197.69979858398438, "logps/rejected": -320.39840262276783, "loss": 0.197, "rewards/chosen": -0.461477667093277, "rewards/margins": 1.538083234003612, "rewards/rejected": -1.9995609010968889, "step": 7885 }, { "epoch": 0.41798955821164496, "grad_norm": 42.5, "kl": 0.7216672897338867, "learning_rate": 5e-07, "logits/chosen": -15992608.0, "logits/rejected": -25078900.0, "logps/chosen": -288.5783284505208, "logps/rejected": -165.03622436523438, "loss": 0.2261, "rewards/chosen": 1.1264747778574626, "rewards/margins": 4.204348961512248, "rewards/rejected": -3.077874183654785, "step": 7886 }, { "epoch": 0.4180425622134471, "grad_norm": 42.25, "kl": 0.20703697204589844, "learning_rate": 5e-07, "logits/chosen": -49351913.6, "logits/rejected": -11912789.333333334, "logps/chosen": -165.044287109375, "logps/rejected": -93.00785319010417, "loss": 0.346, "rewards/chosen": 0.38283610343933105, "rewards/margins": 1.7106332778930664, "rewards/rejected": -1.3277971744537354, "step": 7887 }, { "epoch": 0.41809556621524924, "grad_norm": 46.25, "kl": 1.2802324295043945, "learning_rate": 5e-07, "logits/chosen": -17044852.57142857, "logits/rejected": -111434680.0, "logps/chosen": -217.48320661272322, "logps/rejected": -467.9042663574219, "loss": 0.4499, "rewards/chosen": 0.16171704019818986, "rewards/margins": 2.1707533427647183, "rewards/rejected": -2.0090363025665283, "step": 7888 }, { "epoch": 0.4181485702170514, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56549317.333333336, "logits/rejected": -19217932.8, "logps/chosen": -181.24871826171875, "logps/rejected": -210.534814453125, "loss": 0.3618, "rewards/chosen": -0.6730908552805582, "rewards/margins": 0.8941819985707601, "rewards/rejected": -1.5672728538513183, "step": 7889 }, { "epoch": 0.4182015742188535, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101347040.0, "logits/rejected": -7972588.0, "logps/chosen": -235.36756896972656, "logps/rejected": -172.41847229003906, "loss": 0.3521, "rewards/chosen": 0.09973898530006409, "rewards/margins": 1.3203109800815582, "rewards/rejected": -1.2205719947814941, "step": 7890 }, { "epoch": 0.41825457822065565, "grad_norm": 45.75, "kl": 1.1661491394042969, "learning_rate": 5e-07, "logits/chosen": 45813.7109375, "logits/rejected": -8620698.666666666, "logps/chosen": -324.80670166015625, "logps/rejected": -384.5421956380208, "loss": 0.2318, "rewards/chosen": 0.9161819815635681, "rewards/margins": 2.8004394570986433, "rewards/rejected": -1.884257475535075, "step": 7891 }, { "epoch": 0.4183075822224578, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27911344.0, "logits/rejected": -25474941.333333332, "logps/chosen": -220.7664794921875, "logps/rejected": -268.5680338541667, "loss": 0.1456, "rewards/chosen": 0.6633880734443665, "rewards/margins": 3.5684974789619446, "rewards/rejected": -2.905109405517578, "step": 7892 }, { "epoch": 0.4183605862242599, "grad_norm": 57.75, "kl": 0.9754924774169922, "learning_rate": 5e-07, "logits/chosen": -3157204.0, "logits/rejected": -2149972.0, "logps/chosen": -246.4478302001953, "logps/rejected": -354.49090576171875, "loss": 0.2827, "rewards/chosen": 0.48867103457450867, "rewards/margins": 2.620918244123459, "rewards/rejected": -2.13224720954895, "step": 7893 }, { "epoch": 0.41841359022606206, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47234632.0, "logits/rejected": -35538612.0, "logps/chosen": -510.0320739746094, "logps/rejected": -358.1982421875, "loss": 0.2875, "rewards/chosen": 0.5931390523910522, "rewards/margins": 2.4720518589019775, "rewards/rejected": -1.8789128065109253, "step": 7894 }, { "epoch": 0.4184665942278642, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1268492.5, "logits/rejected": -26120704.0, "logps/chosen": -77.30342102050781, "logps/rejected": -343.8176967075893, "loss": 0.2246, "rewards/chosen": -0.038823701441287994, "rewards/margins": 1.7697049262268203, "rewards/rejected": -1.8085286276681083, "step": 7895 }, { "epoch": 0.41851959822966633, "grad_norm": 52.0, "kl": 2.6961536407470703, "learning_rate": 5e-07, "logits/chosen": -10062993.333333334, "logits/rejected": -41164308.0, "logps/chosen": -222.56510416666666, "logps/rejected": -293.0390319824219, "loss": 0.3618, "rewards/chosen": 0.49071462949117023, "rewards/margins": 3.7251041730244956, "rewards/rejected": -3.234389543533325, "step": 7896 }, { "epoch": 0.41857260223146847, "grad_norm": 40.25, "kl": 1.111612319946289, "learning_rate": 5e-07, "logits/chosen": -8399954.0, "logits/rejected": -28085152.0, "logps/chosen": -226.8873291015625, "logps/rejected": -297.396484375, "loss": 0.2824, "rewards/chosen": 0.5486308932304382, "rewards/margins": 2.6509012579917908, "rewards/rejected": -2.1022703647613525, "step": 7897 }, { "epoch": 0.4186256062332706, "grad_norm": 40.5, "kl": 1.2251300811767578, "learning_rate": 5e-07, "logits/chosen": -10085262.0, "logits/rejected": -69106064.0, "logps/chosen": -173.67250061035156, "logps/rejected": -160.66970825195312, "loss": 0.3662, "rewards/chosen": -0.014776065945625305, "rewards/margins": 1.609494611620903, "rewards/rejected": -1.6242706775665283, "step": 7898 }, { "epoch": 0.41867861023507275, "grad_norm": 52.0, "kl": 0.5961074829101562, "learning_rate": 5e-07, "logits/chosen": -89882144.0, "logits/rejected": -30398899.2, "logps/chosen": -432.10986328125, "logps/rejected": -193.18778076171876, "loss": 0.2628, "rewards/chosen": 0.6616434653600057, "rewards/margins": 2.2833645423253377, "rewards/rejected": -1.621721076965332, "step": 7899 }, { "epoch": 0.4187316142368749, "grad_norm": 54.25, "kl": 0.711665153503418, "learning_rate": 5e-07, "logits/chosen": 4255400.0, "logits/rejected": -6438102.0, "logps/chosen": -164.37186104910714, "logps/rejected": -117.5553207397461, "loss": 0.4316, "rewards/chosen": 0.2426433733531407, "rewards/margins": 1.6657856873103551, "rewards/rejected": -1.4231423139572144, "step": 7900 }, { "epoch": 0.418784618238677, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -16283832.0, "logps/rejected": -255.2117156982422, "loss": 0.1807, "rewards/rejected": -2.111032485961914, "step": 7901 }, { "epoch": 0.41883762224047916, "grad_norm": 62.0, "kl": 1.2607250213623047, "learning_rate": 5e-07, "logits/chosen": -23130526.4, "logits/rejected": -17435578.666666668, "logps/chosen": -184.05318603515624, "logps/rejected": -240.9041951497396, "loss": 0.4163, "rewards/chosen": 0.4175297737121582, "rewards/margins": 1.0344423373540244, "rewards/rejected": -0.6169125636418661, "step": 7902 }, { "epoch": 0.4188906262422813, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49899574.4, "logits/rejected": -32063802.666666668, "logps/chosen": -226.7507080078125, "logps/rejected": -469.1986083984375, "loss": 0.358, "rewards/chosen": 0.11975722312927246, "rewards/margins": 1.9436537901560467, "rewards/rejected": -1.8238965670267742, "step": 7903 }, { "epoch": 0.41894363024408343, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 41083224.0, "logits/rejected": -3533814.75, "logps/chosen": -202.01377868652344, "logps/rejected": -137.46624755859375, "loss": 0.3247, "rewards/chosen": 0.01883140206336975, "rewards/margins": 1.7926426231861115, "rewards/rejected": -1.7738112211227417, "step": 7904 }, { "epoch": 0.41899663424588557, "grad_norm": 43.0, "kl": 1.1911048889160156, "learning_rate": 5e-07, "logits/chosen": -20050868.0, "logits/rejected": -11068441.142857144, "logps/chosen": -754.4891357421875, "logps/rejected": -183.45256696428572, "loss": 0.2288, "rewards/chosen": 3.214642286300659, "rewards/margins": 4.304745980671473, "rewards/rejected": -1.0901036943708147, "step": 7905 }, { "epoch": 0.4190496382476877, "grad_norm": 85.5, "kl": 4.100566864013672, "learning_rate": 5e-07, "logits/chosen": 1119910.75, "logits/rejected": -58861008.0, "logps/chosen": -409.9453125, "logps/rejected": -354.12567138671875, "loss": 0.2916, "rewards/chosen": 1.0277442932128906, "rewards/margins": 2.9982948303222656, "rewards/rejected": -1.970550537109375, "step": 7906 }, { "epoch": 0.41910264224948984, "grad_norm": 64.5, "kl": 1.0597801208496094, "learning_rate": 5e-07, "logits/chosen": -55642227.2, "logits/rejected": -37082968.0, "logps/chosen": -324.813671875, "logps/rejected": -272.30714925130206, "loss": 0.3703, "rewards/chosen": 0.2761198043823242, "rewards/margins": 1.6459319750467936, "rewards/rejected": -1.3698121706644695, "step": 7907 }, { "epoch": 0.419155646251292, "grad_norm": 47.0, "kl": 0.7173871994018555, "learning_rate": 5e-07, "logits/chosen": -18408276.8, "logits/rejected": -33463280.0, "logps/chosen": -154.809130859375, "logps/rejected": -311.04461669921875, "loss": 0.2537, "rewards/chosen": 0.8868548393249511, "rewards/margins": 2.927187951405843, "rewards/rejected": -2.040333112080892, "step": 7908 }, { "epoch": 0.4192086502530941, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49376906.666666664, "logits/rejected": -19884139.2, "logps/chosen": -338.16335042317706, "logps/rejected": -278.1046875, "loss": 0.242, "rewards/chosen": 0.3195785681406657, "rewards/margins": 2.3475517431894937, "rewards/rejected": -2.027973175048828, "step": 7909 }, { "epoch": 0.41926165425489625, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11197806.666666666, "logits/rejected": -30463443.2, "logps/chosen": -200.28946940104166, "logps/rejected": -230.643701171875, "loss": 0.2693, "rewards/chosen": 0.5410435994466146, "rewards/margins": 2.2308581670125327, "rewards/rejected": -1.689814567565918, "step": 7910 }, { "epoch": 0.4193146582566984, "grad_norm": 51.75, "kl": 1.2557077407836914, "learning_rate": 5e-07, "logits/chosen": -1760342.0, "logits/rejected": -17984848.0, "logps/chosen": -168.856640625, "logps/rejected": -266.5261637369792, "loss": 0.4388, "rewards/chosen": 0.14243290424346924, "rewards/margins": 0.9666836023330688, "rewards/rejected": -0.8242506980895996, "step": 7911 }, { "epoch": 0.41936766225850053, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39240320.0, "logits/rejected": -15366224.0, "logps/chosen": -158.17764282226562, "logps/rejected": -345.16461181640625, "loss": 0.3315, "rewards/chosen": -0.23179340362548828, "rewards/margins": 1.945993185043335, "rewards/rejected": -2.1777865886688232, "step": 7912 }, { "epoch": 0.41942066626030267, "grad_norm": 47.75, "kl": 0.7286720275878906, "learning_rate": 5e-07, "logits/chosen": -48837364.0, "logits/rejected": -41886125.333333336, "logps/chosen": -397.4883728027344, "logps/rejected": -265.4581298828125, "loss": 0.2439, "rewards/chosen": -0.0969059020280838, "rewards/margins": 1.809730812907219, "rewards/rejected": -1.9066367149353027, "step": 7913 }, { "epoch": 0.4194736702621048, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78562584.0, "logits/rejected": -8347902.0, "logps/chosen": -252.92726135253906, "logps/rejected": -124.15422058105469, "loss": 0.3021, "rewards/chosen": 0.2504833936691284, "rewards/margins": 2.3041661977767944, "rewards/rejected": -2.053682804107666, "step": 7914 }, { "epoch": 0.41952667426390694, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1133497.3333333333, "logits/rejected": -16943772.8, "logps/chosen": -245.72977701822916, "logps/rejected": -255.9880859375, "loss": 0.3269, "rewards/chosen": -0.1113152801990509, "rewards/margins": 1.6347252547740936, "rewards/rejected": -1.7460405349731445, "step": 7915 }, { "epoch": 0.4195796782657091, "grad_norm": 59.75, "kl": 0.8114843368530273, "learning_rate": 5e-07, "logits/chosen": -22200278.666666668, "logits/rejected": -8252618.5, "logps/chosen": -301.6680908203125, "logps/rejected": -219.74288940429688, "loss": 0.3567, "rewards/chosen": 0.39077051480611164, "rewards/margins": 2.2735403378804526, "rewards/rejected": -1.8827698230743408, "step": 7916 }, { "epoch": 0.4196326822675112, "grad_norm": 47.0, "kl": 0.21114730834960938, "learning_rate": 5e-07, "logits/chosen": -65407701.333333336, "logits/rejected": -17256320.0, "logps/chosen": -248.6739501953125, "logps/rejected": -260.4047607421875, "loss": 0.2379, "rewards/chosen": 0.6843831539154053, "rewards/margins": 2.7661807537078857, "rewards/rejected": -2.0817975997924805, "step": 7917 }, { "epoch": 0.41968568626931335, "grad_norm": 44.5, "kl": 0.6239719390869141, "learning_rate": 5e-07, "logits/chosen": -3720786.5, "logits/rejected": 21713438.0, "logps/chosen": -249.514892578125, "logps/rejected": -482.07171630859375, "loss": 0.2662, "rewards/chosen": 0.3811805844306946, "rewards/margins": 2.9151228070259094, "rewards/rejected": -2.533942222595215, "step": 7918 }, { "epoch": 0.4197386902711155, "grad_norm": 35.75, "kl": 0.10672283172607422, "learning_rate": 5e-07, "logits/chosen": -11442481.333333334, "logits/rejected": -10233552.0, "logps/chosen": -82.86098225911458, "logps/rejected": -109.58487548828126, "loss": 0.3071, "rewards/chosen": -0.021052340666453045, "rewards/margins": 1.6459177215894063, "rewards/rejected": -1.6669700622558594, "step": 7919 }, { "epoch": 0.4197916942729176, "grad_norm": 62.5, "kl": 1.6954994201660156, "learning_rate": 5e-07, "logits/chosen": -49328400.0, "logits/rejected": -32367533.333333332, "logps/chosen": -455.2056640625, "logps/rejected": -422.1394449869792, "loss": 0.3188, "rewards/chosen": 0.9255160331726074, "rewards/margins": 2.5402807235717773, "rewards/rejected": -1.61476469039917, "step": 7920 }, { "epoch": 0.41984469827471976, "grad_norm": 65.0, "kl": 3.7515010833740234, "learning_rate": 5e-07, "logits/chosen": -19549828.57142857, "logits/rejected": -185603040.0, "logps/chosen": -353.1110142299107, "logps/rejected": -72.47988891601562, "loss": 0.4284, "rewards/chosen": 0.5733768599373954, "rewards/margins": 0.9266376750809806, "rewards/rejected": -0.3532608151435852, "step": 7921 }, { "epoch": 0.4198977022765219, "grad_norm": 61.75, "kl": 4.012594223022461, "learning_rate": 5e-07, "logits/chosen": 11639986.0, "logits/rejected": -97996472.0, "logps/chosen": -510.62969970703125, "logps/rejected": -341.06494140625, "loss": 0.3453, "rewards/chosen": 0.9476497173309326, "rewards/margins": 2.387572765350342, "rewards/rejected": -1.4399230480194092, "step": 7922 }, { "epoch": 0.41995070627832404, "grad_norm": 58.5, "kl": 2.7446956634521484, "learning_rate": 5e-07, "logits/chosen": -53215001.6, "logits/rejected": -6614876.0, "logps/chosen": -523.64296875, "logps/rejected": -249.75801595052084, "loss": 0.2928, "rewards/chosen": 0.9027021408081055, "rewards/margins": 2.4694544156392415, "rewards/rejected": -1.566752274831136, "step": 7923 }, { "epoch": 0.4200037102801262, "grad_norm": 129.0, "kl": 1.5299625396728516, "learning_rate": 5e-07, "logits/chosen": -12686281.333333334, "logits/rejected": -64416844.0, "logps/chosen": -418.9784749348958, "logps/rejected": -372.7522277832031, "loss": 0.382, "rewards/chosen": 0.4228708744049072, "rewards/margins": 2.723003625869751, "rewards/rejected": -2.3001327514648438, "step": 7924 }, { "epoch": 0.4200567142819283, "grad_norm": 53.5, "kl": 2.94854736328125, "learning_rate": 5e-07, "logits/chosen": -41002128.0, "logits/rejected": 9897403.0, "logps/chosen": -436.9998372395833, "logps/rejected": -406.52264404296875, "loss": 0.3052, "rewards/chosen": 1.0310400327046711, "rewards/margins": 3.1152258714040117, "rewards/rejected": -2.084185838699341, "step": 7925 }, { "epoch": 0.42010971828373045, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10054938.666666666, "logits/rejected": -44486742.4, "logps/chosen": -181.92228190104166, "logps/rejected": -302.2685546875, "loss": 0.2562, "rewards/chosen": 0.11730117599169414, "rewards/margins": 2.9511476496855416, "rewards/rejected": -2.8338464736938476, "step": 7926 }, { "epoch": 0.42016272228553253, "grad_norm": 28.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33381800.0, "logits/rejected": -31903216.0, "logps/chosen": -128.169189453125, "logps/rejected": -330.5256754557292, "loss": 0.1832, "rewards/chosen": -0.18516883254051208, "rewards/margins": 2.62165966629982, "rewards/rejected": -2.806828498840332, "step": 7927 }, { "epoch": 0.42021572628733467, "grad_norm": 53.25, "kl": 1.2513656616210938, "learning_rate": 5e-07, "logits/chosen": -23975331.2, "logits/rejected": -38833245.333333336, "logps/chosen": -455.59873046875, "logps/rejected": -256.173828125, "loss": 0.3922, "rewards/chosen": 0.24015746116638184, "rewards/margins": 2.0050583044687906, "rewards/rejected": -1.764900843302409, "step": 7928 }, { "epoch": 0.4202687302891368, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 548212.0625, "logits/rejected": -39671042.666666664, "logps/chosen": -53.448936462402344, "logps/rejected": -394.2333170572917, "loss": 0.3104, "rewards/chosen": -0.5451116561889648, "rewards/margins": 1.161015510559082, "rewards/rejected": -1.7061271667480469, "step": 7929 }, { "epoch": 0.42032173429093894, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6225607.0, "logits/rejected": -49471396.571428575, "logps/chosen": -9.913946151733398, "logps/rejected": -323.77085658482144, "loss": 0.2425, "rewards/chosen": 0.03604259714484215, "rewards/margins": 1.7295950230743204, "rewards/rejected": -1.6935524259294783, "step": 7930 }, { "epoch": 0.4203747382927411, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46310780.0, "logits/rejected": -19437400.0, "logps/chosen": -358.9793395996094, "logps/rejected": -278.04327392578125, "loss": 0.3668, "rewards/chosen": -0.32155680656433105, "rewards/margins": 1.6136224269866943, "rewards/rejected": -1.9351792335510254, "step": 7931 }, { "epoch": 0.4204277422945432, "grad_norm": 44.25, "kl": 0.0666961669921875, "learning_rate": 5e-07, "logits/chosen": -34442788.0, "logits/rejected": -27433260.0, "logps/chosen": -189.24508666992188, "logps/rejected": -219.34825134277344, "loss": 0.286, "rewards/chosen": 0.26319465041160583, "rewards/margins": 2.216104358434677, "rewards/rejected": -1.9529097080230713, "step": 7932 }, { "epoch": 0.42048074629634535, "grad_norm": 59.5, "kl": 0.2736663818359375, "learning_rate": 5e-07, "logits/chosen": -27708290.0, "logits/rejected": -47469384.0, "logps/chosen": -490.377685546875, "logps/rejected": -390.4218444824219, "loss": 0.3079, "rewards/chosen": 0.2762245237827301, "rewards/margins": 2.5418156683444977, "rewards/rejected": -2.2655911445617676, "step": 7933 }, { "epoch": 0.4205337502981475, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28865434.666666668, "logits/rejected": -3858844.5, "logps/chosen": -216.9753621419271, "logps/rejected": -175.0155792236328, "loss": 0.3827, "rewards/chosen": 0.20863229036331177, "rewards/margins": 1.731913149356842, "rewards/rejected": -1.5232808589935303, "step": 7934 }, { "epoch": 0.42058675429994963, "grad_norm": 61.75, "kl": 0.5395793914794922, "learning_rate": 5e-07, "logits/chosen": 2647803.8, "logits/rejected": 4568268.666666667, "logps/chosen": -123.94022216796876, "logps/rejected": -391.3460286458333, "loss": 0.3718, "rewards/chosen": 0.33862981796264646, "rewards/margins": 1.9534479141235352, "rewards/rejected": -1.6148180961608887, "step": 7935 }, { "epoch": 0.42063975830175176, "grad_norm": 42.25, "kl": 0.4948883056640625, "learning_rate": 5e-07, "logits/chosen": -60713272.0, "logits/rejected": -14241780.0, "logps/chosen": -446.98828125, "logps/rejected": -326.3978271484375, "loss": 0.2347, "rewards/chosen": 0.8846045732498169, "rewards/margins": 3.2681223154067993, "rewards/rejected": -2.3835177421569824, "step": 7936 }, { "epoch": 0.4206927623035539, "grad_norm": 63.0, "kl": 0.35619354248046875, "learning_rate": 5e-07, "logits/chosen": -42292864.0, "logits/rejected": -34290992.0, "logps/chosen": -300.26287841796875, "logps/rejected": -149.28216552734375, "loss": 0.3573, "rewards/chosen": 0.372583270072937, "rewards/margins": 2.3523852825164795, "rewards/rejected": -1.9798020124435425, "step": 7937 }, { "epoch": 0.42074576630535604, "grad_norm": 40.75, "kl": 1.1864032745361328, "learning_rate": 5e-07, "logits/chosen": -35130796.8, "logits/rejected": -39255536.0, "logps/chosen": -173.7370361328125, "logps/rejected": -289.40578206380206, "loss": 0.3711, "rewards/chosen": 0.25842695236206054, "rewards/margins": 2.430064487457275, "rewards/rejected": -2.171637535095215, "step": 7938 }, { "epoch": 0.4207987703071582, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70849770.66666667, "logits/rejected": -60230681.6, "logps/chosen": -280.0996500651042, "logps/rejected": -477.524072265625, "loss": 0.2565, "rewards/chosen": 0.06188393632570902, "rewards/margins": 2.5374588112036385, "rewards/rejected": -2.4755748748779296, "step": 7939 }, { "epoch": 0.4208517743089603, "grad_norm": 57.25, "kl": 1.7555503845214844, "learning_rate": 5e-07, "logits/chosen": 2849384.0, "logits/rejected": -24068366.4, "logps/chosen": -923.0283203125, "logps/rejected": -483.338671875, "loss": 0.1691, "rewards/chosen": 1.3270792961120605, "rewards/margins": 3.782548427581787, "rewards/rejected": -2.4554691314697266, "step": 7940 }, { "epoch": 0.42090477831076245, "grad_norm": 56.75, "kl": 0.10812568664550781, "learning_rate": 5e-07, "logits/chosen": -55676180.0, "logits/rejected": -24534048.0, "logps/chosen": -200.90618896484375, "logps/rejected": -423.7386881510417, "loss": 0.2024, "rewards/chosen": 0.30086880922317505, "rewards/margins": 2.6021119554837546, "rewards/rejected": -2.3012431462605796, "step": 7941 }, { "epoch": 0.4209577823125646, "grad_norm": 50.75, "kl": 1.381662368774414, "learning_rate": 5e-07, "logits/chosen": -18905288.0, "logits/rejected": -14146214.0, "logps/chosen": -251.47796630859375, "logps/rejected": -439.4584655761719, "loss": 0.3511, "rewards/chosen": 0.41645344098409015, "rewards/margins": 3.381913741429647, "rewards/rejected": -2.9654603004455566, "step": 7942 }, { "epoch": 0.4210107863143667, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15275000.0, "logits/rejected": -55461340.0, "logps/chosen": -164.41981506347656, "logps/rejected": -253.804443359375, "loss": 0.4165, "rewards/chosen": 0.13470374047756195, "rewards/margins": 0.8203289955854416, "rewards/rejected": -0.6856252551078796, "step": 7943 }, { "epoch": 0.42106379031616886, "grad_norm": 48.75, "kl": 0.6147117614746094, "learning_rate": 5e-07, "logits/chosen": -1739230.25, "logits/rejected": -45992324.0, "logps/chosen": -106.00350189208984, "logps/rejected": -369.6760559082031, "loss": 0.2789, "rewards/chosen": 0.5338913202285767, "rewards/margins": 3.188128352165222, "rewards/rejected": -2.6542370319366455, "step": 7944 }, { "epoch": 0.421116794317971, "grad_norm": 46.25, "kl": 0.4877433776855469, "learning_rate": 5e-07, "logits/chosen": -65225796.0, "logits/rejected": -30137006.0, "logps/chosen": -271.00665283203125, "logps/rejected": -309.2577209472656, "loss": 0.3254, "rewards/chosen": -0.012102842330932617, "rewards/margins": 2.3991971015930176, "rewards/rejected": -2.41129994392395, "step": 7945 }, { "epoch": 0.42116979831977314, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3110101.25, "logits/rejected": -35444608.0, "logps/chosen": -81.95106506347656, "logps/rejected": -386.3243931361607, "loss": 0.1832, "rewards/chosen": -0.8046585321426392, "rewards/margins": 1.752630591392517, "rewards/rejected": -2.5572891235351562, "step": 7946 }, { "epoch": 0.4212228023215753, "grad_norm": 39.0, "kl": 0.6003265380859375, "learning_rate": 5e-07, "logits/chosen": -23097840.0, "logits/rejected": -29039538.0, "logps/chosen": -191.1092529296875, "logps/rejected": -291.5583190917969, "loss": 0.2113, "rewards/chosen": 1.0890337228775024, "rewards/margins": 3.0791478157043457, "rewards/rejected": -1.9901140928268433, "step": 7947 }, { "epoch": 0.4212758063233774, "grad_norm": 44.25, "kl": 0.2891197204589844, "learning_rate": 5e-07, "logits/chosen": -26365280.0, "logits/rejected": -31045656.0, "logps/chosen": -349.84283447265625, "logps/rejected": -229.7078094482422, "loss": 0.2309, "rewards/chosen": 1.0429515838623047, "rewards/margins": 3.179335117340088, "rewards/rejected": -2.136383533477783, "step": 7948 }, { "epoch": 0.42132881032517955, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38934432.0, "logits/rejected": -15405028.0, "logps/chosen": -121.33517456054688, "logps/rejected": -340.5159606933594, "loss": 0.3643, "rewards/chosen": -0.42253419756889343, "rewards/margins": 1.5169732868671417, "rewards/rejected": -1.9395074844360352, "step": 7949 }, { "epoch": 0.4213818143269817, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41135456.0, "logits/rejected": -2383534.0, "logps/chosen": -229.26791381835938, "logps/rejected": -264.9035339355469, "loss": 0.3678, "rewards/chosen": -0.13405533134937286, "rewards/margins": 1.6051066368818283, "rewards/rejected": -1.7391619682312012, "step": 7950 }, { "epoch": 0.4214348183287838, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6187119.333333333, "logits/rejected": -29402534.4, "logps/chosen": -385.1767578125, "logps/rejected": -128.29158935546874, "loss": 0.3748, "rewards/chosen": -0.3213691711425781, "rewards/margins": 0.8194211959838866, "rewards/rejected": -1.1407903671264648, "step": 7951 }, { "epoch": 0.42148782233058596, "grad_norm": 52.25, "kl": 1.6675491333007812, "learning_rate": 5e-07, "logits/chosen": -57102378.666666664, "logits/rejected": 863208.6, "logps/chosen": -938.4132486979166, "logps/rejected": -103.28428955078125, "loss": 0.1728, "rewards/chosen": 1.8336710929870605, "rewards/margins": 3.9208632469177247, "rewards/rejected": -2.087192153930664, "step": 7952 }, { "epoch": 0.4215408263323881, "grad_norm": 45.5, "kl": 0.059035301208496094, "learning_rate": 5e-07, "logits/chosen": -26215932.8, "logits/rejected": -30871720.0, "logps/chosen": -200.77174072265626, "logps/rejected": -468.40576171875, "loss": 0.31, "rewards/chosen": 0.161848783493042, "rewards/margins": 3.3107383251190186, "rewards/rejected": -3.1488895416259766, "step": 7953 }, { "epoch": 0.42159383033419023, "grad_norm": 54.5, "kl": 0.00518798828125, "learning_rate": 5e-07, "logits/chosen": -32929156.0, "logits/rejected": -24310092.0, "logps/chosen": -1023.414306640625, "logps/rejected": -176.95028686523438, "loss": 0.2429, "rewards/chosen": 0.9880607724189758, "rewards/margins": 3.124961197376251, "rewards/rejected": -2.1369004249572754, "step": 7954 }, { "epoch": 0.42164683433599237, "grad_norm": 58.25, "kl": 3.603935718536377, "learning_rate": 5e-07, "logits/chosen": -8202237.333333333, "logits/rejected": -15742959.0, "logps/chosen": -601.6000162760416, "logps/rejected": -96.70573425292969, "loss": 0.2986, "rewards/chosen": 1.287028392155965, "rewards/margins": 2.8485487302144366, "rewards/rejected": -1.5615203380584717, "step": 7955 }, { "epoch": 0.4216998383377945, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31411832.0, "logits/rejected": -20760558.0, "logps/chosen": -232.70913696289062, "logps/rejected": -726.3660888671875, "loss": 0.2152, "rewards/chosen": 0.4560942053794861, "rewards/margins": 3.690002143383026, "rewards/rejected": -3.23390793800354, "step": 7956 }, { "epoch": 0.42175284233959665, "grad_norm": 93.0, "kl": 1.1881637573242188, "learning_rate": 5e-07, "logits/chosen": -74922208.0, "logits/rejected": 34974704.0, "logps/chosen": -535.87373046875, "logps/rejected": -143.07040405273438, "loss": 0.2945, "rewards/chosen": 1.1114404678344727, "rewards/margins": 2.482201067606608, "rewards/rejected": -1.3707605997721355, "step": 7957 }, { "epoch": 0.4218058463413988, "grad_norm": 68.5, "kl": 2.3604087829589844, "learning_rate": 5e-07, "logits/chosen": -37781092.571428575, "logits/rejected": -13920951.0, "logps/chosen": -234.74169921875, "logps/rejected": -49.49500274658203, "loss": 0.5391, "rewards/chosen": 0.020825965063912526, "rewards/margins": 0.4200573095253536, "rewards/rejected": -0.39923134446144104, "step": 7958 }, { "epoch": 0.4218588503432009, "grad_norm": 68.0, "kl": 3.110382080078125, "learning_rate": 5e-07, "logits/chosen": -415137.1666666667, "logits/rejected": -121461096.0, "logps/chosen": -555.6966552734375, "logps/rejected": -279.1357116699219, "loss": 0.3928, "rewards/chosen": 0.7858033180236816, "rewards/margins": 2.3054994344711304, "rewards/rejected": -1.5196961164474487, "step": 7959 }, { "epoch": 0.42191185434500306, "grad_norm": 55.0, "kl": 0.31857872009277344, "learning_rate": 5e-07, "logits/chosen": -19974084.0, "logits/rejected": 1793785.375, "logps/chosen": -258.0561218261719, "logps/rejected": -113.45441436767578, "loss": 0.2801, "rewards/chosen": 0.384159117937088, "rewards/margins": 2.219316154718399, "rewards/rejected": -1.835157036781311, "step": 7960 }, { "epoch": 0.4219648583468052, "grad_norm": 72.0, "kl": 3.7843551635742188, "learning_rate": 5e-07, "logits/chosen": -31148573.333333332, "logits/rejected": -3038451.5, "logps/chosen": -430.1734212239583, "logps/rejected": -210.3418731689453, "loss": 0.2737, "rewards/chosen": 1.4527637163798015, "rewards/margins": 2.925486008326213, "rewards/rejected": -1.4727222919464111, "step": 7961 }, { "epoch": 0.42201786234860733, "grad_norm": 63.25, "kl": 0.4095745086669922, "learning_rate": 5e-07, "logits/chosen": -41725392.0, "logits/rejected": -34770486.4, "logps/chosen": -409.881103515625, "logps/rejected": -309.31044921875, "loss": 0.2117, "rewards/chosen": 0.13404948512713113, "rewards/margins": 3.1318758706251777, "rewards/rejected": -2.9978263854980467, "step": 7962 }, { "epoch": 0.42207086635040947, "grad_norm": 61.0, "kl": 1.2980728149414062, "learning_rate": 5e-07, "logits/chosen": -52054986.666666664, "logits/rejected": -19645220.0, "logps/chosen": -367.174560546875, "logps/rejected": -153.2388916015625, "loss": 0.3459, "rewards/chosen": 0.5009056727091471, "rewards/margins": 2.7303737799326577, "rewards/rejected": -2.2294681072235107, "step": 7963 }, { "epoch": 0.4221238703522116, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10428225.0, "logits/rejected": -27236496.0, "logps/chosen": -362.2078552246094, "logps/rejected": -204.67020670572916, "loss": 0.1947, "rewards/chosen": 1.2687824964523315, "rewards/margins": 3.0674454768498736, "rewards/rejected": -1.7986629803975422, "step": 7964 }, { "epoch": 0.42217687435401374, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27747642.666666668, "logits/rejected": 10242846.4, "logps/chosen": -236.77237955729166, "logps/rejected": -88.5780517578125, "loss": 0.2976, "rewards/chosen": 0.49318599700927734, "rewards/margins": 1.9180460929870606, "rewards/rejected": -1.4248600959777833, "step": 7965 }, { "epoch": 0.4222298783558159, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26062960.0, "logits/rejected": -47873240.0, "logps/chosen": -148.84380086263022, "logps/rejected": -545.1416625976562, "loss": 0.3778, "rewards/chosen": 0.13811598221460977, "rewards/margins": 2.344483474890391, "rewards/rejected": -2.2063674926757812, "step": 7966 }, { "epoch": 0.422282882357618, "grad_norm": 61.0, "kl": 0.7295055389404297, "learning_rate": 5e-07, "logits/chosen": -45431072.0, "logits/rejected": -16178836.0, "logps/chosen": -675.3805541992188, "logps/rejected": -168.45985412597656, "loss": 0.2391, "rewards/chosen": 0.6111305356025696, "rewards/margins": 3.0824788212776184, "rewards/rejected": -2.471348285675049, "step": 7967 }, { "epoch": 0.42233588635942015, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54119162.666666664, "logits/rejected": -42330752.0, "logps/chosen": -65.3761698404948, "logps/rejected": -221.437841796875, "loss": 0.2804, "rewards/chosen": -0.04212722678979238, "rewards/margins": 1.7964538584152858, "rewards/rejected": -1.8385810852050781, "step": 7968 }, { "epoch": 0.4223888903612223, "grad_norm": 50.25, "kl": 0.7323875427246094, "learning_rate": 5e-07, "logits/chosen": -19555992.0, "logits/rejected": -13029466.666666666, "logps/chosen": -258.3838134765625, "logps/rejected": -175.105224609375, "loss": 0.3613, "rewards/chosen": 0.6223467826843262, "rewards/margins": 1.3367427031199137, "rewards/rejected": -0.7143959204355875, "step": 7969 }, { "epoch": 0.42244189436302443, "grad_norm": 45.0, "kl": 0.3343820571899414, "learning_rate": 5e-07, "logits/chosen": -40699744.0, "logits/rejected": -13024634.666666666, "logps/chosen": -266.974609375, "logps/rejected": -393.2920328776042, "loss": 0.2926, "rewards/chosen": 0.2865968465805054, "rewards/margins": 3.9325608332951867, "rewards/rejected": -3.645963986714681, "step": 7970 }, { "epoch": 0.42249489836482657, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14038820.0, "logits/rejected": -31180990.0, "logps/chosen": -346.56365966796875, "logps/rejected": -399.4214172363281, "loss": 0.2428, "rewards/chosen": 1.0952942371368408, "rewards/margins": 3.067017674446106, "rewards/rejected": -1.9717234373092651, "step": 7971 }, { "epoch": 0.4225479023666287, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53087564.0, "logits/rejected": -14303941.333333334, "logps/chosen": -475.4678039550781, "logps/rejected": -164.2743123372396, "loss": 0.198, "rewards/chosen": 0.7327712774276733, "rewards/margins": 2.717582106590271, "rewards/rejected": -1.9848108291625977, "step": 7972 }, { "epoch": 0.42260090636843084, "grad_norm": 48.25, "kl": 0.30150604248046875, "learning_rate": 5e-07, "logits/chosen": -22562434.0, "logits/rejected": -18801006.0, "logps/chosen": -229.0538787841797, "logps/rejected": -272.8462829589844, "loss": 0.2385, "rewards/chosen": 0.8365239500999451, "rewards/margins": 2.777059018611908, "rewards/rejected": -1.940535068511963, "step": 7973 }, { "epoch": 0.422653910370233, "grad_norm": 49.25, "kl": 0.3676948547363281, "learning_rate": 5e-07, "logits/chosen": -16608700.0, "logits/rejected": -25888768.0, "logps/chosen": -161.7316436767578, "logps/rejected": -278.5318908691406, "loss": 0.2818, "rewards/chosen": 0.5011535882949829, "rewards/margins": 2.3230897188186646, "rewards/rejected": -1.8219361305236816, "step": 7974 }, { "epoch": 0.4227069143720351, "grad_norm": 51.0, "kl": 0.41493988037109375, "learning_rate": 5e-07, "logits/chosen": -16328941.333333334, "logits/rejected": -10842798.0, "logps/chosen": -191.5552775065104, "logps/rejected": -167.22201538085938, "loss": 0.3377, "rewards/chosen": 0.48713664213816327, "rewards/margins": 2.350089112917582, "rewards/rejected": -1.862952470779419, "step": 7975 }, { "epoch": 0.42275991837383725, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17903270.666666668, "logits/rejected": -14227544.0, "logps/chosen": -170.40530395507812, "logps/rejected": -478.904638671875, "loss": 0.2196, "rewards/chosen": 0.9419291814168295, "rewards/margins": 3.140625317891439, "rewards/rejected": -2.1986961364746094, "step": 7976 }, { "epoch": 0.4228129223756394, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15435083.0, "logits/rejected": -40734498.666666664, "logps/chosen": -185.58453369140625, "logps/rejected": -315.199462890625, "loss": 0.2524, "rewards/chosen": 0.2738075256347656, "rewards/margins": 2.1215470631917315, "rewards/rejected": -1.847739537556966, "step": 7977 }, { "epoch": 0.42286592637744147, "grad_norm": 66.0, "kl": 0.5304384231567383, "learning_rate": 5e-07, "logits/chosen": -15796925.333333334, "logits/rejected": -2056766.0, "logps/chosen": -336.10284423828125, "logps/rejected": -603.2127075195312, "loss": 0.3923, "rewards/chosen": 0.17707167069117227, "rewards/margins": 2.3227442304293313, "rewards/rejected": -2.145672559738159, "step": 7978 }, { "epoch": 0.4229189303792436, "grad_norm": 54.0, "kl": 0.30936431884765625, "learning_rate": 5e-07, "logits/chosen": -34586310.85714286, "logits/rejected": -35757904.0, "logps/chosen": -247.60056849888392, "logps/rejected": -404.36590576171875, "loss": 0.3977, "rewards/chosen": 0.3630311148507254, "rewards/margins": 1.6549135105950492, "rewards/rejected": -1.2918823957443237, "step": 7979 }, { "epoch": 0.42297193438104574, "grad_norm": 65.0, "kl": 1.5505561828613281, "learning_rate": 5e-07, "logits/chosen": -8977596.0, "logits/rejected": -17456732.0, "logps/chosen": -346.21240234375, "logps/rejected": -165.35948181152344, "loss": 0.4097, "rewards/chosen": 0.29525093237559, "rewards/margins": 1.747345248858134, "rewards/rejected": -1.452094316482544, "step": 7980 }, { "epoch": 0.4230249383828479, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42707964.0, "logits/rejected": -29788336.0, "logps/chosen": -462.03973388671875, "logps/rejected": -439.5242106119792, "loss": 0.2404, "rewards/chosen": 0.39898681640625, "rewards/margins": 2.5489301681518555, "rewards/rejected": -2.1499433517456055, "step": 7981 }, { "epoch": 0.42307794238465, "grad_norm": 54.25, "kl": 1.4900131225585938, "learning_rate": 5e-07, "logits/chosen": -34772601.6, "logits/rejected": -12993660.0, "logps/chosen": -305.6298583984375, "logps/rejected": -327.9420166015625, "loss": 0.3808, "rewards/chosen": 0.27189781665802004, "rewards/margins": 2.111021701494853, "rewards/rejected": -1.8391238848368328, "step": 7982 }, { "epoch": 0.42313094638645216, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42211296.0, "logits/rejected": 58523884.8, "logps/chosen": -460.7214762369792, "logps/rejected": -409.8869140625, "loss": 0.187, "rewards/chosen": 0.962287425994873, "rewards/margins": 3.008703517913818, "rewards/rejected": -2.046416091918945, "step": 7983 }, { "epoch": 0.4231839503882543, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10834416.0, "logits/rejected": -76344024.0, "logps/chosen": -529.0001220703125, "logps/rejected": -536.7402954101562, "loss": 0.2011, "rewards/chosen": 0.6329104900360107, "rewards/margins": 4.36877179145813, "rewards/rejected": -3.735861301422119, "step": 7984 }, { "epoch": 0.42323695439005643, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17703642.666666668, "logits/rejected": -21039889.6, "logps/chosen": -91.46547444661458, "logps/rejected": -545.939599609375, "loss": 0.2054, "rewards/chosen": 0.5208686987559, "rewards/margins": 2.969875160853068, "rewards/rejected": -2.449006462097168, "step": 7985 }, { "epoch": 0.42328995839185857, "grad_norm": 59.25, "kl": 0.3589744567871094, "learning_rate": 5e-07, "logits/chosen": -25003323.2, "logits/rejected": -7542447.333333333, "logps/chosen": -329.505517578125, "logps/rejected": -446.5069986979167, "loss": 0.3192, "rewards/chosen": 0.2249433994293213, "rewards/margins": 3.1948235034942627, "rewards/rejected": -2.9698801040649414, "step": 7986 }, { "epoch": 0.4233429623936607, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11657664.0, "logits/rejected": -20290464.0, "logps/chosen": -248.6012166341146, "logps/rejected": -188.29046630859375, "loss": 0.3381, "rewards/chosen": 0.2765553990999858, "rewards/margins": 1.367235807577769, "rewards/rejected": -1.0906804084777832, "step": 7987 }, { "epoch": 0.42339596639546284, "grad_norm": 46.25, "kl": 0.008922576904296875, "learning_rate": 5e-07, "logits/chosen": -28671216.0, "logits/rejected": -48590598.4, "logps/chosen": -486.1829427083333, "logps/rejected": -310.816748046875, "loss": 0.2174, "rewards/chosen": 1.250586986541748, "rewards/margins": 3.0061638832092283, "rewards/rejected": -1.7555768966674805, "step": 7988 }, { "epoch": 0.423448970397265, "grad_norm": 60.75, "kl": 2.5806732177734375, "learning_rate": 5e-07, "logits/chosen": 13520907.2, "logits/rejected": -76693173.33333333, "logps/chosen": -404.6028076171875, "logps/rejected": -250.91255696614584, "loss": 0.3332, "rewards/chosen": 0.7356545448303222, "rewards/margins": 2.3931271553039553, "rewards/rejected": -1.6574726104736328, "step": 7989 }, { "epoch": 0.4235019743990671, "grad_norm": 46.0, "kl": 0.5355415344238281, "learning_rate": 5e-07, "logits/chosen": -84679704.0, "logits/rejected": -39132005.333333336, "logps/chosen": -400.25927734375, "logps/rejected": -244.59940592447916, "loss": 0.2632, "rewards/chosen": -0.04432907700538635, "rewards/margins": 2.0424560805161796, "rewards/rejected": -2.086785157521566, "step": 7990 }, { "epoch": 0.42355497840086925, "grad_norm": 37.75, "kl": 0.11740589141845703, "learning_rate": 5e-07, "logits/chosen": -2164323.0, "logits/rejected": -35947932.0, "logps/chosen": -128.52674865722656, "logps/rejected": -614.6986694335938, "loss": 0.2293, "rewards/chosen": 0.6014993190765381, "rewards/margins": 3.2503855228424072, "rewards/rejected": -2.648886203765869, "step": 7991 }, { "epoch": 0.4236079824026714, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66479008.0, "logits/rejected": -22305554.666666668, "logps/chosen": -439.8109130859375, "logps/rejected": -292.53155517578125, "loss": 0.1995, "rewards/chosen": 0.3734726011753082, "rewards/margins": 2.3832201063632965, "rewards/rejected": -2.0097475051879883, "step": 7992 }, { "epoch": 0.4236609864044735, "grad_norm": 56.5, "kl": 1.5390262603759766, "learning_rate": 5e-07, "logits/chosen": -7961160.666666667, "logits/rejected": -5114989.5, "logps/chosen": -279.9261474609375, "logps/rejected": -40.43199920654297, "loss": 0.3829, "rewards/chosen": 0.6255683898925781, "rewards/margins": 1.3657452464103699, "rewards/rejected": -0.7401768565177917, "step": 7993 }, { "epoch": 0.42371399040627566, "grad_norm": 42.75, "kl": 1.3616113662719727, "learning_rate": 5e-07, "logits/chosen": -21870752.0, "logits/rejected": -42102592.0, "logps/chosen": -256.079833984375, "logps/rejected": -338.009521484375, "loss": 0.2699, "rewards/chosen": 0.5591153502464294, "rewards/margins": 3.2927815318107605, "rewards/rejected": -2.733666181564331, "step": 7994 }, { "epoch": 0.4237669944080778, "grad_norm": 48.25, "kl": 1.0247268676757812, "learning_rate": 5e-07, "logits/chosen": -11983936.0, "logits/rejected": -17321523.2, "logps/chosen": -389.3870442708333, "logps/rejected": -184.9521240234375, "loss": 0.2783, "rewards/chosen": 0.6447579860687256, "rewards/margins": 2.185650110244751, "rewards/rejected": -1.5408921241760254, "step": 7995 }, { "epoch": 0.42381999840987994, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32090406.4, "logits/rejected": 103984277.33333333, "logps/chosen": -361.1155517578125, "logps/rejected": -315.72027587890625, "loss": 0.3903, "rewards/chosen": -0.1164125919342041, "rewards/margins": 1.7122475783030193, "rewards/rejected": -1.8286601702372234, "step": 7996 }, { "epoch": 0.4238730024116821, "grad_norm": 44.0, "kl": 1.2572784423828125, "learning_rate": 5e-07, "logits/chosen": -28462981.333333332, "logits/rejected": 46709251.2, "logps/chosen": -243.2938028971354, "logps/rejected": -326.0942626953125, "loss": 0.2949, "rewards/chosen": 0.21354015668233237, "rewards/margins": 2.1518484910329185, "rewards/rejected": -1.938308334350586, "step": 7997 }, { "epoch": 0.4239260064134842, "grad_norm": 46.5, "kl": 0.8311576843261719, "learning_rate": 5e-07, "logits/chosen": -46415178.666666664, "logits/rejected": -2463446.4, "logps/chosen": -424.6255289713542, "logps/rejected": -166.3521240234375, "loss": 0.2488, "rewards/chosen": 1.2077738444010417, "rewards/margins": 2.6366796175638836, "rewards/rejected": -1.4289057731628418, "step": 7998 }, { "epoch": 0.42397901041528635, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24105488.0, "logits/rejected": 5216773.6, "logps/chosen": -373.269775390625, "logps/rejected": -285.8307373046875, "loss": 0.3041, "rewards/chosen": -0.05763953924179077, "rewards/margins": 1.6313151955604552, "rewards/rejected": -1.688954734802246, "step": 7999 }, { "epoch": 0.4240320144170885, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19414514.0, "logits/rejected": 15368437.0, "logps/chosen": -122.32302856445312, "logps/rejected": -378.3705749511719, "loss": 0.279, "rewards/chosen": 0.4374161660671234, "rewards/margins": 2.112968534231186, "rewards/rejected": -1.6755523681640625, "step": 8000 }, { "epoch": 0.4240850184188906, "grad_norm": 53.25, "kl": 1.2941694259643555, "learning_rate": 5e-07, "logits/chosen": -39257184.0, "logits/rejected": -69339520.0, "logps/chosen": -433.593994140625, "logps/rejected": -265.38543701171875, "loss": 0.3622, "rewards/chosen": 0.5245978832244873, "rewards/margins": 2.241015672683716, "rewards/rejected": -1.7164177894592285, "step": 8001 }, { "epoch": 0.42413802242069276, "grad_norm": 55.75, "kl": 0.34693145751953125, "learning_rate": 5e-07, "logits/chosen": 2299152.0, "logits/rejected": -29189892.0, "logps/chosen": -554.1222534179688, "logps/rejected": -254.14688110351562, "loss": 0.2725, "rewards/chosen": 0.8180591464042664, "rewards/margins": 2.473000466823578, "rewards/rejected": -1.6549413204193115, "step": 8002 }, { "epoch": 0.4241910264224949, "grad_norm": 36.75, "kl": 0.5297508239746094, "learning_rate": 5e-07, "logits/chosen": -13111373.0, "logits/rejected": -57698832.0, "logps/chosen": -186.42930603027344, "logps/rejected": -355.561767578125, "loss": 0.2478, "rewards/chosen": 0.3988252580165863, "rewards/margins": 3.2872694432735443, "rewards/rejected": -2.888444185256958, "step": 8003 }, { "epoch": 0.42424403042429704, "grad_norm": 40.0, "kl": 1.1112823486328125, "learning_rate": 5e-07, "logits/chosen": -20378675.2, "logits/rejected": -33627970.666666664, "logps/chosen": -210.8748046875, "logps/rejected": -312.11134847005206, "loss": 0.2862, "rewards/chosen": 0.886768913269043, "rewards/margins": 2.832996463775635, "rewards/rejected": -1.9462275505065918, "step": 8004 }, { "epoch": 0.4242970344260992, "grad_norm": 50.5, "kl": 0.6919918060302734, "learning_rate": 5e-07, "logits/chosen": -16500944.0, "logits/rejected": -27132333.333333332, "logps/chosen": -545.746142578125, "logps/rejected": -300.2053629557292, "loss": 0.2813, "rewards/chosen": 1.1641885757446289, "rewards/margins": 2.346488666534424, "rewards/rejected": -1.182300090789795, "step": 8005 }, { "epoch": 0.4243500384279013, "grad_norm": 44.25, "kl": 1.0179862976074219, "learning_rate": 5e-07, "logits/chosen": -10634926.666666666, "logits/rejected": -30965907.2, "logps/chosen": -290.1904296875, "logps/rejected": -444.591796875, "loss": 0.2273, "rewards/chosen": 0.7212916215260824, "rewards/margins": 3.1946619828542073, "rewards/rejected": -2.473370361328125, "step": 8006 }, { "epoch": 0.42440304242970345, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29225978.666666668, "logits/rejected": -19565995.2, "logps/chosen": -160.75748697916666, "logps/rejected": -167.211865234375, "loss": 0.226, "rewards/chosen": 0.941888173421224, "rewards/margins": 2.433608945210775, "rewards/rejected": -1.4917207717895509, "step": 8007 }, { "epoch": 0.4244560464315056, "grad_norm": 58.5, "kl": 0.1671428680419922, "learning_rate": 5e-07, "logits/chosen": -16356176.0, "logits/rejected": -61291221.333333336, "logps/chosen": -147.0376953125, "logps/rejected": -328.4260660807292, "loss": 0.3066, "rewards/chosen": 0.3785087585449219, "rewards/margins": 2.6359725316365563, "rewards/rejected": -2.2574637730916343, "step": 8008 }, { "epoch": 0.4245090504333077, "grad_norm": 52.5, "kl": 0.32906341552734375, "learning_rate": 5e-07, "logits/chosen": -62059408.0, "logits/rejected": -8979340.8, "logps/chosen": -709.0079752604166, "logps/rejected": -174.52349853515625, "loss": 0.365, "rewards/chosen": 0.9550694624582926, "rewards/margins": 1.8184029738108318, "rewards/rejected": -0.8633335113525391, "step": 8009 }, { "epoch": 0.42456205443510986, "grad_norm": 56.25, "kl": 0.4755706787109375, "learning_rate": 5e-07, "logits/chosen": -19047312.0, "logits/rejected": -17236738.666666668, "logps/chosen": -321.3935546875, "logps/rejected": -302.89190673828125, "loss": 0.3235, "rewards/chosen": 0.49899892807006835, "rewards/margins": 2.1817962010701497, "rewards/rejected": -1.6827972730000813, "step": 8010 }, { "epoch": 0.424615058436912, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24846736.0, "logits/rejected": -17975406.0, "logps/chosen": -278.49896240234375, "logps/rejected": -146.1811981201172, "loss": 0.3915, "rewards/chosen": 0.05387383699417114, "rewards/margins": 0.951065719127655, "rewards/rejected": -0.8971918821334839, "step": 8011 }, { "epoch": 0.42466806243871413, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88836528.0, "logits/rejected": -24394102.85714286, "logps/chosen": -545.577880859375, "logps/rejected": -315.6415318080357, "loss": 0.1074, "rewards/chosen": 0.5304321646690369, "rewards/margins": 3.2939173919813975, "rewards/rejected": -2.7634852273123607, "step": 8012 }, { "epoch": 0.42472106644051627, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38719368.0, "logits/rejected": -7426572.0, "logps/chosen": -377.7277526855469, "logps/rejected": -266.5557861328125, "loss": 0.1999, "rewards/chosen": 0.7581096887588501, "rewards/margins": 2.739846348762512, "rewards/rejected": -1.981736660003662, "step": 8013 }, { "epoch": 0.4247740704423184, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7473828.8, "logits/rejected": -74894922.66666667, "logps/chosen": -330.446435546875, "logps/rejected": -265.95416259765625, "loss": 0.3488, "rewards/chosen": -0.02912195920944214, "rewards/margins": 2.3826784253120423, "rewards/rejected": -2.4118003845214844, "step": 8014 }, { "epoch": 0.42482707444412054, "grad_norm": 65.5, "kl": 0.000217437744140625, "learning_rate": 5e-07, "logits/chosen": -74914088.0, "logits/rejected": -8667436.0, "logps/chosen": -143.64830017089844, "logps/rejected": -334.32098388671875, "loss": 0.3033, "rewards/chosen": 0.5223550796508789, "rewards/margins": 2.4589107036590576, "rewards/rejected": -1.9365556240081787, "step": 8015 }, { "epoch": 0.4248800784459227, "grad_norm": 40.0, "kl": 0.7347564697265625, "learning_rate": 5e-07, "logits/chosen": -18808304.0, "logits/rejected": -25929507.2, "logps/chosen": -280.087646484375, "logps/rejected": -292.46640625, "loss": 0.2637, "rewards/chosen": 0.5473257700602213, "rewards/margins": 2.6568529764811197, "rewards/rejected": -2.1095272064208985, "step": 8016 }, { "epoch": 0.4249330824477248, "grad_norm": 51.75, "kl": 4.015556335449219, "learning_rate": 5e-07, "logits/chosen": -21580488.0, "logits/rejected": -5658361.0, "logps/chosen": -766.7775268554688, "logps/rejected": -244.17269897460938, "loss": 0.1928, "rewards/chosen": 1.8375641107559204, "rewards/margins": 3.7024208307266235, "rewards/rejected": -1.8648567199707031, "step": 8017 }, { "epoch": 0.42498608644952696, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30658490.666666668, "logits/rejected": -48110144.0, "logps/chosen": -297.4165445963542, "logps/rejected": -541.0426025390625, "loss": 0.3792, "rewards/chosen": 0.24347837766011557, "rewards/margins": 2.188195784886678, "rewards/rejected": -1.9447174072265625, "step": 8018 }, { "epoch": 0.4250390904513291, "grad_norm": 67.0, "kl": 0.05525779724121094, "learning_rate": 5e-07, "logits/chosen": -86092984.0, "logits/rejected": -30010731.42857143, "logps/chosen": -665.1740112304688, "logps/rejected": -189.726318359375, "loss": 0.2652, "rewards/chosen": 1.0306396484375, "rewards/margins": 2.1806648799351285, "rewards/rejected": -1.1500252314976283, "step": 8019 }, { "epoch": 0.42509209445313123, "grad_norm": 41.0, "kl": 0.1260223388671875, "learning_rate": 5e-07, "logits/chosen": -22473238.0, "logits/rejected": -46362208.0, "logps/chosen": -210.73236083984375, "logps/rejected": -121.76261901855469, "loss": 0.249, "rewards/chosen": 0.6964911222457886, "rewards/margins": 2.64069402217865, "rewards/rejected": -1.9442028999328613, "step": 8020 }, { "epoch": 0.42514509845493337, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50294052.0, "logits/rejected": -8713598.0, "logps/chosen": -284.46563720703125, "logps/rejected": -86.08229064941406, "loss": 0.3045, "rewards/chosen": 0.32190442085266113, "rewards/margins": 2.4109866619110107, "rewards/rejected": -2.0890822410583496, "step": 8021 }, { "epoch": 0.4251981024567355, "grad_norm": 39.75, "kl": 0.4079780578613281, "learning_rate": 5e-07, "logits/chosen": -38604908.8, "logits/rejected": -2910551.0, "logps/chosen": -159.24749755859375, "logps/rejected": -274.9366861979167, "loss": 0.3136, "rewards/chosen": 0.5175923347473145, "rewards/margins": 2.2848379453023275, "rewards/rejected": -1.767245610555013, "step": 8022 }, { "epoch": 0.42525110645853764, "grad_norm": 40.25, "kl": 0.4784412384033203, "learning_rate": 5e-07, "logits/chosen": -13756904.0, "logits/rejected": -18672172.0, "logps/chosen": -144.30235290527344, "logps/rejected": -184.52493286132812, "loss": 0.3625, "rewards/chosen": 0.4330388605594635, "rewards/margins": 1.3846064507961273, "rewards/rejected": -0.9515675902366638, "step": 8023 }, { "epoch": 0.4253041104603398, "grad_norm": 60.25, "kl": 1.8791694641113281, "learning_rate": 5e-07, "logits/chosen": -53019584.0, "logits/rejected": -2032655.0, "logps/chosen": -715.4622802734375, "logps/rejected": -328.2036437988281, "loss": 0.2356, "rewards/chosen": 0.8123741149902344, "rewards/margins": 3.2477712631225586, "rewards/rejected": -2.435397148132324, "step": 8024 }, { "epoch": 0.4253571144621419, "grad_norm": 47.5, "kl": 0.43633270263671875, "learning_rate": 5e-07, "logits/chosen": -21209700.0, "logits/rejected": 12973800.0, "logps/chosen": -260.6584879557292, "logps/rejected": -208.5869903564453, "loss": 0.4043, "rewards/chosen": 0.09386418263117473, "rewards/margins": 1.8174384633700054, "rewards/rejected": -1.7235742807388306, "step": 8025 }, { "epoch": 0.42541011846394405, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11429620.0, "logits/rejected": -11759285.333333334, "logps/chosen": -248.551025390625, "logps/rejected": -161.0544637044271, "loss": 0.2384, "rewards/chosen": 1.2940983772277832, "rewards/margins": 2.645252386728923, "rewards/rejected": -1.3511540095011394, "step": 8026 }, { "epoch": 0.4254631224657462, "grad_norm": 59.0, "kl": 0.33666038513183594, "learning_rate": 5e-07, "logits/chosen": -133228400.0, "logits/rejected": -27989122.0, "logps/chosen": -404.63653564453125, "logps/rejected": -269.8360900878906, "loss": 0.3806, "rewards/chosen": -0.031653791666030884, "rewards/margins": 1.3847666680812836, "rewards/rejected": -1.4164204597473145, "step": 8027 }, { "epoch": 0.4255161264675483, "grad_norm": 61.0, "kl": 2.233247756958008, "learning_rate": 5e-07, "logits/chosen": 12921182.0, "logits/rejected": -45381252.0, "logps/chosen": -349.1020812988281, "logps/rejected": -320.5523681640625, "loss": 0.2873, "rewards/chosen": 0.6182236671447754, "rewards/margins": 2.7405457496643066, "rewards/rejected": -2.1223220825195312, "step": 8028 }, { "epoch": 0.4255691304693504, "grad_norm": 62.25, "kl": 2.372115135192871, "learning_rate": 5e-07, "logits/chosen": -30660492.8, "logits/rejected": -32243296.0, "logps/chosen": -519.41279296875, "logps/rejected": -451.7340494791667, "loss": 0.3225, "rewards/chosen": 0.7677563667297364, "rewards/margins": 2.9362056096394857, "rewards/rejected": -2.1684492429097495, "step": 8029 }, { "epoch": 0.42562213447115255, "grad_norm": 55.0, "kl": 0.2909088134765625, "learning_rate": 5e-07, "logits/chosen": -14354889.6, "logits/rejected": -78162805.33333333, "logps/chosen": -475.45693359375, "logps/rejected": -279.0062662760417, "loss": 0.3204, "rewards/chosen": 0.75785231590271, "rewards/margins": 1.969016154607137, "rewards/rejected": -1.211163838704427, "step": 8030 }, { "epoch": 0.4256751384729547, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58125160.0, "logits/rejected": -48608956.0, "logps/chosen": -164.93736267089844, "logps/rejected": -420.32269287109375, "loss": 0.2796, "rewards/chosen": 0.033047303557395935, "rewards/margins": 2.8886552900075912, "rewards/rejected": -2.8556079864501953, "step": 8031 }, { "epoch": 0.4257281424747568, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22308736.0, "logits/rejected": -10318934.666666666, "logps/chosen": -281.05010986328125, "logps/rejected": -192.5501505533854, "loss": 0.2632, "rewards/chosen": 0.5452972650527954, "rewards/margins": 2.0810584624608355, "rewards/rejected": -1.5357611974080403, "step": 8032 }, { "epoch": 0.42578114647655896, "grad_norm": 47.0, "kl": 0.3348236083984375, "learning_rate": 5e-07, "logits/chosen": -25832389.333333332, "logits/rejected": -19626067.2, "logps/chosen": -778.9644368489584, "logps/rejected": -365.874853515625, "loss": 0.1859, "rewards/chosen": 1.1183358828226726, "rewards/margins": 3.9483281771341963, "rewards/rejected": -2.8299922943115234, "step": 8033 }, { "epoch": 0.4258341504783611, "grad_norm": 67.0, "kl": 0.2922630310058594, "learning_rate": 5e-07, "logits/chosen": -56066016.0, "logits/rejected": -67304064.0, "logps/chosen": -443.8507080078125, "logps/rejected": -641.6915283203125, "loss": 0.3481, "rewards/chosen": 0.23634809255599976, "rewards/margins": 4.32752126455307, "rewards/rejected": -4.09117317199707, "step": 8034 }, { "epoch": 0.42588715448016323, "grad_norm": 58.0, "kl": 1.2802748680114746, "learning_rate": 5e-07, "logits/chosen": -18078612.0, "logits/rejected": -20140428.0, "logps/chosen": -446.8312072753906, "logps/rejected": -137.8006591796875, "loss": 0.3955, "rewards/chosen": 0.2583344578742981, "rewards/margins": 1.0123820900917053, "rewards/rejected": -0.7540476322174072, "step": 8035 }, { "epoch": 0.42594015848196537, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23090560.0, "logits/rejected": -31329955.2, "logps/chosen": -262.69578043619794, "logps/rejected": -211.343505859375, "loss": 0.3038, "rewards/chosen": 0.40865739186604816, "rewards/margins": 1.7791537602742513, "rewards/rejected": -1.3704963684082032, "step": 8036 }, { "epoch": 0.4259931624837675, "grad_norm": 43.0, "kl": 0.3579521179199219, "learning_rate": 5e-07, "logits/chosen": 493391.0, "logits/rejected": -36218512.0, "logps/chosen": -290.8492431640625, "logps/rejected": -269.269873046875, "loss": 0.2965, "rewards/chosen": 0.5749954382578532, "rewards/margins": 2.406279961268107, "rewards/rejected": -1.831284523010254, "step": 8037 }, { "epoch": 0.42604616648556964, "grad_norm": 62.75, "kl": 0.2551460266113281, "learning_rate": 5e-07, "logits/chosen": -29363428.57142857, "logits/rejected": -28204668.0, "logps/chosen": -258.5257568359375, "logps/rejected": -439.6426696777344, "loss": 0.3758, "rewards/chosen": 0.3503757544926235, "rewards/margins": 3.7069219180515836, "rewards/rejected": -3.35654616355896, "step": 8038 }, { "epoch": 0.4260991704873718, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47334410.666666664, "logits/rejected": -14780201.6, "logps/chosen": -380.6460774739583, "logps/rejected": -113.48533935546875, "loss": 0.2604, "rewards/chosen": 0.7316975593566895, "rewards/margins": 2.2174051284790037, "rewards/rejected": -1.4857075691223145, "step": 8039 }, { "epoch": 0.4261521744891739, "grad_norm": 65.5, "kl": 2.836226463317871, "learning_rate": 5e-07, "logits/chosen": -32563692.0, "logits/rejected": 8357011.5, "logps/chosen": -377.953857421875, "logps/rejected": -113.57117462158203, "loss": 0.3667, "rewards/chosen": 0.7233936190605164, "rewards/margins": 2.1288382411003113, "rewards/rejected": -1.405444622039795, "step": 8040 }, { "epoch": 0.42620517849097606, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34725664.0, "logits/rejected": -50862924.8, "logps/chosen": -266.92038981119794, "logps/rejected": -347.322998046875, "loss": 0.333, "rewards/chosen": -0.2989097436269124, "rewards/margins": 1.7366335074106851, "rewards/rejected": -2.0355432510375975, "step": 8041 }, { "epoch": 0.4262581824927782, "grad_norm": 54.25, "kl": 2.6337966918945312, "learning_rate": 5e-07, "logits/chosen": -26921942.85714286, "logits/rejected": -38710992.0, "logps/chosen": -239.14582170758928, "logps/rejected": -423.572265625, "loss": 0.4894, "rewards/chosen": 0.060375677687781196, "rewards/margins": 2.3527462354728153, "rewards/rejected": -2.292370557785034, "step": 8042 }, { "epoch": 0.42631118649458033, "grad_norm": 47.0, "kl": 1.2797050476074219, "learning_rate": 5e-07, "logits/chosen": -8942606.4, "logits/rejected": -22202412.0, "logps/chosen": -278.7361083984375, "logps/rejected": -353.4124348958333, "loss": 0.3091, "rewards/chosen": 0.6555991172790527, "rewards/margins": 2.9381699562072754, "rewards/rejected": -2.2825708389282227, "step": 8043 }, { "epoch": 0.42636419049638247, "grad_norm": 55.0, "kl": 0.12218761444091797, "learning_rate": 5e-07, "logits/chosen": -37332856.0, "logits/rejected": 70978064.0, "logps/chosen": -257.012939453125, "logps/rejected": -251.1146240234375, "loss": 0.2537, "rewards/chosen": 0.4868614077568054, "rewards/margins": 1.9671545227368672, "rewards/rejected": -1.4802931149800618, "step": 8044 }, { "epoch": 0.4264171944981846, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11971876.0, "logits/rejected": -43717632.0, "logps/chosen": -51.925628662109375, "logps/rejected": -311.29099527994794, "loss": 0.2785, "rewards/chosen": -0.125863179564476, "rewards/margins": 1.6312375615040462, "rewards/rejected": -1.7571007410685222, "step": 8045 }, { "epoch": 0.42647019849998674, "grad_norm": 50.25, "kl": 0.19271087646484375, "learning_rate": 5e-07, "logits/chosen": -22209066.666666668, "logits/rejected": -137840976.0, "logps/chosen": -291.80226643880206, "logps/rejected": -755.453857421875, "loss": 0.3141, "rewards/chosen": 0.5085969765981039, "rewards/margins": 6.08851679166158, "rewards/rejected": -5.579919815063477, "step": 8046 }, { "epoch": 0.4265232025017889, "grad_norm": 56.0, "kl": 1.7501296997070312, "learning_rate": 5e-07, "logits/chosen": 232756019.2, "logits/rejected": -46972874.666666664, "logps/chosen": -389.87626953125, "logps/rejected": -463.0790201822917, "loss": 0.3008, "rewards/chosen": 0.5875740051269531, "rewards/margins": 2.9738152821858725, "rewards/rejected": -2.3862412770589194, "step": 8047 }, { "epoch": 0.426576206503591, "grad_norm": 47.75, "kl": 0.46593666076660156, "learning_rate": 5e-07, "logits/chosen": -25071208.0, "logits/rejected": -4361476.4, "logps/chosen": -293.1341552734375, "logps/rejected": -263.3080078125, "loss": 0.2193, "rewards/chosen": 1.0417579809824626, "rewards/margins": 2.774319855372111, "rewards/rejected": -1.7325618743896485, "step": 8048 }, { "epoch": 0.42662921050539315, "grad_norm": 50.5, "kl": 0.028934478759765625, "learning_rate": 5e-07, "logits/chosen": -34435840.0, "logits/rejected": -28816434.666666668, "logps/chosen": -436.8922424316406, "logps/rejected": -348.5104166666667, "loss": 0.2884, "rewards/chosen": -0.21388721466064453, "rewards/margins": 1.5478487014770508, "rewards/rejected": -1.7617359161376953, "step": 8049 }, { "epoch": 0.4266822145071953, "grad_norm": 56.0, "kl": 0.5117979049682617, "learning_rate": 5e-07, "logits/chosen": -13916724.0, "logits/rejected": -20820266.0, "logps/chosen": -167.24119567871094, "logps/rejected": -539.0625, "loss": 0.3692, "rewards/chosen": -0.12568150460720062, "rewards/margins": 3.2226678282022476, "rewards/rejected": -3.3483493328094482, "step": 8050 }, { "epoch": 0.4267352185089974, "grad_norm": 54.0, "kl": 0.25423431396484375, "learning_rate": 5e-07, "logits/chosen": -51845220.0, "logits/rejected": -15113902.0, "logps/chosen": -416.81640625, "logps/rejected": -123.35093688964844, "loss": 0.2805, "rewards/chosen": 0.775796115398407, "rewards/margins": 2.0723277926445007, "rewards/rejected": -1.2965316772460938, "step": 8051 }, { "epoch": 0.42678822251079956, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40037204.0, "logits/rejected": -10660284.0, "logps/chosen": -362.891845703125, "logps/rejected": -109.34918975830078, "loss": 0.292, "rewards/chosen": 0.34391728043556213, "rewards/margins": 2.3682312071323395, "rewards/rejected": -2.0243139266967773, "step": 8052 }, { "epoch": 0.4268412265126017, "grad_norm": 48.0, "kl": 0.5956325531005859, "learning_rate": 5e-07, "logits/chosen": -45104560.0, "logits/rejected": -29797156.0, "logps/chosen": -562.3518676757812, "logps/rejected": -358.305908203125, "loss": 0.2475, "rewards/chosen": 0.9272333383560181, "rewards/margins": 3.2589629888534546, "rewards/rejected": -2.3317296504974365, "step": 8053 }, { "epoch": 0.42689423051440384, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35053576.0, "logits/rejected": -18999664.0, "logps/chosen": -281.524169921875, "logps/rejected": -151.45054626464844, "loss": 0.4001, "rewards/chosen": -0.31643229722976685, "rewards/margins": 1.1629195809364319, "rewards/rejected": -1.4793518781661987, "step": 8054 }, { "epoch": 0.426947234516206, "grad_norm": 47.5, "kl": 1.2150592803955078, "learning_rate": 5e-07, "logits/chosen": -32878178.666666668, "logits/rejected": -37581872.0, "logps/chosen": -226.65812174479166, "logps/rejected": -534.704833984375, "loss": 0.3502, "rewards/chosen": 0.4273567994435628, "rewards/margins": 4.302120288213094, "rewards/rejected": -3.8747634887695312, "step": 8055 }, { "epoch": 0.4270002385180081, "grad_norm": 55.5, "kl": 1.6285552978515625, "learning_rate": 5e-07, "logits/chosen": -13708910.4, "logits/rejected": -7753230.666666667, "logps/chosen": -295.6395263671875, "logps/rejected": -162.3117472330729, "loss": 0.2933, "rewards/chosen": 0.710814094543457, "rewards/margins": 2.7883281071980797, "rewards/rejected": -2.0775140126546225, "step": 8056 }, { "epoch": 0.42705324251981025, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20582826.666666668, "logits/rejected": -26903102.0, "logps/chosen": -401.9970296223958, "logps/rejected": -249.9117431640625, "loss": 0.3787, "rewards/chosen": 0.5396339893341064, "rewards/margins": 1.8374627828598022, "rewards/rejected": -1.2978287935256958, "step": 8057 }, { "epoch": 0.4271062465216124, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28694672.0, "logits/rejected": -21323545.6, "logps/chosen": -162.899658203125, "logps/rejected": -475.17451171875, "loss": 0.2118, "rewards/chosen": 0.8203882376352946, "rewards/margins": 3.0619918982187904, "rewards/rejected": -2.241603660583496, "step": 8058 }, { "epoch": 0.4271592505234145, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28395328.0, "logits/rejected": -35871252.0, "logps/chosen": -469.6258850097656, "logps/rejected": -382.1963806152344, "loss": 0.2865, "rewards/chosen": 0.3287864923477173, "rewards/margins": 2.929303765296936, "rewards/rejected": -2.6005172729492188, "step": 8059 }, { "epoch": 0.42721225452521666, "grad_norm": 45.5, "kl": 2.180023193359375, "learning_rate": 5e-07, "logits/chosen": -2608807.25, "logits/rejected": -37167004.0, "logps/chosen": -140.5867462158203, "logps/rejected": -468.5458984375, "loss": 0.2519, "rewards/chosen": 0.5810648798942566, "rewards/margins": 3.6668359637260437, "rewards/rejected": -3.085771083831787, "step": 8060 }, { "epoch": 0.4272652585270188, "grad_norm": 55.0, "kl": 2.719879627227783, "learning_rate": 5e-07, "logits/chosen": -24976058.0, "logits/rejected": -36127880.0, "logps/chosen": -509.3835144042969, "logps/rejected": -414.0308837890625, "loss": 0.2707, "rewards/chosen": 0.7743973135948181, "rewards/margins": 3.4687477946281433, "rewards/rejected": -2.694350481033325, "step": 8061 }, { "epoch": 0.42731826252882094, "grad_norm": 41.75, "kl": 0.4238605499267578, "learning_rate": 5e-07, "logits/chosen": -39346813.333333336, "logits/rejected": -9952326.4, "logps/chosen": -239.80570475260416, "logps/rejected": -231.890234375, "loss": 0.2588, "rewards/chosen": 0.37938539187113446, "rewards/margins": 2.248978630701701, "rewards/rejected": -1.8695932388305665, "step": 8062 }, { "epoch": 0.4273712665306231, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7674167.0, "logits/rejected": -58641877.333333336, "logps/chosen": -217.23118591308594, "logps/rejected": -344.3324381510417, "loss": 0.2401, "rewards/chosen": 0.44168323278427124, "rewards/margins": 2.3225728074709577, "rewards/rejected": -1.8808895746866863, "step": 8063 }, { "epoch": 0.4274242705324252, "grad_norm": 71.0, "kl": 1.104888916015625, "learning_rate": 5e-07, "logits/chosen": -86280336.0, "logits/rejected": -5784171.0, "logps/chosen": -533.4364420572916, "logps/rejected": -156.6576385498047, "loss": 0.4591, "rewards/chosen": 0.1459091305732727, "rewards/margins": 0.876136064529419, "rewards/rejected": -0.7302269339561462, "step": 8064 }, { "epoch": 0.42747727453422735, "grad_norm": 75.5, "kl": 2.4919967651367188, "learning_rate": 5e-07, "logits/chosen": -15051542.666666666, "logits/rejected": -11484764.0, "logps/chosen": -266.1929524739583, "logps/rejected": -191.25579833984375, "loss": 0.329, "rewards/chosen": 0.9250205357869467, "rewards/margins": 2.291415055592855, "rewards/rejected": -1.3663945198059082, "step": 8065 }, { "epoch": 0.4275302785360295, "grad_norm": 48.25, "kl": 0.37073516845703125, "learning_rate": 5e-07, "logits/chosen": -25387306.666666668, "logits/rejected": -12091021.6, "logps/chosen": -306.464599609375, "logps/rejected": -183.2515380859375, "loss": 0.2635, "rewards/chosen": 0.8530187606811523, "rewards/margins": 2.1700658798217773, "rewards/rejected": -1.317047119140625, "step": 8066 }, { "epoch": 0.4275832825378316, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26612674.666666668, "logits/rejected": -11722320.0, "logps/chosen": -144.0922648111979, "logps/rejected": -252.47958984375, "loss": 0.3743, "rewards/chosen": -0.053707132736841835, "rewards/margins": 1.055296601851781, "rewards/rejected": -1.109003734588623, "step": 8067 }, { "epoch": 0.42763628653963376, "grad_norm": 53.5, "kl": 0.9327716827392578, "learning_rate": 5e-07, "logits/chosen": -38352154.666666664, "logits/rejected": -14460991.0, "logps/chosen": -368.8949381510417, "logps/rejected": -306.1728820800781, "loss": 0.3282, "rewards/chosen": 0.7108098665873209, "rewards/margins": 2.2018239895502725, "rewards/rejected": -1.4910141229629517, "step": 8068 }, { "epoch": 0.4276892905414359, "grad_norm": 45.75, "kl": 0.847844123840332, "learning_rate": 5e-07, "logits/chosen": -4460986.333333333, "logits/rejected": -17102067.2, "logps/chosen": -236.1971232096354, "logps/rejected": -216.288916015625, "loss": 0.2297, "rewards/chosen": 1.0254377524058025, "rewards/margins": 2.742608467737834, "rewards/rejected": -1.7171707153320312, "step": 8069 }, { "epoch": 0.42774229454323803, "grad_norm": 50.75, "kl": 0.295501708984375, "learning_rate": 5e-07, "logits/chosen": -28089086.0, "logits/rejected": -59282725.333333336, "logps/chosen": -445.984375, "logps/rejected": -495.7705485026042, "loss": 0.2123, "rewards/chosen": 0.14013977348804474, "rewards/margins": 2.3697421103715897, "rewards/rejected": -2.229602336883545, "step": 8070 }, { "epoch": 0.42779529854504017, "grad_norm": 51.5, "kl": 1.4428520202636719, "learning_rate": 5e-07, "logits/chosen": -26746409.14285714, "logits/rejected": -60793136.0, "logps/chosen": -276.87486049107144, "logps/rejected": -539.052490234375, "loss": 0.3758, "rewards/chosen": 0.5014492443629673, "rewards/margins": 4.977773802621024, "rewards/rejected": -4.476324558258057, "step": 8071 }, { "epoch": 0.4278483025468423, "grad_norm": 48.5, "kl": 0.40961456298828125, "learning_rate": 5e-07, "logits/chosen": -30616069.333333332, "logits/rejected": -58410640.0, "logps/chosen": -378.532470703125, "logps/rejected": -667.64794921875, "loss": 0.3192, "rewards/chosen": 0.6402739683787028, "rewards/margins": 3.140927235285441, "rewards/rejected": -2.5006532669067383, "step": 8072 }, { "epoch": 0.42790130654864444, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16180890.666666666, "logits/rejected": -1001552.4, "logps/chosen": -430.8072916666667, "logps/rejected": -248.490185546875, "loss": 0.3377, "rewards/chosen": 0.027609388033548992, "rewards/margins": 1.3567747513453166, "rewards/rejected": -1.3291653633117675, "step": 8073 }, { "epoch": 0.4279543105504466, "grad_norm": 96.0, "kl": 0.06290435791015625, "learning_rate": 5e-07, "logits/chosen": -130855704.0, "logits/rejected": -39778280.0, "logps/chosen": -378.2584228515625, "logps/rejected": -730.5803833007812, "loss": 0.275, "rewards/chosen": -0.039284512400627136, "rewards/margins": 4.413094714283943, "rewards/rejected": -4.45237922668457, "step": 8074 }, { "epoch": 0.4280073145522487, "grad_norm": 35.75, "kl": 0.09173011779785156, "learning_rate": 5e-07, "logits/chosen": -29622246.0, "logits/rejected": -13286857.333333334, "logps/chosen": -244.6054229736328, "logps/rejected": -195.09578450520834, "loss": 0.2563, "rewards/chosen": -0.07823886722326279, "rewards/margins": 2.1610400850574174, "rewards/rejected": -2.23927895228068, "step": 8075 }, { "epoch": 0.42806031855405086, "grad_norm": 82.0, "kl": 4.801856994628906, "learning_rate": 5e-07, "logits/chosen": -28127524.57142857, "logits/rejected": 5528791.5, "logps/chosen": -492.52249581473217, "logps/rejected": -78.43838500976562, "loss": 0.4582, "rewards/chosen": 0.7937079157148089, "rewards/margins": 1.5841246332441057, "rewards/rejected": -0.7904167175292969, "step": 8076 }, { "epoch": 0.428113322555853, "grad_norm": 64.0, "kl": 0.8924026489257812, "learning_rate": 5e-07, "logits/chosen": -20632965.333333332, "logits/rejected": -11667983.0, "logps/chosen": -215.68597412109375, "logps/rejected": -342.49658203125, "loss": 0.348, "rewards/chosen": 0.4423668384552002, "rewards/margins": 2.874074697494507, "rewards/rejected": -2.4317078590393066, "step": 8077 }, { "epoch": 0.42816632655765513, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12725021.0, "logits/rejected": -2602602.0, "logps/chosen": -211.6673126220703, "logps/rejected": -303.0358581542969, "loss": 0.2377, "rewards/chosen": 0.6101658940315247, "rewards/margins": 2.9520991444587708, "rewards/rejected": -2.341933250427246, "step": 8078 }, { "epoch": 0.4282193305594572, "grad_norm": 76.0, "kl": 0.6446380615234375, "learning_rate": 5e-07, "logits/chosen": -49402741.333333336, "logits/rejected": -36664472.0, "logps/chosen": -322.3814290364583, "logps/rejected": -257.15142822265625, "loss": 0.4259, "rewards/chosen": 0.2872627178827922, "rewards/margins": 0.9191742340723674, "rewards/rejected": -0.6319115161895752, "step": 8079 }, { "epoch": 0.42827233456125935, "grad_norm": 79.5, "kl": 1.0599069595336914, "learning_rate": 5e-07, "logits/chosen": -36453936.0, "logits/rejected": -28486069.333333332, "logps/chosen": -254.979248046875, "logps/rejected": -364.3866780598958, "loss": 0.3486, "rewards/chosen": 0.1204871654510498, "rewards/margins": 2.7837337970733644, "rewards/rejected": -2.6632466316223145, "step": 8080 }, { "epoch": 0.4283253385630615, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10993078.0, "logits/rejected": -26619497.14285714, "logps/chosen": -538.2947387695312, "logps/rejected": -266.48304966517856, "loss": 0.2235, "rewards/chosen": 0.965716540813446, "rewards/margins": 2.4879272069249834, "rewards/rejected": -1.5222106661115373, "step": 8081 }, { "epoch": 0.4283783425648636, "grad_norm": 43.0, "kl": 1.5580883026123047, "learning_rate": 5e-07, "logits/chosen": -9635603.2, "logits/rejected": -94877610.66666667, "logps/chosen": -457.021826171875, "logps/rejected": -308.5341796875, "loss": 0.3196, "rewards/chosen": 0.6499168395996093, "rewards/margins": 2.5125661214192707, "rewards/rejected": -1.8626492818196614, "step": 8082 }, { "epoch": 0.42843134656666576, "grad_norm": 63.25, "kl": 0.2692527770996094, "learning_rate": 5e-07, "logits/chosen": -50154550.4, "logits/rejected": -20303472.0, "logps/chosen": -528.547119140625, "logps/rejected": -247.2787882486979, "loss": 0.2825, "rewards/chosen": 0.6330088615417481, "rewards/margins": 2.646626377105713, "rewards/rejected": -2.013617515563965, "step": 8083 }, { "epoch": 0.4284843505684679, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58248154.666666664, "logits/rejected": -28392505.6, "logps/chosen": -259.26804606119794, "logps/rejected": -436.71611328125, "loss": 0.2203, "rewards/chosen": 0.8138054211934408, "rewards/margins": 2.9263490994771324, "rewards/rejected": -2.1125436782836915, "step": 8084 }, { "epoch": 0.42853735457027003, "grad_norm": 52.75, "kl": 1.8868179321289062, "learning_rate": 5e-07, "logits/chosen": -64794592.0, "logits/rejected": -40547830.4, "logps/chosen": -681.7537027994791, "logps/rejected": -279.948681640625, "loss": 0.2093, "rewards/chosen": 1.6090636253356934, "rewards/margins": 3.0870590209960938, "rewards/rejected": -1.4779953956604004, "step": 8085 }, { "epoch": 0.42859035857207217, "grad_norm": 52.0, "kl": 1.1895370483398438, "learning_rate": 5e-07, "logits/chosen": -58049779.2, "logits/rejected": -39017234.666666664, "logps/chosen": -297.8124267578125, "logps/rejected": -339.1658935546875, "loss": 0.3355, "rewards/chosen": 0.4706121921539307, "rewards/margins": 2.1010255654652914, "rewards/rejected": -1.6304133733113606, "step": 8086 }, { "epoch": 0.4286433625738743, "grad_norm": 40.25, "kl": 0.9255256652832031, "learning_rate": 5e-07, "logits/chosen": -52746400.0, "logits/rejected": -30847712.0, "logps/chosen": -466.7693277994792, "logps/rejected": -139.7342041015625, "loss": 0.1713, "rewards/chosen": 1.624588966369629, "rewards/margins": 3.605917549133301, "rewards/rejected": -1.981328582763672, "step": 8087 }, { "epoch": 0.42869636657567645, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 46451370.666666664, "logits/rejected": -38921536.0, "logps/chosen": -496.5594075520833, "logps/rejected": -385.462890625, "loss": 0.248, "rewards/chosen": 0.33647104104359943, "rewards/margins": 2.484539278348287, "rewards/rejected": -2.1480682373046873, "step": 8088 }, { "epoch": 0.4287493705774786, "grad_norm": 60.75, "kl": 4.659023284912109, "learning_rate": 5e-07, "logits/chosen": -55310064.0, "logits/rejected": -11045520.0, "logps/chosen": -721.0514526367188, "logps/rejected": -327.09881591796875, "loss": 0.1846, "rewards/chosen": 1.8537132740020752, "rewards/margins": 4.840958833694458, "rewards/rejected": -2.987245559692383, "step": 8089 }, { "epoch": 0.4288023745792807, "grad_norm": 45.25, "kl": 0.07548332214355469, "learning_rate": 5e-07, "logits/chosen": -51116502.4, "logits/rejected": 16640618.666666666, "logps/chosen": -258.6749267578125, "logps/rejected": -127.99740600585938, "loss": 0.3866, "rewards/chosen": 0.1368630647659302, "rewards/margins": 1.5258990844090778, "rewards/rejected": -1.3890360196431477, "step": 8090 }, { "epoch": 0.42885537858108286, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92471968.0, "logits/rejected": -15947947.2, "logps/chosen": -426.9358723958333, "logps/rejected": -238.5345458984375, "loss": 0.2704, "rewards/chosen": 0.481342077255249, "rewards/margins": 1.925749158859253, "rewards/rejected": -1.444407081604004, "step": 8091 }, { "epoch": 0.428908382582885, "grad_norm": 41.0, "kl": 0.815643310546875, "learning_rate": 5e-07, "logits/chosen": 2991302.0, "logits/rejected": -8039092.5, "logps/chosen": -20.117774963378906, "logps/rejected": -182.09872436523438, "loss": 0.369, "rewards/chosen": 0.00032420456409454346, "rewards/margins": 1.6099276095628738, "rewards/rejected": -1.6096034049987793, "step": 8092 }, { "epoch": 0.42896138658468713, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3808814.0, "logits/rejected": -54499596.8, "logps/chosen": -157.1307576497396, "logps/rejected": -202.0877197265625, "loss": 0.3638, "rewards/chosen": -0.34847791989644367, "rewards/margins": 0.9572475592295329, "rewards/rejected": -1.3057254791259765, "step": 8093 }, { "epoch": 0.42901439058648927, "grad_norm": 43.5, "kl": 0.018705368041992188, "learning_rate": 5e-07, "logits/chosen": -27643098.666666668, "logits/rejected": -28239440.0, "logps/chosen": -575.6360270182291, "logps/rejected": -313.9015197753906, "loss": 0.3321, "rewards/chosen": 0.928538958231608, "rewards/margins": 2.6512593428293862, "rewards/rejected": -1.7227203845977783, "step": 8094 }, { "epoch": 0.4290673945882914, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13895982.0, "logits/rejected": -18121536.0, "logps/chosen": -76.35490417480469, "logps/rejected": -227.5994110107422, "loss": 0.3967, "rewards/chosen": -0.2818743586540222, "rewards/margins": 1.1062493920326233, "rewards/rejected": -1.3881237506866455, "step": 8095 }, { "epoch": 0.42912039859009354, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -124434576.0, "logits/rejected": -9948682.666666666, "logps/chosen": -146.54116821289062, "logps/rejected": -434.58056640625, "loss": 0.2314, "rewards/chosen": -0.13464012742042542, "rewards/margins": 2.0071073472499847, "rewards/rejected": -2.14174747467041, "step": 8096 }, { "epoch": 0.4291734025918957, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29331762.666666668, "logits/rejected": -22345131.2, "logps/chosen": -169.30716959635416, "logps/rejected": -438.9904296875, "loss": 0.255, "rewards/chosen": 0.023384734988212585, "rewards/margins": 2.2418572157621384, "rewards/rejected": -2.218472480773926, "step": 8097 }, { "epoch": 0.4292264065936978, "grad_norm": 56.0, "kl": 1.1665706634521484, "learning_rate": 5e-07, "logits/chosen": -21502235.42857143, "logits/rejected": 1029281.0, "logps/chosen": -481.95814732142856, "logps/rejected": -157.05616760253906, "loss": 0.3388, "rewards/chosen": 1.007906709398542, "rewards/margins": 2.415751252855573, "rewards/rejected": -1.4078445434570312, "step": 8098 }, { "epoch": 0.42927941059549996, "grad_norm": 82.0, "kl": 4.57866096496582, "learning_rate": 5e-07, "logits/chosen": -37828021.333333336, "logits/rejected": 758635.5, "logps/chosen": -700.61474609375, "logps/rejected": -172.7159423828125, "loss": 0.4136, "rewards/chosen": 0.8452739715576172, "rewards/margins": 1.8221195340156555, "rewards/rejected": -0.9768455624580383, "step": 8099 }, { "epoch": 0.4293324145973021, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48128325.333333336, "logits/rejected": -25253244.8, "logps/chosen": -211.1842041015625, "logps/rejected": -135.594580078125, "loss": 0.3064, "rewards/chosen": 0.44689234097798664, "rewards/margins": 1.7610586007436115, "rewards/rejected": -1.314166259765625, "step": 8100 }, { "epoch": 0.42938541859910423, "grad_norm": 28.0, "kl": 0.4361076354980469, "learning_rate": 5e-07, "logits/chosen": -1970418.8, "logits/rejected": -38215664.0, "logps/chosen": -103.48565673828125, "logps/rejected": -420.0008951822917, "loss": 0.2858, "rewards/chosen": 0.4915578365325928, "rewards/margins": 3.182838519414266, "rewards/rejected": -2.6912806828816733, "step": 8101 }, { "epoch": 0.42943842260090637, "grad_norm": 42.5, "kl": 2.9543075561523438, "learning_rate": 5e-07, "logits/chosen": -29228088.0, "logits/rejected": -18206069.333333332, "logps/chosen": -1144.0150146484375, "logps/rejected": -528.5440673828125, "loss": 0.1975, "rewards/chosen": 1.9574531316757202, "rewards/margins": 4.328505237897238, "rewards/rejected": -2.371052106221517, "step": 8102 }, { "epoch": 0.4294914266027085, "grad_norm": 53.0, "kl": 1.4505157470703125, "learning_rate": 5e-07, "logits/chosen": -17441218.0, "logits/rejected": 2667997.5, "logps/chosen": -324.2547302246094, "logps/rejected": -352.73773193359375, "loss": 0.202, "rewards/chosen": 1.1984654664993286, "rewards/margins": 3.378136992454529, "rewards/rejected": -2.1796715259552, "step": 8103 }, { "epoch": 0.42954443060451064, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39101040.0, "logits/rejected": -28855456.0, "logps/chosen": -541.1289876302084, "logps/rejected": -375.1707763671875, "loss": 0.212, "rewards/chosen": 0.6045801639556885, "rewards/margins": 3.1209866046905517, "rewards/rejected": -2.516406440734863, "step": 8104 }, { "epoch": 0.4295974346063128, "grad_norm": 40.0, "kl": 0.350189208984375, "learning_rate": 5e-07, "logits/chosen": -7181775.0, "logits/rejected": -36812608.0, "logps/chosen": -134.50721740722656, "logps/rejected": -434.57781982421875, "loss": 0.2745, "rewards/chosen": 0.4876672327518463, "rewards/margins": 2.7092610895633698, "rewards/rejected": -2.2215938568115234, "step": 8105 }, { "epoch": 0.4296504386081149, "grad_norm": 58.75, "kl": 1.9848194122314453, "learning_rate": 5e-07, "logits/chosen": -23097400.0, "logits/rejected": 2271037.0, "logps/chosen": -260.8592529296875, "logps/rejected": -373.78466796875, "loss": 0.3572, "rewards/chosen": 0.44066540400187176, "rewards/margins": 2.820742050806681, "rewards/rejected": -2.3800766468048096, "step": 8106 }, { "epoch": 0.42970344260991705, "grad_norm": 61.75, "kl": 0.6964244842529297, "learning_rate": 5e-07, "logits/chosen": -11502178.666666666, "logits/rejected": -45264128.0, "logps/chosen": -334.411376953125, "logps/rejected": -180.24996948242188, "loss": 0.4055, "rewards/chosen": 0.49675877888997394, "rewards/margins": 1.19456680615743, "rewards/rejected": -0.697808027267456, "step": 8107 }, { "epoch": 0.4297564466117192, "grad_norm": 67.5, "kl": 1.3888168334960938, "learning_rate": 5e-07, "logits/chosen": -86239221.33333333, "logits/rejected": -41273043.2, "logps/chosen": -738.99072265625, "logps/rejected": -450.16357421875, "loss": 0.2938, "rewards/chosen": 0.4326232671737671, "rewards/margins": 2.096646857261658, "rewards/rejected": -1.6640235900878906, "step": 8108 }, { "epoch": 0.4298094506135213, "grad_norm": 45.25, "kl": 0.5912151336669922, "learning_rate": 5e-07, "logits/chosen": -54309299.2, "logits/rejected": -12459641.333333334, "logps/chosen": -311.2589111328125, "logps/rejected": -436.2408854166667, "loss": 0.3053, "rewards/chosen": 0.580040693283081, "rewards/margins": 2.4952043692270918, "rewards/rejected": -1.9151636759440105, "step": 8109 }, { "epoch": 0.42986245461532346, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9579885.333333334, "logits/rejected": -45887833.6, "logps/chosen": -134.9924519856771, "logps/rejected": -303.51455078125, "loss": 0.3442, "rewards/chosen": -0.1592968006928762, "rewards/margins": 1.2503388384977978, "rewards/rejected": -1.409635639190674, "step": 8110 }, { "epoch": 0.4299154586171256, "grad_norm": 47.25, "kl": 1.7630977630615234, "learning_rate": 5e-07, "logits/chosen": -37273817.6, "logits/rejected": -35970626.666666664, "logps/chosen": -314.1782470703125, "logps/rejected": -336.29791259765625, "loss": 0.3711, "rewards/chosen": 0.38158202171325684, "rewards/margins": 2.2824316819508867, "rewards/rejected": -1.9008496602376301, "step": 8111 }, { "epoch": 0.42996846261892774, "grad_norm": 47.75, "kl": 0.5784111022949219, "learning_rate": 5e-07, "logits/chosen": -50065232.0, "logits/rejected": -39440256.0, "logps/chosen": -282.2672119140625, "logps/rejected": -184.1295623779297, "loss": 0.2429, "rewards/chosen": 0.9068306684494019, "rewards/margins": 2.6181188821792603, "rewards/rejected": -1.7112882137298584, "step": 8112 }, { "epoch": 0.4300214666207299, "grad_norm": 49.25, "kl": 0.4616546630859375, "learning_rate": 5e-07, "logits/chosen": -12515770.666666666, "logits/rejected": -38131827.2, "logps/chosen": -269.14528401692706, "logps/rejected": -320.484912109375, "loss": 0.2688, "rewards/chosen": 0.021373937527338665, "rewards/margins": 2.1420959452788035, "rewards/rejected": -2.120722007751465, "step": 8113 }, { "epoch": 0.430074470622532, "grad_norm": 45.75, "kl": 0.3393688201904297, "learning_rate": 5e-07, "logits/chosen": -12718683.2, "logits/rejected": -55506933.333333336, "logps/chosen": -185.2168701171875, "logps/rejected": -467.2727457682292, "loss": 0.3138, "rewards/chosen": 0.316663122177124, "rewards/margins": 2.6910953680674234, "rewards/rejected": -2.3744322458902993, "step": 8114 }, { "epoch": 0.43012747462433415, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26801573.333333332, "logits/rejected": -22856358.4, "logps/chosen": -178.4365030924479, "logps/rejected": -348.55087890625, "loss": 0.2022, "rewards/chosen": 0.32664668560028076, "rewards/margins": 3.2303412199020385, "rewards/rejected": -2.9036945343017577, "step": 8115 }, { "epoch": 0.4301804786261363, "grad_norm": 65.0, "kl": 0.9734430313110352, "learning_rate": 5e-07, "logits/chosen": -25187692.8, "logits/rejected": -36150304.0, "logps/chosen": -251.055712890625, "logps/rejected": -235.23836263020834, "loss": 0.2861, "rewards/chosen": 0.7341263294219971, "rewards/margins": 2.8137781620025635, "rewards/rejected": -2.0796518325805664, "step": 8116 }, { "epoch": 0.4302334826279384, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13817752.0, "logits/rejected": -15906158.666666666, "logps/chosen": -229.9585418701172, "logps/rejected": -332.83457438151044, "loss": 0.2074, "rewards/chosen": 0.17810620367527008, "rewards/margins": 2.516850526134173, "rewards/rejected": -2.338744322458903, "step": 8117 }, { "epoch": 0.43028648662974056, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11845496.0, "logits/rejected": -2034890.6, "logps/chosen": -331.7972005208333, "logps/rejected": -391.83662109375, "loss": 0.2244, "rewards/chosen": 0.7085286776224772, "rewards/margins": 2.620157019297282, "rewards/rejected": -1.9116283416748048, "step": 8118 }, { "epoch": 0.4303394906315427, "grad_norm": 81.0, "kl": 4.279998779296875, "learning_rate": 5e-07, "logits/chosen": -51858437.333333336, "logits/rejected": -10708584.8, "logps/chosen": -692.4620768229166, "logps/rejected": -371.6201416015625, "loss": 0.2029, "rewards/chosen": 1.2906423409779866, "rewards/margins": 4.987846167882283, "rewards/rejected": -3.697203826904297, "step": 8119 }, { "epoch": 0.43039249463334484, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23332253.333333332, "logits/rejected": -53223224.0, "logps/chosen": -368.8785400390625, "logps/rejected": -796.435546875, "loss": 0.2698, "rewards/chosen": 0.9196560382843018, "rewards/margins": 5.030468225479126, "rewards/rejected": -4.110812187194824, "step": 8120 }, { "epoch": 0.430445498635147, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69242928.0, "logits/rejected": -56317176.0, "logps/chosen": -301.8920593261719, "logps/rejected": -633.8021850585938, "loss": 0.2035, "rewards/chosen": 0.892967700958252, "rewards/margins": 3.8218512535095215, "rewards/rejected": -2.9288835525512695, "step": 8121 }, { "epoch": 0.4304985026369491, "grad_norm": 48.25, "kl": 0.35060882568359375, "learning_rate": 5e-07, "logits/chosen": 7790565.5, "logits/rejected": -30124048.0, "logps/chosen": -302.37066650390625, "logps/rejected": -275.337890625, "loss": 0.2548, "rewards/chosen": 1.0661423206329346, "rewards/margins": 2.437976598739624, "rewards/rejected": -1.3718342781066895, "step": 8122 }, { "epoch": 0.43055150663875125, "grad_norm": 55.25, "kl": 1.8964214324951172, "learning_rate": 5e-07, "logits/chosen": -35462288.0, "logits/rejected": -50708560.0, "logps/chosen": -297.859033203125, "logps/rejected": -239.388916015625, "loss": 0.4488, "rewards/chosen": 0.129494571685791, "rewards/margins": 0.8190986156463623, "rewards/rejected": -0.6896040439605713, "step": 8123 }, { "epoch": 0.4306045106405534, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34649962.666666664, "logits/rejected": -4374282.0, "logps/chosen": -363.7601725260417, "logps/rejected": -186.83349609375, "loss": 0.3555, "rewards/chosen": 0.39857470989227295, "rewards/margins": 2.1911126375198364, "rewards/rejected": -1.7925379276275635, "step": 8124 }, { "epoch": 0.4306575146423555, "grad_norm": 56.0, "kl": 0.7011003494262695, "learning_rate": 5e-07, "logits/chosen": -44408278.4, "logits/rejected": -16667256.0, "logps/chosen": -283.6154541015625, "logps/rejected": -401.3207600911458, "loss": 0.3598, "rewards/chosen": 0.170768141746521, "rewards/margins": 2.2678391536076865, "rewards/rejected": -2.0970710118611655, "step": 8125 }, { "epoch": 0.43071051864415766, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39364469.333333336, "logits/rejected": -53390713.6, "logps/chosen": -443.6473795572917, "logps/rejected": -244.329052734375, "loss": 0.2072, "rewards/chosen": 0.8471797307332357, "rewards/margins": 3.160767682393392, "rewards/rejected": -2.313587951660156, "step": 8126 }, { "epoch": 0.4307635226459598, "grad_norm": 35.25, "kl": 0.20402908325195312, "learning_rate": 5e-07, "logits/chosen": -290302.25, "logits/rejected": -18911485.333333332, "logps/chosen": -26.10363006591797, "logps/rejected": -260.07668050130206, "loss": 0.2531, "rewards/chosen": -0.055405620485544205, "rewards/margins": 1.781202948341767, "rewards/rejected": -1.8366085688273113, "step": 8127 }, { "epoch": 0.43081652664776193, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24680064.0, "logits/rejected": -35917568.0, "logps/chosen": -98.34107971191406, "logps/rejected": -216.4535369873047, "loss": 0.3514, "rewards/chosen": 0.08039142936468124, "rewards/margins": 1.379550002515316, "rewards/rejected": -1.2991585731506348, "step": 8128 }, { "epoch": 0.43086953064956407, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1886363.6666666667, "logits/rejected": -2245107.6, "logps/chosen": -178.53951009114584, "logps/rejected": -173.75396728515625, "loss": 0.283, "rewards/chosen": 0.7071426709493002, "rewards/margins": 2.039505608876546, "rewards/rejected": -1.3323629379272461, "step": 8129 }, { "epoch": 0.43092253465136615, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17558803.2, "logits/rejected": -2726937.0, "logps/chosen": -268.0425048828125, "logps/rejected": -223.4524943033854, "loss": 0.3727, "rewards/chosen": 0.20802879333496094, "rewards/margins": 2.1055275599161787, "rewards/rejected": -1.8974987665812175, "step": 8130 }, { "epoch": 0.4309755386531683, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65584298.666666664, "logits/rejected": -16380620.8, "logps/chosen": -416.4913736979167, "logps/rejected": -187.151025390625, "loss": 0.2827, "rewards/chosen": -0.1585647463798523, "rewards/margins": 1.8829540371894837, "rewards/rejected": -2.041518783569336, "step": 8131 }, { "epoch": 0.4310285426549704, "grad_norm": 50.5, "kl": 0.9890480041503906, "learning_rate": 5e-07, "logits/chosen": -24396434.666666668, "logits/rejected": -32082664.0, "logps/chosen": -390.7036946614583, "logps/rejected": -572.1744995117188, "loss": 0.325, "rewards/chosen": 0.757521390914917, "rewards/margins": 3.7842180728912354, "rewards/rejected": -3.0266966819763184, "step": 8132 }, { "epoch": 0.43108154665677256, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2712608.25, "logits/rejected": -17557374.666666668, "logps/chosen": -63.74000930786133, "logps/rejected": -312.0691731770833, "loss": 0.2574, "rewards/chosen": -0.36692237854003906, "rewards/margins": 1.8410609563191733, "rewards/rejected": -2.2079833348592124, "step": 8133 }, { "epoch": 0.4311345506585747, "grad_norm": 54.75, "kl": 1.1035947799682617, "learning_rate": 5e-07, "logits/chosen": 2156902.0, "logits/rejected": -20590974.666666668, "logps/chosen": -328.8908203125, "logps/rejected": -113.35880533854167, "loss": 0.3631, "rewards/chosen": 0.6383741855621338, "rewards/margins": 1.5241859118143717, "rewards/rejected": -0.8858117262522379, "step": 8134 }, { "epoch": 0.43118755466037684, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6210100.5, "logits/rejected": -1779545.3333333333, "logps/chosen": -172.3992462158203, "logps/rejected": -311.68023681640625, "loss": 0.2235, "rewards/chosen": 0.9730856418609619, "rewards/margins": 2.756860176722209, "rewards/rejected": -1.7837745348612468, "step": 8135 }, { "epoch": 0.431240558662179, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30678758.0, "logits/rejected": -34929328.0, "logps/chosen": -375.69573974609375, "logps/rejected": -331.95361328125, "loss": 0.3089, "rewards/chosen": 0.30179497599601746, "rewards/margins": 1.927023857831955, "rewards/rejected": -1.6252288818359375, "step": 8136 }, { "epoch": 0.4312935626639811, "grad_norm": 55.75, "kl": 0.4872169494628906, "learning_rate": 5e-07, "logits/chosen": -10462485.333333334, "logits/rejected": -18402009.6, "logps/chosen": -380.8210042317708, "logps/rejected": -358.8291015625, "loss": 0.3205, "rewards/chosen": 0.10068359971046448, "rewards/margins": 1.9600572645664216, "rewards/rejected": -1.8593736648559571, "step": 8137 }, { "epoch": 0.43134656666578325, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71509376.0, "logits/rejected": -29354102.85714286, "logps/chosen": -1033.935546875, "logps/rejected": -281.61314174107144, "loss": 0.188, "rewards/chosen": -0.02158203162252903, "rewards/margins": 2.3705936428159475, "rewards/rejected": -2.3921756744384766, "step": 8138 }, { "epoch": 0.4313995706675854, "grad_norm": 43.75, "kl": 0.9288139343261719, "learning_rate": 5e-07, "logits/chosen": -81438920.0, "logits/rejected": -56940872.0, "logps/chosen": -274.5379638671875, "logps/rejected": -645.0047607421875, "loss": 0.2576, "rewards/chosen": 0.38335224986076355, "rewards/margins": 3.3030414283275604, "rewards/rejected": -2.919689178466797, "step": 8139 }, { "epoch": 0.4314525746693875, "grad_norm": 54.0, "kl": 2.018831253051758, "learning_rate": 5e-07, "logits/chosen": -25458916.0, "logits/rejected": -22697632.0, "logps/chosen": -646.5505981445312, "logps/rejected": -344.570556640625, "loss": 0.3076, "rewards/chosen": 0.8997771739959717, "rewards/margins": 2.4187618494033813, "rewards/rejected": -1.5189846754074097, "step": 8140 }, { "epoch": 0.43150557867118966, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32760628.0, "logits/rejected": -19157541.333333332, "logps/chosen": -396.71502685546875, "logps/rejected": -261.892822265625, "loss": 0.149, "rewards/chosen": 1.3010621070861816, "rewards/margins": 3.841541131337484, "rewards/rejected": -2.5404790242513022, "step": 8141 }, { "epoch": 0.4315585826729918, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48388693.333333336, "logits/rejected": -31951385.6, "logps/chosen": -286.06483968098956, "logps/rejected": -302.603271484375, "loss": 0.1666, "rewards/chosen": 0.9071617126464844, "rewards/margins": 3.3494441986083983, "rewards/rejected": -2.442282485961914, "step": 8142 }, { "epoch": 0.43161158667479393, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -24546872.0, "logps/rejected": -332.57794189453125, "loss": 0.1747, "rewards/rejected": -2.2387654781341553, "step": 8143 }, { "epoch": 0.43166459067659607, "grad_norm": 45.5, "kl": 0.1880054473876953, "learning_rate": 5e-07, "logits/chosen": -7860665.0, "logits/rejected": -21541052.0, "logps/chosen": -305.8715515136719, "logps/rejected": -266.8503011067708, "loss": 0.2267, "rewards/chosen": 0.6431821584701538, "rewards/margins": 2.3265737295150757, "rewards/rejected": -1.6833915710449219, "step": 8144 }, { "epoch": 0.4317175946783982, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70603472.0, "logits/rejected": -31156181.333333332, "logps/chosen": -293.067626953125, "logps/rejected": -306.30735270182294, "loss": 0.2329, "rewards/chosen": 0.04286651685833931, "rewards/margins": 2.018907515952985, "rewards/rejected": -1.9760409990946453, "step": 8145 }, { "epoch": 0.43177059868020035, "grad_norm": 46.5, "kl": 0.18842315673828125, "learning_rate": 5e-07, "logits/chosen": -8680255.333333334, "logits/rejected": -17004804.0, "logps/chosen": -140.30973307291666, "logps/rejected": -346.3582458496094, "loss": 0.4304, "rewards/chosen": 0.03467687467734019, "rewards/margins": 1.36335655550162, "rewards/rejected": -1.3286796808242798, "step": 8146 }, { "epoch": 0.4318236026820025, "grad_norm": 44.5, "kl": 0.9090032577514648, "learning_rate": 5e-07, "logits/chosen": -5734235.5, "logits/rejected": -32598946.0, "logps/chosen": -142.0179901123047, "logps/rejected": -227.28671264648438, "loss": 0.3031, "rewards/chosen": 0.5957009196281433, "rewards/margins": 2.1421982645988464, "rewards/rejected": -1.5464973449707031, "step": 8147 }, { "epoch": 0.4318766066838046, "grad_norm": 58.5, "kl": 0.28223419189453125, "learning_rate": 5e-07, "logits/chosen": -72475424.0, "logits/rejected": -36719754.666666664, "logps/chosen": -459.864990234375, "logps/rejected": -458.183349609375, "loss": 0.3029, "rewards/chosen": 0.6661120414733886, "rewards/margins": 2.6829227765401207, "rewards/rejected": -2.016810735066732, "step": 8148 }, { "epoch": 0.43192961068560676, "grad_norm": 54.75, "kl": 1.4996528625488281, "learning_rate": 5e-07, "logits/chosen": -9351977.0, "logits/rejected": -7641488.5, "logps/chosen": -305.0302734375, "logps/rejected": -321.8877258300781, "loss": 0.2552, "rewards/chosen": 0.6240180730819702, "rewards/margins": 3.1698297262191772, "rewards/rejected": -2.545811653137207, "step": 8149 }, { "epoch": 0.4319826146874089, "grad_norm": 56.0, "kl": 0.6045646667480469, "learning_rate": 5e-07, "logits/chosen": -57802552.0, "logits/rejected": -3811209.3333333335, "logps/chosen": -315.13067626953125, "logps/rejected": -174.2230428059896, "loss": 0.3361, "rewards/chosen": 0.34566575288772583, "rewards/margins": 1.4160478711128235, "rewards/rejected": -1.0703821182250977, "step": 8150 }, { "epoch": 0.43203561868921103, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24709232.0, "logits/rejected": -38406332.8, "logps/chosen": -478.8093668619792, "logps/rejected": -233.1991943359375, "loss": 0.3139, "rewards/chosen": -0.30356242259343463, "rewards/margins": 1.640863541762034, "rewards/rejected": -1.9444259643554687, "step": 8151 }, { "epoch": 0.43208862269101317, "grad_norm": 43.25, "kl": 0.06835365295410156, "learning_rate": 5e-07, "logits/chosen": -29987444.0, "logits/rejected": -5345211.5, "logps/chosen": -148.8734588623047, "logps/rejected": -353.40997314453125, "loss": 0.274, "rewards/chosen": 0.3203003704547882, "rewards/margins": 2.378220111131668, "rewards/rejected": -2.05791974067688, "step": 8152 }, { "epoch": 0.4321416266928153, "grad_norm": 56.25, "kl": 0.9861078262329102, "learning_rate": 5e-07, "logits/chosen": -12714033.333333334, "logits/rejected": -49487852.0, "logps/chosen": -205.79229736328125, "logps/rejected": -381.1451110839844, "loss": 0.4505, "rewards/chosen": -0.1563806732495626, "rewards/margins": 1.8896946708361309, "rewards/rejected": -2.0460753440856934, "step": 8153 }, { "epoch": 0.43219463069461744, "grad_norm": 41.25, "kl": 1.3875722885131836, "learning_rate": 5e-07, "logits/chosen": -20689542.0, "logits/rejected": -42098994.28571428, "logps/chosen": -437.3948059082031, "logps/rejected": -254.64315359933036, "loss": 0.1412, "rewards/chosen": 2.4958343505859375, "rewards/margins": 4.420452799115862, "rewards/rejected": -1.9246184485299247, "step": 8154 }, { "epoch": 0.4322476346964196, "grad_norm": 58.75, "kl": 0.25293684005737305, "learning_rate": 5e-07, "logits/chosen": 1135934.3, "logits/rejected": -73112906.66666667, "logps/chosen": -118.26556396484375, "logps/rejected": -425.6607259114583, "loss": 0.3944, "rewards/chosen": -0.14974806308746338, "rewards/margins": 1.785298450787862, "rewards/rejected": -1.9350465138753254, "step": 8155 }, { "epoch": 0.4323006386982217, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11854120.0, "logits/rejected": -20340640.0, "logps/chosen": -58.73031234741211, "logps/rejected": -269.220703125, "loss": 0.3263, "rewards/chosen": 0.04399093985557556, "rewards/margins": 1.9299176633358002, "rewards/rejected": -1.8859267234802246, "step": 8156 }, { "epoch": 0.43235364270002385, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17283374.666666668, "logits/rejected": -17075940.8, "logps/chosen": -392.3969319661458, "logps/rejected": -313.760986328125, "loss": 0.1685, "rewards/chosen": 0.8940460681915283, "rewards/margins": 3.716416597366333, "rewards/rejected": -2.8223705291748047, "step": 8157 }, { "epoch": 0.432406646701826, "grad_norm": 33.75, "kl": 0.5541067123413086, "learning_rate": 5e-07, "logits/chosen": 2998243.5, "logits/rejected": -28532696.0, "logps/chosen": -60.93238830566406, "logps/rejected": -320.5303141276042, "loss": 0.2452, "rewards/chosen": -0.42495405673980713, "rewards/margins": 1.8146721919377646, "rewards/rejected": -2.2396262486775718, "step": 8158 }, { "epoch": 0.43245965070362813, "grad_norm": 40.5, "kl": 0.6416893005371094, "learning_rate": 5e-07, "logits/chosen": -10011448.0, "logits/rejected": -15373870.4, "logps/chosen": -508.3833414713542, "logps/rejected": -278.64140625, "loss": 0.1868, "rewards/chosen": 1.199520428975423, "rewards/margins": 3.5631665547688804, "rewards/rejected": -2.363646125793457, "step": 8159 }, { "epoch": 0.43251265470543027, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23773114.666666668, "logits/rejected": -24162312.0, "logps/chosen": -311.34580485026044, "logps/rejected": -486.942919921875, "loss": 0.2704, "rewards/chosen": 0.42266162236531574, "rewards/margins": 2.761826737721761, "rewards/rejected": -2.3391651153564452, "step": 8160 }, { "epoch": 0.4325656587072324, "grad_norm": 56.25, "kl": 1.2886962890625, "learning_rate": 5e-07, "logits/chosen": -14286996.0, "logits/rejected": -51872312.0, "logps/chosen": -392.3687438964844, "logps/rejected": -257.8741760253906, "loss": 0.2774, "rewards/chosen": 0.5705259442329407, "rewards/margins": 2.142256796360016, "rewards/rejected": -1.5717308521270752, "step": 8161 }, { "epoch": 0.43261866270903454, "grad_norm": 46.0, "kl": 1.1399211883544922, "learning_rate": 5e-07, "logits/chosen": -9069720.0, "logits/rejected": -61268620.0, "logps/chosen": -150.77264404296875, "logps/rejected": -258.71533203125, "loss": 0.3414, "rewards/chosen": 0.38835468888282776, "rewards/margins": 1.6092207729816437, "rewards/rejected": -1.220866084098816, "step": 8162 }, { "epoch": 0.4326716667108367, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4589938.4, "logits/rejected": -18653953.333333332, "logps/chosen": -244.83017578125, "logps/rejected": -314.00050862630206, "loss": 0.3677, "rewards/chosen": -0.07862023115158082, "rewards/margins": 2.0794643203417458, "rewards/rejected": -2.1580845514933267, "step": 8163 }, { "epoch": 0.4327246707126388, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1580584.0, "logits/rejected": -19237366.0, "logps/chosen": -332.14947509765625, "logps/rejected": -353.3252258300781, "loss": 0.2987, "rewards/chosen": 0.03325815871357918, "rewards/margins": 2.2332753725349903, "rewards/rejected": -2.200017213821411, "step": 8164 }, { "epoch": 0.43277767471444095, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41560336.0, "logits/rejected": -48827754.666666664, "logps/chosen": -194.9664764404297, "logps/rejected": -398.3558756510417, "loss": 0.2446, "rewards/chosen": 0.3727371394634247, "rewards/margins": 2.444893409808477, "rewards/rejected": -2.0721562703450522, "step": 8165 }, { "epoch": 0.4328306787162431, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33188758.4, "logits/rejected": 28964200.0, "logps/chosen": -814.364404296875, "logps/rejected": -368.5413818359375, "loss": 0.2775, "rewards/chosen": 0.8666601181030273, "rewards/margins": 3.2689738273620605, "rewards/rejected": -2.402313709259033, "step": 8166 }, { "epoch": 0.4328836827180452, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27736664.0, "logits/rejected": -16776966.4, "logps/chosen": -245.98602294921875, "logps/rejected": -369.0090087890625, "loss": 0.217, "rewards/chosen": 0.32738592227300006, "rewards/margins": 2.844473667939504, "rewards/rejected": -2.5170877456665037, "step": 8167 }, { "epoch": 0.43293668671984736, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9417207.2, "logits/rejected": 8835194.666666666, "logps/chosen": -310.014208984375, "logps/rejected": -706.2909342447916, "loss": 0.3116, "rewards/chosen": 0.08932533264160156, "rewards/margins": 4.138153457641602, "rewards/rejected": -4.048828125, "step": 8168 }, { "epoch": 0.4329896907216495, "grad_norm": 52.5, "kl": 0.85638427734375, "learning_rate": 5e-07, "logits/chosen": -39093785.6, "logits/rejected": -17938809.333333332, "logps/chosen": -495.726806640625, "logps/rejected": -229.8252970377604, "loss": 0.3232, "rewards/chosen": 0.555713415145874, "rewards/margins": 2.1999313831329346, "rewards/rejected": -1.6442179679870605, "step": 8169 }, { "epoch": 0.43304269472345164, "grad_norm": 51.0, "kl": 1.6208553314208984, "learning_rate": 5e-07, "logits/chosen": -63851104.0, "logits/rejected": -35712090.666666664, "logps/chosen": -370.8395751953125, "logps/rejected": -481.31298828125, "loss": 0.3644, "rewards/chosen": 0.027159881591796876, "rewards/margins": 3.031801732381185, "rewards/rejected": -3.004641850789388, "step": 8170 }, { "epoch": 0.4330956987252538, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1277306.5, "logits/rejected": -11754962.666666666, "logps/chosen": -158.8288116455078, "logps/rejected": -132.6673380533854, "loss": 0.2175, "rewards/chosen": 0.766815185546875, "rewards/margins": 2.374223391215007, "rewards/rejected": -1.6074082056681316, "step": 8171 }, { "epoch": 0.4331487027270559, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9555879.333333334, "logits/rejected": 2807360.0, "logps/chosen": -284.36553955078125, "logps/rejected": -345.11201171875, "loss": 0.2871, "rewards/chosen": 0.2581898371378581, "rewards/margins": 1.7726126352945963, "rewards/rejected": -1.5144227981567382, "step": 8172 }, { "epoch": 0.43320170672885805, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21975150.0, "logits/rejected": -21203308.0, "logps/chosen": -379.9296875, "logps/rejected": -645.5384521484375, "loss": 0.2523, "rewards/chosen": 0.298287034034729, "rewards/margins": 3.2993170022964478, "rewards/rejected": -3.0010299682617188, "step": 8173 }, { "epoch": 0.4332547107306602, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30723122.666666668, "logits/rejected": -24170440.0, "logps/chosen": -223.00882975260416, "logps/rejected": -461.8087890625, "loss": 0.2706, "rewards/chosen": 0.2658620278040568, "rewards/margins": 2.6045012871424356, "rewards/rejected": -2.338639259338379, "step": 8174 }, { "epoch": 0.4333077147324623, "grad_norm": 46.25, "kl": 0.9843273162841797, "learning_rate": 5e-07, "logits/chosen": -6496800.0, "logits/rejected": -12613246.4, "logps/chosen": -338.92987060546875, "logps/rejected": -432.5998046875, "loss": 0.2754, "rewards/chosen": 0.17184881369272867, "rewards/margins": 2.546516935030619, "rewards/rejected": -2.3746681213378906, "step": 8175 }, { "epoch": 0.43336071873426446, "grad_norm": 55.5, "kl": 0.6294517517089844, "learning_rate": 5e-07, "logits/chosen": -28131248.0, "logits/rejected": -36978122.666666664, "logps/chosen": -317.06181640625, "logps/rejected": -462.894775390625, "loss": 0.2837, "rewards/chosen": 0.5633814334869385, "rewards/margins": 3.1676523685455322, "rewards/rejected": -2.6042709350585938, "step": 8176 }, { "epoch": 0.4334137227360666, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2233436.0, "logits/rejected": -5053349.333333333, "logps/chosen": -178.4542724609375, "logps/rejected": -137.30677286783853, "loss": 0.3719, "rewards/chosen": 0.16798365116119385, "rewards/margins": 2.0537169377009077, "rewards/rejected": -1.8857332865397136, "step": 8177 }, { "epoch": 0.43346672673786873, "grad_norm": 68.5, "kl": 0.7448239326477051, "learning_rate": 5e-07, "logits/chosen": -18919676.8, "logits/rejected": -16619032.0, "logps/chosen": -265.359130859375, "logps/rejected": -232.7752482096354, "loss": 0.3478, "rewards/chosen": 0.5996069431304931, "rewards/margins": 1.4272387345631916, "rewards/rejected": -0.8276317914326986, "step": 8178 }, { "epoch": 0.43351973073967087, "grad_norm": 40.25, "kl": 2.3117542266845703, "learning_rate": 5e-07, "logits/chosen": -10859753.6, "logits/rejected": -66712389.333333336, "logps/chosen": -600.1787109375, "logps/rejected": -574.8369140625, "loss": 0.2287, "rewards/chosen": 1.280155086517334, "rewards/margins": 4.060278161366781, "rewards/rejected": -2.7801230748494468, "step": 8179 }, { "epoch": 0.43357273474147295, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27290060.8, "logits/rejected": -36493536.0, "logps/chosen": -340.154443359375, "logps/rejected": -185.1480712890625, "loss": 0.2825, "rewards/chosen": 0.8823679924011231, "rewards/margins": 2.0757383346557616, "rewards/rejected": -1.1933703422546387, "step": 8180 }, { "epoch": 0.4336257387432751, "grad_norm": 56.5, "kl": 3.0565357208251953, "learning_rate": 5e-07, "logits/chosen": -22890981.333333332, "logits/rejected": -22244616.0, "logps/chosen": -429.2318522135417, "logps/rejected": -327.02557373046875, "loss": 0.369, "rewards/chosen": 0.8513245582580566, "rewards/margins": 3.445178508758545, "rewards/rejected": -2.5938539505004883, "step": 8181 }, { "epoch": 0.43367874274507723, "grad_norm": 75.5, "kl": 1.5040092468261719, "learning_rate": 5e-07, "logits/chosen": -42320597.333333336, "logits/rejected": -748639.5, "logps/chosen": -459.2443033854167, "logps/rejected": -385.9890441894531, "loss": 0.2853, "rewards/chosen": 0.816619873046875, "rewards/margins": 3.6303277015686035, "rewards/rejected": -2.8137078285217285, "step": 8182 }, { "epoch": 0.43373174674687937, "grad_norm": 45.0, "kl": 0.020033836364746094, "learning_rate": 5e-07, "logits/chosen": -41808960.0, "logits/rejected": -17646998.0, "logps/chosen": -200.49530029296875, "logps/rejected": -199.07101440429688, "loss": 0.3924, "rewards/chosen": -0.20449602603912354, "rewards/margins": 1.0699855089187622, "rewards/rejected": -1.2744815349578857, "step": 8183 }, { "epoch": 0.4337847507486815, "grad_norm": 42.5, "kl": 1.059316635131836, "learning_rate": 5e-07, "logits/chosen": -12374982.4, "logits/rejected": -32063258.666666668, "logps/chosen": -512.139453125, "logps/rejected": -460.9892985026042, "loss": 0.2372, "rewards/chosen": 1.091497802734375, "rewards/margins": 4.39909782409668, "rewards/rejected": -3.3076000213623047, "step": 8184 }, { "epoch": 0.43383775475048364, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3054337.75, "logits/rejected": -39071652.571428575, "logps/chosen": -75.49160766601562, "logps/rejected": -341.61673409598217, "loss": 0.2262, "rewards/chosen": -0.437530517578125, "rewards/margins": 1.4586520876203264, "rewards/rejected": -1.8961826051984514, "step": 8185 }, { "epoch": 0.4338907587522858, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18397845.333333332, "logits/rejected": -18842086.4, "logps/chosen": -342.06298828125, "logps/rejected": -191.343798828125, "loss": 0.2532, "rewards/chosen": 1.0063370068868, "rewards/margins": 2.2237382253011067, "rewards/rejected": -1.2174012184143066, "step": 8186 }, { "epoch": 0.4339437627540879, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67377482.66666667, "logits/rejected": -16780017.6, "logps/chosen": -216.0987752278646, "logps/rejected": -228.5373046875, "loss": 0.2907, "rewards/chosen": -0.025606155395507812, "rewards/margins": 1.7089008331298827, "rewards/rejected": -1.7345069885253905, "step": 8187 }, { "epoch": 0.43399676675589005, "grad_norm": 54.75, "kl": 0.4315338134765625, "learning_rate": 5e-07, "logits/chosen": -8511954.4, "logits/rejected": -16428889.333333334, "logps/chosen": -239.364892578125, "logps/rejected": -191.7406005859375, "loss": 0.4716, "rewards/chosen": -0.2995625019073486, "rewards/margins": 0.7271613597869873, "rewards/rejected": -1.026723861694336, "step": 8188 }, { "epoch": 0.4340497707576922, "grad_norm": 51.25, "kl": 1.0120506286621094, "learning_rate": 5e-07, "logits/chosen": -16539354.666666666, "logits/rejected": -30045590.4, "logps/chosen": -385.6966959635417, "logps/rejected": -285.3443359375, "loss": 0.2281, "rewards/chosen": 1.2229350407918294, "rewards/margins": 3.0261458714803062, "rewards/rejected": -1.8032108306884767, "step": 8189 }, { "epoch": 0.4341027747594943, "grad_norm": 49.75, "kl": 2.059377670288086, "learning_rate": 5e-07, "logits/chosen": -9554280.8, "logits/rejected": -16835836.0, "logps/chosen": -293.14140625, "logps/rejected": -204.21455891927084, "loss": 0.3175, "rewards/chosen": 0.6787249565124511, "rewards/margins": 2.5289421399434406, "rewards/rejected": -1.8502171834309895, "step": 8190 }, { "epoch": 0.43415577876129646, "grad_norm": 55.5, "kl": 1.7791047096252441, "learning_rate": 5e-07, "logits/chosen": -33881164.8, "logits/rejected": -16047373.333333334, "logps/chosen": -345.0458984375, "logps/rejected": -283.1459554036458, "loss": 0.3537, "rewards/chosen": 0.5915363311767579, "rewards/margins": 2.3074817339579266, "rewards/rejected": -1.7159454027811687, "step": 8191 }, { "epoch": 0.4342087827630986, "grad_norm": 41.75, "kl": 0.6581821441650391, "learning_rate": 5e-07, "logits/chosen": -32006476.0, "logits/rejected": -38383480.0, "logps/chosen": -187.23037719726562, "logps/rejected": -465.577392578125, "loss": 0.313, "rewards/chosen": 0.35439246892929077, "rewards/margins": 3.298381507396698, "rewards/rejected": -2.9439890384674072, "step": 8192 }, { "epoch": 0.43426178676490074, "grad_norm": 50.25, "kl": 0.13291168212890625, "learning_rate": 5e-07, "logits/chosen": -66996176.0, "logits/rejected": -48643104.0, "logps/chosen": -492.5113830566406, "logps/rejected": -415.9140319824219, "loss": 0.2655, "rewards/chosen": 0.13597488403320312, "rewards/margins": 3.516388416290283, "rewards/rejected": -3.38041353225708, "step": 8193 }, { "epoch": 0.4343147907667029, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29013172.0, "logits/rejected": -83471072.0, "logps/chosen": -328.364501953125, "logps/rejected": -451.8691711425781, "loss": 0.3186, "rewards/chosen": 0.19337385892868042, "rewards/margins": 2.0967854857444763, "rewards/rejected": -1.903411626815796, "step": 8194 }, { "epoch": 0.434367794768505, "grad_norm": 52.0, "kl": 0.02556610107421875, "learning_rate": 5e-07, "logits/chosen": -53947384.0, "logits/rejected": 6038660.0, "logps/chosen": -194.7038116455078, "logps/rejected": -338.73191324869794, "loss": 0.2703, "rewards/chosen": -0.33429011702537537, "rewards/margins": 1.5980505049228668, "rewards/rejected": -1.9323406219482422, "step": 8195 }, { "epoch": 0.43442079877030715, "grad_norm": 54.75, "kl": 0.8549652099609375, "learning_rate": 5e-07, "logits/chosen": -33908505.6, "logits/rejected": -58446512.0, "logps/chosen": -227.105517578125, "logps/rejected": -190.5757853190104, "loss": 0.3768, "rewards/chosen": 0.01874786913394928, "rewards/margins": 2.1212347974379857, "rewards/rejected": -2.1024869283040366, "step": 8196 }, { "epoch": 0.4344738027721093, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41631904.0, "logits/rejected": -21277162.0, "logps/chosen": -343.7015686035156, "logps/rejected": -313.352294921875, "loss": 0.2573, "rewards/chosen": 0.33988913893699646, "rewards/margins": 3.8104699552059174, "rewards/rejected": -3.470580816268921, "step": 8197 }, { "epoch": 0.4345268067739114, "grad_norm": 40.5, "kl": 2.1869821548461914, "learning_rate": 5e-07, "logits/chosen": -71329696.0, "logits/rejected": -655650.0, "logps/chosen": -271.7357177734375, "logps/rejected": -674.8152465820312, "loss": 0.2727, "rewards/chosen": 0.2647551894187927, "rewards/margins": 4.507156789302826, "rewards/rejected": -4.242401599884033, "step": 8198 }, { "epoch": 0.43457981077571356, "grad_norm": 47.75, "kl": 2.048309326171875, "learning_rate": 5e-07, "logits/chosen": 2133979.6666666665, "logits/rejected": -30478198.4, "logps/chosen": -194.39644368489584, "logps/rejected": -330.0390869140625, "loss": 0.2734, "rewards/chosen": 0.770124594370524, "rewards/margins": 2.4733930269877114, "rewards/rejected": -1.7032684326171874, "step": 8199 }, { "epoch": 0.4346328147775157, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11455242.666666666, "logits/rejected": -34316995.2, "logps/chosen": -214.31414794921875, "logps/rejected": -304.6647705078125, "loss": 0.1904, "rewards/chosen": 0.771488348642985, "rewards/margins": 3.011543814341227, "rewards/rejected": -2.240055465698242, "step": 8200 }, { "epoch": 0.43468581877931783, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48989033.6, "logits/rejected": -18824357.333333332, "logps/chosen": -360.558251953125, "logps/rejected": -466.8964029947917, "loss": 0.3639, "rewards/chosen": 0.0032021760940551757, "rewards/margins": 3.035280283292135, "rewards/rejected": -3.0320781071980796, "step": 8201 }, { "epoch": 0.43473882278111997, "grad_norm": 44.25, "kl": 1.5182361602783203, "learning_rate": 5e-07, "logits/chosen": -16975372.0, "logits/rejected": -47283744.0, "logps/chosen": -229.20147705078125, "logps/rejected": -373.79498291015625, "loss": 0.2838, "rewards/chosen": 0.34991177916526794, "rewards/margins": 3.1593759953975677, "rewards/rejected": -2.8094642162323, "step": 8202 }, { "epoch": 0.4347918267829221, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38337436.0, "logits/rejected": 1262348.5, "logps/chosen": -301.2682800292969, "logps/rejected": -161.8317413330078, "loss": 0.2941, "rewards/chosen": 0.7533141374588013, "rewards/margins": 1.9706851243972778, "rewards/rejected": -1.2173709869384766, "step": 8203 }, { "epoch": 0.43484483078472425, "grad_norm": 32.75, "kl": 0.7735700607299805, "learning_rate": 5e-07, "logits/chosen": -11537313.333333334, "logits/rejected": -17353897.6, "logps/chosen": -364.5384114583333, "logps/rejected": -365.031689453125, "loss": 0.1761, "rewards/chosen": 1.380533218383789, "rewards/margins": 4.589204406738281, "rewards/rejected": -3.2086711883544923, "step": 8204 }, { "epoch": 0.4348978347865264, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26814349.333333332, "logits/rejected": -88248652.8, "logps/chosen": -221.06083170572916, "logps/rejected": -349.0394775390625, "loss": 0.2124, "rewards/chosen": 0.36385174592336017, "rewards/margins": 3.053148849805196, "rewards/rejected": -2.689297103881836, "step": 8205 }, { "epoch": 0.4349508387883285, "grad_norm": 63.75, "kl": 0.38864707946777344, "learning_rate": 5e-07, "logits/chosen": -8289202.666666667, "logits/rejected": -18445694.4, "logps/chosen": -655.63134765625, "logps/rejected": -288.0138916015625, "loss": 0.276, "rewards/chosen": 0.20189642906188965, "rewards/margins": 2.02411093711853, "rewards/rejected": -1.8222145080566405, "step": 8206 }, { "epoch": 0.43500384279013066, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20248802.0, "logits/rejected": -41008636.0, "logps/chosen": -257.1337890625, "logps/rejected": -332.2550354003906, "loss": 0.3612, "rewards/chosen": 0.013080216944217682, "rewards/margins": 1.4417859092354774, "rewards/rejected": -1.4287056922912598, "step": 8207 }, { "epoch": 0.4350568467919328, "grad_norm": 51.0, "kl": 0.08195304870605469, "learning_rate": 5e-07, "logits/chosen": -76578096.0, "logits/rejected": -24987864.0, "logps/chosen": -437.038330078125, "logps/rejected": -197.2379608154297, "loss": 0.254, "rewards/chosen": 0.4946111738681793, "rewards/margins": 2.6921290457248688, "rewards/rejected": -2.1975178718566895, "step": 8208 }, { "epoch": 0.43510985079373493, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23202368.0, "logits/rejected": -3226326.0, "logps/chosen": -201.04969787597656, "logps/rejected": -463.5033365885417, "loss": 0.1677, "rewards/chosen": 0.23164846003055573, "rewards/margins": 2.966644619901975, "rewards/rejected": -2.7349961598714194, "step": 8209 }, { "epoch": 0.43516285479553707, "grad_norm": 70.5, "kl": 0.9438009262084961, "learning_rate": 5e-07, "logits/chosen": -22078505.6, "logits/rejected": 4173365.0, "logps/chosen": -240.2685791015625, "logps/rejected": -193.2791748046875, "loss": 0.4373, "rewards/chosen": 0.26715545654296874, "rewards/margins": 0.7711171785990396, "rewards/rejected": -0.5039617220560709, "step": 8210 }, { "epoch": 0.4352158587973392, "grad_norm": 54.75, "kl": 1.5359058380126953, "learning_rate": 5e-07, "logits/chosen": -9833080.0, "logits/rejected": -38178476.8, "logps/chosen": -421.9088134765625, "logps/rejected": -349.715283203125, "loss": 0.2319, "rewards/chosen": 0.2772144079208374, "rewards/margins": 2.8573760747909547, "rewards/rejected": -2.5801616668701173, "step": 8211 }, { "epoch": 0.43526886279914134, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62251718.4, "logits/rejected": -19734482.666666668, "logps/chosen": -375.214892578125, "logps/rejected": -123.77734375, "loss": 0.3897, "rewards/chosen": 0.3034167528152466, "rewards/margins": 1.124753975868225, "rewards/rejected": -0.8213372230529785, "step": 8212 }, { "epoch": 0.4353218668009435, "grad_norm": 47.75, "kl": 0.6962165832519531, "learning_rate": 5e-07, "logits/chosen": -17591166.4, "logits/rejected": -837490.6666666666, "logps/chosen": -262.655810546875, "logps/rejected": -272.7227376302083, "loss": 0.2919, "rewards/chosen": 0.661229419708252, "rewards/margins": 2.554282792409261, "rewards/rejected": -1.893053372701009, "step": 8213 }, { "epoch": 0.4353748708027456, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90333000.0, "logits/rejected": -16919538.0, "logps/chosen": -336.87933349609375, "logps/rejected": -286.9962463378906, "loss": 0.2227, "rewards/chosen": 0.8831851482391357, "rewards/margins": 3.183870792388916, "rewards/rejected": -2.3006856441497803, "step": 8214 }, { "epoch": 0.43542787480454775, "grad_norm": 50.75, "kl": 0.1523151397705078, "learning_rate": 5e-07, "logits/chosen": -32292981.333333332, "logits/rejected": -28404160.0, "logps/chosen": -242.4771525065104, "logps/rejected": -148.44476318359375, "loss": 0.4212, "rewards/chosen": 0.09669376413027446, "rewards/margins": 1.5531094173590343, "rewards/rejected": -1.4564156532287598, "step": 8215 }, { "epoch": 0.4354808788063499, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1364811.5, "logits/rejected": -33790435.2, "logps/chosen": -38.23590342203776, "logps/rejected": -410.44580078125, "loss": 0.2661, "rewards/chosen": 0.0555515984694163, "rewards/margins": 2.3181294182936347, "rewards/rejected": -2.2625778198242186, "step": 8216 }, { "epoch": 0.43553388280815203, "grad_norm": 64.0, "kl": 1.7383155822753906, "learning_rate": 5e-07, "logits/chosen": -8280758.4, "logits/rejected": 1161607.6666666667, "logps/chosen": -285.6841064453125, "logps/rejected": -257.3257649739583, "loss": 0.2974, "rewards/chosen": 0.8904294013977051, "rewards/margins": 2.3907784461975097, "rewards/rejected": -1.5003490447998047, "step": 8217 }, { "epoch": 0.43558688680995417, "grad_norm": 53.25, "kl": 2.247504234313965, "learning_rate": 5e-07, "logits/chosen": -16600684.0, "logits/rejected": -28298822.4, "logps/chosen": -389.9748942057292, "logps/rejected": -248.1405029296875, "loss": 0.3155, "rewards/chosen": 0.2930934230486552, "rewards/margins": 1.7362396518389385, "rewards/rejected": -1.4431462287902832, "step": 8218 }, { "epoch": 0.4356398908117563, "grad_norm": 49.0, "kl": 1.6385421752929688, "learning_rate": 5e-07, "logits/chosen": -26500952.0, "logits/rejected": -81099264.0, "logps/chosen": -579.8162109375, "logps/rejected": -378.9820963541667, "loss": 0.2605, "rewards/chosen": 0.9790134429931641, "rewards/margins": 2.6919175783793134, "rewards/rejected": -1.7129041353861492, "step": 8219 }, { "epoch": 0.43569289481355844, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14439788.8, "logits/rejected": -2376546.6666666665, "logps/chosen": -232.727490234375, "logps/rejected": -275.447509765625, "loss": 0.3098, "rewards/chosen": 0.24167890548706056, "rewards/margins": 2.869400691986084, "rewards/rejected": -2.6277217864990234, "step": 8220 }, { "epoch": 0.4357458988153606, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18551742.0, "logits/rejected": -14374707.0, "logps/chosen": -334.027099609375, "logps/rejected": -194.9891357421875, "loss": 0.2343, "rewards/chosen": 1.0504021644592285, "rewards/margins": 2.9023523330688477, "rewards/rejected": -1.8519501686096191, "step": 8221 }, { "epoch": 0.4357989028171627, "grad_norm": 49.75, "kl": 0.9464740753173828, "learning_rate": 5e-07, "logits/chosen": -9609682.0, "logits/rejected": 156058764.8, "logps/chosen": -247.51444498697916, "logps/rejected": -211.031396484375, "loss": 0.3199, "rewards/chosen": 0.02494758367538452, "rewards/margins": 1.8779211401939393, "rewards/rejected": -1.8529735565185548, "step": 8222 }, { "epoch": 0.43585190681896485, "grad_norm": 59.5, "kl": 3.422290802001953, "learning_rate": 5e-07, "logits/chosen": 6310610.0, "logits/rejected": -38084506.666666664, "logps/chosen": -302.32587890625, "logps/rejected": -264.6483154296875, "loss": 0.3255, "rewards/chosen": 1.02362003326416, "rewards/margins": 2.561095937093099, "rewards/rejected": -1.5374759038289387, "step": 8223 }, { "epoch": 0.435904910820767, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47976132.0, "logits/rejected": 56674778.666666664, "logps/chosen": -373.2825012207031, "logps/rejected": -285.36834716796875, "loss": 0.2569, "rewards/chosen": 0.2837676703929901, "rewards/margins": 1.8382749259471893, "rewards/rejected": -1.5545072555541992, "step": 8224 }, { "epoch": 0.4359579148225691, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38235388.0, "logits/rejected": -11427780.0, "logps/chosen": -152.8109893798828, "logps/rejected": -242.29782104492188, "loss": 0.2924, "rewards/chosen": 0.3728359341621399, "rewards/margins": 2.2071803212165833, "rewards/rejected": -1.8343443870544434, "step": 8225 }, { "epoch": 0.43601091882437126, "grad_norm": 40.0, "kl": 0.5642948150634766, "learning_rate": 5e-07, "logits/chosen": -19442265.333333332, "logits/rejected": -16093728.0, "logps/chosen": -175.0986124674479, "logps/rejected": -358.00380859375, "loss": 0.2174, "rewards/chosen": 0.395777424176534, "rewards/margins": 3.3974043925603232, "rewards/rejected": -3.001626968383789, "step": 8226 }, { "epoch": 0.4360639228261734, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19671568.0, "logits/rejected": -26768018.666666668, "logps/chosen": -423.4634704589844, "logps/rejected": -349.956298828125, "loss": 0.25, "rewards/chosen": 0.3449516296386719, "rewards/margins": 1.8539204597473145, "rewards/rejected": -1.5089688301086426, "step": 8227 }, { "epoch": 0.43611692682797554, "grad_norm": 74.0, "kl": 0.46201515197753906, "learning_rate": 5e-07, "logits/chosen": 2509539.6, "logits/rejected": -6183212.666666667, "logps/chosen": -434.97744140625, "logps/rejected": -209.3946533203125, "loss": 0.3322, "rewards/chosen": 0.2375415086746216, "rewards/margins": 2.1373522996902468, "rewards/rejected": -1.899810791015625, "step": 8228 }, { "epoch": 0.4361699308297777, "grad_norm": 40.0, "kl": 0.0957479476928711, "learning_rate": 5e-07, "logits/chosen": -39162584.0, "logits/rejected": -87350144.0, "logps/chosen": -326.6966247558594, "logps/rejected": -467.7392578125, "loss": 0.2712, "rewards/chosen": 0.33601391315460205, "rewards/margins": 2.5714937448501587, "rewards/rejected": -2.2354798316955566, "step": 8229 }, { "epoch": 0.4362229348315798, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54734456.0, "logits/rejected": -91170250.66666667, "logps/chosen": -358.19805908203125, "logps/rejected": -439.2535400390625, "loss": 0.1415, "rewards/chosen": 1.490535020828247, "rewards/margins": 3.7966345945994058, "rewards/rejected": -2.3060995737711587, "step": 8230 }, { "epoch": 0.4362759388333819, "grad_norm": 61.75, "kl": 0.5015792846679688, "learning_rate": 5e-07, "logits/chosen": -78947312.0, "logits/rejected": -16402740.0, "logps/chosen": -552.8839721679688, "logps/rejected": -193.78311157226562, "loss": 0.2984, "rewards/chosen": 0.5666122436523438, "rewards/margins": 2.120962381362915, "rewards/rejected": -1.5543501377105713, "step": 8231 }, { "epoch": 0.43632894283518403, "grad_norm": 47.25, "kl": 0.16887664794921875, "learning_rate": 5e-07, "logits/chosen": -3994373.75, "logits/rejected": -40968496.0, "logps/chosen": -137.40541076660156, "logps/rejected": -550.2733154296875, "loss": 0.2728, "rewards/chosen": -0.022432561963796616, "rewards/margins": 3.5498240031301975, "rewards/rejected": -3.572256565093994, "step": 8232 }, { "epoch": 0.43638194683698617, "grad_norm": 53.0, "kl": 0.06521034240722656, "learning_rate": 5e-07, "logits/chosen": -32147796.57142857, "logits/rejected": -1467100.5, "logps/chosen": -303.7374790736607, "logps/rejected": -39.8060302734375, "loss": 0.3783, "rewards/chosen": 0.47402497700282503, "rewards/margins": 2.3207599776131764, "rewards/rejected": -1.8467350006103516, "step": 8233 }, { "epoch": 0.4364349508387883, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16014059.2, "logits/rejected": -1505012.0, "logps/chosen": -198.5081787109375, "logps/rejected": -412.496337890625, "loss": 0.2702, "rewards/chosen": 0.8910154342651367, "rewards/margins": 2.33787161509196, "rewards/rejected": -1.446856180826823, "step": 8234 }, { "epoch": 0.43648795484059044, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4479249.333333333, "logits/rejected": -28499859.2, "logps/chosen": -59.52010599772135, "logps/rejected": -518.299609375, "loss": 0.2572, "rewards/chosen": 0.18720702330271402, "rewards/margins": 2.6449153820673623, "rewards/rejected": -2.4577083587646484, "step": 8235 }, { "epoch": 0.4365409588423926, "grad_norm": 63.25, "kl": 1.8088188171386719, "learning_rate": 5e-07, "logits/chosen": 6860984.0, "logits/rejected": -10659504.8, "logps/chosen": -459.0763346354167, "logps/rejected": -276.5628173828125, "loss": 0.3039, "rewards/chosen": 0.3694353898366292, "rewards/margins": 1.9661995728810628, "rewards/rejected": -1.5967641830444337, "step": 8236 }, { "epoch": 0.4365939628441947, "grad_norm": 61.0, "kl": 0.6648950576782227, "learning_rate": 5e-07, "logits/chosen": -81733066.66666667, "logits/rejected": 1652088.0, "logps/chosen": -198.4453125, "logps/rejected": -516.090087890625, "loss": 0.4152, "rewards/chosen": 0.10647830367088318, "rewards/margins": 1.7859934270381927, "rewards/rejected": -1.6795151233673096, "step": 8237 }, { "epoch": 0.43664696684599685, "grad_norm": 41.25, "kl": 1.1710796356201172, "learning_rate": 5e-07, "logits/chosen": -10039132.0, "logits/rejected": -31164972.0, "logps/chosen": -222.12814331054688, "logps/rejected": -465.69744873046875, "loss": 0.2616, "rewards/chosen": 0.7078326940536499, "rewards/margins": 2.8673092126846313, "rewards/rejected": -2.1594765186309814, "step": 8238 }, { "epoch": 0.436699970847799, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34116976.0, "logits/rejected": -25255104.0, "logps/chosen": -318.5529479980469, "logps/rejected": -386.4056701660156, "loss": 0.2666, "rewards/chosen": 0.4752517640590668, "rewards/margins": 2.676553338766098, "rewards/rejected": -2.2013015747070312, "step": 8239 }, { "epoch": 0.4367529748496011, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6573439.2, "logits/rejected": 25224042.666666668, "logps/chosen": -158.5024169921875, "logps/rejected": -331.9878336588542, "loss": 0.3258, "rewards/chosen": 0.15550918579101564, "rewards/margins": 2.8527010599772136, "rewards/rejected": -2.6971918741861978, "step": 8240 }, { "epoch": 0.43680597885140326, "grad_norm": 42.5, "kl": 1.5429706573486328, "learning_rate": 5e-07, "logits/chosen": 1305426.1, "logits/rejected": -10366373.333333334, "logps/chosen": -116.76905517578125, "logps/rejected": -230.20904541015625, "loss": 0.2853, "rewards/chosen": 0.8056995391845703, "rewards/margins": 2.360308837890625, "rewards/rejected": -1.5546092987060547, "step": 8241 }, { "epoch": 0.4368589828532054, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28607430.4, "logits/rejected": -19668378.666666668, "logps/chosen": -586.09912109375, "logps/rejected": -235.4200439453125, "loss": 0.2806, "rewards/chosen": 0.8797436714172363, "rewards/margins": 2.6539374351501466, "rewards/rejected": -1.7741937637329102, "step": 8242 }, { "epoch": 0.43691198685500754, "grad_norm": 32.0, "kl": 0.3585700988769531, "learning_rate": 5e-07, "logits/chosen": 4171617.0, "logits/rejected": -18053126.0, "logps/chosen": -83.27510070800781, "logps/rejected": -226.28390502929688, "loss": 0.3141, "rewards/chosen": 0.4914916753768921, "rewards/margins": 1.9429517984390259, "rewards/rejected": -1.4514601230621338, "step": 8243 }, { "epoch": 0.4369649908568097, "grad_norm": 57.0, "kl": 3.6201324462890625, "learning_rate": 5e-07, "logits/chosen": -18232848.0, "logits/rejected": 19320673.333333332, "logps/chosen": -385.251611328125, "logps/rejected": -535.4898681640625, "loss": 0.4234, "rewards/chosen": 0.20815551280975342, "rewards/margins": 2.6143147548039756, "rewards/rejected": -2.406159241994222, "step": 8244 }, { "epoch": 0.4370179948586118, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37823973.333333336, "logits/rejected": -10035658.4, "logps/chosen": -113.13582356770833, "logps/rejected": -365.387646484375, "loss": 0.3121, "rewards/chosen": 0.07741999626159668, "rewards/margins": 2.0573312282562255, "rewards/rejected": -1.9799112319946288, "step": 8245 }, { "epoch": 0.43707099886041395, "grad_norm": 63.75, "kl": 3.1372451782226562, "learning_rate": 5e-07, "logits/chosen": -31844137.6, "logits/rejected": -40338522.666666664, "logps/chosen": -613.7146484375, "logps/rejected": -383.371337890625, "loss": 0.3465, "rewards/chosen": 0.6543781757354736, "rewards/margins": 1.917553981145223, "rewards/rejected": -1.2631758054097493, "step": 8246 }, { "epoch": 0.4371240028622161, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47732764.0, "logits/rejected": -23282221.333333332, "logps/chosen": -333.6119384765625, "logps/rejected": -367.9699300130208, "loss": 0.2477, "rewards/chosen": -0.09457092732191086, "rewards/margins": 2.0875605220595994, "rewards/rejected": -2.1821314493815103, "step": 8247 }, { "epoch": 0.4371770068640182, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6425828.0, "logits/rejected": -27788320.0, "logps/chosen": -214.16677856445312, "logps/rejected": -406.0557338169643, "loss": 0.0944, "rewards/chosen": 1.2367355823516846, "rewards/margins": 3.989581891468593, "rewards/rejected": -2.7528463091169084, "step": 8248 }, { "epoch": 0.43723001086582036, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84193424.0, "logits/rejected": -25095768.0, "logps/chosen": -648.0430297851562, "logps/rejected": -326.2952473958333, "loss": 0.2844, "rewards/chosen": -0.31486818194389343, "rewards/margins": 1.2955111960570018, "rewards/rejected": -1.6103793780008953, "step": 8249 }, { "epoch": 0.4372830148676225, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8174830.0, "logits/rejected": -12407704.0, "logps/chosen": -351.1760559082031, "logps/rejected": -223.28387451171875, "loss": 0.2547, "rewards/chosen": 0.03397674858570099, "rewards/margins": 1.812186911702156, "rewards/rejected": -1.778210163116455, "step": 8250 }, { "epoch": 0.43733601886942464, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66810888.0, "logits/rejected": -19697100.0, "logps/chosen": -331.9747619628906, "logps/rejected": -205.66615295410156, "loss": 0.2753, "rewards/chosen": 0.32286757230758667, "rewards/margins": 2.23523873090744, "rewards/rejected": -1.9123711585998535, "step": 8251 }, { "epoch": 0.4373890228712268, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86647320.0, "logits/rejected": -17616318.666666668, "logps/chosen": -302.97119140625, "logps/rejected": -273.38934326171875, "loss": 0.2564, "rewards/chosen": 0.6624031066894531, "rewards/margins": 2.0001745223999023, "rewards/rejected": -1.3377714157104492, "step": 8252 }, { "epoch": 0.4374420268730289, "grad_norm": 63.5, "kl": 6.109275817871094, "learning_rate": 5e-07, "logits/chosen": -48629769.6, "logits/rejected": -17599954.666666668, "logps/chosen": -652.106005859375, "logps/rejected": -489.1275634765625, "loss": 0.2507, "rewards/chosen": 1.524652099609375, "rewards/margins": 3.7598318417867027, "rewards/rejected": -2.2351797421773276, "step": 8253 }, { "epoch": 0.43749503087483105, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -5745457.0, "logps/rejected": -288.9503173828125, "loss": 0.1607, "rewards/rejected": -2.0361032485961914, "step": 8254 }, { "epoch": 0.4375480348766332, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28008842.0, "logits/rejected": -20348484.0, "logps/chosen": -208.8341827392578, "logps/rejected": -340.1282552083333, "loss": 0.3145, "rewards/chosen": 0.3502868413925171, "rewards/margins": 1.6774259805679321, "rewards/rejected": -1.327139139175415, "step": 8255 }, { "epoch": 0.4376010388784353, "grad_norm": 44.25, "kl": 0.782073974609375, "learning_rate": 5e-07, "logits/chosen": -39666229.333333336, "logits/rejected": -7640779.2, "logps/chosen": -536.2363688151041, "logps/rejected": -430.18193359375, "loss": 0.1807, "rewards/chosen": 0.8621500333150228, "rewards/margins": 3.6587698300679525, "rewards/rejected": -2.7966197967529296, "step": 8256 }, { "epoch": 0.43765404288023746, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32573980.0, "logits/rejected": -28825474.0, "logps/chosen": -260.2957458496094, "logps/rejected": -445.6787109375, "loss": 0.3272, "rewards/chosen": 0.06218855082988739, "rewards/margins": 1.7060080021619797, "rewards/rejected": -1.6438194513320923, "step": 8257 }, { "epoch": 0.4377070468820396, "grad_norm": 56.5, "kl": 0.6334218978881836, "learning_rate": 5e-07, "logits/chosen": -15658101.333333334, "logits/rejected": -6511363.0, "logps/chosen": -192.3289998372396, "logps/rejected": -493.906982421875, "loss": 0.4031, "rewards/chosen": 0.04519231120745341, "rewards/margins": 2.7566121319929757, "rewards/rejected": -2.7114198207855225, "step": 8258 }, { "epoch": 0.43776005088384173, "grad_norm": 33.75, "kl": 0.12788105010986328, "learning_rate": 5e-07, "logits/chosen": -3597598.0, "logits/rejected": -30763916.8, "logps/chosen": -111.63235473632812, "logps/rejected": -291.409765625, "loss": 0.2579, "rewards/chosen": 0.2890204191207886, "rewards/margins": 2.3328837156295776, "rewards/rejected": -2.043863296508789, "step": 8259 }, { "epoch": 0.43781305488564387, "grad_norm": 58.5, "kl": 0.6231346130371094, "learning_rate": 5e-07, "logits/chosen": -28398304.0, "logits/rejected": -22132630.0, "logps/chosen": -238.8738250732422, "logps/rejected": -342.20123291015625, "loss": 0.3208, "rewards/chosen": 0.19667893648147583, "rewards/margins": 1.835109531879425, "rewards/rejected": -1.6384305953979492, "step": 8260 }, { "epoch": 0.437866058887446, "grad_norm": 55.75, "kl": 2.388103485107422, "learning_rate": 5e-07, "logits/chosen": -30334304.0, "logits/rejected": 2146870.0, "logps/chosen": -394.55859375, "logps/rejected": -63.809776306152344, "loss": 0.4029, "rewards/chosen": 0.6209928592046102, "rewards/margins": 1.7285840113957724, "rewards/rejected": -1.107591152191162, "step": 8261 }, { "epoch": 0.43791906288924815, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5363388.5, "logits/rejected": -6711605.0, "logps/chosen": -425.79534912109375, "logps/rejected": -135.79164123535156, "loss": 0.2649, "rewards/chosen": 0.9359556436538696, "rewards/margins": 2.2157247066497803, "rewards/rejected": -1.2797690629959106, "step": 8262 }, { "epoch": 0.4379720668910503, "grad_norm": 56.25, "kl": 3.0274200439453125, "learning_rate": 5e-07, "logits/chosen": -35877384.0, "logits/rejected": -10344281.0, "logps/chosen": -362.408935546875, "logps/rejected": -285.3985595703125, "loss": 0.4063, "rewards/chosen": 0.32458831866582233, "rewards/margins": 1.4300967653592427, "rewards/rejected": -1.1055084466934204, "step": 8263 }, { "epoch": 0.4380250708928524, "grad_norm": 52.5, "kl": 0.6064023971557617, "learning_rate": 5e-07, "logits/chosen": 12308918.666666666, "logits/rejected": -44200288.0, "logps/chosen": -247.77069091796875, "logps/rejected": -256.5562438964844, "loss": 0.3615, "rewards/chosen": 0.2901901404062907, "rewards/margins": 2.669992129007975, "rewards/rejected": -2.3798019886016846, "step": 8264 }, { "epoch": 0.43807807489465456, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7557.25, "logits/rejected": -23415456.0, "logps/chosen": -60.494380950927734, "logps/rejected": -281.3042805989583, "loss": 0.1515, "rewards/chosen": 0.9599809646606445, "rewards/margins": 3.3106606801350913, "rewards/rejected": -2.3506797154744468, "step": 8265 }, { "epoch": 0.4381310788964567, "grad_norm": 42.75, "kl": 0.20955276489257812, "learning_rate": 5e-07, "logits/chosen": -15716987.0, "logits/rejected": -47654640.0, "logps/chosen": -238.3333740234375, "logps/rejected": -449.45440673828125, "loss": 0.3208, "rewards/chosen": 0.06348470598459244, "rewards/margins": 2.66830400377512, "rewards/rejected": -2.6048192977905273, "step": 8266 }, { "epoch": 0.43818408289825883, "grad_norm": 57.5, "kl": 0.720245361328125, "learning_rate": 5e-07, "logits/chosen": -2754313.5, "logits/rejected": -36763264.0, "logps/chosen": -504.7408752441406, "logps/rejected": -590.967529296875, "loss": 0.2979, "rewards/chosen": 0.19920337200164795, "rewards/margins": 2.782156825065613, "rewards/rejected": -2.582953453063965, "step": 8267 }, { "epoch": 0.43823708690006097, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52748816.0, "logits/rejected": -42749581.333333336, "logps/chosen": -213.04702758789062, "logps/rejected": -206.48331705729166, "loss": 0.1868, "rewards/chosen": 0.8984135985374451, "rewards/margins": 3.3409215013186135, "rewards/rejected": -2.4425079027811685, "step": 8268 }, { "epoch": 0.4382900909018631, "grad_norm": 54.5, "kl": 1.3457403182983398, "learning_rate": 5e-07, "logits/chosen": -6097408.666666667, "logits/rejected": -30190224.0, "logps/chosen": -182.22784423828125, "logps/rejected": -228.19036865234375, "loss": 0.4367, "rewards/chosen": 0.2718987266222636, "rewards/margins": 1.2953240672747295, "rewards/rejected": -1.0234253406524658, "step": 8269 }, { "epoch": 0.43834309490366524, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64519606.85714286, "logits/rejected": -20328008.0, "logps/chosen": -173.9256591796875, "logps/rejected": -108.34188842773438, "loss": 0.4583, "rewards/chosen": 0.14017475502831594, "rewards/margins": 0.5860289420400346, "rewards/rejected": -0.44585418701171875, "step": 8270 }, { "epoch": 0.4383960989054674, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38737964.8, "logits/rejected": -35287234.666666664, "logps/chosen": -288.786083984375, "logps/rejected": -284.69919840494794, "loss": 0.3751, "rewards/chosen": 0.10171172618865967, "rewards/margins": 1.806208109855652, "rewards/rejected": -1.7044963836669922, "step": 8271 }, { "epoch": 0.4384491029072695, "grad_norm": 27.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7132477.0, "logits/rejected": -25651768.0, "logps/chosen": -31.395971298217773, "logps/rejected": -241.87869262695312, "loss": 0.2681, "rewards/chosen": 0.2439049333333969, "rewards/margins": 2.74411903321743, "rewards/rejected": -2.500214099884033, "step": 8272 }, { "epoch": 0.43850210690907165, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50120992.0, "logits/rejected": -48154483.2, "logps/chosen": -496.5826416015625, "logps/rejected": -400.72578125, "loss": 0.2931, "rewards/chosen": 0.29570438464482623, "rewards/margins": 2.000979538758596, "rewards/rejected": -1.7052751541137696, "step": 8273 }, { "epoch": 0.4385551109108738, "grad_norm": 41.5, "kl": 0.6418819427490234, "learning_rate": 5e-07, "logits/chosen": 21806421.333333332, "logits/rejected": -35551155.2, "logps/chosen": -293.97833251953125, "logps/rejected": -514.131640625, "loss": 0.1992, "rewards/chosen": 0.5294471979141235, "rewards/margins": 3.3315258264541625, "rewards/rejected": -2.802078628540039, "step": 8274 }, { "epoch": 0.43860811491267593, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59948826.666666664, "logits/rejected": -28225582.0, "logps/chosen": -400.1153564453125, "logps/rejected": -295.50958251953125, "loss": 0.3027, "rewards/chosen": 0.585215171178182, "rewards/margins": 2.6366785367329917, "rewards/rejected": -2.0514633655548096, "step": 8275 }, { "epoch": 0.43866111891447807, "grad_norm": 50.25, "kl": 0.7664661407470703, "learning_rate": 5e-07, "logits/chosen": -25122478.0, "logits/rejected": -42657984.0, "logps/chosen": -309.3670349121094, "logps/rejected": -295.5386047363281, "loss": 0.3249, "rewards/chosen": -0.12841439247131348, "rewards/margins": 2.1201276779174805, "rewards/rejected": -2.248542070388794, "step": 8276 }, { "epoch": 0.4387141229162802, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46092192.0, "logits/rejected": -20916185.6, "logps/chosen": -291.96579996744794, "logps/rejected": -373.052001953125, "loss": 0.2827, "rewards/chosen": 0.4340779781341553, "rewards/margins": 2.121228742599487, "rewards/rejected": -1.687150764465332, "step": 8277 }, { "epoch": 0.43876712691808234, "grad_norm": 50.25, "kl": 2.1668529510498047, "learning_rate": 5e-07, "logits/chosen": -43683416.0, "logits/rejected": -48718412.0, "logps/chosen": -449.5261535644531, "logps/rejected": -355.0840759277344, "loss": 0.2752, "rewards/chosen": 0.6408277750015259, "rewards/margins": 2.915209412574768, "rewards/rejected": -2.274381637573242, "step": 8278 }, { "epoch": 0.4388201309198845, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38908592.0, "logits/rejected": -54525904.0, "logps/chosen": -229.9549072265625, "logps/rejected": -620.367919921875, "loss": 0.2986, "rewards/chosen": 0.394610595703125, "rewards/margins": 3.433551025390625, "rewards/rejected": -3.0389404296875, "step": 8279 }, { "epoch": 0.4388731349216866, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10231024.0, "logits/rejected": -59691.0, "logps/chosen": -218.557763671875, "logps/rejected": -103.91098022460938, "loss": 0.3284, "rewards/chosen": 0.4813532829284668, "rewards/margins": 2.1692161560058594, "rewards/rejected": -1.6878628730773926, "step": 8280 }, { "epoch": 0.43892613892348875, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57406428.0, "logits/rejected": -41292728.0, "logps/chosen": -252.83567810058594, "logps/rejected": -176.286376953125, "loss": 0.3284, "rewards/chosen": 0.28963175415992737, "rewards/margins": 1.67660191655159, "rewards/rejected": -1.3869701623916626, "step": 8281 }, { "epoch": 0.43897914292529083, "grad_norm": 51.0, "kl": 0.007305145263671875, "learning_rate": 5e-07, "logits/chosen": -32068078.0, "logits/rejected": -6004508.0, "logps/chosen": -395.93194580078125, "logps/rejected": -89.0574951171875, "loss": 0.2605, "rewards/chosen": 0.7210540771484375, "rewards/margins": 1.9883943398793538, "rewards/rejected": -1.2673402627309163, "step": 8282 }, { "epoch": 0.43903214692709297, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7493432.0, "logits/rejected": -45284742.4, "logps/chosen": -217.00736490885416, "logps/rejected": -350.256396484375, "loss": 0.277, "rewards/chosen": 0.49375152587890625, "rewards/margins": 2.8512809753417967, "rewards/rejected": -2.3575294494628904, "step": 8283 }, { "epoch": 0.4390851509288951, "grad_norm": 55.75, "kl": 0.19770050048828125, "learning_rate": 5e-07, "logits/chosen": -80338104.0, "logits/rejected": -20782308.0, "logps/chosen": -441.0665588378906, "logps/rejected": -162.38265991210938, "loss": 0.3604, "rewards/chosen": 0.0957174226641655, "rewards/margins": 1.3637845441699028, "rewards/rejected": -1.2680671215057373, "step": 8284 }, { "epoch": 0.43913815493069724, "grad_norm": 68.5, "kl": 2.2843170166015625, "learning_rate": 5e-07, "logits/chosen": -22589701.333333332, "logits/rejected": -88459576.0, "logps/chosen": -243.70585123697916, "logps/rejected": -733.84033203125, "loss": 0.4618, "rewards/chosen": 0.058769663174947105, "rewards/margins": 1.8492832978566487, "rewards/rejected": -1.7905136346817017, "step": 8285 }, { "epoch": 0.4391911589324994, "grad_norm": 55.5, "kl": 0.8721904754638672, "learning_rate": 5e-07, "logits/chosen": -11044981.0, "logits/rejected": -9269604.0, "logps/chosen": -505.495849609375, "logps/rejected": -225.0480194091797, "loss": 0.3096, "rewards/chosen": 0.7784953117370605, "rewards/margins": 1.9572439193725586, "rewards/rejected": -1.178748607635498, "step": 8286 }, { "epoch": 0.4392441629343015, "grad_norm": 40.25, "kl": 2.161020278930664, "learning_rate": 5e-07, "logits/chosen": -4529105.142857143, "logits/rejected": -23898376.0, "logps/chosen": -409.48667689732144, "logps/rejected": -95.98658752441406, "loss": 0.3746, "rewards/chosen": 0.8994603838239398, "rewards/margins": 2.2089681114469255, "rewards/rejected": -1.3095077276229858, "step": 8287 }, { "epoch": 0.43929716693610366, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46684144.0, "logits/rejected": -16590083.0, "logps/chosen": -592.2076416015625, "logps/rejected": -159.25994873046875, "loss": 0.2837, "rewards/chosen": 0.5377159118652344, "rewards/margins": 2.131239175796509, "rewards/rejected": -1.5935232639312744, "step": 8288 }, { "epoch": 0.4393501709379058, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15653798.0, "logits/rejected": -6864638.0, "logps/chosen": -806.38134765625, "logps/rejected": -231.44424438476562, "loss": 0.2721, "rewards/chosen": 0.38211172819137573, "rewards/margins": 2.461322009563446, "rewards/rejected": -2.0792102813720703, "step": 8289 }, { "epoch": 0.43940317493970793, "grad_norm": 53.25, "kl": 1.4438018798828125, "learning_rate": 5e-07, "logits/chosen": -61653604.571428575, "logits/rejected": -24043632.0, "logps/chosen": -390.74166434151783, "logps/rejected": -354.2174377441406, "loss": 0.4348, "rewards/chosen": 0.2848121438707624, "rewards/margins": 2.225202134677342, "rewards/rejected": -1.9403899908065796, "step": 8290 }, { "epoch": 0.43945617894151007, "grad_norm": 50.0, "kl": 1.059366226196289, "learning_rate": 5e-07, "logits/chosen": -20786572.0, "logits/rejected": -266321.0625, "logps/chosen": -271.7135823567708, "logps/rejected": -67.8262939453125, "loss": 0.3748, "rewards/chosen": 0.5496029853820801, "rewards/margins": 1.7823033332824707, "rewards/rejected": -1.2327003479003906, "step": 8291 }, { "epoch": 0.4395091829433122, "grad_norm": 50.25, "kl": 0.6141595840454102, "learning_rate": 5e-07, "logits/chosen": -49320136.0, "logits/rejected": -30620400.0, "logps/chosen": -427.0471496582031, "logps/rejected": -226.371337890625, "loss": 0.2087, "rewards/chosen": 1.0622894763946533, "rewards/margins": 2.6574587027231855, "rewards/rejected": -1.595169226328532, "step": 8292 }, { "epoch": 0.43956218694511434, "grad_norm": 39.0, "kl": 1.3241653442382812, "learning_rate": 5e-07, "logits/chosen": -1650386.25, "logits/rejected": -21029413.333333332, "logps/chosen": -250.80999755859375, "logps/rejected": -197.34415690104166, "loss": 0.2423, "rewards/chosen": 0.24282656610012054, "rewards/margins": 2.052984341979027, "rewards/rejected": -1.8101577758789062, "step": 8293 }, { "epoch": 0.4396151909469165, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6791662.0, "logits/rejected": -13765239.0, "logps/chosen": -434.12506103515625, "logps/rejected": -189.8944854736328, "loss": 0.2384, "rewards/chosen": 0.9215805530548096, "rewards/margins": 2.8698720932006836, "rewards/rejected": -1.948291540145874, "step": 8294 }, { "epoch": 0.4396681949487186, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38159692.0, "logits/rejected": -53324616.0, "logps/chosen": -381.3214416503906, "logps/rejected": -647.6109619140625, "loss": 0.2309, "rewards/chosen": 0.5380027294158936, "rewards/margins": 3.1463606357574463, "rewards/rejected": -2.6083579063415527, "step": 8295 }, { "epoch": 0.43972119895052075, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10920464.0, "logits/rejected": -40187770.666666664, "logps/chosen": -157.96543884277344, "logps/rejected": -593.6171061197916, "loss": 0.2052, "rewards/chosen": 0.31467530131340027, "rewards/margins": 2.8871919016043344, "rewards/rejected": -2.572516600290934, "step": 8296 }, { "epoch": 0.4397742029523229, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59233546.666666664, "logits/rejected": -32992409.6, "logps/chosen": -449.493896484375, "logps/rejected": -465.1298828125, "loss": 0.2023, "rewards/chosen": 0.93577774365743, "rewards/margins": 3.358820422490438, "rewards/rejected": -2.4230426788330077, "step": 8297 }, { "epoch": 0.439827206954125, "grad_norm": 45.25, "kl": 0.6415634155273438, "learning_rate": 5e-07, "logits/chosen": -27149363.2, "logits/rejected": -19790562.666666668, "logps/chosen": -416.75107421875, "logps/rejected": -383.0401611328125, "loss": 0.2582, "rewards/chosen": 0.8881733894348145, "rewards/margins": 3.9039004325866697, "rewards/rejected": -3.0157270431518555, "step": 8298 }, { "epoch": 0.43988021095592716, "grad_norm": 55.25, "kl": 4.35759162902832, "learning_rate": 5e-07, "logits/chosen": -20198052.57142857, "logits/rejected": 1515914.0, "logps/chosen": -223.45396205357142, "logps/rejected": -81.82050323486328, "loss": 0.4879, "rewards/chosen": 0.3304923602512905, "rewards/margins": 1.4768432208469937, "rewards/rejected": -1.1463508605957031, "step": 8299 }, { "epoch": 0.4399332149577293, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20420068.0, "logits/rejected": -47023192.0, "logps/chosen": -141.42880249023438, "logps/rejected": -794.658935546875, "loss": 0.2682, "rewards/chosen": -0.04006356745958328, "rewards/margins": 3.7018998190760612, "rewards/rejected": -3.7419633865356445, "step": 8300 }, { "epoch": 0.43998621895953144, "grad_norm": 37.0, "kl": 1.3909683227539062, "learning_rate": 5e-07, "logits/chosen": -18954518.666666668, "logits/rejected": -24251136.0, "logps/chosen": -300.7723795572917, "logps/rejected": -245.4068603515625, "loss": 0.2322, "rewards/chosen": 0.8742086092631022, "rewards/margins": 2.7070720354715982, "rewards/rejected": -1.8328634262084962, "step": 8301 }, { "epoch": 0.4400392229613336, "grad_norm": 49.75, "kl": 0.23018550872802734, "learning_rate": 5e-07, "logits/chosen": -22965680.0, "logits/rejected": 523875.25, "logps/chosen": -300.67388916015625, "logps/rejected": -141.35617065429688, "loss": 0.3008, "rewards/chosen": 0.2969668507575989, "rewards/margins": 1.914283812046051, "rewards/rejected": -1.6173169612884521, "step": 8302 }, { "epoch": 0.4400922269631357, "grad_norm": 39.75, "kl": 0.610931396484375, "learning_rate": 5e-07, "logits/chosen": -16374385.6, "logits/rejected": -15327525.333333334, "logps/chosen": -197.4229736328125, "logps/rejected": -166.85130818684897, "loss": 0.3464, "rewards/chosen": 0.13062760829925538, "rewards/margins": 2.142199492454529, "rewards/rejected": -2.0115718841552734, "step": 8303 }, { "epoch": 0.44014523096493785, "grad_norm": 51.5, "kl": 0.046848297119140625, "learning_rate": 5e-07, "logits/chosen": -42844499.2, "logits/rejected": -11988841.333333334, "logps/chosen": -289.4140380859375, "logps/rejected": -136.43294270833334, "loss": 0.342, "rewards/chosen": 0.5503220558166504, "rewards/margins": 1.5370333989461265, "rewards/rejected": -0.986711343129476, "step": 8304 }, { "epoch": 0.44019823496674, "grad_norm": 52.0, "kl": 2.418929100036621, "learning_rate": 5e-07, "logits/chosen": -22881300.8, "logits/rejected": -26197136.0, "logps/chosen": -170.73797607421875, "logps/rejected": -165.24992879231772, "loss": 0.2789, "rewards/chosen": 0.7648500442504883, "rewards/margins": 2.371418062845866, "rewards/rejected": -1.6065680185953777, "step": 8305 }, { "epoch": 0.4402512389685421, "grad_norm": 47.75, "kl": 0.45481300354003906, "learning_rate": 5e-07, "logits/chosen": -25853114.666666668, "logits/rejected": -31307900.8, "logps/chosen": -366.0236409505208, "logps/rejected": -397.4015625, "loss": 0.1952, "rewards/chosen": 0.6096893151601156, "rewards/margins": 3.056484969456991, "rewards/rejected": -2.446795654296875, "step": 8306 }, { "epoch": 0.44030424297034426, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29348520.0, "logits/rejected": -2022777.3333333333, "logps/chosen": -126.12916564941406, "logps/rejected": -308.24318440755206, "loss": 0.1993, "rewards/chosen": 0.020420074462890625, "rewards/margins": 2.6366024017333984, "rewards/rejected": -2.616182327270508, "step": 8307 }, { "epoch": 0.4403572469721464, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 923290.1875, "logits/rejected": -29095734.85714286, "logps/chosen": -39.59718704223633, "logps/rejected": -249.51933942522322, "loss": 0.1707, "rewards/chosen": 0.18439941108226776, "rewards/margins": 2.2899323297398433, "rewards/rejected": -2.1055329186575755, "step": 8308 }, { "epoch": 0.44041025097394854, "grad_norm": 49.5, "kl": 1.7045669555664062, "learning_rate": 5e-07, "logits/chosen": -37246904.0, "logits/rejected": -15157196.8, "logps/chosen": -400.6943359375, "logps/rejected": -295.5314453125, "loss": 0.2203, "rewards/chosen": 1.2567311922709148, "rewards/margins": 2.9979974428812666, "rewards/rejected": -1.7412662506103516, "step": 8309 }, { "epoch": 0.4404632549757507, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32384493.333333332, "logits/rejected": -14176832.0, "logps/chosen": -273.23911539713544, "logps/rejected": -409.011181640625, "loss": 0.2286, "rewards/chosen": 1.0060160160064697, "rewards/margins": 2.9584404468536376, "rewards/rejected": -1.9524244308471679, "step": 8310 }, { "epoch": 0.4405162589775528, "grad_norm": 39.25, "kl": 1.3989677429199219, "learning_rate": 5e-07, "logits/chosen": -11750283.2, "logits/rejected": -25845568.0, "logps/chosen": -187.1189453125, "logps/rejected": -206.56022135416666, "loss": 0.3021, "rewards/chosen": 0.5859343051910401, "rewards/margins": 2.675786797205607, "rewards/rejected": -2.089852492014567, "step": 8311 }, { "epoch": 0.44056926297935495, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4622309.0, "logits/rejected": -11218740.0, "logps/chosen": -164.62451171875, "logps/rejected": -196.95626831054688, "loss": 0.3174, "rewards/chosen": 0.07572026550769806, "rewards/margins": 1.9961385279893875, "rewards/rejected": -1.9204182624816895, "step": 8312 }, { "epoch": 0.4406222669811571, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4130736.75, "logits/rejected": -4970094.5, "logps/chosen": -87.573974609375, "logps/rejected": -194.48707580566406, "loss": 0.3409, "rewards/chosen": -0.417407363653183, "rewards/margins": 2.0304860174655914, "rewards/rejected": -2.4478933811187744, "step": 8313 }, { "epoch": 0.4406752709829592, "grad_norm": 48.5, "kl": 2.777242660522461, "learning_rate": 5e-07, "logits/chosen": -19661605.333333332, "logits/rejected": -19598600.0, "logps/chosen": -485.8736572265625, "logps/rejected": -309.4837646484375, "loss": 0.3121, "rewards/chosen": 0.9941482543945312, "rewards/margins": 3.1878280639648438, "rewards/rejected": -2.1936798095703125, "step": 8314 }, { "epoch": 0.44072827498476136, "grad_norm": 40.25, "kl": 1.336935043334961, "learning_rate": 5e-07, "logits/chosen": -29606276.0, "logits/rejected": -2473776.5, "logps/chosen": -280.0015563964844, "logps/rejected": -258.19964599609375, "loss": 0.2783, "rewards/chosen": 0.5347355008125305, "rewards/margins": 2.570604145526886, "rewards/rejected": -2.0358686447143555, "step": 8315 }, { "epoch": 0.4407812789865635, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20808136.0, "logits/rejected": -21679108.0, "logps/chosen": -268.76947021484375, "logps/rejected": -280.5789489746094, "loss": 0.3042, "rewards/chosen": 0.5202721357345581, "rewards/margins": 3.849582552909851, "rewards/rejected": -3.329310417175293, "step": 8316 }, { "epoch": 0.44083428298836563, "grad_norm": 45.0, "kl": 0.6758022308349609, "learning_rate": 5e-07, "logits/chosen": -20665838.4, "logits/rejected": -39020082.666666664, "logps/chosen": -218.1772216796875, "logps/rejected": -164.12504069010416, "loss": 0.3326, "rewards/chosen": 0.5663991451263428, "rewards/margins": 2.093586492538452, "rewards/rejected": -1.5271873474121094, "step": 8317 }, { "epoch": 0.44088728699016777, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -105639786.66666667, "logits/rejected": -13473481.6, "logps/chosen": -324.19163004557294, "logps/rejected": -311.5018310546875, "loss": 0.3608, "rewards/chosen": -0.12093812227249146, "rewards/margins": 1.04716500043869, "rewards/rejected": -1.1681031227111816, "step": 8318 }, { "epoch": 0.4409402909919699, "grad_norm": 47.25, "kl": 2.658935546875, "learning_rate": 5e-07, "logits/chosen": -17864274.0, "logits/rejected": -6070207.0, "logps/chosen": -231.83938598632812, "logps/rejected": -485.1855163574219, "loss": 0.3825, "rewards/chosen": 0.0794166550040245, "rewards/margins": 2.1932522282004356, "rewards/rejected": -2.113835573196411, "step": 8319 }, { "epoch": 0.44099329499377204, "grad_norm": 95.0, "kl": 6.55511474609375, "learning_rate": 5e-07, "logits/chosen": -53699376.0, "logits/rejected": -6553650.0, "logps/chosen": -1119.6551513671875, "logps/rejected": -388.3524475097656, "loss": 0.2418, "rewards/chosen": 2.282958984375, "rewards/margins": 4.587784767150879, "rewards/rejected": -2.304825782775879, "step": 8320 }, { "epoch": 0.4410462989955742, "grad_norm": 43.5, "kl": 1.9022340774536133, "learning_rate": 5e-07, "logits/chosen": -40473197.333333336, "logits/rejected": -36281992.0, "logps/chosen": -235.02498372395834, "logps/rejected": -384.0400390625, "loss": 0.3388, "rewards/chosen": 0.5780440966288248, "rewards/margins": 2.9988998572031655, "rewards/rejected": -2.420855760574341, "step": 8321 }, { "epoch": 0.4410993029973763, "grad_norm": 51.75, "kl": 0.8783550262451172, "learning_rate": 5e-07, "logits/chosen": -28559107.2, "logits/rejected": -29015229.333333332, "logps/chosen": -217.2495361328125, "logps/rejected": -305.01841227213544, "loss": 0.4239, "rewards/chosen": -0.30636093616485593, "rewards/margins": 1.5537073850631713, "rewards/rejected": -1.8600683212280273, "step": 8322 }, { "epoch": 0.44115230699917846, "grad_norm": 52.25, "kl": 1.3813400268554688, "learning_rate": 5e-07, "logits/chosen": -41661028.0, "logits/rejected": -18743390.0, "logps/chosen": -573.2630615234375, "logps/rejected": -326.1474609375, "loss": 0.2255, "rewards/chosen": 1.3345673084259033, "rewards/margins": 3.7822954654693604, "rewards/rejected": -2.447728157043457, "step": 8323 }, { "epoch": 0.4412053110009806, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36359757.333333336, "logits/rejected": -21428902.4, "logps/chosen": -395.4236653645833, "logps/rejected": -289.5825927734375, "loss": 0.2452, "rewards/chosen": 0.3352752923965454, "rewards/margins": 2.342410683631897, "rewards/rejected": -2.0071353912353516, "step": 8324 }, { "epoch": 0.44125831500278273, "grad_norm": 153.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26060203.2, "logits/rejected": -43002592.0, "logps/chosen": -273.630029296875, "logps/rejected": -627.3404947916666, "loss": 0.4032, "rewards/chosen": -0.44011292457580564, "rewards/margins": 2.438090785344442, "rewards/rejected": -2.8782037099202475, "step": 8325 }, { "epoch": 0.44131131900458487, "grad_norm": 43.75, "kl": 0.7522850036621094, "learning_rate": 5e-07, "logits/chosen": -27902320.0, "logits/rejected": -66467888.0, "logps/chosen": -267.05816650390625, "logps/rejected": -280.331298828125, "loss": 0.3323, "rewards/chosen": 0.3481329083442688, "rewards/margins": 2.0118451714515686, "rewards/rejected": -1.6637122631072998, "step": 8326 }, { "epoch": 0.441364323006387, "grad_norm": 60.5, "kl": 3.46697998046875, "learning_rate": 5e-07, "logits/chosen": -23646242.666666668, "logits/rejected": -2859084.25, "logps/chosen": -603.4529215494791, "logps/rejected": -148.4667510986328, "loss": 0.3061, "rewards/chosen": 1.3368072509765625, "rewards/margins": 2.4651211500167847, "rewards/rejected": -1.1283138990402222, "step": 8327 }, { "epoch": 0.44141732700818914, "grad_norm": 74.5, "kl": 0.022647857666015625, "learning_rate": 5e-07, "logits/chosen": -29085442.0, "logits/rejected": 26923596.0, "logps/chosen": -445.768798828125, "logps/rejected": -220.28524780273438, "loss": 0.3846, "rewards/chosen": -0.3326343595981598, "rewards/margins": 1.2545722424983978, "rewards/rejected": -1.5872066020965576, "step": 8328 }, { "epoch": 0.4414703310099913, "grad_norm": 54.25, "kl": 0.3916893005371094, "learning_rate": 5e-07, "logits/chosen": -41704009.6, "logits/rejected": -13204592.0, "logps/chosen": -285.414794921875, "logps/rejected": -134.74322509765625, "loss": 0.3207, "rewards/chosen": 0.773980188369751, "rewards/margins": 1.8565125942230225, "rewards/rejected": -1.0825324058532715, "step": 8329 }, { "epoch": 0.4415233350117934, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25884264.0, "logits/rejected": -37536692.0, "logps/chosen": -441.2374572753906, "logps/rejected": -479.3694763183594, "loss": 0.1844, "rewards/chosen": 0.9586582183837891, "rewards/margins": 4.0871617794036865, "rewards/rejected": -3.1285035610198975, "step": 8330 }, { "epoch": 0.44157633901359555, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49541077.333333336, "logits/rejected": -38933801.6, "logps/chosen": -190.8174845377604, "logps/rejected": -419.677587890625, "loss": 0.2206, "rewards/chosen": 0.9743126233418783, "rewards/margins": 3.010983117421468, "rewards/rejected": -2.0366704940795897, "step": 8331 }, { "epoch": 0.44162934301539764, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52022668.0, "logits/rejected": -18205629.333333332, "logps/chosen": -274.6168212890625, "logps/rejected": -291.4230143229167, "loss": 0.2139, "rewards/chosen": 0.49255067110061646, "rewards/margins": 2.4250934720039368, "rewards/rejected": -1.9325428009033203, "step": 8332 }, { "epoch": 0.4416823470171998, "grad_norm": 52.0, "kl": 2.0142669677734375, "learning_rate": 5e-07, "logits/chosen": -18297267.2, "logits/rejected": -54198138.666666664, "logps/chosen": -492.71474609375, "logps/rejected": -405.2342936197917, "loss": 0.2986, "rewards/chosen": 0.6071091175079346, "rewards/margins": 3.379771280288696, "rewards/rejected": -2.7726621627807617, "step": 8333 }, { "epoch": 0.4417353510190019, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10515116.8, "logits/rejected": -24575328.0, "logps/chosen": -239.113037109375, "logps/rejected": -464.2684326171875, "loss": 0.3167, "rewards/chosen": 0.2980372667312622, "rewards/margins": 2.290915544827779, "rewards/rejected": -1.9928782780965169, "step": 8334 }, { "epoch": 0.44178835502080405, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45069216.0, "logits/rejected": -53636784.0, "logps/chosen": -336.130859375, "logps/rejected": -519.853759765625, "loss": 0.2616, "rewards/chosen": 0.19264272848765054, "rewards/margins": 2.961737004915873, "rewards/rejected": -2.7690942764282225, "step": 8335 }, { "epoch": 0.4418413590226062, "grad_norm": 64.5, "kl": 0.6884880065917969, "learning_rate": 5e-07, "logits/chosen": -63276697.6, "logits/rejected": -46074426.666666664, "logps/chosen": -322.066943359375, "logps/rejected": -454.1300048828125, "loss": 0.37, "rewards/chosen": -0.04214234352111816, "rewards/margins": 2.0014062404632567, "rewards/rejected": -2.043548583984375, "step": 8336 }, { "epoch": 0.4418943630244083, "grad_norm": 61.25, "kl": 0.2786731719970703, "learning_rate": 5e-07, "logits/chosen": -27360162.0, "logits/rejected": -1574993.75, "logps/chosen": -329.09637451171875, "logps/rejected": -270.107666015625, "loss": 0.3689, "rewards/chosen": -0.06202735751867294, "rewards/margins": 1.6526912227272987, "rewards/rejected": -1.7147185802459717, "step": 8337 }, { "epoch": 0.44194736702621046, "grad_norm": 28.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9679632.0, "logits/rejected": -4914554.666666667, "logps/chosen": -1199.5447998046875, "logps/rejected": -288.8209635416667, "loss": 0.1559, "rewards/chosen": 1.4809589385986328, "rewards/margins": 4.272377967834473, "rewards/rejected": -2.79141902923584, "step": 8338 }, { "epoch": 0.4420003710280126, "grad_norm": 44.0, "kl": 0.0268402099609375, "learning_rate": 5e-07, "logits/chosen": -41259244.0, "logits/rejected": -35101744.0, "logps/chosen": -322.6572570800781, "logps/rejected": -399.9137268066406, "loss": 0.2313, "rewards/chosen": 0.7677713632583618, "rewards/margins": 3.123536229133606, "rewards/rejected": -2.355764865875244, "step": 8339 }, { "epoch": 0.44205337502981473, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29745517.333333332, "logits/rejected": -37485833.6, "logps/chosen": -188.0771484375, "logps/rejected": -253.8849365234375, "loss": 0.2683, "rewards/chosen": 0.14998372395833334, "rewards/margins": 2.0691636403401694, "rewards/rejected": -1.919179916381836, "step": 8340 }, { "epoch": 0.44210637903161687, "grad_norm": 38.25, "kl": 0.9766159057617188, "learning_rate": 5e-07, "logits/chosen": -27619829.333333332, "logits/rejected": -21266020.0, "logps/chosen": -619.1151529947916, "logps/rejected": -320.2903747558594, "loss": 0.2936, "rewards/chosen": 1.1882785161336262, "rewards/margins": 3.4243219693501787, "rewards/rejected": -2.2360434532165527, "step": 8341 }, { "epoch": 0.442159383033419, "grad_norm": 48.75, "kl": 2.8794708251953125, "learning_rate": 5e-07, "logits/chosen": -34270394.666666664, "logits/rejected": -34747648.0, "logps/chosen": -345.6097412109375, "logps/rejected": -412.001611328125, "loss": 0.2524, "rewards/chosen": 0.40212098757425946, "rewards/margins": 2.629186264673869, "rewards/rejected": -2.2270652770996096, "step": 8342 }, { "epoch": 0.44221238703522114, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35491500.0, "logits/rejected": -63664106.666666664, "logps/chosen": -331.8773193359375, "logps/rejected": -557.6552734375, "loss": 0.2487, "rewards/chosen": 0.7087257504463196, "rewards/margins": 2.2189216415087385, "rewards/rejected": -1.5101958910624187, "step": 8343 }, { "epoch": 0.4422653910370233, "grad_norm": 872.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94758794.66666667, "logits/rejected": 162340800.0, "logps/chosen": -241.56803385416666, "logps/rejected": -372.147998046875, "loss": 0.1898, "rewards/chosen": 0.792128324508667, "rewards/margins": 3.0538596630096437, "rewards/rejected": -2.2617313385009767, "step": 8344 }, { "epoch": 0.4423183950388254, "grad_norm": 44.75, "kl": 3.4419851303100586, "learning_rate": 5e-07, "logits/chosen": -30227594.666666668, "logits/rejected": -9992312.0, "logps/chosen": -710.3255208333334, "logps/rejected": -254.5130615234375, "loss": 0.1933, "rewards/chosen": 1.3016713460286458, "rewards/margins": 3.456258900960286, "rewards/rejected": -2.1545875549316404, "step": 8345 }, { "epoch": 0.44237139904062756, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18157254.4, "logits/rejected": 266127168.0, "logps/chosen": -209.86025390625, "logps/rejected": -305.63916015625, "loss": 0.2918, "rewards/chosen": 0.7927083015441895, "rewards/margins": 2.2629709879557294, "rewards/rejected": -1.4702626864115398, "step": 8346 }, { "epoch": 0.4424244030424297, "grad_norm": 36.0, "kl": 0.3109893798828125, "learning_rate": 5e-07, "logits/chosen": -18594965.333333332, "logits/rejected": -46997056.0, "logps/chosen": -151.7287394205729, "logps/rejected": -416.828515625, "loss": 0.1621, "rewards/chosen": 0.7212969462076823, "rewards/margins": 3.587888971964518, "rewards/rejected": -2.866592025756836, "step": 8347 }, { "epoch": 0.44247740704423183, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5602869.0, "logits/rejected": -30024882.666666668, "logps/chosen": -46.948978424072266, "logps/rejected": -425.5229899088542, "loss": 0.2624, "rewards/chosen": -0.30317869782447815, "rewards/margins": 1.8022102614243827, "rewards/rejected": -2.105388959248861, "step": 8348 }, { "epoch": 0.44253041104603397, "grad_norm": 41.5, "kl": 1.6445732116699219, "learning_rate": 5e-07, "logits/chosen": -24726268.0, "logits/rejected": -15092149.0, "logps/chosen": -227.6664581298828, "logps/rejected": -166.501953125, "loss": 0.331, "rewards/chosen": 0.7198325991630554, "rewards/margins": 2.4696232676506042, "rewards/rejected": -1.7497906684875488, "step": 8349 }, { "epoch": 0.4425834150478361, "grad_norm": 55.5, "kl": 0.09714126586914062, "learning_rate": 5e-07, "logits/chosen": -23022380.0, "logits/rejected": -26430106.0, "logps/chosen": -308.0054931640625, "logps/rejected": -497.51519775390625, "loss": 0.3103, "rewards/chosen": 0.5482353568077087, "rewards/margins": 2.049186885356903, "rewards/rejected": -1.5009515285491943, "step": 8350 }, { "epoch": 0.44263641904963824, "grad_norm": 50.25, "kl": 1.0495471954345703, "learning_rate": 5e-07, "logits/chosen": 476671.4, "logits/rejected": -22155916.0, "logps/chosen": -56.492529296875, "logps/rejected": -162.2750244140625, "loss": 0.4406, "rewards/chosen": 0.06925038099288941, "rewards/margins": 0.9196451067924499, "rewards/rejected": -0.8503947257995605, "step": 8351 }, { "epoch": 0.4426894230514404, "grad_norm": 57.5, "kl": 0.9851284027099609, "learning_rate": 5e-07, "logits/chosen": -17023701.333333332, "logits/rejected": -47137088.0, "logps/chosen": -333.2285970052083, "logps/rejected": -381.40777587890625, "loss": 0.4214, "rewards/chosen": 0.1375615398089091, "rewards/margins": 1.5757833520571392, "rewards/rejected": -1.43822181224823, "step": 8352 }, { "epoch": 0.4427424270532425, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26148902.0, "logits/rejected": -15154689.0, "logps/chosen": -211.1273193359375, "logps/rejected": -381.0834045410156, "loss": 0.3306, "rewards/chosen": -0.06918492913246155, "rewards/margins": 2.0786121785640717, "rewards/rejected": -2.147797107696533, "step": 8353 }, { "epoch": 0.44279543105504465, "grad_norm": 46.75, "kl": 1.92437744140625, "learning_rate": 5e-07, "logits/chosen": -44268444.0, "logits/rejected": -29376280.0, "logps/chosen": -413.5566711425781, "logps/rejected": -473.119140625, "loss": 0.2821, "rewards/chosen": 0.3911471962928772, "rewards/margins": 2.4597750306129456, "rewards/rejected": -2.0686278343200684, "step": 8354 }, { "epoch": 0.4428484350568468, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37753272.0, "logits/rejected": -29699760.0, "logps/chosen": -309.3613586425781, "logps/rejected": -365.1280517578125, "loss": 0.2755, "rewards/chosen": 0.7204425930976868, "rewards/margins": 2.2128475308418274, "rewards/rejected": -1.4924049377441406, "step": 8355 }, { "epoch": 0.4429014390586489, "grad_norm": 61.25, "kl": 1.403421401977539, "learning_rate": 5e-07, "logits/chosen": -1888923.2, "logits/rejected": -6116666.666666667, "logps/chosen": -316.1462646484375, "logps/rejected": -275.0592041015625, "loss": 0.3309, "rewards/chosen": 0.6527422904968262, "rewards/margins": 2.4933623313903808, "rewards/rejected": -1.8406200408935547, "step": 8356 }, { "epoch": 0.44295444306045106, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18573690.666666668, "logits/rejected": -10571604.0, "logps/chosen": -493.26123046875, "logps/rejected": -171.77578735351562, "loss": 0.3328, "rewards/chosen": 0.6128842035929362, "rewards/margins": 2.3867040077845254, "rewards/rejected": -1.7738198041915894, "step": 8357 }, { "epoch": 0.4430074470622532, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58646588.0, "logits/rejected": -24094502.0, "logps/chosen": -310.70098876953125, "logps/rejected": -319.7027282714844, "loss": 0.2519, "rewards/chosen": 0.26943254470825195, "rewards/margins": 2.9583165645599365, "rewards/rejected": -2.6888840198516846, "step": 8358 }, { "epoch": 0.44306045106405534, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64440915.2, "logits/rejected": -37344562.666666664, "logps/chosen": -213.935400390625, "logps/rejected": -445.5006917317708, "loss": 0.265, "rewards/chosen": 0.4916487216949463, "rewards/margins": 3.5968691984812415, "rewards/rejected": -3.1052204767862954, "step": 8359 }, { "epoch": 0.4431134550658575, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 103623024.0, "logits/rejected": -27786910.0, "logps/chosen": -419.36962890625, "logps/rejected": -413.2357482910156, "loss": 0.2297, "rewards/chosen": 0.5265656113624573, "rewards/margins": 3.5971129536628723, "rewards/rejected": -3.070547342300415, "step": 8360 }, { "epoch": 0.4431664590676596, "grad_norm": 81.5, "kl": 1.010025978088379, "learning_rate": 5e-07, "logits/chosen": -6088754.0, "logits/rejected": -6076421.5, "logps/chosen": -575.56591796875, "logps/rejected": -457.84088134765625, "loss": 0.3218, "rewards/chosen": 0.6039550304412842, "rewards/margins": 3.4506938457489014, "rewards/rejected": -2.846738815307617, "step": 8361 }, { "epoch": 0.44321946306946175, "grad_norm": 33.25, "kl": 3.3124866485595703, "learning_rate": 5e-07, "logits/chosen": 2610894.5, "logits/rejected": -29005824.0, "logps/chosen": -222.14056396484375, "logps/rejected": -174.08839416503906, "loss": 0.2317, "rewards/chosen": 1.127244234085083, "rewards/margins": 3.2408056259155273, "rewards/rejected": -2.1135613918304443, "step": 8362 }, { "epoch": 0.4432724670712639, "grad_norm": 66.0, "kl": 0.26092529296875, "learning_rate": 5e-07, "logits/chosen": -3134148.0, "logits/rejected": -67886136.0, "logps/chosen": -257.1718343098958, "logps/rejected": -245.68115234375, "loss": 0.3731, "rewards/chosen": 0.2886159420013428, "rewards/margins": 2.207066774368286, "rewards/rejected": -1.9184508323669434, "step": 8363 }, { "epoch": 0.443325471073066, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48879141.333333336, "logits/rejected": -26744280.0, "logps/chosen": -155.14610799153647, "logps/rejected": -281.193505859375, "loss": 0.2466, "rewards/chosen": 0.285373051961263, "rewards/margins": 2.6979951222737633, "rewards/rejected": -2.4126220703125, "step": 8364 }, { "epoch": 0.44337847507486816, "grad_norm": 43.75, "kl": 2.288616180419922, "learning_rate": 5e-07, "logits/chosen": -36556744.0, "logits/rejected": -21783328.0, "logps/chosen": -245.24534606933594, "logps/rejected": -321.60009765625, "loss": 0.2931, "rewards/chosen": 0.5690629482269287, "rewards/margins": 2.7880043983459473, "rewards/rejected": -2.2189414501190186, "step": 8365 }, { "epoch": 0.4434314790766703, "grad_norm": 86.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9325928.0, "logits/rejected": 1705944.2, "logps/chosen": -443.672607421875, "logps/rejected": -94.4888427734375, "loss": 0.3815, "rewards/chosen": -0.09218746423721313, "rewards/margins": 1.0466785788536073, "rewards/rejected": -1.1388660430908204, "step": 8366 }, { "epoch": 0.44348448307847244, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35937740.0, "logits/rejected": -5894183.333333333, "logps/chosen": -457.8215637207031, "logps/rejected": -382.1879069010417, "loss": 0.2239, "rewards/chosen": 1.091850996017456, "rewards/margins": 2.9713343779246015, "rewards/rejected": -1.8794833819071453, "step": 8367 }, { "epoch": 0.4435374870802746, "grad_norm": 35.75, "kl": 2.8534774780273438, "learning_rate": 5e-07, "logits/chosen": -34511548.0, "logits/rejected": -36185116.0, "logps/chosen": -613.2689208984375, "logps/rejected": -430.2115173339844, "loss": 0.1401, "rewards/chosen": 1.8158198595046997, "rewards/margins": 3.826102137565613, "rewards/rejected": -2.010282278060913, "step": 8368 }, { "epoch": 0.4435904910820767, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16955822.0, "logits/rejected": -50473952.0, "logps/chosen": -403.5276184082031, "logps/rejected": -528.1276041666666, "loss": 0.209, "rewards/chosen": 0.9463249444961548, "rewards/margins": 3.0530126492182412, "rewards/rejected": -2.1066877047220864, "step": 8369 }, { "epoch": 0.44364349508387885, "grad_norm": 34.0, "kl": 1.0514516830444336, "learning_rate": 5e-07, "logits/chosen": -24119520.0, "logits/rejected": -26514202.666666668, "logps/chosen": -180.2737060546875, "logps/rejected": -311.9575602213542, "loss": 0.2672, "rewards/chosen": 0.6441296577453614, "rewards/margins": 3.0800808906555175, "rewards/rejected": -2.4359512329101562, "step": 8370 }, { "epoch": 0.443696499085681, "grad_norm": 44.25, "kl": 1.1918621063232422, "learning_rate": 5e-07, "logits/chosen": -20565012.0, "logits/rejected": 4472002.0, "logps/chosen": -350.464111328125, "logps/rejected": -68.05038452148438, "loss": 0.3145, "rewards/chosen": 0.785240888595581, "rewards/margins": 2.2979769706726074, "rewards/rejected": -1.5127360820770264, "step": 8371 }, { "epoch": 0.4437495030874831, "grad_norm": 62.75, "kl": 0.24768352508544922, "learning_rate": 5e-07, "logits/chosen": -9039904.0, "logits/rejected": 1674277.5, "logps/chosen": -136.6861572265625, "logps/rejected": -97.89012145996094, "loss": 0.4151, "rewards/chosen": 0.13441714644432068, "rewards/margins": 1.8295007646083832, "rewards/rejected": -1.6950836181640625, "step": 8372 }, { "epoch": 0.44380250708928526, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37595913.6, "logits/rejected": -26628016.0, "logps/chosen": -513.47060546875, "logps/rejected": -157.2775675455729, "loss": 0.3456, "rewards/chosen": 0.6226660251617432, "rewards/margins": 1.615649366378784, "rewards/rejected": -0.992983341217041, "step": 8373 }, { "epoch": 0.4438555110910874, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29721376.0, "logits/rejected": -27800774.4, "logps/chosen": -670.7194010416666, "logps/rejected": -191.23043212890624, "loss": 0.2656, "rewards/chosen": 0.6732745170593262, "rewards/margins": 2.3113808631896973, "rewards/rejected": -1.638106346130371, "step": 8374 }, { "epoch": 0.44390851509288953, "grad_norm": 38.25, "kl": 0.769317626953125, "learning_rate": 5e-07, "logits/chosen": -11132372.0, "logits/rejected": -35948712.0, "logps/chosen": -234.7872314453125, "logps/rejected": -468.721435546875, "loss": 0.2625, "rewards/chosen": 0.5126182436943054, "rewards/margins": 3.5179619193077087, "rewards/rejected": -3.0053436756134033, "step": 8375 }, { "epoch": 0.44396151909469167, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78936889.6, "logits/rejected": -11885341.333333334, "logps/chosen": -302.7818359375, "logps/rejected": -338.0320638020833, "loss": 0.4262, "rewards/chosen": -0.16700103282928466, "rewards/margins": 1.3847267389297486, "rewards/rejected": -1.5517277717590332, "step": 8376 }, { "epoch": 0.4440145230964938, "grad_norm": 40.0, "kl": 2.1826820373535156, "learning_rate": 5e-07, "logits/chosen": -20760670.666666668, "logits/rejected": -45056532.0, "logps/chosen": -222.70222981770834, "logps/rejected": -443.5097351074219, "loss": 0.3629, "rewards/chosen": 0.6052313645680746, "rewards/margins": 2.716522296269735, "rewards/rejected": -2.11129093170166, "step": 8377 }, { "epoch": 0.44406752709829594, "grad_norm": 53.25, "kl": 0.7865114212036133, "learning_rate": 5e-07, "logits/chosen": -48475385.6, "logits/rejected": -32182320.0, "logps/chosen": -180.8702392578125, "logps/rejected": -248.35685221354166, "loss": 0.364, "rewards/chosen": 0.38323211669921875, "rewards/margins": 1.696499188741048, "rewards/rejected": -1.3132670720418294, "step": 8378 }, { "epoch": 0.4441205311000981, "grad_norm": 47.5, "kl": 0.025638580322265625, "learning_rate": 5e-07, "logits/chosen": -25821520.0, "logits/rejected": 4937891.0, "logps/chosen": -371.40185546875, "logps/rejected": -152.3085734049479, "loss": 0.2254, "rewards/chosen": 0.6654435396194458, "rewards/margins": 2.6466304063796997, "rewards/rejected": -1.981186866760254, "step": 8379 }, { "epoch": 0.4441735351019002, "grad_norm": 30.625, "kl": 0.3935260772705078, "learning_rate": 5e-07, "logits/chosen": 9956085.0, "logits/rejected": -12510122.0, "logps/chosen": -49.17299270629883, "logps/rejected": -176.01272583007812, "loss": 0.3122, "rewards/chosen": 0.1881084144115448, "rewards/margins": 2.209459275007248, "rewards/rejected": -2.021350860595703, "step": 8380 }, { "epoch": 0.44422653910370236, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10808326.0, "logits/rejected": -31060762.666666668, "logps/chosen": -159.2811279296875, "logps/rejected": -180.4248046875, "loss": 0.2934, "rewards/chosen": -0.14420700073242188, "rewards/margins": 1.2539730072021484, "rewards/rejected": -1.3981800079345703, "step": 8381 }, { "epoch": 0.4442795431055045, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23485004.0, "logits/rejected": -49317504.0, "logps/chosen": -358.84136962890625, "logps/rejected": -197.1676788330078, "loss": 0.3021, "rewards/chosen": 0.3591456413269043, "rewards/margins": 1.9995036125183105, "rewards/rejected": -1.6403579711914062, "step": 8382 }, { "epoch": 0.4443325471073066, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34798084.0, "logits/rejected": -20894886.85714286, "logps/chosen": -43.55552673339844, "logps/rejected": -302.5487583705357, "loss": 0.1847, "rewards/chosen": 0.10539589077234268, "rewards/margins": 2.1483106900538718, "rewards/rejected": -2.042914799281529, "step": 8383 }, { "epoch": 0.4443855511091087, "grad_norm": 39.5, "kl": 1.7928524017333984, "learning_rate": 5e-07, "logits/chosen": -39909672.0, "logits/rejected": -34331496.0, "logps/chosen": -406.8621826171875, "logps/rejected": -359.9442138671875, "loss": 0.296, "rewards/chosen": 0.40724655985832214, "rewards/margins": 2.6239292323589325, "rewards/rejected": -2.2166826725006104, "step": 8384 }, { "epoch": 0.44443855511091085, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103600552.0, "logits/rejected": -40731434.666666664, "logps/chosen": -552.62109375, "logps/rejected": -262.4186604817708, "loss": 0.2285, "rewards/chosen": 0.294729620218277, "rewards/margins": 2.1972591777642565, "rewards/rejected": -1.9025295575459797, "step": 8385 }, { "epoch": 0.444491559112713, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16341600.0, "logits/rejected": -83240409.6, "logps/chosen": -167.32928466796875, "logps/rejected": -315.069091796875, "loss": 0.2153, "rewards/chosen": 0.25856220722198486, "rewards/margins": 3.001647686958313, "rewards/rejected": -2.743085479736328, "step": 8386 }, { "epoch": 0.4445445631145151, "grad_norm": 46.0, "kl": 0.3309803009033203, "learning_rate": 5e-07, "logits/chosen": -20347682.0, "logits/rejected": -12725642.666666666, "logps/chosen": -312.52276611328125, "logps/rejected": -284.1817220052083, "loss": 0.2335, "rewards/chosen": 0.4529381990432739, "rewards/margins": 2.5006686449050903, "rewards/rejected": -2.0477304458618164, "step": 8387 }, { "epoch": 0.44459756711631726, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23086324.0, "logits/rejected": -17800226.0, "logps/chosen": -313.4374084472656, "logps/rejected": -317.1492919921875, "loss": 0.2007, "rewards/chosen": 1.0896971225738525, "rewards/margins": 3.213845729827881, "rewards/rejected": -2.1241486072540283, "step": 8388 }, { "epoch": 0.4446505711181194, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4669951.428571428, "logits/rejected": -34904240.0, "logps/chosen": -246.67822265625, "logps/rejected": -518.460205078125, "loss": 0.4001, "rewards/chosen": 0.20612648555210658, "rewards/margins": 4.1548080784933905, "rewards/rejected": -3.948681592941284, "step": 8389 }, { "epoch": 0.44470357511992153, "grad_norm": 85.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46797098.666666664, "logits/rejected": -27360835.2, "logps/chosen": -349.0771484375, "logps/rejected": -333.865625, "loss": 0.2992, "rewards/chosen": 0.013296003142992655, "rewards/margins": 2.090348310271899, "rewards/rejected": -2.0770523071289064, "step": 8390 }, { "epoch": 0.44475657912172367, "grad_norm": 43.5, "kl": 0.5472793579101562, "learning_rate": 5e-07, "logits/chosen": -44401612.8, "logits/rejected": -25435050.666666668, "logps/chosen": -489.69501953125, "logps/rejected": -288.5233561197917, "loss": 0.2615, "rewards/chosen": 1.4164904594421386, "rewards/margins": 3.55100622177124, "rewards/rejected": -2.1345157623291016, "step": 8391 }, { "epoch": 0.4448095831235258, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16089682.666666666, "logits/rejected": -23160866.0, "logps/chosen": -267.6090901692708, "logps/rejected": -300.14013671875, "loss": 0.3335, "rewards/chosen": 0.34536083539326984, "rewards/margins": 3.2012649377187095, "rewards/rejected": -2.8559041023254395, "step": 8392 }, { "epoch": 0.44486258712532795, "grad_norm": 40.5, "kl": 0.75616455078125, "learning_rate": 5e-07, "logits/chosen": -52477300.0, "logits/rejected": -50655144.0, "logps/chosen": -343.52606201171875, "logps/rejected": -437.6136779785156, "loss": 0.2372, "rewards/chosen": 0.6658836603164673, "rewards/margins": 3.7270034551620483, "rewards/rejected": -3.061119794845581, "step": 8393 }, { "epoch": 0.4449155911271301, "grad_norm": 44.75, "kl": 0.7750740051269531, "learning_rate": 5e-07, "logits/chosen": -14142697.6, "logits/rejected": -14851105.333333334, "logps/chosen": -214.7083740234375, "logps/rejected": -343.5703938802083, "loss": 0.3595, "rewards/chosen": 0.04442169666290283, "rewards/margins": 2.1118580738703407, "rewards/rejected": -2.067436377207438, "step": 8394 }, { "epoch": 0.4449685951289322, "grad_norm": 59.75, "kl": 0.07774543762207031, "learning_rate": 5e-07, "logits/chosen": -37438253.333333336, "logits/rejected": -2115468.6, "logps/chosen": -343.7421875, "logps/rejected": -195.05682373046875, "loss": 0.31, "rewards/chosen": -0.10553359985351562, "rewards/margins": 1.7857110977172852, "rewards/rejected": -1.8912446975708008, "step": 8395 }, { "epoch": 0.44502159913073436, "grad_norm": 51.25, "kl": 1.1616859436035156, "learning_rate": 5e-07, "logits/chosen": -26459843.2, "logits/rejected": -32731472.0, "logps/chosen": -339.2439697265625, "logps/rejected": -462.0575358072917, "loss": 0.2681, "rewards/chosen": 0.8592886924743652, "rewards/margins": 2.905134359995524, "rewards/rejected": -2.0458456675211587, "step": 8396 }, { "epoch": 0.4450746031325365, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68621392.0, "logits/rejected": -51383184.0, "logps/chosen": -201.6771240234375, "logps/rejected": -547.3859252929688, "loss": 0.2399, "rewards/chosen": 0.2614012658596039, "rewards/margins": 3.967731326818466, "rewards/rejected": -3.7063300609588623, "step": 8397 }, { "epoch": 0.44512760713433863, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23426216.0, "logits/rejected": -16044561.6, "logps/chosen": -231.63126627604166, "logps/rejected": -589.105859375, "loss": 0.2088, "rewards/chosen": 0.3977176745732625, "rewards/margins": 4.210931785901387, "rewards/rejected": -3.813214111328125, "step": 8398 }, { "epoch": 0.44518061113614077, "grad_norm": 58.5, "kl": 0.2188272476196289, "learning_rate": 5e-07, "logits/chosen": 12038874.4, "logits/rejected": -11168785.333333334, "logps/chosen": -363.355419921875, "logps/rejected": -104.33242797851562, "loss": 0.3002, "rewards/chosen": 0.5586118221282959, "rewards/margins": 2.527309528986613, "rewards/rejected": -1.9686977068583171, "step": 8399 }, { "epoch": 0.4452336151379429, "grad_norm": 54.5, "kl": 0.3104705810546875, "learning_rate": 5e-07, "logits/chosen": -31336516.0, "logits/rejected": -35293744.0, "logps/chosen": -363.6126708984375, "logps/rejected": -537.9420776367188, "loss": 0.22, "rewards/chosen": 0.6371496319770813, "rewards/margins": 3.383014738559723, "rewards/rejected": -2.7458651065826416, "step": 8400 }, { "epoch": 0.44528661913974504, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50708792.0, "logits/rejected": -50413109.333333336, "logps/chosen": -314.16265869140625, "logps/rejected": -288.83892822265625, "loss": 0.1776, "rewards/chosen": 1.2517063617706299, "rewards/margins": 3.1579809983571367, "rewards/rejected": -1.906274636586507, "step": 8401 }, { "epoch": 0.4453396231415472, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7177557.5, "logits/rejected": -12074594.666666666, "logps/chosen": -209.2130889892578, "logps/rejected": -224.55485026041666, "loss": 0.2383, "rewards/chosen": 0.8435502052307129, "rewards/margins": 2.3664097785949707, "rewards/rejected": -1.5228595733642578, "step": 8402 }, { "epoch": 0.4453926271433493, "grad_norm": 51.0, "kl": 0.40154075622558594, "learning_rate": 5e-07, "logits/chosen": -36284316.0, "logits/rejected": -5212619.5, "logps/chosen": -267.3563232421875, "logps/rejected": -136.7969207763672, "loss": 0.3075, "rewards/chosen": 0.5335882902145386, "rewards/margins": 1.8296401500701904, "rewards/rejected": -1.2960518598556519, "step": 8403 }, { "epoch": 0.44544563114515145, "grad_norm": 62.5, "kl": 0.2742757797241211, "learning_rate": 5e-07, "logits/chosen": -43625160.0, "logits/rejected": -212203376.0, "logps/chosen": -341.19378662109375, "logps/rejected": -426.96771240234375, "loss": 0.4938, "rewards/chosen": -0.48117319742838544, "rewards/margins": 1.6777685483296711, "rewards/rejected": -2.1589417457580566, "step": 8404 }, { "epoch": 0.4454986351469536, "grad_norm": 39.5, "kl": 0.32312774658203125, "learning_rate": 5e-07, "logits/chosen": -7102211.333333333, "logits/rejected": -20443539.2, "logps/chosen": -277.07216389973956, "logps/rejected": -614.364599609375, "loss": 0.1524, "rewards/chosen": 1.1797032356262207, "rewards/margins": 3.7602278709411623, "rewards/rejected": -2.5805246353149416, "step": 8405 }, { "epoch": 0.44555163914875573, "grad_norm": 68.5, "kl": 0.6889495849609375, "learning_rate": 5e-07, "logits/chosen": -30415651.2, "logits/rejected": 7837140.666666667, "logps/chosen": -281.94306640625, "logps/rejected": -190.2708536783854, "loss": 0.3966, "rewards/chosen": 0.17312589883804322, "rewards/margins": 1.4817514379819234, "rewards/rejected": -1.3086255391438801, "step": 8406 }, { "epoch": 0.44560464315055787, "grad_norm": 49.75, "kl": 2.2185440063476562, "learning_rate": 5e-07, "logits/chosen": -40697168.0, "logits/rejected": -12679088.0, "logps/chosen": -911.6192626953125, "logps/rejected": -430.37890625, "loss": 0.1491, "rewards/chosen": 1.7306312322616577, "rewards/margins": 4.743608832359314, "rewards/rejected": -3.0129776000976562, "step": 8407 }, { "epoch": 0.44565764715236, "grad_norm": 60.75, "kl": 1.2870292663574219, "learning_rate": 5e-07, "logits/chosen": -32426854.4, "logits/rejected": -36001685.333333336, "logps/chosen": -287.115087890625, "logps/rejected": -393.6675618489583, "loss": 0.4369, "rewards/chosen": -0.32540478706359866, "rewards/margins": 1.23378103574117, "rewards/rejected": -1.5591858228047688, "step": 8408 }, { "epoch": 0.44571065115416214, "grad_norm": 45.25, "kl": 2.5808610916137695, "learning_rate": 5e-07, "logits/chosen": -22493594.0, "logits/rejected": -12323706.0, "logps/chosen": -578.6275024414062, "logps/rejected": -312.7149658203125, "loss": 0.2021, "rewards/chosen": 1.6871522665023804, "rewards/margins": 4.182303309440613, "rewards/rejected": -2.4951510429382324, "step": 8409 }, { "epoch": 0.4457636551559643, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65337200.0, "logits/rejected": -32424054.85714286, "logps/chosen": -468.4736022949219, "logps/rejected": -283.88272530691967, "loss": 0.1751, "rewards/chosen": 0.03291931375861168, "rewards/margins": 2.327840371323483, "rewards/rejected": -2.2949210575648715, "step": 8410 }, { "epoch": 0.4458166591577664, "grad_norm": 46.0, "kl": 1.0966567993164062, "learning_rate": 5e-07, "logits/chosen": -30911888.0, "logits/rejected": -8858570.0, "logps/chosen": -316.837158203125, "logps/rejected": -222.86148071289062, "loss": 0.1973, "rewards/chosen": 0.7282569408416748, "rewards/margins": 3.661125898361206, "rewards/rejected": -2.9328689575195312, "step": 8411 }, { "epoch": 0.44586966315956855, "grad_norm": 54.5, "kl": 1.1385040283203125, "learning_rate": 5e-07, "logits/chosen": -15554641.333333334, "logits/rejected": -41002048.0, "logps/chosen": -278.3778076171875, "logps/rejected": -582.081787109375, "loss": 0.3577, "rewards/chosen": 0.2954843044281006, "rewards/margins": 3.5990731716156006, "rewards/rejected": -3.3035888671875, "step": 8412 }, { "epoch": 0.4459226671613707, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6445671.0, "logits/rejected": -13816020.0, "logps/chosen": -45.700164794921875, "logps/rejected": -241.56394958496094, "loss": 0.2705, "rewards/chosen": 0.3327846825122833, "rewards/margins": 2.4450097382068634, "rewards/rejected": -2.11222505569458, "step": 8413 }, { "epoch": 0.4459756711631728, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -107120.0, "logits/rejected": -2466581.5, "logps/chosen": -498.68896484375, "logps/rejected": -257.9844055175781, "loss": 0.3699, "rewards/chosen": 0.02776031196117401, "rewards/margins": 1.8133526295423508, "rewards/rejected": -1.7855923175811768, "step": 8414 }, { "epoch": 0.44602867516497496, "grad_norm": 35.0, "kl": 0.17801666259765625, "learning_rate": 5e-07, "logits/chosen": -456070.25, "logits/rejected": -28889010.0, "logps/chosen": -92.01165771484375, "logps/rejected": -338.09368896484375, "loss": 0.3378, "rewards/chosen": -0.27325335144996643, "rewards/margins": 2.4449423253536224, "rewards/rejected": -2.718195676803589, "step": 8415 }, { "epoch": 0.4460816791667771, "grad_norm": 79.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1324284.0, "logits/rejected": -17561076.0, "logps/chosen": -460.79150390625, "logps/rejected": -239.51095581054688, "loss": 0.3052, "rewards/chosen": 0.17297133803367615, "rewards/margins": 2.1217263638973236, "rewards/rejected": -1.9487550258636475, "step": 8416 }, { "epoch": 0.44613468316857924, "grad_norm": 39.25, "kl": 0.24350357055664062, "learning_rate": 5e-07, "logits/chosen": -13923214.4, "logits/rejected": -26902058.666666668, "logps/chosen": -145.5477783203125, "logps/rejected": -511.6441243489583, "loss": 0.3967, "rewards/chosen": -0.3996936559677124, "rewards/margins": 2.8170749266942345, "rewards/rejected": -3.2167685826619468, "step": 8417 }, { "epoch": 0.4461876871703814, "grad_norm": 46.5, "kl": 0.678131103515625, "learning_rate": 5e-07, "logits/chosen": -43434256.0, "logits/rejected": 9864092.0, "logps/chosen": -439.9571940104167, "logps/rejected": -469.8697265625, "loss": 0.1898, "rewards/chosen": 0.7372517585754395, "rewards/margins": 3.3865845680236815, "rewards/rejected": -2.649332809448242, "step": 8418 }, { "epoch": 0.4462406911721835, "grad_norm": 62.0, "kl": 0.2678642272949219, "learning_rate": 5e-07, "logits/chosen": -55659736.0, "logits/rejected": -31962166.0, "logps/chosen": -396.16693115234375, "logps/rejected": -197.6732940673828, "loss": 0.3196, "rewards/chosen": 0.3511628210544586, "rewards/margins": 1.8710940182209015, "rewards/rejected": -1.5199311971664429, "step": 8419 }, { "epoch": 0.44629369517398565, "grad_norm": 43.0, "kl": 0.25402069091796875, "learning_rate": 5e-07, "logits/chosen": -33889098.666666664, "logits/rejected": 67602982.4, "logps/chosen": -273.0100911458333, "logps/rejected": -230.25048828125, "loss": 0.3076, "rewards/chosen": -0.176483154296875, "rewards/margins": 1.758323860168457, "rewards/rejected": -1.934807014465332, "step": 8420 }, { "epoch": 0.4463466991757878, "grad_norm": 39.75, "kl": 0.8395509719848633, "learning_rate": 5e-07, "logits/chosen": -14533913.0, "logits/rejected": -20161864.0, "logps/chosen": -328.1092529296875, "logps/rejected": -290.29913330078125, "loss": 0.2192, "rewards/chosen": 1.5715992450714111, "rewards/margins": 3.238105297088623, "rewards/rejected": -1.666506052017212, "step": 8421 }, { "epoch": 0.4463997031775899, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25897976.0, "logits/rejected": -32131840.0, "logps/chosen": -312.121337890625, "logps/rejected": -421.7104797363281, "loss": 0.3244, "rewards/chosen": -0.1345480978488922, "rewards/margins": 2.1143239438533783, "rewards/rejected": -2.2488720417022705, "step": 8422 }, { "epoch": 0.44645270717939206, "grad_norm": 52.5, "kl": 1.209686279296875, "learning_rate": 5e-07, "logits/chosen": -15314596.0, "logits/rejected": -6723578.0, "logps/chosen": -239.91693115234375, "logps/rejected": -134.28277587890625, "loss": 0.3418, "rewards/chosen": 0.7213917970657349, "rewards/margins": 1.5462321639060974, "rewards/rejected": -0.8248403668403625, "step": 8423 }, { "epoch": 0.4465057111811942, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64516000.0, "logits/rejected": -66737284.0, "logps/chosen": -348.98675537109375, "logps/rejected": -487.45318603515625, "loss": 0.3289, "rewards/chosen": -0.06493931263685226, "rewards/margins": 1.8637594655156136, "rewards/rejected": -1.9286987781524658, "step": 8424 }, { "epoch": 0.44655871518299634, "grad_norm": 48.75, "kl": 0.6262855529785156, "learning_rate": 5e-07, "logits/chosen": -35001187.2, "logits/rejected": -32222714.666666668, "logps/chosen": -194.767333984375, "logps/rejected": -361.6343587239583, "loss": 0.3839, "rewards/chosen": 0.06310133934020996, "rewards/margins": 2.179268089930216, "rewards/rejected": -2.1161667505900064, "step": 8425 }, { "epoch": 0.4466117191847985, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49579416.0, "logits/rejected": -28130261.333333332, "logps/chosen": -462.7677917480469, "logps/rejected": -477.0974934895833, "loss": 0.1664, "rewards/chosen": 0.4916839599609375, "rewards/margins": 3.723997433980306, "rewards/rejected": -3.2323134740193686, "step": 8426 }, { "epoch": 0.4466647231866006, "grad_norm": 45.75, "kl": 1.10992431640625, "learning_rate": 5e-07, "logits/chosen": -14568141.333333334, "logits/rejected": -4083564.75, "logps/chosen": -171.07000732421875, "logps/rejected": -100.9866943359375, "loss": 0.4024, "rewards/chosen": 0.09012812376022339, "rewards/margins": 1.8946565985679626, "rewards/rejected": -1.8045284748077393, "step": 8427 }, { "epoch": 0.44671772718840275, "grad_norm": 54.5, "kl": 0.0162353515625, "learning_rate": 5e-07, "logits/chosen": -48205049.6, "logits/rejected": -31100200.0, "logps/chosen": -266.14287109375, "logps/rejected": -381.4134928385417, "loss": 0.3775, "rewards/chosen": -0.0355583667755127, "rewards/margins": 2.229507144292196, "rewards/rejected": -2.2650655110677085, "step": 8428 }, { "epoch": 0.4467707311902049, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30812106.666666668, "logits/rejected": -38557504.0, "logps/chosen": -311.1304931640625, "logps/rejected": -320.0732177734375, "loss": 0.2548, "rewards/chosen": 0.18522467215855917, "rewards/margins": 2.5463983019193015, "rewards/rejected": -2.361173629760742, "step": 8429 }, { "epoch": 0.446823735192007, "grad_norm": 41.75, "kl": 1.8568801879882812, "learning_rate": 5e-07, "logits/chosen": -44650944.0, "logits/rejected": -22180824.0, "logps/chosen": -280.74053955078125, "logps/rejected": -318.14764404296875, "loss": 0.2414, "rewards/chosen": 0.7219018936157227, "rewards/margins": 2.973484992980957, "rewards/rejected": -2.2515830993652344, "step": 8430 }, { "epoch": 0.44687673919380916, "grad_norm": 29.125, "kl": 4.656092643737793, "learning_rate": 5e-07, "logits/chosen": -4156556.0, "logits/rejected": -10441563.333333334, "logps/chosen": -417.433544921875, "logps/rejected": -95.60387166341145, "loss": 0.3865, "rewards/chosen": 0.5692589282989502, "rewards/margins": 1.3316256205240884, "rewards/rejected": -0.7623666922251383, "step": 8431 }, { "epoch": 0.4469297431956113, "grad_norm": 49.5, "kl": 1.8335037231445312, "learning_rate": 5e-07, "logits/chosen": 2905105.3333333335, "logits/rejected": -23019410.0, "logps/chosen": -238.85404459635416, "logps/rejected": -497.6063232421875, "loss": 0.3333, "rewards/chosen": 0.41649027665456134, "rewards/margins": 3.8298794825871787, "rewards/rejected": -3.413389205932617, "step": 8432 }, { "epoch": 0.4469827471974134, "grad_norm": 50.25, "kl": 0.18054962158203125, "learning_rate": 5e-07, "logits/chosen": -44463036.0, "logits/rejected": -58102588.0, "logps/chosen": -335.0550842285156, "logps/rejected": -495.2540283203125, "loss": 0.3483, "rewards/chosen": -0.33137673139572144, "rewards/margins": 2.0435271859169006, "rewards/rejected": -2.374903917312622, "step": 8433 }, { "epoch": 0.4470357511992155, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65857984.0, "logits/rejected": -4527884.0, "logps/chosen": -311.8248291015625, "logps/rejected": -96.07005615234375, "loss": 0.3601, "rewards/chosen": -0.27858734130859375, "rewards/margins": 1.0672805786132813, "rewards/rejected": -1.345867919921875, "step": 8434 }, { "epoch": 0.44708875520101765, "grad_norm": 98.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43992696.0, "logits/rejected": 2005324.125, "logps/chosen": -438.7040710449219, "logps/rejected": -329.10809326171875, "loss": 0.3337, "rewards/chosen": -0.06825409829616547, "rewards/margins": 2.026377573609352, "rewards/rejected": -2.0946316719055176, "step": 8435 }, { "epoch": 0.4471417592028198, "grad_norm": 55.5, "kl": 0.7335720062255859, "learning_rate": 5e-07, "logits/chosen": -9848630.4, "logits/rejected": 3868786.3333333335, "logps/chosen": -237.933203125, "logps/rejected": -336.1412760416667, "loss": 0.3306, "rewards/chosen": 0.5520616054534913, "rewards/margins": 2.166733153661092, "rewards/rejected": -1.6146715482076008, "step": 8436 }, { "epoch": 0.4471947632046219, "grad_norm": 40.75, "kl": 0.4175434112548828, "learning_rate": 5e-07, "logits/chosen": 4903451.0, "logits/rejected": -102630784.0, "logps/chosen": -78.90047454833984, "logps/rejected": -565.9473876953125, "loss": 0.308, "rewards/chosen": 0.16630858182907104, "rewards/margins": 2.304788291454315, "rewards/rejected": -2.138479709625244, "step": 8437 }, { "epoch": 0.44724776720642406, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57423056.0, "logits/rejected": -16985092.0, "logps/chosen": -250.58746337890625, "logps/rejected": -246.38156127929688, "loss": 0.3096, "rewards/chosen": 0.6844058036804199, "rewards/margins": 2.208347201347351, "rewards/rejected": -1.5239413976669312, "step": 8438 }, { "epoch": 0.4473007712082262, "grad_norm": 52.5, "kl": 0.26015281677246094, "learning_rate": 5e-07, "logits/chosen": 13358368.0, "logits/rejected": -39025332.0, "logps/chosen": -302.56195068359375, "logps/rejected": -251.12255859375, "loss": 0.3013, "rewards/chosen": 0.3873756527900696, "rewards/margins": 1.9264679551124573, "rewards/rejected": -1.5390923023223877, "step": 8439 }, { "epoch": 0.44735377521002834, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53201450.666666664, "logits/rejected": -44786873.6, "logps/chosen": -331.9999593098958, "logps/rejected": -251.6449462890625, "loss": 0.2658, "rewards/chosen": 0.4575192928314209, "rewards/margins": 1.9583045482635497, "rewards/rejected": -1.5007852554321288, "step": 8440 }, { "epoch": 0.4474067792118305, "grad_norm": 65.5, "kl": 0.218780517578125, "learning_rate": 5e-07, "logits/chosen": -32234234.666666668, "logits/rejected": -10561044.0, "logps/chosen": -313.19809977213544, "logps/rejected": -170.8525848388672, "loss": 0.3576, "rewards/chosen": 0.5988696813583374, "rewards/margins": 1.5314127802848816, "rewards/rejected": -0.9325430989265442, "step": 8441 }, { "epoch": 0.4474597832136326, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14280258.666666666, "logits/rejected": -21044268.0, "logps/chosen": -270.1814778645833, "logps/rejected": -115.4745864868164, "loss": 0.3103, "rewards/chosen": 0.9074457486470541, "rewards/margins": 1.8831898172696433, "rewards/rejected": -0.9757440686225891, "step": 8442 }, { "epoch": 0.44751278721543475, "grad_norm": 38.25, "kl": 0.4321308135986328, "learning_rate": 5e-07, "logits/chosen": 2134524.0, "logits/rejected": -23632184.0, "logps/chosen": -26.060598373413086, "logps/rejected": -268.86297607421875, "loss": 0.2755, "rewards/chosen": -0.482799232006073, "rewards/margins": 1.876827379067739, "rewards/rejected": -2.359626611073812, "step": 8443 }, { "epoch": 0.4475657912172369, "grad_norm": 49.75, "kl": 0.14630126953125, "learning_rate": 5e-07, "logits/chosen": -22514748.0, "logits/rejected": -50586796.0, "logps/chosen": -319.7972106933594, "logps/rejected": -386.7159423828125, "loss": 0.2091, "rewards/chosen": 1.6104490756988525, "rewards/margins": 3.049316644668579, "rewards/rejected": -1.4388675689697266, "step": 8444 }, { "epoch": 0.447618795219039, "grad_norm": 38.0, "kl": 1.9522409439086914, "learning_rate": 5e-07, "logits/chosen": -2020907.25, "logits/rejected": 23865022.0, "logps/chosen": -59.83594512939453, "logps/rejected": -419.989990234375, "loss": 0.2949, "rewards/chosen": 0.3107825517654419, "rewards/margins": 3.72913658618927, "rewards/rejected": -3.418354034423828, "step": 8445 }, { "epoch": 0.44767179922084116, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30225328.0, "logits/rejected": -29302754.666666668, "logps/chosen": -110.650390625, "logps/rejected": -387.1685384114583, "loss": 0.329, "rewards/chosen": 0.04648616909980774, "rewards/margins": 2.9728831748167672, "rewards/rejected": -2.9263970057169595, "step": 8446 }, { "epoch": 0.4477248032226433, "grad_norm": 52.75, "kl": 1.4358015060424805, "learning_rate": 5e-07, "logits/chosen": -57688858.666666664, "logits/rejected": -10001232.0, "logps/chosen": -296.73406982421875, "logps/rejected": -320.33056640625, "loss": 0.253, "rewards/chosen": 0.6228586832682291, "rewards/margins": 2.3242846171061196, "rewards/rejected": -1.7014259338378905, "step": 8447 }, { "epoch": 0.44777780722444543, "grad_norm": 35.75, "kl": 1.6492843627929688, "learning_rate": 5e-07, "logits/chosen": -16362558.0, "logits/rejected": -53735580.0, "logps/chosen": -502.0259094238281, "logps/rejected": -406.37908935546875, "loss": 0.2747, "rewards/chosen": 0.9099478721618652, "rewards/margins": 3.3478338718414307, "rewards/rejected": -2.4378859996795654, "step": 8448 }, { "epoch": 0.44783081122624757, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49614532.0, "logits/rejected": -38628656.0, "logps/chosen": -482.7388000488281, "logps/rejected": -529.01708984375, "loss": 0.1834, "rewards/chosen": 1.2037668228149414, "rewards/margins": 3.735142230987549, "rewards/rejected": -2.5313754081726074, "step": 8449 }, { "epoch": 0.4478838152280497, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13051096.0, "logits/rejected": -6683425.5, "logps/chosen": -109.15119171142578, "logps/rejected": -228.83639526367188, "loss": 0.3288, "rewards/chosen": 0.13354526460170746, "rewards/margins": 2.2608059495687485, "rewards/rejected": -2.127260684967041, "step": 8450 }, { "epoch": 0.44793681922985185, "grad_norm": 71.0, "kl": 1.1035213470458984, "learning_rate": 5e-07, "logits/chosen": 65652548.571428575, "logits/rejected": 53594288.0, "logps/chosen": -330.47140066964283, "logps/rejected": -1362.735595703125, "loss": 0.3938, "rewards/chosen": 0.30798438617161344, "rewards/margins": 4.622596297945295, "rewards/rejected": -4.314611911773682, "step": 8451 }, { "epoch": 0.447989823231654, "grad_norm": 53.75, "kl": 2.8039779663085938, "learning_rate": 5e-07, "logits/chosen": -25610088.0, "logits/rejected": -8247621.5, "logps/chosen": -274.7747395833333, "logps/rejected": -63.37828826904297, "loss": 0.4955, "rewards/chosen": -0.1863854726155599, "rewards/margins": 2.0018617312113443, "rewards/rejected": -2.1882472038269043, "step": 8452 }, { "epoch": 0.4480428272334561, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42000188.0, "logits/rejected": -17352941.333333332, "logps/chosen": -146.85025024414062, "logps/rejected": -277.97119140625, "loss": 0.2992, "rewards/chosen": 0.008019253611564636, "rewards/margins": 1.42590694129467, "rewards/rejected": -1.4178876876831055, "step": 8453 }, { "epoch": 0.44809583123525826, "grad_norm": 58.75, "kl": 1.7474727630615234, "learning_rate": 5e-07, "logits/chosen": 5103161.5, "logits/rejected": -10249406.0, "logps/chosen": -220.35662841796875, "logps/rejected": -143.4044952392578, "loss": 0.3305, "rewards/chosen": 0.9660794734954834, "rewards/margins": 1.8352279663085938, "rewards/rejected": -0.8691484928131104, "step": 8454 }, { "epoch": 0.4481488352370604, "grad_norm": 67.5, "kl": 0.8640518188476562, "learning_rate": 5e-07, "logits/chosen": -37181587.2, "logits/rejected": -45413344.0, "logps/chosen": -454.19609375, "logps/rejected": -301.43731689453125, "loss": 0.2692, "rewards/chosen": 0.7751709461212158, "rewards/margins": 3.3070538679758705, "rewards/rejected": -2.531882921854655, "step": 8455 }, { "epoch": 0.44820183923886253, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95119820.8, "logits/rejected": -24525045.333333332, "logps/chosen": -385.2005615234375, "logps/rejected": -451.711181640625, "loss": 0.2982, "rewards/chosen": 0.38567276000976564, "rewards/margins": 2.771443557739258, "rewards/rejected": -2.385770797729492, "step": 8456 }, { "epoch": 0.44825484324066467, "grad_norm": 44.75, "kl": 1.1099720001220703, "learning_rate": 5e-07, "logits/chosen": -37162140.0, "logits/rejected": -25286376.0, "logps/chosen": -175.70169067382812, "logps/rejected": -464.4602355957031, "loss": 0.3704, "rewards/chosen": -0.203277587890625, "rewards/margins": 1.9606118202209473, "rewards/rejected": -2.1638894081115723, "step": 8457 }, { "epoch": 0.4483078472424668, "grad_norm": 80.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41776293.333333336, "logits/rejected": -9107497.0, "logps/chosen": -385.434326171875, "logps/rejected": -117.98243713378906, "loss": 0.3257, "rewards/chosen": 0.4485129515329997, "rewards/margins": 2.4771827856699624, "rewards/rejected": -2.028669834136963, "step": 8458 }, { "epoch": 0.44836085124426894, "grad_norm": 38.25, "kl": 0.0359954833984375, "learning_rate": 5e-07, "logits/chosen": -33616122.666666664, "logits/rejected": 1200630.4, "logps/chosen": -257.14023844401044, "logps/rejected": -473.54072265625, "loss": 0.2589, "rewards/chosen": 0.6410939693450928, "rewards/margins": 2.850767660140991, "rewards/rejected": -2.2096736907958983, "step": 8459 }, { "epoch": 0.4484138552460711, "grad_norm": 39.25, "kl": 0.8143672943115234, "learning_rate": 5e-07, "logits/chosen": -60539240.0, "logits/rejected": 3797180.0, "logps/chosen": -201.784912109375, "logps/rejected": -213.9667510986328, "loss": 0.2941, "rewards/chosen": 0.8280141353607178, "rewards/margins": 2.0727732181549072, "rewards/rejected": -1.2447590827941895, "step": 8460 }, { "epoch": 0.4484668592478732, "grad_norm": 49.75, "kl": 0.5986385345458984, "learning_rate": 5e-07, "logits/chosen": -9097222.0, "logits/rejected": -6109216.8, "logps/chosen": -370.7384440104167, "logps/rejected": -376.822412109375, "loss": 0.2222, "rewards/chosen": 0.8975334167480469, "rewards/margins": 2.851603698730469, "rewards/rejected": -1.9540702819824218, "step": 8461 }, { "epoch": 0.44851986324967535, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92178432.0, "logits/rejected": -650085.45, "logps/chosen": -361.0760904947917, "logps/rejected": -328.943896484375, "loss": 0.2454, "rewards/chosen": 0.35009511311848956, "rewards/margins": 2.3187276204427083, "rewards/rejected": -1.9686325073242188, "step": 8462 }, { "epoch": 0.4485728672514775, "grad_norm": 41.0, "kl": 0.3683280944824219, "learning_rate": 5e-07, "logits/chosen": -17947915.2, "logits/rejected": -17056176.0, "logps/chosen": -114.86490478515626, "logps/rejected": -152.1170654296875, "loss": 0.435, "rewards/chosen": 0.183999764919281, "rewards/margins": 0.71582320133845, "rewards/rejected": -0.5318234364191691, "step": 8463 }, { "epoch": 0.44862587125327963, "grad_norm": 40.5, "kl": 0.2173290252685547, "learning_rate": 5e-07, "logits/chosen": -1938925.0, "logits/rejected": -29216236.0, "logps/chosen": -265.9404296875, "logps/rejected": -473.77264404296875, "loss": 0.243, "rewards/chosen": 0.4942653775215149, "rewards/margins": 3.0258613228797913, "rewards/rejected": -2.5315959453582764, "step": 8464 }, { "epoch": 0.44867887525508177, "grad_norm": 31.0, "kl": 1.0152020454406738, "learning_rate": 5e-07, "logits/chosen": -13393993.6, "logits/rejected": 4120527.3333333335, "logps/chosen": -81.84259033203125, "logps/rejected": -280.4149983723958, "loss": 0.2846, "rewards/chosen": 0.6587869167327881, "rewards/margins": 2.9351385275522865, "rewards/rejected": -2.2763516108194985, "step": 8465 }, { "epoch": 0.4487318792568839, "grad_norm": 54.25, "kl": 2.2134838104248047, "learning_rate": 5e-07, "logits/chosen": -25371397.333333332, "logits/rejected": -26034376.0, "logps/chosen": -434.7870686848958, "logps/rejected": -343.467529296875, "loss": 0.3174, "rewards/chosen": 0.8411996364593506, "rewards/margins": 2.717857241630554, "rewards/rejected": -1.8766576051712036, "step": 8466 }, { "epoch": 0.44878488325868604, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40078777.6, "logits/rejected": -31860154.666666668, "logps/chosen": -345.485888671875, "logps/rejected": -279.9152425130208, "loss": 0.3442, "rewards/chosen": 0.2351544141769409, "rewards/margins": 2.2220661878585815, "rewards/rejected": -1.9869117736816406, "step": 8467 }, { "epoch": 0.4488378872604882, "grad_norm": 30.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6764251.5, "logits/rejected": -16360672.0, "logps/chosen": -134.37442016601562, "logps/rejected": -417.814697265625, "loss": 0.2926, "rewards/chosen": 0.11092966794967651, "rewards/margins": 2.29816335439682, "rewards/rejected": -2.1872336864471436, "step": 8468 }, { "epoch": 0.4488908912622903, "grad_norm": 46.75, "kl": 2.4330291748046875, "learning_rate": 5e-07, "logits/chosen": -37861475.2, "logits/rejected": 6567056.0, "logps/chosen": -186.01126708984376, "logps/rejected": -163.38958740234375, "loss": 0.4284, "rewards/chosen": -0.13539338111877441, "rewards/margins": 1.6886668999989827, "rewards/rejected": -1.824060281117757, "step": 8469 }, { "epoch": 0.44894389526409245, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39169212.0, "logits/rejected": -32761397.333333332, "logps/chosen": -200.6786651611328, "logps/rejected": -377.92822265625, "loss": 0.1341, "rewards/chosen": 0.7917623519897461, "rewards/margins": 3.820263226826986, "rewards/rejected": -3.0285008748372397, "step": 8470 }, { "epoch": 0.4489968992658946, "grad_norm": 29.5, "kl": 0.008016586303710938, "learning_rate": 5e-07, "logits/chosen": -1702089.25, "logits/rejected": -39629474.666666664, "logps/chosen": -38.26493453979492, "logps/rejected": -266.66782633463544, "loss": 0.1692, "rewards/chosen": 0.9642817378044128, "rewards/margins": 3.3166364232699075, "rewards/rejected": -2.3523546854654946, "step": 8471 }, { "epoch": 0.4490499032676967, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15161686.0, "logits/rejected": 450906.2916666667, "logps/chosen": -149.15887451171875, "logps/rejected": -266.5938720703125, "loss": 0.2637, "rewards/chosen": 0.48327791690826416, "rewards/margins": 2.1430983146031695, "rewards/rejected": -1.6598203976949055, "step": 8472 }, { "epoch": 0.44910290726949886, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27199288.0, "logits/rejected": -31344784.0, "logps/chosen": -532.8641967773438, "logps/rejected": -351.4225667317708, "loss": 0.1346, "rewards/chosen": 1.7594940662384033, "rewards/margins": 4.070953289667765, "rewards/rejected": -2.311459223429362, "step": 8473 }, { "epoch": 0.449155911271301, "grad_norm": 60.25, "kl": 1.5643978118896484, "learning_rate": 5e-07, "logits/chosen": -56552037.333333336, "logits/rejected": -16333415.0, "logps/chosen": -531.5736083984375, "logps/rejected": -220.82357788085938, "loss": 0.4386, "rewards/chosen": 0.2705986102422078, "rewards/margins": 1.3993271191914876, "rewards/rejected": -1.1287285089492798, "step": 8474 }, { "epoch": 0.44920891527310314, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33489802.0, "logits/rejected": -30656240.0, "logps/chosen": -223.6761016845703, "logps/rejected": -408.3109944661458, "loss": 0.2347, "rewards/chosen": 0.3050979673862457, "rewards/margins": 2.168034940958023, "rewards/rejected": -1.8629369735717773, "step": 8475 }, { "epoch": 0.4492619192749053, "grad_norm": 42.5, "kl": 0.6392478942871094, "learning_rate": 5e-07, "logits/chosen": 6100608.0, "logits/rejected": -77750752.0, "logps/chosen": -93.68043518066406, "logps/rejected": -544.5891723632812, "loss": 0.3758, "rewards/chosen": -0.3559979796409607, "rewards/margins": 1.7347341179847717, "rewards/rejected": -2.0907320976257324, "step": 8476 }, { "epoch": 0.4493149232767074, "grad_norm": 48.75, "kl": 0.8921184539794922, "learning_rate": 5e-07, "logits/chosen": -17938788.0, "logits/rejected": -5665434.4, "logps/chosen": -455.8478597005208, "logps/rejected": -402.77646484375, "loss": 0.2036, "rewards/chosen": 0.9480361938476562, "rewards/margins": 2.712099075317383, "rewards/rejected": -1.7640628814697266, "step": 8477 }, { "epoch": 0.44936792727850955, "grad_norm": 55.0, "kl": 2.6572608947753906, "learning_rate": 5e-07, "logits/chosen": 84630.0, "logits/rejected": -38001228.0, "logps/chosen": -291.4240315755208, "logps/rejected": -779.8683471679688, "loss": 0.3755, "rewards/chosen": 0.47691575686136883, "rewards/margins": 3.7052437464396157, "rewards/rejected": -3.228327989578247, "step": 8478 }, { "epoch": 0.4494209312803117, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -25388150.0, "logps/rejected": -351.69854736328125, "loss": 0.1188, "rewards/rejected": -2.4448397159576416, "step": 8479 }, { "epoch": 0.4494739352821138, "grad_norm": 39.0, "kl": 1.355630874633789, "learning_rate": 5e-07, "logits/chosen": 8447627.333333334, "logits/rejected": -14428110.4, "logps/chosen": -245.53438313802084, "logps/rejected": -303.17216796875, "loss": 0.235, "rewards/chosen": 0.5664602915445963, "rewards/margins": 2.9240782419840494, "rewards/rejected": -2.357617950439453, "step": 8480 }, { "epoch": 0.44952693928391596, "grad_norm": 47.0, "kl": 1.9826221466064453, "learning_rate": 5e-07, "logits/chosen": -8575288.0, "logits/rejected": -25387952.0, "logps/chosen": -630.613818359375, "logps/rejected": -294.08856201171875, "loss": 0.1946, "rewards/chosen": 1.5583124160766602, "rewards/margins": 3.5126053492228193, "rewards/rejected": -1.954292933146159, "step": 8481 }, { "epoch": 0.4495799432857181, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5801918.8, "logits/rejected": -74829130.66666667, "logps/chosen": -517.18349609375, "logps/rejected": -744.9973958333334, "loss": 0.3096, "rewards/chosen": 0.25145387649536133, "rewards/margins": 3.060265064239502, "rewards/rejected": -2.8088111877441406, "step": 8482 }, { "epoch": 0.44963294728752023, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22659766.0, "logits/rejected": -15749769.0, "logps/chosen": -321.46685791015625, "logps/rejected": -239.8152618408203, "loss": 0.2586, "rewards/chosen": 0.8454241156578064, "rewards/margins": 2.6512851119041443, "rewards/rejected": -1.805860996246338, "step": 8483 }, { "epoch": 0.4496859512893223, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58844364.8, "logits/rejected": -36737317.333333336, "logps/chosen": -257.278125, "logps/rejected": -292.5769856770833, "loss": 0.3117, "rewards/chosen": 0.3522913932800293, "rewards/margins": 2.6864666620890296, "rewards/rejected": -2.3341752688090005, "step": 8484 }, { "epoch": 0.44973895529112445, "grad_norm": 46.75, "kl": 0.9199981689453125, "learning_rate": 5e-07, "logits/chosen": -49134124.8, "logits/rejected": 1827398.0, "logps/chosen": -337.1013671875, "logps/rejected": -61.417215983072914, "loss": 0.3752, "rewards/chosen": 0.38379220962524413, "rewards/margins": 1.3051389535268147, "rewards/rejected": -0.9213467439015707, "step": 8485 }, { "epoch": 0.4497919592929266, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38069776.0, "logits/rejected": -32009516.8, "logps/chosen": -248.376220703125, "logps/rejected": -483.60234375, "loss": 0.2248, "rewards/chosen": 0.34711917241414386, "rewards/margins": 2.9685806592305504, "rewards/rejected": -2.6214614868164063, "step": 8486 }, { "epoch": 0.44984496329472873, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38067864.0, "logits/rejected": -3951841.2, "logps/chosen": -436.2327473958333, "logps/rejected": -168.53587646484374, "loss": 0.3361, "rewards/chosen": -0.1217127541700999, "rewards/margins": 1.2890078802903493, "rewards/rejected": -1.4107206344604493, "step": 8487 }, { "epoch": 0.44989796729653087, "grad_norm": 51.75, "kl": 2.4529991149902344, "learning_rate": 5e-07, "logits/chosen": -21213321.333333332, "logits/rejected": -38053560.0, "logps/chosen": -245.0300496419271, "logps/rejected": -478.48223876953125, "loss": 0.4188, "rewards/chosen": 0.1380484402179718, "rewards/margins": 3.1647911369800568, "rewards/rejected": -3.026742696762085, "step": 8488 }, { "epoch": 0.449950971298333, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42443440.0, "logits/rejected": -12536389.333333334, "logps/chosen": -416.34990234375, "logps/rejected": -243.7157999674479, "loss": 0.3067, "rewards/chosen": 0.447706937789917, "rewards/margins": 2.2726816336313886, "rewards/rejected": -1.8249746958414714, "step": 8489 }, { "epoch": 0.45000397530013514, "grad_norm": 41.25, "kl": 0.32691097259521484, "learning_rate": 5e-07, "logits/chosen": 6914772.0, "logits/rejected": -11984057.142857144, "logps/chosen": -44.354881286621094, "logps/rejected": -452.98779296875, "loss": 0.179, "rewards/chosen": -0.1849357634782791, "rewards/margins": 2.052967150296484, "rewards/rejected": -2.237902913774763, "step": 8490 }, { "epoch": 0.4500569793019373, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4222704.0, "logits/rejected": 44739045.333333336, "logps/chosen": -291.4632568359375, "logps/rejected": -340.2422688802083, "loss": 0.2481, "rewards/chosen": -0.13753509521484375, "rewards/margins": 1.6380011240641277, "rewards/rejected": -1.7755362192789714, "step": 8491 }, { "epoch": 0.4501099833037394, "grad_norm": 41.25, "kl": 0.08879852294921875, "learning_rate": 5e-07, "logits/chosen": -30178594.666666668, "logits/rejected": -24530592.0, "logps/chosen": -220.58406575520834, "logps/rejected": -316.856103515625, "loss": 0.2746, "rewards/chosen": -0.20612895488739014, "rewards/margins": 1.9818246126174928, "rewards/rejected": -2.187953567504883, "step": 8492 }, { "epoch": 0.45016298730554155, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20839772.0, "logits/rejected": -13192754.666666666, "logps/chosen": -414.83306884765625, "logps/rejected": -274.2655843098958, "loss": 0.2133, "rewards/chosen": 0.6681464910507202, "rewards/margins": 2.566159923871358, "rewards/rejected": -1.898013432820638, "step": 8493 }, { "epoch": 0.4502159913073437, "grad_norm": 92.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44192448.0, "logits/rejected": -30851720.0, "logps/chosen": -452.64951171875, "logps/rejected": -302.41221110026044, "loss": 0.3422, "rewards/chosen": 0.12541444301605226, "rewards/margins": 2.423375995953878, "rewards/rejected": -2.2979615529378257, "step": 8494 }, { "epoch": 0.4502689953091458, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42413360.0, "logits/rejected": -58166736.0, "logps/chosen": -285.34503173828125, "logps/rejected": -301.6102294921875, "loss": 0.3494, "rewards/chosen": -0.07032528519630432, "rewards/margins": 1.8198233544826508, "rewards/rejected": -1.890148639678955, "step": 8495 }, { "epoch": 0.45032199931094796, "grad_norm": 45.75, "kl": 0.7999458312988281, "learning_rate": 5e-07, "logits/chosen": -47415642.666666664, "logits/rejected": 654697.75, "logps/chosen": -290.9206949869792, "logps/rejected": -323.2049560546875, "loss": 0.3301, "rewards/chosen": 0.44186751047770184, "rewards/margins": 2.8321966330210366, "rewards/rejected": -2.390329122543335, "step": 8496 }, { "epoch": 0.4503750033127501, "grad_norm": 48.5, "kl": 1.9648380279541016, "learning_rate": 5e-07, "logits/chosen": -15962040.0, "logits/rejected": -51987418.666666664, "logps/chosen": -282.051220703125, "logps/rejected": -331.97617594401044, "loss": 0.3789, "rewards/chosen": 0.2283561944961548, "rewards/margins": 2.7189342101415, "rewards/rejected": -2.490578015645345, "step": 8497 }, { "epoch": 0.45042800731455224, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9500570.666666666, "logits/rejected": -1472897.4, "logps/chosen": -151.6975301106771, "logps/rejected": -207.469091796875, "loss": 0.301, "rewards/chosen": 0.08434581756591797, "rewards/margins": 2.037491226196289, "rewards/rejected": -1.953145408630371, "step": 8498 }, { "epoch": 0.4504810113163544, "grad_norm": 31.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12226417.0, "logits/rejected": -22704608.0, "logps/chosen": -43.84814453125, "logps/rejected": -223.1287638346354, "loss": 0.2394, "rewards/chosen": 0.6754644513130188, "rewards/margins": 2.264219105243683, "rewards/rejected": -1.588754653930664, "step": 8499 }, { "epoch": 0.4505340153181565, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40065286.4, "logits/rejected": -4289116.0, "logps/chosen": -247.812548828125, "logps/rejected": -394.5306396484375, "loss": 0.4141, "rewards/chosen": -0.055743408203125, "rewards/margins": 1.1339284261067708, "rewards/rejected": -1.1896718343098958, "step": 8500 }, { "epoch": 0.45058701931995865, "grad_norm": 47.5, "kl": 0.3018836975097656, "learning_rate": 5e-07, "logits/chosen": -33022630.0, "logits/rejected": -18086112.0, "logps/chosen": -293.869140625, "logps/rejected": -348.0916748046875, "loss": 0.2456, "rewards/chosen": 0.42870837450027466, "rewards/margins": 3.1759265065193176, "rewards/rejected": -2.747218132019043, "step": 8501 }, { "epoch": 0.4506400233217608, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33207498.666666668, "logits/rejected": -24029302.0, "logps/chosen": -367.9087727864583, "logps/rejected": -185.897705078125, "loss": 0.2903, "rewards/chosen": 0.8251292705535889, "rewards/margins": 3.067028760910034, "rewards/rejected": -2.2418994903564453, "step": 8502 }, { "epoch": 0.4506930273235629, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31387484.8, "logits/rejected": -12285616.0, "logps/chosen": -295.47412109375, "logps/rejected": -120.5427958170573, "loss": 0.4005, "rewards/chosen": -0.04935165643692017, "rewards/margins": 1.2443594495455426, "rewards/rejected": -1.2937111059824626, "step": 8503 }, { "epoch": 0.45074603132536506, "grad_norm": 42.25, "kl": 0.6335601806640625, "learning_rate": 5e-07, "logits/chosen": 10397432.0, "logits/rejected": -79429792.0, "logps/chosen": -295.3715006510417, "logps/rejected": -403.0482421875, "loss": 0.2488, "rewards/chosen": 0.6116628249486288, "rewards/margins": 2.4544328292210897, "rewards/rejected": -1.8427700042724608, "step": 8504 }, { "epoch": 0.4507990353271672, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23876877.333333332, "logits/rejected": -29683098.0, "logps/chosen": -223.26875813802084, "logps/rejected": -504.85693359375, "loss": 0.4083, "rewards/chosen": 0.03768173853556315, "rewards/margins": 3.1633338133494058, "rewards/rejected": -3.1256520748138428, "step": 8505 }, { "epoch": 0.45085203932896933, "grad_norm": 39.75, "kl": 4.952964782714844, "learning_rate": 5e-07, "logits/chosen": -15750710.4, "logits/rejected": -8219537.333333333, "logps/chosen": -161.00775146484375, "logps/rejected": -380.0684000651042, "loss": 0.3436, "rewards/chosen": 0.5568480491638184, "rewards/margins": 2.7973429361979165, "rewards/rejected": -2.240494887034098, "step": 8506 }, { "epoch": 0.45090504333077147, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -97815285.33333333, "logits/rejected": 294779.0, "logps/chosen": -1134.1344401041667, "logps/rejected": -104.8970458984375, "loss": 0.2798, "rewards/chosen": 1.2725892066955566, "rewards/margins": 2.7540574073791504, "rewards/rejected": -1.4814682006835938, "step": 8507 }, { "epoch": 0.4509580473325736, "grad_norm": 51.5, "kl": 1.5088882446289062, "learning_rate": 5e-07, "logits/chosen": -1508765.0, "logits/rejected": -3797273.3333333335, "logps/chosen": -233.1854248046875, "logps/rejected": -141.52058919270834, "loss": 0.3167, "rewards/chosen": 0.7322279930114746, "rewards/margins": 2.045201810201009, "rewards/rejected": -1.3129738171895344, "step": 8508 }, { "epoch": 0.45101105133437575, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7930945.333333333, "logits/rejected": 3909791.2, "logps/chosen": -344.5758870442708, "logps/rejected": -152.8842041015625, "loss": 0.2472, "rewards/chosen": 0.6613348325093588, "rewards/margins": 2.3471813519795734, "rewards/rejected": -1.6858465194702148, "step": 8509 }, { "epoch": 0.4510640553361779, "grad_norm": 30.0, "kl": 2.3579940795898438, "learning_rate": 5e-07, "logits/chosen": -8140337.333333333, "logits/rejected": -22806000.0, "logps/chosen": -146.67635091145834, "logps/rejected": -358.7438720703125, "loss": 0.2046, "rewards/chosen": 0.672165314356486, "rewards/margins": 2.9396623770395913, "rewards/rejected": -2.2674970626831055, "step": 8510 }, { "epoch": 0.45111705933798, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35667216.0, "logits/rejected": -14134110.0, "logps/chosen": -312.43768310546875, "logps/rejected": -131.60328674316406, "loss": 0.2835, "rewards/chosen": 0.5286680459976196, "rewards/margins": 2.1281241178512573, "rewards/rejected": -1.5994560718536377, "step": 8511 }, { "epoch": 0.45117006333978216, "grad_norm": 52.0, "kl": 0.014575958251953125, "learning_rate": 5e-07, "logits/chosen": -65612243.2, "logits/rejected": -37093602.666666664, "logps/chosen": -310.927734375, "logps/rejected": -548.8503824869791, "loss": 0.2861, "rewards/chosen": 0.5189360618591309, "rewards/margins": 2.5443470319112143, "rewards/rejected": -2.0254109700520835, "step": 8512 }, { "epoch": 0.4512230673415843, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28302988.0, "logits/rejected": -21749982.0, "logps/chosen": -439.99835205078125, "logps/rejected": -382.3587646484375, "loss": 0.3165, "rewards/chosen": 0.6099153757095337, "rewards/margins": 1.759528398513794, "rewards/rejected": -1.1496130228042603, "step": 8513 }, { "epoch": 0.45127607134338643, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12434010.666666666, "logits/rejected": 54394872.0, "logps/chosen": -352.6883138020833, "logps/rejected": -381.5516357421875, "loss": 0.443, "rewards/chosen": 0.06078724066416422, "rewards/margins": 1.0238590637842815, "rewards/rejected": -0.9630718231201172, "step": 8514 }, { "epoch": 0.45132907534518857, "grad_norm": 47.75, "kl": 0.9244270324707031, "learning_rate": 5e-07, "logits/chosen": -43078550.4, "logits/rejected": -30891757.333333332, "logps/chosen": -311.429296875, "logps/rejected": -360.206298828125, "loss": 0.3577, "rewards/chosen": 0.09637528657913208, "rewards/margins": 1.748734136422475, "rewards/rejected": -1.652358849843343, "step": 8515 }, { "epoch": 0.4513820793469907, "grad_norm": 53.25, "kl": 0.48851776123046875, "learning_rate": 5e-07, "logits/chosen": -51461196.0, "logits/rejected": -41368400.0, "logps/chosen": -367.8828430175781, "logps/rejected": -436.1274719238281, "loss": 0.1934, "rewards/chosen": 0.9956787824630737, "rewards/margins": 3.421526312828064, "rewards/rejected": -2.4258475303649902, "step": 8516 }, { "epoch": 0.45143508334879284, "grad_norm": 53.5, "kl": 0.3014240264892578, "learning_rate": 5e-07, "logits/chosen": -49623104.0, "logits/rejected": 1295958.25, "logps/chosen": -299.10348074776783, "logps/rejected": -338.67852783203125, "loss": 0.2939, "rewards/chosen": 0.7993759427751813, "rewards/margins": 3.703148058482579, "rewards/rejected": -2.9037721157073975, "step": 8517 }, { "epoch": 0.451488087350595, "grad_norm": 51.75, "kl": 0.3909921646118164, "learning_rate": 5e-07, "logits/chosen": -99061.8, "logits/rejected": -101176832.0, "logps/chosen": -36.660247802734375, "logps/rejected": -231.58162434895834, "loss": 0.416, "rewards/chosen": 0.0712015151977539, "rewards/margins": 1.0700269063313803, "rewards/rejected": -0.9988253911336263, "step": 8518 }, { "epoch": 0.4515410913523971, "grad_norm": 43.25, "kl": 0.9316539764404297, "learning_rate": 5e-07, "logits/chosen": -34253888.0, "logits/rejected": -6057367.5, "logps/chosen": -363.1459045410156, "logps/rejected": -409.50384521484375, "loss": 0.2916, "rewards/chosen": 0.2602379620075226, "rewards/margins": 2.85860612988472, "rewards/rejected": -2.5983681678771973, "step": 8519 }, { "epoch": 0.45159409535419925, "grad_norm": 45.5, "kl": 1.029764175415039, "learning_rate": 5e-07, "logits/chosen": -21958593.333333332, "logits/rejected": -33313936.0, "logps/chosen": -306.12457275390625, "logps/rejected": -218.2490692138672, "loss": 0.3611, "rewards/chosen": 0.5193929274876913, "rewards/margins": 2.314594109853109, "rewards/rejected": -1.7952011823654175, "step": 8520 }, { "epoch": 0.4516470993560014, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10409890.0, "logits/rejected": -9289426.285714285, "logps/chosen": -169.79315185546875, "logps/rejected": -219.179443359375, "loss": 0.2556, "rewards/chosen": -0.6937012076377869, "rewards/margins": 0.931704648903438, "rewards/rejected": -1.6254058565412248, "step": 8521 }, { "epoch": 0.45170010335780353, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30232306.666666668, "logits/rejected": -28807564.8, "logps/chosen": -847.8328450520834, "logps/rejected": -395.613330078125, "loss": 0.2035, "rewards/chosen": 0.6451582511266073, "rewards/margins": 3.054503115018209, "rewards/rejected": -2.4093448638916017, "step": 8522 }, { "epoch": 0.45175310735960567, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56873561.6, "logits/rejected": 3992073.0, "logps/chosen": -318.3960205078125, "logps/rejected": -75.05049641927083, "loss": 0.4412, "rewards/chosen": 0.1122802734375, "rewards/margins": 0.5672533750534058, "rewards/rejected": -0.45497310161590576, "step": 8523 }, { "epoch": 0.4518061113614078, "grad_norm": 58.5, "kl": 1.5763063430786133, "learning_rate": 5e-07, "logits/chosen": -18872632.0, "logits/rejected": -21065905.333333332, "logps/chosen": -380.1150146484375, "logps/rejected": -260.6634114583333, "loss": 0.3585, "rewards/chosen": 0.4239644527435303, "rewards/margins": 1.6871496359507243, "rewards/rejected": -1.263185183207194, "step": 8524 }, { "epoch": 0.45185911536320994, "grad_norm": 41.75, "kl": 2.3920211791992188, "learning_rate": 5e-07, "logits/chosen": -8019595.2, "logits/rejected": -19979564.0, "logps/chosen": -224.2188232421875, "logps/rejected": -287.0653483072917, "loss": 0.3793, "rewards/chosen": 0.3466529846191406, "rewards/margins": 2.199835141499837, "rewards/rejected": -1.8531821568806965, "step": 8525 }, { "epoch": 0.4519121193650121, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26292076.0, "logits/rejected": -27448837.333333332, "logps/chosen": -205.62164306640625, "logps/rejected": -275.2138671875, "loss": 0.3618, "rewards/chosen": -0.12563134729862213, "rewards/margins": 0.9405994266271591, "rewards/rejected": -1.0662307739257812, "step": 8526 }, { "epoch": 0.4519651233668142, "grad_norm": 49.75, "kl": 2.742471694946289, "learning_rate": 5e-07, "logits/chosen": -28224778.0, "logits/rejected": -6673559.5, "logps/chosen": -255.40249633789062, "logps/rejected": -358.6302185058594, "loss": 0.3805, "rewards/chosen": 0.34383001923561096, "rewards/margins": 2.2485168278217316, "rewards/rejected": -1.9046868085861206, "step": 8527 }, { "epoch": 0.45201812736861635, "grad_norm": 45.5, "kl": 1.4210071563720703, "learning_rate": 5e-07, "logits/chosen": -37674803.2, "logits/rejected": 11508569.333333334, "logps/chosen": -208.682080078125, "logps/rejected": -227.35713704427084, "loss": 0.3946, "rewards/chosen": 0.36954941749572756, "rewards/margins": 1.5309456825256347, "rewards/rejected": -1.1613962650299072, "step": 8528 }, { "epoch": 0.4520711313704185, "grad_norm": 47.25, "kl": 0.7463550567626953, "learning_rate": 5e-07, "logits/chosen": -6852922.666666667, "logits/rejected": 295126835.2, "logps/chosen": -211.88859049479166, "logps/rejected": -361.1209228515625, "loss": 0.1842, "rewards/chosen": 1.1531086762746174, "rewards/margins": 3.0263418992360434, "rewards/rejected": -1.8732332229614257, "step": 8529 }, { "epoch": 0.4521241353722206, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41240496.0, "logits/rejected": -34291132.0, "logps/chosen": -317.78045654296875, "logps/rejected": -584.8562622070312, "loss": 0.2161, "rewards/chosen": 0.7056423425674438, "rewards/margins": 3.3495911359786987, "rewards/rejected": -2.643948793411255, "step": 8530 }, { "epoch": 0.45217713937402276, "grad_norm": 46.5, "kl": 0.9208002090454102, "learning_rate": 5e-07, "logits/chosen": -1779922.6666666667, "logits/rejected": -61545024.0, "logps/chosen": -122.94422403971355, "logps/rejected": -517.96416015625, "loss": 0.2529, "rewards/chosen": 0.37634936968485516, "rewards/margins": 2.5624364058176674, "rewards/rejected": -2.1860870361328124, "step": 8531 }, { "epoch": 0.4522301433758249, "grad_norm": 58.0, "kl": 1.3689289093017578, "learning_rate": 5e-07, "logits/chosen": -32664490.666666668, "logits/rejected": -1706415.0, "logps/chosen": -302.5133463541667, "logps/rejected": -36.0489501953125, "loss": 0.367, "rewards/chosen": 0.7012588977813721, "rewards/margins": 1.4253254532814026, "rewards/rejected": -0.7240665555000305, "step": 8532 }, { "epoch": 0.45228314737762704, "grad_norm": 46.25, "kl": 0.7083396911621094, "learning_rate": 5e-07, "logits/chosen": -22525949.333333332, "logits/rejected": 56927788.8, "logps/chosen": -1333.4730631510417, "logps/rejected": -314.5653076171875, "loss": 0.1891, "rewards/chosen": 2.1106160481770835, "rewards/margins": 3.7041418393452963, "rewards/rejected": -1.5935257911682128, "step": 8533 }, { "epoch": 0.4523361513794292, "grad_norm": 49.0, "kl": 0.5067901611328125, "learning_rate": 5e-07, "logits/chosen": -9602721.6, "logits/rejected": -21203846.666666668, "logps/chosen": -233.3330810546875, "logps/rejected": -250.357177734375, "loss": 0.3763, "rewards/chosen": 0.1210558295249939, "rewards/margins": 1.5970540722211202, "rewards/rejected": -1.4759982426961262, "step": 8534 }, { "epoch": 0.45238915538123126, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -605940.65, "logits/rejected": -27653653.333333332, "logps/chosen": -151.44898681640626, "logps/rejected": -400.8041585286458, "loss": 0.2496, "rewards/chosen": 0.7336216926574707, "rewards/margins": 2.8694767316182457, "rewards/rejected": -2.135855038960775, "step": 8535 }, { "epoch": 0.4524421593830334, "grad_norm": 46.0, "kl": 0.20987701416015625, "learning_rate": 5e-07, "logits/chosen": -45899667.2, "logits/rejected": -70132266.66666667, "logps/chosen": -691.469091796875, "logps/rejected": -633.1355794270834, "loss": 0.2958, "rewards/chosen": 0.6120364189147949, "rewards/margins": 5.189697933197022, "rewards/rejected": -4.577661514282227, "step": 8536 }, { "epoch": 0.45249516338483553, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35939124.0, "logits/rejected": -15943757.333333334, "logps/chosen": -90.17460632324219, "logps/rejected": -445.570556640625, "loss": 0.1963, "rewards/chosen": -0.03931274637579918, "rewards/margins": 2.7845451968411603, "rewards/rejected": -2.8238579432169595, "step": 8537 }, { "epoch": 0.45254816738663767, "grad_norm": 50.0, "kl": 2.4888839721679688, "learning_rate": 5e-07, "logits/chosen": -19191164.0, "logits/rejected": -12617214.0, "logps/chosen": -276.2084045410156, "logps/rejected": -402.7324523925781, "loss": 0.2763, "rewards/chosen": 0.5465547442436218, "rewards/margins": 2.77704781293869, "rewards/rejected": -2.2304930686950684, "step": 8538 }, { "epoch": 0.4526011713884398, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -138280096.0, "logits/rejected": -9060346.4, "logps/chosen": -494.1987711588542, "logps/rejected": -228.877294921875, "loss": 0.3648, "rewards/chosen": 0.08810476462046306, "rewards/margins": 1.1120006958643596, "rewards/rejected": -1.0238959312438964, "step": 8539 }, { "epoch": 0.45265417539024194, "grad_norm": 63.5, "kl": 1.1555900573730469, "learning_rate": 5e-07, "logits/chosen": -16233606.4, "logits/rejected": -6326502.666666667, "logps/chosen": -375.0582763671875, "logps/rejected": -310.880126953125, "loss": 0.2877, "rewards/chosen": 0.6062251091003418, "rewards/margins": 2.506049092610677, "rewards/rejected": -1.8998239835103352, "step": 8540 }, { "epoch": 0.4527071793920441, "grad_norm": 48.0, "kl": 0.029979705810546875, "learning_rate": 5e-07, "logits/chosen": -36558572.0, "logits/rejected": -15090950.0, "logps/chosen": -368.0049133300781, "logps/rejected": -200.06399536132812, "loss": 0.3574, "rewards/chosen": 0.19993898272514343, "rewards/margins": 1.620486706495285, "rewards/rejected": -1.4205477237701416, "step": 8541 }, { "epoch": 0.4527601833938462, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66562768.0, "logits/rejected": -30466032.0, "logps/chosen": -415.8503011067708, "logps/rejected": -392.368408203125, "loss": 0.2723, "rewards/chosen": 0.05821432669957479, "rewards/margins": 1.9971629579861958, "rewards/rejected": -1.938948631286621, "step": 8542 }, { "epoch": 0.45281318739564835, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22530256.0, "logits/rejected": 545161.6, "logps/chosen": -113.19703165690105, "logps/rejected": -309.53359375, "loss": 0.2977, "rewards/chosen": 0.11784656842549641, "rewards/margins": 1.71721666653951, "rewards/rejected": -1.5993700981140138, "step": 8543 }, { "epoch": 0.4528661913974505, "grad_norm": 63.0, "kl": 0.2161545753479004, "learning_rate": 5e-07, "logits/chosen": -56576320.0, "logits/rejected": -43153504.0, "logps/chosen": -362.2310791015625, "logps/rejected": -427.95928955078125, "loss": 0.2573, "rewards/chosen": 0.7140498161315918, "rewards/margins": 2.579953908920288, "rewards/rejected": -1.8659040927886963, "step": 8544 }, { "epoch": 0.4529191953992526, "grad_norm": 42.75, "kl": 0.9787626266479492, "learning_rate": 5e-07, "logits/chosen": -14868950.4, "logits/rejected": -12678797.333333334, "logps/chosen": -97.71675415039063, "logps/rejected": -506.5093587239583, "loss": 0.3683, "rewards/chosen": 0.1975055456161499, "rewards/margins": 2.3975655953089396, "rewards/rejected": -2.2000600496927896, "step": 8545 }, { "epoch": 0.45297219940105476, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43880788.0, "logits/rejected": 5517982.5, "logps/chosen": -320.8764953613281, "logps/rejected": -335.235595703125, "loss": 0.2924, "rewards/chosen": 0.7051910161972046, "rewards/margins": 2.8547462224960327, "rewards/rejected": -2.149555206298828, "step": 8546 }, { "epoch": 0.4530252034028569, "grad_norm": 52.5, "kl": 1.7944183349609375, "learning_rate": 5e-07, "logits/chosen": -24617803.2, "logits/rejected": -23335893.333333332, "logps/chosen": -291.9898193359375, "logps/rejected": -434.9139811197917, "loss": 0.328, "rewards/chosen": 0.530458402633667, "rewards/margins": 2.67597066561381, "rewards/rejected": -2.145512262980143, "step": 8547 }, { "epoch": 0.45307820740465904, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28002932.0, "logits/rejected": -52665200.0, "logps/chosen": -549.7210083007812, "logps/rejected": -299.3505859375, "loss": 0.2916, "rewards/chosen": 0.2932083308696747, "rewards/margins": 1.8774942258993785, "rewards/rejected": -1.5842858950297039, "step": 8548 }, { "epoch": 0.4531312114064612, "grad_norm": 48.75, "kl": 0.4498729705810547, "learning_rate": 5e-07, "logits/chosen": -53108352.0, "logits/rejected": -24036852.0, "logps/chosen": -282.0076904296875, "logps/rejected": -388.4037780761719, "loss": 0.4009, "rewards/chosen": -0.012972002228101095, "rewards/margins": 2.719898576537768, "rewards/rejected": -2.732870578765869, "step": 8549 }, { "epoch": 0.4531842154082633, "grad_norm": 45.25, "kl": 0.9291191101074219, "learning_rate": 5e-07, "logits/chosen": -10111190.4, "logits/rejected": -2804726.6666666665, "logps/chosen": -144.09697265625, "logps/rejected": -320.0985107421875, "loss": 0.4059, "rewards/chosen": -0.1517309546470642, "rewards/margins": 1.416420837243398, "rewards/rejected": -1.5681517918904622, "step": 8550 }, { "epoch": 0.45323721941006545, "grad_norm": 57.75, "kl": 0.8094234466552734, "learning_rate": 5e-07, "logits/chosen": -58616499.2, "logits/rejected": -133757.75, "logps/chosen": -348.03779296875, "logps/rejected": -54.485148111979164, "loss": 0.3875, "rewards/chosen": 0.14788987636566162, "rewards/margins": 1.6472181717554728, "rewards/rejected": -1.4993282953898113, "step": 8551 }, { "epoch": 0.4532902234118676, "grad_norm": 48.25, "kl": 0.6119308471679688, "learning_rate": 5e-07, "logits/chosen": -6450822.8, "logits/rejected": -28473392.0, "logps/chosen": -244.4621337890625, "logps/rejected": -194.9268798828125, "loss": 0.4147, "rewards/chosen": -0.13327080011367798, "rewards/margins": 1.1922278602917988, "rewards/rejected": -1.3254986604054768, "step": 8552 }, { "epoch": 0.4533432274136697, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17976908.0, "logits/rejected": -32611336.0, "logps/chosen": -297.990966796875, "logps/rejected": -540.5540771484375, "loss": 0.2502, "rewards/chosen": 0.40032273530960083, "rewards/margins": 3.192693054676056, "rewards/rejected": -2.792370319366455, "step": 8553 }, { "epoch": 0.45339623141547186, "grad_norm": 56.75, "kl": 0.6174278259277344, "learning_rate": 5e-07, "logits/chosen": -54140172.8, "logits/rejected": -10825486.666666666, "logps/chosen": -305.24775390625, "logps/rejected": -273.84291585286456, "loss": 0.3401, "rewards/chosen": 0.37166123390197753, "rewards/margins": 2.0632653713226317, "rewards/rejected": -1.6916041374206543, "step": 8554 }, { "epoch": 0.453449235417274, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -173176.625, "logits/rejected": -14627146.0, "logps/chosen": -68.64933013916016, "logps/rejected": -231.5360870361328, "loss": 0.3367, "rewards/chosen": 0.15527547895908356, "rewards/margins": 1.689806953072548, "rewards/rejected": -1.5345314741134644, "step": 8555 }, { "epoch": 0.45350223941907614, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12722213.333333334, "logits/rejected": -15542643.2, "logps/chosen": -104.7970682779948, "logps/rejected": -267.758935546875, "loss": 0.2244, "rewards/chosen": 0.45888304710388184, "rewards/margins": 3.04527907371521, "rewards/rejected": -2.586396026611328, "step": 8556 }, { "epoch": 0.4535552434208783, "grad_norm": 56.0, "kl": 0.6628036499023438, "learning_rate": 5e-07, "logits/chosen": -42004960.0, "logits/rejected": -58397120.0, "logps/chosen": -283.54951171875, "logps/rejected": -530.8805338541666, "loss": 0.3276, "rewards/chosen": 0.3473366260528564, "rewards/margins": 3.113942003250122, "rewards/rejected": -2.7666053771972656, "step": 8557 }, { "epoch": 0.4536082474226804, "grad_norm": 42.0, "kl": 0.4324951171875, "learning_rate": 5e-07, "logits/chosen": -17982714.0, "logits/rejected": -20664450.0, "logps/chosen": -294.9768371582031, "logps/rejected": -292.0227355957031, "loss": 0.2365, "rewards/chosen": 0.33020615577697754, "rewards/margins": 3.5732176303863525, "rewards/rejected": -3.243011474609375, "step": 8558 }, { "epoch": 0.45366125142448255, "grad_norm": 55.0, "kl": 1.1184654235839844, "learning_rate": 5e-07, "logits/chosen": -36378656.0, "logits/rejected": -9675571.0, "logps/chosen": -339.7843933105469, "logps/rejected": -248.22750854492188, "loss": 0.3043, "rewards/chosen": 0.6045414209365845, "rewards/margins": 2.254077672958374, "rewards/rejected": -1.6495362520217896, "step": 8559 }, { "epoch": 0.4537142554262847, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13925968.0, "logits/rejected": -17813800.0, "logps/chosen": -246.2069091796875, "logps/rejected": -286.7075602213542, "loss": 0.3401, "rewards/chosen": 0.36533005237579347, "rewards/margins": 1.935161582628886, "rewards/rejected": -1.5698315302530925, "step": 8560 }, { "epoch": 0.4537672594280868, "grad_norm": 54.25, "kl": 1.1542701721191406, "learning_rate": 5e-07, "logits/chosen": -939150.0, "logits/rejected": -26355669.333333332, "logps/chosen": -393.22744140625, "logps/rejected": -214.763427734375, "loss": 0.2426, "rewards/chosen": 0.9202960968017578, "rewards/margins": 2.9034372011820477, "rewards/rejected": -1.9831411043802898, "step": 8561 }, { "epoch": 0.45382026342988896, "grad_norm": 55.5, "kl": 0.19063758850097656, "learning_rate": 5e-07, "logits/chosen": -8438489.0, "logits/rejected": -5500507.666666667, "logps/chosen": -138.7603759765625, "logps/rejected": -272.09153238932294, "loss": 0.2412, "rewards/chosen": 1.0894588232040405, "rewards/margins": 2.3776588837305708, "rewards/rejected": -1.28820006052653, "step": 8562 }, { "epoch": 0.4538732674316911, "grad_norm": 47.5, "kl": 1.0637359619140625, "learning_rate": 5e-07, "logits/chosen": -23021746.0, "logits/rejected": -21625140.0, "logps/chosen": -360.946044921875, "logps/rejected": -342.41070556640625, "loss": 0.2397, "rewards/chosen": 0.4930315911769867, "rewards/margins": 2.8230675160884857, "rewards/rejected": -2.330035924911499, "step": 8563 }, { "epoch": 0.45392627143349323, "grad_norm": 50.25, "kl": 0.5867404937744141, "learning_rate": 5e-07, "logits/chosen": 21178213.333333332, "logits/rejected": -29912486.4, "logps/chosen": -304.41943359375, "logps/rejected": -175.469287109375, "loss": 0.2838, "rewards/chosen": 0.007054393490155538, "rewards/margins": 2.3807215382655462, "rewards/rejected": -2.3736671447753905, "step": 8564 }, { "epoch": 0.45397927543529537, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3822498.0, "logits/rejected": -34453225.14285714, "logps/chosen": -31.558225631713867, "logps/rejected": -331.2431640625, "loss": 0.2256, "rewards/chosen": 0.6182920336723328, "rewards/margins": 2.5460446647235324, "rewards/rejected": -1.9277526310511999, "step": 8565 }, { "epoch": 0.4540322794370975, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13267288.0, "logits/rejected": -33489142.4, "logps/chosen": -421.1341959635417, "logps/rejected": -335.69072265625, "loss": 0.2146, "rewards/chosen": 0.7226959864298502, "rewards/margins": 2.9550326029459635, "rewards/rejected": -2.2323366165161134, "step": 8566 }, { "epoch": 0.45408528343889965, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77586912.0, "logits/rejected": -43483545.6, "logps/chosen": -418.7788899739583, "logps/rejected": -278.929833984375, "loss": 0.2478, "rewards/chosen": 0.3913045326868693, "rewards/margins": 2.6198415199915566, "rewards/rejected": -2.2285369873046874, "step": 8567 }, { "epoch": 0.4541382874407018, "grad_norm": 27.5, "kl": 1.0534133911132812, "learning_rate": 5e-07, "logits/chosen": -15261708.0, "logits/rejected": -36812389.333333336, "logps/chosen": -718.1311645507812, "logps/rejected": -349.80712890625, "loss": 0.1086, "rewards/chosen": 1.778826117515564, "rewards/margins": 5.090316812197367, "rewards/rejected": -3.3114906946818032, "step": 8568 }, { "epoch": 0.4541912914425039, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1754101.3333333333, "logits/rejected": -13940412.8, "logps/chosen": -668.2501220703125, "logps/rejected": -290.826171875, "loss": 0.188, "rewards/chosen": 0.9646280606587728, "rewards/margins": 3.24243491490682, "rewards/rejected": -2.277806854248047, "step": 8569 }, { "epoch": 0.45424429544430606, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36949648.0, "logits/rejected": -58716960.0, "logps/chosen": -371.7166748046875, "logps/rejected": -95.52983093261719, "loss": 0.3476, "rewards/chosen": 0.3876589834690094, "rewards/margins": 1.3498505651950836, "rewards/rejected": -0.9621915817260742, "step": 8570 }, { "epoch": 0.4542972994461082, "grad_norm": 37.75, "kl": 1.4182052612304688, "learning_rate": 5e-07, "logits/chosen": 2747405.3333333335, "logits/rejected": -19445926.4, "logps/chosen": -210.57222493489584, "logps/rejected": -334.41552734375, "loss": 0.2218, "rewards/chosen": 0.8911787668863932, "rewards/margins": 2.6404892603556314, "rewards/rejected": -1.7493104934692383, "step": 8571 }, { "epoch": 0.45435030344791033, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3242440.3333333335, "logits/rejected": -1920708.8, "logps/chosen": -270.73215738932294, "logps/rejected": -214.50244140625, "loss": 0.3034, "rewards/chosen": 0.4430854717890422, "rewards/margins": 1.6990782658259074, "rewards/rejected": -1.2559927940368651, "step": 8572 }, { "epoch": 0.45440330744971247, "grad_norm": 50.5, "kl": 0.6487922668457031, "learning_rate": 5e-07, "logits/chosen": -27226090.666666668, "logits/rejected": -76294560.0, "logps/chosen": -216.0491739908854, "logps/rejected": -218.4462158203125, "loss": 0.3351, "rewards/chosen": -0.05685050288836161, "rewards/margins": 1.5404022475083667, "rewards/rejected": -1.5972527503967284, "step": 8573 }, { "epoch": 0.4544563114515146, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25416358.4, "logits/rejected": -42575498.666666664, "logps/chosen": -259.0325927734375, "logps/rejected": -606.5101318359375, "loss": 0.3039, "rewards/chosen": 0.268326997756958, "rewards/margins": 3.781237522761027, "rewards/rejected": -3.512910525004069, "step": 8574 }, { "epoch": 0.45450931545331674, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37001856.0, "logits/rejected": -292312.0, "logps/chosen": -326.4209798177083, "logps/rejected": -217.7157958984375, "loss": 0.2562, "rewards/chosen": 0.5265088081359863, "rewards/margins": 2.354737377166748, "rewards/rejected": -1.8282285690307618, "step": 8575 }, { "epoch": 0.4545623194551189, "grad_norm": 42.5, "kl": 0.09149551391601562, "learning_rate": 5e-07, "logits/chosen": -59817241.6, "logits/rejected": -76662682.66666667, "logps/chosen": -515.715673828125, "logps/rejected": -734.5243326822916, "loss": 0.2581, "rewards/chosen": 0.6353741645812988, "rewards/margins": 4.691143767038981, "rewards/rejected": -4.055769602457683, "step": 8576 }, { "epoch": 0.454615323456921, "grad_norm": 41.0, "kl": 3.297819137573242, "learning_rate": 5e-07, "logits/chosen": 29743546.666666668, "logits/rejected": -28441657.6, "logps/chosen": -951.3636881510416, "logps/rejected": -228.35078125, "loss": 0.2252, "rewards/chosen": 1.3213785489400227, "rewards/margins": 2.8711666425069176, "rewards/rejected": -1.5497880935668946, "step": 8577 }, { "epoch": 0.45466832745872315, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28483464.0, "logits/rejected": -37220924.0, "logps/chosen": -399.83624267578125, "logps/rejected": -343.0096740722656, "loss": 0.2932, "rewards/chosen": 0.21346226334571838, "rewards/margins": 2.8200398981571198, "rewards/rejected": -2.6065776348114014, "step": 8578 }, { "epoch": 0.4547213314605253, "grad_norm": 52.0, "kl": 0.82550048828125, "learning_rate": 5e-07, "logits/chosen": -21733284.0, "logits/rejected": -32408258.0, "logps/chosen": -476.285400390625, "logps/rejected": -642.97265625, "loss": 0.2693, "rewards/chosen": 0.8202637434005737, "rewards/margins": 2.8969606161117554, "rewards/rejected": -2.0766968727111816, "step": 8579 }, { "epoch": 0.45477433546232743, "grad_norm": 35.0, "kl": 0.6414299011230469, "learning_rate": 5e-07, "logits/chosen": -29343162.0, "logits/rejected": 3392512.0, "logps/chosen": -268.42193603515625, "logps/rejected": -496.835205078125, "loss": 0.2516, "rewards/chosen": 0.43598511815071106, "rewards/margins": 3.4233491718769073, "rewards/rejected": -2.9873640537261963, "step": 8580 }, { "epoch": 0.45482733946412957, "grad_norm": 50.25, "kl": 0.17426109313964844, "learning_rate": 5e-07, "logits/chosen": -33862565.333333336, "logits/rejected": -20960684.8, "logps/chosen": -250.5838623046875, "logps/rejected": -296.506982421875, "loss": 0.2228, "rewards/chosen": 0.45334116617838544, "rewards/margins": 2.8519667307535808, "rewards/rejected": -2.3986255645751955, "step": 8581 }, { "epoch": 0.4548803434659317, "grad_norm": 88.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46473656.0, "logits/rejected": -2704133.0, "logps/chosen": -405.911865234375, "logps/rejected": -144.41256713867188, "loss": 0.2929, "rewards/chosen": 0.39658087491989136, "rewards/margins": 2.2268155217170715, "rewards/rejected": -1.8302346467971802, "step": 8582 }, { "epoch": 0.45493334746773384, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44421104.0, "logits/rejected": -20915184.0, "logps/chosen": -183.7139892578125, "logps/rejected": -418.0063069661458, "loss": 0.2906, "rewards/chosen": -0.05336683988571167, "rewards/margins": 1.6505683064460754, "rewards/rejected": -1.703935146331787, "step": 8583 }, { "epoch": 0.454986351469536, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18070185.333333332, "logits/rejected": -4855045.0, "logps/chosen": -530.3039143880209, "logps/rejected": -150.69732666015625, "loss": 0.3807, "rewards/chosen": 0.5470434029897054, "rewards/margins": 1.7771379550298056, "rewards/rejected": -1.2300945520401, "step": 8584 }, { "epoch": 0.45503935547133806, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55064213.333333336, "logits/rejected": -42850248.0, "logps/chosen": -426.1691080729167, "logps/rejected": -387.5570373535156, "loss": 0.2719, "rewards/chosen": 0.7687408129374186, "rewards/margins": 3.330122391382853, "rewards/rejected": -2.5613815784454346, "step": 8585 }, { "epoch": 0.4550923594731402, "grad_norm": 48.75, "kl": 0.5978622436523438, "learning_rate": 5e-07, "logits/chosen": -27996992.0, "logits/rejected": -20093378.0, "logps/chosen": -321.6243082682292, "logps/rejected": -246.64537048339844, "loss": 0.3112, "rewards/chosen": 0.6447856028874716, "rewards/margins": 2.6208552916844687, "rewards/rejected": -1.976069688796997, "step": 8586 }, { "epoch": 0.45514536347494233, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48053904.0, "logits/rejected": -45936628.0, "logps/chosen": -381.26861572265625, "logps/rejected": -913.9459228515625, "loss": 0.1945, "rewards/chosen": 0.788438081741333, "rewards/margins": 4.387181758880615, "rewards/rejected": -3.5987436771392822, "step": 8587 }, { "epoch": 0.45519836747674447, "grad_norm": 47.5, "kl": 0.15238189697265625, "learning_rate": 5e-07, "logits/chosen": -6433952.0, "logits/rejected": -3752678.5, "logps/chosen": -132.67669677734375, "logps/rejected": -99.30392456054688, "loss": 0.359, "rewards/chosen": 0.18805842101573944, "rewards/margins": 1.4478154629468918, "rewards/rejected": -1.2597570419311523, "step": 8588 }, { "epoch": 0.4552513714785466, "grad_norm": 49.0, "kl": 1.654611587524414, "learning_rate": 5e-07, "logits/chosen": -70786137.6, "logits/rejected": -84823525.33333333, "logps/chosen": -561.1943359375, "logps/rejected": -226.9389444986979, "loss": 0.3334, "rewards/chosen": 0.6604824066162109, "rewards/margins": 2.5624243418375654, "rewards/rejected": -1.9019419352213542, "step": 8589 }, { "epoch": 0.45530437548034874, "grad_norm": 43.75, "kl": 1.7366657257080078, "learning_rate": 5e-07, "logits/chosen": -36278457.6, "logits/rejected": -22394858.666666668, "logps/chosen": -139.022802734375, "logps/rejected": -194.13484700520834, "loss": 0.364, "rewards/chosen": 0.06633232831954956, "rewards/margins": 2.337666022777557, "rewards/rejected": -2.271333694458008, "step": 8590 }, { "epoch": 0.4553573794821509, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12689556.0, "logits/rejected": -51016393.6, "logps/chosen": -149.8447469075521, "logps/rejected": -239.1114013671875, "loss": 0.207, "rewards/chosen": 0.9930276870727539, "rewards/margins": 3.0071325302124023, "rewards/rejected": -2.0141048431396484, "step": 8591 }, { "epoch": 0.455410383483953, "grad_norm": 45.25, "kl": 0.7964153289794922, "learning_rate": 5e-07, "logits/chosen": -6969662.0, "logits/rejected": -71982384.0, "logps/chosen": -296.0373942057292, "logps/rejected": -381.244384765625, "loss": 0.2762, "rewards/chosen": 1.1408332188924153, "rewards/margins": 3.4066277345021563, "rewards/rejected": -2.265794515609741, "step": 8592 }, { "epoch": 0.45546338748575516, "grad_norm": 46.0, "kl": 2.3307437896728516, "learning_rate": 5e-07, "logits/chosen": -1765121.75, "logits/rejected": -25170412.0, "logps/chosen": -222.6746063232422, "logps/rejected": -276.5718994140625, "loss": 0.3081, "rewards/chosen": 0.5863473415374756, "rewards/margins": 2.0465967655181885, "rewards/rejected": -1.460249423980713, "step": 8593 }, { "epoch": 0.4555163914875573, "grad_norm": 33.25, "kl": 2.8041343688964844, "learning_rate": 5e-07, "logits/chosen": 5411200.0, "logits/rejected": -31259928.0, "logps/chosen": -101.15373992919922, "logps/rejected": -309.6127624511719, "loss": 0.3379, "rewards/chosen": 0.39039671421051025, "rewards/margins": 1.9466220140457153, "rewards/rejected": -1.556225299835205, "step": 8594 }, { "epoch": 0.45556939548935943, "grad_norm": 57.5, "kl": 1.245025634765625, "learning_rate": 5e-07, "logits/chosen": -48011136.0, "logits/rejected": -62820181.333333336, "logps/chosen": -461.388623046875, "logps/rejected": -283.83689371744794, "loss": 0.3322, "rewards/chosen": 0.8726511001586914, "rewards/margins": 2.193953673044841, "rewards/rejected": -1.3213025728861492, "step": 8595 }, { "epoch": 0.45562239949116157, "grad_norm": 47.25, "kl": 0.06894683837890625, "learning_rate": 5e-07, "logits/chosen": -47081616.0, "logits/rejected": -40266480.0, "logps/chosen": -117.41297912597656, "logps/rejected": -408.1798095703125, "loss": 0.2442, "rewards/chosen": 0.5307022333145142, "rewards/margins": 3.1798027753829956, "rewards/rejected": -2.6491005420684814, "step": 8596 }, { "epoch": 0.4556754034929637, "grad_norm": 63.25, "kl": 0.47605133056640625, "learning_rate": 5e-07, "logits/chosen": 27255046.0, "logits/rejected": -47550480.0, "logps/chosen": -198.4993438720703, "logps/rejected": -205.29681396484375, "loss": 0.3294, "rewards/chosen": 0.0023833364248275757, "rewards/margins": 1.9827748388051987, "rewards/rejected": -1.980391502380371, "step": 8597 }, { "epoch": 0.45572840749476584, "grad_norm": 63.0, "kl": 0.32202911376953125, "learning_rate": 5e-07, "logits/chosen": -36380924.0, "logits/rejected": -25135740.0, "logps/chosen": -709.1838989257812, "logps/rejected": -222.89694213867188, "loss": 0.2453, "rewards/chosen": 1.0787254571914673, "rewards/margins": 3.424540877342224, "rewards/rejected": -2.345815420150757, "step": 8598 }, { "epoch": 0.455781411496568, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64104820.0, "logits/rejected": -51288256.0, "logps/chosen": -367.2962646484375, "logps/rejected": -599.641357421875, "loss": 0.2064, "rewards/chosen": 0.8363998532295227, "rewards/margins": 4.1570733189582825, "rewards/rejected": -3.3206734657287598, "step": 8599 }, { "epoch": 0.4558344154983701, "grad_norm": 52.25, "kl": 0.7146835327148438, "learning_rate": 5e-07, "logits/chosen": -63867272.0, "logits/rejected": 86831376.0, "logps/chosen": -413.06988525390625, "logps/rejected": -831.4779663085938, "loss": 0.2846, "rewards/chosen": 0.03183440864086151, "rewards/margins": 3.8542707413434982, "rewards/rejected": -3.8224363327026367, "step": 8600 }, { "epoch": 0.45588741950017225, "grad_norm": 47.75, "kl": 0.016765594482421875, "learning_rate": 5e-07, "logits/chosen": -5594054.0, "logits/rejected": -4984839.5, "logps/chosen": -190.7286834716797, "logps/rejected": -206.5371551513672, "loss": 0.3194, "rewards/chosen": 0.24540381133556366, "rewards/margins": 1.965320035815239, "rewards/rejected": -1.7199162244796753, "step": 8601 }, { "epoch": 0.4559404235019744, "grad_norm": 30.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1134604.3333333333, "logits/rejected": -48340944.0, "logps/chosen": -56.28208923339844, "logps/rejected": -661.74140625, "loss": 0.2262, "rewards/chosen": 0.16730276743570963, "rewards/margins": 3.3229077021280924, "rewards/rejected": -3.155604934692383, "step": 8602 }, { "epoch": 0.4559934275037765, "grad_norm": 46.0, "kl": 0.18854141235351562, "learning_rate": 5e-07, "logits/chosen": -65534805.333333336, "logits/rejected": -11629818.0, "logps/chosen": -253.5940958658854, "logps/rejected": -92.14033508300781, "loss": 0.3299, "rewards/chosen": 0.5372698307037354, "rewards/margins": 2.69217586517334, "rewards/rejected": -2.1549060344696045, "step": 8603 }, { "epoch": 0.45604643150557866, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45050422.85714286, "logits/rejected": 19650488.0, "logps/chosen": -232.0809326171875, "logps/rejected": -206.4420623779297, "loss": 0.4092, "rewards/chosen": 0.2858304977416992, "rewards/margins": 1.5046247243881226, "rewards/rejected": -1.2187942266464233, "step": 8604 }, { "epoch": 0.4560994355073808, "grad_norm": 61.5, "kl": 0.3774280548095703, "learning_rate": 5e-07, "logits/chosen": -2294480.0, "logits/rejected": -19515021.333333332, "logps/chosen": -257.8273681640625, "logps/rejected": -541.9679361979166, "loss": 0.3157, "rewards/chosen": 0.4566225051879883, "rewards/margins": 2.8180819193522133, "rewards/rejected": -2.361459414164225, "step": 8605 }, { "epoch": 0.45615243950918294, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35959704.0, "logits/rejected": 9512904.0, "logps/chosen": -301.211181640625, "logps/rejected": -296.6631164550781, "loss": 0.3079, "rewards/chosen": 0.6247658133506775, "rewards/margins": 1.965522825717926, "rewards/rejected": -1.3407570123672485, "step": 8606 }, { "epoch": 0.4562054435109851, "grad_norm": 53.75, "kl": 0.5928955078125, "learning_rate": 5e-07, "logits/chosen": -23942185.6, "logits/rejected": -27209496.0, "logps/chosen": -429.41962890625, "logps/rejected": -293.8370361328125, "loss": 0.3055, "rewards/chosen": 0.33005828857421876, "rewards/margins": 3.03457088470459, "rewards/rejected": -2.704512596130371, "step": 8607 }, { "epoch": 0.4562584475127872, "grad_norm": 47.0, "kl": 1.1375350952148438, "learning_rate": 5e-07, "logits/chosen": -23502928.0, "logits/rejected": -32009512.0, "logps/chosen": -368.05120849609375, "logps/rejected": -297.8860270182292, "loss": 0.229, "rewards/chosen": 0.6917617917060852, "rewards/margins": 2.0932885607083636, "rewards/rejected": -1.4015267690022786, "step": 8608 }, { "epoch": 0.45631145151458935, "grad_norm": 45.75, "kl": 1.7988262176513672, "learning_rate": 5e-07, "logits/chosen": -38849876.0, "logits/rejected": -21591846.0, "logps/chosen": -482.2166442871094, "logps/rejected": -180.2274169921875, "loss": 0.2795, "rewards/chosen": 0.8886503577232361, "rewards/margins": 2.3476324677467346, "rewards/rejected": -1.4589821100234985, "step": 8609 }, { "epoch": 0.4563644555163915, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30910168.0, "logits/rejected": -25520409.14285714, "logps/chosen": -268.66961669921875, "logps/rejected": -461.32802036830356, "loss": 0.2106, "rewards/chosen": -0.9855713248252869, "rewards/margins": 1.3076034188270569, "rewards/rejected": -2.2931747436523438, "step": 8610 }, { "epoch": 0.4564174595181936, "grad_norm": 49.5, "kl": 0.5131325721740723, "learning_rate": 5e-07, "logits/chosen": -17501274.0, "logits/rejected": -22685162.666666668, "logps/chosen": -330.3785095214844, "logps/rejected": -322.87322998046875, "loss": 0.2447, "rewards/chosen": 0.6675323247909546, "rewards/margins": 2.5043783585230512, "rewards/rejected": -1.8368460337320964, "step": 8611 }, { "epoch": 0.45647046351999576, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20834530.285714287, "logits/rejected": -4128549.0, "logps/chosen": -409.5083705357143, "logps/rejected": -883.9207153320312, "loss": 0.357, "rewards/chosen": 0.5082266671316964, "rewards/margins": 2.557549272264753, "rewards/rejected": -2.0493226051330566, "step": 8612 }, { "epoch": 0.4565234675217979, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29517126.0, "logits/rejected": -29383674.0, "logps/chosen": -345.1436767578125, "logps/rejected": -370.63934326171875, "loss": 0.1551, "rewards/chosen": 1.6438225507736206, "rewards/margins": 3.9449156522750854, "rewards/rejected": -2.301093101501465, "step": 8613 }, { "epoch": 0.45657647152360004, "grad_norm": 55.0, "kl": 1.761033058166504, "learning_rate": 5e-07, "logits/chosen": -11897917.0, "logits/rejected": -15236532.0, "logps/chosen": -277.8866271972656, "logps/rejected": -193.66957092285156, "loss": 0.2621, "rewards/chosen": 1.0491348505020142, "rewards/margins": 2.7246302366256714, "rewards/rejected": -1.6754953861236572, "step": 8614 }, { "epoch": 0.4566294755254022, "grad_norm": 51.25, "kl": 0.12132930755615234, "learning_rate": 5e-07, "logits/chosen": -24461376.0, "logits/rejected": -19712292.0, "logps/chosen": -416.702783203125, "logps/rejected": -197.92500813802084, "loss": 0.3247, "rewards/chosen": 0.5053540229797363, "rewards/margins": 2.0806547164916993, "rewards/rejected": -1.575300693511963, "step": 8615 }, { "epoch": 0.4566824795272043, "grad_norm": 68.0, "kl": 0.030719757080078125, "learning_rate": 5e-07, "logits/chosen": -64592410.666666664, "logits/rejected": -4076944.8, "logps/chosen": -762.5413411458334, "logps/rejected": -459.511181640625, "loss": 0.2607, "rewards/chosen": 0.7874420483907064, "rewards/margins": 2.452783044179281, "rewards/rejected": -1.6653409957885743, "step": 8616 }, { "epoch": 0.45673548352900645, "grad_norm": 50.5, "kl": 0.45510101318359375, "learning_rate": 5e-07, "logits/chosen": -17047469.333333332, "logits/rejected": 153377510.4, "logps/chosen": -860.2146809895834, "logps/rejected": -351.787646484375, "loss": 0.2289, "rewards/chosen": 0.8696004549662272, "rewards/margins": 3.114936796824137, "rewards/rejected": -2.24533634185791, "step": 8617 }, { "epoch": 0.4567884875308086, "grad_norm": 36.5, "kl": 0.5624923706054688, "learning_rate": 5e-07, "logits/chosen": -28674944.0, "logits/rejected": -34036792.0, "logps/chosen": -259.07147216796875, "logps/rejected": -243.24281311035156, "loss": 0.251, "rewards/chosen": 0.4995950758457184, "rewards/margins": 3.056248813867569, "rewards/rejected": -2.5566537380218506, "step": 8618 }, { "epoch": 0.4568414915326107, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28438154.0, "logits/rejected": -15919961.0, "logps/chosen": -209.47251892089844, "logps/rejected": -270.75250244140625, "loss": 0.3392, "rewards/chosen": -0.08157861977815628, "rewards/margins": 2.082315318286419, "rewards/rejected": -2.163893938064575, "step": 8619 }, { "epoch": 0.45689449553441286, "grad_norm": 51.75, "kl": 1.6365013122558594, "learning_rate": 5e-07, "logits/chosen": -37911460.0, "logits/rejected": -29942634.0, "logps/chosen": -193.5168914794922, "logps/rejected": -249.0313262939453, "loss": 0.3311, "rewards/chosen": 0.209303617477417, "rewards/margins": 1.9861308336257935, "rewards/rejected": -1.7768272161483765, "step": 8620 }, { "epoch": 0.456947499536215, "grad_norm": 55.5, "kl": 2.205331802368164, "learning_rate": 5e-07, "logits/chosen": -1016382.1, "logits/rejected": -23812389.333333332, "logps/chosen": -350.24814453125, "logps/rejected": -229.62015787760416, "loss": 0.3155, "rewards/chosen": 0.6380710124969482, "rewards/margins": 2.4951972484588625, "rewards/rejected": -1.857126235961914, "step": 8621 }, { "epoch": 0.45700050353801713, "grad_norm": 43.0, "kl": 0.5351982116699219, "learning_rate": 5e-07, "logits/chosen": -21925529.6, "logits/rejected": -57587360.0, "logps/chosen": -203.4253662109375, "logps/rejected": -536.8980305989584, "loss": 0.2923, "rewards/chosen": 0.4282522201538086, "rewards/margins": 3.910564104715983, "rewards/rejected": -3.4823118845621743, "step": 8622 }, { "epoch": 0.45705350753981927, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38772752.0, "logits/rejected": -1040273.0, "logps/chosen": -255.06024169921875, "logps/rejected": -130.4084269205729, "loss": 0.2553, "rewards/chosen": 0.5121025443077087, "rewards/margins": 1.9110599557558696, "rewards/rejected": -1.3989574114481609, "step": 8623 }, { "epoch": 0.4571065115416214, "grad_norm": 44.75, "kl": 2.4512100219726562, "learning_rate": 5e-07, "logits/chosen": -31733273.6, "logits/rejected": -25742968.0, "logps/chosen": -363.25458984375, "logps/rejected": -267.7439778645833, "loss": 0.2604, "rewards/chosen": 0.7941903114318848, "rewards/margins": 3.7351521492004394, "rewards/rejected": -2.9409618377685547, "step": 8624 }, { "epoch": 0.45715951554342354, "grad_norm": 53.75, "kl": 1.1425037384033203, "learning_rate": 5e-07, "logits/chosen": -7964291.333333333, "logits/rejected": -25389576.0, "logps/chosen": -201.8564249674479, "logps/rejected": -192.65110778808594, "loss": 0.3654, "rewards/chosen": 0.48666731516520184, "rewards/margins": 2.0318291584650674, "rewards/rejected": -1.5451618432998657, "step": 8625 }, { "epoch": 0.4572125195452257, "grad_norm": 49.5, "kl": 0.4283885955810547, "learning_rate": 5e-07, "logits/chosen": 555018.75, "logits/rejected": -13804328.0, "logps/chosen": -405.10015869140625, "logps/rejected": -336.716552734375, "loss": 0.1898, "rewards/chosen": 0.8085792660713196, "rewards/margins": 3.093097507953644, "rewards/rejected": -2.284518241882324, "step": 8626 }, { "epoch": 0.4572655235470278, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27804051.2, "logits/rejected": 651126.6666666666, "logps/chosen": -170.519921875, "logps/rejected": -65.6068623860677, "loss": 0.4292, "rewards/chosen": -0.12933821678161622, "rewards/margins": 0.9235589186350506, "rewards/rejected": -1.0528971354166667, "step": 8627 }, { "epoch": 0.45731852754882996, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61682672.0, "logits/rejected": -18653555.2, "logps/chosen": -370.9637451171875, "logps/rejected": -257.9858154296875, "loss": 0.3088, "rewards/chosen": 0.4241495927174886, "rewards/margins": 1.5831518014272052, "rewards/rejected": -1.1590022087097167, "step": 8628 }, { "epoch": 0.4573715315506321, "grad_norm": 52.25, "kl": 0.15363693237304688, "learning_rate": 5e-07, "logits/chosen": -2311201.3333333335, "logits/rejected": -12879274.0, "logps/chosen": -165.79178873697916, "logps/rejected": -173.7571258544922, "loss": 0.4047, "rewards/chosen": 0.04547883073488871, "rewards/margins": 2.3167054752508798, "rewards/rejected": -2.271226644515991, "step": 8629 }, { "epoch": 0.45742453555243423, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6575513.0, "logits/rejected": -18879770.0, "logps/chosen": -44.084495544433594, "logps/rejected": -361.12957763671875, "loss": 0.2973, "rewards/chosen": 0.20772606134414673, "rewards/margins": 2.0876572728157043, "rewards/rejected": -1.8799312114715576, "step": 8630 }, { "epoch": 0.45747753955423637, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39620106.666666664, "logits/rejected": -43594320.0, "logps/chosen": -296.1806640625, "logps/rejected": -89.42339324951172, "loss": 0.39, "rewards/chosen": 0.5011601448059082, "rewards/margins": 0.9195345044136047, "rewards/rejected": -0.41837435960769653, "step": 8631 }, { "epoch": 0.4575305435560385, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51177436.0, "logits/rejected": -24299498.666666668, "logps/chosen": -296.7502746582031, "logps/rejected": -217.33455403645834, "loss": 0.2448, "rewards/chosen": 0.17141342163085938, "rewards/margins": 2.120713233947754, "rewards/rejected": -1.9492998123168945, "step": 8632 }, { "epoch": 0.45758354755784064, "grad_norm": 43.0, "kl": 0.029815673828125, "learning_rate": 5e-07, "logits/chosen": -1943204.375, "logits/rejected": -17092242.0, "logps/chosen": -86.6915054321289, "logps/rejected": -326.9115905761719, "loss": 0.3515, "rewards/chosen": 0.5243285894393921, "rewards/margins": 1.7936588525772095, "rewards/rejected": -1.2693302631378174, "step": 8633 }, { "epoch": 0.4576365515596428, "grad_norm": 63.75, "kl": 0.8849239349365234, "learning_rate": 5e-07, "logits/chosen": -33573997.333333336, "logits/rejected": -12444711.0, "logps/chosen": -212.74503580729166, "logps/rejected": -302.4486083984375, "loss": 0.3677, "rewards/chosen": 0.4752720197041829, "rewards/margins": 2.244601090749105, "rewards/rejected": -1.7693290710449219, "step": 8634 }, { "epoch": 0.4576895555614449, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10404025.6, "logits/rejected": -6994333.333333333, "logps/chosen": -265.098388671875, "logps/rejected": -542.3155924479166, "loss": 0.3263, "rewards/chosen": 0.2858360767364502, "rewards/margins": 2.919449027379354, "rewards/rejected": -2.633612950642904, "step": 8635 }, { "epoch": 0.457742559563247, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72673930.66666667, "logits/rejected": -53184185.6, "logps/chosen": -156.53108723958334, "logps/rejected": -335.7178955078125, "loss": 0.2474, "rewards/chosen": 0.2025561730066935, "rewards/margins": 2.620858327547709, "rewards/rejected": -2.4183021545410157, "step": 8636 }, { "epoch": 0.45779556356504914, "grad_norm": 63.5, "kl": 1.2460269927978516, "learning_rate": 5e-07, "logits/chosen": -24316756.8, "logits/rejected": -43884706.666666664, "logps/chosen": -387.6650390625, "logps/rejected": -324.8736572265625, "loss": 0.2817, "rewards/chosen": 0.8627130508422851, "rewards/margins": 2.3584800084431965, "rewards/rejected": -1.4957669576009114, "step": 8637 }, { "epoch": 0.45784856756685127, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34796720.0, "logits/rejected": -30548477.333333332, "logps/chosen": -388.46954345703125, "logps/rejected": -248.75895182291666, "loss": 0.2157, "rewards/chosen": 0.8056061267852783, "rewards/margins": 2.426439682642619, "rewards/rejected": -1.6208335558573406, "step": 8638 }, { "epoch": 0.4579015715686534, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36911412.0, "logits/rejected": -29924213.333333332, "logps/chosen": -202.72242736816406, "logps/rejected": -360.9563802083333, "loss": 0.2915, "rewards/chosen": -1.0844357013702393, "rewards/margins": 0.9421857992808023, "rewards/rejected": -2.0266215006510415, "step": 8639 }, { "epoch": 0.45795457557045555, "grad_norm": 65.5, "kl": 0.9198417663574219, "learning_rate": 5e-07, "logits/chosen": -42967501.71428572, "logits/rejected": -4543752.0, "logps/chosen": -304.69911411830356, "logps/rejected": -94.89590454101562, "loss": 0.4691, "rewards/chosen": 0.021078859056745256, "rewards/margins": 2.0276921136038646, "rewards/rejected": -2.006613254547119, "step": 8640 }, { "epoch": 0.4580075795722577, "grad_norm": 43.25, "kl": 1.4482650756835938, "learning_rate": 5e-07, "logits/chosen": -22971874.0, "logits/rejected": -10013244.0, "logps/chosen": -635.907470703125, "logps/rejected": -256.80560302734375, "loss": 0.2497, "rewards/chosen": 0.7245794534683228, "rewards/margins": 3.032380700111389, "rewards/rejected": -2.3078012466430664, "step": 8641 }, { "epoch": 0.4580605835740598, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2147612.0, "logits/rejected": -20105913.14285714, "logps/chosen": -37.29517364501953, "logps/rejected": -407.93673270089283, "loss": 0.1525, "rewards/chosen": -0.0650840774178505, "rewards/margins": 2.8650486522487233, "rewards/rejected": -2.930132729666574, "step": 8642 }, { "epoch": 0.45811358757586196, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1705200.5, "logits/rejected": -21690836.0, "logps/chosen": -267.93060302734375, "logps/rejected": -409.93377685546875, "loss": 0.2521, "rewards/chosen": 0.6225523948669434, "rewards/margins": 2.4259856939315796, "rewards/rejected": -1.8034332990646362, "step": 8643 }, { "epoch": 0.4581665915776641, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43597389.333333336, "logits/rejected": -71136256.0, "logps/chosen": -331.083251953125, "logps/rejected": -251.79189453125, "loss": 0.3574, "rewards/chosen": -0.20833269755045572, "rewards/margins": 1.1068305333455404, "rewards/rejected": -1.3151632308959962, "step": 8644 }, { "epoch": 0.45821959557946623, "grad_norm": 39.25, "kl": 0.01296234130859375, "learning_rate": 5e-07, "logits/chosen": 3548052.75, "logits/rejected": -11195633.333333334, "logps/chosen": -38.37653732299805, "logps/rejected": -139.3449503580729, "loss": 0.296, "rewards/chosen": 0.2701055705547333, "rewards/margins": 1.4734917183717091, "rewards/rejected": -1.2033861478169758, "step": 8645 }, { "epoch": 0.45827259958126837, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15506249.0, "logits/rejected": 17371402.666666668, "logps/chosen": -288.7686767578125, "logps/rejected": -239.783935546875, "loss": 0.3386, "rewards/chosen": 0.415871798992157, "rewards/margins": 1.3125903010368347, "rewards/rejected": -0.8967185020446777, "step": 8646 }, { "epoch": 0.4583256035830705, "grad_norm": 59.5, "kl": 0.20183944702148438, "learning_rate": 5e-07, "logits/chosen": -16368023.0, "logits/rejected": -12042855.0, "logps/chosen": -261.96173095703125, "logps/rejected": -292.11456298828125, "loss": 0.2669, "rewards/chosen": 0.7593913078308105, "rewards/margins": 2.324718713760376, "rewards/rejected": -1.5653274059295654, "step": 8647 }, { "epoch": 0.45837860758487264, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34916480.0, "logits/rejected": -115734656.0, "logps/chosen": -183.5828857421875, "logps/rejected": -481.77667236328125, "loss": 0.319, "rewards/chosen": -0.2087850719690323, "rewards/margins": 2.5806736797094345, "rewards/rejected": -2.789458751678467, "step": 8648 }, { "epoch": 0.4584316115866748, "grad_norm": 40.5, "kl": 0.6410694122314453, "learning_rate": 5e-07, "logits/chosen": -14471785.333333334, "logits/rejected": -30879929.6, "logps/chosen": -141.5684814453125, "logps/rejected": -252.9683837890625, "loss": 0.316, "rewards/chosen": 0.14735243717829385, "rewards/margins": 1.587986687819163, "rewards/rejected": -1.4406342506408691, "step": 8649 }, { "epoch": 0.4584846155884769, "grad_norm": 47.0, "kl": 1.1641435623168945, "learning_rate": 5e-07, "logits/chosen": -11114065.0, "logits/rejected": -12120960.0, "logps/chosen": -196.21644592285156, "logps/rejected": -352.37481689453125, "loss": 0.2225, "rewards/chosen": 1.023432970046997, "rewards/margins": 3.0567121505737305, "rewards/rejected": -2.0332791805267334, "step": 8650 }, { "epoch": 0.45853761959027906, "grad_norm": 29.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6841630.0, "logits/rejected": -23014312.0, "logps/chosen": -42.17702865600586, "logps/rejected": -532.5223388671875, "loss": 0.1504, "rewards/chosen": 0.9210346341133118, "rewards/margins": 3.572406589984894, "rewards/rejected": -2.651371955871582, "step": 8651 }, { "epoch": 0.4585906235920812, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3400235.3333333335, "logits/rejected": -10909803.2, "logps/chosen": -290.98801676432294, "logps/rejected": -217.544970703125, "loss": 0.2555, "rewards/chosen": 0.48964182535807294, "rewards/margins": 2.099598757425944, "rewards/rejected": -1.609956932067871, "step": 8652 }, { "epoch": 0.45864362759388333, "grad_norm": 49.25, "kl": 2.0553817749023438, "learning_rate": 5e-07, "logits/chosen": -10079004.57142857, "logits/rejected": -3328046.5, "logps/chosen": -228.20169503348214, "logps/rejected": -81.37394714355469, "loss": 0.3389, "rewards/chosen": 0.6541787556239537, "rewards/margins": 3.226107256753104, "rewards/rejected": -2.5719285011291504, "step": 8653 }, { "epoch": 0.45869663159568547, "grad_norm": 49.25, "kl": 1.6741752624511719, "learning_rate": 5e-07, "logits/chosen": -47871772.0, "logits/rejected": -32112832.0, "logps/chosen": -499.7927551269531, "logps/rejected": -472.78082275390625, "loss": 0.2667, "rewards/chosen": 0.5060449242591858, "rewards/margins": 3.727891981601715, "rewards/rejected": -3.2218470573425293, "step": 8654 }, { "epoch": 0.4587496355974876, "grad_norm": 56.25, "kl": 0.6188488006591797, "learning_rate": 5e-07, "logits/chosen": -17752384.0, "logits/rejected": -8558478.0, "logps/chosen": -334.0244140625, "logps/rejected": -240.6383819580078, "loss": 0.3404, "rewards/chosen": 0.21860122680664062, "rewards/margins": 1.5759910345077515, "rewards/rejected": -1.3573898077011108, "step": 8655 }, { "epoch": 0.45880263959928974, "grad_norm": 51.25, "kl": 1.3481788635253906, "learning_rate": 5e-07, "logits/chosen": -29852973.333333332, "logits/rejected": 53269092.0, "logps/chosen": -340.6987711588542, "logps/rejected": -560.1839599609375, "loss": 0.3675, "rewards/chosen": 0.28690733512242633, "rewards/margins": 3.2652141054471335, "rewards/rejected": -2.978306770324707, "step": 8656 }, { "epoch": 0.4588556436010919, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24707514.666666668, "logits/rejected": 20865659.2, "logps/chosen": -603.2496744791666, "logps/rejected": -380.431591796875, "loss": 0.2144, "rewards/chosen": 0.9336760838826498, "rewards/margins": 2.595129712422689, "rewards/rejected": -1.661453628540039, "step": 8657 }, { "epoch": 0.458908647602894, "grad_norm": 39.75, "kl": 0.29486083984375, "learning_rate": 5e-07, "logits/chosen": -11578443.0, "logits/rejected": -47120709.333333336, "logps/chosen": -351.9762268066406, "logps/rejected": -229.44453938802084, "loss": 0.2147, "rewards/chosen": 0.6251893639564514, "rewards/margins": 2.7934348384539285, "rewards/rejected": -2.168245474497477, "step": 8658 }, { "epoch": 0.45896165160469615, "grad_norm": 59.5, "kl": 1.8269004821777344, "learning_rate": 5e-07, "logits/chosen": -12489514.4, "logits/rejected": -29071248.0, "logps/chosen": -563.5462890625, "logps/rejected": -289.7255452473958, "loss": 0.3444, "rewards/chosen": 0.6210527896881104, "rewards/margins": 1.795760202407837, "rewards/rejected": -1.1747074127197266, "step": 8659 }, { "epoch": 0.4590146556064983, "grad_norm": 53.5, "kl": 1.7897987365722656, "learning_rate": 5e-07, "logits/chosen": 6304944.0, "logits/rejected": -12557061.333333334, "logps/chosen": -200.325927734375, "logps/rejected": -215.03938802083334, "loss": 0.3061, "rewards/chosen": 0.6426831245422363, "rewards/margins": 2.8822275161743165, "rewards/rejected": -2.23954439163208, "step": 8660 }, { "epoch": 0.4590676596083004, "grad_norm": 61.0, "kl": 0.7352142333984375, "learning_rate": 5e-07, "logits/chosen": -51926468.0, "logits/rejected": -27124296.0, "logps/chosen": -440.30755615234375, "logps/rejected": -436.8513488769531, "loss": 0.1985, "rewards/chosen": 0.9011719226837158, "rewards/margins": 3.6106717586517334, "rewards/rejected": -2.7094998359680176, "step": 8661 }, { "epoch": 0.45912066361010256, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20579320.0, "logits/rejected": -7450606.666666667, "logps/chosen": -230.327734375, "logps/rejected": -199.5986124674479, "loss": 0.3451, "rewards/chosen": 0.2089153289794922, "rewards/margins": 1.9589900334676107, "rewards/rejected": -1.7500747044881184, "step": 8662 }, { "epoch": 0.4591736676119047, "grad_norm": 35.5, "kl": 2.389972686767578, "learning_rate": 5e-07, "logits/chosen": -976562.25, "logits/rejected": -23840126.0, "logps/chosen": -154.79261779785156, "logps/rejected": -361.9512939453125, "loss": 0.2515, "rewards/chosen": 0.5437811017036438, "rewards/margins": 2.705675184726715, "rewards/rejected": -2.1618940830230713, "step": 8663 }, { "epoch": 0.45922667161370684, "grad_norm": 40.25, "kl": 0.4902477264404297, "learning_rate": 5e-07, "logits/chosen": -5545188.5, "logits/rejected": -56081808.0, "logps/chosen": -180.80038452148438, "logps/rejected": -318.3628234863281, "loss": 0.3194, "rewards/chosen": 0.20468509197235107, "rewards/margins": 1.8362833261489868, "rewards/rejected": -1.6315982341766357, "step": 8664 }, { "epoch": 0.459279675615509, "grad_norm": 57.75, "kl": 1.4966678619384766, "learning_rate": 5e-07, "logits/chosen": -41533712.0, "logits/rejected": -123203.75, "logps/chosen": -186.67178344726562, "logps/rejected": -118.1348648071289, "loss": 0.3712, "rewards/chosen": 0.12482299655675888, "rewards/margins": 1.1384202465415, "rewards/rejected": -1.0135972499847412, "step": 8665 }, { "epoch": 0.4593326796173111, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1659755.6, "logits/rejected": -9142998.666666666, "logps/chosen": -265.86552734375, "logps/rejected": -321.1800944010417, "loss": 0.2984, "rewards/chosen": 0.37205893993377687, "rewards/margins": 3.14580606619517, "rewards/rejected": -2.773747126261393, "step": 8666 }, { "epoch": 0.45938568361911325, "grad_norm": 51.0, "kl": 0.42620086669921875, "learning_rate": 5e-07, "logits/chosen": -10210115.0, "logits/rejected": 2115667.5, "logps/chosen": -253.4765625, "logps/rejected": -216.3086700439453, "loss": 0.2523, "rewards/chosen": 1.016296148300171, "rewards/margins": 2.490358829498291, "rewards/rejected": -1.4740626811981201, "step": 8667 }, { "epoch": 0.4594386876209154, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30405508.0, "logits/rejected": -8515416.0, "logps/chosen": -276.76007080078125, "logps/rejected": -165.15167236328125, "loss": 0.2164, "rewards/chosen": 0.2899799346923828, "rewards/margins": 2.465707302093506, "rewards/rejected": -2.175727367401123, "step": 8668 }, { "epoch": 0.4594916916227175, "grad_norm": 65.5, "kl": 2.3431625366210938, "learning_rate": 5e-07, "logits/chosen": -31027417.14285714, "logits/rejected": 4765886.0, "logps/chosen": -416.57669503348217, "logps/rejected": -562.1806640625, "loss": 0.3452, "rewards/chosen": 0.8384323120117188, "rewards/margins": 4.184880018234253, "rewards/rejected": -3.346447706222534, "step": 8669 }, { "epoch": 0.45954469562451966, "grad_norm": 52.0, "kl": 2.5089950561523438, "learning_rate": 5e-07, "logits/chosen": 5285499.0, "logits/rejected": -16926016.0, "logps/chosen": -50.30113220214844, "logps/rejected": -464.7453206380208, "loss": 0.31, "rewards/chosen": -0.03392601013183594, "rewards/margins": 2.123575210571289, "rewards/rejected": -2.157501220703125, "step": 8670 }, { "epoch": 0.4595976996263218, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19426724.0, "logits/rejected": -2752364.0, "logps/chosen": -403.919189453125, "logps/rejected": -209.53500366210938, "loss": 0.3129, "rewards/chosen": 0.6943981647491455, "rewards/margins": 1.7627308368682861, "rewards/rejected": -1.0683326721191406, "step": 8671 }, { "epoch": 0.45965070362812394, "grad_norm": 40.0, "kl": 1.7110023498535156, "learning_rate": 5e-07, "logits/chosen": -8168940.0, "logits/rejected": -45621784.0, "logps/chosen": -170.03900146484375, "logps/rejected": -373.8558349609375, "loss": 0.2836, "rewards/chosen": 0.8368008931477865, "rewards/margins": 3.7176664670308432, "rewards/rejected": -2.8808655738830566, "step": 8672 }, { "epoch": 0.4597037076299261, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39975859.2, "logits/rejected": -29800368.0, "logps/chosen": -359.576025390625, "logps/rejected": -346.550537109375, "loss": 0.311, "rewards/chosen": 0.3331110954284668, "rewards/margins": 2.598559506734212, "rewards/rejected": -2.2654484113057456, "step": 8673 }, { "epoch": 0.4597567116317282, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14663300.0, "logits/rejected": -7864684.0, "logps/chosen": -130.34393310546875, "logps/rejected": -284.08648681640625, "loss": 0.2179, "rewards/chosen": 0.9551560878753662, "rewards/margins": 2.984271287918091, "rewards/rejected": -2.0291152000427246, "step": 8674 }, { "epoch": 0.45980971563353035, "grad_norm": 39.0, "kl": 2.488567352294922, "learning_rate": 5e-07, "logits/chosen": -19062030.0, "logits/rejected": -55049928.0, "logps/chosen": -117.58708953857422, "logps/rejected": -436.0807189941406, "loss": 0.2848, "rewards/chosen": 0.3793543875217438, "rewards/margins": 2.610939174890518, "rewards/rejected": -2.2315847873687744, "step": 8675 }, { "epoch": 0.4598627196353325, "grad_norm": 49.25, "kl": 0.3073740005493164, "learning_rate": 5e-07, "logits/chosen": -47387940.0, "logits/rejected": 1177273.9166666667, "logps/chosen": -469.03778076171875, "logps/rejected": -88.05912272135417, "loss": 0.2768, "rewards/chosen": 0.8723357915878296, "rewards/margins": 2.0270432233810425, "rewards/rejected": -1.154707431793213, "step": 8676 }, { "epoch": 0.4599157236371346, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20712012.0, "logits/rejected": -60476188.0, "logps/chosen": -177.40008544921875, "logps/rejected": -482.4918212890625, "loss": 0.3668, "rewards/chosen": 0.10331402222315471, "rewards/margins": 3.266795257727305, "rewards/rejected": -3.1634812355041504, "step": 8677 }, { "epoch": 0.45996872763893676, "grad_norm": 43.75, "kl": 0.25942134857177734, "learning_rate": 5e-07, "logits/chosen": -32148628.0, "logits/rejected": -32348746.666666668, "logps/chosen": -137.60122680664062, "logps/rejected": -410.5832926432292, "loss": 0.2741, "rewards/chosen": -0.14877614378929138, "rewards/margins": 1.6500791609287262, "rewards/rejected": -1.7988553047180176, "step": 8678 }, { "epoch": 0.4600217316407389, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85150634.66666667, "logits/rejected": -50315443.2, "logps/chosen": -664.1468912760416, "logps/rejected": -360.78369140625, "loss": 0.2268, "rewards/chosen": 0.3606109619140625, "rewards/margins": 2.569981002807617, "rewards/rejected": -2.2093700408935546, "step": 8679 }, { "epoch": 0.46007473564254103, "grad_norm": 48.5, "kl": 3.00494384765625, "learning_rate": 5e-07, "logits/chosen": 5345959.5, "logits/rejected": -17979248.0, "logps/chosen": -309.7705078125, "logps/rejected": -221.853515625, "loss": 0.2469, "rewards/chosen": 0.970207691192627, "rewards/margins": 3.128387451171875, "rewards/rejected": -2.158179759979248, "step": 8680 }, { "epoch": 0.46012773964434317, "grad_norm": 51.0, "kl": 2.4507617950439453, "learning_rate": 5e-07, "logits/chosen": -4291872.666666667, "logits/rejected": 9344599.0, "logps/chosen": -203.13629150390625, "logps/rejected": -170.85350036621094, "loss": 0.3663, "rewards/chosen": 0.6142466068267822, "rewards/margins": 1.6156669855117798, "rewards/rejected": -1.0014203786849976, "step": 8681 }, { "epoch": 0.4601807436461453, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31226181.333333332, "logits/rejected": 25222956.8, "logps/chosen": -219.23539225260416, "logps/rejected": -375.5001708984375, "loss": 0.2368, "rewards/chosen": 0.5495623350143433, "rewards/margins": 2.5487423658370973, "rewards/rejected": -1.9991800308227539, "step": 8682 }, { "epoch": 0.46023374764794744, "grad_norm": 50.75, "kl": 2.1342878341674805, "learning_rate": 5e-07, "logits/chosen": -18809598.4, "logits/rejected": -27089536.0, "logps/chosen": -512.17021484375, "logps/rejected": -778.3841959635416, "loss": 0.2374, "rewards/chosen": 0.9677897453308105, "rewards/margins": 5.72630033493042, "rewards/rejected": -4.758510589599609, "step": 8683 }, { "epoch": 0.4602867516497496, "grad_norm": 42.25, "kl": 0.592376708984375, "learning_rate": 5e-07, "logits/chosen": -26109714.666666668, "logits/rejected": -10051121.6, "logps/chosen": -770.302490234375, "logps/rejected": -191.5125732421875, "loss": 0.1848, "rewards/chosen": 1.92427396774292, "rewards/margins": 3.887587642669678, "rewards/rejected": -1.963313674926758, "step": 8684 }, { "epoch": 0.4603397556515517, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54176616.0, "logits/rejected": -25910666.666666668, "logps/chosen": -349.7167663574219, "logps/rejected": -274.05051676432294, "loss": 0.2397, "rewards/chosen": 0.0038665831089019775, "rewards/margins": 1.8285833100477855, "rewards/rejected": -1.8247167269388835, "step": 8685 }, { "epoch": 0.46039275965335386, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12471359.0, "logits/rejected": -49543312.0, "logps/chosen": -228.6557159423828, "logps/rejected": -393.7213948567708, "loss": 0.147, "rewards/chosen": 0.9980224967002869, "rewards/margins": 3.7171924312909446, "rewards/rejected": -2.7191699345906577, "step": 8686 }, { "epoch": 0.46044576365515594, "grad_norm": 35.75, "kl": 0.10994148254394531, "learning_rate": 5e-07, "logits/chosen": -21930234.0, "logits/rejected": -28715237.333333332, "logps/chosen": -885.614501953125, "logps/rejected": -305.86269124348956, "loss": 0.1829, "rewards/chosen": 1.5290024280548096, "rewards/margins": 3.476022481918335, "rewards/rejected": -1.9470200538635254, "step": 8687 }, { "epoch": 0.4604987676569581, "grad_norm": 52.5, "kl": 1.6739826202392578, "learning_rate": 5e-07, "logits/chosen": -12221282.4, "logits/rejected": -21587370.666666668, "logps/chosen": -205.4788330078125, "logps/rejected": -312.63999430338544, "loss": 0.3772, "rewards/chosen": 0.5547186374664307, "rewards/margins": 2.1308626969655355, "rewards/rejected": -1.5761440594991047, "step": 8688 }, { "epoch": 0.4605517716587602, "grad_norm": 48.25, "kl": 0.00335693359375, "learning_rate": 5e-07, "logits/chosen": -15653857.142857144, "logits/rejected": -17523668.0, "logps/chosen": -480.7337123325893, "logps/rejected": -428.44464111328125, "loss": 0.3783, "rewards/chosen": 0.8610846655709403, "rewards/margins": 1.916059102330889, "rewards/rejected": -1.0549744367599487, "step": 8689 }, { "epoch": 0.46060477566056235, "grad_norm": 49.5, "kl": 7.713359832763672, "learning_rate": 5e-07, "logits/chosen": -14884890.285714285, "logits/rejected": -34998736.0, "logps/chosen": -241.44168526785714, "logps/rejected": -724.5032348632812, "loss": 0.5191, "rewards/chosen": 0.18537330627441406, "rewards/margins": 6.738864421844482, "rewards/rejected": -6.553491115570068, "step": 8690 }, { "epoch": 0.4606577796623645, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59754544.0, "logits/rejected": -15852150.0, "logps/chosen": -346.80096435546875, "logps/rejected": -380.4925231933594, "loss": 0.2443, "rewards/chosen": 0.5926728844642639, "rewards/margins": 3.676204025745392, "rewards/rejected": -3.083531141281128, "step": 8691 }, { "epoch": 0.4607107836641666, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39097970.666666664, "logits/rejected": -14901531.2, "logps/chosen": -477.5031331380208, "logps/rejected": -216.466796875, "loss": 0.2851, "rewards/chosen": 0.39635133743286133, "rewards/margins": 1.9971013069152832, "rewards/rejected": -1.6007499694824219, "step": 8692 }, { "epoch": 0.46076378766596876, "grad_norm": 47.25, "kl": 1.0942726135253906, "learning_rate": 5e-07, "logits/chosen": -27943382.4, "logits/rejected": -26423749.333333332, "logps/chosen": -274.7279052734375, "logps/rejected": -418.701416015625, "loss": 0.3364, "rewards/chosen": 0.3508593082427979, "rewards/margins": 2.102088435490926, "rewards/rejected": -1.7512291272481282, "step": 8693 }, { "epoch": 0.4608167916677709, "grad_norm": 69.0, "kl": 0.8042945861816406, "learning_rate": 5e-07, "logits/chosen": -36350400.0, "logits/rejected": -17422654.666666668, "logps/chosen": -221.111572265625, "logps/rejected": -254.4444376627604, "loss": 0.3291, "rewards/chosen": 0.5207639694213867, "rewards/margins": 2.345109558105469, "rewards/rejected": -1.824345588684082, "step": 8694 }, { "epoch": 0.46086979566957303, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22030080.0, "logits/rejected": -19448204.0, "logps/chosen": -197.46890258789062, "logps/rejected": -388.2118225097656, "loss": 0.3309, "rewards/chosen": -0.06973190605640411, "rewards/margins": 1.898340031504631, "rewards/rejected": -1.9680719375610352, "step": 8695 }, { "epoch": 0.46092279967137517, "grad_norm": 57.75, "kl": 0.21960067749023438, "learning_rate": 5e-07, "logits/chosen": -36534084.571428575, "logits/rejected": -79467120.0, "logps/chosen": -255.52779715401786, "logps/rejected": -426.55291748046875, "loss": 0.3935, "rewards/chosen": 0.2849199431283133, "rewards/margins": 2.6467089312417165, "rewards/rejected": -2.3617889881134033, "step": 8696 }, { "epoch": 0.4609758036731773, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14474257.0, "logits/rejected": -8267796.5, "logps/chosen": -211.07708740234375, "logps/rejected": -111.23120880126953, "loss": 0.3872, "rewards/chosen": -0.09567652642726898, "rewards/margins": 1.051511898636818, "rewards/rejected": -1.147188425064087, "step": 8697 }, { "epoch": 0.46102880767497945, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5665232.0, "logits/rejected": -13866016.0, "logps/chosen": -323.106201171875, "logps/rejected": -274.65216064453125, "loss": 0.1751, "rewards/chosen": 1.2164665460586548, "rewards/margins": 3.776272416114807, "rewards/rejected": -2.5598058700561523, "step": 8698 }, { "epoch": 0.4610818116767816, "grad_norm": 53.25, "kl": 1.3463973999023438, "learning_rate": 5e-07, "logits/chosen": -55594176.0, "logits/rejected": -30962464.0, "logps/chosen": -369.976220703125, "logps/rejected": -580.09423828125, "loss": 0.2702, "rewards/chosen": 0.6080151557922363, "rewards/margins": 4.1337967872619625, "rewards/rejected": -3.5257816314697266, "step": 8699 }, { "epoch": 0.4611348156785837, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76798592.0, "logits/rejected": -9156743.2, "logps/chosen": -337.5867919921875, "logps/rejected": -335.165771484375, "loss": 0.2104, "rewards/chosen": 0.8792858123779297, "rewards/margins": 2.939126968383789, "rewards/rejected": -2.0598411560058594, "step": 8700 }, { "epoch": 0.46118781968038586, "grad_norm": 48.0, "kl": 0.2264118194580078, "learning_rate": 5e-07, "logits/chosen": -40773283.2, "logits/rejected": -64753594.666666664, "logps/chosen": -299.3421630859375, "logps/rejected": -305.4993896484375, "loss": 0.3436, "rewards/chosen": 0.31054534912109377, "rewards/margins": 2.13262357711792, "rewards/rejected": -1.8220782279968262, "step": 8701 }, { "epoch": 0.461240823682188, "grad_norm": 44.5, "kl": 2.543720245361328, "learning_rate": 5e-07, "logits/chosen": -31799571.2, "logits/rejected": -28067538.666666668, "logps/chosen": -229.4412353515625, "logps/rejected": -339.35996500651044, "loss": 0.405, "rewards/chosen": 0.2293633460998535, "rewards/margins": 2.0872204144795736, "rewards/rejected": -1.85785706837972, "step": 8702 }, { "epoch": 0.46129382768399013, "grad_norm": 55.75, "kl": 2.424335479736328, "learning_rate": 5e-07, "logits/chosen": 59889286.4, "logits/rejected": -36302773.333333336, "logps/chosen": -276.9453857421875, "logps/rejected": -377.5628662109375, "loss": 0.2491, "rewards/chosen": 1.316619873046875, "rewards/margins": 3.2621703147888184, "rewards/rejected": -1.9455504417419434, "step": 8703 }, { "epoch": 0.46134683168579227, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 496802.5, "logits/rejected": -77898617.6, "logps/chosen": -85.1604715983073, "logps/rejected": -424.702099609375, "loss": 0.2635, "rewards/chosen": 0.0937700867652893, "rewards/margins": 2.625413954257965, "rewards/rejected": -2.531643867492676, "step": 8704 }, { "epoch": 0.4613998356875944, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -105719960.0, "logits/rejected": -34336009.14285714, "logps/chosen": -317.817138671875, "logps/rejected": -382.52650669642856, "loss": 0.176, "rewards/chosen": 0.377328485250473, "rewards/margins": 2.370900856597083, "rewards/rejected": -1.99357237134661, "step": 8705 }, { "epoch": 0.46145283968939654, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38349296.0, "logits/rejected": -55520320.0, "logps/chosen": -157.1070556640625, "logps/rejected": -456.814501953125, "loss": 0.2256, "rewards/chosen": 0.23074670632680258, "rewards/margins": 2.8247629721959435, "rewards/rejected": -2.594016265869141, "step": 8706 }, { "epoch": 0.4615058436911987, "grad_norm": 45.75, "kl": 0.8547706604003906, "learning_rate": 5e-07, "logits/chosen": -32358872.0, "logits/rejected": -2868373.6, "logps/chosen": -314.11183675130206, "logps/rejected": -211.98837890625, "loss": 0.2459, "rewards/chosen": 0.1558649738629659, "rewards/margins": 2.3177466114362084, "rewards/rejected": -2.1618816375732424, "step": 8707 }, { "epoch": 0.4615588476930008, "grad_norm": 41.25, "kl": 0.11401748657226562, "learning_rate": 5e-07, "logits/chosen": -12140423.0, "logits/rejected": -6290742.0, "logps/chosen": -302.5075988769531, "logps/rejected": -116.19602457682292, "loss": 0.1945, "rewards/chosen": 1.1421715021133423, "rewards/margins": 3.219774524370829, "rewards/rejected": -2.077603022257487, "step": 8708 }, { "epoch": 0.46161185169480295, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33975136.0, "logits/rejected": -13715990.666666666, "logps/chosen": -468.166845703125, "logps/rejected": -562.6442057291666, "loss": 0.2013, "rewards/chosen": 1.3520200729370118, "rewards/margins": 4.601953315734863, "rewards/rejected": -3.2499332427978516, "step": 8709 }, { "epoch": 0.4616648556966051, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15325132.0, "logits/rejected": -20344193.6, "logps/chosen": -283.67767333984375, "logps/rejected": -126.4593017578125, "loss": 0.2352, "rewards/chosen": 0.9684723218282064, "rewards/margins": 2.772311051686605, "rewards/rejected": -1.8038387298583984, "step": 8710 }, { "epoch": 0.46171785969840723, "grad_norm": 54.75, "kl": 1.185257911682129, "learning_rate": 5e-07, "logits/chosen": -11406528.0, "logits/rejected": -28102588.0, "logps/chosen": -357.20479910714283, "logps/rejected": -332.6053466796875, "loss": 0.3231, "rewards/chosen": 0.857752936226981, "rewards/margins": 3.2955216339656284, "rewards/rejected": -2.4377686977386475, "step": 8711 }, { "epoch": 0.46177086370020937, "grad_norm": 49.0, "kl": 0.0604400634765625, "learning_rate": 5e-07, "logits/chosen": -30027721.6, "logits/rejected": -36439493.333333336, "logps/chosen": -706.9126953125, "logps/rejected": -838.3531901041666, "loss": 0.2285, "rewards/chosen": 0.853244686126709, "rewards/margins": 3.7990009625752768, "rewards/rejected": -2.945756276448568, "step": 8712 }, { "epoch": 0.4618238677020115, "grad_norm": 39.25, "kl": 0.2772083282470703, "learning_rate": 5e-07, "logits/chosen": -24915246.0, "logits/rejected": -25173568.0, "logps/chosen": -115.49012756347656, "logps/rejected": -252.55479431152344, "loss": 0.3672, "rewards/chosen": -0.05817561224102974, "rewards/margins": 1.4033965580165386, "rewards/rejected": -1.4615721702575684, "step": 8713 }, { "epoch": 0.46187687170381364, "grad_norm": 46.25, "kl": 0.17562484741210938, "learning_rate": 5e-07, "logits/chosen": -69430960.0, "logits/rejected": -26074734.0, "logps/chosen": -502.2329406738281, "logps/rejected": -291.28973388671875, "loss": 0.2138, "rewards/chosen": 0.8493118286132812, "rewards/margins": 3.4372520446777344, "rewards/rejected": -2.587940216064453, "step": 8714 }, { "epoch": 0.4619298757056158, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19627448.0, "logits/rejected": -22001505.333333332, "logps/chosen": -259.75791015625, "logps/rejected": -340.82191975911456, "loss": 0.2692, "rewards/chosen": 0.4524266719818115, "rewards/margins": 3.6106412410736084, "rewards/rejected": -3.158214569091797, "step": 8715 }, { "epoch": 0.4619828797074179, "grad_norm": 56.25, "kl": 1.3020315170288086, "learning_rate": 5e-07, "logits/chosen": -11845884.0, "logits/rejected": -25987020.0, "logps/chosen": -258.71925862630206, "logps/rejected": -327.98138427734375, "loss": 0.3264, "rewards/chosen": 0.4665474096934001, "rewards/margins": 2.7226027647654214, "rewards/rejected": -2.2560553550720215, "step": 8716 }, { "epoch": 0.46203588370922005, "grad_norm": 58.75, "kl": 0.3693389892578125, "learning_rate": 5e-07, "logits/chosen": -22968683.42857143, "logits/rejected": -53543928.0, "logps/chosen": -315.0371791294643, "logps/rejected": -84.440673828125, "loss": 0.3963, "rewards/chosen": 0.4709454263959612, "rewards/margins": 1.394657620361873, "rewards/rejected": -0.9237121939659119, "step": 8717 }, { "epoch": 0.4620888877110222, "grad_norm": 30.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 128917.5703125, "logits/rejected": -17028813.333333332, "logps/chosen": -156.09619140625, "logps/rejected": -361.9813232421875, "loss": 0.1411, "rewards/chosen": 1.011598825454712, "rewards/margins": 3.6396432717641196, "rewards/rejected": -2.6280444463094077, "step": 8718 }, { "epoch": 0.4621418917128243, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31101558.4, "logits/rejected": -26811840.0, "logps/chosen": -290.645849609375, "logps/rejected": -247.2425740559896, "loss": 0.3289, "rewards/chosen": 0.2833050966262817, "rewards/margins": 2.118644960721334, "rewards/rejected": -1.835339864095052, "step": 8719 }, { "epoch": 0.46219489571462646, "grad_norm": 58.0, "kl": 1.3190326690673828, "learning_rate": 5e-07, "logits/chosen": -28309427.2, "logits/rejected": 7405306.666666667, "logps/chosen": -205.2464111328125, "logps/rejected": -158.19593302408853, "loss": 0.42, "rewards/chosen": -0.033843010663986206, "rewards/margins": 1.2139796713987987, "rewards/rejected": -1.247822682062785, "step": 8720 }, { "epoch": 0.4622478997164286, "grad_norm": 57.25, "kl": 0.2357335090637207, "learning_rate": 5e-07, "logits/chosen": -15325364.0, "logits/rejected": -9403315.0, "logps/chosen": -274.4313151041667, "logps/rejected": -257.40911865234375, "loss": 0.3233, "rewards/chosen": 0.462430198987325, "rewards/margins": 3.0011362632115683, "rewards/rejected": -2.538706064224243, "step": 8721 }, { "epoch": 0.46230090371823074, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89030474.66666667, "logits/rejected": 25493432.0, "logps/chosen": -334.789306640625, "logps/rejected": -499.13544921875, "loss": 0.2442, "rewards/chosen": 0.3628245194753011, "rewards/margins": 2.599297030766805, "rewards/rejected": -2.236472511291504, "step": 8722 }, { "epoch": 0.4623539077200329, "grad_norm": 45.0, "kl": 2.0201292037963867, "learning_rate": 5e-07, "logits/chosen": -42495513.6, "logits/rejected": -26282306.666666668, "logps/chosen": -245.91630859375, "logps/rejected": -367.2447916666667, "loss": 0.2687, "rewards/chosen": 0.9516173362731933, "rewards/margins": 3.3221946716308595, "rewards/rejected": -2.370577335357666, "step": 8723 }, { "epoch": 0.462406911721835, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23206632.0, "logits/rejected": -15522720.0, "logps/chosen": -342.0059407552083, "logps/rejected": -372.7139892578125, "loss": 0.231, "rewards/chosen": 0.4488879442214966, "rewards/margins": 2.9004801988601683, "rewards/rejected": -2.4515922546386717, "step": 8724 }, { "epoch": 0.46245991572363715, "grad_norm": 42.25, "kl": 3.1526451110839844, "learning_rate": 5e-07, "logits/chosen": -19925805.333333332, "logits/rejected": -18998144.0, "logps/chosen": -266.8254801432292, "logps/rejected": -287.5841064453125, "loss": 0.3677, "rewards/chosen": 0.5865233341852824, "rewards/margins": 2.480242053667704, "rewards/rejected": -1.8937187194824219, "step": 8725 }, { "epoch": 0.4625129197254393, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -260502.375, "logits/rejected": -57680533.333333336, "logps/chosen": -250.2796630859375, "logps/rejected": -408.2173665364583, "loss": 0.1794, "rewards/chosen": 0.5456740260124207, "rewards/margins": 3.0488478541374207, "rewards/rejected": -2.503173828125, "step": 8726 }, { "epoch": 0.4625659237272414, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5691219.2, "logits/rejected": -19280696.0, "logps/chosen": -158.53836669921876, "logps/rejected": -340.0421549479167, "loss": 0.2918, "rewards/chosen": 0.3836104393005371, "rewards/margins": 2.829448731740316, "rewards/rejected": -2.445838292439779, "step": 8727 }, { "epoch": 0.46261892772904356, "grad_norm": 41.0, "kl": 0.7403488159179688, "learning_rate": 5e-07, "logits/chosen": -43561080.0, "logits/rejected": -14601216.0, "logps/chosen": -372.8143615722656, "logps/rejected": -220.1177978515625, "loss": 0.2861, "rewards/chosen": -0.09378509223461151, "rewards/margins": 1.830825999379158, "rewards/rejected": -1.9246110916137695, "step": 8728 }, { "epoch": 0.4626719317308457, "grad_norm": 46.5, "kl": 1.3885955810546875, "learning_rate": 5e-07, "logits/chosen": -44275814.4, "logits/rejected": 1801056.5, "logps/chosen": -356.453173828125, "logps/rejected": -108.34775797526042, "loss": 0.2856, "rewards/chosen": 1.05845947265625, "rewards/margins": 2.803701909383138, "rewards/rejected": -1.745242436726888, "step": 8729 }, { "epoch": 0.46272493573264784, "grad_norm": 56.5, "kl": 0.00118255615234375, "learning_rate": 5e-07, "logits/chosen": -34438476.8, "logits/rejected": -41957152.0, "logps/chosen": -482.668505859375, "logps/rejected": -259.45184326171875, "loss": 0.3042, "rewards/chosen": 0.4953296661376953, "rewards/margins": 2.754014460245768, "rewards/rejected": -2.2586847941080728, "step": 8730 }, { "epoch": 0.46277793973445, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6878831.0, "logits/rejected": 16786136.0, "logps/chosen": -30.30508041381836, "logps/rejected": -272.0218098958333, "loss": 0.2439, "rewards/chosen": 0.3300797641277313, "rewards/margins": 2.2758117218812304, "rewards/rejected": -1.9457319577534993, "step": 8731 }, { "epoch": 0.4628309437362521, "grad_norm": 45.25, "kl": 0.196319580078125, "learning_rate": 5e-07, "logits/chosen": -25454216.0, "logits/rejected": -16403970.666666666, "logps/chosen": -212.3899658203125, "logps/rejected": -306.8547770182292, "loss": 0.3996, "rewards/chosen": 0.0174205482006073, "rewards/margins": 1.288710214694341, "rewards/rejected": -1.2712896664937336, "step": 8732 }, { "epoch": 0.46288394773805425, "grad_norm": 46.5, "kl": 0.2193145751953125, "learning_rate": 5e-07, "logits/chosen": -17467622.4, "logits/rejected": -21091156.0, "logps/chosen": -174.97864990234376, "logps/rejected": -177.92081705729166, "loss": 0.3757, "rewards/chosen": 0.06372935771942138, "rewards/margins": 1.6236668348312377, "rewards/rejected": -1.5599374771118164, "step": 8733 }, { "epoch": 0.4629369517398564, "grad_norm": 42.5, "kl": 0.7793922424316406, "learning_rate": 5e-07, "logits/chosen": -17633720.0, "logits/rejected": -18567886.4, "logps/chosen": -249.9775594075521, "logps/rejected": -595.121435546875, "loss": 0.2336, "rewards/chosen": 0.6775245666503906, "rewards/margins": 3.0287359237670897, "rewards/rejected": -2.351211357116699, "step": 8734 }, { "epoch": 0.4629899557416585, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1296336.625, "logits/rejected": -35661949.333333336, "logps/chosen": -24.383380889892578, "logps/rejected": -277.60166422526044, "loss": 0.2644, "rewards/chosen": 0.5619198083877563, "rewards/margins": 2.059783101081848, "rewards/rejected": -1.4978632926940918, "step": 8735 }, { "epoch": 0.46304295974346066, "grad_norm": 26.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2887924.3333333335, "logits/rejected": -7797916.8, "logps/chosen": -178.716064453125, "logps/rejected": -175.85443115234375, "loss": 0.1675, "rewards/chosen": 1.2881513436635335, "rewards/margins": 3.6086799462636314, "rewards/rejected": -2.3205286026000977, "step": 8736 }, { "epoch": 0.46309596374526274, "grad_norm": 49.25, "kl": 0.031790733337402344, "learning_rate": 5e-07, "logits/chosen": -26009518.4, "logits/rejected": -10358529.333333334, "logps/chosen": -410.512158203125, "logps/rejected": -163.69538370768228, "loss": 0.2902, "rewards/chosen": 0.525559377670288, "rewards/margins": 3.3308511892954504, "rewards/rejected": -2.8052918116251626, "step": 8737 }, { "epoch": 0.4631489677470649, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 54621728.0, "logits/rejected": -37228628.571428575, "logps/chosen": -298.51007080078125, "logps/rejected": -267.6067592075893, "loss": 0.2364, "rewards/chosen": -0.24953614175319672, "rewards/margins": 1.3974938711949758, "rewards/rejected": -1.6470300129481725, "step": 8738 }, { "epoch": 0.463201971748867, "grad_norm": 35.25, "kl": 0.33911895751953125, "learning_rate": 5e-07, "logits/chosen": 9401542.666666666, "logits/rejected": -8265128.8, "logps/chosen": -162.56301879882812, "logps/rejected": -202.7329833984375, "loss": 0.2545, "rewards/chosen": 0.8345223267873129, "rewards/margins": 2.240780528386434, "rewards/rejected": -1.4062582015991212, "step": 8739 }, { "epoch": 0.46325497575066915, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3026412.6666666665, "logits/rejected": -39511993.6, "logps/chosen": -73.74014282226562, "logps/rejected": -401.01328125, "loss": 0.2408, "rewards/chosen": 0.04150416453679403, "rewards/margins": 2.5904539783795677, "rewards/rejected": -2.5489498138427735, "step": 8740 }, { "epoch": 0.4633079797524713, "grad_norm": 28.5, "kl": 2.8281784057617188, "learning_rate": 5e-07, "logits/chosen": 2444251.0, "logits/rejected": -23271884.8, "logps/chosen": -588.2012532552084, "logps/rejected": -553.82958984375, "loss": 0.2224, "rewards/chosen": 1.7935352325439453, "rewards/margins": 4.597380638122559, "rewards/rejected": -2.8038454055786133, "step": 8741 }, { "epoch": 0.4633609837542734, "grad_norm": 55.5, "kl": 0.717158317565918, "learning_rate": 5e-07, "logits/chosen": -37580800.0, "logits/rejected": -11565911.0, "logps/chosen": -236.56118774414062, "logps/rejected": -100.29124450683594, "loss": 0.3124, "rewards/chosen": 0.7684012055397034, "rewards/margins": 2.282051146030426, "rewards/rejected": -1.5136499404907227, "step": 8742 }, { "epoch": 0.46341398775607556, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23806664.0, "logits/rejected": -23259180.0, "logps/chosen": -329.28619384765625, "logps/rejected": -303.8096923828125, "loss": 0.3102, "rewards/chosen": 0.14796680212020874, "rewards/margins": 2.3230053782463074, "rewards/rejected": -2.1750385761260986, "step": 8743 }, { "epoch": 0.4634669917578777, "grad_norm": 46.75, "kl": 1.6438570022583008, "learning_rate": 5e-07, "logits/chosen": 2398232.75, "logits/rejected": -10056522.0, "logps/chosen": -68.33930969238281, "logps/rejected": -253.00250244140625, "loss": 0.3382, "rewards/chosen": 0.26557666063308716, "rewards/margins": 1.6311361193656921, "rewards/rejected": -1.365559458732605, "step": 8744 }, { "epoch": 0.46351999575967984, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5297839.0, "logits/rejected": -3290832.4, "logps/chosen": -446.1709391276042, "logps/rejected": -150.2212158203125, "loss": 0.4073, "rewards/chosen": -0.6599812904993693, "rewards/margins": 0.3998358329137167, "rewards/rejected": -1.059817123413086, "step": 8745 }, { "epoch": 0.463572999761482, "grad_norm": 44.5, "kl": 2.546825408935547, "learning_rate": 5e-07, "logits/chosen": -28819053.333333332, "logits/rejected": -39309084.0, "logps/chosen": -236.81416829427084, "logps/rejected": -258.7138366699219, "loss": 0.2939, "rewards/chosen": 1.1448062260945637, "rewards/margins": 2.6643756230672198, "rewards/rejected": -1.5195693969726562, "step": 8746 }, { "epoch": 0.4636260037632841, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60642112.0, "logits/rejected": -12712850.666666666, "logps/chosen": -454.67236328125, "logps/rejected": -198.6893513997396, "loss": 0.3433, "rewards/chosen": 0.2290422201156616, "rewards/margins": 2.777019508679708, "rewards/rejected": -2.5479772885640464, "step": 8747 }, { "epoch": 0.46367900776508625, "grad_norm": 45.25, "kl": 0.9743442535400391, "learning_rate": 5e-07, "logits/chosen": 1393390.6, "logits/rejected": -73727370.66666667, "logps/chosen": -123.6273193359375, "logps/rejected": -200.54364013671875, "loss": 0.3311, "rewards/chosen": 0.22804079055786133, "rewards/margins": 2.852731482187907, "rewards/rejected": -2.6246906916300454, "step": 8748 }, { "epoch": 0.4637320117668884, "grad_norm": 55.75, "kl": 0.8357143402099609, "learning_rate": 5e-07, "logits/chosen": -29997412.0, "logits/rejected": -1878289.625, "logps/chosen": -342.1275329589844, "logps/rejected": -91.54049682617188, "loss": 0.3364, "rewards/chosen": 0.9841375350952148, "rewards/margins": 1.6309444904327393, "rewards/rejected": -0.6468069553375244, "step": 8749 }, { "epoch": 0.4637850157686905, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40701706.666666664, "logits/rejected": 9144657.6, "logps/chosen": -355.9958902994792, "logps/rejected": -213.45478515625, "loss": 0.2708, "rewards/chosen": 0.2147559920946757, "rewards/margins": 2.051090983549754, "rewards/rejected": -1.8363349914550782, "step": 8750 }, { "epoch": 0.46383801977049266, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11899720.0, "logits/rejected": -61415312.0, "logps/chosen": -450.37548828125, "logps/rejected": -502.6380920410156, "loss": 0.287, "rewards/chosen": 0.2371448576450348, "rewards/margins": 2.8137692511081696, "rewards/rejected": -2.5766243934631348, "step": 8751 }, { "epoch": 0.4638910237722948, "grad_norm": 37.0, "kl": 4.485224723815918, "learning_rate": 5e-07, "logits/chosen": 26109638.0, "logits/rejected": -29427604.0, "logps/chosen": -604.8230590820312, "logps/rejected": -255.24539184570312, "loss": 0.2964, "rewards/chosen": 1.4955036640167236, "rewards/margins": 3.408554792404175, "rewards/rejected": -1.9130511283874512, "step": 8752 }, { "epoch": 0.46394402777409693, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24208192.0, "logits/rejected": -12121053.333333334, "logps/chosen": -267.0228515625, "logps/rejected": -667.6459554036459, "loss": 0.329, "rewards/chosen": 0.20573487281799316, "rewards/margins": 3.1179396152496337, "rewards/rejected": -2.9122047424316406, "step": 8753 }, { "epoch": 0.46399703177589907, "grad_norm": 47.5, "kl": 2.4396629333496094, "learning_rate": 5e-07, "logits/chosen": -50115684.0, "logits/rejected": -21963278.666666668, "logps/chosen": -498.62091064453125, "logps/rejected": -179.9627685546875, "loss": 0.2209, "rewards/chosen": 0.8680481314659119, "rewards/margins": 2.331681986649831, "rewards/rejected": -1.4636338551839192, "step": 8754 }, { "epoch": 0.4640500357777012, "grad_norm": 41.75, "kl": 0.10476016998291016, "learning_rate": 5e-07, "logits/chosen": 5830136.666666667, "logits/rejected": 15483435.2, "logps/chosen": -537.2774658203125, "logps/rejected": -97.56439819335938, "loss": 0.2988, "rewards/chosen": 0.7153668403625488, "rewards/margins": 2.2363894462585447, "rewards/rejected": -1.5210226058959961, "step": 8755 }, { "epoch": 0.46410303977950335, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35291168.0, "logits/rejected": -19090386.666666668, "logps/chosen": -288.0036865234375, "logps/rejected": -321.5270589192708, "loss": 0.3933, "rewards/chosen": 0.011798858642578125, "rewards/margins": 1.634071667989095, "rewards/rejected": -1.6222728093465169, "step": 8756 }, { "epoch": 0.4641560437813055, "grad_norm": 69.0, "kl": 0.305633544921875, "learning_rate": 5e-07, "logits/chosen": -102795824.0, "logits/rejected": -17569268.57142857, "logps/chosen": -469.1820068359375, "logps/rejected": -312.1206752232143, "loss": 0.2656, "rewards/chosen": -0.4082702696323395, "rewards/margins": 1.075702279806137, "rewards/rejected": -1.4839725494384766, "step": 8757 }, { "epoch": 0.4642090477831076, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52970032.0, "logits/rejected": -10910168.0, "logps/chosen": -426.7099914550781, "logps/rejected": -170.7472941080729, "loss": 0.2556, "rewards/chosen": 0.16995926201343536, "rewards/margins": 2.3942308773597083, "rewards/rejected": -2.224271615346273, "step": 8758 }, { "epoch": 0.46426205178490976, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7248938.0, "logits/rejected": -23599220.8, "logps/chosen": -158.4546915690104, "logps/rejected": -307.5602783203125, "loss": 0.3772, "rewards/chosen": -0.374530553817749, "rewards/margins": 1.1172035694122315, "rewards/rejected": -1.4917341232299806, "step": 8759 }, { "epoch": 0.4643150557867119, "grad_norm": 62.0, "kl": 2.8576126098632812, "learning_rate": 5e-07, "logits/chosen": -7190084.8, "logits/rejected": -22682906.666666668, "logps/chosen": -670.95126953125, "logps/rejected": -657.668701171875, "loss": 0.2853, "rewards/chosen": 1.3095131874084474, "rewards/margins": 4.165677801767985, "rewards/rejected": -2.8561646143595376, "step": 8760 }, { "epoch": 0.46436805978851403, "grad_norm": 45.5, "kl": 0.8778877258300781, "learning_rate": 5e-07, "logits/chosen": -24990616.0, "logits/rejected": -2153450.0, "logps/chosen": -150.5028839111328, "logps/rejected": -235.19876098632812, "loss": 0.3908, "rewards/chosen": -0.1853814274072647, "rewards/margins": 1.1853381246328354, "rewards/rejected": -1.3707195520401, "step": 8761 }, { "epoch": 0.46442106379031617, "grad_norm": 39.0, "kl": 0.5891189575195312, "learning_rate": 5e-07, "logits/chosen": -28576928.0, "logits/rejected": -53306836.0, "logps/chosen": -197.08670043945312, "logps/rejected": -651.96630859375, "loss": 0.2531, "rewards/chosen": 0.33924582600593567, "rewards/margins": 3.52377513051033, "rewards/rejected": -3.1845293045043945, "step": 8762 }, { "epoch": 0.4644740677921183, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11726863.0, "logits/rejected": 31129268.0, "logps/chosen": -249.66641235351562, "logps/rejected": -305.7527160644531, "loss": 0.3174, "rewards/chosen": 0.7221935391426086, "rewards/margins": 1.9555932879447937, "rewards/rejected": -1.233399748802185, "step": 8763 }, { "epoch": 0.46452707179392044, "grad_norm": 44.5, "kl": 0.9304428100585938, "learning_rate": 5e-07, "logits/chosen": -57924568.0, "logits/rejected": 16778612.0, "logps/chosen": -338.62542724609375, "logps/rejected": -423.07763671875, "loss": 0.2506, "rewards/chosen": 0.3488689661026001, "rewards/margins": 3.344422936439514, "rewards/rejected": -2.995553970336914, "step": 8764 }, { "epoch": 0.4645800757957226, "grad_norm": 45.5, "kl": 0.0149993896484375, "learning_rate": 5e-07, "logits/chosen": -34325112.0, "logits/rejected": -18559468.0, "logps/chosen": -311.06195068359375, "logps/rejected": -323.5196533203125, "loss": 0.2684, "rewards/chosen": 0.05402222275733948, "rewards/margins": 3.2382620871067047, "rewards/rejected": -3.1842398643493652, "step": 8765 }, { "epoch": 0.4646330797975247, "grad_norm": 49.75, "kl": 1.3683967590332031, "learning_rate": 5e-07, "logits/chosen": -32868856.0, "logits/rejected": -4714685.333333333, "logps/chosen": -405.32220458984375, "logps/rejected": -268.1919352213542, "loss": 0.2757, "rewards/chosen": 0.3549911677837372, "rewards/margins": 2.1659878591696424, "rewards/rejected": -1.810996691385905, "step": 8766 }, { "epoch": 0.46468608379932685, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27854232.0, "logits/rejected": -9069152.0, "logps/chosen": -405.9280700683594, "logps/rejected": -243.04745483398438, "loss": 0.2952, "rewards/chosen": 0.33053818345069885, "rewards/margins": 2.05142405629158, "rewards/rejected": -1.7208858728408813, "step": 8767 }, { "epoch": 0.464739087801129, "grad_norm": 30.625, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23617722.0, "logps/rejected": -204.89389038085938, "loss": 0.1887, "rewards/rejected": -1.9920556545257568, "step": 8768 }, { "epoch": 0.46479209180293113, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -210265.3125, "logits/rejected": -52450826.666666664, "logps/chosen": -104.08598327636719, "logps/rejected": -575.2974446614584, "loss": 0.1973, "rewards/chosen": -0.20821094512939453, "rewards/margins": 2.3235510190327964, "rewards/rejected": -2.531761964162191, "step": 8769 }, { "epoch": 0.46484509580473327, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -800003.25, "logits/rejected": -33136884.0, "logps/chosen": -387.06964111328125, "logps/rejected": -404.81414794921875, "loss": 0.2302, "rewards/chosen": 0.6688764691352844, "rewards/margins": 3.2641342282295227, "rewards/rejected": -2.5952577590942383, "step": 8770 }, { "epoch": 0.4648980998065354, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -408142.4, "logits/rejected": -38115413.333333336, "logps/chosen": -82.4456298828125, "logps/rejected": -338.89080810546875, "loss": 0.2687, "rewards/chosen": 0.6041320323944092, "rewards/margins": 3.0627249240875245, "rewards/rejected": -2.4585928916931152, "step": 8771 }, { "epoch": 0.46495110380833754, "grad_norm": 43.25, "kl": 2.2322998046875, "learning_rate": 5e-07, "logits/chosen": -37407830.4, "logits/rejected": -29902648.0, "logps/chosen": -219.4664794921875, "logps/rejected": -114.30868530273438, "loss": 0.3838, "rewards/chosen": 0.20917787551879882, "rewards/margins": 1.7331106503804523, "rewards/rejected": -1.5239327748616536, "step": 8772 }, { "epoch": 0.4650041078101397, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33740554.666666664, "logits/rejected": 8117438.4, "logps/chosen": -237.40604654947916, "logps/rejected": -304.96220703125, "loss": 0.2683, "rewards/chosen": 0.2668421268463135, "rewards/margins": 1.9596361637115478, "rewards/rejected": -1.6927940368652343, "step": 8773 }, { "epoch": 0.4650571118119418, "grad_norm": 44.0, "kl": 0.3417243957519531, "learning_rate": 5e-07, "logits/chosen": 2224491.0, "logits/rejected": -17752130.666666668, "logps/chosen": -311.38092041015625, "logps/rejected": -269.0598958333333, "loss": 0.1949, "rewards/chosen": 0.6980911493301392, "rewards/margins": 2.54021433989207, "rewards/rejected": -1.8421231905619304, "step": 8774 }, { "epoch": 0.46511011581374395, "grad_norm": 49.5, "kl": 0.06826591491699219, "learning_rate": 5e-07, "logits/chosen": -30397502.0, "logits/rejected": -9280316.666666666, "logps/chosen": -338.852294921875, "logps/rejected": -227.59859212239584, "loss": 0.2525, "rewards/chosen": -0.03249511122703552, "rewards/margins": 1.758855253458023, "rewards/rejected": -1.7913503646850586, "step": 8775 }, { "epoch": 0.4651631198155461, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58994328.0, "logits/rejected": -5510427.0, "logps/chosen": -412.65521240234375, "logps/rejected": -341.9893493652344, "loss": 0.2376, "rewards/chosen": 0.9937881827354431, "rewards/margins": 3.080439507961273, "rewards/rejected": -2.08665132522583, "step": 8776 }, { "epoch": 0.4652161238173482, "grad_norm": 55.75, "kl": 0.5844192504882812, "learning_rate": 5e-07, "logits/chosen": -20439333.333333332, "logits/rejected": -40310393.6, "logps/chosen": -343.552490234375, "logps/rejected": -263.232568359375, "loss": 0.2412, "rewards/chosen": 0.7691599527994791, "rewards/margins": 2.7891384760538735, "rewards/rejected": -2.0199785232543945, "step": 8777 }, { "epoch": 0.46526912781915036, "grad_norm": 40.0, "kl": 1.7939796447753906, "learning_rate": 5e-07, "logits/chosen": -12123408.0, "logits/rejected": -30606028.8, "logps/chosen": -362.4807535807292, "logps/rejected": -379.6673583984375, "loss": 0.2116, "rewards/chosen": 0.5712265173594157, "rewards/margins": 2.842347733179728, "rewards/rejected": -2.2711212158203127, "step": 8778 }, { "epoch": 0.4653221318209525, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31049232.0, "logits/rejected": -85813482.66666667, "logps/chosen": -383.24822998046875, "logps/rejected": -370.98583984375, "loss": 0.2023, "rewards/chosen": 0.9324371218681335, "rewards/margins": 2.8431569933891296, "rewards/rejected": -1.910719871520996, "step": 8779 }, { "epoch": 0.46537513582275464, "grad_norm": 56.5, "kl": 2.9423866271972656, "learning_rate": 5e-07, "logits/chosen": -11731310.4, "logits/rejected": 908555.0, "logps/chosen": -230.129052734375, "logps/rejected": -93.64510091145833, "loss": 0.4424, "rewards/chosen": 0.3526019096374512, "rewards/margins": 1.3412156740824381, "rewards/rejected": -0.9886137644449869, "step": 8780 }, { "epoch": 0.4654281398245568, "grad_norm": 56.25, "kl": 0.23780441284179688, "learning_rate": 5e-07, "logits/chosen": -5239228.0, "logits/rejected": -12829490.0, "logps/chosen": -225.2942352294922, "logps/rejected": -264.3223571777344, "loss": 0.3246, "rewards/chosen": 0.010079056024551392, "rewards/margins": 1.996481567621231, "rewards/rejected": -1.9864025115966797, "step": 8781 }, { "epoch": 0.4654811438263589, "grad_norm": 52.25, "kl": 0.5601558685302734, "learning_rate": 5e-07, "logits/chosen": -17221924.8, "logits/rejected": -26822837.333333332, "logps/chosen": -141.2974365234375, "logps/rejected": -183.50960286458334, "loss": 0.3737, "rewards/chosen": 0.010837674140930176, "rewards/margins": 1.927403728167216, "rewards/rejected": -1.9165660540262859, "step": 8782 }, { "epoch": 0.46553414782816105, "grad_norm": 32.0, "kl": 1.313751220703125, "learning_rate": 5e-07, "logits/chosen": -24057771.2, "logits/rejected": -24105493.333333332, "logps/chosen": -170.459423828125, "logps/rejected": -301.7926025390625, "loss": 0.3019, "rewards/chosen": 0.21779294013977052, "rewards/margins": 3.1736295541127526, "rewards/rejected": -2.955836613972982, "step": 8783 }, { "epoch": 0.4655871518299632, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8164578.0, "logits/rejected": -31578538.0, "logps/chosen": -484.48541259765625, "logps/rejected": -511.2469482421875, "loss": 0.2885, "rewards/chosen": 0.3271636962890625, "rewards/margins": 2.726102113723755, "rewards/rejected": -2.3989384174346924, "step": 8784 }, { "epoch": 0.4656401558317653, "grad_norm": 41.0, "kl": 0.5972938537597656, "learning_rate": 5e-07, "logits/chosen": -42390980.0, "logits/rejected": -19373252.0, "logps/chosen": -295.6258239746094, "logps/rejected": -318.3548583984375, "loss": 0.2382, "rewards/chosen": 0.8143690228462219, "rewards/margins": 3.176772892475128, "rewards/rejected": -2.3624038696289062, "step": 8785 }, { "epoch": 0.46569315983356746, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37837296.0, "logits/rejected": -60628504.0, "logps/chosen": -573.0905151367188, "logps/rejected": -200.56875610351562, "loss": 0.2448, "rewards/chosen": 0.9550206661224365, "rewards/margins": 2.873039126396179, "rewards/rejected": -1.9180184602737427, "step": 8786 }, { "epoch": 0.4657461638353696, "grad_norm": 48.5, "kl": 0.9527168273925781, "learning_rate": 5e-07, "logits/chosen": -53572528.0, "logits/rejected": -25332154.666666668, "logps/chosen": -154.98048400878906, "logps/rejected": -340.21435546875, "loss": 0.2419, "rewards/chosen": 0.08070680499076843, "rewards/margins": 2.2885057628154755, "rewards/rejected": -2.207798957824707, "step": 8787 }, { "epoch": 0.4657991678371717, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24267264.0, "logits/rejected": -51667144.0, "logps/chosen": -586.76806640625, "logps/rejected": -547.909423828125, "loss": 0.1727, "rewards/chosen": 1.1905927658081055, "rewards/margins": 4.6893839836120605, "rewards/rejected": -3.498791217803955, "step": 8788 }, { "epoch": 0.4658521718389738, "grad_norm": 45.0, "kl": 0.48101806640625, "learning_rate": 5e-07, "logits/chosen": -22044096.0, "logits/rejected": -30455536.0, "logps/chosen": -349.36517333984375, "logps/rejected": -356.8836669921875, "loss": 0.1861, "rewards/chosen": 1.122296929359436, "rewards/margins": 3.3524900674819946, "rewards/rejected": -2.2301931381225586, "step": 8789 }, { "epoch": 0.46590517584077595, "grad_norm": 43.0, "kl": 1.0974349975585938, "learning_rate": 5e-07, "logits/chosen": -28426464.0, "logits/rejected": -24412157.333333332, "logps/chosen": -240.2852294921875, "logps/rejected": -399.7620035807292, "loss": 0.3201, "rewards/chosen": 0.36475138664245604, "rewards/margins": 3.7338910579681395, "rewards/rejected": -3.3691396713256836, "step": 8790 }, { "epoch": 0.4659581798425781, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2891443.6666666665, "logits/rejected": 16773270.4, "logps/chosen": -206.68155924479166, "logps/rejected": -202.68270263671874, "loss": 0.251, "rewards/chosen": 0.4093245267868042, "rewards/margins": 2.33383948802948, "rewards/rejected": -1.9245149612426757, "step": 8791 }, { "epoch": 0.46601118384438023, "grad_norm": 178.0, "kl": 1.1348342895507812, "learning_rate": 5e-07, "logits/chosen": -4320507.0, "logits/rejected": -9284499.0, "logps/chosen": -136.21067810058594, "logps/rejected": -136.4285888671875, "loss": 0.2843, "rewards/chosen": 0.4563429057598114, "rewards/margins": 2.339229315519333, "rewards/rejected": -1.8828864097595215, "step": 8792 }, { "epoch": 0.46606418784618237, "grad_norm": 54.25, "kl": 1.472330093383789, "learning_rate": 5e-07, "logits/chosen": -30305088.0, "logits/rejected": -67399544.0, "logps/chosen": -354.4296875, "logps/rejected": -282.73541259765625, "loss": 0.3059, "rewards/chosen": 0.7572172482808431, "rewards/margins": 3.212438186009725, "rewards/rejected": -2.455220937728882, "step": 8793 }, { "epoch": 0.4661171918479845, "grad_norm": 66.5, "kl": 1.3135795593261719, "learning_rate": 5e-07, "logits/chosen": 121863496.0, "logits/rejected": -33187108.0, "logps/chosen": -378.4009704589844, "logps/rejected": -271.08160400390625, "loss": 0.3587, "rewards/chosen": 0.17032985389232635, "rewards/margins": 1.6985475271940231, "rewards/rejected": -1.5282176733016968, "step": 8794 }, { "epoch": 0.46617019584978664, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17452450.666666668, "logits/rejected": 36951718.4, "logps/chosen": -425.2641194661458, "logps/rejected": -318.188232421875, "loss": 0.3881, "rewards/chosen": -0.3343369960784912, "rewards/margins": 0.693492555618286, "rewards/rejected": -1.0278295516967773, "step": 8795 }, { "epoch": 0.4662231998515888, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -340970.5, "logits/rejected": -12077532.0, "logps/chosen": -99.50247192382812, "logps/rejected": -311.60276285807294, "loss": 0.2638, "rewards/chosen": -0.8586059808731079, "rewards/margins": 1.4457048177719116, "rewards/rejected": -2.3043107986450195, "step": 8796 }, { "epoch": 0.4662762038533909, "grad_norm": 46.0, "kl": 2.8327016830444336, "learning_rate": 5e-07, "logits/chosen": -16010132.0, "logits/rejected": -31104936.0, "logps/chosen": -502.809814453125, "logps/rejected": -253.19027709960938, "loss": 0.293, "rewards/chosen": 1.0405896504720051, "rewards/margins": 3.432668050130208, "rewards/rejected": -2.392078399658203, "step": 8797 }, { "epoch": 0.46632920785519305, "grad_norm": 44.0, "kl": 0.3489723205566406, "learning_rate": 5e-07, "logits/chosen": -49957482.666666664, "logits/rejected": -12974081.6, "logps/chosen": -238.37935384114584, "logps/rejected": -246.197216796875, "loss": 0.251, "rewards/chosen": 0.6427842775980631, "rewards/margins": 2.219973913828532, "rewards/rejected": -1.5771896362304687, "step": 8798 }, { "epoch": 0.4663822118569952, "grad_norm": 44.75, "kl": 0.5618934631347656, "learning_rate": 5e-07, "logits/chosen": -27025376.0, "logits/rejected": -55332048.0, "logps/chosen": -226.9947713216146, "logps/rejected": -234.62890625, "loss": 0.3312, "rewards/chosen": 0.70775039990743, "rewards/margins": 2.153893788655599, "rewards/rejected": -1.446143388748169, "step": 8799 }, { "epoch": 0.4664352158587973, "grad_norm": 31.125, "kl": 0.13697052001953125, "learning_rate": 5e-07, "logits/chosen": -7111843.0, "logits/rejected": -46338532.571428575, "logps/chosen": -479.5654296875, "logps/rejected": -437.2863071986607, "loss": 0.0933, "rewards/chosen": 1.4351806640625, "rewards/margins": 4.469955989292689, "rewards/rejected": -3.0347753252301897, "step": 8800 }, { "epoch": 0.46648821986059946, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30115032.0, "logits/rejected": -26662984.0, "logps/chosen": -309.2572937011719, "logps/rejected": -149.8603719075521, "loss": 0.3011, "rewards/chosen": 0.40280765295028687, "rewards/margins": 1.5028379162152607, "rewards/rejected": -1.1000302632649739, "step": 8801 }, { "epoch": 0.4665412238624016, "grad_norm": 57.5, "kl": 1.5873870849609375, "learning_rate": 5e-07, "logits/chosen": -57870809.6, "logits/rejected": -52564304.0, "logps/chosen": -377.389208984375, "logps/rejected": -390.4034423828125, "loss": 0.394, "rewards/chosen": 0.04781129360198975, "rewards/margins": 2.3843635956446327, "rewards/rejected": -2.336552302042643, "step": 8802 }, { "epoch": 0.46659422786420374, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48272144.0, "logits/rejected": -63830212.0, "logps/chosen": -456.1781005859375, "logps/rejected": -266.2554626464844, "loss": 0.3281, "rewards/chosen": 0.2751404643058777, "rewards/margins": 1.662555754184723, "rewards/rejected": -1.3874152898788452, "step": 8803 }, { "epoch": 0.4666472318660059, "grad_norm": 46.75, "kl": 0.21262359619140625, "learning_rate": 5e-07, "logits/chosen": -9683598.666666666, "logits/rejected": -43469320.0, "logps/chosen": -350.0847574869792, "logps/rejected": -405.4061584472656, "loss": 0.3321, "rewards/chosen": 0.5651458899180094, "rewards/margins": 2.4699275890986123, "rewards/rejected": -1.904781699180603, "step": 8804 }, { "epoch": 0.466700235867808, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23180397.333333332, "logits/rejected": -65049164.8, "logps/chosen": -214.5495402018229, "logps/rejected": -299.3553466796875, "loss": 0.3517, "rewards/chosen": -0.027520249287287395, "rewards/margins": 1.2490091582139333, "rewards/rejected": -1.2765294075012208, "step": 8805 }, { "epoch": 0.46675323986961015, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36469360.0, "logits/rejected": -30680125.333333332, "logps/chosen": -394.671484375, "logps/rejected": -364.3165283203125, "loss": 0.3499, "rewards/chosen": 0.017664122581481933, "rewards/margins": 2.318481930096944, "rewards/rejected": -2.3008178075154624, "step": 8806 }, { "epoch": 0.4668062438714123, "grad_norm": 43.75, "kl": 2.7084484100341797, "learning_rate": 5e-07, "logits/chosen": -6112355.2, "logits/rejected": -8841252.666666666, "logps/chosen": -162.0293701171875, "logps/rejected": -438.6020100911458, "loss": 0.3248, "rewards/chosen": 0.6938570976257324, "rewards/margins": 2.4446093877156576, "rewards/rejected": -1.750752290089925, "step": 8807 }, { "epoch": 0.4668592478732144, "grad_norm": 52.75, "kl": 0.048590660095214844, "learning_rate": 5e-07, "logits/chosen": -9157792.0, "logits/rejected": -124983210.66666667, "logps/chosen": -224.3012939453125, "logps/rejected": -286.8765869140625, "loss": 0.3244, "rewards/chosen": 0.1765605926513672, "rewards/margins": 2.4587334314982097, "rewards/rejected": -2.2821728388468423, "step": 8808 }, { "epoch": 0.46691225187501656, "grad_norm": 57.75, "kl": 1.7476997375488281, "learning_rate": 5e-07, "logits/chosen": -29057898.666666668, "logits/rejected": -41399168.0, "logps/chosen": -240.78938802083334, "logps/rejected": -388.44671630859375, "loss": 0.4781, "rewards/chosen": -0.19493289788564047, "rewards/margins": 1.9479947487513225, "rewards/rejected": -2.142927646636963, "step": 8809 }, { "epoch": 0.4669652558768187, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31609528.0, "logits/rejected": -40873032.0, "logps/chosen": -292.6807861328125, "logps/rejected": -641.75048828125, "loss": 0.2818, "rewards/chosen": 0.07351437211036682, "rewards/margins": 3.0512875616550446, "rewards/rejected": -2.9777731895446777, "step": 8810 }, { "epoch": 0.46701825987862083, "grad_norm": 53.5, "kl": 0.41351890563964844, "learning_rate": 5e-07, "logits/chosen": -27578816.0, "logits/rejected": -30965342.0, "logps/chosen": -330.03546142578125, "logps/rejected": -378.65423583984375, "loss": 0.3203, "rewards/chosen": 0.2835861146450043, "rewards/margins": 2.0678602159023285, "rewards/rejected": -1.7842741012573242, "step": 8811 }, { "epoch": 0.46707126388042297, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8864772.666666666, "logits/rejected": -18975636.8, "logps/chosen": -110.7161356608073, "logps/rejected": -251.1443359375, "loss": 0.2355, "rewards/chosen": 0.5952135721842448, "rewards/margins": 2.3381985346476237, "rewards/rejected": -1.7429849624633789, "step": 8812 }, { "epoch": 0.4671242678822251, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23246845.333333332, "logits/rejected": -14943728.0, "logps/chosen": -724.4510904947916, "logps/rejected": -227.4541748046875, "loss": 0.2428, "rewards/chosen": 1.7229838371276855, "rewards/margins": 2.9942317962646485, "rewards/rejected": -1.271247959136963, "step": 8813 }, { "epoch": 0.46717727188402725, "grad_norm": 58.75, "kl": 0.4220771789550781, "learning_rate": 5e-07, "logits/chosen": -50946624.0, "logits/rejected": -9778720.0, "logps/chosen": -154.95048014322916, "logps/rejected": -285.717626953125, "loss": 0.3161, "rewards/chosen": 0.03288434942563375, "rewards/margins": 1.6461230655511219, "rewards/rejected": -1.6132387161254882, "step": 8814 }, { "epoch": 0.4672302758858294, "grad_norm": 52.25, "kl": 3.2286319732666016, "learning_rate": 5e-07, "logits/chosen": 2569609.0, "logits/rejected": -45519653.333333336, "logps/chosen": -490.528515625, "logps/rejected": -354.979736328125, "loss": 0.3114, "rewards/chosen": 0.9640337944030761, "rewards/margins": 2.8893308957417805, "rewards/rejected": -1.9252971013387044, "step": 8815 }, { "epoch": 0.4672832798876315, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11409585.333333334, "logits/rejected": -682463.2, "logps/chosen": -345.291015625, "logps/rejected": -172.4045166015625, "loss": 0.2727, "rewards/chosen": 0.5101493994394938, "rewards/margins": 2.3945377508799233, "rewards/rejected": -1.8843883514404296, "step": 8816 }, { "epoch": 0.46733628388943366, "grad_norm": 52.5, "kl": 0.2735786437988281, "learning_rate": 5e-07, "logits/chosen": -3469724.0, "logits/rejected": -17009780.0, "logps/chosen": -246.25439453125, "logps/rejected": -331.32379150390625, "loss": 0.3188, "rewards/chosen": 0.47272437810897827, "rewards/margins": 1.7820199131965637, "rewards/rejected": -1.3092955350875854, "step": 8817 }, { "epoch": 0.4673892878912358, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23349539.2, "logits/rejected": -1942997.3333333333, "logps/chosen": -233.127978515625, "logps/rejected": -435.7002766927083, "loss": 0.3356, "rewards/chosen": 0.28605568408966064, "rewards/margins": 1.9678306182225545, "rewards/rejected": -1.6817749341328938, "step": 8818 }, { "epoch": 0.46744229189303793, "grad_norm": 124.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1692552.3333333333, "logits/rejected": -24199422.4, "logps/chosen": -466.2923177083333, "logps/rejected": -402.9484375, "loss": 0.2527, "rewards/chosen": -0.2912699381510417, "rewards/margins": 2.945978673299154, "rewards/rejected": -3.2372486114501955, "step": 8819 }, { "epoch": 0.46749529589484007, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7693832.0, "logits/rejected": -34258666.666666664, "logps/chosen": -162.1952667236328, "logps/rejected": -228.90816243489584, "loss": 0.163, "rewards/chosen": 1.0862610340118408, "rewards/margins": 3.416118542353312, "rewards/rejected": -2.329857508341471, "step": 8820 }, { "epoch": 0.4675482998966422, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77071832.0, "logits/rejected": -16522661.333333334, "logps/chosen": -518.4998779296875, "logps/rejected": -430.2794596354167, "loss": 0.1852, "rewards/chosen": 0.4395507872104645, "rewards/margins": 3.188618093729019, "rewards/rejected": -2.7490673065185547, "step": 8821 }, { "epoch": 0.46760130389844434, "grad_norm": 47.5, "kl": 0.3874378204345703, "learning_rate": 5e-07, "logits/chosen": -88458668.8, "logits/rejected": -30553653.333333332, "logps/chosen": -381.769091796875, "logps/rejected": -530.2322591145834, "loss": 0.2291, "rewards/chosen": 1.0142038345336915, "rewards/margins": 3.6857046763102215, "rewards/rejected": -2.67150084177653, "step": 8822 }, { "epoch": 0.4676543079002465, "grad_norm": 40.0, "kl": 0.6400032043457031, "learning_rate": 5e-07, "logits/chosen": -3868846.75, "logits/rejected": -40444072.0, "logps/chosen": -298.00335693359375, "logps/rejected": -335.947998046875, "loss": 0.2085, "rewards/chosen": 1.0651395320892334, "rewards/margins": 3.287945032119751, "rewards/rejected": -2.2228055000305176, "step": 8823 }, { "epoch": 0.4677073119020486, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47942880.0, "logits/rejected": -21789996.0, "logps/chosen": -174.1300048828125, "logps/rejected": -286.06585693359375, "loss": 0.2218, "rewards/chosen": 0.547783374786377, "rewards/margins": 3.2314109802246094, "rewards/rejected": -2.6836276054382324, "step": 8824 }, { "epoch": 0.46776031590385075, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11850992.0, "logits/rejected": -62300016.0, "logps/chosen": -305.04949951171875, "logps/rejected": -574.1234741210938, "loss": 0.2084, "rewards/chosen": 0.8173238635063171, "rewards/margins": 4.076061904430389, "rewards/rejected": -3.2587380409240723, "step": 8825 }, { "epoch": 0.4678133199056529, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57744408.0, "logits/rejected": -4762657.0, "logps/chosen": -660.4981689453125, "logps/rejected": -111.56682586669922, "loss": 0.175, "rewards/chosen": 1.5488274097442627, "rewards/margins": 3.5817410945892334, "rewards/rejected": -2.0329136848449707, "step": 8826 }, { "epoch": 0.46786632390745503, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9528208.666666666, "logits/rejected": -27691424.0, "logps/chosen": -219.65226236979166, "logps/rejected": -317.9394775390625, "loss": 0.2668, "rewards/chosen": 0.06488990783691406, "rewards/margins": 2.0769842147827147, "rewards/rejected": -2.0120943069458006, "step": 8827 }, { "epoch": 0.46791932790925717, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77253984.0, "logits/rejected": -22958456.0, "logps/chosen": -275.15102132161456, "logps/rejected": -298.2612548828125, "loss": 0.2194, "rewards/chosen": 0.4719240665435791, "rewards/margins": 2.9196321964263916, "rewards/rejected": -2.4477081298828125, "step": 8828 }, { "epoch": 0.4679723319110593, "grad_norm": 48.25, "kl": 0.9986953735351562, "learning_rate": 5e-07, "logits/chosen": -1488181.0, "logits/rejected": -22246748.0, "logps/chosen": -343.5032653808594, "logps/rejected": -299.2815856933594, "loss": 0.2815, "rewards/chosen": 0.3641233444213867, "rewards/margins": 2.5047285556793213, "rewards/rejected": -2.1406052112579346, "step": 8829 }, { "epoch": 0.46802533591286144, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22289397.333333332, "logits/rejected": -20431622.4, "logps/chosen": -332.88804117838544, "logps/rejected": -161.0012451171875, "loss": 0.2144, "rewards/chosen": 0.9881469408671061, "rewards/margins": 2.934454313913981, "rewards/rejected": -1.946307373046875, "step": 8830 }, { "epoch": 0.4680783399146636, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29028323.2, "logits/rejected": -57584992.0, "logps/chosen": -264.429638671875, "logps/rejected": -442.779296875, "loss": 0.3004, "rewards/chosen": 0.22000365257263182, "rewards/margins": 3.132534551620483, "rewards/rejected": -2.9125308990478516, "step": 8831 }, { "epoch": 0.4681313439164657, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30686992.0, "logits/rejected": -14511733.333333334, "logps/chosen": -478.3052978515625, "logps/rejected": -253.810791015625, "loss": 0.1819, "rewards/chosen": 1.6458061933517456, "rewards/margins": 3.491771737734477, "rewards/rejected": -1.8459655443827312, "step": 8832 }, { "epoch": 0.46818434791826785, "grad_norm": 46.75, "kl": 1.9913330078125, "learning_rate": 5e-07, "logits/chosen": -19578686.4, "logits/rejected": 176657194.66666666, "logps/chosen": -354.580029296875, "logps/rejected": -403.8854166666667, "loss": 0.2848, "rewards/chosen": 0.7821887493133545, "rewards/margins": 3.3073943614959718, "rewards/rejected": -2.525205612182617, "step": 8833 }, { "epoch": 0.46823735192007, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25695724.0, "logits/rejected": -14485459.0, "logps/chosen": -307.5751647949219, "logps/rejected": -492.762451171875, "loss": 0.2639, "rewards/chosen": 0.22338451445102692, "rewards/margins": 3.1212691217660904, "rewards/rejected": -2.8978846073150635, "step": 8834 }, { "epoch": 0.4682903559218721, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52155416.0, "logits/rejected": -35444592.0, "logps/chosen": -370.17449951171875, "logps/rejected": -690.611083984375, "loss": 0.3158, "rewards/chosen": -0.040892694145441055, "rewards/margins": 3.3186549209058285, "rewards/rejected": -3.3595476150512695, "step": 8835 }, { "epoch": 0.46834335992367426, "grad_norm": 59.25, "kl": 1.1521530151367188, "learning_rate": 5e-07, "logits/chosen": -43482585.6, "logits/rejected": -14220869.333333334, "logps/chosen": -472.823046875, "logps/rejected": -115.66012573242188, "loss": 0.2919, "rewards/chosen": 0.6872711181640625, "rewards/margins": 2.514417330423991, "rewards/rejected": -1.8271462122599285, "step": 8836 }, { "epoch": 0.4683963639254764, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30026548.0, "logits/rejected": -34542357.333333336, "logps/chosen": -133.11009216308594, "logps/rejected": -223.015869140625, "loss": 0.2222, "rewards/chosen": 0.11859340965747833, "rewards/margins": 2.422695353627205, "rewards/rejected": -2.3041019439697266, "step": 8837 }, { "epoch": 0.46844936792727854, "grad_norm": 40.5, "kl": 0.5138740539550781, "learning_rate": 5e-07, "logits/chosen": -11441791.2, "logits/rejected": -40577349.333333336, "logps/chosen": -221.9354248046875, "logps/rejected": -437.1096598307292, "loss": 0.2594, "rewards/chosen": 0.7790143489837646, "rewards/margins": 3.023515780766805, "rewards/rejected": -2.2445014317830405, "step": 8838 }, { "epoch": 0.4685023719290806, "grad_norm": 61.0, "kl": 1.8970260620117188, "learning_rate": 5e-07, "logits/chosen": -18833346.0, "logits/rejected": -15794562.0, "logps/chosen": -471.6776428222656, "logps/rejected": -342.11517333984375, "loss": 0.3053, "rewards/chosen": 0.6278865933418274, "rewards/margins": 3.2762959599494934, "rewards/rejected": -2.648409366607666, "step": 8839 }, { "epoch": 0.46855537593088276, "grad_norm": 42.75, "kl": 1.3058357238769531, "learning_rate": 5e-07, "logits/chosen": -11189688.0, "logits/rejected": -11673908.0, "logps/chosen": -143.82841796875, "logps/rejected": -346.7639973958333, "loss": 0.3417, "rewards/chosen": 0.3552090644836426, "rewards/margins": 2.1858123461405436, "rewards/rejected": -1.8306032816569011, "step": 8840 }, { "epoch": 0.4686083799326849, "grad_norm": 58.0, "kl": 2.2794723510742188, "learning_rate": 5e-07, "logits/chosen": 9638232.8, "logits/rejected": -44993498.666666664, "logps/chosen": -321.203564453125, "logps/rejected": -666.2047119140625, "loss": 0.3674, "rewards/chosen": 0.06131637096405029, "rewards/margins": 2.5943017403284707, "rewards/rejected": -2.5329853693644204, "step": 8841 }, { "epoch": 0.46866138393448703, "grad_norm": 41.25, "kl": 0.5677728652954102, "learning_rate": 5e-07, "logits/chosen": -106456117.33333333, "logits/rejected": -7441995.2, "logps/chosen": -235.90384928385416, "logps/rejected": -287.243359375, "loss": 0.2487, "rewards/chosen": 0.7664760748545328, "rewards/margins": 2.5295936743418377, "rewards/rejected": -1.7631175994873047, "step": 8842 }, { "epoch": 0.46871438793628917, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18655028.0, "logits/rejected": -16763665.333333334, "logps/chosen": -110.09994506835938, "logps/rejected": -306.9960123697917, "loss": 0.2021, "rewards/chosen": 0.5619537234306335, "rewards/margins": 2.626041750113169, "rewards/rejected": -2.0640880266825357, "step": 8843 }, { "epoch": 0.4687673919380913, "grad_norm": 50.75, "kl": 3.0117549896240234, "learning_rate": 5e-07, "logits/chosen": -6853660.0, "logits/rejected": 23099522.666666668, "logps/chosen": -265.893115234375, "logps/rejected": -606.4532877604166, "loss": 0.2988, "rewards/chosen": 0.518973445892334, "rewards/margins": 2.685312525431315, "rewards/rejected": -2.166339079538981, "step": 8844 }, { "epoch": 0.46882039593989344, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16096625.6, "logits/rejected": -258040.66666666666, "logps/chosen": -194.18154296875, "logps/rejected": -239.67889404296875, "loss": 0.3588, "rewards/chosen": 0.19008866548538209, "rewards/margins": 1.9016179203987122, "rewards/rejected": -1.71152925491333, "step": 8845 }, { "epoch": 0.4688733999416956, "grad_norm": 38.5, "kl": 1.025125503540039, "learning_rate": 5e-07, "logits/chosen": -5998877.5, "logits/rejected": -44586732.0, "logps/chosen": -254.47454833984375, "logps/rejected": -247.3939208984375, "loss": 0.3006, "rewards/chosen": -0.014946222305297852, "rewards/margins": 2.370044469833374, "rewards/rejected": -2.384990692138672, "step": 8846 }, { "epoch": 0.4689264039434977, "grad_norm": 41.75, "kl": 0.5575790405273438, "learning_rate": 5e-07, "logits/chosen": -23346648.0, "logits/rejected": -9801446.0, "logps/chosen": -232.97610473632812, "logps/rejected": -205.85430908203125, "loss": 0.2253, "rewards/chosen": 0.8166912794113159, "rewards/margins": 2.681681990623474, "rewards/rejected": -1.8649907112121582, "step": 8847 }, { "epoch": 0.46897940794529985, "grad_norm": 53.75, "kl": 0.12812232971191406, "learning_rate": 5e-07, "logits/chosen": -56015324.0, "logits/rejected": 2176216.3333333335, "logps/chosen": -412.8811340332031, "logps/rejected": -174.751220703125, "loss": 0.257, "rewards/chosen": 0.012928783893585205, "rewards/margins": 1.9383862614631653, "rewards/rejected": -1.92545747756958, "step": 8848 }, { "epoch": 0.469032411947102, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50974816.0, "logits/rejected": -47166604.8, "logps/chosen": -268.90797932942706, "logps/rejected": -429.28359375, "loss": 0.2516, "rewards/chosen": 0.2494655648867289, "rewards/margins": 2.262434391180674, "rewards/rejected": -2.012968826293945, "step": 8849 }, { "epoch": 0.4690854159489041, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34161700.0, "logits/rejected": -24615101.333333332, "logps/chosen": -190.27218627929688, "logps/rejected": -252.06441243489584, "loss": 0.2946, "rewards/chosen": -0.22644653916358948, "rewards/margins": 1.272839476664861, "rewards/rejected": -1.4992860158284504, "step": 8850 }, { "epoch": 0.46913841995070626, "grad_norm": 49.0, "kl": 1.4199638366699219, "learning_rate": 5e-07, "logits/chosen": -51819296.0, "logits/rejected": -14593044.8, "logps/chosen": -146.75081380208334, "logps/rejected": -728.523046875, "loss": 0.3473, "rewards/chosen": -0.5597171783447266, "rewards/margins": 1.765207862854004, "rewards/rejected": -2.3249250411987306, "step": 8851 }, { "epoch": 0.4691914239525084, "grad_norm": 45.0, "kl": 0.24219131469726562, "learning_rate": 5e-07, "logits/chosen": -56738841.6, "logits/rejected": -25742301.333333332, "logps/chosen": -230.381494140625, "logps/rejected": -236.9014892578125, "loss": 0.3399, "rewards/chosen": 0.09678796529769898, "rewards/margins": 2.209234114487966, "rewards/rejected": -2.112446149190267, "step": 8852 }, { "epoch": 0.46924442795431054, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27270944.0, "logits/rejected": 10122882.0, "logps/chosen": -718.8531901041666, "logps/rejected": -429.73760986328125, "loss": 0.2493, "rewards/chosen": 1.063562552134196, "rewards/margins": 3.0388113657633467, "rewards/rejected": -1.9752488136291504, "step": 8853 }, { "epoch": 0.4692974319561127, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20320952.0, "logits/rejected": -17911992.0, "logps/chosen": -736.1229248046875, "logps/rejected": -407.5575256347656, "loss": 0.2036, "rewards/chosen": 1.09050714969635, "rewards/margins": 3.5431593656539917, "rewards/rejected": -2.4526522159576416, "step": 8854 }, { "epoch": 0.4693504359579148, "grad_norm": 45.0, "kl": 1.3210563659667969, "learning_rate": 5e-07, "logits/chosen": -65107379.2, "logits/rejected": -17727568.0, "logps/chosen": -618.97998046875, "logps/rejected": -178.75911458333334, "loss": 0.2581, "rewards/chosen": 1.0868822097778321, "rewards/margins": 3.3556535402933756, "rewards/rejected": -2.2687713305155435, "step": 8855 }, { "epoch": 0.46940343995971695, "grad_norm": 58.0, "kl": 0.2793140411376953, "learning_rate": 5e-07, "logits/chosen": 15481265.6, "logits/rejected": -29410226.666666668, "logps/chosen": -379.390673828125, "logps/rejected": -200.1129150390625, "loss": 0.3193, "rewards/chosen": 0.5873042106628418, "rewards/margins": 1.9205963770548502, "rewards/rejected": -1.3332921663920085, "step": 8856 }, { "epoch": 0.4694564439615191, "grad_norm": 52.0, "kl": 1.197998046875, "learning_rate": 5e-07, "logits/chosen": -14661844.57142857, "logits/rejected": -5487157.0, "logps/chosen": -279.78857421875, "logps/rejected": -103.4795150756836, "loss": 0.3003, "rewards/chosen": 1.2946627480643136, "rewards/margins": 1.7558894199984414, "rewards/rejected": -0.4612266719341278, "step": 8857 }, { "epoch": 0.4695094479633212, "grad_norm": 52.25, "kl": 0.7511444091796875, "learning_rate": 5e-07, "logits/chosen": -57545210.666666664, "logits/rejected": -58124776.0, "logps/chosen": -270.03354899088544, "logps/rejected": -438.8998107910156, "loss": 0.3099, "rewards/chosen": 0.7004889647165934, "rewards/margins": 1.9655181566874185, "rewards/rejected": -1.2650291919708252, "step": 8858 }, { "epoch": 0.46956245196512336, "grad_norm": 47.75, "kl": 1.6116485595703125, "learning_rate": 5e-07, "logits/chosen": -32781020.0, "logits/rejected": -50945984.0, "logps/chosen": -348.453125, "logps/rejected": -524.3759765625, "loss": 0.1978, "rewards/chosen": 0.4951872229576111, "rewards/margins": 3.5543763836224875, "rewards/rejected": -3.0591891606648765, "step": 8859 }, { "epoch": 0.4696154559669255, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19912282.666666668, "logits/rejected": 118201.2, "logps/chosen": -368.6402994791667, "logps/rejected": -443.6416015625, "loss": 0.2004, "rewards/chosen": 1.2472890218098958, "rewards/margins": 3.289083607991536, "rewards/rejected": -2.0417945861816404, "step": 8860 }, { "epoch": 0.46966845996872764, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8170540.0, "logits/rejected": -115008789.33333333, "logps/chosen": -235.606640625, "logps/rejected": -560.7342529296875, "loss": 0.2398, "rewards/chosen": 0.887301254272461, "rewards/margins": 3.8433998107910154, "rewards/rejected": -2.9560985565185547, "step": 8861 }, { "epoch": 0.4697214639705298, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19986762.0, "logits/rejected": -9172274.0, "logps/chosen": -180.97900390625, "logps/rejected": -226.4348907470703, "loss": 0.3012, "rewards/chosen": -0.1780201941728592, "rewards/margins": 2.846467539668083, "rewards/rejected": -3.0244877338409424, "step": 8862 }, { "epoch": 0.4697744679723319, "grad_norm": 53.5, "kl": 1.0213546752929688, "learning_rate": 5e-07, "logits/chosen": -34243004.8, "logits/rejected": 19925918.666666668, "logps/chosen": -300.694921875, "logps/rejected": -301.1546223958333, "loss": 0.3711, "rewards/chosen": 0.22210321426391602, "rewards/margins": 1.734785811106364, "rewards/rejected": -1.512682596842448, "step": 8863 }, { "epoch": 0.46982747197413405, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13878979.2, "logits/rejected": -12432313.333333334, "logps/chosen": -239.0072265625, "logps/rejected": -420.88525390625, "loss": 0.361, "rewards/chosen": 0.27195603847503663, "rewards/margins": 1.633099627494812, "rewards/rejected": -1.3611435890197754, "step": 8864 }, { "epoch": 0.4698804759759362, "grad_norm": 77.5, "kl": 0.18996620178222656, "learning_rate": 5e-07, "logits/chosen": -50136080.0, "logits/rejected": -18837726.0, "logps/chosen": -309.4200744628906, "logps/rejected": -373.51104736328125, "loss": 0.3886, "rewards/chosen": -0.12409525364637375, "rewards/margins": 1.1375361159443855, "rewards/rejected": -1.2616313695907593, "step": 8865 }, { "epoch": 0.4699334799777383, "grad_norm": 49.75, "kl": 0.34405040740966797, "learning_rate": 5e-07, "logits/chosen": -67938150.4, "logits/rejected": -23301496.0, "logps/chosen": -729.76953125, "logps/rejected": -115.86899820963542, "loss": 0.3244, "rewards/chosen": 0.7086420059204102, "rewards/margins": 2.5488265355428057, "rewards/rejected": -1.8401845296223958, "step": 8866 }, { "epoch": 0.46998648397954046, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7881253.0, "logits/rejected": -26121224.0, "logps/chosen": -405.4339599609375, "logps/rejected": -242.06610107421875, "loss": 0.3559, "rewards/chosen": 0.13876138627529144, "rewards/margins": 1.5175940841436386, "rewards/rejected": -1.3788326978683472, "step": 8867 }, { "epoch": 0.4700394879813426, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30539028.0, "logits/rejected": -28253706.666666668, "logps/chosen": -493.83245849609375, "logps/rejected": -323.63421630859375, "loss": 0.2069, "rewards/chosen": 0.7702286243438721, "rewards/margins": 2.7173580328623457, "rewards/rejected": -1.9471294085184734, "step": 8868 }, { "epoch": 0.47009249198314473, "grad_norm": 39.25, "kl": 3.486743927001953, "learning_rate": 5e-07, "logits/chosen": -26359266.666666668, "logits/rejected": -38140240.0, "logps/chosen": -569.9857177734375, "logps/rejected": -450.44814453125, "loss": 0.1989, "rewards/chosen": 1.2515020370483398, "rewards/margins": 3.412051200866699, "rewards/rejected": -2.1605491638183594, "step": 8869 }, { "epoch": 0.47014549598494687, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15901070.666666666, "logits/rejected": -6014472.0, "logps/chosen": -214.673095703125, "logps/rejected": -260.0080078125, "loss": 0.2967, "rewards/chosen": 0.4840383132298787, "rewards/margins": 2.103987272580465, "rewards/rejected": -1.619948959350586, "step": 8870 }, { "epoch": 0.470198499986749, "grad_norm": 53.25, "kl": 2.167980194091797, "learning_rate": 5e-07, "logits/chosen": -18058310.0, "logits/rejected": -69094184.0, "logps/chosen": -361.24493408203125, "logps/rejected": -175.2702178955078, "loss": 0.2109, "rewards/chosen": 0.9001373052597046, "rewards/margins": 3.553139090538025, "rewards/rejected": -2.6530017852783203, "step": 8871 }, { "epoch": 0.47025150398855114, "grad_norm": 47.25, "kl": 0.2955131530761719, "learning_rate": 5e-07, "logits/chosen": -3065210.4, "logits/rejected": -22732826.666666668, "logps/chosen": -290.2396728515625, "logps/rejected": -263.1971028645833, "loss": 0.2961, "rewards/chosen": 0.915011978149414, "rewards/margins": 2.8098647753397623, "rewards/rejected": -1.8948527971903484, "step": 8872 }, { "epoch": 0.4703045079903533, "grad_norm": 66.0, "kl": 7.874473571777344, "learning_rate": 5e-07, "logits/chosen": -74171283.2, "logits/rejected": -52241888.0, "logps/chosen": -678.9669921875, "logps/rejected": -553.3285319010416, "loss": 0.2315, "rewards/chosen": 1.6442668914794922, "rewards/margins": 4.305712191263835, "rewards/rejected": -2.6614452997843423, "step": 8873 }, { "epoch": 0.4703575119921554, "grad_norm": 39.5, "kl": 0.09490966796875, "learning_rate": 5e-07, "logits/chosen": -54909786.666666664, "logits/rejected": -19537536.0, "logps/chosen": -159.80396525065103, "logps/rejected": -386.31376953125, "loss": 0.2355, "rewards/chosen": 0.19826334714889526, "rewards/margins": 2.7933309435844422, "rewards/rejected": -2.595067596435547, "step": 8874 }, { "epoch": 0.47041051599395756, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3554868.5, "logits/rejected": -54067552.0, "logps/chosen": -340.5622253417969, "logps/rejected": -434.6357727050781, "loss": 0.2387, "rewards/chosen": 0.7794592380523682, "rewards/margins": 2.8799407482147217, "rewards/rejected": -2.1004815101623535, "step": 8875 }, { "epoch": 0.4704635199957597, "grad_norm": 73.5, "kl": 0.22527313232421875, "learning_rate": 5e-07, "logits/chosen": -34242544.0, "logits/rejected": -25529240.0, "logps/chosen": -613.8233642578125, "logps/rejected": -157.10226440429688, "loss": 0.3564, "rewards/chosen": 0.4218200743198395, "rewards/margins": 1.3809012472629547, "rewards/rejected": -0.9590811729431152, "step": 8876 }, { "epoch": 0.47051652399756183, "grad_norm": 48.25, "kl": 0.9278926849365234, "learning_rate": 5e-07, "logits/chosen": 2754491.6, "logits/rejected": -3497962.6666666665, "logps/chosen": -47.328024291992186, "logps/rejected": -79.73361714680989, "loss": 0.4253, "rewards/chosen": -0.1029977798461914, "rewards/margins": 1.3204402923583984, "rewards/rejected": -1.4234380722045898, "step": 8877 }, { "epoch": 0.47056952799936397, "grad_norm": 36.0, "kl": 0.3224601745605469, "learning_rate": 5e-07, "logits/chosen": 7965240.666666667, "logits/rejected": -30260665.6, "logps/chosen": -22.6248779296875, "logps/rejected": -290.442431640625, "loss": 0.2931, "rewards/chosen": 0.21250967184702554, "rewards/margins": 1.6898847977320355, "rewards/rejected": -1.4773751258850099, "step": 8878 }, { "epoch": 0.4706225320011661, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66785216.0, "logits/rejected": -2145953.3333333335, "logps/chosen": -142.141357421875, "logps/rejected": -238.30558268229166, "loss": 0.3318, "rewards/chosen": 0.07323342561721802, "rewards/margins": 1.036017874876658, "rewards/rejected": -0.9627844492594401, "step": 8879 }, { "epoch": 0.47067553600296824, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14253873.333333334, "logits/rejected": -18226972.8, "logps/chosen": -223.26228841145834, "logps/rejected": -494.14306640625, "loss": 0.1865, "rewards/chosen": 0.7991542021433512, "rewards/margins": 3.288143459955851, "rewards/rejected": -2.4889892578125, "step": 8880 }, { "epoch": 0.4707285400047704, "grad_norm": 45.0, "kl": 1.6261444091796875, "learning_rate": 5e-07, "logits/chosen": 12361396.0, "logits/rejected": -22429490.285714287, "logps/chosen": -30.671064376831055, "logps/rejected": -289.83865792410717, "loss": 0.2102, "rewards/chosen": 0.9911672472953796, "rewards/margins": 2.649260835988181, "rewards/rejected": -1.6580935886928014, "step": 8881 }, { "epoch": 0.4707815440065725, "grad_norm": 26.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5695388.5, "logits/rejected": -28759717.333333332, "logps/chosen": -38.696311950683594, "logps/rejected": -259.00042724609375, "loss": 0.1522, "rewards/chosen": 0.7143789529800415, "rewards/margins": 3.434179902076721, "rewards/rejected": -2.7198009490966797, "step": 8882 }, { "epoch": 0.47083454800837465, "grad_norm": 59.75, "kl": 0.7587738037109375, "learning_rate": 5e-07, "logits/chosen": -4043932.0, "logits/rejected": -13449098.0, "logps/chosen": -273.1045445033482, "logps/rejected": -289.27734375, "loss": 0.4225, "rewards/chosen": 0.22497751031603133, "rewards/margins": 2.530507343155997, "rewards/rejected": -2.305529832839966, "step": 8883 }, { "epoch": 0.4708875520101768, "grad_norm": 61.0, "kl": 0.07695388793945312, "learning_rate": 5e-07, "logits/chosen": -6895745.333333333, "logits/rejected": 5193263.0, "logps/chosen": -403.5947265625, "logps/rejected": -140.30291748046875, "loss": 0.4308, "rewards/chosen": 0.1698018511136373, "rewards/margins": 0.8530411918958029, "rewards/rejected": -0.6832393407821655, "step": 8884 }, { "epoch": 0.47094055601197893, "grad_norm": 52.25, "kl": 1.4857673645019531, "learning_rate": 5e-07, "logits/chosen": -54473400.0, "logits/rejected": -16967546.0, "logps/chosen": -276.281982421875, "logps/rejected": -183.6449432373047, "loss": 0.2923, "rewards/chosen": 0.5669904947280884, "rewards/margins": 2.115859270095825, "rewards/rejected": -1.5488687753677368, "step": 8885 }, { "epoch": 0.47099356001378107, "grad_norm": 79.0, "kl": 2.7931652069091797, "learning_rate": 5e-07, "logits/chosen": -6709885.714285715, "logits/rejected": -9732402.0, "logps/chosen": -711.798828125, "logps/rejected": -158.05963134765625, "loss": 0.3512, "rewards/chosen": 0.8539193017142159, "rewards/margins": 2.7383736712591986, "rewards/rejected": -1.884454369544983, "step": 8886 }, { "epoch": 0.4710465640155832, "grad_norm": 52.5, "kl": 1.6560063362121582, "learning_rate": 5e-07, "logits/chosen": -10166849.333333334, "logits/rejected": -5388307.5, "logps/chosen": -357.5037841796875, "logps/rejected": -133.58485412597656, "loss": 0.3611, "rewards/chosen": 0.7253217697143555, "rewards/margins": 1.5984333157539368, "rewards/rejected": -0.8731115460395813, "step": 8887 }, { "epoch": 0.47109956801738534, "grad_norm": 58.75, "kl": 6.194536209106445, "learning_rate": 5e-07, "logits/chosen": -24382822.0, "logps/chosen": -549.750732421875, "loss": 0.491, "rewards/chosen": 0.7475488781929016, "step": 8888 }, { "epoch": 0.4711525720191874, "grad_norm": 49.25, "kl": 2.9477157592773438, "learning_rate": 5e-07, "logits/chosen": -13743036.8, "logits/rejected": -36711421.333333336, "logps/chosen": -605.56044921875, "logps/rejected": -336.9750162760417, "loss": 0.2209, "rewards/chosen": 1.7866357803344726, "rewards/margins": 3.591840330759684, "rewards/rejected": -1.8052045504252117, "step": 8889 }, { "epoch": 0.47120557602098956, "grad_norm": 65.0, "kl": 0.9742507934570312, "learning_rate": 5e-07, "logits/chosen": -32991238.4, "logits/rejected": -46332837.333333336, "logps/chosen": -197.392333984375, "logps/rejected": -461.3185628255208, "loss": 0.3623, "rewards/chosen": 0.17980659008026123, "rewards/margins": 1.6927199761072795, "rewards/rejected": -1.5129133860270183, "step": 8890 }, { "epoch": 0.4712585800227917, "grad_norm": 44.25, "kl": 3.6952972412109375, "learning_rate": 5e-07, "logits/chosen": -19357346.285714287, "logits/rejected": -13563501.0, "logps/chosen": -333.69388253348217, "logps/rejected": -215.28298950195312, "loss": 0.3614, "rewards/chosen": 0.9548963819231305, "rewards/margins": 2.2121229682649886, "rewards/rejected": -1.257226586341858, "step": 8891 }, { "epoch": 0.47131158402459383, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41570676.0, "logits/rejected": -49146720.0, "logps/chosen": -168.146728515625, "logps/rejected": -342.6109212239583, "loss": 0.2641, "rewards/chosen": 0.17192897200584412, "rewards/margins": 1.8927026689052582, "rewards/rejected": -1.720773696899414, "step": 8892 }, { "epoch": 0.47136458802639597, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5922011.0, "logits/rejected": -11462666.666666666, "logps/chosen": -185.06277465820312, "logps/rejected": -427.1926676432292, "loss": 0.2287, "rewards/chosen": 0.39620599150657654, "rewards/margins": 2.585310230652491, "rewards/rejected": -2.1891042391459146, "step": 8893 }, { "epoch": 0.4714175920281981, "grad_norm": 56.25, "kl": 0.2964210510253906, "learning_rate": 5e-07, "logits/chosen": -50042309.333333336, "logits/rejected": 18141411.2, "logps/chosen": -395.7572835286458, "logps/rejected": -420.05537109375, "loss": 0.2727, "rewards/chosen": 0.4835786819458008, "rewards/margins": 2.0294694900512695, "rewards/rejected": -1.5458908081054688, "step": 8894 }, { "epoch": 0.47147059603000024, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5117546.666666667, "logits/rejected": -39196160.0, "logps/chosen": -18.058993021647137, "logps/rejected": -214.196630859375, "loss": 0.3108, "rewards/chosen": 0.5288588205973307, "rewards/margins": 1.6448254267374676, "rewards/rejected": -1.1159666061401368, "step": 8895 }, { "epoch": 0.4715236000318024, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25845850.0, "logits/rejected": -18055186.0, "logps/chosen": -243.19540405273438, "logps/rejected": -359.7857666015625, "loss": 0.3036, "rewards/chosen": 0.3480522036552429, "rewards/margins": 2.371121108531952, "rewards/rejected": -2.023068904876709, "step": 8896 }, { "epoch": 0.4715766040336045, "grad_norm": 42.75, "kl": 0.6370353698730469, "learning_rate": 5e-07, "logits/chosen": -13428573.333333334, "logits/rejected": -46466804.0, "logps/chosen": -229.4508056640625, "logps/rejected": -429.8153381347656, "loss": 0.3613, "rewards/chosen": 0.42330169677734375, "rewards/margins": 2.1132805347442627, "rewards/rejected": -1.689978837966919, "step": 8897 }, { "epoch": 0.47162960803540666, "grad_norm": 36.75, "kl": 0.07533645629882812, "learning_rate": 5e-07, "logits/chosen": -26612396.0, "logits/rejected": -11498809.333333334, "logps/chosen": -187.69070434570312, "logps/rejected": -248.77180989583334, "loss": 0.1906, "rewards/chosen": 1.0932226181030273, "rewards/margins": 3.009549140930176, "rewards/rejected": -1.9163265228271484, "step": 8898 }, { "epoch": 0.4716826120372088, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25442680.0, "logits/rejected": -54395572.0, "logps/chosen": -324.51861572265625, "logps/rejected": -257.2682800292969, "loss": 0.3212, "rewards/chosen": 0.2837538719177246, "rewards/margins": 1.946070909500122, "rewards/rejected": -1.6623170375823975, "step": 8899 }, { "epoch": 0.47173561603901093, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39147732.0, "logits/rejected": -8409183.0, "logps/chosen": -298.52880859375, "logps/rejected": -368.09588623046875, "loss": 0.3154, "rewards/chosen": 0.2147602140903473, "rewards/margins": 1.9154719412326813, "rewards/rejected": -1.700711727142334, "step": 8900 }, { "epoch": 0.47178862004081307, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6495639.333333333, "logits/rejected": -3589920.4, "logps/chosen": -146.48235066731772, "logps/rejected": -98.92733154296874, "loss": 0.3019, "rewards/chosen": 0.02121672034263611, "rewards/margins": 1.679194015264511, "rewards/rejected": -1.657977294921875, "step": 8901 }, { "epoch": 0.4718416240426152, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7824705.5, "logits/rejected": -46322874.666666664, "logps/chosen": -45.94263458251953, "logps/rejected": -305.47878011067706, "loss": 0.2315, "rewards/chosen": 0.020885564386844635, "rewards/margins": 2.090484398106734, "rewards/rejected": -2.069598833719889, "step": 8902 }, { "epoch": 0.47189462804441734, "grad_norm": 43.0, "kl": 2.1313095092773438, "learning_rate": 5e-07, "logits/chosen": -18123505.6, "logits/rejected": 1863701.3333333333, "logps/chosen": -243.9063232421875, "logps/rejected": -301.51975504557294, "loss": 0.3261, "rewards/chosen": 0.6626193523406982, "rewards/margins": 2.768458922704061, "rewards/rejected": -2.105839570363363, "step": 8903 }, { "epoch": 0.4719476320462195, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41430184.0, "logits/rejected": -10454460.0, "logps/chosen": -316.0373229980469, "logps/rejected": -290.5727844238281, "loss": 0.275, "rewards/chosen": 0.502960205078125, "rewards/margins": 2.587458610534668, "rewards/rejected": -2.084498405456543, "step": 8904 }, { "epoch": 0.4720006360480216, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 41857357.333333336, "logits/rejected": -35943680.0, "logps/chosen": -218.85978190104166, "logps/rejected": -410.51103515625, "loss": 0.2249, "rewards/chosen": 0.3829481601715088, "rewards/margins": 2.9056211948394775, "rewards/rejected": -2.5226730346679687, "step": 8905 }, { "epoch": 0.47205364004982375, "grad_norm": 54.5, "kl": 0.02053070068359375, "learning_rate": 5e-07, "logits/chosen": -34400540.0, "logits/rejected": -26816308.0, "logps/chosen": -363.3703918457031, "logps/rejected": -248.5496063232422, "loss": 0.2746, "rewards/chosen": 0.643916130065918, "rewards/margins": 2.3564600944519043, "rewards/rejected": -1.7125439643859863, "step": 8906 }, { "epoch": 0.4721066440516259, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91874880.0, "logits/rejected": -29374252.8, "logps/chosen": -509.0115966796875, "logps/rejected": -180.06617431640626, "loss": 0.2509, "rewards/chosen": 0.7243469556172689, "rewards/margins": 2.27290948232015, "rewards/rejected": -1.5485625267028809, "step": 8907 }, { "epoch": 0.472159648053428, "grad_norm": 56.25, "kl": 0.0058231353759765625, "learning_rate": 5e-07, "logits/chosen": -73441574.4, "logits/rejected": 1496384.6666666667, "logps/chosen": -181.98101806640625, "logps/rejected": -402.3423258463542, "loss": 0.4078, "rewards/chosen": -0.18536860942840577, "rewards/margins": 1.5950141827265423, "rewards/rejected": -1.780382792154948, "step": 8908 }, { "epoch": 0.47221265205523016, "grad_norm": 48.0, "kl": 1.0590848922729492, "learning_rate": 5e-07, "logits/chosen": -50170788.0, "logits/rejected": 99305416.0, "logps/chosen": -455.2064208984375, "logps/rejected": -475.60101318359375, "loss": 0.1931, "rewards/chosen": 1.2668986320495605, "rewards/margins": 3.258096218109131, "rewards/rejected": -1.9911975860595703, "step": 8909 }, { "epoch": 0.4722656560570323, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20345170.0, "logits/rejected": -35157332.0, "logps/chosen": -200.0780029296875, "logps/rejected": -212.04690551757812, "loss": 0.2437, "rewards/chosen": 0.6594076156616211, "rewards/margins": 2.5898406505584717, "rewards/rejected": -1.9304330348968506, "step": 8910 }, { "epoch": 0.47231866005883444, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14767292.0, "logits/rejected": 16845738.666666668, "logps/chosen": -774.11279296875, "logps/rejected": -395.2181803385417, "loss": 0.2495, "rewards/chosen": 1.086706519126892, "rewards/margins": 2.92240305741628, "rewards/rejected": -1.835696538289388, "step": 8911 }, { "epoch": 0.4723716640606366, "grad_norm": 46.75, "kl": 0.853118896484375, "learning_rate": 5e-07, "logits/chosen": -19636680.0, "logits/rejected": -23566994.0, "logps/chosen": -233.24185180664062, "logps/rejected": -491.1181640625, "loss": 0.2908, "rewards/chosen": 0.3306463062763214, "rewards/margins": 3.3526916801929474, "rewards/rejected": -3.022045373916626, "step": 8912 }, { "epoch": 0.4724246680624387, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17857582.0, "logits/rejected": -8343130.0, "logps/chosen": -156.30113220214844, "logps/rejected": -235.7621612548828, "loss": 0.2961, "rewards/chosen": 0.406057745218277, "rewards/margins": 2.1655884087085724, "rewards/rejected": -1.7595306634902954, "step": 8913 }, { "epoch": 0.47247767206424085, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40640589.71428572, "logits/rejected": -38917648.0, "logps/chosen": -281.7101527622768, "logps/rejected": -481.6320495605469, "loss": 0.4156, "rewards/chosen": 0.15063522543225968, "rewards/margins": 3.8090765305927823, "rewards/rejected": -3.6584413051605225, "step": 8914 }, { "epoch": 0.472530676066043, "grad_norm": 43.25, "kl": 0.7524271011352539, "learning_rate": 5e-07, "logits/chosen": -25505101.333333332, "logits/rejected": 32315609.6, "logps/chosen": -433.449951171875, "logps/rejected": -537.139013671875, "loss": 0.2049, "rewards/chosen": 0.86669921875, "rewards/margins": 3.98565788269043, "rewards/rejected": -3.11895866394043, "step": 8915 }, { "epoch": 0.4725836800678451, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1710632.0, "logits/rejected": -25604950.4, "logps/chosen": -214.38623046875, "logps/rejected": -264.575439453125, "loss": 0.3141, "rewards/chosen": -0.38054148356119794, "rewards/margins": 1.6796008427937823, "rewards/rejected": -2.0601423263549803, "step": 8916 }, { "epoch": 0.47263668406964726, "grad_norm": 29.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35173890.666666664, "logits/rejected": -37770035.2, "logps/chosen": -798.7876790364584, "logps/rejected": -446.938037109375, "loss": 0.1913, "rewards/chosen": 1.2864313920338948, "rewards/margins": 3.817694362004598, "rewards/rejected": -2.531262969970703, "step": 8917 }, { "epoch": 0.4726896880714494, "grad_norm": 62.75, "kl": 0.5992393493652344, "learning_rate": 5e-07, "logits/chosen": -49093500.8, "logits/rejected": -34377704.0, "logps/chosen": -415.17529296875, "logps/rejected": -186.23909505208334, "loss": 0.3661, "rewards/chosen": 0.49001340866088866, "rewards/margins": 1.4265018781026204, "rewards/rejected": -0.9364884694417318, "step": 8918 }, { "epoch": 0.47274269207325154, "grad_norm": 38.75, "kl": 1.0582408905029297, "learning_rate": 5e-07, "logits/chosen": -15156758.666666666, "logits/rejected": -22364788.8, "logps/chosen": -255.977294921875, "logps/rejected": -294.6710205078125, "loss": 0.2145, "rewards/chosen": 0.9462247689565023, "rewards/margins": 2.766825850804647, "rewards/rejected": -1.8206010818481446, "step": 8919 }, { "epoch": 0.4727956960750537, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43892608.0, "logits/rejected": -12520326.0, "logps/chosen": -273.79752022879467, "logps/rejected": -1161.2840576171875, "loss": 0.4187, "rewards/chosen": 0.11280931745256696, "rewards/margins": 4.361466544015067, "rewards/rejected": -4.2486572265625, "step": 8920 }, { "epoch": 0.4728487000768558, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34688960.0, "logits/rejected": -76412776.0, "logps/chosen": -246.75608825683594, "logps/rejected": -318.6927490234375, "loss": 0.2708, "rewards/chosen": 0.08276556432247162, "rewards/margins": 3.067244991660118, "rewards/rejected": -2.9844794273376465, "step": 8921 }, { "epoch": 0.47290170407865795, "grad_norm": 43.75, "kl": 0.5799407958984375, "learning_rate": 5e-07, "logits/chosen": 40496034.666666664, "logits/rejected": -10405902.4, "logps/chosen": -242.83711751302084, "logps/rejected": -168.903857421875, "loss": 0.3259, "rewards/chosen": 0.11453513304392497, "rewards/margins": 1.736799422899882, "rewards/rejected": -1.622264289855957, "step": 8922 }, { "epoch": 0.4729547080804601, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40790389.333333336, "logits/rejected": -26746284.8, "logps/chosen": -150.70050048828125, "logps/rejected": -393.77021484375, "loss": 0.279, "rewards/chosen": -0.011827905972798666, "rewards/margins": 2.11731371084849, "rewards/rejected": -2.129141616821289, "step": 8923 }, { "epoch": 0.4730077120822622, "grad_norm": 52.75, "kl": 1.043853759765625, "learning_rate": 5e-07, "logits/chosen": -34671658.666666664, "logits/rejected": -27373864.0, "logps/chosen": -388.1464029947917, "logps/rejected": -301.8648986816406, "loss": 0.2695, "rewards/chosen": 0.7222778002421061, "rewards/margins": 3.906333605448405, "rewards/rejected": -3.184055805206299, "step": 8924 }, { "epoch": 0.47306071608406436, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109343808.0, "logits/rejected": -30417723.42857143, "logps/chosen": -237.03768920898438, "logps/rejected": -463.05862862723217, "loss": 0.0924, "rewards/chosen": 1.3149429559707642, "rewards/margins": 4.272412351199559, "rewards/rejected": -2.9574693952287947, "step": 8925 }, { "epoch": 0.4731137200858665, "grad_norm": 35.25, "kl": 0.8979644775390625, "learning_rate": 5e-07, "logits/chosen": -30443680.0, "logits/rejected": -40541573.333333336, "logps/chosen": -360.76064453125, "logps/rejected": -410.0647379557292, "loss": 0.2795, "rewards/chosen": 1.2334231376647948, "rewards/margins": 3.429162565867106, "rewards/rejected": -2.195739428202311, "step": 8926 }, { "epoch": 0.47316672408766863, "grad_norm": 52.5, "kl": 0.05684852600097656, "learning_rate": 5e-07, "logits/chosen": -4978143.0, "logits/rejected": -31371828.0, "logps/chosen": -95.41831970214844, "logps/rejected": -441.5635986328125, "loss": 0.2717, "rewards/chosen": 0.34975969791412354, "rewards/margins": 2.952964425086975, "rewards/rejected": -2.6032047271728516, "step": 8927 }, { "epoch": 0.47321972808947077, "grad_norm": 62.5, "kl": 0.5779690742492676, "learning_rate": 5e-07, "logits/chosen": -57315978.666666664, "logits/rejected": 5883391.5, "logps/chosen": -303.4229736328125, "logps/rejected": -142.3739471435547, "loss": 0.3204, "rewards/chosen": 0.6076745986938477, "rewards/margins": 2.076308846473694, "rewards/rejected": -1.4686342477798462, "step": 8928 }, { "epoch": 0.4732727320912729, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2877790.5, "logits/rejected": -15295098.666666666, "logps/chosen": -449.7210998535156, "logps/rejected": -372.8570963541667, "loss": 0.1148, "rewards/chosen": 1.6783568859100342, "rewards/margins": 4.418260018030802, "rewards/rejected": -2.739903132120768, "step": 8929 }, { "epoch": 0.47332573609307504, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 148691568.0, "logits/rejected": -41342372.571428575, "logps/chosen": -1250.17919921875, "logps/rejected": -372.81825474330356, "loss": 0.1496, "rewards/chosen": -0.19282226264476776, "rewards/margins": 2.371252389890807, "rewards/rejected": -2.5640746525355746, "step": 8930 }, { "epoch": 0.4733787400948772, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55888564.0, "logits/rejected": -48019892.0, "logps/chosen": -172.75173950195312, "logps/rejected": -587.2723999023438, "loss": 0.2779, "rewards/chosen": 0.15561047196388245, "rewards/margins": 3.7057251036167145, "rewards/rejected": -3.550114631652832, "step": 8931 }, { "epoch": 0.4734317440966793, "grad_norm": 43.0, "kl": 0.2043304443359375, "learning_rate": 5e-07, "logits/chosen": -37364160.0, "logits/rejected": -42211584.0, "logps/chosen": -251.1191609700521, "logps/rejected": -342.818212890625, "loss": 0.2841, "rewards/chosen": -0.0017751057942708333, "rewards/margins": 2.0041355768839515, "rewards/rejected": -2.0059106826782225, "step": 8932 }, { "epoch": 0.47348474809848146, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 42664976.0, "logits/rejected": -22236003.2, "logps/chosen": -275.2616373697917, "logps/rejected": -288.654248046875, "loss": 0.2579, "rewards/chosen": -0.23142401377360025, "rewards/margins": 2.5724334081014, "rewards/rejected": -2.803857421875, "step": 8933 }, { "epoch": 0.4735377521002836, "grad_norm": 42.75, "kl": 1.4318885803222656, "learning_rate": 5e-07, "logits/chosen": -29069930.0, "logits/rejected": -17344832.0, "logps/chosen": -685.7587280273438, "logps/rejected": -386.2994384765625, "loss": 0.2037, "rewards/chosen": 1.0989094972610474, "rewards/margins": 4.195792317390442, "rewards/rejected": -3.0968828201293945, "step": 8934 }, { "epoch": 0.47359075610208573, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59701285.333333336, "logits/rejected": -20058476.8, "logps/chosen": -788.7897135416666, "logps/rejected": -227.651953125, "loss": 0.197, "rewards/chosen": 0.651202400525411, "rewards/margins": 3.140442474683126, "rewards/rejected": -2.4892400741577148, "step": 8935 }, { "epoch": 0.47364376010388787, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14250580.0, "logits/rejected": -20446892.0, "logps/chosen": -258.6416015625, "logps/rejected": -492.912841796875, "loss": 0.1487, "rewards/chosen": 1.0699615478515625, "rewards/margins": 3.827312469482422, "rewards/rejected": -2.7573509216308594, "step": 8936 }, { "epoch": 0.47369676410569, "grad_norm": 46.0, "kl": 0.9878082275390625, "learning_rate": 5e-07, "logits/chosen": -33472084.0, "logits/rejected": -51159552.0, "logps/chosen": -715.1260375976562, "logps/rejected": -816.07177734375, "loss": 0.1202, "rewards/chosen": 1.1343291997909546, "rewards/margins": 4.88178288936615, "rewards/rejected": -3.7474536895751953, "step": 8937 }, { "epoch": 0.47374976810749214, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60472441.6, "logits/rejected": -17112861.333333332, "logps/chosen": -389.5185302734375, "logps/rejected": -255.6279296875, "loss": 0.3672, "rewards/chosen": 0.15131995677947999, "rewards/margins": 1.779232160250346, "rewards/rejected": -1.627912203470866, "step": 8938 }, { "epoch": 0.4738027721092943, "grad_norm": 42.75, "kl": 5.61656379699707, "learning_rate": 5e-07, "logits/chosen": -19423974.666666668, "logits/rejected": -128701944.0, "logps/chosen": -341.8565266927083, "logps/rejected": -289.2574462890625, "loss": 0.3318, "rewards/chosen": 1.162411371866862, "rewards/margins": 3.2741836706797285, "rewards/rejected": -2.111772298812866, "step": 8939 }, { "epoch": 0.47385577611109636, "grad_norm": 38.0, "kl": 4.143486022949219, "learning_rate": 5e-07, "logits/chosen": -18601203.2, "logits/rejected": 278211.1666666667, "logps/chosen": -644.09736328125, "logps/rejected": -415.829345703125, "loss": 0.2153, "rewards/chosen": 1.4055059432983399, "rewards/margins": 4.859302965799968, "rewards/rejected": -3.4537970225016275, "step": 8940 }, { "epoch": 0.4739087801128985, "grad_norm": 38.0, "kl": 1.002985954284668, "learning_rate": 5e-07, "logits/chosen": -21616264.0, "logits/rejected": -22192716.0, "logps/chosen": -201.8555145263672, "logps/rejected": -122.52494812011719, "loss": 0.4079, "rewards/chosen": -0.24790306389331818, "rewards/margins": 1.3145742863416672, "rewards/rejected": -1.5624773502349854, "step": 8941 }, { "epoch": 0.47396178411470063, "grad_norm": 61.5, "kl": 1.9702568054199219, "learning_rate": 5e-07, "logits/chosen": -8021093.6, "logits/rejected": -20495720.0, "logps/chosen": -252.553759765625, "logps/rejected": -300.1724039713542, "loss": 0.4022, "rewards/chosen": 0.5279836177825927, "rewards/margins": 1.950851074854533, "rewards/rejected": -1.4228674570719402, "step": 8942 }, { "epoch": 0.47401478811650277, "grad_norm": 65.5, "kl": 0.1487274169921875, "learning_rate": 5e-07, "logits/chosen": -36228758.4, "logits/rejected": -7072985.333333333, "logps/chosen": -554.89541015625, "logps/rejected": -245.50274658203125, "loss": 0.3325, "rewards/chosen": 0.42195377349853513, "rewards/margins": 2.089817460378011, "rewards/rejected": -1.6678636868794758, "step": 8943 }, { "epoch": 0.4740677921183049, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80061088.0, "logits/rejected": -32829945.6, "logps/chosen": -375.567626953125, "logps/rejected": -214.6403076171875, "loss": 0.297, "rewards/chosen": -0.32701416810353595, "rewards/margins": 1.672255317370097, "rewards/rejected": -1.9992694854736328, "step": 8944 }, { "epoch": 0.47412079612010705, "grad_norm": 49.0, "kl": 0.08460044860839844, "learning_rate": 5e-07, "logits/chosen": -16539922.0, "logits/rejected": -11876717.333333334, "logps/chosen": -285.6207275390625, "logps/rejected": -344.9195963541667, "loss": 0.1997, "rewards/chosen": 0.9481334686279297, "rewards/margins": 2.8800595601399737, "rewards/rejected": -1.9319260915120442, "step": 8945 }, { "epoch": 0.4741738001219092, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17528004.0, "logits/rejected": -53862440.0, "logps/chosen": -90.5498275756836, "logps/rejected": -319.17694091796875, "loss": 0.3087, "rewards/chosen": 0.04533100128173828, "rewards/margins": 2.588996171951294, "rewards/rejected": -2.5436651706695557, "step": 8946 }, { "epoch": 0.4742268041237113, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32007941.333333332, "logits/rejected": -17941904.0, "logps/chosen": -405.2145182291667, "logps/rejected": -271.267236328125, "loss": 0.2469, "rewards/chosen": 0.7595427831013998, "rewards/margins": 2.4349135716756187, "rewards/rejected": -1.6753707885742188, "step": 8947 }, { "epoch": 0.47427980812551346, "grad_norm": 47.25, "kl": 2.359013080596924, "learning_rate": 5e-07, "logits/chosen": 3085752.6, "logits/rejected": -33259920.0, "logps/chosen": -167.32607421875, "logps/rejected": -391.3206787109375, "loss": 0.3841, "rewards/chosen": 0.11865160465240479, "rewards/margins": 2.0013431469599405, "rewards/rejected": -1.8826915423075359, "step": 8948 }, { "epoch": 0.4743328121273156, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14870045.333333334, "logits/rejected": -30960412.8, "logps/chosen": -453.5500081380208, "logps/rejected": -294.680859375, "loss": 0.2487, "rewards/chosen": 0.4688669840494792, "rewards/margins": 2.6677305857340494, "rewards/rejected": -2.1988636016845704, "step": 8949 }, { "epoch": 0.47438581612911773, "grad_norm": 41.5, "kl": 1.2724480628967285, "learning_rate": 5e-07, "logits/chosen": 32198550.0, "logits/rejected": -26508885.333333332, "logps/chosen": -1480.85595703125, "logps/rejected": -456.99755859375, "loss": 0.2112, "rewards/chosen": 0.019849061965942383, "rewards/margins": 2.364304780960083, "rewards/rejected": -2.3444557189941406, "step": 8950 }, { "epoch": 0.47443882013091987, "grad_norm": 39.75, "kl": 1.0811996459960938, "learning_rate": 5e-07, "logits/chosen": -23546252.0, "logits/rejected": -41509688.0, "logps/chosen": -124.8730239868164, "logps/rejected": -547.3846435546875, "loss": 0.3516, "rewards/chosen": -0.2280859649181366, "rewards/margins": 2.4389581978321075, "rewards/rejected": -2.667044162750244, "step": 8951 }, { "epoch": 0.474491824132722, "grad_norm": 37.5, "kl": 2.7048187255859375, "learning_rate": 5e-07, "logits/chosen": -30552122.0, "logits/rejected": -11654646.0, "logps/chosen": -364.8425598144531, "logps/rejected": -244.0463409423828, "loss": 0.3223, "rewards/chosen": 0.5834709405899048, "rewards/margins": 1.8097337484359741, "rewards/rejected": -1.2262628078460693, "step": 8952 }, { "epoch": 0.47454482813452414, "grad_norm": 46.25, "kl": 2.790210723876953, "learning_rate": 5e-07, "logits/chosen": -19902798.0, "logits/rejected": -32746764.0, "logps/chosen": -251.5570526123047, "logps/rejected": -244.5052947998047, "loss": 0.2483, "rewards/chosen": 0.5004100799560547, "rewards/margins": 2.3767004013061523, "rewards/rejected": -1.8762903213500977, "step": 8953 }, { "epoch": 0.4745978321363263, "grad_norm": 62.5, "kl": 2.473679542541504, "learning_rate": 5e-07, "logits/chosen": -52786771.2, "logits/rejected": 1379881.3333333333, "logps/chosen": -549.566796875, "logps/rejected": -122.0401611328125, "loss": 0.2918, "rewards/chosen": 0.7691397190093994, "rewards/margins": 2.305862538019816, "rewards/rejected": -1.5367228190104167, "step": 8954 }, { "epoch": 0.4746508361381284, "grad_norm": 50.75, "kl": 0.10611915588378906, "learning_rate": 5e-07, "logits/chosen": -19214460.0, "logits/rejected": -79124888.0, "logps/chosen": -386.64434814453125, "logps/rejected": -491.0199890136719, "loss": 0.2755, "rewards/chosen": 0.19306373596191406, "rewards/margins": 3.1241722106933594, "rewards/rejected": -2.9311084747314453, "step": 8955 }, { "epoch": 0.47470384013993056, "grad_norm": 40.25, "kl": 1.248556137084961, "learning_rate": 5e-07, "logits/chosen": -29904460.8, "logits/rejected": -28304760.0, "logps/chosen": -454.190771484375, "logps/rejected": -340.517822265625, "loss": 0.2401, "rewards/chosen": 1.1636692047119142, "rewards/margins": 3.8274694442749024, "rewards/rejected": -2.6638002395629883, "step": 8956 }, { "epoch": 0.4747568441417327, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28949869.333333332, "logits/rejected": -19721603.2, "logps/chosen": -403.083984375, "logps/rejected": -622.835498046875, "loss": 0.1743, "rewards/chosen": 1.1655431588490803, "rewards/margins": 3.497825225194295, "rewards/rejected": -2.332282066345215, "step": 8957 }, { "epoch": 0.47480984814353483, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16878132.0, "logits/rejected": -3390507.0, "logps/chosen": -276.1537170410156, "logps/rejected": -479.25811767578125, "loss": 0.364, "rewards/chosen": -0.33512192964553833, "rewards/margins": 1.6389722228050232, "rewards/rejected": -1.9740941524505615, "step": 8958 }, { "epoch": 0.47486285214533697, "grad_norm": 44.25, "kl": 2.2153854370117188, "learning_rate": 5e-07, "logits/chosen": -20265761.6, "logits/rejected": 9144664.666666666, "logps/chosen": -158.1432861328125, "logps/rejected": -306.75583902994794, "loss": 0.3806, "rewards/chosen": 0.7100836753845214, "rewards/margins": 2.024245611826579, "rewards/rejected": -1.3141619364420574, "step": 8959 }, { "epoch": 0.4749158561471391, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3581579.6, "logits/rejected": -26280946.666666668, "logps/chosen": -123.17811279296875, "logps/rejected": -484.905029296875, "loss": 0.3276, "rewards/chosen": 0.2490403652191162, "rewards/margins": 2.5007708072662354, "rewards/rejected": -2.251730442047119, "step": 8960 }, { "epoch": 0.47496886014894124, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15043428.8, "logits/rejected": -42250453.333333336, "logps/chosen": -218.3809326171875, "logps/rejected": -460.284912109375, "loss": 0.4561, "rewards/chosen": -0.21856245994567872, "rewards/margins": 0.7876906236012776, "rewards/rejected": -1.0062530835469563, "step": 8961 }, { "epoch": 0.4750218641507434, "grad_norm": 47.75, "kl": 0.3835868835449219, "learning_rate": 5e-07, "logits/chosen": -27849637.333333332, "logits/rejected": -12010928.8, "logps/chosen": -725.9027506510416, "logps/rejected": -253.7602294921875, "loss": 0.2828, "rewards/chosen": 1.1376760005950928, "rewards/margins": 2.465347719192505, "rewards/rejected": -1.327671718597412, "step": 8962 }, { "epoch": 0.4750748681525455, "grad_norm": 59.0, "kl": 0.6769180297851562, "learning_rate": 5e-07, "logits/chosen": -9299720.0, "logits/rejected": -16889812.0, "logps/chosen": -209.54507882254464, "logps/rejected": -83.81568908691406, "loss": 0.4294, "rewards/chosen": 0.2283022744315011, "rewards/margins": 2.0326632601874217, "rewards/rejected": -1.8043609857559204, "step": 8963 }, { "epoch": 0.47512787215434765, "grad_norm": 50.0, "kl": 0.9957389831542969, "learning_rate": 5e-07, "logits/chosen": -25385769.6, "logits/rejected": 4523227.333333333, "logps/chosen": -248.984912109375, "logps/rejected": -316.13330078125, "loss": 0.3438, "rewards/chosen": 0.22134115695953369, "rewards/margins": 2.6666277488072714, "rewards/rejected": -2.445286591847738, "step": 8964 }, { "epoch": 0.4751808761561498, "grad_norm": 62.5, "kl": 1.9939537048339844, "learning_rate": 5e-07, "logits/chosen": -10733685.333333334, "logits/rejected": 6346673.6, "logps/chosen": -348.2996419270833, "logps/rejected": -293.07197265625, "loss": 0.2669, "rewards/chosen": 0.3844056526819865, "rewards/margins": 2.578077451388041, "rewards/rejected": -2.1936717987060548, "step": 8965 }, { "epoch": 0.4752338801579519, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7160560.8, "logits/rejected": -37764762.666666664, "logps/chosen": -263.4754638671875, "logps/rejected": -221.24051920572916, "loss": 0.3842, "rewards/chosen": -0.05240086317062378, "rewards/margins": 1.6110525568326313, "rewards/rejected": -1.6634534200032551, "step": 8966 }, { "epoch": 0.47528688415975406, "grad_norm": 37.75, "kl": 1.7401466369628906, "learning_rate": 5e-07, "logits/chosen": 867167.3125, "logits/rejected": -10219971.0, "logps/chosen": -90.74398803710938, "logps/rejected": -358.455078125, "loss": 0.3035, "rewards/chosen": 0.6144689917564392, "rewards/margins": 2.358895242214203, "rewards/rejected": -1.7444262504577637, "step": 8967 }, { "epoch": 0.4753398881615562, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5755998.666666667, "logits/rejected": -19814260.0, "logps/chosen": -39.10411071777344, "logps/rejected": -417.5484619140625, "loss": 0.4129, "rewards/chosen": -0.058360288540522255, "rewards/margins": 2.238131811221441, "rewards/rejected": -2.296492099761963, "step": 8968 }, { "epoch": 0.47539289216335834, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10525796.666666666, "logits/rejected": -67475360.0, "logps/chosen": -285.1129964192708, "logps/rejected": -315.8902587890625, "loss": 0.2557, "rewards/chosen": 0.16792017221450806, "rewards/margins": 2.56508127450943, "rewards/rejected": -2.397161102294922, "step": 8969 }, { "epoch": 0.4754458961651605, "grad_norm": 76.5, "kl": 0.5391464233398438, "learning_rate": 5e-07, "logits/chosen": -66466424.0, "logits/rejected": -17683880.0, "logps/chosen": -254.54283142089844, "logps/rejected": -179.08599853515625, "loss": 0.361, "rewards/chosen": 0.1663007140159607, "rewards/margins": 1.6284037232398987, "rewards/rejected": -1.462103009223938, "step": 8970 }, { "epoch": 0.4754989001669626, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -132555128.0, "logits/rejected": -23917242.666666668, "logps/chosen": -410.20562744140625, "logps/rejected": -281.92030843098956, "loss": 0.2468, "rewards/chosen": 0.37895357608795166, "rewards/margins": 2.3485679229100542, "rewards/rejected": -1.9696143468221028, "step": 8971 }, { "epoch": 0.47555190416876475, "grad_norm": 54.75, "kl": 0.7939071655273438, "learning_rate": 5e-07, "logits/chosen": -16360921.0, "logits/rejected": -27004060.0, "logps/chosen": -253.0509796142578, "logps/rejected": -320.46783447265625, "loss": 0.3288, "rewards/chosen": 0.37001922726631165, "rewards/margins": 1.6658895313739777, "rewards/rejected": -1.295870304107666, "step": 8972 }, { "epoch": 0.4756049081705669, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17389402.0, "logits/rejected": -22879072.0, "logps/chosen": -90.51785278320312, "logps/rejected": -246.0842742919922, "loss": 0.2863, "rewards/chosen": 0.26692914962768555, "rewards/margins": 2.218813180923462, "rewards/rejected": -1.9518840312957764, "step": 8973 }, { "epoch": 0.475657912172369, "grad_norm": 35.75, "kl": 0.1857166290283203, "learning_rate": 5e-07, "logits/chosen": -17345710.0, "logits/rejected": -29318429.333333332, "logps/chosen": -328.72698974609375, "logps/rejected": -209.8458048502604, "loss": 0.1748, "rewards/chosen": 0.6193546056747437, "rewards/margins": 2.8866974910100303, "rewards/rejected": -2.2673428853352866, "step": 8974 }, { "epoch": 0.47571091617417116, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21514474.666666668, "logits/rejected": -9549960.0, "logps/chosen": -172.917724609375, "logps/rejected": -183.217236328125, "loss": 0.3396, "rewards/chosen": -0.3461415767669678, "rewards/margins": 1.1352831363677978, "rewards/rejected": -1.4814247131347655, "step": 8975 }, { "epoch": 0.4757639201759733, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53904040.0, "logits/rejected": -18144561.14285714, "logps/chosen": -986.4473266601562, "logps/rejected": -450.46114676339283, "loss": 0.1789, "rewards/chosen": -0.10794677585363388, "rewards/margins": 2.615044895027365, "rewards/rejected": -2.7229916708809987, "step": 8976 }, { "epoch": 0.47581692417777544, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36293989.333333336, "logits/rejected": 3752290.4, "logps/chosen": -200.09651692708334, "logps/rejected": -332.379052734375, "loss": 0.2973, "rewards/chosen": 0.09362894296646118, "rewards/margins": 1.7183832764625548, "rewards/rejected": -1.6247543334960937, "step": 8977 }, { "epoch": 0.4758699281795776, "grad_norm": 56.75, "kl": 0.5192489624023438, "learning_rate": 5e-07, "logits/chosen": -67273420.8, "logits/rejected": -41501733.333333336, "logps/chosen": -252.9723388671875, "logps/rejected": -253.8577880859375, "loss": 0.4219, "rewards/chosen": 0.011291974782943725, "rewards/margins": 1.2900082210699717, "rewards/rejected": -1.278716246287028, "step": 8978 }, { "epoch": 0.4759229321813797, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45460042.666666664, "logits/rejected": -8453852.0, "logps/chosen": -295.6943359375, "logps/rejected": -282.86962890625, "loss": 0.2445, "rewards/chosen": 0.3087697426478068, "rewards/margins": 2.572239820162455, "rewards/rejected": -2.2634700775146483, "step": 8979 }, { "epoch": 0.47597593618318185, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14892507.0, "logits/rejected": -8723086.0, "logps/chosen": -294.15338134765625, "logps/rejected": -235.7077178955078, "loss": 0.286, "rewards/chosen": 0.6626005172729492, "rewards/margins": 2.340775966644287, "rewards/rejected": -1.678175449371338, "step": 8980 }, { "epoch": 0.476028940184984, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53163744.0, "logits/rejected": 1284720.0, "logps/chosen": -398.19970703125, "logps/rejected": -123.1640625, "loss": 0.3172, "rewards/chosen": 0.08841603994369507, "rewards/margins": 1.649796712398529, "rewards/rejected": -1.561380672454834, "step": 8981 }, { "epoch": 0.4760819441867861, "grad_norm": 56.25, "kl": 0.9843826293945312, "learning_rate": 5e-07, "logits/chosen": -16744070.4, "logits/rejected": -71384997.33333333, "logps/chosen": -356.7185546875, "logps/rejected": -471.2386067708333, "loss": 0.2992, "rewards/chosen": 0.49779162406921384, "rewards/margins": 3.4732421398162843, "rewards/rejected": -2.9754505157470703, "step": 8982 }, { "epoch": 0.47613494818858826, "grad_norm": 49.25, "kl": 0.6134796142578125, "learning_rate": 5e-07, "logits/chosen": -33074438.4, "logits/rejected": -37361760.0, "logps/chosen": -253.036083984375, "logps/rejected": -316.8025309244792, "loss": 0.326, "rewards/chosen": 0.3150261640548706, "rewards/margins": 2.1282681226730347, "rewards/rejected": -1.813241958618164, "step": 8983 }, { "epoch": 0.4761879521903904, "grad_norm": 52.0, "kl": 0.18560409545898438, "learning_rate": 5e-07, "logits/chosen": -21610558.0, "logits/rejected": -47512244.0, "logps/chosen": -245.5186767578125, "logps/rejected": -324.9728088378906, "loss": 0.2215, "rewards/chosen": 0.5833561420440674, "rewards/margins": 3.19950532913208, "rewards/rejected": -2.6161491870880127, "step": 8984 }, { "epoch": 0.47624095619219253, "grad_norm": 37.0, "kl": 0.15550708770751953, "learning_rate": 5e-07, "logits/chosen": -34021392.0, "logits/rejected": -15032260.0, "logps/chosen": -684.6822509765625, "logps/rejected": -155.24172973632812, "loss": 0.159, "rewards/chosen": 1.313331127166748, "rewards/margins": 4.507448673248291, "rewards/rejected": -3.194117546081543, "step": 8985 }, { "epoch": 0.47629396019399467, "grad_norm": 60.25, "kl": 0.33742523193359375, "learning_rate": 5e-07, "logits/chosen": -17984659.2, "logits/rejected": -17344352.0, "logps/chosen": -297.3701904296875, "logps/rejected": -213.36153157552084, "loss": 0.3543, "rewards/chosen": 0.33041982650756835, "rewards/margins": 1.6085427284240723, "rewards/rejected": -1.278122901916504, "step": 8986 }, { "epoch": 0.4763469641957968, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38002140.0, "logits/rejected": -97381272.0, "logps/chosen": -220.57301330566406, "logps/rejected": -564.6348266601562, "loss": 0.27, "rewards/chosen": -0.04340839385986328, "rewards/margins": 3.7916693687438965, "rewards/rejected": -3.8350777626037598, "step": 8987 }, { "epoch": 0.47639996819759894, "grad_norm": 40.5, "kl": 0.14373016357421875, "learning_rate": 5e-07, "logits/chosen": -63719749.333333336, "logits/rejected": -10159240.8, "logps/chosen": -219.3455607096354, "logps/rejected": -304.4830322265625, "loss": 0.3506, "rewards/chosen": -0.2229877511660258, "rewards/margins": 1.3368960340817768, "rewards/rejected": -1.5598837852478027, "step": 8988 }, { "epoch": 0.4764529721994011, "grad_norm": 48.75, "kl": 2.9129676818847656, "learning_rate": 5e-07, "logits/chosen": 737428.8, "logits/rejected": -26098341.333333332, "logps/chosen": -433.851171875, "logps/rejected": -469.0044352213542, "loss": 0.3097, "rewards/chosen": 0.8159015655517579, "rewards/margins": 3.4368134816487634, "rewards/rejected": -2.6209119160970054, "step": 8989 }, { "epoch": 0.4765059762012032, "grad_norm": 35.25, "kl": 3.4444808959960938, "learning_rate": 5e-07, "logits/chosen": -19279016.0, "logits/rejected": -1754823.3333333333, "logps/chosen": -439.575048828125, "logps/rejected": -518.5144449869791, "loss": 0.2455, "rewards/chosen": 1.4752986907958985, "rewards/margins": 3.9814072926839197, "rewards/rejected": -2.506108601888021, "step": 8990 }, { "epoch": 0.4765589802030053, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35730278.4, "logits/rejected": -34856968.0, "logps/chosen": -385.47880859375, "logps/rejected": -199.50567626953125, "loss": 0.3376, "rewards/chosen": 0.15952818393707274, "rewards/margins": 2.591020353635152, "rewards/rejected": -2.4314921696980796, "step": 8991 }, { "epoch": 0.47661198420480744, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50072019.2, "logits/rejected": -14551160.0, "logps/chosen": -454.41171875, "logps/rejected": -333.5497639973958, "loss": 0.3446, "rewards/chosen": 0.3571801662445068, "rewards/margins": 2.00941424369812, "rewards/rejected": -1.6522340774536133, "step": 8992 }, { "epoch": 0.4766649882066096, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51154892.0, "logits/rejected": 2864804.5, "logps/chosen": -269.9969482421875, "logps/rejected": -117.27497100830078, "loss": 0.2832, "rewards/chosen": 1.0853042602539062, "rewards/margins": 2.7937395572662354, "rewards/rejected": -1.708435297012329, "step": 8993 }, { "epoch": 0.4767179922084117, "grad_norm": 40.75, "kl": 0.18607330322265625, "learning_rate": 5e-07, "logits/chosen": -25562578.666666668, "logits/rejected": -74883424.0, "logps/chosen": -381.6871744791667, "logps/rejected": -350.628076171875, "loss": 0.2, "rewards/chosen": 1.0516049067179363, "rewards/margins": 2.9707110087076822, "rewards/rejected": -1.9191061019897462, "step": 8994 }, { "epoch": 0.47677099621021385, "grad_norm": 60.5, "kl": 0.3928642272949219, "learning_rate": 5e-07, "logits/chosen": 16824101.333333332, "logits/rejected": -29159450.0, "logps/chosen": -787.7203776041666, "logps/rejected": -169.73336791992188, "loss": 0.32, "rewards/chosen": 1.2619683742523193, "rewards/margins": 1.8206408023834229, "rewards/rejected": -0.5586724281311035, "step": 8995 }, { "epoch": 0.476824000212016, "grad_norm": 50.75, "kl": 0.06583786010742188, "learning_rate": 5e-07, "logits/chosen": -141854.9, "logits/rejected": 37328821.333333336, "logps/chosen": -243.6576416015625, "logps/rejected": -414.9419352213542, "loss": 0.3307, "rewards/chosen": 0.45004920959472655, "rewards/margins": 2.0553006807963055, "rewards/rejected": -1.6052514712015789, "step": 8996 }, { "epoch": 0.4768770042138181, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42001517.333333336, "logits/rejected": -22313248.0, "logps/chosen": -246.39762369791666, "logps/rejected": -401.021630859375, "loss": 0.3022, "rewards/chosen": -0.32558999458948773, "rewards/margins": 1.813531251748403, "rewards/rejected": -2.139121246337891, "step": 8997 }, { "epoch": 0.47693000821562026, "grad_norm": 43.0, "kl": 0.2604637145996094, "learning_rate": 5e-07, "logits/chosen": -34943136.0, "logits/rejected": -7504281.333333333, "logps/chosen": -187.14298095703126, "logps/rejected": -156.52885945638022, "loss": 0.3637, "rewards/chosen": 0.15863914489746095, "rewards/margins": 2.2179468790690104, "rewards/rejected": -2.0593077341715493, "step": 8998 }, { "epoch": 0.4769830122174224, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32130902.0, "logits/rejected": -50265044.0, "logps/chosen": -385.0485534667969, "logps/rejected": -405.0628967285156, "loss": 0.288, "rewards/chosen": 0.21840959787368774, "rewards/margins": 2.488946497440338, "rewards/rejected": -2.2705368995666504, "step": 8999 }, { "epoch": 0.47703601621922453, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84560554.66666667, "logits/rejected": -24392972.8, "logps/chosen": -443.9960123697917, "logps/rejected": -204.12225341796875, "loss": 0.3154, "rewards/chosen": -0.00864461064338684, "rewards/margins": 1.7112751662731172, "rewards/rejected": -1.719919776916504, "step": 9000 }, { "epoch": 0.47708902022102667, "grad_norm": 53.5, "kl": 0.7751007080078125, "learning_rate": 5e-07, "logits/chosen": -20063016.0, "logits/rejected": -29930424.0, "logps/chosen": -329.42626953125, "logps/rejected": -439.4774576822917, "loss": 0.329, "rewards/chosen": 0.45023417472839355, "rewards/margins": 2.362172683080037, "rewards/rejected": -1.9119385083516438, "step": 9001 }, { "epoch": 0.4771420242228288, "grad_norm": 54.25, "kl": 1.6952905654907227, "learning_rate": 5e-07, "logits/chosen": 22455037.333333332, "logits/rejected": 1227908.8, "logps/chosen": -348.2462972005208, "logps/rejected": -280.737646484375, "loss": 0.2964, "rewards/chosen": 0.1897982358932495, "rewards/margins": 1.978472399711609, "rewards/rejected": -1.7886741638183594, "step": 9002 }, { "epoch": 0.47719502822463095, "grad_norm": 36.25, "kl": 1.0431461334228516, "learning_rate": 5e-07, "logits/chosen": -1460389.6666666667, "logits/rejected": -1737780.0, "logps/chosen": -317.66225179036456, "logps/rejected": -318.11572265625, "loss": 0.2905, "rewards/chosen": 0.9666957855224609, "rewards/margins": 4.558034896850586, "rewards/rejected": -3.591339111328125, "step": 9003 }, { "epoch": 0.4772480322264331, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4565347.333333333, "logits/rejected": -12251263.2, "logps/chosen": -71.31962585449219, "logps/rejected": -213.939013671875, "loss": 0.2547, "rewards/chosen": 0.2778860727945964, "rewards/margins": 2.287919489542643, "rewards/rejected": -2.0100334167480467, "step": 9004 }, { "epoch": 0.4773010362282352, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35498464.0, "logits/rejected": -18736480.0, "logps/chosen": -381.9851379394531, "logps/rejected": -124.59521484375, "loss": 0.268, "rewards/chosen": 0.5828354358673096, "rewards/margins": 2.160403251647949, "rewards/rejected": -1.5775678157806396, "step": 9005 }, { "epoch": 0.47735404023003736, "grad_norm": 41.25, "kl": 0.23779296875, "learning_rate": 5e-07, "logits/chosen": -25751486.0, "logits/rejected": -17501614.0, "logps/chosen": -177.80902099609375, "logps/rejected": -383.5660400390625, "loss": 0.2648, "rewards/chosen": 0.45702987909317017, "rewards/margins": 2.6207364201545715, "rewards/rejected": -2.1637065410614014, "step": 9006 }, { "epoch": 0.4774070442318395, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13063710.666666666, "logits/rejected": -3236671.0, "logps/chosen": -304.76694742838544, "logps/rejected": -101.63060302734375, "loss": 0.2553, "rewards/chosen": 0.5663243532180786, "rewards/margins": 2.4540377855300903, "rewards/rejected": -1.8877134323120117, "step": 9007 }, { "epoch": 0.47746004823364163, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33863052.8, "logits/rejected": -29413800.0, "logps/chosen": -507.05390625, "logps/rejected": -449.0666910807292, "loss": 0.3589, "rewards/chosen": 0.30782761573791506, "rewards/margins": 2.5972248554229735, "rewards/rejected": -2.2893972396850586, "step": 9008 }, { "epoch": 0.47751305223544377, "grad_norm": 66.5, "kl": 2.2274961471557617, "learning_rate": 5e-07, "logits/chosen": -19723762.0, "logits/rejected": -9012840.0, "logps/chosen": -332.3682556152344, "logps/rejected": -413.0731608072917, "loss": 0.2557, "rewards/chosen": 0.7217962741851807, "rewards/margins": 2.617553313573201, "rewards/rejected": -1.8957570393880208, "step": 9009 }, { "epoch": 0.4775660562372459, "grad_norm": 105.0, "kl": 0.4524965286254883, "learning_rate": 5e-07, "logits/chosen": 14669504.0, "logits/rejected": -35880029.333333336, "logps/chosen": -1447.7314453125, "logps/rejected": -185.49015299479166, "loss": 0.3318, "rewards/chosen": 0.41840438842773436, "rewards/margins": 2.0460424105326336, "rewards/rejected": -1.6276380221048992, "step": 9010 }, { "epoch": 0.47761906023904804, "grad_norm": 40.25, "kl": 0.8246383666992188, "learning_rate": 5e-07, "logits/chosen": -113357304.0, "logits/rejected": -31771172.0, "logps/chosen": -758.0618896484375, "logps/rejected": -294.0669250488281, "loss": 0.2576, "rewards/chosen": 1.1192076206207275, "rewards/margins": 3.1346352100372314, "rewards/rejected": -2.015427589416504, "step": 9011 }, { "epoch": 0.4776720642408502, "grad_norm": 44.5, "kl": 2.6243839263916016, "learning_rate": 5e-07, "logits/chosen": -19015508.8, "logits/rejected": -34733221.333333336, "logps/chosen": -108.8406005859375, "logps/rejected": -311.5698649088542, "loss": 0.4415, "rewards/chosen": -0.11220425367355347, "rewards/margins": 1.073968231678009, "rewards/rejected": -1.1861724853515625, "step": 9012 }, { "epoch": 0.4777250682426523, "grad_norm": 42.75, "kl": 0.13457870483398438, "learning_rate": 5e-07, "logits/chosen": -22288150.4, "logits/rejected": -95541898.66666667, "logps/chosen": -228.363720703125, "logps/rejected": -323.0443522135417, "loss": 0.2828, "rewards/chosen": 0.551097059249878, "rewards/margins": 3.402463420232137, "rewards/rejected": -2.8513663609822593, "step": 9013 }, { "epoch": 0.47777807224445445, "grad_norm": 50.0, "kl": 0.47226715087890625, "learning_rate": 5e-07, "logits/chosen": -19529862.85714286, "logits/rejected": -45234368.0, "logps/chosen": -259.99124581473217, "logps/rejected": -501.04168701171875, "loss": 0.2759, "rewards/chosen": 1.079035758972168, "rewards/margins": 5.796202182769775, "rewards/rejected": -4.717166423797607, "step": 9014 }, { "epoch": 0.4778310762462566, "grad_norm": 45.75, "kl": 1.8048667907714844, "learning_rate": 5e-07, "logits/chosen": 17812886.0, "logits/rejected": -22349988.0, "logps/chosen": -210.6189727783203, "logps/rejected": -295.921630859375, "loss": 0.3292, "rewards/chosen": 0.07408660650253296, "rewards/margins": 2.179986894130707, "rewards/rejected": -2.105900287628174, "step": 9015 }, { "epoch": 0.47788408024805873, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56463484.0, "logits/rejected": -28630317.333333332, "logps/chosen": -377.9669494628906, "logps/rejected": -171.0554402669271, "loss": 0.2738, "rewards/chosen": 0.02319183573126793, "rewards/margins": 1.7805958452324073, "rewards/rejected": -1.7574040095011394, "step": 9016 }, { "epoch": 0.47793708424986087, "grad_norm": 37.25, "kl": 1.7490386962890625, "learning_rate": 5e-07, "logits/chosen": -46925221.333333336, "logits/rejected": -62198835.2, "logps/chosen": -531.3823649088541, "logps/rejected": -485.970947265625, "loss": 0.1127, "rewards/chosen": 1.7343292236328125, "rewards/margins": 5.126298141479492, "rewards/rejected": -3.39196891784668, "step": 9017 }, { "epoch": 0.477990088251663, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21177553.333333332, "logits/rejected": -17602761.6, "logps/chosen": -221.49580891927084, "logps/rejected": -179.0109375, "loss": 0.3332, "rewards/chosen": -0.06915257374445598, "rewards/margins": 1.4634968479474384, "rewards/rejected": -1.5326494216918944, "step": 9018 }, { "epoch": 0.47804309225346514, "grad_norm": 50.0, "kl": 0.24767494201660156, "learning_rate": 5e-07, "logits/chosen": -18874805.333333332, "logits/rejected": -16682732.8, "logps/chosen": -187.46903483072916, "logps/rejected": -220.269873046875, "loss": 0.273, "rewards/chosen": 0.39277013142903644, "rewards/margins": 2.2055962880452475, "rewards/rejected": -1.812826156616211, "step": 9019 }, { "epoch": 0.4780960962552673, "grad_norm": 55.0, "kl": 0.128814697265625, "learning_rate": 5e-07, "logits/chosen": -41023044.0, "logits/rejected": -18284850.666666668, "logps/chosen": -306.79705810546875, "logps/rejected": -376.8418782552083, "loss": 0.2056, "rewards/chosen": 1.271662950515747, "rewards/margins": 2.8909618854522705, "rewards/rejected": -1.6192989349365234, "step": 9020 }, { "epoch": 0.4781491002570694, "grad_norm": 53.75, "kl": 1.7874794006347656, "learning_rate": 5e-07, "logits/chosen": 12497486.0, "logits/rejected": -5990062.0, "logps/chosen": -598.8060913085938, "logps/rejected": -258.2212829589844, "loss": 0.1797, "rewards/chosen": 1.1361726522445679, "rewards/margins": 3.964558482170105, "rewards/rejected": -2.828385829925537, "step": 9021 }, { "epoch": 0.47820210425887155, "grad_norm": 49.0, "kl": 0.9386615753173828, "learning_rate": 5e-07, "logits/chosen": -36907958.4, "logits/rejected": -23784522.666666668, "logps/chosen": -202.213818359375, "logps/rejected": -310.5181477864583, "loss": 0.319, "rewards/chosen": 0.3441324234008789, "rewards/margins": 2.683234214782715, "rewards/rejected": -2.339101791381836, "step": 9022 }, { "epoch": 0.4782551082606737, "grad_norm": 31.5, "kl": 2.2832107543945312, "learning_rate": 5e-07, "logits/chosen": -7941447.333333333, "logits/rejected": -13480944.0, "logps/chosen": -233.788818359375, "logps/rejected": -398.4314208984375, "loss": 0.2001, "rewards/chosen": 0.7933500607808431, "rewards/margins": 3.274106057484945, "rewards/rejected": -2.4807559967041017, "step": 9023 }, { "epoch": 0.4783081122624758, "grad_norm": 57.0, "kl": 1.1426281929016113, "learning_rate": 5e-07, "logits/chosen": 16744197.333333334, "logits/rejected": -12581387.0, "logps/chosen": -252.1345418294271, "logps/rejected": -196.1517333984375, "loss": 0.4069, "rewards/chosen": 0.18594221274058023, "rewards/margins": 1.941270391146342, "rewards/rejected": -1.7553281784057617, "step": 9024 }, { "epoch": 0.47836111626427796, "grad_norm": 42.5, "kl": 0.2764568328857422, "learning_rate": 5e-07, "logits/chosen": -28279148.0, "logits/rejected": 1228826.5, "logps/chosen": -336.2080993652344, "logps/rejected": -238.3439178466797, "loss": 0.2554, "rewards/chosen": 0.4919561445713043, "rewards/margins": 3.290051370859146, "rewards/rejected": -2.798095226287842, "step": 9025 }, { "epoch": 0.4784141202660801, "grad_norm": 45.25, "kl": 1.4365224838256836, "learning_rate": 5e-07, "logits/chosen": -10987918.857142856, "logits/rejected": 3780175.5, "logps/chosen": -214.76417759486608, "logps/rejected": -32.82640838623047, "loss": 0.4271, "rewards/chosen": 0.36948258536202566, "rewards/margins": 1.6810676540647234, "rewards/rejected": -1.3115850687026978, "step": 9026 }, { "epoch": 0.47846712426788224, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34259180.8, "logits/rejected": -29275994.666666668, "logps/chosen": -81.92332763671875, "logps/rejected": -230.60555013020834, "loss": 0.3119, "rewards/chosen": 0.27886247634887695, "rewards/margins": 2.5919737815856934, "rewards/rejected": -2.3131113052368164, "step": 9027 }, { "epoch": 0.4785201282696844, "grad_norm": 62.25, "kl": 0.1863861083984375, "learning_rate": 5e-07, "logits/chosen": -11884337.333333334, "logits/rejected": 5777712.0, "logps/chosen": -430.6740315755208, "logps/rejected": -77.40225219726562, "loss": 0.2887, "rewards/chosen": 0.6251617670059204, "rewards/margins": 3.5053149461746216, "rewards/rejected": -2.880153179168701, "step": 9028 }, { "epoch": 0.4785731322714865, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 74289152.0, "logits/rejected": -13218463.0, "logps/chosen": -382.1138916015625, "logps/rejected": -259.52178955078125, "loss": 0.2446, "rewards/chosen": 0.6638189554214478, "rewards/margins": 3.04527747631073, "rewards/rejected": -2.3814585208892822, "step": 9029 }, { "epoch": 0.47862613627328865, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48665336.0, "logits/rejected": -8136726.0, "logps/chosen": -496.92095947265625, "logps/rejected": -327.9412027994792, "loss": 0.2293, "rewards/chosen": -0.05481109395623207, "rewards/margins": 2.127355482429266, "rewards/rejected": -2.182166576385498, "step": 9030 }, { "epoch": 0.4786791402750908, "grad_norm": 57.25, "kl": 1.45477294921875, "learning_rate": 5e-07, "logits/chosen": -38172483.2, "logits/rejected": -8482925.333333334, "logps/chosen": -398.30244140625, "logps/rejected": -173.719970703125, "loss": 0.2937, "rewards/chosen": 0.6541641235351563, "rewards/margins": 2.718656826019287, "rewards/rejected": -2.064492702484131, "step": 9031 }, { "epoch": 0.4787321442768929, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 32656584.0, "logits/rejected": -2560293.5, "logps/chosen": -287.3500671386719, "logps/rejected": -112.78955078125, "loss": 0.2361, "rewards/chosen": 1.0860035419464111, "rewards/margins": 2.824251651763916, "rewards/rejected": -1.7382481098175049, "step": 9032 }, { "epoch": 0.47878514827869506, "grad_norm": 51.75, "kl": 3.56911563873291, "learning_rate": 5e-07, "logits/chosen": -24273226.666666668, "logits/rejected": -16151706.0, "logps/chosen": -264.14528401692706, "logps/rejected": -291.85040283203125, "loss": 0.3195, "rewards/chosen": 0.8372068405151367, "rewards/margins": 3.9073641300201416, "rewards/rejected": -3.070157289505005, "step": 9033 }, { "epoch": 0.4788381522804972, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56231896.0, "logits/rejected": -19969680.0, "logps/chosen": -449.97283935546875, "logps/rejected": -112.94868977864583, "loss": 0.3491, "rewards/chosen": 0.16493530571460724, "rewards/margins": 0.9948963671922684, "rewards/rejected": -0.8299610614776611, "step": 9034 }, { "epoch": 0.47889115628229934, "grad_norm": 49.75, "kl": 0.2863121032714844, "learning_rate": 5e-07, "logits/chosen": -36075958.4, "logits/rejected": -15680296.0, "logps/chosen": -412.566162109375, "logps/rejected": -156.7183837890625, "loss": 0.35, "rewards/chosen": 0.6279084205627441, "rewards/margins": 1.9778197924296061, "rewards/rejected": -1.349911371866862, "step": 9035 }, { "epoch": 0.47894416028410147, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40529418.666666664, "logits/rejected": -5033573.6, "logps/chosen": -172.3330281575521, "logps/rejected": -307.017724609375, "loss": 0.2806, "rewards/chosen": 0.06226882338523865, "rewards/margins": 2.075175279378891, "rewards/rejected": -2.0129064559936523, "step": 9036 }, { "epoch": 0.4789971642859036, "grad_norm": 52.5, "kl": 0.2525177001953125, "learning_rate": 5e-07, "logits/chosen": -21975372.0, "logits/rejected": -31506116.0, "logps/chosen": -413.10064697265625, "logps/rejected": -570.7725219726562, "loss": 0.2199, "rewards/chosen": 0.6537209153175354, "rewards/margins": 4.118901789188385, "rewards/rejected": -3.4651808738708496, "step": 9037 }, { "epoch": 0.47905016828770575, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40611721.6, "logits/rejected": -33435290.666666668, "logps/chosen": -454.98857421875, "logps/rejected": -296.75400797526044, "loss": 0.2529, "rewards/chosen": 1.008713436126709, "rewards/margins": 2.926179599761963, "rewards/rejected": -1.917466163635254, "step": 9038 }, { "epoch": 0.4791031722895079, "grad_norm": 56.0, "kl": 2.359600067138672, "learning_rate": 5e-07, "logits/chosen": -25693516.8, "logits/rejected": -34473480.0, "logps/chosen": -432.12900390625, "logps/rejected": -429.0784912109375, "loss": 0.2772, "rewards/chosen": 0.9053438186645508, "rewards/margins": 3.216569073994955, "rewards/rejected": -2.311225255330404, "step": 9039 }, { "epoch": 0.47915617629131, "grad_norm": 50.0, "kl": 0.15431594848632812, "learning_rate": 5e-07, "logits/chosen": -32909827.2, "logits/rejected": 82306992.0, "logps/chosen": -377.9126220703125, "logps/rejected": -331.00286865234375, "loss": 0.2859, "rewards/chosen": 0.4992959976196289, "rewards/margins": 2.9698932647705076, "rewards/rejected": -2.470597267150879, "step": 9040 }, { "epoch": 0.4792091802931121, "grad_norm": 76.5, "kl": 0.28290367126464844, "learning_rate": 5e-07, "logits/chosen": -32564122.666666668, "logits/rejected": -1865384.5, "logps/chosen": -360.4132893880208, "logps/rejected": -133.86123657226562, "loss": 0.2769, "rewards/chosen": 1.1640129884084065, "rewards/margins": 2.9215493996938067, "rewards/rejected": -1.7575364112854004, "step": 9041 }, { "epoch": 0.47926218429491424, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28859802.0, "logits/rejected": -23432044.0, "logps/chosen": -191.72544860839844, "logps/rejected": -378.7846374511719, "loss": 0.3479, "rewards/chosen": -0.178666889667511, "rewards/margins": 1.61115962266922, "rewards/rejected": -1.789826512336731, "step": 9042 }, { "epoch": 0.4793151882967164, "grad_norm": 36.75, "kl": 1.4209403991699219, "learning_rate": 5e-07, "logits/chosen": -21536876.8, "logits/rejected": 2724144.0, "logps/chosen": -539.964599609375, "logps/rejected": -88.44258626302083, "loss": 0.2468, "rewards/chosen": 1.2472967147827148, "rewards/margins": 3.106766923268636, "rewards/rejected": -1.8594702084859211, "step": 9043 }, { "epoch": 0.4793681922985185, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13839971.0, "logits/rejected": -102729200.0, "logps/chosen": -837.9356689453125, "logps/rejected": -490.623291015625, "loss": 0.2526, "rewards/chosen": 0.49315452575683594, "rewards/margins": 2.984337568283081, "rewards/rejected": -2.491183042526245, "step": 9044 }, { "epoch": 0.47942119630032065, "grad_norm": 64.0, "kl": 0.5976362228393555, "learning_rate": 5e-07, "logits/chosen": -71827528.0, "logits/rejected": 13488252.0, "logps/chosen": -528.4012451171875, "logps/rejected": -382.2333170572917, "loss": 0.1856, "rewards/chosen": 1.2022583484649658, "rewards/margins": 2.9565788110097246, "rewards/rejected": -1.754320462544759, "step": 9045 }, { "epoch": 0.4794742003021228, "grad_norm": 101.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63691896.0, "logits/rejected": -54903786.666666664, "logps/chosen": -1203.6046142578125, "logps/rejected": -369.2231852213542, "loss": 0.2148, "rewards/chosen": 0.4209045469760895, "rewards/margins": 2.339552154143651, "rewards/rejected": -1.9186476071675618, "step": 9046 }, { "epoch": 0.4795272043039249, "grad_norm": 53.0, "kl": 0.07106208801269531, "learning_rate": 5e-07, "logits/chosen": -35208560.0, "logits/rejected": -8656162.0, "logps/chosen": -410.71197509765625, "logps/rejected": -127.10131072998047, "loss": 0.2781, "rewards/chosen": 0.6721305847167969, "rewards/margins": 2.032291531562805, "rewards/rejected": -1.3601609468460083, "step": 9047 }, { "epoch": 0.47958020830572706, "grad_norm": 77.0, "kl": 1.0214576721191406, "learning_rate": 5e-07, "logits/chosen": -14265273.6, "logits/rejected": -19759238.666666668, "logps/chosen": -396.877294921875, "logps/rejected": -285.58770751953125, "loss": 0.2718, "rewards/chosen": 0.5760998249053955, "rewards/margins": 4.1026831785837805, "rewards/rejected": -3.5265833536783853, "step": 9048 }, { "epoch": 0.4796332123075292, "grad_norm": 48.0, "kl": 0.45162200927734375, "learning_rate": 5e-07, "logits/chosen": -33865352.0, "logits/rejected": -6283091.5, "logps/chosen": -246.11691284179688, "logps/rejected": -298.0857238769531, "loss": 0.2547, "rewards/chosen": 0.5628522038459778, "rewards/margins": 2.53417831659317, "rewards/rejected": -1.9713261127471924, "step": 9049 }, { "epoch": 0.47968621630933134, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27899376.0, "logits/rejected": -18123806.4, "logps/chosen": -324.8672281901042, "logps/rejected": -133.875146484375, "loss": 0.2551, "rewards/chosen": 0.82623291015625, "rewards/margins": 2.3142182350158693, "rewards/rejected": -1.487985324859619, "step": 9050 }, { "epoch": 0.4797392203111335, "grad_norm": 47.75, "kl": 1.2790393829345703, "learning_rate": 5e-07, "logits/chosen": -2701494.25, "logits/rejected": -12691621.333333334, "logps/chosen": -135.37307739257812, "logps/rejected": -151.23259480794272, "loss": 0.1888, "rewards/chosen": 0.8068782687187195, "rewards/margins": 2.756832142670949, "rewards/rejected": -1.9499538739522297, "step": 9051 }, { "epoch": 0.4797922243129356, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11886177.0, "logits/rejected": -5003283.333333333, "logps/chosen": -201.19137573242188, "logps/rejected": -135.3455607096354, "loss": 0.3121, "rewards/chosen": 0.1071355789899826, "rewards/margins": 1.47963647544384, "rewards/rejected": -1.3725008964538574, "step": 9052 }, { "epoch": 0.47984522831473775, "grad_norm": 54.25, "kl": 0.6819343566894531, "learning_rate": 5e-07, "logits/chosen": 27683654.4, "logits/rejected": -15320976.0, "logps/chosen": -324.6124267578125, "logps/rejected": -389.8280029296875, "loss": 0.3456, "rewards/chosen": 0.2218196153640747, "rewards/margins": 3.4251938422520958, "rewards/rejected": -3.203374226888021, "step": 9053 }, { "epoch": 0.4798982323165399, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54981189.333333336, "logits/rejected": -20866272.0, "logps/chosen": -261.98402913411456, "logps/rejected": -273.8403625488281, "loss": 0.4354, "rewards/chosen": -0.20884146293004355, "rewards/margins": 2.5010775129000344, "rewards/rejected": -2.709918975830078, "step": 9054 }, { "epoch": 0.479951236318342, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25829286.0, "logits/rejected": -22783424.0, "logps/chosen": -247.677490234375, "logps/rejected": -393.4864501953125, "loss": 0.1127, "rewards/chosen": 1.440086841583252, "rewards/margins": 4.923858165740967, "rewards/rejected": -3.483771324157715, "step": 9055 }, { "epoch": 0.48000424032014416, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22757740.8, "logits/rejected": -59533610.666666664, "logps/chosen": -327.909228515625, "logps/rejected": -146.68975830078125, "loss": 0.4069, "rewards/chosen": 0.042712098360061644, "rewards/margins": 1.343997444709142, "rewards/rejected": -1.3012853463490803, "step": 9056 }, { "epoch": 0.4800572443219463, "grad_norm": 41.0, "kl": 0.1761636734008789, "learning_rate": 5e-07, "logits/chosen": -48106074.666666664, "logits/rejected": -49246032.0, "logps/chosen": -224.8929239908854, "logps/rejected": -157.2004150390625, "loss": 0.2268, "rewards/chosen": 0.4626740614573161, "rewards/margins": 2.363481823603312, "rewards/rejected": -1.900807762145996, "step": 9057 }, { "epoch": 0.48011024832374843, "grad_norm": 87.5, "kl": 2.607328414916992, "learning_rate": 5e-07, "logits/chosen": -47933862.4, "logits/rejected": -20249845.333333332, "logps/chosen": -631.536669921875, "logps/rejected": -216.4801025390625, "loss": 0.2708, "rewards/chosen": 1.3913561820983886, "rewards/margins": 2.7987935066223146, "rewards/rejected": -1.4074373245239258, "step": 9058 }, { "epoch": 0.48016325232555057, "grad_norm": 41.0, "kl": 1.2708587646484375, "learning_rate": 5e-07, "logits/chosen": -23084044.8, "logits/rejected": -14118688.0, "logps/chosen": -274.790478515625, "logps/rejected": -188.445556640625, "loss": 0.3869, "rewards/chosen": 0.22847251892089843, "rewards/margins": 1.8228999455769856, "rewards/rejected": -1.5944274266560872, "step": 9059 }, { "epoch": 0.4802162563273527, "grad_norm": 34.0, "kl": 0.5515823364257812, "learning_rate": 5e-07, "logits/chosen": -16380781.333333334, "logits/rejected": -4363515.2, "logps/chosen": -104.5732421875, "logps/rejected": -87.82945556640625, "loss": 0.2753, "rewards/chosen": -0.20621923605600992, "rewards/margins": 2.141478975613912, "rewards/rejected": -2.347698211669922, "step": 9060 }, { "epoch": 0.48026926032915485, "grad_norm": 61.0, "kl": 0.7402858734130859, "learning_rate": 5e-07, "logits/chosen": -66572288.0, "logits/rejected": -28230008.0, "logps/chosen": -350.4793701171875, "logps/rejected": -428.1335856119792, "loss": 0.3769, "rewards/chosen": 0.18519697189331055, "rewards/margins": 1.5931643803914388, "rewards/rejected": -1.4079674084981282, "step": 9061 }, { "epoch": 0.480322264330957, "grad_norm": 76.0, "kl": 3.3685989379882812, "learning_rate": 5e-07, "logits/chosen": -33298938.666666668, "logits/rejected": -32237606.0, "logps/chosen": -335.673095703125, "logps/rejected": -430.7753601074219, "loss": 0.3656, "rewards/chosen": 0.645712415377299, "rewards/margins": 2.7447200218836465, "rewards/rejected": -2.0990076065063477, "step": 9062 }, { "epoch": 0.4803752683327591, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14537638.0, "logits/rejected": -10063568.0, "logps/chosen": -163.5949249267578, "logps/rejected": -211.4462890625, "loss": 0.2563, "rewards/chosen": 0.44036024808883667, "rewards/margins": 2.661648452281952, "rewards/rejected": -2.2212882041931152, "step": 9063 }, { "epoch": 0.48042827233456126, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17143002.0, "logits/rejected": -17793052.0, "logps/chosen": -346.5256652832031, "logps/rejected": -211.07061767578125, "loss": 0.2987, "rewards/chosen": 0.38513270020484924, "rewards/margins": 2.1208676397800446, "rewards/rejected": -1.7357349395751953, "step": 9064 }, { "epoch": 0.4804812763363634, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38964404.0, "logits/rejected": -39208316.0, "logps/chosen": -229.80386352539062, "logps/rejected": -181.6876220703125, "loss": 0.3838, "rewards/chosen": 0.02148691564798355, "rewards/margins": 1.027625359594822, "rewards/rejected": -1.0061384439468384, "step": 9065 }, { "epoch": 0.48053428033816553, "grad_norm": 46.75, "kl": 3.4934234619140625, "learning_rate": 5e-07, "logits/chosen": -9658004.666666666, "logits/rejected": 261586512.0, "logps/chosen": -215.96883138020834, "logps/rejected": -468.38653564453125, "loss": 0.3818, "rewards/chosen": 0.4613770643870036, "rewards/margins": 3.4007369677225747, "rewards/rejected": -2.9393599033355713, "step": 9066 }, { "epoch": 0.48058728433996767, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33166828.8, "logits/rejected": -52189946.666666664, "logps/chosen": -313.606591796875, "logps/rejected": -390.7550048828125, "loss": 0.2554, "rewards/chosen": 0.5942109107971192, "rewards/margins": 3.1965191841125487, "rewards/rejected": -2.6023082733154297, "step": 9067 }, { "epoch": 0.4806402883417698, "grad_norm": 41.25, "kl": 0.06380558013916016, "learning_rate": 5e-07, "logits/chosen": -15215664.0, "logits/rejected": -22687029.333333332, "logps/chosen": -163.1091796875, "logps/rejected": -810.735595703125, "loss": 0.3384, "rewards/chosen": 0.14338570833206177, "rewards/margins": 3.6216982007026672, "rewards/rejected": -3.4783124923706055, "step": 9068 }, { "epoch": 0.48069329234357194, "grad_norm": 60.25, "kl": 0.16334152221679688, "learning_rate": 5e-07, "logits/chosen": -58910362.666666664, "logits/rejected": -2457435.0, "logps/chosen": -395.3564453125, "logps/rejected": -63.358673095703125, "loss": 0.3501, "rewards/chosen": 0.45428188641866046, "rewards/margins": 1.870282252629598, "rewards/rejected": -1.4160003662109375, "step": 9069 }, { "epoch": 0.4807462963453741, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1250770.3333333333, "logits/rejected": -15370093.0, "logps/chosen": -348.7417805989583, "logps/rejected": -400.02789306640625, "loss": 0.4142, "rewards/chosen": 0.024689396222432453, "rewards/margins": 1.5294410785039265, "rewards/rejected": -1.5047516822814941, "step": 9070 }, { "epoch": 0.4807993003471762, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14303730.0, "logits/rejected": -12955079.0, "logps/chosen": -187.09988403320312, "logps/rejected": -444.12921142578125, "loss": 0.3319, "rewards/chosen": -0.1842365860939026, "rewards/margins": 2.1423961520195007, "rewards/rejected": -2.3266327381134033, "step": 9071 }, { "epoch": 0.48085230434897835, "grad_norm": 51.25, "kl": 0.2001171112060547, "learning_rate": 5e-07, "logits/chosen": 44127357.333333336, "logits/rejected": -12196368.0, "logps/chosen": -393.7450358072917, "logps/rejected": -261.56123046875, "loss": 0.2487, "rewards/chosen": 0.46727486451466876, "rewards/margins": 2.4948534091313683, "rewards/rejected": -2.0275785446166994, "step": 9072 }, { "epoch": 0.4809053083507805, "grad_norm": 32.0, "kl": 1.14093017578125, "learning_rate": 5e-07, "logits/chosen": -3042815.0, "logits/rejected": -27498008.0, "logps/chosen": -258.3932800292969, "logps/rejected": -324.63671875, "loss": 0.3004, "rewards/chosen": 0.8852013945579529, "rewards/margins": 2.8629950881004333, "rewards/rejected": -1.9777936935424805, "step": 9073 }, { "epoch": 0.48095831235258263, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51202176.0, "logits/rejected": -13207336.0, "logps/chosen": -449.8996175130208, "logps/rejected": -344.2595703125, "loss": 0.2193, "rewards/chosen": 0.41279908021291095, "rewards/margins": 2.7573936541875206, "rewards/rejected": -2.3445945739746095, "step": 9074 }, { "epoch": 0.48101131635438477, "grad_norm": 51.75, "kl": 1.2315750122070312, "learning_rate": 5e-07, "logits/chosen": 1546012.2, "logits/rejected": -67044762.666666664, "logps/chosen": -346.28203125, "logps/rejected": -555.8626708984375, "loss": 0.2881, "rewards/chosen": 0.4405199527740479, "rewards/margins": 3.015628480911255, "rewards/rejected": -2.575108528137207, "step": 9075 }, { "epoch": 0.4810643203561869, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30702044.8, "logits/rejected": -5428542.666666667, "logps/chosen": -354.40849609375, "logps/rejected": -167.03218587239584, "loss": 0.2861, "rewards/chosen": 0.6927786350250245, "rewards/margins": 2.3293495655059813, "rewards/rejected": -1.636570930480957, "step": 9076 }, { "epoch": 0.48111732435798904, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23260456.0, "logits/rejected": -44864501.333333336, "logps/chosen": -198.50177001953125, "logps/rejected": -379.4293212890625, "loss": 0.1877, "rewards/chosen": 0.871673583984375, "rewards/margins": 2.845078945159912, "rewards/rejected": -1.973405361175537, "step": 9077 }, { "epoch": 0.4811703283597912, "grad_norm": 45.25, "kl": 0.10454940795898438, "learning_rate": 5e-07, "logits/chosen": -21815602.666666668, "logits/rejected": -5096818.4, "logps/chosen": -329.4508056640625, "logps/rejected": -116.0937255859375, "loss": 0.2068, "rewards/chosen": 0.9528356393178304, "rewards/margins": 3.1911918481191, "rewards/rejected": -2.2383562088012696, "step": 9078 }, { "epoch": 0.4812233323615933, "grad_norm": 61.75, "kl": 0.28482818603515625, "learning_rate": 5e-07, "logits/chosen": -8300682.4, "logits/rejected": -10671994.0, "logps/chosen": -290.211474609375, "logps/rejected": -226.17303466796875, "loss": 0.3525, "rewards/chosen": 0.233970308303833, "rewards/margins": 2.7004942735036215, "rewards/rejected": -2.4665239651997886, "step": 9079 }, { "epoch": 0.48127633636339545, "grad_norm": 51.5, "kl": 1.96331787109375, "learning_rate": 5e-07, "logits/chosen": -17960084.0, "logits/rejected": -38105628.0, "logps/chosen": -418.44720458984375, "logps/rejected": -386.1953125, "loss": 0.2174, "rewards/chosen": 1.5357170104980469, "rewards/margins": 3.2518391609191895, "rewards/rejected": -1.7161221504211426, "step": 9080 }, { "epoch": 0.4813293403651976, "grad_norm": 44.25, "kl": 1.0057792663574219, "learning_rate": 5e-07, "logits/chosen": -11968314.0, "logits/rejected": -3277984.5, "logps/chosen": -188.19920349121094, "logps/rejected": -249.89181518554688, "loss": 0.299, "rewards/chosen": 0.24384979903697968, "rewards/margins": 2.3348501175642014, "rewards/rejected": -2.0910003185272217, "step": 9081 }, { "epoch": 0.4813823443669997, "grad_norm": 54.0, "kl": 0.7641773223876953, "learning_rate": 5e-07, "logits/chosen": -15302918.4, "logits/rejected": -15640221.333333334, "logps/chosen": -303.895166015625, "logps/rejected": -207.7837117513021, "loss": 0.2998, "rewards/chosen": 0.404587984085083, "rewards/margins": 3.159637371699015, "rewards/rejected": -2.755049387613932, "step": 9082 }, { "epoch": 0.48143534836880186, "grad_norm": 47.0, "kl": 0.6043834686279297, "learning_rate": 5e-07, "logits/chosen": -28768762.0, "logits/rejected": -14772000.0, "logps/chosen": -221.79310607910156, "logps/rejected": -150.2731730143229, "loss": 0.293, "rewards/chosen": -0.2603416442871094, "rewards/margins": 1.2608251571655273, "rewards/rejected": -1.5211668014526367, "step": 9083 }, { "epoch": 0.481488352370604, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36075120.0, "logits/rejected": -20722712.0, "logps/chosen": -377.84686279296875, "logps/rejected": -277.6564025878906, "loss": 0.3084, "rewards/chosen": -0.17812120914459229, "rewards/margins": 2.632158637046814, "rewards/rejected": -2.8102798461914062, "step": 9084 }, { "epoch": 0.48154135637240614, "grad_norm": 61.25, "kl": 2.731198310852051, "learning_rate": 5e-07, "logits/chosen": -53016620.0, "logits/rejected": -5539628.0, "logps/chosen": -457.9363708496094, "logps/rejected": -165.11019897460938, "loss": 0.3593, "rewards/chosen": 0.14419777691364288, "rewards/margins": 1.545044019818306, "rewards/rejected": -1.400846242904663, "step": 9085 }, { "epoch": 0.4815943603742083, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58202160.0, "logits/rejected": -31507636.0, "logps/chosen": -573.54248046875, "logps/rejected": -296.8929138183594, "loss": 0.2245, "rewards/chosen": 1.360791802406311, "rewards/margins": 3.559291958808899, "rewards/rejected": -2.198500156402588, "step": 9086 }, { "epoch": 0.4816473643760104, "grad_norm": 45.75, "kl": 1.8293380737304688, "learning_rate": 5e-07, "logits/chosen": -60243141.333333336, "logits/rejected": -46419376.0, "logps/chosen": -106.35369873046875, "logps/rejected": -190.030029296875, "loss": 0.4144, "rewards/chosen": 0.20518372456232706, "rewards/margins": 0.7915219585100809, "rewards/rejected": -0.5863382339477539, "step": 9087 }, { "epoch": 0.48170036837781255, "grad_norm": 62.75, "kl": 2.0761585235595703, "learning_rate": 5e-07, "logits/chosen": 4703122.4, "logits/rejected": -23363216.0, "logps/chosen": -276.3712158203125, "logps/rejected": -198.0998738606771, "loss": 0.4062, "rewards/chosen": -0.02654496431350708, "rewards/margins": 2.1848566571871437, "rewards/rejected": -2.211401621500651, "step": 9088 }, { "epoch": 0.4817533723796147, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5234848.0, "logits/rejected": -24136048.0, "logps/chosen": -245.830517578125, "logps/rejected": -406.1647135416667, "loss": 0.2764, "rewards/chosen": 0.6114401340484619, "rewards/margins": 2.7003440380096437, "rewards/rejected": -2.0889039039611816, "step": 9089 }, { "epoch": 0.4818063763814168, "grad_norm": 45.5, "kl": 0.5499439239501953, "learning_rate": 5e-07, "logits/chosen": -16594060.0, "logits/rejected": -31986580.0, "logps/chosen": -255.63021850585938, "logps/rejected": -222.09576416015625, "loss": 0.2366, "rewards/chosen": 0.489865779876709, "rewards/margins": 3.310746669769287, "rewards/rejected": -2.820880889892578, "step": 9090 }, { "epoch": 0.48185938038321896, "grad_norm": 50.5, "kl": 1.6623115539550781, "learning_rate": 5e-07, "logits/chosen": -26184790.4, "logits/rejected": -6101777.333333333, "logps/chosen": -292.62734375, "logps/rejected": -277.22296142578125, "loss": 0.361, "rewards/chosen": 0.0978444755077362, "rewards/margins": 2.9642366111278533, "rewards/rejected": -2.866392135620117, "step": 9091 }, { "epoch": 0.48191238438502104, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19328598.666666668, "logits/rejected": 1059728.0, "logps/chosen": -260.94101969401044, "logps/rejected": -72.48641967773438, "loss": 0.3419, "rewards/chosen": 0.4863894780476888, "rewards/margins": 2.278846581776937, "rewards/rejected": -1.792457103729248, "step": 9092 }, { "epoch": 0.4819653883868232, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70530490.66666667, "logits/rejected": -25903451.2, "logps/chosen": -441.6926676432292, "logps/rejected": -256.1643310546875, "loss": 0.1885, "rewards/chosen": 0.6645731925964355, "rewards/margins": 3.2047688484191896, "rewards/rejected": -2.540195655822754, "step": 9093 }, { "epoch": 0.4820183923886253, "grad_norm": 48.75, "kl": 2.6122331619262695, "learning_rate": 5e-07, "logits/chosen": -38507784.0, "logits/rejected": -1720172.5, "logps/chosen": -1030.790771484375, "logps/rejected": -212.3380584716797, "loss": 0.2306, "rewards/chosen": 1.2745273113250732, "rewards/margins": 3.0829129219055176, "rewards/rejected": -1.8083856105804443, "step": 9094 }, { "epoch": 0.48207139639042745, "grad_norm": 45.75, "kl": 1.3934383392333984, "learning_rate": 5e-07, "logits/chosen": -35040272.0, "logits/rejected": -50969724.0, "logps/chosen": -232.12155151367188, "logps/rejected": -429.74322509765625, "loss": 0.2064, "rewards/chosen": 1.132076621055603, "rewards/margins": 3.290118098258972, "rewards/rejected": -2.158041477203369, "step": 9095 }, { "epoch": 0.4821244003922296, "grad_norm": 43.5, "kl": 0.337799072265625, "learning_rate": 5e-07, "logits/chosen": 1744675.8, "logits/rejected": -12479512.0, "logps/chosen": -231.9115234375, "logps/rejected": -216.75518798828125, "loss": 0.3203, "rewards/chosen": 0.3014386653900146, "rewards/margins": 2.335637426376343, "rewards/rejected": -2.034198760986328, "step": 9096 }, { "epoch": 0.48217740439403173, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18570120.0, "logits/rejected": -77487880.0, "logps/chosen": -389.81280517578125, "logps/rejected": -216.49465942382812, "loss": 0.2825, "rewards/chosen": 0.5068778991699219, "rewards/margins": 2.0374759435653687, "rewards/rejected": -1.5305980443954468, "step": 9097 }, { "epoch": 0.48223040839583386, "grad_norm": 51.0, "kl": 1.6084680557250977, "learning_rate": 5e-07, "logits/chosen": -40209721.6, "logits/rejected": -226421.5, "logps/chosen": -173.733984375, "logps/rejected": -211.05155436197916, "loss": 0.3648, "rewards/chosen": 0.2220907688140869, "rewards/margins": 2.6365852832794188, "rewards/rejected": -2.414494514465332, "step": 9098 }, { "epoch": 0.482283412397636, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72508102.4, "logits/rejected": -29719794.666666668, "logps/chosen": -305.937060546875, "logps/rejected": -353.0078125, "loss": 0.3029, "rewards/chosen": 0.22794220447540284, "rewards/margins": 3.006315286954244, "rewards/rejected": -2.7783730824788413, "step": 9099 }, { "epoch": 0.48233641639943814, "grad_norm": 58.5, "kl": 1.5058879852294922, "learning_rate": 5e-07, "logits/chosen": -27340201.6, "logits/rejected": -1378860.4166666667, "logps/chosen": -434.693701171875, "logps/rejected": -249.97648111979166, "loss": 0.2972, "rewards/chosen": 0.48987274169921874, "rewards/margins": 2.9300042470296224, "rewards/rejected": -2.440131505330404, "step": 9100 }, { "epoch": 0.4823894204012403, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22044785.333333332, "logits/rejected": -55236889.6, "logps/chosen": -282.1874186197917, "logps/rejected": -312.1080078125, "loss": 0.2578, "rewards/chosen": 0.4470291535059611, "rewards/margins": 2.3486891190210977, "rewards/rejected": -1.9016599655151367, "step": 9101 }, { "epoch": 0.4824424244030424, "grad_norm": 38.5, "kl": 1.7171173095703125, "learning_rate": 5e-07, "logits/chosen": -28733898.666666668, "logits/rejected": -82608716.8, "logps/chosen": -358.9292399088542, "logps/rejected": -355.125830078125, "loss": 0.1465, "rewards/chosen": 1.468319257100423, "rewards/margins": 4.5022839864095054, "rewards/rejected": -3.033964729309082, "step": 9102 }, { "epoch": 0.48249542840484455, "grad_norm": 45.25, "kl": 0.5277652740478516, "learning_rate": 5e-07, "logits/chosen": -77393568.0, "logits/rejected": -36785904.0, "logps/chosen": -190.03515625, "logps/rejected": -281.32346598307294, "loss": 0.3287, "rewards/chosen": 0.13332772254943848, "rewards/margins": 2.674124002456665, "rewards/rejected": -2.5407962799072266, "step": 9103 }, { "epoch": 0.4825484324066467, "grad_norm": 42.5, "kl": 2.8702735900878906, "learning_rate": 5e-07, "logits/chosen": -7712496.8, "logits/rejected": -3469804.0, "logps/chosen": -287.298486328125, "logps/rejected": -193.68180338541666, "loss": 0.3403, "rewards/chosen": 0.41430816650390623, "rewards/margins": 2.4733477592468263, "rewards/rejected": -2.05903959274292, "step": 9104 }, { "epoch": 0.4826014364084488, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24451029.333333332, "logits/rejected": -20668393.6, "logps/chosen": -230.030029296875, "logps/rejected": -254.33115234375, "loss": 0.2025, "rewards/chosen": 0.5898305575052897, "rewards/margins": 3.1293628374735514, "rewards/rejected": -2.539532279968262, "step": 9105 }, { "epoch": 0.48265444041025096, "grad_norm": 64.5, "kl": 1.3148250579833984, "learning_rate": 5e-07, "logits/chosen": -35537859.2, "logits/rejected": -22602514.666666668, "logps/chosen": -318.9370361328125, "logps/rejected": -269.9499104817708, "loss": 0.3084, "rewards/chosen": 0.3449440002441406, "rewards/margins": 2.7985973358154297, "rewards/rejected": -2.453653335571289, "step": 9106 }, { "epoch": 0.4827074444120531, "grad_norm": 51.25, "kl": 0.7750682830810547, "learning_rate": 5e-07, "logits/chosen": -8311989.5, "logits/rejected": -3202809.0, "logps/chosen": -274.81781005859375, "logps/rejected": -375.8234456380208, "loss": 0.2774, "rewards/chosen": 0.5000995397567749, "rewards/margins": 1.9787448644638062, "rewards/rejected": -1.4786453247070312, "step": 9107 }, { "epoch": 0.48276044841385524, "grad_norm": 52.0, "kl": 0.15636062622070312, "learning_rate": 5e-07, "logits/chosen": -43444070.4, "logits/rejected": 855982.6666666666, "logps/chosen": -306.2925048828125, "logps/rejected": -136.47455851236978, "loss": 0.4097, "rewards/chosen": -0.08014329075813294, "rewards/margins": 1.4116622626781463, "rewards/rejected": -1.4918055534362793, "step": 9108 }, { "epoch": 0.4828134524156574, "grad_norm": 45.25, "kl": 1.8187255859375, "learning_rate": 5e-07, "logits/chosen": -2754720.0, "logits/rejected": -4460651.6, "logps/chosen": -257.0978597005208, "logps/rejected": -521.985205078125, "loss": 0.2502, "rewards/chosen": 0.4760422706604004, "rewards/margins": 2.463766002655029, "rewards/rejected": -1.9877237319946288, "step": 9109 }, { "epoch": 0.4828664564174595, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14622372.0, "logits/rejected": -15216782.4, "logps/chosen": -611.3605143229166, "logps/rejected": -405.4267578125, "loss": 0.1281, "rewards/chosen": 1.4482666651407878, "rewards/margins": 3.9122381846110024, "rewards/rejected": -2.4639715194702148, "step": 9110 }, { "epoch": 0.48291946041926165, "grad_norm": 32.25, "kl": 0.3576822280883789, "learning_rate": 5e-07, "logits/chosen": -5799885.333333333, "logits/rejected": -31282780.8, "logps/chosen": -376.9887288411458, "logps/rejected": -217.6248291015625, "loss": 0.1913, "rewards/chosen": 1.5715748469034831, "rewards/margins": 3.388213602701823, "rewards/rejected": -1.8166387557983399, "step": 9111 }, { "epoch": 0.4829724644210638, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33324357.333333332, "logits/rejected": -29678240.0, "logps/chosen": -283.2148844401042, "logps/rejected": -333.936279296875, "loss": 0.2434, "rewards/chosen": 0.40171802043914795, "rewards/margins": 2.4927260160446165, "rewards/rejected": -2.0910079956054686, "step": 9112 }, { "epoch": 0.4830254684228659, "grad_norm": 48.75, "kl": 1.179840087890625, "learning_rate": 5e-07, "logits/chosen": -19971580.8, "logits/rejected": -4675409.666666667, "logps/chosen": -244.8178955078125, "logps/rejected": -172.5941162109375, "loss": 0.4429, "rewards/chosen": -0.330355167388916, "rewards/margins": 1.1976598103841145, "rewards/rejected": -1.5280149777730305, "step": 9113 }, { "epoch": 0.48307847242466806, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3488139.5, "logits/rejected": -14275520.0, "logps/chosen": -211.7474365234375, "logps/rejected": -208.86629231770834, "loss": 0.3151, "rewards/chosen": -0.6489059329032898, "rewards/margins": 0.9109679659207661, "rewards/rejected": -1.559873898824056, "step": 9114 }, { "epoch": 0.4831314764264702, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36261952.0, "logits/rejected": -37031500.0, "logps/chosen": -310.0071105957031, "logps/rejected": -565.4337158203125, "loss": 0.3003, "rewards/chosen": -0.13221770524978638, "rewards/margins": 2.660548150539398, "rewards/rejected": -2.7927658557891846, "step": 9115 }, { "epoch": 0.48318448042827233, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40643480.0, "logits/rejected": -21762553.333333332, "logps/chosen": -291.8127136230469, "logps/rejected": -419.0113525390625, "loss": 0.1987, "rewards/chosen": 0.3502296507358551, "rewards/margins": 2.645117829243342, "rewards/rejected": -2.294888178507487, "step": 9116 }, { "epoch": 0.48323748443007447, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9116573.333333334, "logits/rejected": -25786822.4, "logps/chosen": -153.376953125, "logps/rejected": -160.55111083984374, "loss": 0.3658, "rewards/chosen": -0.10813707113265991, "rewards/margins": 1.1005326867103578, "rewards/rejected": -1.2086697578430177, "step": 9117 }, { "epoch": 0.4832904884318766, "grad_norm": 28.5, "kl": 0.6639213562011719, "learning_rate": 5e-07, "logits/chosen": -2396874.5, "logits/rejected": -33536746.666666668, "logps/chosen": -171.43203735351562, "logps/rejected": -183.0727335611979, "loss": 0.1337, "rewards/chosen": 1.9351052045822144, "rewards/margins": 3.846574505170186, "rewards/rejected": -1.911469300587972, "step": 9118 }, { "epoch": 0.48334349243367875, "grad_norm": 46.75, "kl": 0.44875144958496094, "learning_rate": 5e-07, "logits/chosen": -20813069.333333332, "logits/rejected": -35378536.0, "logps/chosen": -148.01553344726562, "logps/rejected": -424.0055847167969, "loss": 0.34, "rewards/chosen": 0.2786367932955424, "rewards/margins": 4.1634183923403425, "rewards/rejected": -3.8847815990448, "step": 9119 }, { "epoch": 0.4833964964354809, "grad_norm": 57.0, "kl": 1.2181329727172852, "learning_rate": 5e-07, "logits/chosen": -39842396.8, "logits/rejected": -14366914.666666666, "logps/chosen": -658.65791015625, "logps/rejected": -200.9959716796875, "loss": 0.275, "rewards/chosen": 1.5661986351013184, "rewards/margins": 2.9639658610026043, "rewards/rejected": -1.3977672259012859, "step": 9120 }, { "epoch": 0.483449500437283, "grad_norm": 49.5, "kl": 2.2443771362304688, "learning_rate": 5e-07, "logits/chosen": -75610704.0, "logits/rejected": 5573525.0, "logps/chosen": -627.6505737304688, "logps/rejected": -150.57138061523438, "loss": 0.2112, "rewards/chosen": 1.391082525253296, "rewards/margins": 3.252722978591919, "rewards/rejected": -1.861640453338623, "step": 9121 }, { "epoch": 0.48350250443908516, "grad_norm": 40.75, "kl": 0.9205818176269531, "learning_rate": 5e-07, "logits/chosen": -4267753.666666667, "logits/rejected": -4307936.8, "logps/chosen": -257.2465413411458, "logps/rejected": -254.99501953125, "loss": 0.3153, "rewards/chosen": 0.1665454904238383, "rewards/margins": 1.6978498498598735, "rewards/rejected": -1.5313043594360352, "step": 9122 }, { "epoch": 0.4835555084408873, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19935606.666666668, "logits/rejected": -22834860.8, "logps/chosen": -641.080078125, "logps/rejected": -463.10546875, "loss": 0.1807, "rewards/chosen": 0.8513151804606119, "rewards/margins": 3.891072336832682, "rewards/rejected": -3.0397571563720702, "step": 9123 }, { "epoch": 0.48360851244268943, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6600018.5, "logits/rejected": -60557800.0, "logps/chosen": -323.30072021484375, "logps/rejected": -281.1870422363281, "loss": 0.2518, "rewards/chosen": 0.2518118917942047, "rewards/margins": 3.1590122282505035, "rewards/rejected": -2.907200336456299, "step": 9124 }, { "epoch": 0.48366151644449157, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18513130.666666668, "logits/rejected": -17508900.0, "logps/chosen": -221.42032877604166, "logps/rejected": -198.27804565429688, "loss": 0.3316, "rewards/chosen": 0.39013830820719403, "rewards/margins": 2.8739611307779946, "rewards/rejected": -2.483822822570801, "step": 9125 }, { "epoch": 0.4837145204462937, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31314349.333333332, "logits/rejected": -10434438.4, "logps/chosen": -471.134033203125, "logps/rejected": -290.37890625, "loss": 0.2115, "rewards/chosen": 0.9051717122395834, "rewards/margins": 2.7041375478108725, "rewards/rejected": -1.798965835571289, "step": 9126 }, { "epoch": 0.48376752444809584, "grad_norm": 58.0, "kl": 0.38117218017578125, "learning_rate": 5e-07, "logits/chosen": -66370568.0, "logits/rejected": -8197813.0, "logps/chosen": -459.64208984375, "logps/rejected": -443.4910888671875, "loss": 0.2843, "rewards/chosen": 0.3448028564453125, "rewards/margins": 2.424569606781006, "rewards/rejected": -2.0797667503356934, "step": 9127 }, { "epoch": 0.483820528449898, "grad_norm": 64.0, "kl": 2.3296871185302734, "learning_rate": 5e-07, "logits/chosen": -16192156.0, "logits/rejected": -39092436.0, "logps/chosen": -668.7566731770834, "logps/rejected": -494.02777099609375, "loss": 0.3024, "rewards/chosen": 0.781022310256958, "rewards/margins": 3.111421823501587, "rewards/rejected": -2.330399513244629, "step": 9128 }, { "epoch": 0.4838735324517001, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11721568.0, "logits/rejected": -36788224.0, "logps/chosen": -168.41077677408853, "logps/rejected": -426.79306640625, "loss": 0.2426, "rewards/chosen": 0.32841893037160236, "rewards/margins": 2.409804733594259, "rewards/rejected": -2.0813858032226564, "step": 9129 }, { "epoch": 0.48392653645350225, "grad_norm": 41.0, "kl": 0.40020179748535156, "learning_rate": 5e-07, "logits/chosen": -22210298.666666668, "logits/rejected": -26063104.0, "logps/chosen": -339.22629801432294, "logps/rejected": -353.475830078125, "loss": 0.1952, "rewards/chosen": 0.7956497669219971, "rewards/margins": 4.106437730789184, "rewards/rejected": -3.3107879638671873, "step": 9130 }, { "epoch": 0.4839795404553044, "grad_norm": 55.0, "kl": 0.48355865478515625, "learning_rate": 5e-07, "logits/chosen": 916786.25, "logits/rejected": -29885464.0, "logps/chosen": -367.8962097167969, "logps/rejected": -284.6222229003906, "loss": 0.2982, "rewards/chosen": 0.08002547919750214, "rewards/margins": 2.35998897254467, "rewards/rejected": -2.279963493347168, "step": 9131 }, { "epoch": 0.48403254445710653, "grad_norm": 39.75, "kl": 1.3964805603027344, "learning_rate": 5e-07, "logits/chosen": -17467436.0, "logits/rejected": -49307360.0, "logps/chosen": -203.24044799804688, "logps/rejected": -364.8283284505208, "loss": 0.2041, "rewards/chosen": 1.560869574546814, "rewards/margins": 3.498969594637553, "rewards/rejected": -1.938100020090739, "step": 9132 }, { "epoch": 0.48408554845890867, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48782396.0, "logits/rejected": -13983907.0, "logps/chosen": -256.43231201171875, "logps/rejected": -222.5647430419922, "loss": 0.2415, "rewards/chosen": 0.29323121905326843, "rewards/margins": 3.406126946210861, "rewards/rejected": -3.1128957271575928, "step": 9133 }, { "epoch": 0.4841385524607108, "grad_norm": 39.0, "kl": 2.6786460876464844, "learning_rate": 5e-07, "logits/chosen": -30448405.333333332, "logits/rejected": -35858361.6, "logps/chosen": -189.30928548177084, "logps/rejected": -344.281396484375, "loss": 0.2978, "rewards/chosen": 0.3725968599319458, "rewards/margins": 2.330293393135071, "rewards/rejected": -1.957696533203125, "step": 9134 }, { "epoch": 0.48419155646251294, "grad_norm": 46.5, "kl": 0.4340362548828125, "learning_rate": 5e-07, "logits/chosen": -15493744.0, "logits/rejected": -17819188.8, "logps/chosen": -480.5208333333333, "logps/rejected": -262.994189453125, "loss": 0.2014, "rewards/chosen": 1.5510478019714355, "rewards/margins": 2.86458044052124, "rewards/rejected": -1.3135326385498047, "step": 9135 }, { "epoch": 0.4842445604643151, "grad_norm": 40.5, "kl": 0.21475458145141602, "learning_rate": 5e-07, "logits/chosen": -8733973.0, "logits/rejected": -5109176.0, "logps/chosen": -163.3279571533203, "logps/rejected": -166.84475708007812, "loss": 0.2154, "rewards/chosen": 0.950833797454834, "rewards/margins": 2.9346108436584473, "rewards/rejected": -1.9837770462036133, "step": 9136 }, { "epoch": 0.4842975644661172, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42260652.8, "logits/rejected": -15714377.333333334, "logps/chosen": -368.825244140625, "logps/rejected": -279.82867431640625, "loss": 0.3739, "rewards/chosen": 0.03407333195209503, "rewards/margins": 2.0162127604087194, "rewards/rejected": -1.9821394284566243, "step": 9137 }, { "epoch": 0.48435056846791935, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21743493.333333332, "logits/rejected": -40983968.0, "logps/chosen": -278.20298258463544, "logps/rejected": -527.310400390625, "loss": 0.267, "rewards/chosen": 0.01634172350168228, "rewards/margins": 3.1155984297394754, "rewards/rejected": -3.099256706237793, "step": 9138 }, { "epoch": 0.4844035724697215, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79296869.33333333, "logits/rejected": -14757560.0, "logps/chosen": -219.4957275390625, "logps/rejected": -531.9263916015625, "loss": 0.4457, "rewards/chosen": -0.31671706835428876, "rewards/margins": 2.334374030431112, "rewards/rejected": -2.6510910987854004, "step": 9139 }, { "epoch": 0.4844565764715236, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37988764.0, "logits/rejected": -18858214.85714286, "logps/chosen": -188.29031372070312, "logps/rejected": -314.19911411830356, "loss": 0.2158, "rewards/chosen": 0.20265503227710724, "rewards/margins": 2.5964334245238985, "rewards/rejected": -2.3937783922467912, "step": 9140 }, { "epoch": 0.48450958047332576, "grad_norm": 42.5, "kl": 1.2780647277832031, "learning_rate": 5e-07, "logits/chosen": -9873206.666666666, "logits/rejected": -24559886.4, "logps/chosen": -182.3524373372396, "logps/rejected": -248.2943359375, "loss": 0.2416, "rewards/chosen": 0.4392727216084798, "rewards/margins": 2.458646615346273, "rewards/rejected": -2.019373893737793, "step": 9141 }, { "epoch": 0.4845625844751279, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32649518.0, "logits/rejected": -30263933.333333332, "logps/chosen": -438.8100891113281, "logps/rejected": -226.8594970703125, "loss": 0.2487, "rewards/chosen": 0.346383661031723, "rewards/margins": 2.000567525625229, "rewards/rejected": -1.6541838645935059, "step": 9142 }, { "epoch": 0.48461558847693, "grad_norm": 60.25, "kl": 1.3815231323242188, "learning_rate": 5e-07, "logits/chosen": -38801830.4, "logits/rejected": -37320816.0, "logps/chosen": -579.09755859375, "logps/rejected": -401.749755859375, "loss": 0.3235, "rewards/chosen": 0.6528509616851806, "rewards/margins": 3.0561312198638917, "rewards/rejected": -2.403280258178711, "step": 9143 }, { "epoch": 0.4846685924787321, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9055372.8, "logits/rejected": -63472832.0, "logps/chosen": -85.89366455078125, "logps/rejected": -599.0158284505209, "loss": 0.3637, "rewards/chosen": -0.006325769424438477, "rewards/margins": 1.9245010534922282, "rewards/rejected": -1.9308268229166667, "step": 9144 }, { "epoch": 0.48472159648053426, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25841548.0, "logits/rejected": -31546690.666666668, "logps/chosen": -110.57109832763672, "logps/rejected": -201.56622314453125, "loss": 0.249, "rewards/chosen": 0.45042574405670166, "rewards/margins": 2.1378054221471148, "rewards/rejected": -1.6873796780904133, "step": 9145 }, { "epoch": 0.4847746004823364, "grad_norm": 70.0, "kl": 1.1172962188720703, "learning_rate": 5e-07, "logits/chosen": -24283216.0, "logps/chosen": -485.88958740234375, "loss": 0.4068, "rewards/chosen": 0.5786401033401489, "step": 9146 }, { "epoch": 0.48482760448413853, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61635240.0, "logits/rejected": -41396804.0, "logps/chosen": -267.6815185546875, "logps/rejected": -231.80430603027344, "loss": 0.2392, "rewards/chosen": 0.48715782165527344, "rewards/margins": 3.0586447715759277, "rewards/rejected": -2.5714869499206543, "step": 9147 }, { "epoch": 0.48488060848594067, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19682129.333333332, "logits/rejected": -53796947.2, "logps/chosen": -246.68558756510416, "logps/rejected": -361.7650634765625, "loss": 0.1656, "rewards/chosen": 0.8218555450439453, "rewards/margins": 3.6515966415405274, "rewards/rejected": -2.829741096496582, "step": 9148 }, { "epoch": 0.4849336124877428, "grad_norm": 44.75, "kl": 1.7454299926757812, "learning_rate": 5e-07, "logits/chosen": -32300083.2, "logits/rejected": -5090813.0, "logps/chosen": -298.28310546875, "logps/rejected": -369.2286783854167, "loss": 0.2785, "rewards/chosen": 0.7276366233825684, "rewards/margins": 3.994940725962321, "rewards/rejected": -3.2673041025797525, "step": 9149 }, { "epoch": 0.48498661648954494, "grad_norm": 46.0, "kl": 3.860757827758789, "learning_rate": 5e-07, "logits/chosen": -35313864.0, "logits/rejected": -4456987.5, "logps/chosen": -513.7401733398438, "logps/rejected": -186.096435546875, "loss": 0.292, "rewards/chosen": 1.1448357105255127, "rewards/margins": 2.70530104637146, "rewards/rejected": -1.5604653358459473, "step": 9150 }, { "epoch": 0.4850396204913471, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7726268.666666667, "logits/rejected": -42367680.0, "logps/chosen": -174.96466064453125, "logps/rejected": -243.938037109375, "loss": 0.2661, "rewards/chosen": -0.16036962469418845, "rewards/margins": 2.1510534862677253, "rewards/rejected": -2.311423110961914, "step": 9151 }, { "epoch": 0.4850926244931492, "grad_norm": 49.5, "kl": 0.7858657836914062, "learning_rate": 5e-07, "logits/chosen": -16539625.0, "logits/rejected": -25667368.0, "logps/chosen": -184.7366180419922, "logps/rejected": -222.3028106689453, "loss": 0.3079, "rewards/chosen": 0.1732887327671051, "rewards/margins": 2.2396489679813385, "rewards/rejected": -2.0663602352142334, "step": 9152 }, { "epoch": 0.48514562849495135, "grad_norm": 57.25, "kl": 0.8494510650634766, "learning_rate": 5e-07, "logits/chosen": -20366352.0, "logits/rejected": -7883692.5, "logps/chosen": -163.97048950195312, "logps/rejected": -124.54743957519531, "loss": 0.4347, "rewards/chosen": 0.32978888352711994, "rewards/margins": 0.8017938633759816, "rewards/rejected": -0.4720049798488617, "step": 9153 }, { "epoch": 0.4851986324967535, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1448603.875, "logits/rejected": -17921257.14285714, "logps/chosen": -295.0460205078125, "logps/rejected": -355.0354701450893, "loss": 0.1376, "rewards/chosen": 1.4383118152618408, "rewards/margins": 3.944715602057321, "rewards/rejected": -2.50640378679548, "step": 9154 }, { "epoch": 0.4852516364985556, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11204897.333333334, "logits/rejected": -16957740.8, "logps/chosen": -227.93070475260416, "logps/rejected": -387.06259765625, "loss": 0.2398, "rewards/chosen": 0.5968391497929891, "rewards/margins": 2.81983433564504, "rewards/rejected": -2.2229951858520507, "step": 9155 }, { "epoch": 0.48530464050035776, "grad_norm": 49.0, "kl": 0.9217357635498047, "learning_rate": 5e-07, "logits/chosen": -44022057.6, "logits/rejected": -35053781.333333336, "logps/chosen": -130.13974609375, "logps/rejected": -475.6923014322917, "loss": 0.3166, "rewards/chosen": 0.4010735511779785, "rewards/margins": 2.246901766459147, "rewards/rejected": -1.8458282152811687, "step": 9156 }, { "epoch": 0.4853576445021599, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29150704.0, "logits/rejected": -34399283.2, "logps/chosen": -263.9474283854167, "logps/rejected": -296.31416015625, "loss": 0.2842, "rewards/chosen": 0.24238944053649902, "rewards/margins": 2.1729085445404053, "rewards/rejected": -1.9305191040039062, "step": 9157 }, { "epoch": 0.48541064850396204, "grad_norm": 54.0, "kl": 2.7425012588500977, "learning_rate": 5e-07, "logits/chosen": -17930880.0, "logits/rejected": -1492586.0, "logps/chosen": -422.471826171875, "logps/rejected": -105.0115966796875, "loss": 0.3874, "rewards/chosen": 0.9396747589111328, "rewards/margins": 1.9922680695851644, "rewards/rejected": -1.0525933106740315, "step": 9158 }, { "epoch": 0.4854636525057642, "grad_norm": 53.5, "kl": 0.6231956481933594, "learning_rate": 5e-07, "logits/chosen": 1561801.0, "logits/rejected": -43881332.0, "logps/chosen": -391.9028015136719, "logps/rejected": -517.1558227539062, "loss": 0.2671, "rewards/chosen": 0.12234078347682953, "rewards/margins": 3.438933953642845, "rewards/rejected": -3.3165931701660156, "step": 9159 }, { "epoch": 0.4855166565075663, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22987944.0, "logits/rejected": -73043568.0, "logps/chosen": -297.0244140625, "logps/rejected": -405.90869140625, "loss": 0.3039, "rewards/chosen": 0.5782275994618734, "rewards/margins": 2.7269869645436606, "rewards/rejected": -2.148759365081787, "step": 9160 }, { "epoch": 0.48556966050936845, "grad_norm": 43.75, "kl": 1.5607709884643555, "learning_rate": 5e-07, "logits/chosen": -20710020.0, "logits/rejected": -11525324.0, "logps/chosen": -151.45945739746094, "logps/rejected": -261.8961588541667, "loss": 0.234, "rewards/chosen": 0.4873022437095642, "rewards/margins": 2.507985850175222, "rewards/rejected": -2.0206836064656577, "step": 9161 }, { "epoch": 0.4856226645111706, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58134988.0, "logits/rejected": -13524853.333333334, "logps/chosen": -450.56182861328125, "logps/rejected": -315.614013671875, "loss": 0.2301, "rewards/chosen": 0.3559723198413849, "rewards/margins": 2.4438220957914987, "rewards/rejected": -2.087849775950114, "step": 9162 }, { "epoch": 0.4856756685129727, "grad_norm": 59.25, "kl": 0.7664031982421875, "learning_rate": 5e-07, "logits/chosen": -33500309.333333332, "logits/rejected": -19066812.0, "logps/chosen": -367.7460123697917, "logps/rejected": -537.6551513671875, "loss": 0.3281, "rewards/chosen": 0.5448209047317505, "rewards/margins": 3.070322871208191, "rewards/rejected": -2.5255019664764404, "step": 9163 }, { "epoch": 0.48572867251477486, "grad_norm": 41.5, "kl": 0.2757883071899414, "learning_rate": 5e-07, "logits/chosen": -652696.3125, "logits/rejected": -13920042.0, "logps/chosen": -196.34681701660156, "logps/rejected": -74.76007080078125, "loss": 0.372, "rewards/chosen": 0.19875940680503845, "rewards/margins": 1.292376846075058, "rewards/rejected": -1.0936174392700195, "step": 9164 }, { "epoch": 0.485781676516577, "grad_norm": 49.75, "kl": 0.8863229751586914, "learning_rate": 5e-07, "logits/chosen": -46767820.0, "logits/rejected": 9309571.0, "logps/chosen": -337.876220703125, "logps/rejected": -158.33485412597656, "loss": 0.3387, "rewards/chosen": 0.13577136397361755, "rewards/margins": 1.8384660184383392, "rewards/rejected": -1.7026946544647217, "step": 9165 }, { "epoch": 0.48583468051837914, "grad_norm": 41.75, "kl": 1.324411392211914, "learning_rate": 5e-07, "logits/chosen": -19664227.2, "logits/rejected": -28795621.333333332, "logps/chosen": -289.7288818359375, "logps/rejected": -506.3368326822917, "loss": 0.2884, "rewards/chosen": 0.8122617721557617, "rewards/margins": 2.6539559682210285, "rewards/rejected": -1.8416941960652669, "step": 9166 }, { "epoch": 0.4858876845201813, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -20658656.0, "logps/rejected": -222.5861053466797, "loss": 0.1177, "rewards/rejected": -2.5782322883605957, "step": 9167 }, { "epoch": 0.4859406885219834, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44151372.0, "logits/rejected": -29057714.666666668, "logps/chosen": -277.3594665527344, "logps/rejected": -290.6763102213542, "loss": 0.2165, "rewards/chosen": 0.6269668936729431, "rewards/margins": 2.333872576554616, "rewards/rejected": -1.706905682881673, "step": 9168 }, { "epoch": 0.48599369252378555, "grad_norm": 66.5, "kl": 3.613758087158203, "learning_rate": 5e-07, "logits/chosen": -10320424.0, "logits/rejected": -17532866.0, "logps/chosen": -426.2843017578125, "logps/rejected": -284.688720703125, "loss": 0.2893, "rewards/chosen": 0.7204843759536743, "rewards/margins": 2.3867307901382446, "rewards/rejected": -1.6662464141845703, "step": 9169 }, { "epoch": 0.4860466965255877, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15320278.4, "logits/rejected": -54537253.333333336, "logps/chosen": -62.03948974609375, "logps/rejected": -437.1713053385417, "loss": 0.3168, "rewards/chosen": 0.09409214854240418, "rewards/margins": 3.5348708808422087, "rewards/rejected": -3.4407787322998047, "step": 9170 }, { "epoch": 0.4860997005273898, "grad_norm": 67.0, "kl": 0.7716550827026367, "learning_rate": 5e-07, "logits/chosen": -22524288.0, "logits/rejected": -13102337.6, "logps/chosen": -400.428466796875, "logps/rejected": -178.853857421875, "loss": 0.2937, "rewards/chosen": 0.8860503832499186, "rewards/margins": 1.8327579180399578, "rewards/rejected": -0.9467075347900391, "step": 9171 }, { "epoch": 0.48615270452919196, "grad_norm": 40.0, "kl": 3.4827346801757812, "learning_rate": 5e-07, "logits/chosen": -39847884.8, "logits/rejected": -28851818.666666668, "logps/chosen": -345.1697265625, "logps/rejected": -449.2842610677083, "loss": 0.3264, "rewards/chosen": 0.34996671676635743, "rewards/margins": 3.1884228706359865, "rewards/rejected": -2.838456153869629, "step": 9172 }, { "epoch": 0.4862057085309941, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31259404.8, "logits/rejected": -4928442.333333333, "logps/chosen": -219.5990234375, "logps/rejected": -175.0586954752604, "loss": 0.4006, "rewards/chosen": 0.18855857849121094, "rewards/margins": 1.542789141337077, "rewards/rejected": -1.354230562845866, "step": 9173 }, { "epoch": 0.48625871253279623, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28797448.0, "logits/rejected": -37136505.6, "logps/chosen": -254.429931640625, "logps/rejected": -365.180908203125, "loss": 0.248, "rewards/chosen": 0.017678832014401753, "rewards/margins": 2.6015329351027807, "rewards/rejected": -2.583854103088379, "step": 9174 }, { "epoch": 0.48631171653459837, "grad_norm": 47.25, "kl": 0.34491729736328125, "learning_rate": 5e-07, "logits/chosen": -16815890.666666668, "logits/rejected": -19458094.4, "logps/chosen": -170.91162109375, "logps/rejected": -223.862109375, "loss": 0.2625, "rewards/chosen": 0.5040027300516764, "rewards/margins": 2.5770167986551917, "rewards/rejected": -2.0730140686035154, "step": 9175 }, { "epoch": 0.4863647205364005, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32857890.0, "logits/rejected": -4488167.5, "logps/chosen": -317.054931640625, "logps/rejected": -224.66448974609375, "loss": 0.2831, "rewards/chosen": 0.7312738299369812, "rewards/margins": 3.06820410490036, "rewards/rejected": -2.336930274963379, "step": 9176 }, { "epoch": 0.48641772453820264, "grad_norm": 49.0, "kl": 0.9348220825195312, "learning_rate": 5e-07, "logits/chosen": -71413578.66666667, "logits/rejected": -19605939.2, "logps/chosen": -932.88671875, "logps/rejected": -230.85859375, "loss": 0.1823, "rewards/chosen": 1.283980369567871, "rewards/margins": 3.1356834411621093, "rewards/rejected": -1.8517030715942382, "step": 9177 }, { "epoch": 0.4864707285400048, "grad_norm": 50.25, "kl": 0.7103061676025391, "learning_rate": 5e-07, "logits/chosen": 5555232.0, "logits/rejected": -42433685.333333336, "logps/chosen": -519.37021484375, "logps/rejected": -317.83970133463544, "loss": 0.2454, "rewards/chosen": 0.6037903308868409, "rewards/margins": 4.187596368789673, "rewards/rejected": -3.583806037902832, "step": 9178 }, { "epoch": 0.4865237325418069, "grad_norm": 34.25, "kl": 2.1791839599609375, "learning_rate": 5e-07, "logits/chosen": -421452.6, "logits/rejected": -26715557.333333332, "logps/chosen": -150.30333251953124, "logps/rejected": -330.35498046875, "loss": 0.3368, "rewards/chosen": 0.4592099666595459, "rewards/margins": 2.4805738608042396, "rewards/rejected": -2.021363894144694, "step": 9179 }, { "epoch": 0.48657673654360906, "grad_norm": 45.25, "kl": 0.7756462097167969, "learning_rate": 5e-07, "logits/chosen": -27133508.0, "logits/rejected": -18672584.0, "logps/chosen": -255.77230834960938, "logps/rejected": -240.556884765625, "loss": 0.2835, "rewards/chosen": 0.4055839478969574, "rewards/margins": 2.6277423799037933, "rewards/rejected": -2.222158432006836, "step": 9180 }, { "epoch": 0.4866297405454112, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30300332.0, "logits/rejected": -28140826.0, "logps/chosen": -151.76702880859375, "logps/rejected": -464.94940185546875, "loss": 0.2941, "rewards/chosen": -0.004816772881895304, "rewards/margins": 2.839652535971254, "rewards/rejected": -2.8444693088531494, "step": 9181 }, { "epoch": 0.48668274454721333, "grad_norm": 54.5, "kl": 0.6028366088867188, "learning_rate": 5e-07, "logits/chosen": -15520125.714285715, "logits/rejected": -61416800.0, "logps/chosen": -520.2192034040179, "logps/rejected": -333.44573974609375, "loss": 0.3168, "rewards/chosen": 0.8281141008649554, "rewards/margins": 3.643031188419887, "rewards/rejected": -2.8149170875549316, "step": 9182 }, { "epoch": 0.48673574854901547, "grad_norm": 53.75, "kl": 1.6429862976074219, "learning_rate": 5e-07, "logits/chosen": -29893426.666666668, "logits/rejected": -14624422.4, "logps/chosen": -353.2141520182292, "logps/rejected": -463.6759765625, "loss": 0.2427, "rewards/chosen": 0.6521713733673096, "rewards/margins": 2.4241039752960205, "rewards/rejected": -1.771932601928711, "step": 9183 }, { "epoch": 0.4867887525508176, "grad_norm": 73.0, "kl": 2.223787307739258, "learning_rate": 5e-07, "logits/chosen": -66782354.28571428, "logits/rejected": -1788913.875, "logps/chosen": -614.7544642857143, "logps/rejected": -49.18186569213867, "loss": 0.373, "rewards/chosen": 0.7657720020839146, "rewards/margins": 2.617150442940848, "rewards/rejected": -1.8513784408569336, "step": 9184 }, { "epoch": 0.48684175655261974, "grad_norm": 58.75, "kl": 1.7693910598754883, "learning_rate": 5e-07, "logits/chosen": -60815924.0, "logits/rejected": -18050758.666666668, "logps/chosen": -402.23040771484375, "logps/rejected": -243.962158203125, "loss": 0.2796, "rewards/chosen": 0.5853231549263, "rewards/margins": 2.0826414227485657, "rewards/rejected": -1.4973182678222656, "step": 9185 }, { "epoch": 0.4868947605544219, "grad_norm": 73.0, "kl": 0.266754150390625, "learning_rate": 5e-07, "logits/chosen": 16011219.0, "logits/rejected": -19775853.714285713, "logps/chosen": -287.65863037109375, "logps/rejected": -403.06270926339283, "loss": 0.2244, "rewards/chosen": -0.5767181515693665, "rewards/margins": 1.2504700677735465, "rewards/rejected": -1.827188219342913, "step": 9186 }, { "epoch": 0.486947764556224, "grad_norm": 49.5, "kl": 4.796836853027344, "learning_rate": 5e-07, "logits/chosen": -24409627.42857143, "logits/rejected": -1678669.75, "logps/chosen": -608.2829241071429, "logps/rejected": -247.07470703125, "loss": 0.3746, "rewards/chosen": 0.904041222163609, "rewards/margins": 2.90204542023795, "rewards/rejected": -1.9980041980743408, "step": 9187 }, { "epoch": 0.48700076855802615, "grad_norm": 50.5, "kl": 1.9536018371582031, "learning_rate": 5e-07, "logits/chosen": -20136438.4, "logits/rejected": -45358677.333333336, "logps/chosen": -376.2544921875, "logps/rejected": -180.6222127278646, "loss": 0.3232, "rewards/chosen": 0.7770400047302246, "rewards/margins": 2.200277805328369, "rewards/rejected": -1.4232378005981445, "step": 9188 }, { "epoch": 0.4870537725598283, "grad_norm": 62.75, "kl": 0.6393089294433594, "learning_rate": 5e-07, "logits/chosen": -7790626.4, "logits/rejected": 8073718.666666667, "logps/chosen": -272.963623046875, "logps/rejected": -422.1663818359375, "loss": 0.3659, "rewards/chosen": 0.17417728900909424, "rewards/margins": 1.90320885181427, "rewards/rejected": -1.7290315628051758, "step": 9189 }, { "epoch": 0.48710677656163043, "grad_norm": 52.0, "kl": 0.7438621520996094, "learning_rate": 5e-07, "logits/chosen": -31945650.666666668, "logits/rejected": -18126828.8, "logps/chosen": -326.1311442057292, "logps/rejected": -319.820654296875, "loss": 0.2843, "rewards/chosen": 0.3013727863629659, "rewards/margins": 2.176047964890798, "rewards/rejected": -1.874675178527832, "step": 9190 }, { "epoch": 0.48715978056343257, "grad_norm": 45.5, "kl": 0.15557479858398438, "learning_rate": 5e-07, "logits/chosen": -20815456.0, "logits/rejected": -33786424.0, "logps/chosen": -457.24970703125, "logps/rejected": -380.0231526692708, "loss": 0.2985, "rewards/chosen": 0.773985481262207, "rewards/margins": 2.8143993377685548, "rewards/rejected": -2.0404138565063477, "step": 9191 }, { "epoch": 0.4872127845652347, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94629898.66666667, "logits/rejected": -44267868.8, "logps/chosen": -387.0684000651042, "logps/rejected": -266.2971923828125, "loss": 0.2234, "rewards/chosen": 0.31642558177312213, "rewards/margins": 2.6511423786481223, "rewards/rejected": -2.334716796875, "step": 9192 }, { "epoch": 0.4872657885670368, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3409614.5, "logits/rejected": -24113572.57142857, "logps/chosen": -90.69991302490234, "logps/rejected": -167.109130859375, "loss": 0.2821, "rewards/chosen": -0.5097389221191406, "rewards/margins": 0.9334042412894112, "rewards/rejected": -1.4431431634085519, "step": 9193 }, { "epoch": 0.4873187925688389, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18749075.2, "logits/rejected": -33814762.666666664, "logps/chosen": -288.2778076171875, "logps/rejected": -839.7469075520834, "loss": 0.3059, "rewards/chosen": 0.236069655418396, "rewards/margins": 4.901342686017354, "rewards/rejected": -4.665273030598958, "step": 9194 }, { "epoch": 0.48737179657064106, "grad_norm": 55.5, "kl": 1.5369873046875, "learning_rate": 5e-07, "logits/chosen": -9156231.2, "logits/rejected": -30402170.666666668, "logps/chosen": -508.00009765625, "logps/rejected": -280.4635823567708, "loss": 0.2723, "rewards/chosen": 0.9955864906311035, "rewards/margins": 2.388467884063721, "rewards/rejected": -1.3928813934326172, "step": 9195 }, { "epoch": 0.4874248005724432, "grad_norm": 40.5, "kl": 0.862152099609375, "learning_rate": 5e-07, "logits/chosen": -72961450.66666667, "logits/rejected": -44764403.2, "logps/chosen": -191.03326416015625, "logps/rejected": -476.08271484375, "loss": 0.2344, "rewards/chosen": 0.2667886018753052, "rewards/margins": 2.729875683784485, "rewards/rejected": -2.4630870819091797, "step": 9196 }, { "epoch": 0.48747780457424533, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35029088.0, "logits/rejected": -46476000.0, "logps/chosen": -277.89666748046875, "logps/rejected": -260.2997233072917, "loss": 0.1633, "rewards/chosen": 0.28150710463523865, "rewards/margins": 2.936622232198715, "rewards/rejected": -2.6551151275634766, "step": 9197 }, { "epoch": 0.48753080857604747, "grad_norm": 47.75, "kl": 2.084432601928711, "learning_rate": 5e-07, "logits/chosen": -29490613.333333332, "logits/rejected": -38481416.0, "logps/chosen": -282.2765706380208, "logps/rejected": -452.6737976074219, "loss": 0.3121, "rewards/chosen": 1.0206214586893718, "rewards/margins": 2.5169929663340254, "rewards/rejected": -1.4963715076446533, "step": 9198 }, { "epoch": 0.4875838125778496, "grad_norm": 118.0, "kl": 1.0704002380371094, "learning_rate": 5e-07, "logits/chosen": -27848466.0, "logits/rejected": 146960176.0, "logps/chosen": -369.2794189453125, "logps/rejected": -735.40625, "loss": 0.2027, "rewards/chosen": 1.7417283058166504, "rewards/margins": 3.414199113845825, "rewards/rejected": -1.6724708080291748, "step": 9199 }, { "epoch": 0.48763681657965174, "grad_norm": 58.75, "kl": 0.6522855758666992, "learning_rate": 5e-07, "logits/chosen": -10249090.0, "logits/rejected": -32436328.0, "logps/chosen": -284.949462890625, "logps/rejected": -276.01947021484375, "loss": 0.3858, "rewards/chosen": 0.14205369353294373, "rewards/margins": 1.5155697762966156, "rewards/rejected": -1.3735160827636719, "step": 9200 }, { "epoch": 0.4876898205814539, "grad_norm": 65.5, "kl": 1.4690990447998047, "learning_rate": 5e-07, "logits/chosen": -55248860.0, "logits/rejected": -3846801.0, "logps/chosen": -641.5950317382812, "logps/rejected": -308.3741149902344, "loss": 0.2605, "rewards/chosen": 1.3490371704101562, "rewards/margins": 2.612167716026306, "rewards/rejected": -1.26313054561615, "step": 9201 }, { "epoch": 0.487742824583256, "grad_norm": 48.0, "kl": 0.541748046875, "learning_rate": 5e-07, "logits/chosen": -217163392.0, "logits/rejected": -12912985.333333334, "logps/chosen": -474.77520751953125, "logps/rejected": -320.9278157552083, "loss": 0.1769, "rewards/chosen": 1.4080597162246704, "rewards/margins": 3.1190601587295532, "rewards/rejected": -1.7110004425048828, "step": 9202 }, { "epoch": 0.48779582858505816, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73473062.4, "logits/rejected": -29115344.0, "logps/chosen": -202.0901123046875, "logps/rejected": -348.0355224609375, "loss": 0.2667, "rewards/chosen": 0.42101230621337893, "rewards/margins": 3.734914207458496, "rewards/rejected": -3.313901901245117, "step": 9203 }, { "epoch": 0.4878488325868603, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10337542.0, "logits/rejected": -39497160.0, "logps/chosen": -120.59044647216797, "logps/rejected": -361.1405029296875, "loss": 0.3947, "rewards/chosen": -0.6809719204902649, "rewards/margins": 1.329075038433075, "rewards/rejected": -2.01004695892334, "step": 9204 }, { "epoch": 0.48790183658866243, "grad_norm": 40.75, "kl": 0.19253206253051758, "learning_rate": 5e-07, "logits/chosen": -8050968.0, "logits/rejected": -23577184.0, "logps/chosen": -178.348876953125, "logps/rejected": -301.0971272786458, "loss": 0.3254, "rewards/chosen": 0.5488601207733155, "rewards/margins": 2.2778554439544676, "rewards/rejected": -1.7289953231811523, "step": 9205 }, { "epoch": 0.48795484059046457, "grad_norm": 40.5, "kl": 1.6909475326538086, "learning_rate": 5e-07, "logits/chosen": -25432339.2, "logits/rejected": -47418266.666666664, "logps/chosen": -152.10638427734375, "logps/rejected": -269.9176432291667, "loss": 0.2957, "rewards/chosen": 0.5375401020050049, "rewards/margins": 2.432994031906128, "rewards/rejected": -1.895453929901123, "step": 9206 }, { "epoch": 0.4880078445922667, "grad_norm": 58.25, "kl": 0.13604736328125, "learning_rate": 5e-07, "logits/chosen": -15775832.0, "logits/rejected": -47503226.666666664, "logps/chosen": -341.87811279296875, "logps/rejected": -331.7591959635417, "loss": 0.2459, "rewards/chosen": 1.119958519935608, "rewards/margins": 2.616771499315898, "rewards/rejected": -1.4968129793802898, "step": 9207 }, { "epoch": 0.48806084859406884, "grad_norm": 37.0, "kl": 0.5278434753417969, "learning_rate": 5e-07, "logits/chosen": -84052528.0, "logits/rejected": -19231461.333333332, "logps/chosen": -402.8338623046875, "logps/rejected": -501.0208740234375, "loss": 0.1698, "rewards/chosen": -0.08798980712890625, "rewards/margins": 2.8131707509358725, "rewards/rejected": -2.901160558064779, "step": 9208 }, { "epoch": 0.488113852595871, "grad_norm": 46.25, "kl": 1.51702880859375, "learning_rate": 5e-07, "logits/chosen": -14293235.2, "logits/rejected": -28971946.666666668, "logps/chosen": -255.1247314453125, "logps/rejected": -441.6693522135417, "loss": 0.4156, "rewards/chosen": -0.029818105697631835, "rewards/margins": 1.9815460364023842, "rewards/rejected": -2.011364142100016, "step": 9209 }, { "epoch": 0.4881668565976731, "grad_norm": 67.0, "kl": 0.5903720855712891, "learning_rate": 5e-07, "logits/chosen": -42448224.0, "logits/rejected": 8885322.0, "logps/chosen": -361.5433349609375, "logps/rejected": -207.3643798828125, "loss": 0.3463, "rewards/chosen": 0.2466359535853068, "rewards/margins": 2.8170462052027383, "rewards/rejected": -2.5704102516174316, "step": 9210 }, { "epoch": 0.48821986059947525, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62546420.0, "logits/rejected": -21970330.666666668, "logps/chosen": -693.526611328125, "logps/rejected": -253.09513346354166, "loss": 0.1655, "rewards/chosen": 0.6736114621162415, "rewards/margins": 3.10523651043574, "rewards/rejected": -2.4316250483194985, "step": 9211 }, { "epoch": 0.4882728646012774, "grad_norm": 43.5, "kl": 1.9254684448242188, "learning_rate": 5e-07, "logits/chosen": -44673701.333333336, "logits/rejected": -5042200.0, "logps/chosen": -324.4036458333333, "logps/rejected": -211.00411987304688, "loss": 0.3108, "rewards/chosen": 1.0360540548960369, "rewards/margins": 2.545040527979533, "rewards/rejected": -1.508986473083496, "step": 9212 }, { "epoch": 0.4883258686030795, "grad_norm": 94.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4327315.333333333, "logits/rejected": 32476828.8, "logps/chosen": -142.29550170898438, "logps/rejected": -468.9087890625, "loss": 0.3054, "rewards/chosen": 0.5267328421274821, "rewards/margins": 2.0280237356821695, "rewards/rejected": -1.5012908935546876, "step": 9213 }, { "epoch": 0.48837887260488166, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5805992.5, "logits/rejected": -13936540.0, "logps/chosen": -100.81737518310547, "logps/rejected": -234.04679361979166, "loss": 0.2754, "rewards/chosen": -0.19398459792137146, "rewards/margins": 1.7778209745883942, "rewards/rejected": -1.9718055725097656, "step": 9214 }, { "epoch": 0.4884318766066838, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1142202.3333333333, "logits/rejected": -35967817.6, "logps/chosen": -300.07647705078125, "logps/rejected": -369.3419677734375, "loss": 0.2527, "rewards/chosen": 0.23493756850560507, "rewards/margins": 2.118348213036855, "rewards/rejected": -1.88341064453125, "step": 9215 }, { "epoch": 0.48848488060848594, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8373706.0, "logits/rejected": -23286926.0, "logps/chosen": -197.57815551757812, "logps/rejected": -262.97039794921875, "loss": 0.3235, "rewards/chosen": -0.02677258849143982, "rewards/margins": 2.0463582575321198, "rewards/rejected": -2.0731308460235596, "step": 9216 }, { "epoch": 0.4885378846102881, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52298042.666666664, "logits/rejected": 8045678.4, "logps/chosen": -442.0592041015625, "logps/rejected": -330.9772705078125, "loss": 0.2611, "rewards/chosen": 0.638592004776001, "rewards/margins": 2.17966570854187, "rewards/rejected": -1.541073703765869, "step": 9217 }, { "epoch": 0.4885908886120902, "grad_norm": 48.0, "kl": 0.8307256698608398, "learning_rate": 5e-07, "logits/chosen": -19789257.14285714, "logits/rejected": -9752677.0, "logps/chosen": -112.46651785714286, "logps/rejected": -270.83526611328125, "loss": 0.3894, "rewards/chosen": 0.43035292625427246, "rewards/margins": 1.8660402297973633, "rewards/rejected": -1.4356873035430908, "step": 9218 }, { "epoch": 0.48864389261389235, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67670880.0, "logits/rejected": -8858947.2, "logps/chosen": -526.6299641927084, "logps/rejected": -248.894482421875, "loss": 0.2472, "rewards/chosen": 0.6622854073842367, "rewards/margins": 2.6365550835927327, "rewards/rejected": -1.9742696762084961, "step": 9219 }, { "epoch": 0.4886968966156945, "grad_norm": 54.25, "kl": 0.8829631805419922, "learning_rate": 5e-07, "logits/chosen": -49641178.666666664, "logits/rejected": 12943208.0, "logps/chosen": -385.1595052083333, "logps/rejected": -205.9365234375, "loss": 0.346, "rewards/chosen": 0.5305927991867065, "rewards/margins": 1.345362162590027, "rewards/rejected": -0.8147693634033203, "step": 9220 }, { "epoch": 0.4887499006174966, "grad_norm": 49.5, "kl": 0.6017608642578125, "learning_rate": 5e-07, "logits/chosen": -31325670.0, "logits/rejected": -5839399.0, "logps/chosen": -186.6996307373047, "logps/rejected": -496.4036560058594, "loss": 0.3167, "rewards/chosen": 0.043002936989068985, "rewards/margins": 2.247888181358576, "rewards/rejected": -2.204885244369507, "step": 9221 }, { "epoch": 0.48880290461929876, "grad_norm": 40.75, "kl": 1.0691146850585938, "learning_rate": 5e-07, "logits/chosen": -6693411.5, "logits/rejected": -31076876.0, "logps/chosen": -428.9521789550781, "logps/rejected": -237.1057891845703, "loss": 0.2776, "rewards/chosen": 0.5964810252189636, "rewards/margins": 2.8792659640312195, "rewards/rejected": -2.282784938812256, "step": 9222 }, { "epoch": 0.4888559086211009, "grad_norm": 42.75, "kl": 1.81890869140625, "learning_rate": 5e-07, "logits/chosen": -15707609.333333334, "logits/rejected": -21806248.0, "logps/chosen": -219.9075927734375, "logps/rejected": -319.00009765625, "loss": 0.189, "rewards/chosen": 1.0848759015401204, "rewards/margins": 3.1450056393941246, "rewards/rejected": -2.060129737854004, "step": 9223 }, { "epoch": 0.48890891262290304, "grad_norm": 46.75, "kl": 1.3939743041992188, "learning_rate": 5e-07, "logits/chosen": -1369497.0, "logits/rejected": -10084077.0, "logps/chosen": -194.9954071044922, "logps/rejected": -190.9978790283203, "loss": 0.3163, "rewards/chosen": 0.5731137990951538, "rewards/margins": 2.107637405395508, "rewards/rejected": -1.534523606300354, "step": 9224 }, { "epoch": 0.4889619166247052, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23689288.0, "logits/rejected": -39337430.4, "logps/chosen": -222.38248697916666, "logps/rejected": -390.603076171875, "loss": 0.292, "rewards/chosen": 0.13346176346143088, "rewards/margins": 1.796696283419927, "rewards/rejected": -1.6632345199584961, "step": 9225 }, { "epoch": 0.4890149206265073, "grad_norm": 52.0, "kl": 0.9101219177246094, "learning_rate": 5e-07, "logits/chosen": -24485706.666666668, "logits/rejected": 16322231.0, "logps/chosen": -366.8180338541667, "logps/rejected": -186.06813049316406, "loss": 0.3072, "rewards/chosen": 0.70162566502889, "rewards/margins": 2.223670800526937, "rewards/rejected": -1.5220451354980469, "step": 9226 }, { "epoch": 0.48906792462830945, "grad_norm": 57.5, "kl": 1.7276344299316406, "learning_rate": 5e-07, "logits/chosen": -52999272.0, "logits/rejected": 13596070.0, "logps/chosen": -92.36984252929688, "logps/rejected": -107.26280975341797, "loss": 0.4454, "rewards/chosen": -0.3283819258213043, "rewards/margins": 0.61276575922966, "rewards/rejected": -0.9411476850509644, "step": 9227 }, { "epoch": 0.4891209286301116, "grad_norm": 31.75, "kl": 1.423335075378418, "learning_rate": 5e-07, "logits/chosen": 14851992.0, "logits/rejected": -13856751.0, "logps/chosen": -438.16986083984375, "logps/rejected": -160.9312744140625, "loss": 0.2112, "rewards/chosen": 1.3462570905685425, "rewards/margins": 3.5521119832992554, "rewards/rejected": -2.205854892730713, "step": 9228 }, { "epoch": 0.4891739326319137, "grad_norm": 48.0, "kl": 0.7280445098876953, "learning_rate": 5e-07, "logits/chosen": -51721940.0, "logits/rejected": -11121148.0, "logps/chosen": -232.6269989013672, "logps/rejected": -88.46602630615234, "loss": 0.3824, "rewards/chosen": -0.0072650909423828125, "rewards/margins": 1.2130063772201538, "rewards/rejected": -1.2202714681625366, "step": 9229 }, { "epoch": 0.48922693663371586, "grad_norm": 48.75, "kl": 0.6722755432128906, "learning_rate": 5e-07, "logits/chosen": -7498647.2, "logits/rejected": -28750480.0, "logps/chosen": -156.26829833984374, "logps/rejected": -248.15897623697916, "loss": 0.409, "rewards/chosen": 0.001928853988647461, "rewards/margins": 1.651066827774048, "rewards/rejected": -1.6491379737854004, "step": 9230 }, { "epoch": 0.489279940635518, "grad_norm": 44.5, "kl": 0.8775711059570312, "learning_rate": 5e-07, "logits/chosen": -9781298.4, "logits/rejected": -49163018.666666664, "logps/chosen": -213.8853759765625, "logps/rejected": -356.6336263020833, "loss": 0.3185, "rewards/chosen": 0.4981827735900879, "rewards/margins": 2.9004716873168945, "rewards/rejected": -2.4022889137268066, "step": 9231 }, { "epoch": 0.48933294463732013, "grad_norm": 32.0, "kl": 1.6635665893554688, "learning_rate": 5e-07, "logits/chosen": -22201316.0, "logits/rejected": -94547208.0, "logps/chosen": -296.82098388671875, "logps/rejected": -820.3230590820312, "loss": 0.1761, "rewards/chosen": 0.9533642530441284, "rewards/margins": 4.723901629447937, "rewards/rejected": -3.7705373764038086, "step": 9232 }, { "epoch": 0.48938594863912227, "grad_norm": 62.0, "kl": 2.768705368041992, "learning_rate": 5e-07, "logits/chosen": -2005549.142857143, "logits/rejected": 105390640.0, "logps/chosen": -243.29570661272322, "logps/rejected": -230.9900665283203, "loss": 0.5028, "rewards/chosen": 0.24733820983341762, "rewards/margins": 0.7622506363051278, "rewards/rejected": -0.5149124264717102, "step": 9233 }, { "epoch": 0.4894389526409244, "grad_norm": 40.75, "kl": 1.072134017944336, "learning_rate": 5e-07, "logits/chosen": -25331544.0, "logits/rejected": -2877265.0, "logps/chosen": -94.6136245727539, "logps/rejected": -370.7464599609375, "loss": 0.3285, "rewards/chosen": 0.20709727704524994, "rewards/margins": 2.086110934615135, "rewards/rejected": -1.8790136575698853, "step": 9234 }, { "epoch": 0.48949195664272654, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9507874.666666666, "logits/rejected": -40954800.0, "logps/chosen": -753.2010091145834, "logps/rejected": -266.009423828125, "loss": 0.2129, "rewards/chosen": 1.3055470784505208, "rewards/margins": 2.947117169698079, "rewards/rejected": -1.6415700912475586, "step": 9235 }, { "epoch": 0.4895449606445287, "grad_norm": 63.25, "kl": 3.439544677734375, "learning_rate": 5e-07, "logits/chosen": -18017960.0, "logits/rejected": -38543224.0, "logps/chosen": -385.6283874511719, "logps/rejected": -481.0181579589844, "loss": 0.3273, "rewards/chosen": 0.18791113793849945, "rewards/margins": 3.3131314367055893, "rewards/rejected": -3.12522029876709, "step": 9236 }, { "epoch": 0.4895979646463308, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27303144.0, "logits/rejected": -21803280.0, "logps/chosen": -535.08837890625, "logps/rejected": -368.8409729003906, "loss": 0.258, "rewards/chosen": 0.45399633049964905, "rewards/margins": 3.2175513803958893, "rewards/rejected": -2.7635550498962402, "step": 9237 }, { "epoch": 0.48965096864813296, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48072053.333333336, "logits/rejected": 4296060.0, "logps/chosen": -214.05548095703125, "logps/rejected": -348.179345703125, "loss": 0.2894, "rewards/chosen": -0.012086992462476095, "rewards/margins": 2.0385063836971917, "rewards/rejected": -2.050593376159668, "step": 9238 }, { "epoch": 0.4897039726499351, "grad_norm": 63.75, "kl": 1.6866950988769531, "learning_rate": 5e-07, "logits/chosen": -20025033.6, "logits/rejected": -22665730.666666668, "logps/chosen": -252.467822265625, "logps/rejected": -331.6690266927083, "loss": 0.2937, "rewards/chosen": 0.7236470222473145, "rewards/margins": 2.572956943511963, "rewards/rejected": -1.8493099212646484, "step": 9239 }, { "epoch": 0.48975697665173723, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3432874.0, "logits/rejected": -27946706.666666668, "logps/chosen": -133.94126892089844, "logps/rejected": -263.4097086588542, "loss": 0.2099, "rewards/chosen": -0.0999540314078331, "rewards/margins": 2.2453665112455687, "rewards/rejected": -2.345320542653402, "step": 9240 }, { "epoch": 0.48980998065353937, "grad_norm": 52.0, "kl": 1.2684879302978516, "learning_rate": 5e-07, "logits/chosen": -12792273.333333334, "logits/rejected": -11880534.4, "logps/chosen": -251.48116048177084, "logps/rejected": -225.72041015625, "loss": 0.2876, "rewards/chosen": 0.2218419313430786, "rewards/margins": 2.095963406562805, "rewards/rejected": -1.8741214752197266, "step": 9241 }, { "epoch": 0.4898629846553415, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 33111256.0, "logits/rejected": -33650486.85714286, "logps/chosen": -229.47213745117188, "logps/rejected": -400.24720982142856, "loss": 0.2235, "rewards/chosen": 0.3373123109340668, "rewards/margins": 2.144590126616614, "rewards/rejected": -1.8072778156825475, "step": 9242 }, { "epoch": 0.48991598865714364, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1967457.5, "logits/rejected": -6602188.666666667, "logps/chosen": -58.80324172973633, "logps/rejected": -132.35392252604166, "loss": 0.3272, "rewards/chosen": 0.1307930052280426, "rewards/margins": 1.1198178033034005, "rewards/rejected": -0.989024798075358, "step": 9243 }, { "epoch": 0.4899689926589457, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8656509.333333334, "logits/rejected": -20044774.4, "logps/chosen": -106.3251444498698, "logps/rejected": -304.918212890625, "loss": 0.151, "rewards/chosen": 1.179488182067871, "rewards/margins": 3.9003137588500976, "rewards/rejected": -2.7208255767822265, "step": 9244 }, { "epoch": 0.49002199666074786, "grad_norm": 55.5, "kl": 1.1900548934936523, "learning_rate": 5e-07, "logits/chosen": -44818288.0, "logits/rejected": -19176808.0, "logps/chosen": -530.47109375, "logps/rejected": -226.33597819010416, "loss": 0.3291, "rewards/chosen": 0.29428725242614745, "rewards/margins": 2.4229051748911536, "rewards/rejected": -2.1286179224650064, "step": 9245 }, { "epoch": 0.49007500066255, "grad_norm": 32.25, "kl": 1.0536613464355469, "learning_rate": 5e-07, "logits/chosen": 4366825.6, "logits/rejected": -5032335.333333333, "logps/chosen": -20.652607727050782, "logps/rejected": -323.0567626953125, "loss": 0.2623, "rewards/chosen": 0.7977630615234375, "rewards/margins": 2.548165194193522, "rewards/rejected": -1.7504021326700847, "step": 9246 }, { "epoch": 0.49012800466435213, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23454528.0, "logits/rejected": 6864092.666666667, "logps/chosen": -201.14248657226562, "logps/rejected": -292.8515218098958, "loss": 0.2027, "rewards/chosen": 0.6173363327980042, "rewards/margins": 2.8615362445513406, "rewards/rejected": -2.2441999117533364, "step": 9247 }, { "epoch": 0.49018100866615427, "grad_norm": 33.0, "kl": 0.6964645385742188, "learning_rate": 5e-07, "logits/chosen": -21297672.0, "logits/rejected": -46189056.0, "logps/chosen": -211.01741536458334, "logps/rejected": -403.4492919921875, "loss": 0.2095, "rewards/chosen": 0.6590667963027954, "rewards/margins": 2.8834715127944945, "rewards/rejected": -2.224404716491699, "step": 9248 }, { "epoch": 0.4902340126679564, "grad_norm": 45.25, "kl": 0.6309299468994141, "learning_rate": 5e-07, "logits/chosen": -453823.0, "logits/rejected": -30766166.4, "logps/chosen": -182.71405029296875, "logps/rejected": -324.6278076171875, "loss": 0.3154, "rewards/chosen": -0.26130930582682294, "rewards/margins": 1.9146428426106772, "rewards/rejected": -2.1759521484375, "step": 9249 }, { "epoch": 0.49028701666975855, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1370600.0, "logits/rejected": -24312306.0, "logps/chosen": -557.2305501302084, "logps/rejected": -214.05844116210938, "loss": 0.2856, "rewards/chosen": 1.0059510072072346, "rewards/margins": 2.2313172419865923, "rewards/rejected": -1.225366234779358, "step": 9250 }, { "epoch": 0.4903400206715607, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13742785.6, "logits/rejected": 33980648.0, "logps/chosen": -228.9941650390625, "logps/rejected": -196.6251424153646, "loss": 0.4122, "rewards/chosen": 0.04691299796104431, "rewards/margins": 1.403704176346461, "rewards/rejected": -1.3567911783854167, "step": 9251 }, { "epoch": 0.4903930246733628, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35154793.6, "logits/rejected": -32516378.666666668, "logps/chosen": -353.4456787109375, "logps/rejected": -432.652587890625, "loss": 0.3008, "rewards/chosen": 0.4608969211578369, "rewards/margins": 2.341446034113566, "rewards/rejected": -1.8805491129557292, "step": 9252 }, { "epoch": 0.49044602867516496, "grad_norm": 43.75, "kl": 1.3649215698242188, "learning_rate": 5e-07, "logits/chosen": -33336896.0, "logits/rejected": -24853106.666666668, "logps/chosen": -448.6880859375, "logps/rejected": -437.9324544270833, "loss": 0.2624, "rewards/chosen": 1.001719093322754, "rewards/margins": 3.252108828226725, "rewards/rejected": -2.250389734903971, "step": 9253 }, { "epoch": 0.4904990326769671, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11720302.666666666, "logits/rejected": -3132421.4, "logps/chosen": -462.5753987630208, "logps/rejected": -462.19130859375, "loss": 0.227, "rewards/chosen": 0.592310627301534, "rewards/margins": 3.083382328351339, "rewards/rejected": -2.4910717010498047, "step": 9254 }, { "epoch": 0.49055203667876923, "grad_norm": 41.25, "kl": 0.23394012451171875, "learning_rate": 5e-07, "logits/chosen": -10542894.4, "logits/rejected": -40594757.333333336, "logps/chosen": -221.102587890625, "logps/rejected": -224.64827473958334, "loss": 0.2993, "rewards/chosen": 0.6934712409973145, "rewards/margins": 2.282298215230306, "rewards/rejected": -1.5888269742329915, "step": 9255 }, { "epoch": 0.49060504068057137, "grad_norm": 54.25, "kl": 0.9308891296386719, "learning_rate": 5e-07, "logits/chosen": -87914636.8, "logits/rejected": -3195020.3333333335, "logps/chosen": -337.9050537109375, "logps/rejected": -231.2273966471354, "loss": 0.3529, "rewards/chosen": 0.10920655727386475, "rewards/margins": 2.0054960648218794, "rewards/rejected": -1.8962895075480144, "step": 9256 }, { "epoch": 0.4906580446823735, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45199692.8, "logits/rejected": -45110784.0, "logps/chosen": -371.005712890625, "logps/rejected": -449.057861328125, "loss": 0.28, "rewards/chosen": 0.5901898384094239, "rewards/margins": 3.030434513092041, "rewards/rejected": -2.440244674682617, "step": 9257 }, { "epoch": 0.49071104868417564, "grad_norm": 49.25, "kl": 1.1580982208251953, "learning_rate": 5e-07, "logits/chosen": -4988407.2, "logits/rejected": -2596072.8333333335, "logps/chosen": -547.37099609375, "logps/rejected": -234.4820760091146, "loss": 0.3032, "rewards/chosen": 0.9811480522155762, "rewards/margins": 3.391003735860189, "rewards/rejected": -2.409855683644613, "step": 9258 }, { "epoch": 0.4907640526859778, "grad_norm": 36.75, "kl": 1.4815216064453125, "learning_rate": 5e-07, "logits/chosen": -23352560.0, "logits/rejected": -2029628.8, "logps/chosen": -132.52386474609375, "logps/rejected": -268.3588134765625, "loss": 0.3151, "rewards/chosen": 0.10581116875012715, "rewards/margins": 1.774304058154424, "rewards/rejected": -1.668492889404297, "step": 9259 }, { "epoch": 0.4908170566877799, "grad_norm": 47.0, "kl": 0.34644317626953125, "learning_rate": 5e-07, "logits/chosen": -40337578.666666664, "logits/rejected": -21828962.0, "logps/chosen": -340.92539469401044, "logps/rejected": -198.86053466796875, "loss": 0.3237, "rewards/chosen": 0.531552791595459, "rewards/margins": 3.092294216156006, "rewards/rejected": -2.560741424560547, "step": 9260 }, { "epoch": 0.49087006068958206, "grad_norm": 59.5, "kl": 0.18944644927978516, "learning_rate": 5e-07, "logits/chosen": -45325032.0, "logits/rejected": -6397078.5, "logps/chosen": -240.19869995117188, "logps/rejected": -269.4292297363281, "loss": 0.341, "rewards/chosen": 0.10471739619970322, "rewards/margins": 1.6075822338461876, "rewards/rejected": -1.5028648376464844, "step": 9261 }, { "epoch": 0.4909230646913842, "grad_norm": 53.25, "kl": 0.48423194885253906, "learning_rate": 5e-07, "logits/chosen": 8630020.0, "logits/rejected": -35404972.0, "logps/chosen": -470.8040466308594, "logps/rejected": -348.3971252441406, "loss": 0.2734, "rewards/chosen": 0.6488271355628967, "rewards/margins": 2.4350834488868713, "rewards/rejected": -1.7862563133239746, "step": 9262 }, { "epoch": 0.49097606869318633, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34989512.0, "logits/rejected": -72450613.33333333, "logps/chosen": -193.25082397460938, "logps/rejected": -400.2458089192708, "loss": 0.14, "rewards/chosen": 1.1063450574874878, "rewards/margins": 3.718131899833679, "rewards/rejected": -2.6117868423461914, "step": 9263 }, { "epoch": 0.49102907269498847, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7234146.0, "logits/rejected": -46341448.0, "logps/chosen": -189.423828125, "logps/rejected": -427.69537353515625, "loss": 0.3176, "rewards/chosen": 0.3230881690979004, "rewards/margins": 2.009647011756897, "rewards/rejected": -1.6865588426589966, "step": 9264 }, { "epoch": 0.4910820766967906, "grad_norm": 62.5, "kl": 0.9910717010498047, "learning_rate": 5e-07, "logits/chosen": -10897424.0, "logits/rejected": -35011098.666666664, "logps/chosen": -310.0330322265625, "logps/rejected": -330.13063557942706, "loss": 0.2857, "rewards/chosen": 0.6491135597229004, "rewards/margins": 3.0575069427490233, "rewards/rejected": -2.408393383026123, "step": 9265 }, { "epoch": 0.49113508069859274, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39429460.0, "logits/rejected": -39700386.666666664, "logps/chosen": -274.54522705078125, "logps/rejected": -358.6743570963542, "loss": 0.2654, "rewards/chosen": 0.1705375611782074, "rewards/margins": 2.110154241323471, "rewards/rejected": -1.9396166801452637, "step": 9266 }, { "epoch": 0.4911880847003949, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8065421.333333333, "logits/rejected": -73461128.0, "logps/chosen": -370.7772623697917, "logps/rejected": -475.85009765625, "loss": 0.3484, "rewards/chosen": 0.24767341216405234, "rewards/margins": 3.0638569792111716, "rewards/rejected": -2.816183567047119, "step": 9267 }, { "epoch": 0.491241088702197, "grad_norm": 48.25, "kl": 0.9135913848876953, "learning_rate": 5e-07, "logits/chosen": -53925804.8, "logits/rejected": 19894597.333333332, "logps/chosen": -149.119677734375, "logps/rejected": -284.9005126953125, "loss": 0.3094, "rewards/chosen": 0.6965014934539795, "rewards/margins": 1.964988072713216, "rewards/rejected": -1.2684865792592366, "step": 9268 }, { "epoch": 0.49129409270399915, "grad_norm": 27.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20332058.0, "logits/rejected": -25232045.714285713, "logps/chosen": -41.843502044677734, "logps/rejected": -263.62789481026783, "loss": 0.1642, "rewards/chosen": 0.27850228548049927, "rewards/margins": 2.6360293371336803, "rewards/rejected": -2.357527051653181, "step": 9269 }, { "epoch": 0.4913470967058013, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40769656.0, "logits/rejected": 6269034.0, "logps/chosen": -227.02775065104166, "logps/rejected": -304.68450927734375, "loss": 0.3333, "rewards/chosen": 0.5252950191497803, "rewards/margins": 2.1141552925109863, "rewards/rejected": -1.588860273361206, "step": 9270 }, { "epoch": 0.4914001007076034, "grad_norm": 65.5, "kl": 2.7936792373657227, "learning_rate": 5e-07, "logits/chosen": -2970559.714285714, "logits/rejected": -406258.375, "logps/chosen": -156.82669503348214, "logps/rejected": -41.21398162841797, "loss": 0.4042, "rewards/chosen": 0.36381438800266813, "rewards/margins": 2.2745342595236644, "rewards/rejected": -1.910719871520996, "step": 9271 }, { "epoch": 0.49145310470940556, "grad_norm": 36.5, "kl": 1.9166746139526367, "learning_rate": 5e-07, "logits/chosen": -15335377.0, "logits/rejected": -33793832.0, "logps/chosen": -206.9410400390625, "logps/rejected": -357.6581115722656, "loss": 0.2494, "rewards/chosen": 0.6563234925270081, "rewards/margins": 3.261481821537018, "rewards/rejected": -2.6051583290100098, "step": 9272 }, { "epoch": 0.4915061087112077, "grad_norm": 60.5, "kl": 1.3447151184082031, "learning_rate": 5e-07, "logits/chosen": 3677518.4, "logits/rejected": -6820143.333333333, "logps/chosen": -166.902734375, "logps/rejected": -141.3341064453125, "loss": 0.277, "rewards/chosen": 0.7645168304443359, "rewards/margins": 2.486910820007324, "rewards/rejected": -1.7223939895629883, "step": 9273 }, { "epoch": 0.49155911271300984, "grad_norm": 61.5, "kl": 1.2870674133300781, "learning_rate": 5e-07, "logits/chosen": -31927208.0, "logits/rejected": -9744085.0, "logps/chosen": -325.92828369140625, "logps/rejected": -229.88511657714844, "loss": 0.3733, "rewards/chosen": 0.42779489358266193, "rewards/margins": 1.6714045604070027, "rewards/rejected": -1.2436096668243408, "step": 9274 }, { "epoch": 0.491612116714812, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41376028.0, "logits/rejected": -11614683.0, "logps/chosen": -271.3133850097656, "logps/rejected": -78.60746765136719, "loss": 0.4461, "rewards/chosen": 0.054199449717998505, "rewards/margins": 0.45714040845632553, "rewards/rejected": -0.402940958738327, "step": 9275 }, { "epoch": 0.4916651207166141, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7258190.0, "logits/rejected": -17264850.0, "logps/chosen": -121.93390655517578, "logps/rejected": -356.6735534667969, "loss": 0.3717, "rewards/chosen": -0.34301942586898804, "rewards/margins": 1.4671372771263123, "rewards/rejected": -1.8101567029953003, "step": 9276 }, { "epoch": 0.49171812471841625, "grad_norm": 71.0, "kl": 0.8731746673583984, "learning_rate": 5e-07, "logits/chosen": -45848229.333333336, "logits/rejected": -54492.0, "logps/chosen": -332.8814290364583, "logps/rejected": -387.569677734375, "loss": 0.2938, "rewards/chosen": 0.2947649558385213, "rewards/margins": 1.687851579984029, "rewards/rejected": -1.3930866241455078, "step": 9277 }, { "epoch": 0.4917711287202184, "grad_norm": 53.0, "kl": 3.8906822204589844, "learning_rate": 5e-07, "logits/chosen": 7685678.666666667, "logits/rejected": -25822481.6, "logps/chosen": -264.7063395182292, "logps/rejected": -245.513427734375, "loss": 0.3279, "rewards/chosen": 0.34563930829366046, "rewards/margins": 1.7614303429921467, "rewards/rejected": -1.4157910346984863, "step": 9278 }, { "epoch": 0.4918241327220205, "grad_norm": 55.0, "kl": 3.9913339614868164, "learning_rate": 5e-07, "logits/chosen": -17279870.85714286, "logits/rejected": -32956606.0, "logps/chosen": -317.6484375, "logps/rejected": -554.6363525390625, "loss": 0.4238, "rewards/chosen": 0.5170776503426688, "rewards/margins": 4.893936361585345, "rewards/rejected": -4.376858711242676, "step": 9279 }, { "epoch": 0.49187713672382266, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17203816.0, "logits/rejected": -42692784.0, "logps/chosen": -185.8217010498047, "logps/rejected": -410.04913330078125, "loss": 0.3056, "rewards/chosen": 0.2732701897621155, "rewards/margins": 2.3989797234535217, "rewards/rejected": -2.1257095336914062, "step": 9280 }, { "epoch": 0.4919301407256248, "grad_norm": 39.75, "kl": 1.351515769958496, "learning_rate": 5e-07, "logits/chosen": -10043318.0, "logits/rejected": -40200252.0, "logps/chosen": -74.64269256591797, "logps/rejected": -229.65147399902344, "loss": 0.3709, "rewards/chosen": 0.14795313775539398, "rewards/margins": 1.5303178876638412, "rewards/rejected": -1.3823647499084473, "step": 9281 }, { "epoch": 0.49198314472742694, "grad_norm": 35.75, "kl": 1.2702679634094238, "learning_rate": 5e-07, "logits/chosen": -25108000.0, "logits/rejected": -10894862.0, "logps/chosen": -103.99864959716797, "logps/rejected": -158.07302856445312, "loss": 0.2987, "rewards/chosen": 0.4500292241573334, "rewards/margins": 2.358022302389145, "rewards/rejected": -1.9079930782318115, "step": 9282 }, { "epoch": 0.4920361487292291, "grad_norm": 55.5, "kl": 1.5004348754882812, "learning_rate": 5e-07, "logits/chosen": -34170448.0, "logits/rejected": -60350297.6, "logps/chosen": -360.5249837239583, "logps/rejected": -450.58291015625, "loss": 0.235, "rewards/chosen": -0.0047715504964192705, "rewards/margins": 2.428960863749186, "rewards/rejected": -2.4337324142456054, "step": 9283 }, { "epoch": 0.4920891527310312, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36541779.2, "logits/rejected": -53585962.666666664, "logps/chosen": -356.62958984375, "logps/rejected": -599.614990234375, "loss": 0.1939, "rewards/chosen": 0.8673275947570801, "rewards/margins": 5.205071989695232, "rewards/rejected": -4.337744394938151, "step": 9284 }, { "epoch": 0.49214215673283335, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34969053.333333336, "logits/rejected": -77151632.0, "logps/chosen": -182.73006184895834, "logps/rejected": -608.714599609375, "loss": 0.3183, "rewards/chosen": 0.3982989390691121, "rewards/margins": 3.519960363705953, "rewards/rejected": -3.121661424636841, "step": 9285 }, { "epoch": 0.4921951607346355, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -107722688.0, "logits/rejected": -25172884.8, "logps/chosen": -401.3806966145833, "logps/rejected": -165.0935791015625, "loss": 0.2223, "rewards/chosen": 0.43176575501759845, "rewards/margins": 3.0979169925053918, "rewards/rejected": -2.666151237487793, "step": 9286 }, { "epoch": 0.4922481647364376, "grad_norm": 34.75, "kl": 0.4327354431152344, "learning_rate": 5e-07, "logits/chosen": -826513.5, "logits/rejected": -56049368.0, "logps/chosen": -163.94149780273438, "logps/rejected": -447.4282531738281, "loss": 0.2595, "rewards/chosen": 0.13349804282188416, "rewards/margins": 3.313070386648178, "rewards/rejected": -3.179572343826294, "step": 9287 }, { "epoch": 0.49230116873823976, "grad_norm": 50.25, "kl": 2.342583656311035, "learning_rate": 5e-07, "logits/chosen": -4290531.2, "logits/rejected": -34533477.333333336, "logps/chosen": -579.950244140625, "logps/rejected": -293.9709065755208, "loss": 0.325, "rewards/chosen": 0.6960601329803466, "rewards/margins": 3.0708407878875734, "rewards/rejected": -2.3747806549072266, "step": 9288 }, { "epoch": 0.4923541727400419, "grad_norm": 48.0, "kl": 4.575481414794922, "learning_rate": 5e-07, "logits/chosen": 7323210.0, "logits/rejected": -5238795.0, "logps/chosen": -173.47930908203125, "logps/rejected": -162.27769470214844, "loss": 0.4686, "rewards/chosen": 0.31740198532740277, "rewards/margins": 1.5679840842882793, "rewards/rejected": -1.2505820989608765, "step": 9289 }, { "epoch": 0.49240717674184403, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -116514200.0, "logits/rejected": -14639131.42857143, "logps/chosen": -599.637451171875, "logps/rejected": -223.79997907366072, "loss": 0.2333, "rewards/chosen": -0.15631103515625, "rewards/margins": 1.8422881535121374, "rewards/rejected": -1.9985991886683874, "step": 9290 }, { "epoch": 0.49246018074364617, "grad_norm": 52.25, "kl": 0.7571907043457031, "learning_rate": 5e-07, "logits/chosen": -42996208.0, "logits/rejected": -72728192.0, "logps/chosen": -150.0439453125, "logps/rejected": -373.30145263671875, "loss": 0.3019, "rewards/chosen": 0.28872600197792053, "rewards/margins": 2.251857429742813, "rewards/rejected": -1.9631314277648926, "step": 9291 }, { "epoch": 0.4925131847454483, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68293688.0, "logits/rejected": -2694540.75, "logps/chosen": -228.87234497070312, "logps/rejected": -271.99041748046875, "loss": 0.332, "rewards/chosen": 0.27794378995895386, "rewards/margins": 1.691978394985199, "rewards/rejected": -1.4140346050262451, "step": 9292 }, { "epoch": 0.49256618874725044, "grad_norm": 55.5, "kl": 1.010284423828125, "learning_rate": 5e-07, "logits/chosen": -27273744.0, "logits/rejected": -62857036.8, "logps/chosen": -191.2353719075521, "logps/rejected": -512.983544921875, "loss": 0.2238, "rewards/chosen": 0.1231536865234375, "rewards/margins": 3.4739761352539062, "rewards/rejected": -3.3508224487304688, "step": 9293 }, { "epoch": 0.4926191927490525, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15514316.8, "logits/rejected": -34959712.0, "logps/chosen": -336.3334228515625, "logps/rejected": -274.8931884765625, "loss": 0.3268, "rewards/chosen": 0.29580590724945066, "rewards/margins": 2.310876981417338, "rewards/rejected": -2.015071074167887, "step": 9294 }, { "epoch": 0.49267219675085466, "grad_norm": 56.75, "kl": 4.6756439208984375, "learning_rate": 5e-07, "logits/chosen": -36433699.2, "logits/rejected": -27615216.0, "logps/chosen": -324.19609375, "logps/rejected": -272.92616780598956, "loss": 0.2719, "rewards/chosen": 1.061428165435791, "rewards/margins": 3.430527146657308, "rewards/rejected": -2.369098981221517, "step": 9295 }, { "epoch": 0.4927252007526568, "grad_norm": 55.5, "kl": 1.7672080993652344, "learning_rate": 5e-07, "logits/chosen": -61274436.0, "logits/rejected": -37467552.0, "logps/chosen": -562.7568969726562, "logps/rejected": -567.579833984375, "loss": 0.2545, "rewards/chosen": 0.8360395431518555, "rewards/margins": 3.1067304611206055, "rewards/rejected": -2.27069091796875, "step": 9296 }, { "epoch": 0.49277820475445894, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32269368.0, "logits/rejected": -23749800.0, "logps/chosen": -315.610107421875, "logps/rejected": -527.318603515625, "loss": 0.2124, "rewards/chosen": 0.7802202701568604, "rewards/margins": 3.288792848587036, "rewards/rejected": -2.508572578430176, "step": 9297 }, { "epoch": 0.4928312087562611, "grad_norm": 41.25, "kl": 0.12917327880859375, "learning_rate": 5e-07, "logits/chosen": -33251226.0, "logits/rejected": -41040332.0, "logps/chosen": -289.63763427734375, "logps/rejected": -325.27252197265625, "loss": 0.2619, "rewards/chosen": 0.4876871109008789, "rewards/margins": 2.7644829750061035, "rewards/rejected": -2.2767958641052246, "step": 9298 }, { "epoch": 0.4928842127580632, "grad_norm": 56.0, "kl": 2.8229293823242188, "learning_rate": 5e-07, "logits/chosen": -41737120.0, "logits/rejected": -63942336.0, "logps/chosen": -400.966064453125, "logps/rejected": -517.4827880859375, "loss": 0.3725, "rewards/chosen": 0.4061676263809204, "rewards/margins": 3.249780297279358, "rewards/rejected": -2.8436126708984375, "step": 9299 }, { "epoch": 0.49293721675986535, "grad_norm": 34.5, "kl": 0.001384735107421875, "learning_rate": 5e-07, "logits/chosen": -30626154.0, "logits/rejected": -11661913.0, "logps/chosen": -92.91293334960938, "logps/rejected": -307.3689270019531, "loss": 0.2595, "rewards/chosen": 0.28814828395843506, "rewards/margins": 2.7038336992263794, "rewards/rejected": -2.4156854152679443, "step": 9300 }, { "epoch": 0.4929902207616675, "grad_norm": 49.0, "kl": 0.24813270568847656, "learning_rate": 5e-07, "logits/chosen": -19394452.0, "logits/rejected": -6967280.0, "logps/chosen": -324.95684814453125, "logps/rejected": -125.75125122070312, "loss": 0.3865, "rewards/chosen": -0.23902417719364166, "rewards/margins": 1.1449763625860214, "rewards/rejected": -1.384000539779663, "step": 9301 }, { "epoch": 0.4930432247634696, "grad_norm": 34.75, "kl": 2.384429931640625, "learning_rate": 5e-07, "logits/chosen": -22152166.666666668, "logits/rejected": -23873150.0, "logps/chosen": -372.8030598958333, "logps/rejected": -381.17242431640625, "loss": 0.3557, "rewards/chosen": 0.7538367112477621, "rewards/margins": 3.747711737950643, "rewards/rejected": -2.993875026702881, "step": 9302 }, { "epoch": 0.49309622876527176, "grad_norm": 47.0, "kl": 0.6959209442138672, "learning_rate": 5e-07, "logits/chosen": -25027666.285714287, "logits/rejected": 2026942.25, "logps/chosen": -317.4995814732143, "logps/rejected": -134.8489990234375, "loss": 0.2911, "rewards/chosen": 1.1107213156563895, "rewards/margins": 4.688105549131121, "rewards/rejected": -3.5773842334747314, "step": 9303 }, { "epoch": 0.4931492327670739, "grad_norm": 40.0, "kl": 0.9135189056396484, "learning_rate": 5e-07, "logits/chosen": -40091160.0, "logits/rejected": -30809792.0, "logps/chosen": -305.51513671875, "logps/rejected": -296.9161682128906, "loss": 0.229, "rewards/chosen": 0.7134921550750732, "rewards/margins": 2.977259397506714, "rewards/rejected": -2.2637672424316406, "step": 9304 }, { "epoch": 0.49320223676887603, "grad_norm": 46.75, "kl": 0.5024480819702148, "learning_rate": 5e-07, "logits/chosen": -37925618.666666664, "logits/rejected": -70936752.0, "logps/chosen": -242.06193033854166, "logps/rejected": -500.83148193359375, "loss": 0.3337, "rewards/chosen": 0.5113420089085897, "rewards/margins": 2.5402865012486777, "rewards/rejected": -2.028944492340088, "step": 9305 }, { "epoch": 0.49325524077067817, "grad_norm": 38.0, "kl": 1.4036836624145508, "learning_rate": 5e-07, "logits/chosen": -10027406.666666666, "logits/rejected": -16516646.4, "logps/chosen": -168.5167236328125, "logps/rejected": -361.612158203125, "loss": 0.2602, "rewards/chosen": 0.3384068012237549, "rewards/margins": 2.411040163040161, "rewards/rejected": -2.0726333618164063, "step": 9306 }, { "epoch": 0.4933082447724803, "grad_norm": 60.75, "kl": 1.6834793090820312, "learning_rate": 5e-07, "logits/chosen": -44867060.0, "logits/rejected": -4740886.5, "logps/chosen": -505.04412841796875, "logps/rejected": -229.55833435058594, "loss": 0.3271, "rewards/chosen": 0.8469176292419434, "rewards/margins": 2.0801682472229004, "rewards/rejected": -1.233250617980957, "step": 9307 }, { "epoch": 0.49336124877428245, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21763040.0, "logits/rejected": -6141898.0, "logps/chosen": -206.52783203125, "logps/rejected": -164.35782877604166, "loss": 0.3782, "rewards/chosen": -0.2105919599533081, "rewards/margins": 2.0219335476557414, "rewards/rejected": -2.2325255076090493, "step": 9308 }, { "epoch": 0.4934142527760846, "grad_norm": 59.75, "kl": 0.1480579376220703, "learning_rate": 5e-07, "logits/chosen": -50175296.0, "logits/rejected": -23934376.0, "logps/chosen": -350.7676513671875, "logps/rejected": -102.57143147786458, "loss": 0.4142, "rewards/chosen": -0.03999362587928772, "rewards/margins": 1.259834506114324, "rewards/rejected": -1.2998281319936116, "step": 9309 }, { "epoch": 0.4934672567778867, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13214525.333333334, "logits/rejected": -24226822.4, "logps/chosen": -439.0035807291667, "logps/rejected": -293.856982421875, "loss": 0.1906, "rewards/chosen": 1.0036590894063313, "rewards/margins": 3.3376749356587725, "rewards/rejected": -2.3340158462524414, "step": 9310 }, { "epoch": 0.49352026077968886, "grad_norm": 40.25, "kl": 0.08075332641601562, "learning_rate": 5e-07, "logits/chosen": -47301866.666666664, "logits/rejected": -64715840.0, "logps/chosen": -161.10186767578125, "logps/rejected": -306.597509765625, "loss": 0.2581, "rewards/chosen": 0.6386291980743408, "rewards/margins": 2.3821081638336183, "rewards/rejected": -1.7434789657592773, "step": 9311 }, { "epoch": 0.493573264781491, "grad_norm": 58.0, "kl": 0.06386852264404297, "learning_rate": 5e-07, "logits/chosen": -22551440.0, "logits/rejected": -13445957.333333334, "logps/chosen": -271.634423828125, "logps/rejected": -595.6718343098959, "loss": 0.3407, "rewards/chosen": 0.2817979335784912, "rewards/margins": 2.6848539193471272, "rewards/rejected": -2.403055985768636, "step": 9312 }, { "epoch": 0.49362626878329313, "grad_norm": 37.75, "kl": 0.7032871246337891, "learning_rate": 5e-07, "logits/chosen": -2115151.6666666665, "logits/rejected": 135487296.0, "logps/chosen": -120.43857828776042, "logps/rejected": -253.42890625, "loss": 0.2737, "rewards/chosen": 0.3140767415364583, "rewards/margins": 2.400934918721517, "rewards/rejected": -2.0868581771850585, "step": 9313 }, { "epoch": 0.49367927278509527, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64204149.333333336, "logits/rejected": -25666588.8, "logps/chosen": -551.6284586588541, "logps/rejected": -172.9676025390625, "loss": 0.2387, "rewards/chosen": 0.45145368576049805, "rewards/margins": 2.254445934295654, "rewards/rejected": -1.8029922485351562, "step": 9314 }, { "epoch": 0.4937322767868974, "grad_norm": 35.5, "kl": 4.007221221923828, "learning_rate": 5e-07, "logits/chosen": -21312882.0, "logits/rejected": -52881328.0, "logps/chosen": -349.97418212890625, "logps/rejected": -298.3140869140625, "loss": 0.2347, "rewards/chosen": 0.875156581401825, "rewards/margins": 3.412365138530731, "rewards/rejected": -2.5372085571289062, "step": 9315 }, { "epoch": 0.49378528078869954, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85558672.0, "logits/rejected": -78089168.0, "logps/chosen": -352.74493408203125, "logps/rejected": -324.42388916015625, "loss": 0.2676, "rewards/chosen": 0.2305809110403061, "rewards/margins": 2.6228080838918686, "rewards/rejected": -2.3922271728515625, "step": 9316 }, { "epoch": 0.4938382847905017, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41995988.0, "logits/rejected": -73374096.0, "logps/chosen": -186.997314453125, "logps/rejected": -381.0613098144531, "loss": 0.4051, "rewards/chosen": -0.3706901967525482, "rewards/margins": 1.0533811151981354, "rewards/rejected": -1.4240713119506836, "step": 9317 }, { "epoch": 0.4938912887923038, "grad_norm": 51.75, "kl": 1.158803939819336, "learning_rate": 5e-07, "logits/chosen": -50993171.2, "logits/rejected": -61117546.666666664, "logps/chosen": -211.330908203125, "logps/rejected": -565.7333577473959, "loss": 0.3229, "rewards/chosen": 0.2925442695617676, "rewards/margins": 2.6378883361816405, "rewards/rejected": -2.345344066619873, "step": 9318 }, { "epoch": 0.49394429279410595, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86826784.0, "logits/rejected": -22391525.333333332, "logps/chosen": -211.45806884765625, "logps/rejected": -214.80045572916666, "loss": 0.2181, "rewards/chosen": 0.08053360134363174, "rewards/margins": 2.1602006927132607, "rewards/rejected": -2.079667091369629, "step": 9319 }, { "epoch": 0.4939972967959081, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10620442.666666666, "logits/rejected": -52673920.0, "logps/chosen": -447.9182535807292, "logps/rejected": -435.19716796875, "loss": 0.3227, "rewards/chosen": -0.3008168538411458, "rewards/margins": 1.8792280832926431, "rewards/rejected": -2.180044937133789, "step": 9320 }, { "epoch": 0.49405030079771023, "grad_norm": 42.5, "kl": 0.8698348999023438, "learning_rate": 5e-07, "logits/chosen": -26893533.333333332, "logits/rejected": -25833068.8, "logps/chosen": -191.427490234375, "logps/rejected": -454.009619140625, "loss": 0.2625, "rewards/chosen": 0.2237188220024109, "rewards/margins": 2.2765867114067078, "rewards/rejected": -2.052867889404297, "step": 9321 }, { "epoch": 0.49410330479951237, "grad_norm": 54.0, "kl": 2.0109710693359375, "learning_rate": 5e-07, "logits/chosen": -35366153.6, "logits/rejected": -64717349.333333336, "logps/chosen": -403.4107666015625, "logps/rejected": -317.788818359375, "loss": 0.3897, "rewards/chosen": 0.32663545608520506, "rewards/margins": 1.9335061073303224, "rewards/rejected": -1.6068706512451172, "step": 9322 }, { "epoch": 0.4941563088013145, "grad_norm": 55.25, "kl": 0.19036865234375, "learning_rate": 5e-07, "logits/chosen": -30043584.0, "logits/rejected": 634836.6666666666, "logps/chosen": -210.175390625, "logps/rejected": -227.39449055989584, "loss": 0.4424, "rewards/chosen": 0.17893083095550538, "rewards/margins": 0.5589596350987752, "rewards/rejected": -0.38002880414326984, "step": 9323 }, { "epoch": 0.49420931280311664, "grad_norm": 91.0, "kl": 3.5252199172973633, "learning_rate": 5e-07, "logits/chosen": -50670994.28571428, "logits/rejected": 152528608.0, "logps/chosen": -328.55733816964283, "logps/rejected": -282.30859375, "loss": 0.4785, "rewards/chosen": 0.3835056168692453, "rewards/margins": 0.7675144118922097, "rewards/rejected": -0.3840087950229645, "step": 9324 }, { "epoch": 0.4942623168049188, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62680216.0, "logits/rejected": -15491957.333333334, "logps/chosen": -757.950927734375, "logps/rejected": -226.7464803059896, "loss": 0.1398, "rewards/chosen": 1.66678786277771, "rewards/margins": 3.8659420808156333, "rewards/rejected": -2.1991542180379233, "step": 9325 }, { "epoch": 0.4943153208067209, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10046460.8, "logits/rejected": -11006505.333333334, "logps/chosen": -241.8640869140625, "logps/rejected": -311.8924967447917, "loss": 0.3085, "rewards/chosen": 0.6826820373535156, "rewards/margins": 2.004501501719157, "rewards/rejected": -1.3218194643656414, "step": 9326 }, { "epoch": 0.49436832480852305, "grad_norm": 71.0, "kl": 0.6129245758056641, "learning_rate": 5e-07, "logits/chosen": 104737664.0, "logits/rejected": -20459380.0, "logps/chosen": -426.994384765625, "logps/rejected": -223.0145263671875, "loss": 0.2922, "rewards/chosen": 0.7387226819992065, "rewards/margins": 2.2859623432159424, "rewards/rejected": -1.5472396612167358, "step": 9327 }, { "epoch": 0.4944213288103252, "grad_norm": 55.0, "kl": 0.705535888671875, "learning_rate": 5e-07, "logits/chosen": -7605218.666666667, "logits/rejected": -9122755.2, "logps/chosen": -167.84601847330728, "logps/rejected": -523.273095703125, "loss": 0.292, "rewards/chosen": 0.198199729124705, "rewards/margins": 2.1458420554796853, "rewards/rejected": -1.9476423263549805, "step": 9328 }, { "epoch": 0.4944743328121273, "grad_norm": 55.25, "kl": 0.9094161987304688, "learning_rate": 5e-07, "logits/chosen": -41388691.2, "logits/rejected": -66254746.666666664, "logps/chosen": -765.51923828125, "logps/rejected": -565.694580078125, "loss": 0.204, "rewards/chosen": 1.3321258544921875, "rewards/margins": 4.174986712137858, "rewards/rejected": -2.8428608576456704, "step": 9329 }, { "epoch": 0.49452733681392946, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38568582.4, "logits/rejected": -42671178.666666664, "logps/chosen": -391.109765625, "logps/rejected": -508.8521728515625, "loss": 0.332, "rewards/chosen": 0.2912908554077148, "rewards/margins": 3.546576754252116, "rewards/rejected": -3.255285898844401, "step": 9330 }, { "epoch": 0.4945803408157316, "grad_norm": 46.5, "kl": 1.3981170654296875, "learning_rate": 5e-07, "logits/chosen": -30234762.666666668, "logits/rejected": -12766403.2, "logps/chosen": -579.3962809244791, "logps/rejected": -241.016357421875, "loss": 0.1729, "rewards/chosen": 1.0018195311228435, "rewards/margins": 4.074227444330852, "rewards/rejected": -3.072407913208008, "step": 9331 }, { "epoch": 0.49463334481753374, "grad_norm": 44.25, "kl": 0.11332321166992188, "learning_rate": 5e-07, "logits/chosen": -29094380.0, "logits/rejected": -66975436.0, "logps/chosen": -264.91680908203125, "logps/rejected": -265.3470458984375, "loss": 0.2295, "rewards/chosen": 0.8666417002677917, "rewards/margins": 2.873356521129608, "rewards/rejected": -2.0067148208618164, "step": 9332 }, { "epoch": 0.4946863488193359, "grad_norm": 58.5, "kl": 0.12072372436523438, "learning_rate": 5e-07, "logits/chosen": -22407948.0, "logits/rejected": -40478757.333333336, "logps/chosen": -135.79454040527344, "logps/rejected": -471.4058024088542, "loss": 0.2362, "rewards/chosen": 0.18398666381835938, "rewards/margins": 2.1471924781799316, "rewards/rejected": -1.9632058143615723, "step": 9333 }, { "epoch": 0.494739352821138, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45004448.0, "logits/rejected": -44035578.666666664, "logps/chosen": -503.8407287597656, "logps/rejected": -532.9346516927084, "loss": 0.1227, "rewards/chosen": 1.0771392583847046, "rewards/margins": 4.006109595298767, "rewards/rejected": -2.9289703369140625, "step": 9334 }, { "epoch": 0.49479235682294015, "grad_norm": 38.0, "kl": 2.0959854125976562, "learning_rate": 5e-07, "logits/chosen": -33702868.0, "logits/rejected": -28875466.0, "logps/chosen": -304.07366943359375, "logps/rejected": -214.39727783203125, "loss": 0.3094, "rewards/chosen": 0.3352717459201813, "rewards/margins": 2.540574222803116, "rewards/rejected": -2.2053024768829346, "step": 9335 }, { "epoch": 0.4948453608247423, "grad_norm": 58.75, "kl": 3.354318618774414, "learning_rate": 5e-07, "logits/chosen": -51352725.333333336, "logits/rejected": -20378387.2, "logps/chosen": -480.5262858072917, "logps/rejected": -197.5802001953125, "loss": 0.3494, "rewards/chosen": -0.2268758217493693, "rewards/margins": 1.2269376357396442, "rewards/rejected": -1.4538134574890136, "step": 9336 }, { "epoch": 0.4948983648265444, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74478216.0, "logits/rejected": -85710101.33333333, "logps/chosen": -348.2795104980469, "logps/rejected": -507.17578125, "loss": 0.1665, "rewards/chosen": 0.664925217628479, "rewards/margins": 3.1040836572647095, "rewards/rejected": -2.4391584396362305, "step": 9337 }, { "epoch": 0.49495136882834656, "grad_norm": 57.25, "kl": 1.2117977142333984, "learning_rate": 5e-07, "logits/chosen": -50305056.0, "logits/rejected": -5775427.0, "logps/chosen": -277.2654215494792, "logps/rejected": -141.7354736328125, "loss": 0.3825, "rewards/chosen": 0.48419805367787677, "rewards/margins": 1.9276833136876423, "rewards/rejected": -1.4434852600097656, "step": 9338 }, { "epoch": 0.4950043728301487, "grad_norm": 54.75, "kl": 3.1797103881835938, "learning_rate": 5e-07, "logits/chosen": -40630240.0, "logits/rejected": -16164928.0, "logps/chosen": -290.79990234375, "logps/rejected": -552.438720703125, "loss": 0.3738, "rewards/chosen": 0.14979902505874634, "rewards/margins": 2.8377100586891175, "rewards/rejected": -2.687911033630371, "step": 9339 }, { "epoch": 0.49505737683195083, "grad_norm": 38.75, "kl": 0.6236391067504883, "learning_rate": 5e-07, "logits/chosen": -20477236.0, "logits/rejected": -4275997.0, "logps/chosen": -138.63268025716147, "logps/rejected": -72.66568756103516, "loss": 0.3768, "rewards/chosen": 0.1476348340511322, "rewards/margins": 2.5984321534633636, "rewards/rejected": -2.4507973194122314, "step": 9340 }, { "epoch": 0.49511038083375297, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26020522.0, "logits/rejected": 1121892.0, "logps/chosen": -98.8925552368164, "logps/rejected": -408.4153645833333, "loss": 0.2055, "rewards/chosen": -0.021245386451482773, "rewards/margins": 2.4602789220710597, "rewards/rejected": -2.4815243085225425, "step": 9341 }, { "epoch": 0.4951633848355551, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17607117.333333332, "logits/rejected": -52322744.0, "logps/chosen": -215.2169189453125, "logps/rejected": -367.17926025390625, "loss": 0.3582, "rewards/chosen": 0.24458499749501547, "rewards/margins": 2.345406730969747, "rewards/rejected": -2.1008217334747314, "step": 9342 }, { "epoch": 0.49521638883735725, "grad_norm": 46.25, "kl": 0.11609363555908203, "learning_rate": 5e-07, "logits/chosen": -34680921.6, "logits/rejected": -7644466.666666667, "logps/chosen": -165.1470458984375, "logps/rejected": -298.2957763671875, "loss": 0.3369, "rewards/chosen": 0.3839179754257202, "rewards/margins": 2.15290531317393, "rewards/rejected": -1.7689873377482097, "step": 9343 }, { "epoch": 0.4952693928391594, "grad_norm": 30.5, "kl": 1.3913803100585938, "learning_rate": 5e-07, "logits/chosen": -95498880.0, "logits/rejected": -45659739.428571425, "logps/chosen": -416.77685546875, "logps/rejected": -343.85679408482144, "loss": 0.1114, "rewards/chosen": 0.880139172077179, "rewards/margins": 3.938355267047882, "rewards/rejected": -3.058216094970703, "step": 9344 }, { "epoch": 0.49532239684096147, "grad_norm": 71.0, "kl": 3.084756851196289, "learning_rate": 5e-07, "logits/chosen": -34901557.333333336, "logits/rejected": -19196688.0, "logps/chosen": -663.8044840494791, "logps/rejected": -294.5987548828125, "loss": 0.2425, "rewards/chosen": 1.2112948099772136, "rewards/margins": 3.7977034250895185, "rewards/rejected": -2.5864086151123047, "step": 9345 }, { "epoch": 0.4953754008427636, "grad_norm": 39.0, "kl": 1.0943336486816406, "learning_rate": 5e-07, "logits/chosen": -37487561.6, "logits/rejected": -1416812.3333333333, "logps/chosen": -224.644140625, "logps/rejected": -135.2536824544271, "loss": 0.331, "rewards/chosen": 0.4963075160980225, "rewards/margins": 2.0650989055633544, "rewards/rejected": -1.568791389465332, "step": 9346 }, { "epoch": 0.49542840484456574, "grad_norm": 45.5, "kl": 0.348052978515625, "learning_rate": 5e-07, "logits/chosen": -92027264.0, "logits/rejected": -13407347.2, "logps/chosen": -631.4141031901041, "logps/rejected": -208.94140625, "loss": 0.2078, "rewards/chosen": 0.9555913607279459, "rewards/margins": 3.169983641306559, "rewards/rejected": -2.2143922805786134, "step": 9347 }, { "epoch": 0.4954814088463679, "grad_norm": 47.75, "kl": 1.4179306030273438, "learning_rate": 5e-07, "logits/chosen": 10390550.4, "logits/rejected": -39456626.666666664, "logps/chosen": -146.75074462890626, "logps/rejected": -461.5654703776042, "loss": 0.4449, "rewards/chosen": -0.18680572509765625, "rewards/margins": 1.783842404683431, "rewards/rejected": -1.9706481297810872, "step": 9348 }, { "epoch": 0.49553441284817, "grad_norm": 37.5, "kl": 1.0562801361083984, "learning_rate": 5e-07, "logits/chosen": -6628984.5, "logits/rejected": -16170210.0, "logps/chosen": -150.366943359375, "logps/rejected": -100.69380187988281, "loss": 0.2113, "rewards/chosen": 0.6032319664955139, "rewards/margins": 3.845806896686554, "rewards/rejected": -3.24257493019104, "step": 9349 }, { "epoch": 0.49558741684997215, "grad_norm": 53.0, "kl": 1.7678956985473633, "learning_rate": 5e-07, "logits/chosen": -28405545.6, "logits/rejected": -4335466.666666667, "logps/chosen": -445.3349609375, "logps/rejected": -315.2973225911458, "loss": 0.2414, "rewards/chosen": 0.990687084197998, "rewards/margins": 2.8696050961812336, "rewards/rejected": -1.8789180119832356, "step": 9350 }, { "epoch": 0.4956404208517743, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19707680.0, "logits/rejected": -44238488.0, "logps/chosen": -292.958251953125, "logps/rejected": -547.5291341145834, "loss": 0.3271, "rewards/chosen": 0.21092548370361328, "rewards/margins": 2.3351860682169594, "rewards/rejected": -2.124260584513346, "step": 9351 }, { "epoch": 0.4956934248535764, "grad_norm": 44.0, "kl": 3.859823226928711, "learning_rate": 5e-07, "logits/chosen": -8922140.0, "logits/rejected": -18924952.0, "logps/chosen": -189.30413818359375, "logps/rejected": -135.75542195638022, "loss": 0.3435, "rewards/chosen": 0.5874626159667968, "rewards/margins": 3.2787328084309895, "rewards/rejected": -2.691270192464193, "step": 9352 }, { "epoch": 0.49574642885537856, "grad_norm": 41.0, "kl": 2.512030601501465, "learning_rate": 5e-07, "logits/chosen": -19127408.0, "logits/rejected": -42578656.0, "logps/chosen": -785.1383056640625, "logps/rejected": -397.6916809082031, "loss": 0.2586, "rewards/chosen": 0.7098658084869385, "rewards/margins": 3.3716912269592285, "rewards/rejected": -2.66182541847229, "step": 9353 }, { "epoch": 0.4957994328571807, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27793248.0, "logits/rejected": -16235376.0, "logps/chosen": -515.157470703125, "logps/rejected": -767.5947265625, "loss": 0.226, "rewards/chosen": 1.0455724716186523, "rewards/margins": 3.4294387817382814, "rewards/rejected": -2.383866310119629, "step": 9354 }, { "epoch": 0.49585243685898284, "grad_norm": 41.5, "kl": 0.3193359375, "learning_rate": 5e-07, "logits/chosen": -29601125.333333332, "logits/rejected": -23033640.0, "logps/chosen": -378.4317626953125, "logps/rejected": -181.9955078125, "loss": 0.2004, "rewards/chosen": 1.1863166491190593, "rewards/margins": 2.8891055742899576, "rewards/rejected": -1.7027889251708985, "step": 9355 }, { "epoch": 0.495905440860785, "grad_norm": 50.25, "kl": 0.025600433349609375, "learning_rate": 5e-07, "logits/chosen": -99109384.0, "logits/rejected": -25530204.0, "logps/chosen": -352.37677001953125, "logps/rejected": -425.4325256347656, "loss": 0.2572, "rewards/chosen": 0.26365339756011963, "rewards/margins": 3.1759854555130005, "rewards/rejected": -2.912332057952881, "step": 9356 }, { "epoch": 0.4959584448625871, "grad_norm": 44.75, "kl": 0.04071807861328125, "learning_rate": 5e-07, "logits/chosen": -29148968.0, "logits/rejected": -13625848.0, "logps/chosen": -279.433349609375, "logps/rejected": -225.25436401367188, "loss": 0.2716, "rewards/chosen": 0.5748843550682068, "rewards/margins": 2.391291320323944, "rewards/rejected": -1.8164069652557373, "step": 9357 }, { "epoch": 0.49601144886438925, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39619988.0, "logits/rejected": -12983186.666666666, "logps/chosen": -426.47589111328125, "logps/rejected": -341.5452880859375, "loss": 0.2162, "rewards/chosen": 1.222325086593628, "rewards/margins": 2.7678234577178955, "rewards/rejected": -1.5454983711242676, "step": 9358 }, { "epoch": 0.4960644528661914, "grad_norm": 43.75, "kl": 2.5715560913085938, "learning_rate": 5e-07, "logits/chosen": -19995640.0, "logits/rejected": -121773160.0, "logps/chosen": -248.02891540527344, "logps/rejected": -433.77978515625, "loss": 0.2932, "rewards/chosen": 0.5515362024307251, "rewards/margins": 3.2165709733963013, "rewards/rejected": -2.665034770965576, "step": 9359 }, { "epoch": 0.4961174568679935, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2822378.25, "logits/rejected": 63541920.0, "logps/chosen": -151.77633666992188, "logps/rejected": -452.6773681640625, "loss": 0.2779, "rewards/chosen": 0.5011216402053833, "rewards/margins": 2.432670831680298, "rewards/rejected": -1.9315491914749146, "step": 9360 }, { "epoch": 0.49617046086979566, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10298672.0, "logits/rejected": -40383322.666666664, "logps/chosen": -301.54681396484375, "logps/rejected": -408.6238606770833, "loss": 0.1921, "rewards/chosen": 0.4928955137729645, "rewards/margins": 3.2317842543125153, "rewards/rejected": -2.738888740539551, "step": 9361 }, { "epoch": 0.4962234648715978, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 45948296.0, "logits/rejected": -13035409.142857144, "logps/chosen": -919.8118896484375, "logps/rejected": -243.00422014508928, "loss": 0.221, "rewards/chosen": 1.699072241783142, "rewards/margins": 3.0488788911274503, "rewards/rejected": -1.349806649344308, "step": 9362 }, { "epoch": 0.49627646887339993, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40501245.333333336, "logits/rejected": -31392313.6, "logps/chosen": -515.6259358723959, "logps/rejected": -250.211279296875, "loss": 0.2923, "rewards/chosen": 1.4460724194844563, "rewards/margins": 2.387783749898275, "rewards/rejected": -0.9417113304138184, "step": 9363 }, { "epoch": 0.49632947287520207, "grad_norm": 37.0, "kl": 0.29158973693847656, "learning_rate": 5e-07, "logits/chosen": -7443842.5, "logits/rejected": -31054116.0, "logps/chosen": -261.0024719238281, "logps/rejected": -225.3972625732422, "loss": 0.2285, "rewards/chosen": 0.966977596282959, "rewards/margins": 3.195606231689453, "rewards/rejected": -2.228628635406494, "step": 9364 }, { "epoch": 0.4963824768770042, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74797288.0, "logits/rejected": -8839646.0, "logps/chosen": -414.67230224609375, "logps/rejected": -143.03219604492188, "loss": 0.3316, "rewards/chosen": 0.36645424365997314, "rewards/margins": 1.5470410585403442, "rewards/rejected": -1.180586814880371, "step": 9365 }, { "epoch": 0.49643548087880635, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14500958.4, "logits/rejected": -19002778.666666668, "logps/chosen": -86.71097412109376, "logps/rejected": -315.7192789713542, "loss": 0.4056, "rewards/chosen": -0.023568111658096313, "rewards/margins": 1.290837647517522, "rewards/rejected": -1.3144057591756184, "step": 9366 }, { "epoch": 0.4964884848806085, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7837072.0, "logits/rejected": -22240230.4, "logps/chosen": -307.0163167317708, "logps/rejected": -211.7078125, "loss": 0.273, "rewards/chosen": 0.23310470581054688, "rewards/margins": 2.1538118362426757, "rewards/rejected": -1.9207071304321288, "step": 9367 }, { "epoch": 0.4965414888824106, "grad_norm": 42.75, "kl": 2.670513153076172, "learning_rate": 5e-07, "logits/chosen": -18131650.285714287, "logits/rejected": -24431940.0, "logps/chosen": -175.26942661830358, "logps/rejected": -85.66455078125, "loss": 0.384, "rewards/chosen": 0.5197784560067313, "rewards/margins": 3.1634931223733087, "rewards/rejected": -2.643714666366577, "step": 9368 }, { "epoch": 0.49659449288421276, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3347427.6666666665, "logits/rejected": -31871827.2, "logps/chosen": -166.755615234375, "logps/rejected": -368.37216796875, "loss": 0.2531, "rewards/chosen": 0.6399175326029459, "rewards/margins": 2.4774518648783364, "rewards/rejected": -1.8375343322753905, "step": 9369 }, { "epoch": 0.4966474968860149, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57141296.0, "logits/rejected": -18480304.0, "logps/chosen": -509.0944519042969, "logps/rejected": -323.95835367838544, "loss": 0.1957, "rewards/chosen": 0.6660522222518921, "rewards/margins": 2.652301987012227, "rewards/rejected": -1.9862497647603352, "step": 9370 }, { "epoch": 0.49670050088781703, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8691821.333333334, "logits/rejected": -59834169.6, "logps/chosen": -196.01114908854166, "logps/rejected": -703.52119140625, "loss": 0.2243, "rewards/chosen": -0.15688323974609375, "rewards/margins": 3.2650360107421874, "rewards/rejected": -3.421919250488281, "step": 9371 }, { "epoch": 0.49675350488961917, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25024922.666666668, "logits/rejected": -21734584.0, "logps/chosen": -669.592529296875, "logps/rejected": -276.831640625, "loss": 0.2136, "rewards/chosen": 0.6721343994140625, "rewards/margins": 2.810444450378418, "rewards/rejected": -2.1383100509643556, "step": 9372 }, { "epoch": 0.4968065088914213, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5976555.428571428, "logits/rejected": -158792640.0, "logps/chosen": -343.41053989955356, "logps/rejected": -1093.5654296875, "loss": 0.3611, "rewards/chosen": 0.40154525211879183, "rewards/margins": 6.005415167127337, "rewards/rejected": -5.603869915008545, "step": 9373 }, { "epoch": 0.49685951289322344, "grad_norm": 45.25, "kl": 0.778289794921875, "learning_rate": 5e-07, "logits/chosen": -12796095.0, "logits/rejected": -12175321.0, "logps/chosen": -141.3694305419922, "logps/rejected": -285.9617919921875, "loss": 0.4462, "rewards/chosen": -0.7249776721000671, "rewards/margins": 0.5252209305763245, "rewards/rejected": -1.2501986026763916, "step": 9374 }, { "epoch": 0.4969125168950256, "grad_norm": 48.5, "kl": 2.9582138061523438, "learning_rate": 5e-07, "logits/chosen": -43129452.0, "logits/rejected": -19810556.0, "logps/chosen": -446.951171875, "logps/rejected": -290.7049255371094, "loss": 0.2734, "rewards/chosen": 1.107161045074463, "rewards/margins": 2.7168174982070923, "rewards/rejected": -1.6096564531326294, "step": 9375 }, { "epoch": 0.4969655208968277, "grad_norm": 46.75, "kl": 1.0105819702148438, "learning_rate": 5e-07, "logits/chosen": -40851286.4, "logits/rejected": 1238479.3333333333, "logps/chosen": -301.190283203125, "logps/rejected": -79.62829081217448, "loss": 0.382, "rewards/chosen": 0.3781465768814087, "rewards/margins": 1.7909135103225708, "rewards/rejected": -1.412766933441162, "step": 9376 }, { "epoch": 0.49701852489862985, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 282701.5, "logits/rejected": -8530998.0, "logps/chosen": -87.12020874023438, "logps/rejected": -299.78110758463544, "loss": 0.2506, "rewards/chosen": 0.19113865494728088, "rewards/margins": 1.8234441181023915, "rewards/rejected": -1.6323054631551106, "step": 9377 }, { "epoch": 0.497071528900432, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10048792.8, "logits/rejected": -76420938.66666667, "logps/chosen": -168.46103515625, "logps/rejected": -591.1153157552084, "loss": 0.3008, "rewards/chosen": 0.21241025924682616, "rewards/margins": 3.5005165417989095, "rewards/rejected": -3.2881062825520835, "step": 9378 }, { "epoch": 0.49712453290223413, "grad_norm": 46.25, "kl": 2.0864906311035156, "learning_rate": 5e-07, "logits/chosen": -38092338.666666664, "logits/rejected": -5967223.0, "logps/chosen": -254.45894368489584, "logps/rejected": -262.534912109375, "loss": 0.3412, "rewards/chosen": 0.6026347875595093, "rewards/margins": 3.0567139387130737, "rewards/rejected": -2.4540791511535645, "step": 9379 }, { "epoch": 0.49717753690403627, "grad_norm": 52.0, "kl": 0.3328971862792969, "learning_rate": 5e-07, "logits/chosen": -4536469.6, "logits/rejected": -47562373.333333336, "logps/chosen": -293.5093994140625, "logps/rejected": -387.583740234375, "loss": 0.2745, "rewards/chosen": 0.43306994438171387, "rewards/margins": 2.981144348780314, "rewards/rejected": -2.5480744043986, "step": 9380 }, { "epoch": 0.4972305409058384, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38889376.0, "logits/rejected": -76296096.0, "logps/chosen": -144.245849609375, "logps/rejected": -188.46229553222656, "loss": 0.3347, "rewards/chosen": -0.02869585156440735, "rewards/margins": 1.6111149489879608, "rewards/rejected": -1.6398108005523682, "step": 9381 }, { "epoch": 0.49728354490764054, "grad_norm": 46.75, "kl": 0.8952541351318359, "learning_rate": 5e-07, "logits/chosen": -56861.8, "logits/rejected": 405482.0, "logps/chosen": -123.781396484375, "logps/rejected": -724.8040364583334, "loss": 0.3362, "rewards/chosen": 0.23974099159240722, "rewards/margins": 2.492287302017212, "rewards/rejected": -2.2525463104248047, "step": 9382 }, { "epoch": 0.4973365489094427, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32182112.0, "logits/rejected": -24145333.333333332, "logps/chosen": -133.73204040527344, "logps/rejected": -467.4586588541667, "loss": 0.196, "rewards/chosen": 0.24340935051441193, "rewards/margins": 2.9001013785600662, "rewards/rejected": -2.6566920280456543, "step": 9383 }, { "epoch": 0.4973895529112448, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71097578.66666667, "logits/rejected": -38204896.0, "logps/chosen": -47.4569091796875, "logps/rejected": -296.617578125, "loss": 0.3035, "rewards/chosen": -0.440088431040446, "rewards/margins": 1.9842001279195147, "rewards/rejected": -2.4242885589599608, "step": 9384 }, { "epoch": 0.49744255691304695, "grad_norm": 62.5, "kl": 0.5313644409179688, "learning_rate": 5e-07, "logits/chosen": -55348176.0, "logits/rejected": -11092538.0, "logps/chosen": -672.1570638020834, "logps/rejected": -398.6456298828125, "loss": 0.3755, "rewards/chosen": 0.5565861066182455, "rewards/margins": 2.726887067159017, "rewards/rejected": -2.1703009605407715, "step": 9385 }, { "epoch": 0.4974955609148491, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6011586.666666667, "logits/rejected": 49318518.4, "logps/chosen": -150.14816284179688, "logps/rejected": -393.411865234375, "loss": 0.2801, "rewards/chosen": 0.26434510946273804, "rewards/margins": 1.9662911772727967, "rewards/rejected": -1.7019460678100586, "step": 9386 }, { "epoch": 0.4975485649166512, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34606364.8, "logits/rejected": 13668042.666666666, "logps/chosen": -256.4685546875, "logps/rejected": -465.1410725911458, "loss": 0.2912, "rewards/chosen": 0.4531737804412842, "rewards/margins": 2.543648386001587, "rewards/rejected": -2.0904746055603027, "step": 9387 }, { "epoch": 0.49760156891845336, "grad_norm": 58.0, "kl": 2.6456336975097656, "learning_rate": 5e-07, "logits/chosen": -4409516.0, "logits/rejected": -76140248.0, "logps/chosen": -259.83278401692706, "logps/rejected": -487.0728759765625, "loss": 0.3919, "rewards/chosen": 0.14423253138860068, "rewards/margins": 3.0155537327130637, "rewards/rejected": -2.871321201324463, "step": 9388 }, { "epoch": 0.4976545729202555, "grad_norm": 55.0, "kl": 0.04461669921875, "learning_rate": 5e-07, "logits/chosen": -24098485.333333332, "logits/rejected": -27097075.2, "logps/chosen": -290.01658121744794, "logps/rejected": -285.803466796875, "loss": 0.2621, "rewards/chosen": 0.27642873922983807, "rewards/margins": 2.254377500216166, "rewards/rejected": -1.9779487609863282, "step": 9389 }, { "epoch": 0.49770757692205764, "grad_norm": 29.75, "kl": 0.0819234848022461, "learning_rate": 5e-07, "logits/chosen": 3569264.3333333335, "logits/rejected": -10839835.2, "logps/chosen": -38.37636057535807, "logps/rejected": -135.68011474609375, "loss": 0.2732, "rewards/chosen": 1.0663052399953206, "rewards/margins": 2.1041375001271563, "rewards/rejected": -1.037832260131836, "step": 9390 }, { "epoch": 0.4977605809238598, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42351720.0, "logits/rejected": -16893636.0, "logps/chosen": -261.10955810546875, "logps/rejected": -412.768798828125, "loss": 0.3218, "rewards/chosen": 0.06397438049316406, "rewards/margins": 1.8211637735366821, "rewards/rejected": -1.757189393043518, "step": 9391 }, { "epoch": 0.4978135849256619, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29942142.0, "logits/rejected": 3762119.25, "logps/chosen": -250.75709533691406, "logps/rejected": -60.39487838745117, "loss": 0.3974, "rewards/chosen": -0.0811113566160202, "rewards/margins": 1.0113468915224075, "rewards/rejected": -1.0924582481384277, "step": 9392 }, { "epoch": 0.49786658892746405, "grad_norm": 61.0, "kl": 0.07328414916992188, "learning_rate": 5e-07, "logits/chosen": -47563523.2, "logits/rejected": 2442083.1666666665, "logps/chosen": -398.7941650390625, "logps/rejected": -319.44899495442706, "loss": 0.436, "rewards/chosen": -0.07195343971252441, "rewards/margins": 1.1157942771911622, "rewards/rejected": -1.1877477169036865, "step": 9393 }, { "epoch": 0.4979195929292662, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8086487.0, "logits/rejected": -12896360.0, "logps/chosen": -321.6883544921875, "logps/rejected": -227.5726776123047, "loss": 0.2056, "rewards/chosen": 0.8530945181846619, "rewards/margins": 3.5136730074882507, "rewards/rejected": -2.660578489303589, "step": 9394 }, { "epoch": 0.4979725969310683, "grad_norm": 38.5, "kl": 0.8175621032714844, "learning_rate": 5e-07, "logits/chosen": -34324740.0, "logits/rejected": -16718543.0, "logps/chosen": -137.49720764160156, "logps/rejected": -186.75396728515625, "loss": 0.2912, "rewards/chosen": 0.19670629501342773, "rewards/margins": 2.6440131664276123, "rewards/rejected": -2.4473068714141846, "step": 9395 }, { "epoch": 0.4980256009328704, "grad_norm": 57.25, "kl": 0.8588943481445312, "learning_rate": 5e-07, "logits/chosen": 1799548.0, "logits/rejected": -23251950.4, "logps/chosen": -381.5564778645833, "logps/rejected": -380.4463134765625, "loss": 0.2183, "rewards/chosen": 0.6873077551523844, "rewards/margins": 2.7021959463755287, "rewards/rejected": -2.0148881912231444, "step": 9396 }, { "epoch": 0.49807860493467254, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68283440.0, "logits/rejected": -21675638.0, "logps/chosen": -330.5874938964844, "logps/rejected": -342.9045715332031, "loss": 0.2856, "rewards/chosen": 0.058272384107112885, "rewards/margins": 2.893142007291317, "rewards/rejected": -2.834869623184204, "step": 9397 }, { "epoch": 0.4981316089364747, "grad_norm": 44.75, "kl": 1.1691265106201172, "learning_rate": 5e-07, "logits/chosen": -5884998.0, "logits/rejected": -3690335.25, "logps/chosen": -201.77508544921875, "logps/rejected": -331.17889404296875, "loss": 0.28, "rewards/chosen": 0.549115002155304, "rewards/margins": 2.468400776386261, "rewards/rejected": -1.919285774230957, "step": 9398 }, { "epoch": 0.4981846129382768, "grad_norm": 54.25, "kl": 3.0879287719726562, "learning_rate": 5e-07, "logits/chosen": -35818293.333333336, "logits/rejected": 4324111.0, "logps/chosen": -234.86517333984375, "logps/rejected": -271.56207275390625, "loss": 0.4082, "rewards/chosen": 0.29607417186101276, "rewards/margins": 2.489480276902517, "rewards/rejected": -2.193406105041504, "step": 9399 }, { "epoch": 0.49823761694007895, "grad_norm": 54.75, "kl": 0.0396575927734375, "learning_rate": 5e-07, "logits/chosen": -11386439.2, "logits/rejected": -4748771.666666667, "logps/chosen": -603.407470703125, "logps/rejected": -88.56072998046875, "loss": 0.3323, "rewards/chosen": 0.8882358551025391, "rewards/margins": 1.5582815488179524, "rewards/rejected": -0.6700456937154134, "step": 9400 }, { "epoch": 0.4982906209418811, "grad_norm": 55.5, "kl": 1.5421791076660156, "learning_rate": 5e-07, "logits/chosen": -45792147.2, "logits/rejected": -35590973.333333336, "logps/chosen": -344.7056640625, "logps/rejected": -421.1259765625, "loss": 0.3524, "rewards/chosen": 0.4338324546813965, "rewards/margins": 2.4012263298034666, "rewards/rejected": -1.9673938751220703, "step": 9401 }, { "epoch": 0.4983436249436832, "grad_norm": 45.0, "kl": 2.2274627685546875, "learning_rate": 5e-07, "logits/chosen": -29611250.666666668, "logits/rejected": -20906616.0, "logps/chosen": -760.4308268229166, "logps/rejected": -158.9368408203125, "loss": 0.1982, "rewards/chosen": 2.227564493815104, "rewards/margins": 3.475092760721842, "rewards/rejected": -1.2475282669067382, "step": 9402 }, { "epoch": 0.49839662894548536, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23606808.0, "logits/rejected": -14901424.0, "logps/chosen": -96.7926025390625, "logps/rejected": -157.0319580078125, "loss": 0.295, "rewards/chosen": 0.6469613313674927, "rewards/margins": 1.790869164466858, "rewards/rejected": -1.1439078330993653, "step": 9403 }, { "epoch": 0.4984496329472875, "grad_norm": 51.75, "kl": 2.0412464141845703, "learning_rate": 5e-07, "logits/chosen": -15384328.0, "logits/rejected": -15154858.666666666, "logps/chosen": -334.6554931640625, "logps/rejected": -250.14213053385416, "loss": 0.2622, "rewards/chosen": 0.851076316833496, "rewards/margins": 3.6420146942138674, "rewards/rejected": -2.790938377380371, "step": 9404 }, { "epoch": 0.49850263694908964, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30110642.0, "logits/rejected": 8866751.0, "logps/chosen": -258.16070556640625, "logps/rejected": -551.1219482421875, "loss": 0.3083, "rewards/chosen": 0.005852796137332916, "rewards/margins": 2.4981847777962685, "rewards/rejected": -2.4923319816589355, "step": 9405 }, { "epoch": 0.4985556409508918, "grad_norm": 52.0, "kl": 2.5616912841796875, "learning_rate": 5e-07, "logits/chosen": -21421545.333333332, "logits/rejected": -55298848.0, "logps/chosen": -269.6090901692708, "logps/rejected": -831.5986328125, "loss": 0.3536, "rewards/chosen": 0.6327314376831055, "rewards/margins": 2.717228651046753, "rewards/rejected": -2.0844972133636475, "step": 9406 }, { "epoch": 0.4986086449526939, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44329408.0, "logits/rejected": -23912578.666666668, "logps/chosen": -680.290283203125, "logps/rejected": -380.5439046223958, "loss": 0.1487, "rewards/chosen": 2.5500519275665283, "rewards/margins": 4.4343704382578535, "rewards/rejected": -1.884318510691325, "step": 9407 }, { "epoch": 0.49866164895449605, "grad_norm": 44.5, "kl": 3.6507062911987305, "learning_rate": 5e-07, "logits/chosen": -37901321.6, "logits/rejected": -2248895.5, "logps/chosen": -438.2123046875, "logps/rejected": -238.70967610677084, "loss": 0.2712, "rewards/chosen": 1.2556151390075683, "rewards/margins": 3.194891039530436, "rewards/rejected": -1.939275900522868, "step": 9408 }, { "epoch": 0.4987146529562982, "grad_norm": 47.25, "kl": 1.608114242553711, "learning_rate": 5e-07, "logits/chosen": -56361648.0, "logits/rejected": -1178327.5, "logps/chosen": -665.4499918619791, "logps/rejected": -508.25439453125, "loss": 0.1158, "rewards/chosen": 1.4509216944376628, "rewards/margins": 4.641486612955729, "rewards/rejected": -3.1905649185180662, "step": 9409 }, { "epoch": 0.4987676569581003, "grad_norm": 53.75, "kl": 0.0986175537109375, "learning_rate": 5e-07, "logits/chosen": -43156808.0, "logits/rejected": -42274240.0, "logps/chosen": -361.3141174316406, "logps/rejected": -328.2661437988281, "loss": 0.3195, "rewards/chosen": 0.09498730301856995, "rewards/margins": 2.190040022134781, "rewards/rejected": -2.095052719116211, "step": 9410 }, { "epoch": 0.49882066095990246, "grad_norm": 59.25, "kl": 0.1525249481201172, "learning_rate": 5e-07, "logits/chosen": -61101472.0, "logits/rejected": -13945810.0, "logps/chosen": -371.0513610839844, "logps/rejected": -227.3209228515625, "loss": 0.3708, "rewards/chosen": 0.439971923828125, "rewards/margins": 1.0907291173934937, "rewards/rejected": -0.6507571935653687, "step": 9411 }, { "epoch": 0.4988736649617046, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 74382944.0, "logits/rejected": -32502474.0, "logps/chosen": -815.6860961914062, "logps/rejected": -205.801025390625, "loss": 0.3108, "rewards/chosen": 1.0422699451446533, "rewards/margins": 1.9551253914833069, "rewards/rejected": -0.9128554463386536, "step": 9412 }, { "epoch": 0.49892666896350674, "grad_norm": 41.5, "kl": 2.366170883178711, "learning_rate": 5e-07, "logits/chosen": -17375050.666666668, "logits/rejected": -4698405.6, "logps/chosen": -580.073974609375, "logps/rejected": -349.432958984375, "loss": 0.252, "rewards/chosen": 1.3850948015848796, "rewards/margins": 2.689234224955241, "rewards/rejected": -1.3041394233703614, "step": 9413 }, { "epoch": 0.4989796729653089, "grad_norm": 68.0, "kl": 0.28972625732421875, "learning_rate": 5e-07, "logits/chosen": -3071170.0, "logits/rejected": -22415960.0, "logps/chosen": -402.0417785644531, "logps/rejected": -216.68801879882812, "loss": 0.2055, "rewards/chosen": 1.3253802061080933, "rewards/margins": 3.396624445915222, "rewards/rejected": -2.071244239807129, "step": 9414 }, { "epoch": 0.499032676967111, "grad_norm": 49.75, "kl": 1.4884061813354492, "learning_rate": 5e-07, "logits/chosen": 11208262.666666666, "logits/rejected": -110535.2, "logps/chosen": -77.68057250976562, "logps/rejected": -216.786572265625, "loss": 0.2975, "rewards/chosen": 0.2044492761294047, "rewards/margins": 1.7686100045839945, "rewards/rejected": -1.5641607284545898, "step": 9415 }, { "epoch": 0.49908568096891315, "grad_norm": 37.5, "kl": 0.36169004440307617, "learning_rate": 5e-07, "logits/chosen": -3657138.5, "logits/rejected": -2547456.25, "logps/chosen": -353.9405517578125, "logps/rejected": -139.50306701660156, "loss": 0.2898, "rewards/chosen": 0.8045071363449097, "rewards/margins": 2.28685462474823, "rewards/rejected": -1.4823474884033203, "step": 9416 }, { "epoch": 0.4991386849707153, "grad_norm": 41.75, "kl": 0.04123497009277344, "learning_rate": 5e-07, "logits/chosen": -1914862.6666666667, "logits/rejected": -40831296.0, "logps/chosen": -108.39930216471355, "logps/rejected": -700.6509399414062, "loss": 0.4119, "rewards/chosen": -0.12585861484209696, "rewards/margins": 3.6776954432328544, "rewards/rejected": -3.803554058074951, "step": 9417 }, { "epoch": 0.4991916889725174, "grad_norm": 28.375, "kl": 2.3774490356445312, "learning_rate": 5e-07, "logits/chosen": -26394414.0, "logits/rejected": -19505414.0, "logps/chosen": -597.8886108398438, "logps/rejected": -226.74546813964844, "loss": 0.1732, "rewards/chosen": 1.894849419593811, "rewards/margins": 3.505397081375122, "rewards/rejected": -1.610547661781311, "step": 9418 }, { "epoch": 0.49924469297431956, "grad_norm": 50.5, "kl": 2.984312057495117, "learning_rate": 5e-07, "logits/chosen": -16005444.8, "logits/rejected": -1503717.3333333333, "logps/chosen": -286.884375, "logps/rejected": -73.1539306640625, "loss": 0.4723, "rewards/chosen": 0.03901016712188721, "rewards/margins": 0.7312291860580444, "rewards/rejected": -0.6922190189361572, "step": 9419 }, { "epoch": 0.4992976969761217, "grad_norm": 36.25, "kl": 0.8294544219970703, "learning_rate": 5e-07, "logits/chosen": -19298026.666666668, "logits/rejected": -15987920.0, "logps/chosen": -182.69622802734375, "logps/rejected": -265.3846435546875, "loss": 0.2129, "rewards/chosen": 0.6318338314692179, "rewards/margins": 3.045100967089335, "rewards/rejected": -2.413267135620117, "step": 9420 }, { "epoch": 0.49935070097792383, "grad_norm": 53.0, "kl": 3.6625022888183594, "learning_rate": 5e-07, "logits/chosen": -19598153.333333332, "logits/rejected": -29420024.0, "logps/chosen": -446.1962890625, "logps/rejected": -197.9066925048828, "loss": 0.3611, "rewards/chosen": 0.9717269738515218, "rewards/margins": 2.5687812169392905, "rewards/rejected": -1.5970542430877686, "step": 9421 }, { "epoch": 0.49940370497972597, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23611772.0, "logits/rejected": -28859560.0, "logps/chosen": -378.8686218261719, "logps/rejected": -145.82542419433594, "loss": 0.2309, "rewards/chosen": 1.216106653213501, "rewards/margins": 3.3306758403778076, "rewards/rejected": -2.1145691871643066, "step": 9422 }, { "epoch": 0.4994567089815281, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9305719.0, "logits/rejected": 20217184.0, "logps/chosen": -270.4972229003906, "logps/rejected": -322.3314208984375, "loss": 0.2197, "rewards/chosen": -0.4397315979003906, "rewards/margins": 2.1170736948649087, "rewards/rejected": -2.5568052927652993, "step": 9423 }, { "epoch": 0.49950971298333025, "grad_norm": 41.0, "kl": 0.15527915954589844, "learning_rate": 5e-07, "logits/chosen": -28746056.0, "logits/rejected": -22664626.666666668, "logps/chosen": -326.21881103515625, "logps/rejected": -237.1565958658854, "loss": 0.2307, "rewards/chosen": -0.04947242885828018, "rewards/margins": 1.8875183090567589, "rewards/rejected": -1.936990737915039, "step": 9424 }, { "epoch": 0.4995627169851324, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56652325.333333336, "logits/rejected": 2135227.2, "logps/chosen": -721.6207682291666, "logps/rejected": -387.5052490234375, "loss": 0.2709, "rewards/chosen": -0.021894335746765137, "rewards/margins": 2.0215138673782347, "rewards/rejected": -2.043408203125, "step": 9425 }, { "epoch": 0.4996157209869345, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6274774.5, "logits/rejected": -45904448.0, "logps/chosen": -183.26522827148438, "logps/rejected": -247.7119598388672, "loss": 0.2408, "rewards/chosen": 0.6992586851119995, "rewards/margins": 2.8683029413223267, "rewards/rejected": -2.169044256210327, "step": 9426 }, { "epoch": 0.49966872498873666, "grad_norm": 45.0, "kl": 0.022247314453125, "learning_rate": 5e-07, "logits/chosen": -34380293.333333336, "logits/rejected": -24786084.0, "logps/chosen": -257.965576171875, "logps/rejected": -315.7560119628906, "loss": 0.2807, "rewards/chosen": 0.8187994956970215, "rewards/margins": 2.5248351097106934, "rewards/rejected": -1.7060356140136719, "step": 9427 }, { "epoch": 0.4997217289905388, "grad_norm": 42.75, "kl": 2.9055557250976562, "learning_rate": 5e-07, "logits/chosen": -54241610.666666664, "logits/rejected": -10929121.6, "logps/chosen": -634.733642578125, "logps/rejected": -262.604638671875, "loss": 0.1439, "rewards/chosen": 1.6761047045389812, "rewards/margins": 4.224689833323161, "rewards/rejected": -2.54858512878418, "step": 9428 }, { "epoch": 0.49977473299234093, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32543539.2, "logits/rejected": -18621109.333333332, "logps/chosen": -684.671240234375, "logps/rejected": -189.883544921875, "loss": 0.3458, "rewards/chosen": 0.5019276142120361, "rewards/margins": 2.1086905002593994, "rewards/rejected": -1.6067628860473633, "step": 9429 }, { "epoch": 0.49982773699414307, "grad_norm": 47.25, "kl": 2.158243179321289, "learning_rate": 5e-07, "logits/chosen": -34195555.2, "logits/rejected": -5003484.666666667, "logps/chosen": -324.7425048828125, "logps/rejected": -51.25885009765625, "loss": 0.3085, "rewards/chosen": 0.7850650310516357, "rewards/margins": 2.523526843388875, "rewards/rejected": -1.7384618123372395, "step": 9430 }, { "epoch": 0.4998807409959452, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6813178.0, "logits/rejected": -43298684.8, "logps/chosen": -776.6175944010416, "logps/rejected": -623.50361328125, "loss": 0.206, "rewards/chosen": 0.6204226016998291, "rewards/margins": 3.7559576511383055, "rewards/rejected": -3.1355350494384764, "step": 9431 }, { "epoch": 0.49993374499774734, "grad_norm": 48.25, "kl": 0.058320045471191406, "learning_rate": 5e-07, "logits/chosen": -36390928.0, "logits/rejected": -33972917.333333336, "logps/chosen": -196.8406219482422, "logps/rejected": -338.84364827473956, "loss": 0.3052, "rewards/chosen": 0.18184947967529297, "rewards/margins": 2.040099461873372, "rewards/rejected": -1.8582499821980794, "step": 9432 }, { "epoch": 0.4999867489995495, "grad_norm": 44.25, "kl": 0.23978233337402344, "learning_rate": 5e-07, "logits/chosen": -23045470.0, "logits/rejected": -3809978.5, "logps/chosen": -283.43768310546875, "logps/rejected": -240.3341064453125, "loss": 0.3615, "rewards/chosen": -0.13880963623523712, "rewards/margins": 1.4469552487134933, "rewards/rejected": -1.5857648849487305, "step": 9433 }, { "epoch": 0.5000397530013516, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14877520.0, "logits/rejected": -5558010.333333333, "logps/chosen": -302.9322814941406, "logps/rejected": -270.283447265625, "loss": 0.2934, "rewards/chosen": -0.2494659423828125, "rewards/margins": 1.3633486429850261, "rewards/rejected": -1.6128145853678386, "step": 9434 }, { "epoch": 0.5000927570031537, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14208014.666666666, "logits/rejected": -40839702.4, "logps/chosen": -113.7550557454427, "logps/rejected": -339.2750244140625, "loss": 0.2651, "rewards/chosen": -0.004873653252919515, "rewards/margins": 2.6075111428896585, "rewards/rejected": -2.612384796142578, "step": 9435 }, { "epoch": 0.5001457610049559, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42195348.0, "logits/rejected": -20271168.0, "logps/chosen": -449.86798095703125, "logps/rejected": -301.525146484375, "loss": 0.4024, "rewards/chosen": -0.003741443157196045, "rewards/margins": 1.306298553943634, "rewards/rejected": -1.31003999710083, "step": 9436 }, { "epoch": 0.500198765006758, "grad_norm": 65.0, "kl": 0.035686492919921875, "learning_rate": 5e-07, "logits/chosen": -35843448.0, "logits/rejected": -11579399.0, "logps/chosen": -305.6620686848958, "logps/rejected": -124.3900146484375, "loss": 0.4432, "rewards/chosen": -0.005576014518737793, "rewards/margins": 1.0576900243759155, "rewards/rejected": -1.0632660388946533, "step": 9437 }, { "epoch": 0.5002517690085602, "grad_norm": 41.25, "kl": 2.3303050994873047, "learning_rate": 5e-07, "logits/chosen": -9802250.0, "logits/rejected": -41208648.0, "logps/chosen": -238.17762756347656, "logps/rejected": -155.13934326171875, "loss": 0.36, "rewards/chosen": 0.6161249279975891, "rewards/margins": 1.9291360974311829, "rewards/rejected": -1.3130111694335938, "step": 9438 }, { "epoch": 0.5003047730103622, "grad_norm": 53.75, "kl": 0.9901046752929688, "learning_rate": 5e-07, "logits/chosen": -35577744.0, "logits/rejected": -31054972.0, "logps/chosen": -398.03271484375, "logps/rejected": -448.7110900878906, "loss": 0.2957, "rewards/chosen": 0.34000852704048157, "rewards/margins": 2.5407924950122833, "rewards/rejected": -2.2007839679718018, "step": 9439 }, { "epoch": 0.5003577770121644, "grad_norm": 62.5, "kl": 4.288278579711914, "learning_rate": 5e-07, "logits/chosen": -8270270.4, "logits/rejected": 78160853.33333333, "logps/chosen": -636.5603515625, "logps/rejected": -531.5804443359375, "loss": 0.2782, "rewards/chosen": 0.927972412109375, "rewards/margins": 3.2350135803222657, "rewards/rejected": -2.3070411682128906, "step": 9440 }, { "epoch": 0.5004107810139665, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 48648912.0, "logits/rejected": 5017898.8, "logps/chosen": -121.33807373046875, "logps/rejected": -183.8375732421875, "loss": 0.2994, "rewards/chosen": 0.24681623776753744, "rewards/margins": 1.6993270715077717, "rewards/rejected": -1.4525108337402344, "step": 9441 }, { "epoch": 0.5004637850157687, "grad_norm": 63.5, "kl": 0.2292633056640625, "learning_rate": 5e-07, "logits/chosen": -44165277.333333336, "logits/rejected": 16706450.0, "logps/chosen": -468.2035319010417, "logps/rejected": -351.90484619140625, "loss": 0.3333, "rewards/chosen": 0.6666041612625122, "rewards/margins": 2.2278010845184326, "rewards/rejected": -1.5611969232559204, "step": 9442 }, { "epoch": 0.5005167890175708, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47522264.0, "logits/rejected": -54902104.0, "logps/chosen": -350.89501953125, "logps/rejected": -582.439697265625, "loss": 0.2454, "rewards/chosen": 0.3954077959060669, "rewards/margins": 3.672098994255066, "rewards/rejected": -3.276691198348999, "step": 9443 }, { "epoch": 0.500569793019373, "grad_norm": 48.25, "kl": 0.49776458740234375, "learning_rate": 5e-07, "logits/chosen": -38184204.0, "logits/rejected": -20391788.0, "logps/chosen": -259.9938049316406, "logps/rejected": -319.63226318359375, "loss": 0.3521, "rewards/chosen": -0.22200697660446167, "rewards/margins": 2.330855667591095, "rewards/rejected": -2.5528626441955566, "step": 9444 }, { "epoch": 0.5006227970211751, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21118736.0, "logits/rejected": -32462064.0, "logps/chosen": -183.69315592447916, "logps/rejected": -327.554345703125, "loss": 0.2571, "rewards/chosen": 0.2603928844134013, "rewards/margins": 2.3820491115252174, "rewards/rejected": -2.1216562271118162, "step": 9445 }, { "epoch": 0.5006758010229773, "grad_norm": 60.25, "kl": 0.6502761840820312, "learning_rate": 5e-07, "logits/chosen": -58120800.0, "logits/rejected": -28189024.0, "logps/chosen": -483.5356038411458, "logps/rejected": -321.536279296875, "loss": 0.3013, "rewards/chosen": 0.3242899576822917, "rewards/margins": 1.8851767222086588, "rewards/rejected": -1.560886764526367, "step": 9446 }, { "epoch": 0.5007288050247793, "grad_norm": 55.5, "kl": 1.1837406158447266, "learning_rate": 5e-07, "logits/chosen": -12026853.333333334, "logits/rejected": -20217824.0, "logps/chosen": -166.94034830729166, "logps/rejected": -267.4659729003906, "loss": 0.3586, "rewards/chosen": 0.9413272539774576, "rewards/margins": 1.06194798151652, "rewards/rejected": -0.1206207275390625, "step": 9447 }, { "epoch": 0.5007818090265815, "grad_norm": 54.75, "kl": 1.4258670806884766, "learning_rate": 5e-07, "logits/chosen": -24579776.0, "logits/rejected": -17854342.0, "logps/chosen": -298.358154296875, "logps/rejected": -307.8004150390625, "loss": 0.3063, "rewards/chosen": 0.882275402545929, "rewards/margins": 2.2794094681739807, "rewards/rejected": -1.3971340656280518, "step": 9448 }, { "epoch": 0.5008348130283836, "grad_norm": 57.25, "kl": 2.558349609375, "learning_rate": 5e-07, "logits/chosen": -44344124.0, "logits/rejected": -29077106.666666668, "logps/chosen": -1310.134033203125, "logps/rejected": -374.19287109375, "loss": 0.1551, "rewards/chosen": 1.1031097173690796, "rewards/margins": 3.471558610598246, "rewards/rejected": -2.3684488932291665, "step": 9449 }, { "epoch": 0.5008878170301858, "grad_norm": 44.0, "kl": 0.8080930709838867, "learning_rate": 5e-07, "logits/chosen": -54119718.4, "logits/rejected": -42735584.0, "logps/chosen": -351.49501953125, "logps/rejected": -921.4962565104166, "loss": 0.2823, "rewards/chosen": 0.5025091171264648, "rewards/margins": 3.4055935541788735, "rewards/rejected": -2.9030844370524087, "step": 9450 }, { "epoch": 0.5009408210319879, "grad_norm": 53.25, "kl": 0.7148551940917969, "learning_rate": 5e-07, "logits/chosen": -4472840.0, "logits/rejected": -51740394.666666664, "logps/chosen": -453.746923828125, "logps/rejected": -470.8344319661458, "loss": 0.2061, "rewards/chosen": 1.190888023376465, "rewards/margins": 3.6983444849650065, "rewards/rejected": -2.5074564615885415, "step": 9451 }, { "epoch": 0.5009938250337901, "grad_norm": 51.75, "kl": 1.1864967346191406, "learning_rate": 5e-07, "logits/chosen": -26071499.2, "logits/rejected": -18106852.0, "logps/chosen": -278.6796142578125, "logps/rejected": -375.8557535807292, "loss": 0.2772, "rewards/chosen": 0.6387716293334961, "rewards/margins": 3.4026870091756187, "rewards/rejected": -2.7639153798421225, "step": 9452 }, { "epoch": 0.5010468290355922, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72024304.0, "logits/rejected": -9779297.6, "logps/chosen": -351.9790852864583, "logps/rejected": -165.6313720703125, "loss": 0.2595, "rewards/chosen": 0.2824920415878296, "rewards/margins": 2.1802242040634154, "rewards/rejected": -1.8977321624755858, "step": 9453 }, { "epoch": 0.5010998330373944, "grad_norm": 83.0, "kl": 2.3846397399902344, "learning_rate": 5e-07, "logits/chosen": -49268204.8, "logits/rejected": -5592445.333333333, "logps/chosen": -339.2107177734375, "logps/rejected": -178.65616861979166, "loss": 0.3445, "rewards/chosen": 0.982391357421875, "rewards/margins": 1.9575783411661782, "rewards/rejected": -0.9751869837443033, "step": 9454 }, { "epoch": 0.5011528370391964, "grad_norm": 52.5, "kl": 2.173637866973877, "learning_rate": 5e-07, "logits/chosen": -2839365.2, "logits/rejected": -44426389.333333336, "logps/chosen": -281.13681640625, "logps/rejected": -448.9178059895833, "loss": 0.3687, "rewards/chosen": 0.3476558685302734, "rewards/margins": 2.820059076944987, "rewards/rejected": -2.4724032084147134, "step": 9455 }, { "epoch": 0.5012058410409986, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33612528.0, "logits/rejected": 29310402.666666668, "logps/chosen": -207.43943786621094, "logps/rejected": -452.8817952473958, "loss": 0.2801, "rewards/chosen": 0.21628788113594055, "rewards/margins": 1.8130638897418976, "rewards/rejected": -1.596776008605957, "step": 9456 }, { "epoch": 0.5012588450428007, "grad_norm": 38.75, "kl": 0.2798633575439453, "learning_rate": 5e-07, "logits/chosen": -79633125.33333333, "logits/rejected": -31374451.2, "logps/chosen": -267.21327718098956, "logps/rejected": -371.921826171875, "loss": 0.2009, "rewards/chosen": 1.0401901404062908, "rewards/margins": 3.4905357519785563, "rewards/rejected": -2.4503456115722657, "step": 9457 }, { "epoch": 0.5013118490446029, "grad_norm": 62.0, "kl": 3.005756378173828, "learning_rate": 5e-07, "logits/chosen": -39215769.6, "logits/rejected": 2027052.6666666667, "logps/chosen": -1055.7041015625, "logps/rejected": -159.3768107096354, "loss": 0.1973, "rewards/chosen": 1.9477386474609375, "rewards/margins": 4.358459949493408, "rewards/rejected": -2.4107213020324707, "step": 9458 }, { "epoch": 0.501364853046405, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12105506.4, "logits/rejected": -33632216.0, "logps/chosen": -264.17705078125, "logps/rejected": -583.1971028645834, "loss": 0.2584, "rewards/chosen": 0.6167703151702881, "rewards/margins": 3.971077489852905, "rewards/rejected": -3.354307174682617, "step": 9459 }, { "epoch": 0.5014178570482072, "grad_norm": 49.25, "kl": 0.7140302658081055, "learning_rate": 5e-07, "logits/chosen": 3105748.8, "logits/rejected": -16066706.666666666, "logps/chosen": -195.8827880859375, "logps/rejected": -101.8111063639323, "loss": 0.4575, "rewards/chosen": -0.05904888510704041, "rewards/margins": 0.8543551027774811, "rewards/rejected": -0.9134039878845215, "step": 9460 }, { "epoch": 0.5014708610500093, "grad_norm": 45.75, "kl": 0.14841651916503906, "learning_rate": 5e-07, "logits/chosen": -40289484.8, "logits/rejected": -4205920.666666667, "logps/chosen": -274.476611328125, "logps/rejected": -249.18701171875, "loss": 0.321, "rewards/chosen": 0.8507212638854981, "rewards/margins": 2.0832855542500814, "rewards/rejected": -1.2325642903645833, "step": 9461 }, { "epoch": 0.5015238650518115, "grad_norm": 40.25, "kl": 0.9655380249023438, "learning_rate": 5e-07, "logits/chosen": -16020805.0, "logits/rejected": -32647844.0, "logps/chosen": -213.4324493408203, "logps/rejected": -463.14190673828125, "loss": 0.2531, "rewards/chosen": 0.3803862929344177, "rewards/margins": 3.687915027141571, "rewards/rejected": -3.3075287342071533, "step": 9462 }, { "epoch": 0.5015768690536135, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18151200.0, "logits/rejected": 2622296.6666666665, "logps/chosen": -323.07012939453125, "logps/rejected": -286.7433268229167, "loss": 0.2864, "rewards/chosen": 0.44034653902053833, "rewards/margins": 1.994940419991811, "rewards/rejected": -1.5545938809712727, "step": 9463 }, { "epoch": 0.5016298730554157, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60267672.0, "logits/rejected": -25899056.0, "logps/chosen": -523.737548828125, "logps/rejected": -231.07357788085938, "loss": 0.2696, "rewards/chosen": 0.5530962944030762, "rewards/margins": 2.5644021034240723, "rewards/rejected": -2.011305809020996, "step": 9464 }, { "epoch": 0.5016828770572178, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29090332.0, "logits/rejected": -31745920.0, "logps/chosen": -502.65667724609375, "logps/rejected": -481.7223714192708, "loss": 0.136, "rewards/chosen": 0.7558868527412415, "rewards/margins": 3.989220122496287, "rewards/rejected": -3.2333332697550454, "step": 9465 }, { "epoch": 0.50173588105902, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24688948.0, "logits/rejected": -37693365.333333336, "logps/chosen": -217.31825256347656, "logps/rejected": -394.16455078125, "loss": 0.1836, "rewards/chosen": -0.1517108976840973, "rewards/margins": 2.644890397787094, "rewards/rejected": -2.7966012954711914, "step": 9466 }, { "epoch": 0.5017888850608221, "grad_norm": 44.0, "kl": 2.083134651184082, "learning_rate": 5e-07, "logits/chosen": -9733640.0, "logits/rejected": -22533702.0, "logps/chosen": -247.6799519856771, "logps/rejected": -96.15572357177734, "loss": 0.3254, "rewards/chosen": 0.9042548338572184, "rewards/margins": 2.4242366949717202, "rewards/rejected": -1.519981861114502, "step": 9467 }, { "epoch": 0.5018418890626243, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12404958.0, "logits/rejected": 123990560.0, "logps/chosen": -174.370849609375, "logps/rejected": -343.3413899739583, "loss": 0.2408, "rewards/chosen": -0.28098219633102417, "rewards/margins": 1.7223445375760398, "rewards/rejected": -2.003326733907064, "step": 9468 }, { "epoch": 0.5018948930644264, "grad_norm": 57.25, "kl": 1.1675491333007812, "learning_rate": 5e-07, "logits/chosen": -40042473.6, "logits/rejected": -12925122.666666666, "logps/chosen": -219.0421875, "logps/rejected": -272.0228678385417, "loss": 0.3914, "rewards/chosen": 0.22219712734222413, "rewards/margins": 1.5818782567977905, "rewards/rejected": -1.3596811294555664, "step": 9469 }, { "epoch": 0.5019478970662286, "grad_norm": 58.25, "kl": 1.3295211791992188, "learning_rate": 5e-07, "logits/chosen": -15826244.0, "logits/rejected": -23765536.0, "logps/chosen": -512.65966796875, "logps/rejected": -336.79901123046875, "loss": 0.3705, "rewards/chosen": -0.22291764616966248, "rewards/margins": 2.3110324442386627, "rewards/rejected": -2.533950090408325, "step": 9470 }, { "epoch": 0.5020009010680306, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11724685.333333334, "logits/rejected": -27937222.4, "logps/chosen": -211.2171834309896, "logps/rejected": -257.5888916015625, "loss": 0.2016, "rewards/chosen": 1.0705745220184326, "rewards/margins": 3.3715610027313234, "rewards/rejected": -2.300986480712891, "step": 9471 }, { "epoch": 0.5020539050698327, "grad_norm": 61.75, "kl": 4.804775238037109, "learning_rate": 5e-07, "logits/chosen": -17336204.0, "logits/rejected": -9565707.0, "logps/chosen": -515.6407470703125, "logps/rejected": -152.07662963867188, "loss": 0.4794, "rewards/chosen": 0.4301214814186096, "rewards/margins": 0.7376543879508972, "rewards/rejected": -0.3075329065322876, "step": 9472 }, { "epoch": 0.5021069090716349, "grad_norm": 32.0, "kl": 2.014608383178711, "learning_rate": 5e-07, "logits/chosen": -45512472.0, "logits/rejected": -38441014.85714286, "logps/chosen": -1551.61962890625, "logps/rejected": -408.4107142857143, "loss": 0.0935, "rewards/chosen": 3.148205518722534, "rewards/margins": 5.534249884741647, "rewards/rejected": -2.386044366019113, "step": 9473 }, { "epoch": 0.502159913073437, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27800852.0, "logits/rejected": -15902235.0, "logps/chosen": -162.64968872070312, "logps/rejected": -339.3659973144531, "loss": 0.2945, "rewards/chosen": 0.504151463508606, "rewards/margins": 2.22555148601532, "rewards/rejected": -1.7214000225067139, "step": 9474 }, { "epoch": 0.5022129170752392, "grad_norm": 53.0, "kl": 0.5502395629882812, "learning_rate": 5e-07, "logits/chosen": -105371840.0, "logits/rejected": -38152698.666666664, "logps/chosen": -285.88372802734375, "logps/rejected": -416.5010172526042, "loss": 0.2282, "rewards/chosen": 0.3498062193393707, "rewards/margins": 2.5714569787184396, "rewards/rejected": -2.221650759379069, "step": 9475 }, { "epoch": 0.5022659210770413, "grad_norm": 51.5, "kl": 0.026309967041015625, "learning_rate": 5e-07, "logits/chosen": -32281024.0, "logits/rejected": -27364659.2, "logps/chosen": -273.331787109375, "logps/rejected": -323.6730712890625, "loss": 0.303, "rewards/chosen": 0.8594339688618978, "rewards/margins": 1.923422654469808, "rewards/rejected": -1.0639886856079102, "step": 9476 }, { "epoch": 0.5023189250788435, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43177971.2, "logits/rejected": 18930756.0, "logps/chosen": -325.36982421875, "logps/rejected": -204.29182942708334, "loss": 0.362, "rewards/chosen": 0.23463006019592286, "rewards/margins": 1.5303065299987793, "rewards/rejected": -1.2956764698028564, "step": 9477 }, { "epoch": 0.5023719290806455, "grad_norm": 40.5, "kl": 0.20698833465576172, "learning_rate": 5e-07, "logits/chosen": -27928034.666666668, "logits/rejected": -31407648.0, "logps/chosen": -184.01529947916666, "logps/rejected": -428.69931640625, "loss": 0.2347, "rewards/chosen": 0.16401583949724832, "rewards/margins": 2.603237412373225, "rewards/rejected": -2.4392215728759767, "step": 9478 }, { "epoch": 0.5024249330824477, "grad_norm": 52.5, "kl": 3.3756237030029297, "learning_rate": 5e-07, "logits/chosen": -55331770.666666664, "logits/rejected": 2792636.5, "logps/chosen": -202.21622721354166, "logps/rejected": -184.4462127685547, "loss": 0.4029, "rewards/chosen": 0.5388331015904745, "rewards/margins": 2.1560670932133994, "rewards/rejected": -1.6172339916229248, "step": 9479 }, { "epoch": 0.5024779370842498, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65466917.333333336, "logits/rejected": -47454547.2, "logps/chosen": -736.6690266927084, "logps/rejected": -590.70380859375, "loss": 0.1509, "rewards/chosen": 0.901068369547526, "rewards/margins": 4.5351500193278, "rewards/rejected": -3.6340816497802733, "step": 9480 }, { "epoch": 0.502530941086052, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 79307304.0, "logits/rejected": -13676975.0, "logps/chosen": -404.9599609375, "logps/rejected": -459.9023742675781, "loss": 0.2466, "rewards/chosen": 0.39254531264305115, "rewards/margins": 3.0127435624599457, "rewards/rejected": -2.6201982498168945, "step": 9481 }, { "epoch": 0.5025839450878541, "grad_norm": 42.75, "kl": 0.5111331939697266, "learning_rate": 5e-07, "logits/chosen": -10716558.0, "logits/rejected": -30134985.6, "logps/chosen": -136.73143513997397, "logps/rejected": -353.316064453125, "loss": 0.22, "rewards/chosen": 0.6551836729049683, "rewards/margins": 2.895099902153015, "rewards/rejected": -2.239916229248047, "step": 9482 }, { "epoch": 0.5026369490896563, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58954085.333333336, "logits/rejected": -6131180.8, "logps/chosen": -646.9440511067709, "logps/rejected": -172.61162109375, "loss": 0.2686, "rewards/chosen": 0.25313111146291095, "rewards/margins": 2.1868208010991417, "rewards/rejected": -1.9336896896362306, "step": 9483 }, { "epoch": 0.5026899530914584, "grad_norm": 50.5, "kl": 0.8969497680664062, "learning_rate": 5e-07, "logits/chosen": -80624192.0, "logits/rejected": -14581616.0, "logps/chosen": -326.9842122395833, "logps/rejected": -380.649169921875, "loss": 0.218, "rewards/chosen": 0.7602890332539877, "rewards/margins": 3.127258332570394, "rewards/rejected": -2.3669692993164064, "step": 9484 }, { "epoch": 0.5027429570932606, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56909924.0, "logits/rejected": -38894232.0, "logps/chosen": -127.02439880371094, "logps/rejected": -373.352294921875, "loss": 0.1812, "rewards/chosen": -0.04753512144088745, "rewards/margins": 2.8118112683296204, "rewards/rejected": -2.859346389770508, "step": 9485 }, { "epoch": 0.5027959610950626, "grad_norm": 50.25, "kl": 1.9418716430664062, "learning_rate": 5e-07, "logits/chosen": -23998760.0, "logits/rejected": -39013664.0, "logps/chosen": -308.52829996744794, "logps/rejected": -272.404052734375, "loss": 0.3416, "rewards/chosen": 0.8018198013305664, "rewards/margins": 2.2502604722976685, "rewards/rejected": -1.448440670967102, "step": 9486 }, { "epoch": 0.5028489650968648, "grad_norm": 75.0, "kl": 5.634943008422852, "learning_rate": 5e-07, "logits/chosen": -28504630.4, "logits/rejected": -10831434.666666666, "logps/chosen": -605.833056640625, "logps/rejected": -193.8372599283854, "loss": 0.3485, "rewards/chosen": 1.0551558494567872, "rewards/margins": 3.3672441800435386, "rewards/rejected": -2.3120883305867515, "step": 9487 }, { "epoch": 0.5029019690986669, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -135229472.0, "logits/rejected": 15021314.285714285, "logps/chosen": -233.213623046875, "logps/rejected": -312.064208984375, "loss": 0.1714, "rewards/chosen": 0.7601348757743835, "rewards/margins": 2.6562900968960355, "rewards/rejected": -1.8961552211216517, "step": 9488 }, { "epoch": 0.5029549731004691, "grad_norm": 61.75, "kl": 0.9489130973815918, "learning_rate": 5e-07, "logits/chosen": 23031230.0, "logits/rejected": -40961224.0, "logps/chosen": -229.6332244873047, "logps/rejected": -576.0208740234375, "loss": 0.3215, "rewards/chosen": 0.30324143171310425, "rewards/margins": 3.156528413295746, "rewards/rejected": -2.8532869815826416, "step": 9489 }, { "epoch": 0.5030079771022712, "grad_norm": 29.125, "kl": 0.4857015609741211, "learning_rate": 5e-07, "logits/chosen": -15306904.0, "logits/rejected": -10016059.2, "logps/chosen": -352.63916015625, "logps/rejected": -247.6233154296875, "loss": 0.2098, "rewards/chosen": 1.215209722518921, "rewards/margins": 3.815625524520874, "rewards/rejected": -2.600415802001953, "step": 9490 }, { "epoch": 0.5030609811040734, "grad_norm": 49.0, "kl": 1.1755695343017578, "learning_rate": 5e-07, "logits/chosen": -10122317.6, "logits/rejected": -562958.6666666666, "logps/chosen": -319.0066650390625, "logps/rejected": -400.8271891276042, "loss": 0.3449, "rewards/chosen": 0.4332145690917969, "rewards/margins": 2.161998176574707, "rewards/rejected": -1.7287836074829102, "step": 9491 }, { "epoch": 0.5031139851058755, "grad_norm": 35.5, "kl": 0.4237251281738281, "learning_rate": 5e-07, "logits/chosen": -17637316.0, "logits/rejected": -37217752.0, "logps/chosen": -254.232666015625, "logps/rejected": -365.3048095703125, "loss": 0.204, "rewards/chosen": 0.7504886984825134, "rewards/margins": 3.807032287120819, "rewards/rejected": -3.0565435886383057, "step": 9492 }, { "epoch": 0.5031669891076777, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26627128.0, "logits/rejected": -9735315.333333334, "logps/chosen": -519.0374145507812, "logps/rejected": -335.22434488932294, "loss": 0.095, "rewards/chosen": 2.0173065662384033, "rewards/margins": 5.053694009780884, "rewards/rejected": -3.0363874435424805, "step": 9493 }, { "epoch": 0.5032199931094797, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33871516.0, "logits/rejected": 51324864.0, "logps/chosen": -462.9714660644531, "logps/rejected": -267.1783142089844, "loss": 0.3277, "rewards/chosen": 0.3566330075263977, "rewards/margins": 1.9506612420082092, "rewards/rejected": -1.5940282344818115, "step": 9494 }, { "epoch": 0.5032729971112819, "grad_norm": 45.5, "kl": 1.1585626602172852, "learning_rate": 5e-07, "logits/chosen": 11226584.0, "logits/rejected": 35966812.0, "logps/chosen": -211.24966430664062, "logps/rejected": -210.7802276611328, "loss": 0.3259, "rewards/chosen": 0.15614506602287292, "rewards/margins": 2.128741830587387, "rewards/rejected": -1.9725967645645142, "step": 9495 }, { "epoch": 0.503326001113084, "grad_norm": 49.0, "kl": 0.8763771057128906, "learning_rate": 5e-07, "logits/chosen": -23848782.4, "logits/rejected": -13779677.333333334, "logps/chosen": -389.132080078125, "logps/rejected": -167.5498046875, "loss": 0.3417, "rewards/chosen": 0.613980770111084, "rewards/margins": 1.768015702565511, "rewards/rejected": -1.154034932454427, "step": 9496 }, { "epoch": 0.5033790051148862, "grad_norm": 45.5, "kl": 0.26458168029785156, "learning_rate": 5e-07, "logits/chosen": -18454168.0, "logits/rejected": -19422882.0, "logps/chosen": -235.04115295410156, "logps/rejected": -326.7851257324219, "loss": 0.3179, "rewards/chosen": 0.1860477477312088, "rewards/margins": 2.0102679282426834, "rewards/rejected": -1.8242201805114746, "step": 9497 }, { "epoch": 0.5034320091166883, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46109738.666666664, "logits/rejected": 20947897.6, "logps/chosen": -336.2051595052083, "logps/rejected": -257.34296875, "loss": 0.3605, "rewards/chosen": -0.1494069496790568, "rewards/margins": 1.1024216254552204, "rewards/rejected": -1.2518285751342773, "step": 9498 }, { "epoch": 0.5034850131184905, "grad_norm": 56.5, "kl": 0.15095090866088867, "learning_rate": 5e-07, "logits/chosen": -12964520.0, "logits/rejected": -16049601.6, "logps/chosen": -282.5607503255208, "logps/rejected": -156.89229736328124, "loss": 0.3354, "rewards/chosen": 0.5773463646570841, "rewards/margins": 1.9500235954920448, "rewards/rejected": -1.3726772308349608, "step": 9499 }, { "epoch": 0.5035380171202926, "grad_norm": 50.0, "kl": 0.8686370849609375, "learning_rate": 5e-07, "logits/chosen": -94114688.0, "logits/rejected": -28956896.0, "logps/chosen": -665.893896484375, "logps/rejected": -309.6510416666667, "loss": 0.2259, "rewards/chosen": 1.3430497169494628, "rewards/margins": 5.186610571543375, "rewards/rejected": -3.8435608545939126, "step": 9500 }, { "epoch": 0.5035910211220948, "grad_norm": 54.75, "kl": 6.2125749588012695, "learning_rate": 5e-07, "logits/chosen": 4687769.142857143, "logits/rejected": 6619639.0, "logps/chosen": -318.54732840401783, "logps/rejected": -78.50315856933594, "loss": 0.4238, "rewards/chosen": 0.8591869218008858, "rewards/margins": 3.662691865648542, "rewards/rejected": -2.8035049438476562, "step": 9501 }, { "epoch": 0.5036440251238968, "grad_norm": 34.25, "kl": 1.210073471069336, "learning_rate": 5e-07, "logits/chosen": -16944264.0, "logits/rejected": -22441608.0, "logps/chosen": -312.8924255371094, "logps/rejected": -196.5940704345703, "loss": 0.2195, "rewards/chosen": 1.1352770328521729, "rewards/margins": 3.2613844871520996, "rewards/rejected": -2.1261074542999268, "step": 9502 }, { "epoch": 0.503697029125699, "grad_norm": 56.0, "kl": 4.737754821777344, "learning_rate": 5e-07, "logits/chosen": 35645657.6, "logits/rejected": 974064.0, "logps/chosen": -436.196044921875, "logps/rejected": -296.20119222005206, "loss": 0.2908, "rewards/chosen": 1.2640838623046875, "rewards/margins": 3.776031176249186, "rewards/rejected": -2.5119473139444985, "step": 9503 }, { "epoch": 0.5037500331275011, "grad_norm": 58.0, "kl": 1.9397001266479492, "learning_rate": 5e-07, "logits/chosen": -52505728.0, "logits/rejected": 5161850.5, "logps/chosen": -395.6572265625, "logps/rejected": -561.9495849609375, "loss": 0.317, "rewards/chosen": 0.8302503426869711, "rewards/margins": 3.8575878938039145, "rewards/rejected": -3.0273375511169434, "step": 9504 }, { "epoch": 0.5038030371293033, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27297884.0, "logits/rejected": -18432472.0, "logps/chosen": -230.66746520996094, "logps/rejected": -411.6708984375, "loss": 0.2772, "rewards/chosen": 0.4081488847732544, "rewards/margins": 2.4618865251541138, "rewards/rejected": -2.0537376403808594, "step": 9505 }, { "epoch": 0.5038560411311054, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12518055.0, "logits/rejected": -10518416.666666666, "logps/chosen": -470.8404541015625, "logps/rejected": -177.83709716796875, "loss": 0.2414, "rewards/chosen": 0.5933868288993835, "rewards/margins": 2.1943385004997253, "rewards/rejected": -1.6009516716003418, "step": 9506 }, { "epoch": 0.5039090451329076, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96626664.0, "logits/rejected": -5465390.0, "logps/chosen": -291.1484680175781, "logps/rejected": -256.2287292480469, "loss": 0.2923, "rewards/chosen": 0.3257485628128052, "rewards/margins": 2.434264063835144, "rewards/rejected": -2.108515501022339, "step": 9507 }, { "epoch": 0.5039620491347097, "grad_norm": 61.5, "kl": 0.8575649261474609, "learning_rate": 5e-07, "logits/chosen": -8925006.0, "logits/rejected": -39714981.333333336, "logps/chosen": -243.93551635742188, "logps/rejected": -480.1717122395833, "loss": 0.2512, "rewards/chosen": 0.5227106213569641, "rewards/margins": 2.438373863697052, "rewards/rejected": -1.915663242340088, "step": 9508 }, { "epoch": 0.5040150531365118, "grad_norm": 49.0, "kl": 2.375847816467285, "learning_rate": 5e-07, "logits/chosen": 3505809.6, "logits/rejected": 73848245.33333333, "logps/chosen": -252.0580322265625, "logps/rejected": -532.0699055989584, "loss": 0.2669, "rewards/chosen": 0.9260669708251953, "rewards/margins": 3.872348721822103, "rewards/rejected": -2.9462817509969077, "step": 9509 }, { "epoch": 0.5040680571383139, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53087912.0, "logits/rejected": -23854754.285714287, "logps/chosen": -312.81817626953125, "logps/rejected": -369.505615234375, "loss": 0.2008, "rewards/chosen": 0.72918701171875, "rewards/margins": 2.5298134940011163, "rewards/rejected": -1.800626482282366, "step": 9510 }, { "epoch": 0.5041210611401161, "grad_norm": 65.0, "kl": 0.3416252136230469, "learning_rate": 5e-07, "logits/chosen": -25766960.0, "logits/rejected": -1477365.5, "logps/chosen": -289.39792887369794, "logps/rejected": -105.5751724243164, "loss": 0.5343, "rewards/chosen": -0.17728563149770102, "rewards/margins": -0.3705942829449972, "rewards/rejected": 0.19330865144729614, "step": 9511 }, { "epoch": 0.5041740651419182, "grad_norm": 68.0, "kl": 4.991530895233154, "learning_rate": 5e-07, "logits/chosen": -35156371.2, "logits/rejected": -3690560.0, "logps/chosen": -564.17490234375, "logps/rejected": -190.61236572265625, "loss": 0.4184, "rewards/chosen": 0.9783124923706055, "rewards/margins": 1.6630404790242515, "rewards/rejected": -0.6847279866536459, "step": 9512 }, { "epoch": 0.5042270691437204, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41872992.0, "logits/rejected": -1037999.5, "logps/chosen": -342.89801025390625, "logps/rejected": -581.4096069335938, "loss": 0.2946, "rewards/chosen": -0.06398563086986542, "rewards/margins": 2.8946978598833084, "rewards/rejected": -2.958683490753174, "step": 9513 }, { "epoch": 0.5042800731455225, "grad_norm": 53.5, "kl": 1.1426148414611816, "learning_rate": 5e-07, "logits/chosen": -41117314.666666664, "logits/rejected": -28387184.0, "logps/chosen": -435.422607421875, "logps/rejected": -225.55010986328125, "loss": 0.3483, "rewards/chosen": 0.798169215520223, "rewards/margins": 2.6372505029042563, "rewards/rejected": -1.8390812873840332, "step": 9514 }, { "epoch": 0.5043330771473247, "grad_norm": 47.5, "kl": 0.39225006103515625, "learning_rate": 5e-07, "logits/chosen": -44479432.0, "logits/rejected": -59564024.0, "logps/chosen": -273.0495300292969, "logps/rejected": -672.0065307617188, "loss": 0.2512, "rewards/chosen": 0.4265812933444977, "rewards/margins": 3.452700525522232, "rewards/rejected": -3.0261192321777344, "step": 9515 }, { "epoch": 0.5043860811491268, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27319682.666666668, "logits/rejected": -36442169.6, "logps/chosen": -233.1004435221354, "logps/rejected": -258.15146484375, "loss": 0.3374, "rewards/chosen": 0.04957282046477, "rewards/margins": 1.2663449952999752, "rewards/rejected": -1.2167721748352052, "step": 9516 }, { "epoch": 0.504439085150929, "grad_norm": 50.25, "kl": 1.2299003601074219, "learning_rate": 5e-07, "logits/chosen": -19462598.666666668, "logits/rejected": -61861340.0, "logps/chosen": -264.33221435546875, "logps/rejected": -464.070556640625, "loss": 0.4165, "rewards/chosen": 0.2986793319384257, "rewards/margins": 2.958787182966868, "rewards/rejected": -2.6601078510284424, "step": 9517 }, { "epoch": 0.504492089152731, "grad_norm": 95.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20140400.0, "logits/rejected": -22147952.0, "logps/chosen": -280.276611328125, "logps/rejected": -272.2829895019531, "loss": 0.3354, "rewards/chosen": 0.004320859909057617, "rewards/margins": 1.976919412612915, "rewards/rejected": -1.9725985527038574, "step": 9518 }, { "epoch": 0.5045450931545332, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44611232.0, "logits/rejected": 21468126.666666668, "logps/chosen": -291.950146484375, "logps/rejected": -282.8360188802083, "loss": 0.4116, "rewards/chosen": -0.13270503282546997, "rewards/margins": 1.81816961367925, "rewards/rejected": -1.95087464650472, "step": 9519 }, { "epoch": 0.5045980971563353, "grad_norm": 53.75, "kl": 2.0571765899658203, "learning_rate": 5e-07, "logits/chosen": -46078346.666666664, "logits/rejected": -22915120.0, "logps/chosen": -437.7639567057292, "logps/rejected": -397.42034912109375, "loss": 0.363, "rewards/chosen": 0.6681166489919027, "rewards/margins": 2.5150817235310874, "rewards/rejected": -1.8469650745391846, "step": 9520 }, { "epoch": 0.5046511011581375, "grad_norm": 35.5, "kl": 0.2220621109008789, "learning_rate": 5e-07, "logits/chosen": -4725333.0, "logits/rejected": -75134796.8, "logps/chosen": -87.62966918945312, "logps/rejected": -286.9921875, "loss": 0.2616, "rewards/chosen": 0.08410275975863139, "rewards/margins": 2.7155700067679085, "rewards/rejected": -2.6314672470092773, "step": 9521 }, { "epoch": 0.5047041051599396, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17459158.0, "logits/rejected": -9328574.0, "logps/chosen": -423.166015625, "logps/rejected": -237.96580505371094, "loss": 0.2046, "rewards/chosen": 1.4023433923721313, "rewards/margins": 3.4789870977401733, "rewards/rejected": -2.076643705368042, "step": 9522 }, { "epoch": 0.5047571091617417, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19515619.2, "logits/rejected": -8768413.333333334, "logps/chosen": -319.843359375, "logps/rejected": -399.2144368489583, "loss": 0.3335, "rewards/chosen": 0.1972750425338745, "rewards/margins": 2.7616334358851113, "rewards/rejected": -2.564358393351237, "step": 9523 }, { "epoch": 0.5048101131635438, "grad_norm": 73.0, "kl": 4.253025054931641, "learning_rate": 5e-07, "logits/chosen": -34446745.6, "logits/rejected": -17773720.0, "logps/chosen": -448.26318359375, "logps/rejected": -411.8822428385417, "loss": 0.3598, "rewards/chosen": 1.0955846786499024, "rewards/margins": 2.0504176298777264, "rewards/rejected": -0.9548329512278239, "step": 9524 }, { "epoch": 0.5048631171653459, "grad_norm": 51.25, "kl": 1.7622394561767578, "learning_rate": 5e-07, "logits/chosen": -3605045.2, "logits/rejected": -19072794.666666668, "logps/chosen": -156.34864501953126, "logps/rejected": -267.40399169921875, "loss": 0.3348, "rewards/chosen": 0.48987932205200196, "rewards/margins": 2.2995949109395344, "rewards/rejected": -1.8097155888875325, "step": 9525 }, { "epoch": 0.5049161211671481, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89895440.0, "logits/rejected": -17508354.0, "logps/chosen": -412.672119140625, "logps/rejected": -295.527099609375, "loss": 0.2861, "rewards/chosen": 0.4444393217563629, "rewards/margins": 2.538301855325699, "rewards/rejected": -2.093862533569336, "step": 9526 }, { "epoch": 0.5049691251689502, "grad_norm": 50.5, "kl": 2.8817691802978516, "learning_rate": 5e-07, "logits/chosen": -54696844.8, "logits/rejected": -41077237.333333336, "logps/chosen": -366.7287109375, "logps/rejected": -527.1010335286459, "loss": 0.324, "rewards/chosen": 0.46183204650878906, "rewards/margins": 3.8616339365641275, "rewards/rejected": -3.3998018900553384, "step": 9527 }, { "epoch": 0.5050221291707524, "grad_norm": 62.0, "kl": 1.5167274475097656, "learning_rate": 5e-07, "logits/chosen": -8991808.0, "logits/rejected": -30680408.0, "logps/chosen": -248.750830078125, "logps/rejected": -384.0852864583333, "loss": 0.3071, "rewards/chosen": 0.7238716125488281, "rewards/margins": 2.532540702819824, "rewards/rejected": -1.808669090270996, "step": 9528 }, { "epoch": 0.5050751331725545, "grad_norm": 45.0, "kl": 0.2662544250488281, "learning_rate": 5e-07, "logits/chosen": -31540314.0, "logits/rejected": -8215270.5, "logps/chosen": -193.29006958007812, "logps/rejected": -221.80908203125, "loss": 0.2821, "rewards/chosen": 0.2916826903820038, "rewards/margins": 3.0838262736797333, "rewards/rejected": -2.7921435832977295, "step": 9529 }, { "epoch": 0.5051281371743567, "grad_norm": 54.25, "kl": 0.4513883590698242, "learning_rate": 5e-07, "logits/chosen": -51094080.0, "logits/rejected": -34706376.0, "logps/chosen": -351.7623697916667, "logps/rejected": -546.2025146484375, "loss": 0.346, "rewards/chosen": 0.27913276354471844, "rewards/margins": 4.334131638209025, "rewards/rejected": -4.054998874664307, "step": 9530 }, { "epoch": 0.5051811411761588, "grad_norm": 42.25, "kl": 0.5042448043823242, "learning_rate": 5e-07, "logits/chosen": -3264868.5, "logits/rejected": -10701700.0, "logps/chosen": -140.768310546875, "logps/rejected": -183.25360107421875, "loss": 0.3159, "rewards/chosen": 0.3567628264427185, "rewards/margins": 2.0184744000434875, "rewards/rejected": -1.661711573600769, "step": 9531 }, { "epoch": 0.505234145177961, "grad_norm": 47.25, "kl": 1.11724853515625, "learning_rate": 5e-07, "logits/chosen": -6621655.0, "logits/rejected": -826777.625, "logps/chosen": -293.2379150390625, "logps/rejected": -153.2276611328125, "loss": 0.2899, "rewards/chosen": 1.0578805208206177, "rewards/margins": 2.4805350303649902, "rewards/rejected": -1.4226545095443726, "step": 9532 }, { "epoch": 0.505287149179763, "grad_norm": 43.0, "kl": 2.752216339111328, "learning_rate": 5e-07, "logits/chosen": 23603672.0, "logits/rejected": -20555529.6, "logps/chosen": -286.29766845703125, "logps/rejected": -296.5059814453125, "loss": 0.3963, "rewards/chosen": 0.22685935099919638, "rewards/margins": 1.8114957531293232, "rewards/rejected": -1.584636402130127, "step": 9533 }, { "epoch": 0.5053401531815652, "grad_norm": 46.25, "kl": 0.11547088623046875, "learning_rate": 5e-07, "logits/chosen": -21403728.0, "logits/rejected": -15101422.0, "logps/chosen": -588.89208984375, "logps/rejected": -151.19171142578125, "loss": 0.3127, "rewards/chosen": 0.8800788521766663, "rewards/margins": 2.250929057598114, "rewards/rejected": -1.3708502054214478, "step": 9534 }, { "epoch": 0.5053931571833673, "grad_norm": 28.75, "kl": 1.3742761611938477, "learning_rate": 5e-07, "logits/chosen": -933282.8333333334, "logits/rejected": -53488512.0, "logps/chosen": -140.08067830403647, "logps/rejected": -338.7014404296875, "loss": 0.2233, "rewards/chosen": 0.3604429562886556, "rewards/margins": 3.6129766782124837, "rewards/rejected": -3.252533721923828, "step": 9535 }, { "epoch": 0.5054461611851695, "grad_norm": 48.5, "kl": 3.1517019271850586, "learning_rate": 5e-07, "logits/chosen": -2928251.0, "logits/rejected": -49669128.0, "logps/chosen": -131.76431274414062, "logps/rejected": -642.8141479492188, "loss": 0.4191, "rewards/chosen": 0.24560060103734335, "rewards/margins": 4.395974536736806, "rewards/rejected": -4.150373935699463, "step": 9536 }, { "epoch": 0.5054991651869716, "grad_norm": 82.5, "kl": 2.3168277740478516, "learning_rate": 5e-07, "logits/chosen": -27197936.0, "logits/rejected": -10523070.0, "logps/chosen": -427.3949381510417, "logps/rejected": -258.7228698730469, "loss": 0.2807, "rewards/chosen": 1.0275487899780273, "rewards/margins": 3.034127712249756, "rewards/rejected": -2.0065789222717285, "step": 9537 }, { "epoch": 0.5055521691887738, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17561368.0, "logits/rejected": -1472374.5, "logps/chosen": -210.49447631835938, "logps/rejected": -171.83106994628906, "loss": 0.3403, "rewards/chosen": 0.4153246283531189, "rewards/margins": 1.5122925639152527, "rewards/rejected": -1.0969679355621338, "step": 9538 }, { "epoch": 0.5056051731905759, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44506428.0, "logits/rejected": 43601242.666666664, "logps/chosen": -303.7615661621094, "logps/rejected": -390.1058756510417, "loss": 0.1635, "rewards/chosen": 0.7700660824775696, "rewards/margins": 3.173184057076772, "rewards/rejected": -2.4031179745992026, "step": 9539 }, { "epoch": 0.505658177192378, "grad_norm": 57.0, "kl": 1.6792116165161133, "learning_rate": 5e-07, "logits/chosen": -59346176.0, "logits/rejected": -35949133.333333336, "logps/chosen": -347.6869384765625, "logps/rejected": -366.0496826171875, "loss": 0.3002, "rewards/chosen": 0.6143903732299805, "rewards/margins": 3.068966865539551, "rewards/rejected": -2.4545764923095703, "step": 9540 }, { "epoch": 0.5057111811941801, "grad_norm": 47.5, "kl": 1.128885269165039, "learning_rate": 5e-07, "logits/chosen": -22805435.2, "logits/rejected": -805696.0833333334, "logps/chosen": -289.762451171875, "logps/rejected": -96.58450317382812, "loss": 0.4178, "rewards/chosen": 0.1643177390098572, "rewards/margins": 1.0884610533714294, "rewards/rejected": -0.9241433143615723, "step": 9541 }, { "epoch": 0.5057641851959823, "grad_norm": 36.0, "kl": 1.6853294372558594, "learning_rate": 5e-07, "logits/chosen": -28596468.0, "logits/rejected": -14323785.333333334, "logps/chosen": -898.7437133789062, "logps/rejected": -176.8358357747396, "loss": 0.1869, "rewards/chosen": 1.6772797107696533, "rewards/margins": 3.64690359433492, "rewards/rejected": -1.9696238835652669, "step": 9542 }, { "epoch": 0.5058171891977844, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2335695.5, "logits/rejected": -18805234.666666668, "logps/chosen": -159.80758666992188, "logps/rejected": -260.1088053385417, "loss": 0.1874, "rewards/chosen": 0.9379723072052002, "rewards/margins": 2.9156105518341064, "rewards/rejected": -1.9776382446289062, "step": 9543 }, { "epoch": 0.5058701931995866, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32161440.0, "logits/rejected": -36876108.0, "logps/chosen": -230.48440551757812, "logps/rejected": -187.3380126953125, "loss": 0.2942, "rewards/chosen": 0.5052252411842346, "rewards/margins": 1.975424349308014, "rewards/rejected": -1.4701991081237793, "step": 9544 }, { "epoch": 0.5059231972013887, "grad_norm": 61.25, "kl": 3.202695846557617, "learning_rate": 5e-07, "logits/chosen": -39324709.333333336, "logits/rejected": -87806488.0, "logps/chosen": -353.1178792317708, "logps/rejected": -611.1695556640625, "loss": 0.4211, "rewards/chosen": 0.26364074150721234, "rewards/margins": 2.5494669477144876, "rewards/rejected": -2.2858262062072754, "step": 9545 }, { "epoch": 0.5059762012031909, "grad_norm": 48.25, "kl": 0.17963409423828125, "learning_rate": 5e-07, "logits/chosen": -31335208.0, "logits/rejected": -24182248.0, "logps/chosen": -344.97021484375, "logps/rejected": -519.7043050130209, "loss": 0.1697, "rewards/chosen": 1.607203722000122, "rewards/margins": 3.557111819585164, "rewards/rejected": -1.9499080975850422, "step": 9546 }, { "epoch": 0.506029205204993, "grad_norm": 37.0, "kl": 2.035874366760254, "learning_rate": 5e-07, "logits/chosen": -17575804.8, "logits/rejected": -10891126.666666666, "logps/chosen": -201.2441650390625, "logps/rejected": -233.0031534830729, "loss": 0.2976, "rewards/chosen": 0.41916351318359374, "rewards/margins": 3.8845953623453773, "rewards/rejected": -3.4654318491617837, "step": 9547 }, { "epoch": 0.5060822092067951, "grad_norm": 59.25, "kl": 0.7413711547851562, "learning_rate": 5e-07, "logits/chosen": -11342032.8, "logits/rejected": 756592.6666666666, "logps/chosen": -212.141259765625, "logps/rejected": -195.1144816080729, "loss": 0.446, "rewards/chosen": -0.02915336489677429, "rewards/margins": 0.7333967506885528, "rewards/rejected": -0.7625501155853271, "step": 9548 }, { "epoch": 0.5061352132085972, "grad_norm": 42.5, "kl": 0.6204757690429688, "learning_rate": 5e-07, "logits/chosen": -34817456.0, "logits/rejected": -5504285.333333333, "logps/chosen": -123.08331298828125, "logps/rejected": -163.3684285481771, "loss": 0.2703, "rewards/chosen": 0.3205215632915497, "rewards/margins": 1.7553398311138153, "rewards/rejected": -1.4348182678222656, "step": 9549 }, { "epoch": 0.5061882172103994, "grad_norm": 37.75, "kl": 1.0462994575500488, "learning_rate": 5e-07, "logits/chosen": -18372969.6, "logits/rejected": -12858897.333333334, "logps/chosen": -148.7470703125, "logps/rejected": -1171.0011393229167, "loss": 0.2958, "rewards/chosen": 0.36611371040344237, "rewards/margins": 4.712868738174438, "rewards/rejected": -4.346755027770996, "step": 9550 }, { "epoch": 0.5062412212122015, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10241424.0, "logits/rejected": -14944300.8, "logps/chosen": -156.36760457356772, "logps/rejected": -134.6106689453125, "loss": 0.2706, "rewards/chosen": -0.17569885651270548, "rewards/margins": 2.2462379415829976, "rewards/rejected": -2.421936798095703, "step": 9551 }, { "epoch": 0.5062942252140037, "grad_norm": 64.5, "kl": 1.1011734008789062, "learning_rate": 5e-07, "logits/chosen": -14087253.333333334, "logits/rejected": -25399907.2, "logps/chosen": -398.2666015625, "logps/rejected": -231.58740234375, "loss": 0.2808, "rewards/chosen": 0.7798864046732584, "rewards/margins": 2.180813757578532, "rewards/rejected": -1.4009273529052735, "step": 9552 }, { "epoch": 0.5063472292158058, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21990150.0, "logits/rejected": -5041486.5, "logps/chosen": -319.7805480957031, "logps/rejected": -333.7003479003906, "loss": 0.3175, "rewards/chosen": -0.0942775160074234, "rewards/margins": 2.1628421396017075, "rewards/rejected": -2.257119655609131, "step": 9553 }, { "epoch": 0.506400233217608, "grad_norm": 51.5, "kl": 1.2319965362548828, "learning_rate": 5e-07, "logits/chosen": -888584.3333333334, "logits/rejected": -60362636.8, "logps/chosen": -96.00978597005208, "logps/rejected": -423.3701171875, "loss": 0.2841, "rewards/chosen": 0.032638922333717346, "rewards/margins": 2.2832235246896744, "rewards/rejected": -2.250584602355957, "step": 9554 }, { "epoch": 0.50645323721941, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15557177.6, "logits/rejected": -15764482.666666666, "logps/chosen": -152.8656494140625, "logps/rejected": -272.11883544921875, "loss": 0.3441, "rewards/chosen": 0.006363868713378906, "rewards/margins": 2.5064942042032876, "rewards/rejected": -2.5001303354899087, "step": 9555 }, { "epoch": 0.5065062412212122, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22929069.333333332, "logits/rejected": 11503870.4, "logps/chosen": -186.09330240885416, "logps/rejected": -479.3333984375, "loss": 0.2312, "rewards/chosen": -0.1593303680419922, "rewards/margins": 3.2633872985839845, "rewards/rejected": -3.4227176666259767, "step": 9556 }, { "epoch": 0.5065592452230143, "grad_norm": 45.75, "kl": 1.1175384521484375, "learning_rate": 5e-07, "logits/chosen": -43000268.0, "logits/rejected": -33269128.0, "logps/chosen": -378.1017761230469, "logps/rejected": -559.5399169921875, "loss": 0.2456, "rewards/chosen": 0.75466388463974, "rewards/margins": 3.654383361339569, "rewards/rejected": -2.899719476699829, "step": 9557 }, { "epoch": 0.5066122492248165, "grad_norm": 56.0, "kl": 2.3859214782714844, "learning_rate": 5e-07, "logits/chosen": -83533008.0, "logits/rejected": -16652030.0, "logps/chosen": -885.9742431640625, "logps/rejected": -184.29568481445312, "loss": 0.2573, "rewards/chosen": 1.1837611198425293, "rewards/margins": 2.648726463317871, "rewards/rejected": -1.4649653434753418, "step": 9558 }, { "epoch": 0.5066652532266186, "grad_norm": 57.0, "kl": 5.7171478271484375, "learning_rate": 5e-07, "logits/chosen": -38623547.428571425, "logits/rejected": -37441592.0, "logps/chosen": -257.95926339285717, "logps/rejected": -450.6590270996094, "loss": 0.3693, "rewards/chosen": 1.2219627925327845, "rewards/margins": 2.5097740718296597, "rewards/rejected": -1.287811279296875, "step": 9559 }, { "epoch": 0.5067182572284208, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4634477.0, "logits/rejected": -14894624.0, "logps/chosen": -33.52272033691406, "logps/rejected": -261.8577473958333, "loss": 0.2294, "rewards/chosen": 0.2642326354980469, "rewards/margins": 2.6279077529907227, "rewards/rejected": -2.363675117492676, "step": 9560 }, { "epoch": 0.5067712612302229, "grad_norm": 44.25, "kl": 1.136556625366211, "learning_rate": 5e-07, "logits/chosen": 737236.6666666666, "logits/rejected": -80820048.0, "logps/chosen": -107.80628458658855, "logps/rejected": -525.521240234375, "loss": 0.3745, "rewards/chosen": 0.2049310008684794, "rewards/margins": 3.281130532423655, "rewards/rejected": -3.076199531555176, "step": 9561 }, { "epoch": 0.5068242652320251, "grad_norm": 204.0, "kl": 1.4151840209960938, "learning_rate": 5e-07, "logits/chosen": -59177770.666666664, "logits/rejected": 192769113.6, "logps/chosen": -354.1041666666667, "logps/rejected": -340.6702392578125, "loss": 0.2863, "rewards/chosen": 0.02751260995864868, "rewards/margins": 1.854195272922516, "rewards/rejected": -1.8266826629638673, "step": 9562 }, { "epoch": 0.5068772692338271, "grad_norm": 55.25, "kl": 0.19752025604248047, "learning_rate": 5e-07, "logits/chosen": -23073528.0, "logits/rejected": -20261512.0, "logps/chosen": -279.6100667317708, "logps/rejected": -262.10528564453125, "loss": 0.3894, "rewards/chosen": 0.0669315109650294, "rewards/margins": 2.192808891336123, "rewards/rejected": -2.1258773803710938, "step": 9563 }, { "epoch": 0.5069302732356293, "grad_norm": 27.25, "kl": 1.6479778289794922, "learning_rate": 5e-07, "logits/chosen": 3142525.5, "logits/rejected": -27445101.714285713, "logps/chosen": -41.3671875, "logps/rejected": -196.69426618303572, "loss": 0.1583, "rewards/chosen": 0.8362594842910767, "rewards/margins": 3.2832744291850497, "rewards/rejected": -2.447014944893973, "step": 9564 }, { "epoch": 0.5069832772374314, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1024616.6666666666, "logits/rejected": -49518524.8, "logps/chosen": -176.6374308268229, "logps/rejected": -448.351123046875, "loss": 0.2051, "rewards/chosen": 0.6668917338053385, "rewards/margins": 3.1728054682413735, "rewards/rejected": -2.505913734436035, "step": 9565 }, { "epoch": 0.5070362812392336, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42779325.333333336, "logits/rejected": 2302136.0, "logps/chosen": -495.6920166015625, "logps/rejected": -862.73623046875, "loss": 0.1441, "rewards/chosen": 1.0850168863932292, "rewards/margins": 4.9777472178141275, "rewards/rejected": -3.8927303314208985, "step": 9566 }, { "epoch": 0.5070892852410357, "grad_norm": 27.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -530858.0, "logits/rejected": -29881682.666666668, "logps/chosen": -28.82318115234375, "logps/rejected": -203.4734090169271, "loss": 0.1882, "rewards/chosen": 0.27085790038108826, "rewards/margins": 2.8173822661240897, "rewards/rejected": -2.5465243657430015, "step": 9567 }, { "epoch": 0.5071422892428379, "grad_norm": 42.5, "kl": 0.36590099334716797, "learning_rate": 5e-07, "logits/chosen": -70111456.0, "logits/rejected": -28259996.0, "logps/chosen": -480.9056396484375, "logps/rejected": -380.3714599609375, "loss": 0.3337, "rewards/chosen": 0.08688086271286011, "rewards/margins": 2.3327309489250183, "rewards/rejected": -2.245850086212158, "step": 9568 }, { "epoch": 0.50719529324464, "grad_norm": 70.5, "kl": 1.3492012023925781, "learning_rate": 5e-07, "logits/chosen": -19010656.0, "logits/rejected": -89760528.0, "logps/chosen": -321.99072265625, "logps/rejected": -316.709228515625, "loss": 0.3152, "rewards/chosen": 0.8079327344894409, "rewards/margins": 2.0869301557540894, "rewards/rejected": -1.2789974212646484, "step": 9569 }, { "epoch": 0.5072482972464422, "grad_norm": 62.25, "kl": 1.320709228515625, "learning_rate": 5e-07, "logits/chosen": -75117234.28571428, "logits/rejected": -5790278.0, "logps/chosen": -382.05165318080356, "logps/rejected": -56.020530700683594, "loss": 0.4378, "rewards/chosen": 0.3221573829650879, "rewards/margins": 1.230709195137024, "rewards/rejected": -0.908551812171936, "step": 9570 }, { "epoch": 0.5073013012482442, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34862620.0, "logits/rejected": -24632196.0, "logps/chosen": -277.51470947265625, "logps/rejected": -279.94073486328125, "loss": 0.3268, "rewards/chosen": -0.12063980102539062, "rewards/margins": 2.2527804374694824, "rewards/rejected": -2.373420238494873, "step": 9571 }, { "epoch": 0.5073543052500464, "grad_norm": 45.75, "kl": 0.08767127990722656, "learning_rate": 5e-07, "logits/chosen": -41367180.0, "logits/rejected": -47264320.0, "logps/chosen": -320.12237548828125, "logps/rejected": -369.5915832519531, "loss": 0.2688, "rewards/chosen": 0.3197852373123169, "rewards/margins": 2.974915862083435, "rewards/rejected": -2.655130624771118, "step": 9572 }, { "epoch": 0.5074073092518485, "grad_norm": 51.75, "kl": 1.157853126525879, "learning_rate": 5e-07, "logits/chosen": -24248840.0, "logits/rejected": -25017466.666666668, "logps/chosen": -376.3644775390625, "logps/rejected": -407.0664469401042, "loss": 0.2888, "rewards/chosen": 0.5627519130706787, "rewards/margins": 3.211721086502075, "rewards/rejected": -2.6489691734313965, "step": 9573 }, { "epoch": 0.5074603132536506, "grad_norm": 105.0, "kl": 1.5778427124023438, "learning_rate": 5e-07, "logits/chosen": -16122461.333333334, "logits/rejected": -2473606.75, "logps/chosen": -358.4973958333333, "logps/rejected": -127.4439697265625, "loss": 0.4355, "rewards/chosen": 0.4570883512496948, "rewards/margins": 1.151413083076477, "rewards/rejected": -0.6943247318267822, "step": 9574 }, { "epoch": 0.5075133172554528, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54051340.0, "logits/rejected": -11530980.0, "logps/chosen": -548.6082763671875, "logps/rejected": -328.51340738932294, "loss": 0.2231, "rewards/chosen": 0.642437756061554, "rewards/margins": 2.3245920936266584, "rewards/rejected": -1.6821543375651042, "step": 9575 }, { "epoch": 0.5075663212572549, "grad_norm": 46.25, "kl": 1.732743263244629, "learning_rate": 5e-07, "logits/chosen": -11235329.333333334, "logits/rejected": -20080678.0, "logps/chosen": -180.72465006510416, "logps/rejected": -242.04006958007812, "loss": 0.3825, "rewards/chosen": 0.34897875785827637, "rewards/margins": 2.922224283218384, "rewards/rejected": -2.5732455253601074, "step": 9576 }, { "epoch": 0.5076193252590571, "grad_norm": 55.25, "kl": 0.9718780517578125, "learning_rate": 5e-07, "logits/chosen": -37932964.0, "logits/rejected": -69205536.0, "logps/chosen": -295.3563537597656, "logps/rejected": -447.21221923828125, "loss": 0.3015, "rewards/chosen": 0.32521992921829224, "rewards/margins": 2.8495448231697083, "rewards/rejected": -2.524324893951416, "step": 9577 }, { "epoch": 0.5076723292608591, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32465784.0, "logits/rejected": -25191885.333333332, "logps/chosen": -576.9606323242188, "logps/rejected": -332.65244547526044, "loss": 0.1293, "rewards/chosen": 1.2025772333145142, "rewards/margins": 3.7313324213027954, "rewards/rejected": -2.5287551879882812, "step": 9578 }, { "epoch": 0.5077253332626613, "grad_norm": 48.0, "kl": 1.6510047912597656, "learning_rate": 5e-07, "logits/chosen": -28997000.0, "logits/rejected": 39890236.0, "logps/chosen": -290.8233642578125, "logps/rejected": -310.1342468261719, "loss": 0.27, "rewards/chosen": 0.4633239209651947, "rewards/margins": 3.3048880994319916, "rewards/rejected": -2.841564178466797, "step": 9579 }, { "epoch": 0.5077783372644634, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3227657.0, "logits/rejected": -11076372.57142857, "logps/chosen": -9.216278076171875, "logps/rejected": -71.52189418247768, "loss": 0.2088, "rewards/chosen": -0.1133575439453125, "rewards/margins": 1.819812638419015, "rewards/rejected": -1.9331701823643275, "step": 9580 }, { "epoch": 0.5078313412662656, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87312344.0, "logits/rejected": -15975016.0, "logps/chosen": -335.07489013671875, "logps/rejected": -348.7906901041667, "loss": 0.2623, "rewards/chosen": -0.08369293808937073, "rewards/margins": 1.630018800497055, "rewards/rejected": -1.7137117385864258, "step": 9581 }, { "epoch": 0.5078843452680677, "grad_norm": 51.75, "kl": 4.941247940063477, "learning_rate": 5e-07, "logits/chosen": -44657096.0, "logits/rejected": -8802021.0, "logps/chosen": -406.2008463541667, "logps/rejected": -276.9872131347656, "loss": 0.4142, "rewards/chosen": 0.7172943751017252, "rewards/margins": 1.5325127641359964, "rewards/rejected": -0.8152183890342712, "step": 9582 }, { "epoch": 0.5079373492698699, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25113769.14285714, "logits/rejected": -316605440.0, "logps/chosen": -249.40956333705358, "logps/rejected": -506.7092590332031, "loss": 0.4406, "rewards/chosen": 0.029741246785436357, "rewards/margins": 3.1547991825001582, "rewards/rejected": -3.1250579357147217, "step": 9583 }, { "epoch": 0.507990353271672, "grad_norm": 53.25, "kl": 1.0707149505615234, "learning_rate": 5e-07, "logits/chosen": -17041278.0, "logits/rejected": -51007132.0, "logps/chosen": -92.86714172363281, "logps/rejected": -338.6310729980469, "loss": 0.3234, "rewards/chosen": 0.3136676847934723, "rewards/margins": 2.57748880982399, "rewards/rejected": -2.2638211250305176, "step": 9584 }, { "epoch": 0.5080433572734742, "grad_norm": 63.75, "kl": 2.4846630096435547, "learning_rate": 5e-07, "logits/chosen": -56324052.0, "logps/chosen": -399.9652404785156, "loss": 0.4369, "rewards/chosen": 0.6156958937644958, "step": 9585 }, { "epoch": 0.5080963612752762, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27878298.0, "logits/rejected": -5106989.0, "logps/chosen": -302.84381103515625, "logps/rejected": -281.4036865234375, "loss": 0.252, "rewards/chosen": 0.7030032873153687, "rewards/margins": 2.8912214040756226, "rewards/rejected": -2.188218116760254, "step": 9586 }, { "epoch": 0.5081493652770784, "grad_norm": 57.25, "kl": 0.7510566711425781, "learning_rate": 5e-07, "logits/chosen": -38247546.666666664, "logits/rejected": -34619312.0, "logps/chosen": -262.2465413411458, "logps/rejected": -337.7115478515625, "loss": 0.3638, "rewards/chosen": 0.3325562874476115, "rewards/margins": 2.56116513411204, "rewards/rejected": -2.2286088466644287, "step": 9587 }, { "epoch": 0.5082023692788805, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53130048.0, "logits/rejected": -146615968.0, "logps/chosen": -567.150927734375, "logps/rejected": -401.9545491536458, "loss": 0.2799, "rewards/chosen": 0.5134414672851563, "rewards/margins": 3.1285079956054687, "rewards/rejected": -2.6150665283203125, "step": 9588 }, { "epoch": 0.5082553732806827, "grad_norm": 46.5, "kl": 0.43563365936279297, "learning_rate": 5e-07, "logits/chosen": -24552786.666666668, "logits/rejected": 34404928.0, "logps/chosen": -287.8403727213542, "logps/rejected": -383.77508544921875, "loss": 0.3814, "rewards/chosen": 0.3892677625020345, "rewards/margins": 1.8086583216985066, "rewards/rejected": -1.4193905591964722, "step": 9589 }, { "epoch": 0.5083083772824848, "grad_norm": 53.25, "kl": 2.1294002532958984, "learning_rate": 5e-07, "logits/chosen": -49288848.0, "logits/rejected": 4611841.5, "logps/chosen": -372.86614990234375, "logps/rejected": -159.27902221679688, "loss": 0.3419, "rewards/chosen": 0.9989689588546753, "rewards/margins": 1.8323525190353394, "rewards/rejected": -0.8333835601806641, "step": 9590 }, { "epoch": 0.508361381284287, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39525458.666666664, "logits/rejected": 2014714.2, "logps/chosen": -549.265380859375, "logps/rejected": -172.312255859375, "loss": 0.2172, "rewards/chosen": 0.6825429598490397, "rewards/margins": 2.8908898989359537, "rewards/rejected": -2.208346939086914, "step": 9591 }, { "epoch": 0.5084143852860891, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44846748.8, "logits/rejected": -9081662.666666666, "logps/chosen": -283.85693359375, "logps/rejected": -213.2410888671875, "loss": 0.3226, "rewards/chosen": 0.35036311149597166, "rewards/margins": 2.387670373916626, "rewards/rejected": -2.0373072624206543, "step": 9592 }, { "epoch": 0.5084673892878913, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23192899.2, "logits/rejected": -18701042.666666668, "logps/chosen": -764.3224609375, "logps/rejected": -357.4258626302083, "loss": 0.3001, "rewards/chosen": 0.6546792984008789, "rewards/margins": 3.3221222559611, "rewards/rejected": -2.667442957560221, "step": 9593 }, { "epoch": 0.5085203932896933, "grad_norm": 47.0, "kl": 0.6835441589355469, "learning_rate": 5e-07, "logits/chosen": -29050192.0, "logits/rejected": -11898952.8, "logps/chosen": -325.3810628255208, "logps/rejected": -341.2714599609375, "loss": 0.2494, "rewards/chosen": 0.30562134583791095, "rewards/margins": 2.9576454242070516, "rewards/rejected": -2.6520240783691404, "step": 9594 }, { "epoch": 0.5085733972914955, "grad_norm": 38.5, "kl": 1.4387187957763672, "learning_rate": 5e-07, "logits/chosen": 388621.25, "logits/rejected": -40942256.0, "logps/chosen": -127.33922576904297, "logps/rejected": -343.28558349609375, "loss": 0.228, "rewards/chosen": 0.9822657108306885, "rewards/margins": 3.0423319339752197, "rewards/rejected": -2.0600662231445312, "step": 9595 }, { "epoch": 0.5086264012932976, "grad_norm": 39.75, "kl": 1.3066997528076172, "learning_rate": 5e-07, "logits/chosen": 374353.05, "logits/rejected": -14462466.666666666, "logps/chosen": -48.229129028320315, "logps/rejected": -476.8971354166667, "loss": 0.3175, "rewards/chosen": 0.3650089740753174, "rewards/margins": 3.56537340482076, "rewards/rejected": -3.200364430745443, "step": 9596 }, { "epoch": 0.5086794052950998, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66078309.333333336, "logits/rejected": -22273836.8, "logps/chosen": -535.5911051432291, "logps/rejected": -248.818115234375, "loss": 0.2557, "rewards/chosen": 0.5502863725026449, "rewards/margins": 2.1562432130177815, "rewards/rejected": -1.6059568405151368, "step": 9597 }, { "epoch": 0.5087324092969019, "grad_norm": 68.5, "kl": 2.252591133117676, "learning_rate": 5e-07, "logits/chosen": -11322906.285714285, "logits/rejected": -66607248.0, "logps/chosen": -197.93551199776786, "logps/rejected": -426.909912109375, "loss": 0.4106, "rewards/chosen": 0.6887014933994838, "rewards/margins": 1.6799399086407254, "rewards/rejected": -0.9912384152412415, "step": 9598 }, { "epoch": 0.5087854132987041, "grad_norm": 43.25, "kl": 0.10356521606445312, "learning_rate": 5e-07, "logits/chosen": -24509785.6, "logits/rejected": -11297810.666666666, "logps/chosen": -221.846630859375, "logps/rejected": -150.62508138020834, "loss": 0.3239, "rewards/chosen": 0.3071880340576172, "rewards/margins": 2.3917187054951987, "rewards/rejected": -2.0845306714375815, "step": 9599 }, { "epoch": 0.5088384173005062, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22374490.666666668, "logits/rejected": -73871513.6, "logps/chosen": -330.90447998046875, "logps/rejected": -524.678173828125, "loss": 0.1474, "rewards/chosen": 0.8471109867095947, "rewards/margins": 4.0618654727935795, "rewards/rejected": -3.2147544860839843, "step": 9600 }, { "epoch": 0.5088914213023084, "grad_norm": 61.75, "kl": 1.5116195678710938, "learning_rate": 5e-07, "logits/chosen": -56623860.0, "logits/rejected": -3850439.0, "logps/chosen": -420.7374267578125, "logps/rejected": -231.7556610107422, "loss": 0.2833, "rewards/chosen": 0.8002593517303467, "rewards/margins": 2.509208917617798, "rewards/rejected": -1.7089495658874512, "step": 9601 }, { "epoch": 0.5089444253041104, "grad_norm": 48.5, "kl": 1.8710837364196777, "learning_rate": 5e-07, "logits/chosen": -20519220.0, "logits/rejected": -23928620.0, "logps/chosen": -236.68081665039062, "logps/rejected": -495.941650390625, "loss": 0.2447, "rewards/chosen": 0.7905129790306091, "rewards/margins": 3.331066310405731, "rewards/rejected": -2.540553331375122, "step": 9602 }, { "epoch": 0.5089974293059126, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17383822.0, "logits/rejected": -41021440.0, "logps/chosen": -225.83338928222656, "logps/rejected": -335.27724202473956, "loss": 0.1839, "rewards/chosen": 0.5756896734237671, "rewards/margins": 3.1385082006454468, "rewards/rejected": -2.5628185272216797, "step": 9603 }, { "epoch": 0.5090504333077147, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38408304.0, "logits/rejected": 11442152.0, "logps/chosen": -311.7125244140625, "logps/rejected": -293.07175699869794, "loss": 0.2694, "rewards/chosen": -0.38450852036476135, "rewards/margins": 1.531964550415675, "rewards/rejected": -1.9164730707804363, "step": 9604 }, { "epoch": 0.5091034373095169, "grad_norm": 39.0, "kl": 1.901759147644043, "learning_rate": 5e-07, "logits/chosen": -17907390.0, "logits/rejected": -21680296.0, "logps/chosen": -492.1630859375, "logps/rejected": -481.1552429199219, "loss": 0.1239, "rewards/chosen": 1.918852686882019, "rewards/margins": 5.1904579401016235, "rewards/rejected": -3.2716052532196045, "step": 9605 }, { "epoch": 0.509156441311319, "grad_norm": 46.0, "kl": 0.1647930145263672, "learning_rate": 5e-07, "logits/chosen": -19950793.333333332, "logits/rejected": -3665666.4, "logps/chosen": -207.34395345052084, "logps/rejected": -138.814697265625, "loss": 0.2783, "rewards/chosen": 0.5421590407689413, "rewards/margins": 2.0367191871007284, "rewards/rejected": -1.494560146331787, "step": 9606 }, { "epoch": 0.5092094453131212, "grad_norm": 45.5, "kl": 0.6696434020996094, "learning_rate": 5e-07, "logits/chosen": -20659556.0, "logits/rejected": -16198440.0, "logps/chosen": -258.66241455078125, "logps/rejected": -357.6658528645833, "loss": 0.1993, "rewards/chosen": 0.6021087765693665, "rewards/margins": 2.708802044391632, "rewards/rejected": -2.1066932678222656, "step": 9607 }, { "epoch": 0.5092624493149233, "grad_norm": 27.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6155391.0, "logits/rejected": -27788725.333333332, "logps/chosen": -155.98220825195312, "logps/rejected": -307.966796875, "loss": 0.1886, "rewards/chosen": 0.05901488661766052, "rewards/margins": 2.6945014894008636, "rewards/rejected": -2.635486602783203, "step": 9608 }, { "epoch": 0.5093154533167255, "grad_norm": 49.0, "kl": 1.2483329772949219, "learning_rate": 5e-07, "logits/chosen": 41915411.2, "logits/rejected": -36033677.333333336, "logps/chosen": -159.1891845703125, "logps/rejected": -178.210205078125, "loss": 0.3952, "rewards/chosen": 0.012442322075366974, "rewards/margins": 1.8237589711944262, "rewards/rejected": -1.8113166491190593, "step": 9609 }, { "epoch": 0.5093684573185275, "grad_norm": 71.0, "kl": 0.09711074829101562, "learning_rate": 5e-07, "logits/chosen": -75896858.66666667, "logits/rejected": -29840886.0, "logps/chosen": -468.4233805338542, "logps/rejected": -319.57586669921875, "loss": 0.3853, "rewards/chosen": 0.20618335405985513, "rewards/margins": 2.0677053133646646, "rewards/rejected": -1.8615219593048096, "step": 9610 }, { "epoch": 0.5094214613203297, "grad_norm": 59.25, "kl": 0.0098114013671875, "learning_rate": 5e-07, "logits/chosen": -3759053.5, "logits/rejected": -47450800.0, "logps/chosen": -92.55848693847656, "logps/rejected": -321.3837076822917, "loss": 0.2353, "rewards/chosen": 0.44726866483688354, "rewards/margins": 2.450943171977997, "rewards/rejected": -2.0036745071411133, "step": 9611 }, { "epoch": 0.5094744653221318, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18242402.0, "logits/rejected": -8307170.5, "logps/chosen": -185.4954833984375, "logps/rejected": -219.01535034179688, "loss": 0.385, "rewards/chosen": -0.20661143958568573, "rewards/margins": 1.137323573231697, "rewards/rejected": -1.3439350128173828, "step": 9612 }, { "epoch": 0.509527469323934, "grad_norm": 68.0, "kl": 2.263763427734375, "learning_rate": 5e-07, "logits/chosen": -72388512.0, "logits/rejected": -45183184.0, "logps/chosen": -418.504736328125, "logps/rejected": -287.3698323567708, "loss": 0.3922, "rewards/chosen": 0.6205810546875, "rewards/margins": 1.6035704453786215, "rewards/rejected": -0.9829893906911215, "step": 9613 }, { "epoch": 0.5095804733257361, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18768414.0, "logits/rejected": -49316026.666666664, "logps/chosen": -407.1529541015625, "logps/rejected": -414.3623046875, "loss": 0.2156, "rewards/chosen": -0.04241332784295082, "rewards/margins": 2.187763597816229, "rewards/rejected": -2.2301769256591797, "step": 9614 }, { "epoch": 0.5096334773275383, "grad_norm": 39.75, "kl": 2.0550060272216797, "learning_rate": 5e-07, "logits/chosen": -14073238.0, "logits/rejected": -21209096.0, "logps/chosen": -198.91213989257812, "logps/rejected": -265.2352600097656, "loss": 0.2286, "rewards/chosen": 1.172499418258667, "rewards/margins": 3.098941445350647, "rewards/rejected": -1.92644202709198, "step": 9615 }, { "epoch": 0.5096864813293404, "grad_norm": 35.0, "kl": 0.3772468566894531, "learning_rate": 5e-07, "logits/chosen": -16749187.0, "logits/rejected": -39660877.333333336, "logps/chosen": -157.3075714111328, "logps/rejected": -365.3580322265625, "loss": 0.1437, "rewards/chosen": 0.34914588928222656, "rewards/margins": 3.4138902028401694, "rewards/rejected": -3.064744313557943, "step": 9616 }, { "epoch": 0.5097394853311425, "grad_norm": 50.75, "kl": 0.7895545959472656, "learning_rate": 5e-07, "logits/chosen": -41657747.2, "logits/rejected": -75980373.33333333, "logps/chosen": -539.061962890625, "logps/rejected": -312.28271484375, "loss": 0.3735, "rewards/chosen": 0.20747921466827393, "rewards/margins": 2.2577524741490684, "rewards/rejected": -2.0502732594807944, "step": 9617 }, { "epoch": 0.5097924893329446, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70411888.0, "logits/rejected": -6203546.0, "logps/chosen": -144.1614786783854, "logps/rejected": -615.441650390625, "loss": 0.3578, "rewards/chosen": 0.34391005833943683, "rewards/margins": 2.1404434045155845, "rewards/rejected": -1.7965333461761475, "step": 9618 }, { "epoch": 0.5098454933347468, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29229306.666666668, "logits/rejected": -35285520.0, "logps/chosen": -236.5489501953125, "logps/rejected": -464.64248046875, "loss": 0.2758, "rewards/chosen": 0.09592387080192566, "rewards/margins": 1.9913868606090546, "rewards/rejected": -1.895462989807129, "step": 9619 }, { "epoch": 0.5098984973365489, "grad_norm": 109.0, "kl": 0.8476161956787109, "learning_rate": 5e-07, "logits/chosen": -45454828.8, "logits/rejected": -36275160.0, "logps/chosen": -258.231201171875, "logps/rejected": -300.07326253255206, "loss": 0.3137, "rewards/chosen": 0.42812886238098147, "rewards/margins": 2.5996215979258217, "rewards/rejected": -2.1714927355448403, "step": 9620 }, { "epoch": 0.5099515013383511, "grad_norm": 56.5, "kl": 0.8905496597290039, "learning_rate": 5e-07, "logits/chosen": 5123099.333333333, "logits/rejected": 14138089.6, "logps/chosen": -236.2872111002604, "logps/rejected": -333.18837890625, "loss": 0.2098, "rewards/chosen": 0.6362658341725668, "rewards/margins": 3.053827174504598, "rewards/rejected": -2.417561340332031, "step": 9621 }, { "epoch": 0.5100045053401532, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 92009984.0, "logits/rejected": -41726098.666666664, "logps/chosen": -344.3730773925781, "logps/rejected": -534.9186197916666, "loss": 0.2134, "rewards/chosen": 0.4028778076171875, "rewards/margins": 2.782992362976074, "rewards/rejected": -2.3801145553588867, "step": 9622 }, { "epoch": 0.5100575093419554, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65166042.666666664, "logits/rejected": -19731931.2, "logps/chosen": -690.9290364583334, "logps/rejected": -140.41195068359374, "loss": 0.2467, "rewards/chosen": 0.5452018976211548, "rewards/margins": 2.7637317895889284, "rewards/rejected": -2.2185298919677736, "step": 9623 }, { "epoch": 0.5101105133437575, "grad_norm": 35.25, "kl": 0.5319356918334961, "learning_rate": 5e-07, "logits/chosen": -802350.3333333334, "logits/rejected": -3716304.8, "logps/chosen": -78.76404317220052, "logps/rejected": -175.91519775390626, "loss": 0.3739, "rewards/chosen": -1.0126532713572185, "rewards/margins": 0.82105606396993, "rewards/rejected": -1.8337093353271485, "step": 9624 }, { "epoch": 0.5101635173455595, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19622204.0, "logits/rejected": -41388140.0, "logps/chosen": -376.1827392578125, "logps/rejected": -216.15924072265625, "loss": 0.2879, "rewards/chosen": 0.4025917053222656, "rewards/margins": 3.123173236846924, "rewards/rejected": -2.720581531524658, "step": 9625 }, { "epoch": 0.5102165213473617, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8572684.0, "logits/rejected": -17630766.0, "logps/chosen": -312.19805908203125, "logps/rejected": -438.8436584472656, "loss": 0.2235, "rewards/chosen": 1.0288364887237549, "rewards/margins": 3.150648593902588, "rewards/rejected": -2.121812105178833, "step": 9626 }, { "epoch": 0.5102695253491638, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12689886.0, "logits/rejected": -5020605.0, "logps/chosen": -100.20361328125, "logps/rejected": -233.6522979736328, "loss": 0.3496, "rewards/chosen": 0.03282230347394943, "rewards/margins": 1.961326651275158, "rewards/rejected": -1.9285043478012085, "step": 9627 }, { "epoch": 0.510322529350966, "grad_norm": 54.5, "kl": 2.152254104614258, "learning_rate": 5e-07, "logits/chosen": -11883888.0, "logits/rejected": -21846156.0, "logps/chosen": -530.957275390625, "logps/rejected": -258.351806640625, "loss": 0.256, "rewards/chosen": 1.2957477569580078, "rewards/margins": 2.6257073879241943, "rewards/rejected": -1.3299596309661865, "step": 9628 }, { "epoch": 0.5103755333527681, "grad_norm": 56.75, "kl": 1.3517236709594727, "learning_rate": 5e-07, "logits/chosen": -33500810.0, "logits/rejected": -25517342.0, "logps/chosen": -187.44784545898438, "logps/rejected": -266.4443359375, "loss": 0.2762, "rewards/chosen": 1.0181137323379517, "rewards/margins": 2.4810978174209595, "rewards/rejected": -1.4629840850830078, "step": 9629 }, { "epoch": 0.5104285373545703, "grad_norm": 59.0, "kl": 0.9346504211425781, "learning_rate": 5e-07, "logits/chosen": -4977640.571428572, "logits/rejected": -20053738.0, "logps/chosen": -106.36272321428571, "logps/rejected": -358.18841552734375, "loss": 0.4136, "rewards/chosen": 0.24765254770006453, "rewards/margins": 3.512429254395621, "rewards/rejected": -3.2647767066955566, "step": 9630 }, { "epoch": 0.5104815413563724, "grad_norm": 46.75, "kl": 3.6218929290771484, "learning_rate": 5e-07, "logits/chosen": -4415445.2, "logits/rejected": -58910048.0, "logps/chosen": -224.81767578125, "logps/rejected": -423.1087239583333, "loss": 0.3213, "rewards/chosen": 0.8130321502685547, "rewards/margins": 3.023421287536621, "rewards/rejected": -2.2103891372680664, "step": 9631 }, { "epoch": 0.5105345453581746, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11168642.666666666, "logits/rejected": -21858244.8, "logps/chosen": -203.86722819010416, "logps/rejected": -256.7611572265625, "loss": 0.1885, "rewards/chosen": 0.8984379768371582, "rewards/margins": 3.3971579551696776, "rewards/rejected": -2.4987199783325194, "step": 9632 }, { "epoch": 0.5105875493599766, "grad_norm": 37.5, "kl": 0.2149829864501953, "learning_rate": 5e-07, "logits/chosen": -2367309.25, "logits/rejected": -23523517.333333332, "logps/chosen": -177.05453491210938, "logps/rejected": -451.1459147135417, "loss": 0.2173, "rewards/chosen": 0.9069751501083374, "rewards/margins": 3.5047943194707236, "rewards/rejected": -2.597819169362386, "step": 9633 }, { "epoch": 0.5106405533617788, "grad_norm": 62.5, "kl": 3.361687183380127, "learning_rate": 5e-07, "logits/chosen": -33039280.0, "logits/rejected": -41200437.333333336, "logps/chosen": -571.37177734375, "logps/rejected": -250.6177978515625, "loss": 0.2773, "rewards/chosen": 1.251384449005127, "rewards/margins": 2.5412246704101564, "rewards/rejected": -1.2898402214050293, "step": 9634 }, { "epoch": 0.5106935573635809, "grad_norm": 42.75, "kl": 0.870330810546875, "learning_rate": 5e-07, "logits/chosen": -50069980.0, "logits/rejected": -40900084.0, "logps/chosen": -179.09481811523438, "logps/rejected": -243.67462158203125, "loss": 0.3202, "rewards/chosen": 0.06380286812782288, "rewards/margins": 2.0598314106464386, "rewards/rejected": -1.9960285425186157, "step": 9635 }, { "epoch": 0.5107465613653831, "grad_norm": 48.25, "kl": 0.949676513671875, "learning_rate": 5e-07, "logits/chosen": -11904470.666666666, "logits/rejected": -4088879.6, "logps/chosen": -260.08542887369794, "logps/rejected": -449.984716796875, "loss": 0.2837, "rewards/chosen": 0.6831130981445312, "rewards/margins": 2.952951431274414, "rewards/rejected": -2.269838333129883, "step": 9636 }, { "epoch": 0.5107995653671852, "grad_norm": 57.75, "kl": 2.5744800567626953, "learning_rate": 5e-07, "logits/chosen": -39530660.0, "logits/rejected": -6166874.5, "logps/chosen": -382.024169921875, "logps/rejected": -345.11700439453125, "loss": 0.323, "rewards/chosen": 0.6425445675849915, "rewards/margins": 2.545038163661957, "rewards/rejected": -1.9024935960769653, "step": 9637 }, { "epoch": 0.5108525693689874, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42499928.0, "logits/rejected": -19373086.0, "logps/chosen": -416.5160827636719, "logps/rejected": -277.1755065917969, "loss": 0.2796, "rewards/chosen": 0.06634864211082458, "rewards/margins": 2.742096036672592, "rewards/rejected": -2.6757473945617676, "step": 9638 }, { "epoch": 0.5109055733707895, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25345772.0, "logits/rejected": -16117315.42857143, "logps/chosen": -536.0101928710938, "logps/rejected": -206.99571010044642, "loss": 0.3312, "rewards/chosen": -0.222625732421875, "rewards/margins": 0.7931380271911621, "rewards/rejected": -1.015763759613037, "step": 9639 }, { "epoch": 0.5109585773725916, "grad_norm": 50.75, "kl": 0.3077526092529297, "learning_rate": 5e-07, "logits/chosen": -19840750.666666668, "logits/rejected": -21101004.0, "logps/chosen": -246.0253702799479, "logps/rejected": -366.1252746582031, "loss": 0.3397, "rewards/chosen": 0.29837791124979657, "rewards/margins": 3.2490705649058023, "rewards/rejected": -2.950692653656006, "step": 9640 }, { "epoch": 0.5110115813743937, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30975350.4, "logits/rejected": -43472202.666666664, "logps/chosen": -642.269775390625, "logps/rejected": -359.7815755208333, "loss": 0.2002, "rewards/chosen": 1.3682872772216796, "rewards/margins": 4.052079073588053, "rewards/rejected": -2.6837917963663735, "step": 9641 }, { "epoch": 0.5110645853761959, "grad_norm": 37.0, "kl": 3.5036659240722656, "learning_rate": 5e-07, "logits/chosen": -9072531.0, "logits/rejected": -25476436.0, "logps/chosen": -139.00650024414062, "logps/rejected": -347.6509704589844, "loss": 0.3048, "rewards/chosen": 0.4137064218521118, "rewards/margins": 2.9716182947158813, "rewards/rejected": -2.5579118728637695, "step": 9642 }, { "epoch": 0.511117589377998, "grad_norm": 47.75, "kl": 0.4325599670410156, "learning_rate": 5e-07, "logits/chosen": -35422560.0, "logits/rejected": -14347846.4, "logps/chosen": -251.44059244791666, "logps/rejected": -402.5336669921875, "loss": 0.251, "rewards/chosen": 0.3070841630299886, "rewards/margins": 2.4536396821339927, "rewards/rejected": -2.146555519104004, "step": 9643 }, { "epoch": 0.5111705933798002, "grad_norm": 67.0, "kl": 1.8422508239746094, "learning_rate": 5e-07, "logits/chosen": -13608800.0, "logits/rejected": 12175218.0, "logps/chosen": -216.69235229492188, "logps/rejected": -508.5101623535156, "loss": 0.2705, "rewards/chosen": 0.8618634939193726, "rewards/margins": 2.962165951728821, "rewards/rejected": -2.1003024578094482, "step": 9644 }, { "epoch": 0.5112235973816023, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81020074.66666667, "logits/rejected": -20229652.8, "logps/chosen": -184.15144856770834, "logps/rejected": -435.821875, "loss": 0.2643, "rewards/chosen": 0.038852756222089134, "rewards/margins": 3.074646442135175, "rewards/rejected": -3.035793685913086, "step": 9645 }, { "epoch": 0.5112766013834045, "grad_norm": 75.0, "kl": 3.559114456176758, "learning_rate": 5e-07, "logits/chosen": -55774290.28571428, "logits/rejected": -24657390.0, "logps/chosen": -429.67564174107144, "logps/rejected": -299.71142578125, "loss": 0.4145, "rewards/chosen": 0.6212471553257534, "rewards/margins": 2.3179695435932706, "rewards/rejected": -1.696722388267517, "step": 9646 }, { "epoch": 0.5113296053852066, "grad_norm": 44.25, "kl": 0.9441585540771484, "learning_rate": 5e-07, "logits/chosen": -37949222.4, "logits/rejected": -10513699.333333334, "logps/chosen": -436.415478515625, "logps/rejected": -364.5450439453125, "loss": 0.315, "rewards/chosen": 0.929278564453125, "rewards/margins": 2.802239449818929, "rewards/rejected": -1.872960885365804, "step": 9647 }, { "epoch": 0.5113826093870087, "grad_norm": 47.0, "kl": 0.15155982971191406, "learning_rate": 5e-07, "logits/chosen": -6266614.0, "logits/rejected": -3388667.3333333335, "logps/chosen": -254.910888671875, "logps/rejected": -65.1043701171875, "loss": 0.2715, "rewards/chosen": 0.7837154388427734, "rewards/margins": 2.6940163294474284, "rewards/rejected": -1.910300890604655, "step": 9648 }, { "epoch": 0.5114356133888108, "grad_norm": 56.25, "kl": 0.50286865234375, "learning_rate": 5e-07, "logits/chosen": 7133878.4, "logits/rejected": -7282114.666666667, "logps/chosen": -379.2741455078125, "logps/rejected": -152.50360107421875, "loss": 0.2917, "rewards/chosen": 0.6856475830078125, "rewards/margins": 3.3223785718282066, "rewards/rejected": -2.636730988820394, "step": 9649 }, { "epoch": 0.511488617390613, "grad_norm": 69.5, "kl": 0.8019142150878906, "learning_rate": 5e-07, "logits/chosen": -57048179.2, "logits/rejected": -41598522.666666664, "logps/chosen": -567.538623046875, "logps/rejected": -373.4999593098958, "loss": 0.3175, "rewards/chosen": 0.3455584764480591, "rewards/margins": 2.7234770377477013, "rewards/rejected": -2.377918561299642, "step": 9650 }, { "epoch": 0.5115416213924151, "grad_norm": 41.5, "kl": 1.2105464935302734, "learning_rate": 5e-07, "logits/chosen": 2979336.0, "logits/rejected": -45917146.666666664, "logps/chosen": -206.47557067871094, "logps/rejected": -515.09423828125, "loss": 0.174, "rewards/chosen": 0.614561915397644, "rewards/margins": 3.5590002139409385, "rewards/rejected": -2.9444382985432944, "step": 9651 }, { "epoch": 0.5115946253942173, "grad_norm": 48.75, "kl": 1.3595094680786133, "learning_rate": 5e-07, "logits/chosen": -37189786.666666664, "logits/rejected": -28233290.0, "logps/chosen": -187.9574178059896, "logps/rejected": -228.15774536132812, "loss": 0.3782, "rewards/chosen": 0.3276243011156718, "rewards/margins": 2.7583632270495095, "rewards/rejected": -2.430738925933838, "step": 9652 }, { "epoch": 0.5116476293960194, "grad_norm": 36.0, "kl": 1.426849365234375, "learning_rate": 5e-07, "logits/chosen": -22110770.0, "logits/rejected": -11515331.0, "logps/chosen": -198.07908630371094, "logps/rejected": -459.32720947265625, "loss": 0.2538, "rewards/chosen": 0.7153973579406738, "rewards/margins": 2.9352810382843018, "rewards/rejected": -2.219883680343628, "step": 9653 }, { "epoch": 0.5117006333978216, "grad_norm": 58.75, "kl": 1.8295478820800781, "learning_rate": 5e-07, "logits/chosen": -15455310.4, "logits/rejected": -13496528.0, "logps/chosen": -158.72833251953125, "logps/rejected": -239.45829264322916, "loss": 0.425, "rewards/chosen": 0.21729109287261963, "rewards/margins": 1.2692380507787069, "rewards/rejected": -1.0519469579060872, "step": 9654 }, { "epoch": 0.5117536373996237, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29920534.0, "logits/rejected": -46283392.0, "logps/chosen": -256.2510986328125, "logps/rejected": -543.7813720703125, "loss": 0.3307, "rewards/chosen": -0.08396434783935547, "rewards/margins": 2.8172221183776855, "rewards/rejected": -2.901186466217041, "step": 9655 }, { "epoch": 0.5118066414014258, "grad_norm": 52.25, "kl": 1.7577104568481445, "learning_rate": 5e-07, "logits/chosen": -39299497.6, "logits/rejected": 1038235.3333333334, "logps/chosen": -367.8634765625, "logps/rejected": -104.37381998697917, "loss": 0.3736, "rewards/chosen": 1.0790114402770996, "rewards/margins": 1.6773808797200522, "rewards/rejected": -0.5983694394429525, "step": 9656 }, { "epoch": 0.5118596454032279, "grad_norm": 56.0, "kl": 1.042642593383789, "learning_rate": 5e-07, "logits/chosen": -61538026.666666664, "logits/rejected": -13369187.2, "logps/chosen": -174.814208984375, "logps/rejected": -305.081298828125, "loss": 0.3288, "rewards/chosen": -0.1376999020576477, "rewards/margins": 2.0273397326469422, "rewards/rejected": -2.16503963470459, "step": 9657 }, { "epoch": 0.5119126494050301, "grad_norm": 54.25, "kl": 2.1732521057128906, "learning_rate": 5e-07, "logits/chosen": -25527637.333333332, "logits/rejected": -1642634.0, "logps/chosen": -311.15557861328125, "logps/rejected": -95.83757781982422, "loss": 0.4051, "rewards/chosen": 0.4834233522415161, "rewards/margins": 1.3974248170852661, "rewards/rejected": -0.91400146484375, "step": 9658 }, { "epoch": 0.5119656534068322, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56605668.0, "logits/rejected": -14605784.0, "logps/chosen": -350.31060791015625, "logps/rejected": -188.6390177408854, "loss": 0.2637, "rewards/chosen": 0.07445831596851349, "rewards/margins": 1.8371841460466385, "rewards/rejected": -1.762725830078125, "step": 9659 }, { "epoch": 0.5120186574086344, "grad_norm": 53.25, "kl": 0.054924964904785156, "learning_rate": 5e-07, "logits/chosen": 1831431.6, "logits/rejected": -13530666.666666666, "logps/chosen": -190.67080078125, "logps/rejected": -312.1106363932292, "loss": 0.43, "rewards/chosen": -0.36347188949584963, "rewards/margins": 1.2424365043640138, "rewards/rejected": -1.6059083938598633, "step": 9660 }, { "epoch": 0.5120716614104365, "grad_norm": 93.0, "kl": 4.1373291015625, "learning_rate": 5e-07, "logits/chosen": 7324956.0, "logits/rejected": -52635780.0, "logps/chosen": -371.7718098958333, "logps/rejected": -491.52423095703125, "loss": 0.3514, "rewards/chosen": 0.7616728146870931, "rewards/margins": 3.202439864476522, "rewards/rejected": -2.4407670497894287, "step": 9661 }, { "epoch": 0.5121246654122387, "grad_norm": 47.5, "kl": 0.2807464599609375, "learning_rate": 5e-07, "logits/chosen": -46241896.0, "logits/rejected": -19235044.0, "logps/chosen": -238.95057678222656, "logps/rejected": -470.0423583984375, "loss": 0.2725, "rewards/chosen": 0.2826657295227051, "rewards/margins": 2.5803260803222656, "rewards/rejected": -2.2976603507995605, "step": 9662 }, { "epoch": 0.5121776694140407, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28327844.0, "logits/rejected": 75351392.0, "logps/chosen": -649.157958984375, "logps/rejected": -659.1075032552084, "loss": 0.2467, "rewards/chosen": 0.7808166742324829, "rewards/margins": 3.5355332295099893, "rewards/rejected": -2.7547165552775064, "step": 9663 }, { "epoch": 0.5122306734158429, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50064140.0, "logits/rejected": -25335408.0, "logps/chosen": -390.355224609375, "logps/rejected": -376.3521728515625, "loss": 0.2883, "rewards/chosen": 0.6469253897666931, "rewards/margins": 2.548470914363861, "rewards/rejected": -1.901545524597168, "step": 9664 }, { "epoch": 0.512283677417645, "grad_norm": 63.75, "kl": 3.4391517639160156, "learning_rate": 5e-07, "logits/chosen": -75830950.4, "logits/rejected": -11222325.333333334, "logps/chosen": -783.487109375, "logps/rejected": -99.1105244954427, "loss": 0.3146, "rewards/chosen": 1.0195807456970214, "rewards/margins": 3.1390456199645995, "rewards/rejected": -2.119464874267578, "step": 9665 }, { "epoch": 0.5123366814194472, "grad_norm": 51.75, "kl": 1.094710350036621, "learning_rate": 5e-07, "logits/chosen": -40772659.2, "logits/rejected": -49702645.333333336, "logps/chosen": -255.323974609375, "logps/rejected": -367.9368896484375, "loss": 0.3887, "rewards/chosen": 0.04782357215881348, "rewards/margins": 2.244405730565389, "rewards/rejected": -2.1965821584065757, "step": 9666 }, { "epoch": 0.5123896854212493, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5043530.0, "logits/rejected": -39739049.14285714, "logps/chosen": -118.14154052734375, "logps/rejected": -345.1823032924107, "loss": 0.2093, "rewards/chosen": -0.6085349917411804, "rewards/margins": 1.5698947480746677, "rewards/rejected": -2.178429739815848, "step": 9667 }, { "epoch": 0.5124426894230515, "grad_norm": 43.5, "kl": 3.3948726654052734, "learning_rate": 5e-07, "logits/chosen": -25534946.0, "logits/rejected": -12302421.0, "logps/chosen": -388.40179443359375, "logps/rejected": -254.24302673339844, "loss": 0.3274, "rewards/chosen": 0.8737221360206604, "rewards/margins": 3.0054920315742493, "rewards/rejected": -2.131769895553589, "step": 9668 }, { "epoch": 0.5124956934248536, "grad_norm": 74.0, "kl": 8.454651832580566, "learning_rate": 5e-07, "logits/chosen": -965816.8571428572, "logits/rejected": -77547192.0, "logps/chosen": -617.9109235491071, "logps/rejected": -745.7490844726562, "loss": 0.4504, "rewards/chosen": 0.9734391484941755, "rewards/margins": 4.85377972466605, "rewards/rejected": -3.880340576171875, "step": 9669 }, { "epoch": 0.5125486974266558, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91059024.0, "logits/rejected": 2664416.5714285714, "logps/chosen": -243.53602600097656, "logps/rejected": -247.97813197544642, "loss": 0.2495, "rewards/chosen": 0.46456605195999146, "rewards/margins": 1.7439446704728263, "rewards/rejected": -1.2793786185128349, "step": 9670 }, { "epoch": 0.5126017014284578, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13795224.0, "logits/rejected": -39935653.333333336, "logps/chosen": -334.2430419921875, "logps/rejected": -269.64162190755206, "loss": 0.2535, "rewards/chosen": 0.05305023491382599, "rewards/margins": 1.8114123692115147, "rewards/rejected": -1.7583621342976887, "step": 9671 }, { "epoch": 0.51265470543026, "grad_norm": 49.5, "kl": 2.1143083572387695, "learning_rate": 5e-07, "logits/chosen": -39845299.2, "logits/rejected": -70557616.0, "logps/chosen": -292.7703369140625, "logps/rejected": -420.287353515625, "loss": 0.3241, "rewards/chosen": 0.5557410240173339, "rewards/margins": 3.4064649264017737, "rewards/rejected": -2.85072390238444, "step": 9672 }, { "epoch": 0.5127077094320621, "grad_norm": 75.0, "kl": 0.9358654022216797, "learning_rate": 5e-07, "logits/chosen": -27544134.4, "logits/rejected": -15604957.333333334, "logps/chosen": -251.3837890625, "logps/rejected": -712.4158528645834, "loss": 0.378, "rewards/chosen": 0.34660773277282714, "rewards/margins": 1.5584699789683025, "rewards/rejected": -1.2118622461954753, "step": 9673 }, { "epoch": 0.5127607134338642, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34147157.333333336, "logits/rejected": -4344550.0, "logps/chosen": -157.4979044596354, "logps/rejected": -261.637646484375, "loss": 0.2501, "rewards/chosen": -0.04187901814778646, "rewards/margins": 2.714975039164225, "rewards/rejected": -2.7568540573120117, "step": 9674 }, { "epoch": 0.5128137174356664, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -104079792.0, "logits/rejected": -19089134.85714286, "logps/chosen": -696.951416015625, "logps/rejected": -300.87227957589283, "loss": 0.18, "rewards/chosen": 1.3289062976837158, "rewards/margins": 2.9615046296800887, "rewards/rejected": -1.6325983319963728, "step": 9675 }, { "epoch": 0.5128667214374685, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21480420.0, "logits/rejected": -8694848.0, "logps/chosen": -304.8132019042969, "logps/rejected": -404.0609130859375, "loss": 0.1533, "rewards/chosen": 2.0164566040039062, "rewards/margins": 3.75277058283488, "rewards/rejected": -1.7363139788309734, "step": 9676 }, { "epoch": 0.5129197254392707, "grad_norm": 52.25, "kl": 1.412370204925537, "learning_rate": 5e-07, "logits/chosen": -19240048.0, "logits/rejected": -1659958.3333333333, "logps/chosen": -245.492333984375, "logps/rejected": -86.27259318033855, "loss": 0.3699, "rewards/chosen": 0.3884642839431763, "rewards/margins": 1.796184531847636, "rewards/rejected": -1.4077202479044597, "step": 9677 }, { "epoch": 0.5129727294410728, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5577944.5, "logits/rejected": -16503613.333333334, "logps/chosen": -241.5790557861328, "logps/rejected": -281.9758707682292, "loss": 0.1795, "rewards/chosen": 2.057687282562256, "rewards/margins": 4.372495810190836, "rewards/rejected": -2.3148085276285806, "step": 9678 }, { "epoch": 0.5130257334428749, "grad_norm": 76.5, "kl": 2.7657928466796875, "learning_rate": 5e-07, "logits/chosen": -18441420.0, "logits/rejected": -21406702.0, "logps/chosen": -525.3490600585938, "logps/rejected": -100.43213653564453, "loss": 0.2783, "rewards/chosen": 1.1710418462753296, "rewards/margins": 2.795728325843811, "rewards/rejected": -1.6246864795684814, "step": 9679 }, { "epoch": 0.513078737444677, "grad_norm": 38.5, "kl": 0.8434083461761475, "learning_rate": 5e-07, "logits/chosen": -2023147.5, "logits/rejected": -16398225.0, "logps/chosen": -297.7012023925781, "logps/rejected": -205.36111450195312, "loss": 0.2772, "rewards/chosen": 0.45923757553100586, "rewards/margins": 2.5748682022094727, "rewards/rejected": -2.115630626678467, "step": 9680 }, { "epoch": 0.5131317414464792, "grad_norm": 44.75, "kl": 3.557523727416992, "learning_rate": 5e-07, "logits/chosen": -3203328.75, "logits/rejected": -34966604.0, "logps/chosen": -174.41769409179688, "logps/rejected": -203.8350067138672, "loss": 0.2667, "rewards/chosen": 0.9574928283691406, "rewards/margins": 2.465235114097595, "rewards/rejected": -1.5077422857284546, "step": 9681 }, { "epoch": 0.5131847454482813, "grad_norm": 63.75, "kl": 1.0169200897216797, "learning_rate": 5e-07, "logits/chosen": -63319248.0, "logits/rejected": -31095540.0, "logps/chosen": -248.43888346354166, "logps/rejected": -468.62225341796875, "loss": 0.3947, "rewards/chosen": 0.19178442160288492, "rewards/margins": 4.255638639132182, "rewards/rejected": -4.063854217529297, "step": 9682 }, { "epoch": 0.5132377494500835, "grad_norm": 65.5, "kl": 1.7576446533203125, "learning_rate": 5e-07, "logits/chosen": -32430827.42857143, "logits/rejected": -133623360.0, "logps/chosen": -501.071044921875, "logps/rejected": -602.26806640625, "loss": 0.4606, "rewards/chosen": 0.2515830823353359, "rewards/margins": 1.996120435850961, "rewards/rejected": -1.744537353515625, "step": 9683 }, { "epoch": 0.5132907534518856, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33586968.0, "logits/rejected": -21991961.6, "logps/chosen": -316.0474039713542, "logps/rejected": -328.48330078125, "loss": 0.1776, "rewards/chosen": 1.081367015838623, "rewards/margins": 4.384021282196045, "rewards/rejected": -3.302654266357422, "step": 9684 }, { "epoch": 0.5133437574536878, "grad_norm": 59.0, "kl": 3.711832046508789, "learning_rate": 5e-07, "logits/chosen": -34118197.333333336, "logits/rejected": -103538472.0, "logps/chosen": -476.8394368489583, "logps/rejected": -367.82012939453125, "loss": 0.3097, "rewards/chosen": 1.0944469769795735, "rewards/margins": 3.747144778569539, "rewards/rejected": -2.652697801589966, "step": 9685 }, { "epoch": 0.5133967614554898, "grad_norm": 44.25, "kl": 0.15142536163330078, "learning_rate": 5e-07, "logits/chosen": -11455390.0, "logits/rejected": -36341752.0, "logps/chosen": -229.1177978515625, "logps/rejected": -389.58447265625, "loss": 0.2153, "rewards/chosen": 0.9804997444152832, "rewards/margins": 2.9555646181106567, "rewards/rejected": -1.9750648736953735, "step": 9686 }, { "epoch": 0.513449765457292, "grad_norm": 54.25, "kl": 3.615520477294922, "learning_rate": 5e-07, "logits/chosen": 11651346.666666666, "logits/rejected": -11781290.0, "logps/chosen": -240.1106160481771, "logps/rejected": -283.91937255859375, "loss": 0.411, "rewards/chosen": 0.4155052105585734, "rewards/margins": 3.3006120125452676, "rewards/rejected": -2.8851068019866943, "step": 9687 }, { "epoch": 0.5135027694590941, "grad_norm": 64.5, "kl": 3.406848907470703, "learning_rate": 5e-07, "logits/chosen": -11076797.333333334, "logits/rejected": -1235675.25, "logps/chosen": -387.9112548828125, "logps/rejected": -203.81851196289062, "loss": 0.464, "rewards/chosen": 0.15806426604588827, "rewards/margins": 2.061264177163442, "rewards/rejected": -1.9031999111175537, "step": 9688 }, { "epoch": 0.5135557734608963, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -840219.2, "logits/rejected": -79532138.66666667, "logps/chosen": -508.30234375, "logps/rejected": -358.7804361979167, "loss": 0.327, "rewards/chosen": 0.3016913890838623, "rewards/margins": 3.3392165660858155, "rewards/rejected": -3.037525177001953, "step": 9689 }, { "epoch": 0.5136087774626984, "grad_norm": 42.0, "kl": 0.3124523162841797, "learning_rate": 5e-07, "logits/chosen": -20600201.6, "logits/rejected": -23043410.666666668, "logps/chosen": -116.332177734375, "logps/rejected": -515.4525553385416, "loss": 0.3425, "rewards/chosen": 0.1008209228515625, "rewards/margins": 2.5004114151000976, "rewards/rejected": -2.399590492248535, "step": 9690 }, { "epoch": 0.5136617814645006, "grad_norm": 49.5, "kl": 0.24179458618164062, "learning_rate": 5e-07, "logits/chosen": -4461896.666666667, "logits/rejected": -5656221.6, "logps/chosen": -217.3429158528646, "logps/rejected": -154.9803955078125, "loss": 0.3504, "rewards/chosen": 0.09507242838541667, "rewards/margins": 1.1597103754679363, "rewards/rejected": -1.0646379470825196, "step": 9691 }, { "epoch": 0.5137147854663027, "grad_norm": 69.5, "kl": 2.7669830322265625, "learning_rate": 5e-07, "logits/chosen": -27527389.333333332, "logits/rejected": -60837092.0, "logps/chosen": -485.0712890625, "logps/rejected": -339.87713623046875, "loss": 0.3757, "rewards/chosen": 0.6498955885569254, "rewards/margins": 2.7822717825571694, "rewards/rejected": -2.132376194000244, "step": 9692 }, { "epoch": 0.5137677894681049, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13206606.666666666, "logits/rejected": -45174617.6, "logps/chosen": -286.8035074869792, "logps/rejected": -472.38310546875, "loss": 0.2009, "rewards/chosen": 0.8952411810557047, "rewards/margins": 3.1277427832285563, "rewards/rejected": -2.2325016021728517, "step": 9693 }, { "epoch": 0.513820793469907, "grad_norm": 45.75, "kl": 0.746363639831543, "learning_rate": 5e-07, "logits/chosen": -15006169.333333334, "logits/rejected": -21692360.0, "logps/chosen": -311.8055826822917, "logps/rejected": -160.6138427734375, "loss": 0.3129, "rewards/chosen": 0.5638290246327718, "rewards/margins": 2.0704559167226155, "rewards/rejected": -1.5066268920898438, "step": 9694 }, { "epoch": 0.5138737974717091, "grad_norm": 39.0, "kl": 3.5810108184814453, "learning_rate": 5e-07, "logits/chosen": -32865496.0, "logits/rejected": -8638052.8, "logps/chosen": -657.7819010416666, "logps/rejected": -228.754052734375, "loss": 0.1747, "rewards/chosen": 2.1858348846435547, "rewards/margins": 3.5734582901000977, "rewards/rejected": -1.387623405456543, "step": 9695 }, { "epoch": 0.5139268014735112, "grad_norm": 42.0, "kl": 0.9560155868530273, "learning_rate": 5e-07, "logits/chosen": -18968481.6, "logits/rejected": -44932064.0, "logps/chosen": -120.86217041015625, "logps/rejected": -209.38887532552084, "loss": 0.3397, "rewards/chosen": 0.4465781688690186, "rewards/margins": 1.8228017330169677, "rewards/rejected": -1.3762235641479492, "step": 9696 }, { "epoch": 0.5139798054753134, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6692808.0, "logits/rejected": -13668462.4, "logps/chosen": -56.20033264160156, "logps/rejected": -183.49478759765626, "loss": 0.2864, "rewards/chosen": 0.054229676723480225, "rewards/margins": 2.2479148268699647, "rewards/rejected": -2.1936851501464845, "step": 9697 }, { "epoch": 0.5140328094771155, "grad_norm": 38.75, "kl": 0.06836223602294922, "learning_rate": 5e-07, "logits/chosen": -18818297.333333332, "logits/rejected": -16453472.0, "logps/chosen": -125.35716756184895, "logps/rejected": -167.5826171875, "loss": 0.2739, "rewards/chosen": 0.20730960369110107, "rewards/margins": 2.0136534452438353, "rewards/rejected": -1.8063438415527344, "step": 9698 }, { "epoch": 0.5140858134789177, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45690186.666666664, "logits/rejected": -27640166.4, "logps/chosen": -319.3396809895833, "logps/rejected": -184.23702392578124, "loss": 0.3071, "rewards/chosen": 0.08124897877375285, "rewards/margins": 1.719697884718577, "rewards/rejected": -1.6384489059448242, "step": 9699 }, { "epoch": 0.5141388174807198, "grad_norm": 40.75, "kl": 2.6371231079101562, "learning_rate": 5e-07, "logits/chosen": -19831680.0, "logits/rejected": -14599448.0, "logps/chosen": -396.1461588541667, "logps/rejected": -644.1309814453125, "loss": 0.267, "rewards/chosen": 1.1728288332621257, "rewards/margins": 3.5578395525614424, "rewards/rejected": -2.3850107192993164, "step": 9700 }, { "epoch": 0.514191821482522, "grad_norm": 44.25, "kl": 0.9549179077148438, "learning_rate": 5e-07, "logits/chosen": -104867072.0, "logits/rejected": -37519606.4, "logps/chosen": -461.0104166666667, "logps/rejected": -441.3814453125, "loss": 0.173, "rewards/chosen": 1.0065236886342366, "rewards/margins": 3.835768111546834, "rewards/rejected": -2.8292444229125975, "step": 9701 }, { "epoch": 0.514244825484324, "grad_norm": 49.75, "kl": 5.827674865722656, "learning_rate": 5e-07, "logits/chosen": -13457188.8, "logits/rejected": -21167218.666666668, "logps/chosen": -748.828271484375, "logps/rejected": -309.5882161458333, "loss": 0.2101, "rewards/chosen": 1.9794937133789063, "rewards/margins": 4.287106450398763, "rewards/rejected": -2.307612737019857, "step": 9702 }, { "epoch": 0.5142978294861262, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31309410.666666668, "logits/rejected": -192931.4, "logps/chosen": -287.2740071614583, "logps/rejected": -228.475341796875, "loss": 0.2248, "rewards/chosen": 0.3340260982513428, "rewards/margins": 2.8612138271331786, "rewards/rejected": -2.527187728881836, "step": 9703 }, { "epoch": 0.5143508334879283, "grad_norm": 41.0, "kl": 1.225728988647461, "learning_rate": 5e-07, "logits/chosen": -32308067.2, "logits/rejected": -6504796.0, "logps/chosen": -136.7685546875, "logps/rejected": -99.84867350260417, "loss": 0.3346, "rewards/chosen": 0.6364196300506592, "rewards/margins": 2.0736521244049073, "rewards/rejected": -1.437232494354248, "step": 9704 }, { "epoch": 0.5144038374897305, "grad_norm": 65.5, "kl": 4.103740692138672, "learning_rate": 5e-07, "logits/chosen": -46206774.85714286, "logits/rejected": -74595888.0, "logps/chosen": -522.6964285714286, "logps/rejected": -1050.94921875, "loss": 0.3287, "rewards/chosen": 1.1171223776681083, "rewards/margins": 4.602437462125506, "rewards/rejected": -3.4853150844573975, "step": 9705 }, { "epoch": 0.5144568414915326, "grad_norm": 46.5, "kl": 0.9792327880859375, "learning_rate": 5e-07, "logits/chosen": -21528042.0, "logits/rejected": -22595420.0, "logps/chosen": -283.7643737792969, "logps/rejected": -244.84666442871094, "loss": 0.2866, "rewards/chosen": 0.4960242807865143, "rewards/margins": 2.509493499994278, "rewards/rejected": -2.0134692192077637, "step": 9706 }, { "epoch": 0.5145098454933348, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39672725.333333336, "logits/rejected": -6496175.2, "logps/chosen": -386.4876302083333, "logps/rejected": -356.7962158203125, "loss": 0.1673, "rewards/chosen": 0.5839970906575521, "rewards/margins": 3.8511548360188805, "rewards/rejected": -3.2671577453613283, "step": 9707 }, { "epoch": 0.5145628494951369, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37656356.0, "logits/rejected": -35172396.0, "logps/chosen": -273.394775390625, "logps/rejected": -212.70040893554688, "loss": 0.3035, "rewards/chosen": 0.06834542751312256, "rewards/margins": 2.2538760900497437, "rewards/rejected": -2.185530662536621, "step": 9708 }, { "epoch": 0.5146158534969391, "grad_norm": 51.5, "kl": 0.7341880798339844, "learning_rate": 5e-07, "logits/chosen": -49826568.0, "logits/rejected": -16490983.0, "logps/chosen": -405.2440185546875, "logps/rejected": -246.55445861816406, "loss": 0.2841, "rewards/chosen": 0.4785267114639282, "rewards/margins": 2.416577935218811, "rewards/rejected": -1.9380512237548828, "step": 9709 }, { "epoch": 0.5146688574987411, "grad_norm": 41.75, "kl": 1.0942716598510742, "learning_rate": 5e-07, "logits/chosen": -16674684.0, "logits/rejected": 3463753.25, "logps/chosen": -174.33120727539062, "logps/rejected": -122.65834045410156, "loss": 0.3162, "rewards/chosen": 0.458359956741333, "rewards/margins": 2.448966860771179, "rewards/rejected": -1.9906069040298462, "step": 9710 }, { "epoch": 0.5147218615005433, "grad_norm": 40.25, "kl": 4.08241081237793, "learning_rate": 5e-07, "logits/chosen": -15456478.666666666, "logits/rejected": -38653312.0, "logps/chosen": -878.09423828125, "logps/rejected": -372.865087890625, "loss": 0.2409, "rewards/chosen": 1.4953500429789226, "rewards/margins": 3.5379256884257, "rewards/rejected": -2.042575645446777, "step": 9711 }, { "epoch": 0.5147748655023454, "grad_norm": 30.25, "kl": 0.5311098098754883, "learning_rate": 5e-07, "logits/chosen": -1057076.0, "logits/rejected": -30532672.0, "logps/chosen": -93.5653076171875, "logps/rejected": -567.306884765625, "loss": 0.2074, "rewards/chosen": -0.37132692337036133, "rewards/margins": 2.640995820363363, "rewards/rejected": -3.012322743733724, "step": 9712 }, { "epoch": 0.5148278695041476, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46299306.666666664, "logits/rejected": -27477107.2, "logps/chosen": -302.4576416015625, "logps/rejected": -379.3839111328125, "loss": 0.2599, "rewards/chosen": 0.009214788675308228, "rewards/margins": 2.2975801527500153, "rewards/rejected": -2.288365364074707, "step": 9713 }, { "epoch": 0.5148808735059497, "grad_norm": 62.0, "kl": 1.4634876251220703, "learning_rate": 5e-07, "logits/chosen": -81649433.6, "logits/rejected": -59827706.666666664, "logps/chosen": -459.849658203125, "logps/rejected": -484.1744791666667, "loss": 0.333, "rewards/chosen": 0.19868199825286864, "rewards/margins": 3.8332479874293006, "rewards/rejected": -3.634565989176432, "step": 9714 }, { "epoch": 0.5149338775077519, "grad_norm": 45.25, "kl": 0.3569669723510742, "learning_rate": 5e-07, "logits/chosen": -24491316.8, "logits/rejected": 1618829.6666666667, "logps/chosen": -227.1754150390625, "logps/rejected": -137.4217529296875, "loss": 0.3104, "rewards/chosen": 0.7467382431030274, "rewards/margins": 2.368146228790283, "rewards/rejected": -1.6214079856872559, "step": 9715 }, { "epoch": 0.514986881509554, "grad_norm": 54.25, "kl": 2.6283111572265625, "learning_rate": 5e-07, "logits/chosen": -28626272.0, "logits/rejected": -20662470.0, "logps/chosen": -453.5450439453125, "logps/rejected": -369.6549072265625, "loss": 0.2248, "rewards/chosen": 0.9983627200126648, "rewards/margins": 3.9782552123069763, "rewards/rejected": -2.9798924922943115, "step": 9716 }, { "epoch": 0.5150398855113562, "grad_norm": 59.75, "kl": 0.28407955169677734, "learning_rate": 5e-07, "logits/chosen": -30955128.0, "logits/rejected": -15198712.0, "logps/chosen": -319.67580159505206, "logps/rejected": -276.60430908203125, "loss": 0.3942, "rewards/chosen": 0.2520590623219808, "rewards/margins": 1.4344542821248372, "rewards/rejected": -1.1823952198028564, "step": 9717 }, { "epoch": 0.5150928895131582, "grad_norm": 52.0, "kl": 0.7276592254638672, "learning_rate": 5e-07, "logits/chosen": 340812.5, "logits/rejected": -4999465.333333333, "logps/chosen": -338.0025390625, "logps/rejected": -234.0418701171875, "loss": 0.3313, "rewards/chosen": 0.5254782676696778, "rewards/margins": 1.887725575764974, "rewards/rejected": -1.3622473080952961, "step": 9718 }, { "epoch": 0.5151458935149604, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19844704.0, "logits/rejected": -33298706.666666668, "logps/chosen": -157.4744873046875, "logps/rejected": -306.7186279296875, "loss": 0.3136, "rewards/chosen": -0.3081108331680298, "rewards/margins": 1.1555284261703491, "rewards/rejected": -1.463639259338379, "step": 9719 }, { "epoch": 0.5151988975167625, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61729204.0, "logits/rejected": -43578368.0, "logps/chosen": -501.7860107421875, "logps/rejected": -291.53125, "loss": 0.2055, "rewards/chosen": 0.8292732238769531, "rewards/margins": 3.6347503662109375, "rewards/rejected": -2.8054771423339844, "step": 9720 }, { "epoch": 0.5152519015185647, "grad_norm": 54.25, "kl": 2.3488006591796875, "learning_rate": 5e-07, "logits/chosen": -34664464.0, "logits/rejected": -42473868.8, "logps/chosen": -240.16630045572916, "logps/rejected": -450.55146484375, "loss": 0.3462, "rewards/chosen": 0.09785257776578267, "rewards/margins": 1.9270292898019155, "rewards/rejected": -1.8291767120361329, "step": 9721 }, { "epoch": 0.5153049055203668, "grad_norm": 28.0, "kl": 0.6771106719970703, "learning_rate": 5e-07, "logits/chosen": -11272632.0, "logits/rejected": -24948635.2, "logps/chosen": -137.66549682617188, "logps/rejected": -487.99501953125, "loss": 0.1514, "rewards/chosen": 1.1069459915161133, "rewards/margins": 3.867610549926758, "rewards/rejected": -2.7606645584106446, "step": 9722 }, { "epoch": 0.515357909522169, "grad_norm": 62.25, "kl": 2.7495012283325195, "learning_rate": 5e-07, "logits/chosen": -51563308.0, "logits/rejected": -28082492.0, "logps/chosen": -264.98541259765625, "logps/rejected": -468.5535888671875, "loss": 0.3245, "rewards/chosen": 0.7107964158058167, "rewards/margins": 2.2110071778297424, "rewards/rejected": -1.5002107620239258, "step": 9723 }, { "epoch": 0.5154109135239711, "grad_norm": 51.25, "kl": 0.7599763870239258, "learning_rate": 5e-07, "logits/chosen": -20135240.0, "logits/rejected": -15391904.0, "logps/chosen": -272.4805094401042, "logps/rejected": -466.08721923828125, "loss": 0.3478, "rewards/chosen": 0.4523154099782308, "rewards/margins": 2.7006688912709556, "rewards/rejected": -2.2483534812927246, "step": 9724 }, { "epoch": 0.5154639175257731, "grad_norm": 40.5, "kl": 2.9596643447875977, "learning_rate": 5e-07, "logits/chosen": -11511588.0, "logits/rejected": -23534644.0, "logps/chosen": -398.5256042480469, "logps/rejected": -220.5296630859375, "loss": 0.2765, "rewards/chosen": 1.3595153093338013, "rewards/margins": 2.9567950963974, "rewards/rejected": -1.5972797870635986, "step": 9725 }, { "epoch": 0.5155169215275753, "grad_norm": 53.0, "kl": 1.7865982055664062, "learning_rate": 5e-07, "logits/chosen": -10254476.666666666, "logits/rejected": 2185781.0, "logps/chosen": -283.51953125, "logps/rejected": -124.03854370117188, "loss": 0.4059, "rewards/chosen": 0.39731693267822266, "rewards/margins": 1.607103943824768, "rewards/rejected": -1.2097870111465454, "step": 9726 }, { "epoch": 0.5155699255293774, "grad_norm": 49.0, "kl": 2.7864131927490234, "learning_rate": 5e-07, "logits/chosen": -46595688.0, "logits/rejected": -38056520.0, "logps/chosen": -275.4706726074219, "logps/rejected": -261.78790283203125, "loss": 0.3331, "rewards/chosen": -0.09261971712112427, "rewards/margins": 2.1434590220451355, "rewards/rejected": -2.2360787391662598, "step": 9727 }, { "epoch": 0.5156229295311796, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25163676.0, "logits/rejected": -18020236.0, "logps/chosen": -197.62806701660156, "logps/rejected": -291.2204996744792, "loss": 0.2125, "rewards/chosen": 0.6997267603874207, "rewards/margins": 2.4017070333162946, "rewards/rejected": -1.7019802729288738, "step": 9728 }, { "epoch": 0.5156759335329817, "grad_norm": 49.75, "kl": 1.896402359008789, "learning_rate": 5e-07, "logits/chosen": -99156520.0, "logits/rejected": -11239230.0, "logps/chosen": -340.3168029785156, "logps/rejected": -160.76324462890625, "loss": 0.282, "rewards/chosen": 0.4462085962295532, "rewards/margins": 2.133642315864563, "rewards/rejected": -1.6874337196350098, "step": 9729 }, { "epoch": 0.5157289375347839, "grad_norm": 42.5, "kl": 1.3764877319335938, "learning_rate": 5e-07, "logits/chosen": -1147809.5, "logits/rejected": -13885494.0, "logps/chosen": -94.55175018310547, "logps/rejected": -395.69976806640625, "loss": 0.2362, "rewards/chosen": 1.0433382987976074, "rewards/margins": 4.369735956192017, "rewards/rejected": -3.326397657394409, "step": 9730 }, { "epoch": 0.515781941536586, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19191737.333333332, "logits/rejected": -7177347.2, "logps/chosen": -405.600341796875, "logps/rejected": -553.5443359375, "loss": 0.1367, "rewards/chosen": 1.326317310333252, "rewards/margins": 4.208775234222412, "rewards/rejected": -2.8824579238891603, "step": 9731 }, { "epoch": 0.5158349455383882, "grad_norm": 43.0, "kl": 0.8985633850097656, "learning_rate": 5e-07, "logits/chosen": -60478996.0, "logits/rejected": -37255490.666666664, "logps/chosen": -341.58050537109375, "logps/rejected": -235.05879720052084, "loss": 0.2221, "rewards/chosen": 0.2735916078090668, "rewards/margins": 2.5706819792588553, "rewards/rejected": -2.2970903714497886, "step": 9732 }, { "epoch": 0.5158879495401902, "grad_norm": 45.75, "kl": 1.8083367347717285, "learning_rate": 5e-07, "logits/chosen": -10867144.0, "logits/rejected": -5045508.5, "logps/chosen": -200.35965983072916, "logps/rejected": -120.24237060546875, "loss": 0.3209, "rewards/chosen": 0.9046218395233154, "rewards/margins": 2.995187282562256, "rewards/rejected": -2.0905654430389404, "step": 9733 }, { "epoch": 0.5159409535419924, "grad_norm": 50.5, "kl": 2.27423095703125, "learning_rate": 5e-07, "logits/chosen": -34806552.0, "logits/rejected": -50512272.0, "logps/chosen": -357.0923767089844, "logps/rejected": -284.3668518066406, "loss": 0.2521, "rewards/chosen": 0.9995611310005188, "rewards/margins": 3.171798288822174, "rewards/rejected": -2.1722371578216553, "step": 9734 }, { "epoch": 0.5159939575437945, "grad_norm": 48.0, "kl": 0.36098670959472656, "learning_rate": 5e-07, "logits/chosen": -27012006.0, "logits/rejected": -10738994.666666666, "logps/chosen": -539.86865234375, "logps/rejected": -354.705322265625, "loss": 0.2511, "rewards/chosen": 0.5090183019638062, "rewards/margins": 2.416752537091573, "rewards/rejected": -1.9077342351277669, "step": 9735 }, { "epoch": 0.5160469615455967, "grad_norm": 45.5, "kl": 0.1490497589111328, "learning_rate": 5e-07, "logits/chosen": -9105711.0, "logits/rejected": 1993619.6666666667, "logps/chosen": -174.91671752929688, "logps/rejected": -378.2770182291667, "loss": 0.1797, "rewards/chosen": 0.8526687622070312, "rewards/margins": 2.9581759770711265, "rewards/rejected": -2.105507214864095, "step": 9736 }, { "epoch": 0.5160999655473988, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5985616.666666667, "logits/rejected": -44781000.0, "logps/chosen": -247.8213907877604, "logps/rejected": -492.4936218261719, "loss": 0.326, "rewards/chosen": 0.44980772336324054, "rewards/margins": 2.9484268029530845, "rewards/rejected": -2.4986190795898438, "step": 9737 }, { "epoch": 0.516152969549201, "grad_norm": 38.0, "kl": 2.029998779296875, "learning_rate": 5e-07, "logits/chosen": -29124634.666666668, "logits/rejected": -11642052.8, "logps/chosen": -391.7638346354167, "logps/rejected": -327.0282470703125, "loss": 0.2533, "rewards/chosen": 1.0759165287017822, "rewards/margins": 3.3814749240875246, "rewards/rejected": -2.3055583953857424, "step": 9738 }, { "epoch": 0.5162059735510031, "grad_norm": 48.25, "kl": 1.3152427673339844, "learning_rate": 5e-07, "logits/chosen": -39740185.6, "logits/rejected": -51967488.0, "logps/chosen": -230.7478271484375, "logps/rejected": -434.6333821614583, "loss": 0.3277, "rewards/chosen": 0.6292881965637207, "rewards/margins": 2.4739964803059893, "rewards/rejected": -1.8447082837422688, "step": 9739 }, { "epoch": 0.5162589775528053, "grad_norm": 39.5, "kl": 0.3619041442871094, "learning_rate": 5e-07, "logits/chosen": 11318912.0, "logits/rejected": -37564540.0, "logps/chosen": -250.3267822265625, "logps/rejected": -435.9885559082031, "loss": 0.2467, "rewards/chosen": 0.35541820526123047, "rewards/margins": 3.5125956535339355, "rewards/rejected": -3.157177448272705, "step": 9740 }, { "epoch": 0.5163119815546073, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56315344.0, "logits/rejected": -31181554.0, "logps/chosen": -754.1117553710938, "logps/rejected": -555.17919921875, "loss": 0.2242, "rewards/chosen": 0.9519474506378174, "rewards/margins": 3.554353952407837, "rewards/rejected": -2.6024065017700195, "step": 9741 }, { "epoch": 0.5163649855564095, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11826936.0, "logits/rejected": -1171729.3333333333, "logps/chosen": -492.24365234375, "logps/rejected": -371.121826171875, "loss": 0.2329, "rewards/chosen": 1.5117712020874023, "rewards/margins": 2.930047671000163, "rewards/rejected": -1.4182764689127605, "step": 9742 }, { "epoch": 0.5164179895582116, "grad_norm": 46.5, "kl": 0.3142967224121094, "learning_rate": 5e-07, "logits/chosen": -10088762.0, "logits/rejected": -9913905.6, "logps/chosen": -198.4293416341146, "logps/rejected": -207.942529296875, "loss": 0.3376, "rewards/chosen": -0.38229413827260333, "rewards/margins": 1.3740155617396037, "rewards/rejected": -1.756309700012207, "step": 9743 }, { "epoch": 0.5164709935600138, "grad_norm": 62.5, "kl": 0.14846420288085938, "learning_rate": 5e-07, "logits/chosen": -9536640.0, "logits/rejected": -17807442.0, "logps/chosen": -294.08074951171875, "logps/rejected": -246.45162963867188, "loss": 0.2452, "rewards/chosen": 0.8354794979095459, "rewards/margins": 3.046391725540161, "rewards/rejected": -2.2109122276306152, "step": 9744 }, { "epoch": 0.5165239975618159, "grad_norm": 59.25, "kl": 1.8078069686889648, "learning_rate": 5e-07, "logits/chosen": -37787484.0, "logits/rejected": 281484.65625, "logps/chosen": -364.50250244140625, "logps/rejected": -252.98475646972656, "loss": 0.3676, "rewards/chosen": 0.271488755941391, "rewards/margins": 1.6246162354946136, "rewards/rejected": -1.3531274795532227, "step": 9745 }, { "epoch": 0.5165770015636181, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46297043.2, "logits/rejected": -45650581.333333336, "logps/chosen": -363.395849609375, "logps/rejected": -440.3689371744792, "loss": 0.3065, "rewards/chosen": 0.24560303688049318, "rewards/margins": 3.5667182127634685, "rewards/rejected": -3.321115175882975, "step": 9746 }, { "epoch": 0.5166300055654202, "grad_norm": 35.5, "kl": 0.23726844787597656, "learning_rate": 5e-07, "logits/chosen": -43728240.0, "logits/rejected": -1311169.0833333333, "logps/chosen": -245.4283447265625, "logps/rejected": -178.3720703125, "loss": 0.2379, "rewards/chosen": 0.8946553468704224, "rewards/margins": 2.3895522356033325, "rewards/rejected": -1.4948968887329102, "step": 9747 }, { "epoch": 0.5166830095672224, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5452318.0, "logits/rejected": -4495684.0, "logps/chosen": -234.8126220703125, "logps/rejected": -283.89939371744794, "loss": 0.2843, "rewards/chosen": -0.3727424740791321, "rewards/margins": 1.6052883664766948, "rewards/rejected": -1.978030840555827, "step": 9748 }, { "epoch": 0.5167360135690244, "grad_norm": 59.5, "kl": 1.8664779663085938, "learning_rate": 5e-07, "logits/chosen": -50520181.333333336, "logits/rejected": -27943616.0, "logps/chosen": -606.2314453125, "logps/rejected": -313.4078674316406, "loss": 0.3218, "rewards/chosen": 0.9979864756266276, "rewards/margins": 3.226660410563151, "rewards/rejected": -2.2286739349365234, "step": 9749 }, { "epoch": 0.5167890175708266, "grad_norm": 48.5, "kl": 1.181161880493164, "learning_rate": 5e-07, "logits/chosen": 1474832.4, "logits/rejected": -72576949.33333333, "logps/chosen": -86.203759765625, "logps/rejected": -217.98006184895834, "loss": 0.3332, "rewards/chosen": 0.6816956520080566, "rewards/margins": 2.096562417348226, "rewards/rejected": -1.4148667653401692, "step": 9750 }, { "epoch": 0.5168420215726287, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23901776.0, "logits/rejected": -10720013.6, "logps/chosen": -361.8721923828125, "logps/rejected": -213.478759765625, "loss": 0.3023, "rewards/chosen": 0.2160222331682841, "rewards/margins": 1.9784608165423077, "rewards/rejected": -1.7624385833740235, "step": 9751 }, { "epoch": 0.5168950255744309, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18221640.0, "logits/rejected": -10290105.142857144, "logps/chosen": -275.26171875, "logps/rejected": -420.4462193080357, "loss": 0.2211, "rewards/chosen": -0.562268078327179, "rewards/margins": 1.530392825603485, "rewards/rejected": -2.092660903930664, "step": 9752 }, { "epoch": 0.516948029576233, "grad_norm": 55.25, "kl": 0.17071151733398438, "learning_rate": 5e-07, "logits/chosen": -62587536.0, "logits/rejected": 12623116.0, "logps/chosen": -407.24786376953125, "logps/rejected": -131.0952911376953, "loss": 0.2659, "rewards/chosen": 0.5780731439590454, "rewards/margins": 2.5990344285964966, "rewards/rejected": -2.020961284637451, "step": 9753 }, { "epoch": 0.5170010335780352, "grad_norm": 57.5, "kl": 1.7282028198242188, "learning_rate": 5e-07, "logits/chosen": -33545666.666666668, "logits/rejected": -6336095.6, "logps/chosen": -428.9388834635417, "logps/rejected": -179.5688720703125, "loss": 0.2941, "rewards/chosen": 0.8896583716074625, "rewards/margins": 2.293868080774943, "rewards/rejected": -1.4042097091674806, "step": 9754 }, { "epoch": 0.5170540375798373, "grad_norm": 49.0, "kl": 1.522674560546875, "learning_rate": 5e-07, "logits/chosen": -36083740.0, "logits/rejected": -17565366.0, "logps/chosen": -344.964599609375, "logps/rejected": -298.5528259277344, "loss": 0.2678, "rewards/chosen": 0.7798930406570435, "rewards/margins": 2.9682129621505737, "rewards/rejected": -2.1883199214935303, "step": 9755 }, { "epoch": 0.5171070415816394, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28036356.0, "logits/rejected": -9419022.0, "logps/chosen": -442.48394775390625, "logps/rejected": -274.61474609375, "loss": 0.2572, "rewards/chosen": 0.3351810574531555, "rewards/margins": 2.8024941086769104, "rewards/rejected": -2.467313051223755, "step": 9756 }, { "epoch": 0.5171600455834415, "grad_norm": 42.25, "kl": 1.3150177001953125, "learning_rate": 5e-07, "logits/chosen": -21792236.0, "logits/rejected": -54689340.0, "logps/chosen": -310.30230712890625, "logps/rejected": -345.1730651855469, "loss": 0.265, "rewards/chosen": 1.000096082687378, "rewards/margins": 3.0848264694213867, "rewards/rejected": -2.084730386734009, "step": 9757 }, { "epoch": 0.5172130495852437, "grad_norm": 30.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88939056.0, "logits/rejected": -27617200.0, "logps/chosen": -183.89662170410156, "logps/rejected": -453.0748697916667, "loss": 0.2262, "rewards/chosen": -0.3868545591831207, "rewards/margins": 2.668360004822413, "rewards/rejected": -3.0552145640055337, "step": 9758 }, { "epoch": 0.5172660535870458, "grad_norm": 45.25, "kl": 0.05054473876953125, "learning_rate": 5e-07, "logits/chosen": -22899194.666666668, "logits/rejected": -28669203.2, "logps/chosen": -223.80729166666666, "logps/rejected": -357.001220703125, "loss": 0.2689, "rewards/chosen": 0.5741904576619467, "rewards/margins": 2.3486592610677084, "rewards/rejected": -1.7744688034057616, "step": 9759 }, { "epoch": 0.517319057588848, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15009610.666666666, "logits/rejected": -2932147.6, "logps/chosen": -218.47273763020834, "logps/rejected": -166.4689453125, "loss": 0.3122, "rewards/chosen": 0.09130022923151652, "rewards/margins": 1.8591299335161846, "rewards/rejected": -1.767829704284668, "step": 9760 }, { "epoch": 0.5173720615906501, "grad_norm": 51.25, "kl": 0.009083747863769531, "learning_rate": 5e-07, "logits/chosen": -5404206.8, "logits/rejected": -15499848.0, "logps/chosen": -249.1703369140625, "logps/rejected": -241.44270833333334, "loss": 0.4066, "rewards/chosen": -0.036847084760665894, "rewards/margins": 1.4164826373259227, "rewards/rejected": -1.4533297220865886, "step": 9761 }, { "epoch": 0.5174250655924523, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63061344.0, "logits/rejected": 2676044.0, "logps/chosen": -359.3446451822917, "logps/rejected": -103.24505615234375, "loss": 0.349, "rewards/chosen": 0.49721328417460126, "rewards/margins": 1.9417452017466228, "rewards/rejected": -1.4445319175720215, "step": 9762 }, { "epoch": 0.5174780695942544, "grad_norm": 42.5, "kl": 1.9403190612792969, "learning_rate": 5e-07, "logits/chosen": 39676115.2, "logits/rejected": -6872649.333333333, "logps/chosen": -219.50732421875, "logps/rejected": -226.5572306315104, "loss": 0.3241, "rewards/chosen": 0.47007250785827637, "rewards/margins": 2.579629818598429, "rewards/rejected": -2.109557310740153, "step": 9763 }, { "epoch": 0.5175310735960565, "grad_norm": 45.25, "kl": 0.6184501647949219, "learning_rate": 5e-07, "logits/chosen": -18543756.8, "logits/rejected": -7820456.0, "logps/chosen": -181.3256591796875, "logps/rejected": -155.90669759114584, "loss": 0.2933, "rewards/chosen": 0.5194300651550293, "rewards/margins": 2.7828024864196776, "rewards/rejected": -2.2633724212646484, "step": 9764 }, { "epoch": 0.5175840775978586, "grad_norm": 52.0, "kl": 5.500090599060059, "learning_rate": 5e-07, "logits/chosen": -19479442.285714287, "logits/rejected": -751038.375, "logps/chosen": -220.47019740513392, "logps/rejected": -311.73309326171875, "loss": 0.4293, "rewards/chosen": 0.735966546194894, "rewards/margins": 3.2338639327457974, "rewards/rejected": -2.4978973865509033, "step": 9765 }, { "epoch": 0.5176370815996608, "grad_norm": 64.0, "kl": 3.2954788208007812, "learning_rate": 5e-07, "logits/chosen": -48624416.0, "logits/rejected": -49788832.0, "logps/chosen": -608.4697265625, "logps/rejected": -349.9164733886719, "loss": 0.2371, "rewards/chosen": 1.3690781593322754, "rewards/margins": 3.3440966606140137, "rewards/rejected": -1.9750185012817383, "step": 9766 }, { "epoch": 0.5176900856014629, "grad_norm": 41.25, "kl": 1.5027484893798828, "learning_rate": 5e-07, "logits/chosen": 956043.125, "logits/rejected": -43985800.0, "logps/chosen": -258.9366760253906, "logps/rejected": -494.6951904296875, "loss": 0.235, "rewards/chosen": 0.7332812547683716, "rewards/margins": 4.074861407279968, "rewards/rejected": -3.3415801525115967, "step": 9767 }, { "epoch": 0.5177430896032651, "grad_norm": 45.5, "kl": 1.2526912689208984, "learning_rate": 5e-07, "logits/chosen": -41719120.0, "logits/rejected": -1722570.6666666667, "logps/chosen": -237.31251525878906, "logps/rejected": -309.2006429036458, "loss": 0.2068, "rewards/chosen": 0.5940713882446289, "rewards/margins": 2.602229118347168, "rewards/rejected": -2.008157730102539, "step": 9768 }, { "epoch": 0.5177960936050672, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52060.734375, "logits/rejected": -17287744.0, "logps/chosen": -256.7181701660156, "logps/rejected": -363.245361328125, "loss": 0.2454, "rewards/chosen": 0.49163004755973816, "rewards/margins": 3.0102231204509735, "rewards/rejected": -2.5185930728912354, "step": 9769 }, { "epoch": 0.5178490976068694, "grad_norm": 40.0, "kl": 4.826539993286133, "learning_rate": 5e-07, "logits/chosen": -30262320.0, "logits/rejected": -36487909.333333336, "logps/chosen": -249.2167236328125, "logps/rejected": -625.8957112630209, "loss": 0.3481, "rewards/chosen": 0.7397926807403564, "rewards/margins": 3.086726681391398, "rewards/rejected": -2.3469340006510415, "step": 9770 }, { "epoch": 0.5179021016086715, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28613696.0, "logits/rejected": -9197398.0, "logps/chosen": -384.03680419921875, "logps/rejected": -243.32179260253906, "loss": 0.2704, "rewards/chosen": 0.5980119705200195, "rewards/margins": 2.1854774951934814, "rewards/rejected": -1.587465524673462, "step": 9771 }, { "epoch": 0.5179551056104736, "grad_norm": 52.5, "kl": 3.413707733154297, "learning_rate": 5e-07, "logits/chosen": -55732952.0, "logits/rejected": -1716168.25, "logps/chosen": -320.7249450683594, "logps/rejected": -332.35498046875, "loss": 0.2809, "rewards/chosen": 0.9049277901649475, "rewards/margins": 3.072801172733307, "rewards/rejected": -2.1678733825683594, "step": 9772 }, { "epoch": 0.5180081096122757, "grad_norm": 37.25, "kl": 1.6763744354248047, "learning_rate": 5e-07, "logits/chosen": 10848158.666666666, "logits/rejected": -14207294.4, "logps/chosen": -258.33839925130206, "logps/rejected": -253.7749755859375, "loss": 0.1684, "rewards/chosen": 1.2618872324625652, "rewards/margins": 3.6524037043253585, "rewards/rejected": -2.390516471862793, "step": 9773 }, { "epoch": 0.5180611136140779, "grad_norm": 38.75, "kl": 0.1008615493774414, "learning_rate": 5e-07, "logits/chosen": 8641877.333333334, "logits/rejected": -15937512.0, "logps/chosen": -101.56888834635417, "logps/rejected": -459.818310546875, "loss": 0.2959, "rewards/chosen": 0.11519489685694377, "rewards/margins": 2.6111318627993265, "rewards/rejected": -2.495936965942383, "step": 9774 }, { "epoch": 0.51811411761588, "grad_norm": 52.0, "kl": 3.117910385131836, "learning_rate": 5e-07, "logits/chosen": -24850990.0, "logits/rejected": -1023480.0, "logps/chosen": -167.6012420654297, "logps/rejected": -166.5720672607422, "loss": 0.3035, "rewards/chosen": 0.8156330585479736, "rewards/margins": 2.9704020023345947, "rewards/rejected": -2.154768943786621, "step": 9775 }, { "epoch": 0.5181671216176821, "grad_norm": 53.5, "kl": 0.4250450134277344, "learning_rate": 5e-07, "logits/chosen": -30292680.0, "logits/rejected": -18503298.0, "logps/chosen": -346.8052978515625, "logps/rejected": -178.29116821289062, "loss": 0.2792, "rewards/chosen": 0.6477527618408203, "rewards/margins": 2.137784242630005, "rewards/rejected": -1.4900314807891846, "step": 9776 }, { "epoch": 0.5182201256194843, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4076429.5, "logits/rejected": -16188409.142857144, "logps/chosen": -184.57369995117188, "logps/rejected": -287.73343331473217, "loss": 0.1887, "rewards/chosen": 1.2200393676757812, "rewards/margins": 2.9570459638323103, "rewards/rejected": -1.737006596156529, "step": 9777 }, { "epoch": 0.5182731296212864, "grad_norm": 58.25, "kl": 1.5342693328857422, "learning_rate": 5e-07, "logits/chosen": -41732256.0, "logits/rejected": -36113312.0, "logps/chosen": -542.1279907226562, "logps/rejected": -230.3790283203125, "loss": 0.22, "rewards/chosen": 1.4390381574630737, "rewards/margins": 2.919890284538269, "rewards/rejected": -1.4808521270751953, "step": 9778 }, { "epoch": 0.5183261336230885, "grad_norm": 46.75, "kl": 0.9616832733154297, "learning_rate": 5e-07, "logits/chosen": -12774048.0, "logits/rejected": -68155962.66666667, "logps/chosen": -135.7343017578125, "logps/rejected": -359.6239013671875, "loss": 0.3951, "rewards/chosen": 0.0816308617591858, "rewards/margins": 1.6447056094805401, "rewards/rejected": -1.5630747477213542, "step": 9779 }, { "epoch": 0.5183791376248906, "grad_norm": 33.75, "kl": 1.9563922882080078, "learning_rate": 5e-07, "logits/chosen": -10982325.0, "logits/rejected": -28870380.0, "logps/chosen": -210.50906372070312, "logps/rejected": -578.472900390625, "loss": 0.2416, "rewards/chosen": 0.6705937385559082, "rewards/margins": 3.5101027488708496, "rewards/rejected": -2.8395090103149414, "step": 9780 }, { "epoch": 0.5184321416266928, "grad_norm": 41.5, "kl": 0.8607883453369141, "learning_rate": 5e-07, "logits/chosen": -5888073.0, "logits/rejected": -24328610.666666668, "logps/chosen": -160.28114318847656, "logps/rejected": -247.9638875325521, "loss": 0.2368, "rewards/chosen": 0.7826946377754211, "rewards/margins": 2.297249774138133, "rewards/rejected": -1.5145551363627117, "step": 9781 }, { "epoch": 0.5184851456284949, "grad_norm": 73.5, "kl": 3.251689910888672, "learning_rate": 5e-07, "logits/chosen": -40568520.0, "logps/chosen": -270.75897216796875, "loss": 0.4521, "rewards/chosen": 0.5197299718856812, "step": 9782 }, { "epoch": 0.5185381496302971, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22971520.0, "logits/rejected": -22903372.0, "logps/chosen": -316.8415222167969, "logps/rejected": -157.99542236328125, "loss": 0.2693, "rewards/chosen": 0.44082924723625183, "rewards/margins": 2.4029140174388885, "rewards/rejected": -1.9620847702026367, "step": 9783 }, { "epoch": 0.5185911536320992, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8727562.0, "logits/rejected": -14705267.0, "logps/chosen": -367.12030029296875, "logps/rejected": -377.53375244140625, "loss": 0.3025, "rewards/chosen": 0.05764961242675781, "rewards/margins": 3.148953914642334, "rewards/rejected": -3.091304302215576, "step": 9784 }, { "epoch": 0.5186441576339014, "grad_norm": 45.0, "kl": 0.644439697265625, "learning_rate": 5e-07, "logits/chosen": -28238122.666666668, "logits/rejected": -21179980.8, "logps/chosen": -276.3109944661458, "logps/rejected": -237.6512939453125, "loss": 0.2225, "rewards/chosen": 0.3801643451054891, "rewards/margins": 2.748881538709005, "rewards/rejected": -2.3687171936035156, "step": 9785 }, { "epoch": 0.5186971616357035, "grad_norm": 55.5, "kl": 1.752685546875, "learning_rate": 5e-07, "logits/chosen": -37102348.8, "logits/rejected": -16630537.333333334, "logps/chosen": -321.6968994140625, "logps/rejected": -228.10445149739584, "loss": 0.3386, "rewards/chosen": 0.6540654182434082, "rewards/margins": 1.966458288828532, "rewards/rejected": -1.3123928705851238, "step": 9786 }, { "epoch": 0.5187501656375056, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61181728.0, "logits/rejected": -23694464.0, "logps/chosen": -266.82135009765625, "logps/rejected": -521.2448323567709, "loss": 0.2514, "rewards/chosen": 0.09580765664577484, "rewards/margins": 2.019363984465599, "rewards/rejected": -1.9235563278198242, "step": 9787 }, { "epoch": 0.5188031696393077, "grad_norm": 41.75, "kl": 0.024377822875976562, "learning_rate": 5e-07, "logits/chosen": 10908573.0, "logits/rejected": -11263437.714285715, "logps/chosen": -49.53240203857422, "logps/rejected": -352.33939034598217, "loss": 0.1576, "rewards/chosen": 0.484109491109848, "rewards/margins": 3.014242329767772, "rewards/rejected": -2.530132838657924, "step": 9788 }, { "epoch": 0.5188561736411099, "grad_norm": 51.5, "kl": 1.1375999450683594, "learning_rate": 5e-07, "logits/chosen": -21380030.0, "logits/rejected": -10523029.0, "logps/chosen": -386.7982177734375, "logps/rejected": -451.96331787109375, "loss": 0.3226, "rewards/chosen": 0.4280715882778168, "rewards/margins": 2.532728523015976, "rewards/rejected": -2.104656934738159, "step": 9789 }, { "epoch": 0.518909177642912, "grad_norm": 58.75, "kl": 2.9478511810302734, "learning_rate": 5e-07, "logits/chosen": -27788528.0, "logits/rejected": -21364718.0, "logps/chosen": -261.88478597005206, "logps/rejected": -402.2706604003906, "loss": 0.42, "rewards/chosen": 0.23552505175272623, "rewards/margins": 3.322610060373942, "rewards/rejected": -3.087085008621216, "step": 9790 }, { "epoch": 0.5189621816447142, "grad_norm": 41.25, "kl": 0.20436859130859375, "learning_rate": 5e-07, "logits/chosen": -18182114.0, "logits/rejected": -92529816.0, "logps/chosen": -169.3558349609375, "logps/rejected": -437.78570556640625, "loss": 0.2659, "rewards/chosen": 0.2001289427280426, "rewards/margins": 2.8307301104068756, "rewards/rejected": -2.630601167678833, "step": 9791 }, { "epoch": 0.5190151856465163, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13399186.0, "logits/rejected": -16208706.666666666, "logps/chosen": -174.40536499023438, "logps/rejected": -154.47329711914062, "loss": 0.2575, "rewards/chosen": 0.4324398338794708, "rewards/margins": 2.2789504031340284, "rewards/rejected": -1.8465105692545574, "step": 9792 }, { "epoch": 0.5190681896483185, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26306410.666666668, "logits/rejected": -8079846.4, "logps/chosen": -236.2553507486979, "logps/rejected": -267.6708251953125, "loss": 0.1838, "rewards/chosen": 0.6611874500910441, "rewards/margins": 3.3041945377985633, "rewards/rejected": -2.6430070877075194, "step": 9793 }, { "epoch": 0.5191211936501205, "grad_norm": 47.75, "kl": 3.1722450256347656, "learning_rate": 5e-07, "logits/chosen": -12271614.0, "logits/rejected": -16887880.0, "logps/chosen": -440.2717590332031, "logps/rejected": -272.3535461425781, "loss": 0.2527, "rewards/chosen": 1.0273418426513672, "rewards/margins": 4.125136137008667, "rewards/rejected": -3.0977942943573, "step": 9794 }, { "epoch": 0.5191741976519227, "grad_norm": 67.5, "kl": 0.19440841674804688, "learning_rate": 5e-07, "logits/chosen": -58783600.0, "logits/rejected": 53438756.0, "logps/chosen": -366.20159912109375, "logps/rejected": -171.83383178710938, "loss": 0.3446, "rewards/chosen": 0.2987770140171051, "rewards/margins": 1.7813715040683746, "rewards/rejected": -1.4825944900512695, "step": 9795 }, { "epoch": 0.5192272016537248, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10014932.0, "logits/rejected": -126970794.66666667, "logps/chosen": -957.80302734375, "logps/rejected": -412.111328125, "loss": 0.2663, "rewards/chosen": 0.7760351657867431, "rewards/margins": 3.787129259109497, "rewards/rejected": -3.011094093322754, "step": 9796 }, { "epoch": 0.519280205655527, "grad_norm": 36.5, "kl": 0.036108970642089844, "learning_rate": 5e-07, "logits/chosen": -57792348.0, "logits/rejected": -27879301.333333332, "logps/chosen": -186.1859893798828, "logps/rejected": -576.7022298177084, "loss": 0.1782, "rewards/chosen": -0.13946056365966797, "rewards/margins": 2.8453925450642905, "rewards/rejected": -2.9848531087239585, "step": 9797 }, { "epoch": 0.5193332096573291, "grad_norm": 58.25, "kl": 0.36823177337646484, "learning_rate": 5e-07, "logits/chosen": 494705.0, "logits/rejected": 49189892.0, "logps/chosen": -241.6815185546875, "logps/rejected": -221.23135375976562, "loss": 0.3108, "rewards/chosen": 0.4839287996292114, "rewards/margins": 1.9540461301803589, "rewards/rejected": -1.4701173305511475, "step": 9798 }, { "epoch": 0.5193862136591313, "grad_norm": 32.75, "kl": 0.19289779663085938, "learning_rate": 5e-07, "logits/chosen": -26519092.0, "logits/rejected": -18926182.0, "logps/chosen": -182.781494140625, "logps/rejected": -341.7220764160156, "loss": 0.2653, "rewards/chosen": 0.2682649493217468, "rewards/margins": 2.9059972167015076, "rewards/rejected": -2.6377322673797607, "step": 9799 }, { "epoch": 0.5194392176609334, "grad_norm": 47.0, "kl": 2.0286521911621094, "learning_rate": 5e-07, "logits/chosen": -23070080.0, "logits/rejected": 290460864.0, "logps/chosen": -349.8717041015625, "logps/rejected": -327.3768615722656, "loss": 0.3726, "rewards/chosen": 0.2853878140449524, "rewards/margins": 1.7841117978096008, "rewards/rejected": -1.4987239837646484, "step": 9800 }, { "epoch": 0.5194922216627356, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20818478.0, "logits/rejected": -53282320.0, "logps/chosen": -338.8095703125, "logps/rejected": -120.58177185058594, "loss": 0.3066, "rewards/chosen": 0.5690929889678955, "rewards/margins": 1.9418678283691406, "rewards/rejected": -1.3727748394012451, "step": 9801 }, { "epoch": 0.5195452256645376, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1389451.875, "logits/rejected": -9432758.857142856, "logps/chosen": -19.99588966369629, "logps/rejected": -315.5584193638393, "loss": 0.171, "rewards/chosen": 0.6093757748603821, "rewards/margins": 2.4944392187254767, "rewards/rejected": -1.8850634438650948, "step": 9802 }, { "epoch": 0.5195982296663398, "grad_norm": 44.75, "kl": 0.4970054626464844, "learning_rate": 5e-07, "logits/chosen": 9162293.0, "logits/rejected": -26889202.666666668, "logps/chosen": -36.96112823486328, "logps/rejected": -497.0888671875, "loss": 0.3033, "rewards/chosen": 0.27377527952194214, "rewards/margins": 1.7968691388765972, "rewards/rejected": -1.523093859354655, "step": 9803 }, { "epoch": 0.5196512336681419, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8366862.0, "logits/rejected": -19727156.0, "logps/chosen": -134.84786987304688, "logps/rejected": -222.90586853027344, "loss": 0.3484, "rewards/chosen": 0.1373714804649353, "rewards/margins": 1.650000512599945, "rewards/rejected": -1.5126290321350098, "step": 9804 }, { "epoch": 0.5197042376699441, "grad_norm": 50.5, "kl": 0.10837173461914062, "learning_rate": 5e-07, "logits/chosen": -41934672.0, "logits/rejected": -2601678.75, "logps/chosen": -466.33526611328125, "logps/rejected": -93.62932586669922, "loss": 0.2679, "rewards/chosen": 0.11665572971105576, "rewards/margins": 2.9774850830435753, "rewards/rejected": -2.8608293533325195, "step": 9805 }, { "epoch": 0.5197572416717462, "grad_norm": 57.25, "kl": 1.5559558868408203, "learning_rate": 5e-07, "logits/chosen": -2032394.8, "logits/rejected": -11548157.333333334, "logps/chosen": -208.9009521484375, "logps/rejected": -261.501708984375, "loss": 0.3798, "rewards/chosen": 0.05548274517059326, "rewards/margins": 1.752893328666687, "rewards/rejected": -1.6974105834960938, "step": 9806 }, { "epoch": 0.5198102456735484, "grad_norm": 56.25, "kl": 1.602163314819336, "learning_rate": 5e-07, "logits/chosen": -24726037.333333332, "logits/rejected": 15946662.0, "logps/chosen": -326.8537190755208, "logps/rejected": -137.0670166015625, "loss": 0.4599, "rewards/chosen": 0.21661798159281412, "rewards/margins": 0.9045257965723673, "rewards/rejected": -0.6879078149795532, "step": 9807 }, { "epoch": 0.5198632496753505, "grad_norm": 86.0, "kl": 6.113758087158203, "learning_rate": 5e-07, "logits/chosen": -25209778.0, "logits/rejected": -23852860.0, "logps/chosen": -1038.6888427734375, "logps/rejected": -397.99957275390625, "loss": 0.2296, "rewards/chosen": 1.6242311000823975, "rewards/margins": 4.50107741355896, "rewards/rejected": -2.8768463134765625, "step": 9808 }, { "epoch": 0.5199162536771527, "grad_norm": 100.5, "kl": 0.6103286743164062, "learning_rate": 5e-07, "logits/chosen": -556653.5, "logits/rejected": -15978151.0, "logps/chosen": -729.2005004882812, "logps/rejected": -127.65531158447266, "loss": 0.2724, "rewards/chosen": 0.7013595700263977, "rewards/margins": 2.4174912571907043, "rewards/rejected": -1.7161316871643066, "step": 9809 }, { "epoch": 0.5199692576789547, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13574926.0, "logits/rejected": -30412208.0, "logps/chosen": -243.32667541503906, "logps/rejected": -310.3121032714844, "loss": 0.2596, "rewards/chosen": 0.3377157747745514, "rewards/margins": 3.121647983789444, "rewards/rejected": -2.7839322090148926, "step": 9810 }, { "epoch": 0.5200222616807569, "grad_norm": 67.0, "kl": 4.859561920166016, "learning_rate": 5e-07, "logits/chosen": 16333756.8, "logits/rejected": -23206560.0, "logps/chosen": -589.66494140625, "logps/rejected": -221.30289713541666, "loss": 0.2481, "rewards/chosen": 1.5099494934082032, "rewards/margins": 3.5961654663085936, "rewards/rejected": -2.0862159729003906, "step": 9811 }, { "epoch": 0.520075265682559, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42370620.8, "logits/rejected": -17748064.0, "logps/chosen": -277.98896484375, "logps/rejected": -150.89087931315103, "loss": 0.3615, "rewards/chosen": 0.32782974243164065, "rewards/margins": 1.7865076700846356, "rewards/rejected": -1.4586779276529949, "step": 9812 }, { "epoch": 0.5201282696843612, "grad_norm": 46.0, "kl": 3.359041213989258, "learning_rate": 5e-07, "logits/chosen": -21657753.333333332, "logits/rejected": -9282854.4, "logps/chosen": -343.2357584635417, "logps/rejected": -195.7149658203125, "loss": 0.284, "rewards/chosen": 0.5521489381790161, "rewards/margins": 2.691680073738098, "rewards/rejected": -2.139531135559082, "step": 9813 }, { "epoch": 0.5201812736861633, "grad_norm": 39.5, "kl": 0.1711273193359375, "learning_rate": 5e-07, "logits/chosen": -57270416.0, "logits/rejected": -37710912.0, "logps/chosen": -332.25233968098956, "logps/rejected": -440.924365234375, "loss": 0.2499, "rewards/chosen": -0.24227319161097208, "rewards/margins": 2.9334706743558248, "rewards/rejected": -3.1757438659667967, "step": 9814 }, { "epoch": 0.5202342776879655, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5214359.5, "logits/rejected": -8803886.0, "logps/chosen": -155.8082275390625, "logps/rejected": -198.73947143554688, "loss": 0.3227, "rewards/chosen": 0.2594667971134186, "rewards/margins": 2.1252003014087677, "rewards/rejected": -1.8657335042953491, "step": 9815 }, { "epoch": 0.5202872816897676, "grad_norm": 53.75, "kl": 1.4859085083007812, "learning_rate": 5e-07, "logits/chosen": -29024841.6, "logits/rejected": -29463024.0, "logps/chosen": -382.83349609375, "logps/rejected": -260.65220133463544, "loss": 0.3129, "rewards/chosen": 0.37094864845275877, "rewards/margins": 3.674164946873983, "rewards/rejected": -3.303216298421224, "step": 9816 }, { "epoch": 0.5203402856915698, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10179097.333333334, "logits/rejected": -1891748.0, "logps/chosen": -238.3370564778646, "logps/rejected": -150.875537109375, "loss": 0.2968, "rewards/chosen": -0.03254254659016927, "rewards/margins": 1.8632004419962567, "rewards/rejected": -1.8957429885864259, "step": 9817 }, { "epoch": 0.5203932896933718, "grad_norm": 71.5, "kl": 5.097110748291016, "learning_rate": 5e-07, "logits/chosen": -13683432.0, "logps/chosen": -225.45645141601562, "loss": 0.54, "rewards/chosen": 0.3414927124977112, "step": 9818 }, { "epoch": 0.520446293695174, "grad_norm": 50.75, "kl": 1.1675949096679688, "learning_rate": 5e-07, "logits/chosen": -11671128.0, "logits/rejected": -15749699.0, "logps/chosen": -151.16860961914062, "logps/rejected": -248.3793487548828, "loss": 0.3466, "rewards/chosen": 0.35492751002311707, "rewards/margins": 1.479722112417221, "rewards/rejected": -1.124794602394104, "step": 9819 }, { "epoch": 0.5204992976969761, "grad_norm": 43.5, "kl": 1.0110855102539062, "learning_rate": 5e-07, "logits/chosen": 39316157.333333336, "logits/rejected": -92683961.6, "logps/chosen": -910.4934895833334, "logps/rejected": -261.882421875, "loss": 0.25, "rewards/chosen": 1.232580025990804, "rewards/margins": 2.965838082631429, "rewards/rejected": -1.733258056640625, "step": 9820 }, { "epoch": 0.5205523016987783, "grad_norm": 39.25, "kl": 0.5610885620117188, "learning_rate": 5e-07, "logits/chosen": -26417608.0, "logits/rejected": -19434956.0, "logps/chosen": -227.5269775390625, "logps/rejected": -246.91123962402344, "loss": 0.255, "rewards/chosen": 0.3997535705566406, "rewards/margins": 3.134859323501587, "rewards/rejected": -2.7351057529449463, "step": 9821 }, { "epoch": 0.5206053057005804, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59689996.8, "logits/rejected": -63427994.666666664, "logps/chosen": -470.015234375, "logps/rejected": -189.046630859375, "loss": 0.3201, "rewards/chosen": 0.26594834327697753, "rewards/margins": 2.4125926176706947, "rewards/rejected": -2.1466442743937173, "step": 9822 }, { "epoch": 0.5206583097023826, "grad_norm": 43.0, "kl": 0.70361328125, "learning_rate": 5e-07, "logits/chosen": -36110105.6, "logits/rejected": -70763253.33333333, "logps/chosen": -279.16591796875, "logps/rejected": -986.4359537760416, "loss": 0.2786, "rewards/chosen": 0.5110622882843018, "rewards/margins": 6.715288464228313, "rewards/rejected": -6.204226175944011, "step": 9823 }, { "epoch": 0.5207113137041847, "grad_norm": 44.25, "kl": 1.1998462677001953, "learning_rate": 5e-07, "logits/chosen": -58821116.0, "logits/rejected": -32476702.0, "logps/chosen": -157.19285583496094, "logps/rejected": -509.3645324707031, "loss": 0.3004, "rewards/chosen": 0.0959966704249382, "rewards/margins": 3.8697028681635857, "rewards/rejected": -3.7737061977386475, "step": 9824 }, { "epoch": 0.5207643177059869, "grad_norm": 35.25, "kl": 0.04665565490722656, "learning_rate": 5e-07, "logits/chosen": -6268802.0, "logits/rejected": -29747717.333333332, "logps/chosen": -318.5345153808594, "logps/rejected": -330.5262044270833, "loss": 0.2481, "rewards/chosen": 0.8497185111045837, "rewards/margins": 2.5676051179567976, "rewards/rejected": -1.7178866068522136, "step": 9825 }, { "epoch": 0.5208173217077889, "grad_norm": 47.75, "kl": 1.4968547821044922, "learning_rate": 5e-07, "logits/chosen": -4832652.5, "logits/rejected": 27404440.0, "logps/chosen": -277.2583923339844, "logps/rejected": -453.943359375, "loss": 0.2754, "rewards/chosen": 0.44132664799690247, "rewards/margins": 2.9279012978076935, "rewards/rejected": -2.486574649810791, "step": 9826 }, { "epoch": 0.520870325709591, "grad_norm": 53.0, "kl": 4.060110092163086, "learning_rate": 5e-07, "logits/chosen": 2926196.0, "logits/rejected": -17118398.666666668, "logps/chosen": -213.1022216796875, "logps/rejected": -229.47188313802084, "loss": 0.3399, "rewards/chosen": 0.8357448577880859, "rewards/margins": 2.990708827972412, "rewards/rejected": -2.154963970184326, "step": 9827 }, { "epoch": 0.5209233297113932, "grad_norm": 52.75, "kl": 3.8483638763427734, "learning_rate": 5e-07, "logits/chosen": -7856163.2, "logits/rejected": -20164272.0, "logps/chosen": -97.26555786132812, "logps/rejected": -182.8488566080729, "loss": 0.3453, "rewards/chosen": 0.5080587863922119, "rewards/margins": 3.3434393405914307, "rewards/rejected": -2.8353805541992188, "step": 9828 }, { "epoch": 0.5209763337131953, "grad_norm": 44.0, "kl": 2.1191349029541016, "learning_rate": 5e-07, "logits/chosen": -21536604.8, "logits/rejected": -52524160.0, "logps/chosen": -215.96904296875, "logps/rejected": -655.6035970052084, "loss": 0.3081, "rewards/chosen": 0.39220438003540037, "rewards/margins": 3.616854445139567, "rewards/rejected": -3.2246500651041665, "step": 9829 }, { "epoch": 0.5210293377149975, "grad_norm": 55.0, "kl": 1.8881969451904297, "learning_rate": 5e-07, "logits/chosen": -8546428.0, "logits/rejected": -19042464.0, "logps/chosen": -245.2059529622396, "logps/rejected": -430.3141174316406, "loss": 0.3413, "rewards/chosen": 0.5528864065806071, "rewards/margins": 2.5908991495768228, "rewards/rejected": -2.038012742996216, "step": 9830 }, { "epoch": 0.5210823417167996, "grad_norm": 69.5, "kl": 6.836078643798828, "learning_rate": 5e-07, "logits/chosen": -62278150.4, "logits/rejected": -17751160.0, "logps/chosen": -438.389501953125, "logps/rejected": -418.27978515625, "loss": 0.3328, "rewards/chosen": 1.1985528945922852, "rewards/margins": 3.8945281982421873, "rewards/rejected": -2.6959753036499023, "step": 9831 }, { "epoch": 0.5211353457186018, "grad_norm": 71.5, "kl": 1.1490592956542969, "learning_rate": 5e-07, "logits/chosen": 28235768.0, "logits/rejected": -18039488.0, "logps/chosen": -515.9688720703125, "logps/rejected": -198.83023071289062, "loss": 0.2033, "rewards/chosen": 1.253576636314392, "rewards/margins": 3.6131900548934937, "rewards/rejected": -2.3596134185791016, "step": 9832 }, { "epoch": 0.5211883497204038, "grad_norm": 35.75, "kl": 2.700407028198242, "learning_rate": 5e-07, "logits/chosen": -48219648.0, "logits/rejected": -18368044.0, "logps/chosen": -377.2418212890625, "logps/rejected": -191.62391662597656, "loss": 0.2573, "rewards/chosen": 0.8152391314506531, "rewards/margins": 3.2181920409202576, "rewards/rejected": -2.4029529094696045, "step": 9833 }, { "epoch": 0.521241353722206, "grad_norm": 68.5, "kl": 2.528789520263672, "learning_rate": 5e-07, "logits/chosen": -32285562.666666668, "logits/rejected": -5999245.0, "logps/chosen": -352.8759358723958, "logps/rejected": -181.4709930419922, "loss": 0.3492, "rewards/chosen": 0.7853511174519857, "rewards/margins": 1.7980991204579673, "rewards/rejected": -1.0127480030059814, "step": 9834 }, { "epoch": 0.5212943577240081, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36526028.0, "logits/rejected": -9867574.0, "logps/chosen": -287.5916442871094, "logps/rejected": -312.5575764973958, "loss": 0.274, "rewards/chosen": 0.5926119089126587, "rewards/margins": 1.8403506676355998, "rewards/rejected": -1.2477387587229412, "step": 9835 }, { "epoch": 0.5213473617258103, "grad_norm": 39.0, "kl": 0.6543731689453125, "learning_rate": 5e-07, "logits/chosen": -2142174.0, "logits/rejected": -23395880.0, "logps/chosen": -206.17576599121094, "logps/rejected": -281.1131591796875, "loss": 0.2968, "rewards/chosen": 0.6044769287109375, "rewards/margins": 2.3097801208496094, "rewards/rejected": -1.7053031921386719, "step": 9836 }, { "epoch": 0.5214003657276124, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67131381.33333333, "logits/rejected": -48005574.4, "logps/chosen": -580.3907470703125, "logps/rejected": -376.6119140625, "loss": 0.3119, "rewards/chosen": -0.11064453919728597, "rewards/margins": 1.7426996151606242, "rewards/rejected": -1.8533441543579101, "step": 9837 }, { "epoch": 0.5214533697294146, "grad_norm": 78.0, "kl": 4.932432174682617, "learning_rate": 5e-07, "logits/chosen": -17292413.333333332, "logits/rejected": -23906578.0, "logps/chosen": -288.4841715494792, "logps/rejected": -301.143798828125, "loss": 0.4083, "rewards/chosen": 0.8234960238138834, "rewards/margins": 2.198696772257487, "rewards/rejected": -1.3752007484436035, "step": 9838 }, { "epoch": 0.5215063737312167, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8731293.0, "logits/rejected": -25030946.666666668, "logps/chosen": -32.97224044799805, "logps/rejected": -426.5552571614583, "loss": 0.3026, "rewards/chosen": 0.022537611424922943, "rewards/margins": 2.3346330945690474, "rewards/rejected": -2.3120954831441245, "step": 9839 }, { "epoch": 0.5215593777330189, "grad_norm": 68.0, "kl": 1.1552467346191406, "learning_rate": 5e-07, "logits/chosen": -58352115.2, "logits/rejected": -28282426.666666668, "logps/chosen": -347.427783203125, "logps/rejected": -229.8504638671875, "loss": 0.3312, "rewards/chosen": 0.6641141414642334, "rewards/margins": 1.8901614189147948, "rewards/rejected": -1.2260472774505615, "step": 9840 }, { "epoch": 0.5216123817348209, "grad_norm": 45.25, "kl": 0.23487091064453125, "learning_rate": 5e-07, "logits/chosen": -6252376.0, "logits/rejected": -39243481.6, "logps/chosen": -376.1342366536458, "logps/rejected": -227.083447265625, "loss": 0.3611, "rewards/chosen": -0.19915529092152914, "rewards/margins": 1.3695163170496623, "rewards/rejected": -1.5686716079711913, "step": 9841 }, { "epoch": 0.5216653857366231, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52811072.0, "logits/rejected": -23616149.333333332, "logps/chosen": -439.300537109375, "logps/rejected": -189.53629557291666, "loss": 0.3027, "rewards/chosen": 0.5938384056091308, "rewards/margins": 2.553097120920817, "rewards/rejected": -1.9592587153116863, "step": 9842 }, { "epoch": 0.5217183897384252, "grad_norm": 38.75, "kl": 0.2732534408569336, "learning_rate": 5e-07, "logits/chosen": -19314729.6, "logits/rejected": 5231542.333333333, "logps/chosen": -207.435400390625, "logps/rejected": -318.87900797526044, "loss": 0.2028, "rewards/chosen": 1.1202200889587401, "rewards/margins": 3.644695504506429, "rewards/rejected": -2.524475415547689, "step": 9843 }, { "epoch": 0.5217713937402274, "grad_norm": 44.25, "kl": 2.7684431076049805, "learning_rate": 5e-07, "logits/chosen": -38607808.0, "logits/rejected": -23131212.0, "logps/chosen": -474.8186950683594, "logps/rejected": -309.66412353515625, "loss": 0.2833, "rewards/chosen": 0.7289919853210449, "rewards/margins": 3.5580315589904785, "rewards/rejected": -2.8290395736694336, "step": 9844 }, { "epoch": 0.5218243977420295, "grad_norm": 53.0, "kl": 0.6253604888916016, "learning_rate": 5e-07, "logits/chosen": -4343936.0, "logits/rejected": -14748770.666666666, "logps/chosen": -100.3010986328125, "logps/rejected": -822.4046223958334, "loss": 0.3972, "rewards/chosen": 0.19009673595428467, "rewards/margins": 3.5821627378463745, "rewards/rejected": -3.39206600189209, "step": 9845 }, { "epoch": 0.5218774017438317, "grad_norm": 30.5, "kl": 1.4351921081542969, "learning_rate": 5e-07, "logits/chosen": 7322106.0, "logits/rejected": -16850661.333333332, "logps/chosen": -151.31866455078125, "logps/rejected": -298.3008626302083, "loss": 0.15, "rewards/chosen": 1.4789671897888184, "rewards/margins": 3.9352733294169107, "rewards/rejected": -2.4563061396280923, "step": 9846 }, { "epoch": 0.5219304057456338, "grad_norm": 57.75, "kl": 1.0430240631103516, "learning_rate": 5e-07, "logits/chosen": -40849160.0, "logits/rejected": -23040752.0, "logps/chosen": -264.31658935546875, "logps/rejected": -287.0452473958333, "loss": 0.3764, "rewards/chosen": -0.2799171507358551, "rewards/margins": 0.6419208943843842, "rewards/rejected": -0.9218380451202393, "step": 9847 }, { "epoch": 0.521983409747436, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49803956.0, "logits/rejected": -33761376.0, "logps/chosen": -91.132080078125, "logps/rejected": -307.84954833984375, "loss": 0.4177, "rewards/chosen": -0.40726548433303833, "rewards/margins": 0.8878483176231384, "rewards/rejected": -1.2951138019561768, "step": 9848 }, { "epoch": 0.522036413749238, "grad_norm": 46.75, "kl": 4.9253950119018555, "learning_rate": 5e-07, "logits/chosen": -7315960.0, "logits/rejected": -17416458.666666668, "logps/chosen": -227.557763671875, "logps/rejected": -590.4426676432291, "loss": 0.4, "rewards/chosen": 0.42638869285583497, "rewards/margins": 3.5119116624196374, "rewards/rejected": -3.0855229695638022, "step": 9849 }, { "epoch": 0.5220894177510402, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3206687.0, "logits/rejected": 11036366.666666666, "logps/chosen": -92.71430969238281, "logps/rejected": -255.4962158203125, "loss": 0.3162, "rewards/chosen": 0.16933289170265198, "rewards/margins": 1.4808037181695302, "rewards/rejected": -1.3114708264668782, "step": 9850 }, { "epoch": 0.5221424217528423, "grad_norm": 51.75, "kl": 0.8202028274536133, "learning_rate": 5e-07, "logits/chosen": -35570560.0, "logits/rejected": -8089725.333333333, "logps/chosen": -125.7585693359375, "logps/rejected": -128.53518676757812, "loss": 0.3514, "rewards/chosen": 0.5579775810241699, "rewards/margins": 2.010908667246501, "rewards/rejected": -1.4529310862223308, "step": 9851 }, { "epoch": 0.5221954257546445, "grad_norm": 70.5, "kl": 0.6518173217773438, "learning_rate": 5e-07, "logits/chosen": -16281620.0, "logits/rejected": -18417732.0, "logps/chosen": -292.91522216796875, "logps/rejected": -237.69981384277344, "loss": 0.3582, "rewards/chosen": 0.05452112480998039, "rewards/margins": 1.405389230698347, "rewards/rejected": -1.3508681058883667, "step": 9852 }, { "epoch": 0.5222484297564466, "grad_norm": 34.25, "kl": 1.9769039154052734, "learning_rate": 5e-07, "logits/chosen": -4370509.6, "logits/rejected": -7960294.666666667, "logps/chosen": -200.9743896484375, "logps/rejected": -223.15059407552084, "loss": 0.2228, "rewards/chosen": 1.5144139289855958, "rewards/margins": 4.088832950592041, "rewards/rejected": -2.5744190216064453, "step": 9853 }, { "epoch": 0.5223014337582488, "grad_norm": 53.5, "kl": 1.0000534057617188, "learning_rate": 5e-07, "logits/chosen": -19372203.2, "logits/rejected": -17263136.0, "logps/chosen": -340.5799560546875, "logps/rejected": -242.107666015625, "loss": 0.3579, "rewards/chosen": 0.48205175399780276, "rewards/margins": 1.9568494796752929, "rewards/rejected": -1.4747977256774902, "step": 9854 }, { "epoch": 0.5223544377600509, "grad_norm": 38.0, "kl": 1.2932891845703125, "learning_rate": 5e-07, "logits/chosen": -499996.09375, "logits/rejected": -2080868.25, "logps/chosen": -82.87344360351562, "logps/rejected": -242.3120880126953, "loss": 0.3418, "rewards/chosen": 0.54587322473526, "rewards/margins": 2.0860827565193176, "rewards/rejected": -1.5402095317840576, "step": 9855 }, { "epoch": 0.522407441761853, "grad_norm": 48.5, "kl": 2.3512535095214844, "learning_rate": 5e-07, "logits/chosen": -6587309.333333333, "logits/rejected": -36984940.8, "logps/chosen": -279.1209716796875, "logps/rejected": -286.2713134765625, "loss": 0.2278, "rewards/chosen": 1.4086745580037434, "rewards/margins": 3.2571120580037434, "rewards/rejected": -1.8484375, "step": 9856 }, { "epoch": 0.5224604457636551, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40964845.333333336, "logits/rejected": -12796468.8, "logps/chosen": -339.3265787760417, "logps/rejected": -270.573681640625, "loss": 0.2883, "rewards/chosen": 0.21320192019144693, "rewards/margins": 2.1636030356089275, "rewards/rejected": -1.9504011154174805, "step": 9857 }, { "epoch": 0.5225134497654573, "grad_norm": 50.75, "kl": 1.941305160522461, "learning_rate": 5e-07, "logits/chosen": -23443832.0, "logits/rejected": -8228903.0, "logps/chosen": -394.8765869140625, "logps/rejected": -119.35565185546875, "loss": 0.4134, "rewards/chosen": 0.5873657464981079, "rewards/margins": 1.650275468826294, "rewards/rejected": -1.062909722328186, "step": 9858 }, { "epoch": 0.5225664537672594, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33875968.0, "logits/rejected": -57654954.666666664, "logps/chosen": -485.9087829589844, "logps/rejected": -631.880126953125, "loss": 0.1662, "rewards/chosen": 0.03568306565284729, "rewards/margins": 3.6461452543735504, "rewards/rejected": -3.610462188720703, "step": 9859 }, { "epoch": 0.5226194577690616, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19203877.333333332, "logits/rejected": -67920.8, "logps/chosen": -323.17506917317706, "logps/rejected": -357.35517578125, "loss": 0.2496, "rewards/chosen": 0.12519118189811707, "rewards/margins": 2.18741118311882, "rewards/rejected": -2.062220001220703, "step": 9860 }, { "epoch": 0.5226724617708637, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14995210.666666666, "logits/rejected": -14610513.6, "logps/chosen": -200.38968912760416, "logps/rejected": -349.90771484375, "loss": 0.2401, "rewards/chosen": 0.8542625109354655, "rewards/margins": 2.9336661974589027, "rewards/rejected": -2.0794036865234373, "step": 9861 }, { "epoch": 0.5227254657726659, "grad_norm": 52.5, "kl": 0.3185081481933594, "learning_rate": 5e-07, "logits/chosen": -18874636.8, "logits/rejected": -2719112.0, "logps/chosen": -324.3641357421875, "logps/rejected": -196.1998494466146, "loss": 0.4215, "rewards/chosen": -0.25291736125946046, "rewards/margins": 1.4425575812657674, "rewards/rejected": -1.6954749425252278, "step": 9862 }, { "epoch": 0.522778469774468, "grad_norm": 40.0, "kl": 2.0771894454956055, "learning_rate": 5e-07, "logits/chosen": -47184068.0, "logits/rejected": -31307480.0, "logps/chosen": -137.9016571044922, "logps/rejected": -373.6417236328125, "loss": 0.3617, "rewards/chosen": -0.05647982656955719, "rewards/margins": 2.2146705240011215, "rewards/rejected": -2.2711503505706787, "step": 9863 }, { "epoch": 0.5228314737762702, "grad_norm": 38.25, "kl": 1.92529296875, "learning_rate": 5e-07, "logits/chosen": -8353586.5, "logits/rejected": -20897981.333333332, "logps/chosen": -143.60076904296875, "logps/rejected": -193.3200887044271, "loss": 0.2208, "rewards/chosen": 0.6715168356895447, "rewards/margins": 3.0574551622072854, "rewards/rejected": -2.3859383265177407, "step": 9864 }, { "epoch": 0.5228844777780722, "grad_norm": 49.25, "kl": 0.7012577056884766, "learning_rate": 5e-07, "logits/chosen": -22712016.0, "logits/rejected": -6299778.4, "logps/chosen": -197.2032674153646, "logps/rejected": -364.5623291015625, "loss": 0.2578, "rewards/chosen": 0.6633242766062418, "rewards/margins": 3.0928748289744057, "rewards/rejected": -2.429550552368164, "step": 9865 }, { "epoch": 0.5229374817798744, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55349704.0, "logits/rejected": -12620583.0, "logps/chosen": -291.7041320800781, "logps/rejected": -180.40606689453125, "loss": 0.3907, "rewards/chosen": 0.07474341988563538, "rewards/margins": 1.0441305935382843, "rewards/rejected": -0.9693871736526489, "step": 9866 }, { "epoch": 0.5229904857816765, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1604539.6, "logits/rejected": -27556042.666666668, "logps/chosen": -275.4168212890625, "logps/rejected": -274.94578043619794, "loss": 0.3607, "rewards/chosen": 0.10014951229095459, "rewards/margins": 2.0788154999415083, "rewards/rejected": -1.9786659876505535, "step": 9867 }, { "epoch": 0.5230434897834787, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80947648.0, "logits/rejected": -29300740.57142857, "logps/chosen": -318.9310302734375, "logps/rejected": -379.77406529017856, "loss": 0.1415, "rewards/chosen": -0.06532897800207138, "rewards/margins": 2.498591996729374, "rewards/rejected": -2.5639209747314453, "step": 9868 }, { "epoch": 0.5230964937852808, "grad_norm": 50.5, "kl": 1.7652664184570312, "learning_rate": 5e-07, "logits/chosen": -17347336.0, "logits/rejected": -23127996.0, "logps/chosen": -420.75689697265625, "logps/rejected": -384.2704772949219, "loss": 0.2179, "rewards/chosen": 1.124749779701233, "rewards/margins": 3.7163137197494507, "rewards/rejected": -2.5915639400482178, "step": 9869 }, { "epoch": 0.523149497787083, "grad_norm": 51.25, "kl": 2.141653060913086, "learning_rate": 5e-07, "logits/chosen": -17544297.333333332, "logits/rejected": -30339078.4, "logps/chosen": -319.1272379557292, "logps/rejected": -453.59072265625, "loss": 0.1837, "rewards/chosen": 1.0780657927195232, "rewards/margins": 4.346709744135539, "rewards/rejected": -3.2686439514160157, "step": 9870 }, { "epoch": 0.523202501788885, "grad_norm": 64.5, "kl": 0.08575820922851562, "learning_rate": 5e-07, "logits/chosen": -87521957.33333333, "logits/rejected": -14601683.2, "logps/chosen": -375.8603108723958, "logps/rejected": -349.8083251953125, "loss": 0.3236, "rewards/chosen": 0.15781555573145548, "rewards/margins": 1.5139511148134868, "rewards/rejected": -1.3561355590820312, "step": 9871 }, { "epoch": 0.5232555057906872, "grad_norm": 45.0, "kl": 1.9021835327148438, "learning_rate": 5e-07, "logits/chosen": -44516044.8, "logits/rejected": 19859192.0, "logps/chosen": -578.6232421875, "logps/rejected": -485.8799641927083, "loss": 0.3106, "rewards/chosen": 0.5809399127960205, "rewards/margins": 3.2921775023142494, "rewards/rejected": -2.711237589518229, "step": 9872 }, { "epoch": 0.5233085097924893, "grad_norm": 46.75, "kl": 0.6241874694824219, "learning_rate": 5e-07, "logits/chosen": -11206635.0, "logits/rejected": -23767268.0, "logps/chosen": -347.0860595703125, "logps/rejected": -386.999755859375, "loss": 0.2661, "rewards/chosen": 0.3392740488052368, "rewards/margins": 2.7924000024795532, "rewards/rejected": -2.4531259536743164, "step": 9873 }, { "epoch": 0.5233615137942915, "grad_norm": 46.75, "kl": 0.4003162384033203, "learning_rate": 5e-07, "logits/chosen": -2622114.1666666665, "logits/rejected": 7442279.2, "logps/chosen": -150.43794759114584, "logps/rejected": -477.251025390625, "loss": 0.2691, "rewards/chosen": 0.38249389330546063, "rewards/margins": 2.6849604765574138, "rewards/rejected": -2.3024665832519533, "step": 9874 }, { "epoch": 0.5234145177960936, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5183810.0, "logits/rejected": -31136812.8, "logps/chosen": -329.21974690755206, "logps/rejected": -189.07314453125, "loss": 0.3573, "rewards/chosen": -0.30238596598307294, "rewards/margins": 1.1492303212483723, "rewards/rejected": -1.4516162872314453, "step": 9875 }, { "epoch": 0.5234675217978958, "grad_norm": 54.0, "kl": 1.1470756530761719, "learning_rate": 5e-07, "logits/chosen": -23390096.0, "logits/rejected": -34066768.0, "logps/chosen": -386.2991943359375, "logps/rejected": -644.5594482421875, "loss": 0.3807, "rewards/chosen": 0.3131604592005412, "rewards/margins": 3.0631499687830606, "rewards/rejected": -2.7499895095825195, "step": 9876 }, { "epoch": 0.5235205257996979, "grad_norm": 71.5, "kl": 2.580230712890625, "learning_rate": 5e-07, "logits/chosen": -51941613.71428572, "logits/rejected": 3000092.5, "logps/chosen": -470.78867885044644, "logps/rejected": -22.582914352416992, "loss": 0.3969, "rewards/chosen": 0.7945361818586077, "rewards/margins": 1.0540881838117326, "rewards/rejected": -0.259552001953125, "step": 9877 }, { "epoch": 0.5235735298015, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15852514.0, "logits/rejected": -22432501.333333332, "logps/chosen": -231.66778564453125, "logps/rejected": -365.608154296875, "loss": 0.1584, "rewards/chosen": 0.6967211961746216, "rewards/margins": 3.5599897305170694, "rewards/rejected": -2.8632685343424478, "step": 9878 }, { "epoch": 0.5236265338033022, "grad_norm": 53.5, "kl": 0.10205841064453125, "learning_rate": 5e-07, "logits/chosen": -54177676.8, "logits/rejected": -30165482.666666668, "logps/chosen": -527.30283203125, "logps/rejected": -710.511474609375, "loss": 0.2493, "rewards/chosen": 0.5541729927062988, "rewards/margins": 4.511119047800699, "rewards/rejected": -3.956946055094401, "step": 9879 }, { "epoch": 0.5236795378051042, "grad_norm": 47.25, "kl": 0.2231903076171875, "learning_rate": 5e-07, "logits/chosen": -30489941.333333332, "logits/rejected": -39096297.6, "logps/chosen": -331.8406982421875, "logps/rejected": -395.654345703125, "loss": 0.2474, "rewards/chosen": 0.7665417194366455, "rewards/margins": 2.268766450881958, "rewards/rejected": -1.5022247314453125, "step": 9880 }, { "epoch": 0.5237325418069064, "grad_norm": 56.5, "kl": 2.1453332901000977, "learning_rate": 5e-07, "logits/chosen": -59243110.4, "logits/rejected": -16248406.666666666, "logps/chosen": -301.6835693359375, "logps/rejected": -448.6944580078125, "loss": 0.2953, "rewards/chosen": 0.9921859741210938, "rewards/margins": 2.6452723185221356, "rewards/rejected": -1.6530863444010417, "step": 9881 }, { "epoch": 0.5237855458087085, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11368730.666666666, "logits/rejected": -33593302.4, "logps/chosen": -82.16580200195312, "logps/rejected": -278.662255859375, "loss": 0.3325, "rewards/chosen": -0.2947250207265218, "rewards/margins": 1.4522831122080486, "rewards/rejected": -1.7470081329345704, "step": 9882 }, { "epoch": 0.5238385498105107, "grad_norm": 51.0, "kl": 0.36861419677734375, "learning_rate": 5e-07, "logits/chosen": -73114392.0, "logits/rejected": -28834165.333333332, "logps/chosen": -462.3548278808594, "logps/rejected": -436.9254964192708, "loss": 0.202, "rewards/chosen": 0.5488495230674744, "rewards/margins": 2.5568149288495383, "rewards/rejected": -2.007965405782064, "step": 9883 }, { "epoch": 0.5238915538123128, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19991296.0, "logits/rejected": -54080986.666666664, "logps/chosen": -258.847998046875, "logps/rejected": -218.45894368489584, "loss": 0.3409, "rewards/chosen": 0.4013506412506104, "rewards/margins": 2.3906914234161376, "rewards/rejected": -1.9893407821655273, "step": 9884 }, { "epoch": 0.523944557814115, "grad_norm": 45.75, "kl": 4.095579147338867, "learning_rate": 5e-07, "logits/chosen": -21265777.6, "logits/rejected": -24150941.333333332, "logps/chosen": -202.752978515625, "logps/rejected": -184.36324055989584, "loss": 0.3481, "rewards/chosen": 0.7632739067077636, "rewards/margins": 2.5468998273213703, "rewards/rejected": -1.7836259206136067, "step": 9885 }, { "epoch": 0.5239975618159171, "grad_norm": 43.0, "kl": 0.32586669921875, "learning_rate": 5e-07, "logits/chosen": -2153274.0, "logits/rejected": -1885880.25, "logps/chosen": -409.88604736328125, "logps/rejected": -79.41096496582031, "loss": 0.2293, "rewards/chosen": 1.3132591247558594, "rewards/margins": 2.909707546234131, "rewards/rejected": -1.5964484214782715, "step": 9886 }, { "epoch": 0.5240505658177192, "grad_norm": 41.0, "kl": 0.07635879516601562, "learning_rate": 5e-07, "logits/chosen": -33911801.6, "logits/rejected": -38752824.0, "logps/chosen": -156.698095703125, "logps/rejected": -698.3922526041666, "loss": 0.3443, "rewards/chosen": -0.20216972827911378, "rewards/margins": 3.537515330314636, "rewards/rejected": -3.73968505859375, "step": 9887 }, { "epoch": 0.5241035698195213, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53975514.666666664, "logits/rejected": -37898771.2, "logps/chosen": -468.4072265625, "logps/rejected": -300.9073486328125, "loss": 0.2915, "rewards/chosen": -0.677141269048055, "rewards/margins": 2.0269465605417887, "rewards/rejected": -2.704087829589844, "step": 9888 }, { "epoch": 0.5241565738213235, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4985455.0, "logits/rejected": -37899636.0, "logps/chosen": -355.77117919921875, "logps/rejected": -213.68380737304688, "loss": 0.2504, "rewards/chosen": 0.40220099687576294, "rewards/margins": 2.775019943714142, "rewards/rejected": -2.372818946838379, "step": 9889 }, { "epoch": 0.5242095778231256, "grad_norm": 51.25, "kl": 0.101104736328125, "learning_rate": 5e-07, "logits/chosen": -18522502.4, "logits/rejected": -42495541.333333336, "logps/chosen": -414.901708984375, "logps/rejected": -281.9393310546875, "loss": 0.2227, "rewards/chosen": 0.9261101722717285, "rewards/margins": 3.2949947357177733, "rewards/rejected": -2.368884563446045, "step": 9890 }, { "epoch": 0.5242625818249278, "grad_norm": 61.25, "kl": 5.294342041015625, "learning_rate": 5e-07, "logits/chosen": -56117641.14285714, "logits/rejected": 1394435.0, "logps/chosen": -649.5073939732143, "logps/rejected": -56.75788497924805, "loss": 0.3946, "rewards/chosen": 1.1515049253191267, "rewards/margins": 2.810183814593724, "rewards/rejected": -1.6586788892745972, "step": 9891 }, { "epoch": 0.5243155858267299, "grad_norm": 54.75, "kl": 1.4470300674438477, "learning_rate": 5e-07, "logits/chosen": -26369688.0, "logits/rejected": -33815302.4, "logps/chosen": -267.0742594401042, "logps/rejected": -404.7748291015625, "loss": 0.2576, "rewards/chosen": 1.3193120956420898, "rewards/margins": 2.6220523834228517, "rewards/rejected": -1.3027402877807617, "step": 9892 }, { "epoch": 0.5243685898285321, "grad_norm": 52.5, "kl": 7.132328033447266, "learning_rate": 5e-07, "logits/chosen": -28134009.14285714, "logits/rejected": -35660076.0, "logps/chosen": -307.09824916294644, "logps/rejected": -837.1614379882812, "loss": 0.4348, "rewards/chosen": 0.8877137729099819, "rewards/margins": 3.8795412608555386, "rewards/rejected": -2.9918274879455566, "step": 9893 }, { "epoch": 0.5244215938303342, "grad_norm": 45.0, "kl": 1.7382240295410156, "learning_rate": 5e-07, "logits/chosen": -44528601.6, "logits/rejected": -51180106.666666664, "logps/chosen": -158.15732421875, "logps/rejected": -359.2803548177083, "loss": 0.3718, "rewards/chosen": 0.1623867154121399, "rewards/margins": 2.0563462376594543, "rewards/rejected": -1.8939595222473145, "step": 9894 }, { "epoch": 0.5244745978321363, "grad_norm": 60.0, "kl": 4.456414222717285, "learning_rate": 5e-07, "logits/chosen": -25371267.2, "logits/rejected": -7103453.333333333, "logps/chosen": -280.9123046875, "logps/rejected": -122.9456075032552, "loss": 0.3411, "rewards/chosen": 0.879417610168457, "rewards/margins": 2.4274897893269856, "rewards/rejected": -1.5480721791585286, "step": 9895 }, { "epoch": 0.5245276018339384, "grad_norm": 37.25, "kl": 0.4935493469238281, "learning_rate": 5e-07, "logits/chosen": -28527317.333333332, "logits/rejected": -6511541.6, "logps/chosen": -249.61531575520834, "logps/rejected": -171.402001953125, "loss": 0.2991, "rewards/chosen": 0.008976886669794718, "rewards/margins": 2.129741809765498, "rewards/rejected": -2.1207649230957033, "step": 9896 }, { "epoch": 0.5245806058357406, "grad_norm": 56.5, "kl": 2.6449661254882812, "learning_rate": 5e-07, "logits/chosen": -21556701.333333332, "logits/rejected": -16456079.0, "logps/chosen": -257.8133951822917, "logps/rejected": -237.93260192871094, "loss": 0.4136, "rewards/chosen": 0.3597232500712077, "rewards/margins": 2.106067697207133, "rewards/rejected": -1.7463444471359253, "step": 9897 }, { "epoch": 0.5246336098375427, "grad_norm": 44.0, "kl": 1.818826675415039, "learning_rate": 5e-07, "logits/chosen": -54313324.0, "logits/rejected": -17930530.0, "logps/chosen": -494.25628662109375, "logps/rejected": -186.935791015625, "loss": 0.227, "rewards/chosen": 1.0740543603897095, "rewards/margins": 3.4212948083877563, "rewards/rejected": -2.347240447998047, "step": 9898 }, { "epoch": 0.5246866138393449, "grad_norm": 47.75, "kl": 1.8081912994384766, "learning_rate": 5e-07, "logits/chosen": -8310855.333333333, "logits/rejected": -2233742.4, "logps/chosen": -177.00288899739584, "logps/rejected": -375.455224609375, "loss": 0.2268, "rewards/chosen": 1.083463986714681, "rewards/margins": 3.3397950490315758, "rewards/rejected": -2.2563310623168946, "step": 9899 }, { "epoch": 0.524739617841147, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41384960.0, "logits/rejected": -24030332.8, "logps/chosen": -229.09895833333334, "logps/rejected": -518.683251953125, "loss": 0.2438, "rewards/chosen": 0.04977952937285105, "rewards/margins": 3.1496441076199213, "rewards/rejected": -3.0998645782470704, "step": 9900 }, { "epoch": 0.5247926218429492, "grad_norm": 57.5, "kl": 0.3118114471435547, "learning_rate": 5e-07, "logits/chosen": -7148398.5, "logits/rejected": -63767936.0, "logps/chosen": -108.51175689697266, "logps/rejected": -718.6524047851562, "loss": 0.2352, "rewards/chosen": 0.880565881729126, "rewards/margins": 3.1404261589050293, "rewards/rejected": -2.2598602771759033, "step": 9901 }, { "epoch": 0.5248456258447513, "grad_norm": 28.0, "kl": 0.7003889083862305, "learning_rate": 5e-07, "logits/chosen": 2122664.5, "logits/rejected": -79350778.66666667, "logps/chosen": -92.33900451660156, "logps/rejected": -612.9055989583334, "loss": 0.1747, "rewards/chosen": 0.5233948826789856, "rewards/margins": 4.138427396615347, "rewards/rejected": -3.615032513936361, "step": 9902 }, { "epoch": 0.5248986298465534, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19211136.0, "logits/rejected": -28701452.0, "logps/chosen": -207.3405303955078, "logps/rejected": -414.3502197265625, "loss": 0.2843, "rewards/chosen": 0.12636713683605194, "rewards/margins": 2.51472906768322, "rewards/rejected": -2.388361930847168, "step": 9903 }, { "epoch": 0.5249516338483555, "grad_norm": 48.5, "kl": 0.37053680419921875, "learning_rate": 5e-07, "logits/chosen": -11905410.0, "logits/rejected": -15023312.0, "logps/chosen": -304.3367004394531, "logps/rejected": -314.56182861328125, "loss": 0.2736, "rewards/chosen": 0.2773788571357727, "rewards/margins": 2.676394999027252, "rewards/rejected": -2.3990161418914795, "step": 9904 }, { "epoch": 0.5250046378501577, "grad_norm": 53.5, "kl": 0.5782470703125, "learning_rate": 5e-07, "logits/chosen": -32779317.333333332, "logits/rejected": -5912860.0, "logps/chosen": -397.849853515625, "logps/rejected": -389.6728759765625, "loss": 0.2538, "rewards/chosen": 0.6000478267669678, "rewards/margins": 2.6973886013031008, "rewards/rejected": -2.097340774536133, "step": 9905 }, { "epoch": 0.5250576418519598, "grad_norm": 55.0, "kl": 0.11505317687988281, "learning_rate": 5e-07, "logits/chosen": -125949619.2, "logits/rejected": -21658056.0, "logps/chosen": -251.900439453125, "logps/rejected": -294.1287027994792, "loss": 0.3559, "rewards/chosen": 0.26569015979766847, "rewards/margins": 1.7733282963434855, "rewards/rejected": -1.5076381365458171, "step": 9906 }, { "epoch": 0.525110645853762, "grad_norm": 49.25, "kl": 0.13652992248535156, "learning_rate": 5e-07, "logits/chosen": -20442734.0, "logits/rejected": -38535748.0, "logps/chosen": -362.4592590332031, "logps/rejected": -394.0257263183594, "loss": 0.242, "rewards/chosen": 0.8552044630050659, "rewards/margins": 3.324400782585144, "rewards/rejected": -2.469196319580078, "step": 9907 }, { "epoch": 0.5251636498555641, "grad_norm": 55.5, "kl": 0.17685699462890625, "learning_rate": 5e-07, "logits/chosen": -30671664.0, "logits/rejected": -13986489.333333334, "logps/chosen": -216.5448974609375, "logps/rejected": -542.8229573567709, "loss": 0.3593, "rewards/chosen": 0.17818461656570433, "rewards/margins": 2.4359278162320455, "rewards/rejected": -2.2577431996663413, "step": 9908 }, { "epoch": 0.5252166538573663, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6005954.666666667, "logits/rejected": -72638937.6, "logps/chosen": -192.57670084635416, "logps/rejected": -352.63603515625, "loss": 0.3566, "rewards/chosen": -0.2451370358467102, "rewards/margins": 1.4168500781059266, "rewards/rejected": -1.6619871139526368, "step": 9909 }, { "epoch": 0.5252696578591683, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25007048.0, "logits/rejected": -47398300.0, "logps/chosen": -279.61077880859375, "logps/rejected": -304.114013671875, "loss": 0.2267, "rewards/chosen": 0.8167943954467773, "rewards/margins": 3.2582764625549316, "rewards/rejected": -2.4414820671081543, "step": 9910 }, { "epoch": 0.5253226618609705, "grad_norm": 47.75, "kl": 0.04036521911621094, "learning_rate": 5e-07, "logits/chosen": -33915168.0, "logits/rejected": -29581558.4, "logps/chosen": -284.1597493489583, "logps/rejected": -480.219189453125, "loss": 0.2656, "rewards/chosen": 0.06894430021444957, "rewards/margins": 2.5794747720162072, "rewards/rejected": -2.510530471801758, "step": 9911 }, { "epoch": 0.5253756658627726, "grad_norm": 56.25, "kl": 1.7911148071289062, "learning_rate": 5e-07, "logits/chosen": -43600344.0, "logits/rejected": -7792741.0, "logps/chosen": -387.38299560546875, "logps/rejected": -148.44537353515625, "loss": 0.3369, "rewards/chosen": 0.36148911714553833, "rewards/margins": 3.2258930802345276, "rewards/rejected": -2.8644039630889893, "step": 9912 }, { "epoch": 0.5254286698645748, "grad_norm": 70.5, "kl": 1.0191535949707031, "learning_rate": 5e-07, "logits/chosen": -40167012.571428575, "logits/rejected": -20074294.0, "logps/chosen": -402.0003138950893, "logps/rejected": -366.6392822265625, "loss": 0.4081, "rewards/chosen": 0.25909222875322613, "rewards/margins": 5.265806095940726, "rewards/rejected": -5.0067138671875, "step": 9913 }, { "epoch": 0.5254816738663769, "grad_norm": 39.25, "kl": 2.875493049621582, "learning_rate": 5e-07, "logits/chosen": -13315836.0, "logits/rejected": -5707296.0, "logps/chosen": -228.1634063720703, "logps/rejected": -145.05250549316406, "loss": 0.1969, "rewards/chosen": 1.6123913526535034, "rewards/margins": 3.3176982402801514, "rewards/rejected": -1.705306887626648, "step": 9914 }, { "epoch": 0.5255346778681791, "grad_norm": 60.0, "kl": 0.2999153137207031, "learning_rate": 5e-07, "logits/chosen": -2187123.5, "logits/rejected": -38139896.0, "logps/chosen": -371.9071044921875, "logps/rejected": -226.76220703125, "loss": 0.2841, "rewards/chosen": 0.9992210865020752, "rewards/margins": 2.1565818786621094, "rewards/rejected": -1.1573607921600342, "step": 9915 }, { "epoch": 0.5255876818699812, "grad_norm": 50.0, "kl": 3.3187503814697266, "learning_rate": 5e-07, "logits/chosen": 5144275.2, "logits/rejected": -42409408.0, "logps/chosen": -514.76533203125, "logps/rejected": -423.6188151041667, "loss": 0.244, "rewards/chosen": 1.307440948486328, "rewards/margins": 3.3058775583902995, "rewards/rejected": -1.9984366099039714, "step": 9916 }, { "epoch": 0.5256406858717834, "grad_norm": 55.25, "kl": 1.0510177612304688, "learning_rate": 5e-07, "logits/chosen": 514747.6666666667, "logits/rejected": -103966.4, "logps/chosen": -406.8730875651042, "logps/rejected": -331.590234375, "loss": 0.2642, "rewards/chosen": 0.40062793095906574, "rewards/margins": 2.1010246594746906, "rewards/rejected": -1.700396728515625, "step": 9917 }, { "epoch": 0.5256936898735854, "grad_norm": 50.25, "kl": 3.315479278564453, "learning_rate": 5e-07, "logits/chosen": -11074207.0, "logits/rejected": -21321584.0, "logps/chosen": -657.845947265625, "logps/rejected": -153.83541870117188, "loss": 0.2772, "rewards/chosen": 1.8581948280334473, "rewards/margins": 2.9323962926864624, "rewards/rejected": -1.0742014646530151, "step": 9918 }, { "epoch": 0.5257466938753876, "grad_norm": 52.5, "kl": 2.3609142303466797, "learning_rate": 5e-07, "logits/chosen": 7891072.666666667, "logits/rejected": -5804364.0, "logps/chosen": -38.05608622233073, "logps/rejected": -244.21710205078125, "loss": 0.401, "rewards/chosen": 0.4226776361465454, "rewards/margins": 2.1707069873809814, "rewards/rejected": -1.748029351234436, "step": 9919 }, { "epoch": 0.5257996978771897, "grad_norm": 50.5, "kl": 1.5972115993499756, "learning_rate": 5e-07, "logits/chosen": 77082233.6, "logits/rejected": -7364862.0, "logps/chosen": -258.3114013671875, "logps/rejected": -419.422607421875, "loss": 0.3406, "rewards/chosen": 0.6504498481750488, "rewards/margins": 2.4733949661254884, "rewards/rejected": -1.8229451179504395, "step": 9920 }, { "epoch": 0.5258527018789919, "grad_norm": 56.0, "kl": 1.0433464050292969, "learning_rate": 5e-07, "logits/chosen": -34793644.8, "logits/rejected": -18636281.333333332, "logps/chosen": -422.4216796875, "logps/rejected": -103.00386555989583, "loss": 0.265, "rewards/chosen": 0.9167914390563965, "rewards/margins": 2.396852334340413, "rewards/rejected": -1.4800608952840169, "step": 9921 }, { "epoch": 0.525905705880794, "grad_norm": 43.5, "kl": 0.9069709777832031, "learning_rate": 5e-07, "logits/chosen": -25587892.0, "logits/rejected": -43277348.0, "logps/chosen": -339.3507995605469, "logps/rejected": -573.0757446289062, "loss": 0.2419, "rewards/chosen": 1.2788883447647095, "rewards/margins": 3.4946399927139282, "rewards/rejected": -2.2157516479492188, "step": 9922 }, { "epoch": 0.5259587098825962, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34897556.0, "logits/rejected": -33327114.666666668, "logps/chosen": -722.2528686523438, "logps/rejected": -592.5321451822916, "loss": 0.1306, "rewards/chosen": 1.165185570716858, "rewards/margins": 3.937158226966858, "rewards/rejected": -2.77197265625, "step": 9923 }, { "epoch": 0.5260117138843983, "grad_norm": 112.5, "kl": 0.855712890625, "learning_rate": 5e-07, "logits/chosen": -34077948.8, "logits/rejected": -36551242.666666664, "logps/chosen": -367.2841552734375, "logps/rejected": -211.3308308919271, "loss": 0.2947, "rewards/chosen": 0.6652385711669921, "rewards/margins": 2.7094831784566242, "rewards/rejected": -2.0442446072896323, "step": 9924 }, { "epoch": 0.5260647178862005, "grad_norm": 49.5, "kl": 0.19185638427734375, "learning_rate": 5e-07, "logits/chosen": -13027988.0, "logits/rejected": -37360256.0, "logps/chosen": -95.79376983642578, "logps/rejected": -380.4247741699219, "loss": 0.3294, "rewards/chosen": -0.05493735522031784, "rewards/margins": 1.9966526105999947, "rewards/rejected": -2.0515899658203125, "step": 9925 }, { "epoch": 0.5261177218880025, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18619180.0, "logits/rejected": -39186528.0, "logps/chosen": -258.3546447753906, "logps/rejected": -460.9395345052083, "loss": 0.2344, "rewards/chosen": 0.7348961234092712, "rewards/margins": 2.805681804815928, "rewards/rejected": -2.0707856814066568, "step": 9926 }, { "epoch": 0.5261707258898047, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75181936.0, "logits/rejected": -22209289.333333332, "logps/chosen": -452.5568542480469, "logps/rejected": -252.80194091796875, "loss": 0.3007, "rewards/chosen": -0.7591568231582642, "rewards/margins": 1.1370484431584675, "rewards/rejected": -1.8962052663167317, "step": 9927 }, { "epoch": 0.5262237298916068, "grad_norm": 40.5, "kl": 3.8193416595458984, "learning_rate": 5e-07, "logits/chosen": -24658586.666666668, "logits/rejected": -25524956.8, "logps/chosen": -305.8621826171875, "logps/rejected": -209.93134765625, "loss": 0.2036, "rewards/chosen": 1.7219608624776204, "rewards/margins": 3.6925222714742025, "rewards/rejected": -1.970561408996582, "step": 9928 }, { "epoch": 0.5262767338934089, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -614625.1666666666, "logits/rejected": -14890873.6, "logps/chosen": -428.37109375, "logps/rejected": -173.10870361328125, "loss": 0.2448, "rewards/chosen": 0.36700646082560223, "rewards/margins": 2.5465158144632976, "rewards/rejected": -2.1795093536376955, "step": 9929 }, { "epoch": 0.5263297378952111, "grad_norm": 39.5, "kl": 1.3976936340332031, "learning_rate": 5e-07, "logits/chosen": -35941592.0, "logits/rejected": -48645840.0, "logps/chosen": -563.518798828125, "logps/rejected": -511.15087890625, "loss": 0.298, "rewards/chosen": 0.37957438826560974, "rewards/margins": 3.4792252480983734, "rewards/rejected": -3.0996508598327637, "step": 9930 }, { "epoch": 0.5263827418970132, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 158660213.33333334, "logits/rejected": -19072420.8, "logps/chosen": -671.7346598307291, "logps/rejected": -330.640673828125, "loss": 0.2371, "rewards/chosen": 0.16470134258270264, "rewards/margins": 2.8128775358200073, "rewards/rejected": -2.6481761932373047, "step": 9931 }, { "epoch": 0.5264357458988154, "grad_norm": 59.0, "kl": 1.356283187866211, "learning_rate": 5e-07, "logits/chosen": -42564256.0, "logits/rejected": -3150662.5, "logps/chosen": -282.721923828125, "logps/rejected": -221.2878875732422, "loss": 0.2772, "rewards/chosen": 0.7768146395683289, "rewards/margins": 2.6348413825035095, "rewards/rejected": -1.8580267429351807, "step": 9932 }, { "epoch": 0.5264887499006174, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41825322.666666664, "logits/rejected": -19469830.4, "logps/chosen": -187.8670654296875, "logps/rejected": -227.1384765625, "loss": 0.285, "rewards/chosen": 0.4193926652272542, "rewards/margins": 1.9848967393239338, "rewards/rejected": -1.5655040740966797, "step": 9933 }, { "epoch": 0.5265417539024196, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61472988.0, "logits/rejected": -76433736.0, "logps/chosen": -423.25506591796875, "logps/rejected": -366.245849609375, "loss": 0.2865, "rewards/chosen": 0.1691446304321289, "rewards/margins": 2.4396302700042725, "rewards/rejected": -2.2704856395721436, "step": 9934 }, { "epoch": 0.5265947579042217, "grad_norm": 65.0, "kl": 1.979879379272461, "learning_rate": 5e-07, "logits/chosen": -29782777.6, "logits/rejected": 16809517.333333332, "logps/chosen": -687.944921875, "logps/rejected": -747.1240234375, "loss": 0.329, "rewards/chosen": 0.9680723190307617, "rewards/margins": 2.922287368774414, "rewards/rejected": -1.9542150497436523, "step": 9935 }, { "epoch": 0.5266477619060239, "grad_norm": 59.0, "kl": 1.7070188522338867, "learning_rate": 5e-07, "logits/chosen": -10064106.285714285, "logits/rejected": -44979376.0, "logps/chosen": -214.96651785714286, "logps/rejected": -264.3033752441406, "loss": 0.4125, "rewards/chosen": 0.508458035332816, "rewards/margins": 1.817669766289847, "rewards/rejected": -1.3092117309570312, "step": 9936 }, { "epoch": 0.526700765907826, "grad_norm": 66.0, "kl": 1.3790454864501953, "learning_rate": 5e-07, "logits/chosen": -28210240.0, "logits/rejected": -99237520.0, "logps/chosen": -304.66678292410717, "logps/rejected": -415.4572448730469, "loss": 0.4217, "rewards/chosen": 0.2901392323630197, "rewards/margins": 2.6506281239645824, "rewards/rejected": -2.3604888916015625, "step": 9937 }, { "epoch": 0.5267537699096282, "grad_norm": 46.75, "kl": 1.8984088897705078, "learning_rate": 5e-07, "logits/chosen": -7754952.0, "logits/rejected": -16227083.2, "logps/chosen": -305.74717203776044, "logps/rejected": -280.5072265625, "loss": 0.24, "rewards/chosen": 1.4529660542805989, "rewards/margins": 2.7810557683308916, "rewards/rejected": -1.328089714050293, "step": 9938 }, { "epoch": 0.5268067739114303, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17436144.0, "logits/rejected": -19799262.4, "logps/chosen": -281.39300537109375, "logps/rejected": -378.1359375, "loss": 0.2338, "rewards/chosen": 0.6691953341166178, "rewards/margins": 2.5167678515116374, "rewards/rejected": -1.8475725173950195, "step": 9939 }, { "epoch": 0.5268597779132325, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96352112.0, "logits/rejected": -8908786.666666666, "logps/chosen": -684.765869140625, "logps/rejected": -105.74692789713542, "loss": 0.2323, "rewards/chosen": 1.4464936256408691, "rewards/margins": 2.9496838251749677, "rewards/rejected": -1.5031901995340984, "step": 9940 }, { "epoch": 0.5269127819150345, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43858628.0, "logits/rejected": -10643201.0, "logps/chosen": -289.016357421875, "logps/rejected": -255.02027893066406, "loss": 0.2056, "rewards/chosen": 0.7736377716064453, "rewards/margins": 3.389641761779785, "rewards/rejected": -2.61600399017334, "step": 9941 }, { "epoch": 0.5269657859168367, "grad_norm": 58.5, "kl": 3.0197057723999023, "learning_rate": 5e-07, "logits/chosen": -19723016.0, "logits/rejected": -85107336.0, "logps/chosen": -369.095947265625, "logps/rejected": -156.71163940429688, "loss": 0.3411, "rewards/chosen": 0.6231754620869955, "rewards/margins": 5.018274625142415, "rewards/rejected": -4.39509916305542, "step": 9942 }, { "epoch": 0.5270187899186388, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40575602.666666664, "logits/rejected": -37428579.2, "logps/chosen": -371.0953776041667, "logps/rejected": -389.8630859375, "loss": 0.2712, "rewards/chosen": 0.0371876855691274, "rewards/margins": 2.064284815390905, "rewards/rejected": -2.0270971298217773, "step": 9943 }, { "epoch": 0.527071793920441, "grad_norm": 42.5, "kl": 0.8504266738891602, "learning_rate": 5e-07, "logits/chosen": 4144650.5, "logits/rejected": -25398574.0, "logps/chosen": -34.216793060302734, "logps/rejected": -288.16436767578125, "loss": 0.3496, "rewards/chosen": 0.2985493838787079, "rewards/margins": 1.8279282748699188, "rewards/rejected": -1.529378890991211, "step": 9944 }, { "epoch": 0.5271247979222431, "grad_norm": 50.5, "kl": 1.5956592559814453, "learning_rate": 5e-07, "logits/chosen": -4467278.666666667, "logits/rejected": -24607452.8, "logps/chosen": -140.5135498046875, "logps/rejected": -366.6197509765625, "loss": 0.302, "rewards/chosen": 0.3560456434885661, "rewards/margins": 2.019633976618449, "rewards/rejected": -1.6635883331298829, "step": 9945 }, { "epoch": 0.5271778019240453, "grad_norm": 41.5, "kl": 0.6356983184814453, "learning_rate": 5e-07, "logits/chosen": 1417765.6666666667, "logits/rejected": -49907865.6, "logps/chosen": -194.88934326171875, "logps/rejected": -444.817822265625, "loss": 0.2174, "rewards/chosen": 0.6064439614613851, "rewards/margins": 2.59232824643453, "rewards/rejected": -1.9858842849731446, "step": 9946 }, { "epoch": 0.5272308059258474, "grad_norm": 66.0, "kl": 0.3346672058105469, "learning_rate": 5e-07, "logits/chosen": -63100406.85714286, "logits/rejected": -6584555.0, "logps/chosen": -299.7180873325893, "logps/rejected": -129.2494354248047, "loss": 0.4934, "rewards/chosen": 0.006538586957114083, "rewards/margins": 0.45756779823984417, "rewards/rejected": -0.4510292112827301, "step": 9947 }, { "epoch": 0.5272838099276496, "grad_norm": 59.25, "kl": 3.2863998413085938, "learning_rate": 5e-07, "logits/chosen": -25017347.2, "logits/rejected": -29299021.333333332, "logps/chosen": -637.117578125, "logps/rejected": -258.6629231770833, "loss": 0.335, "rewards/chosen": 0.7616125583648682, "rewards/margins": 3.8173408349355062, "rewards/rejected": -3.055728276570638, "step": 9948 }, { "epoch": 0.5273368139294516, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66186964.0, "logits/rejected": -23252373.333333332, "logps/chosen": -537.996337890625, "logps/rejected": -165.58090209960938, "loss": 0.2859, "rewards/chosen": 0.5761963129043579, "rewards/margins": 1.7111432154973347, "rewards/rejected": -1.1349469025929768, "step": 9949 }, { "epoch": 0.5273898179312538, "grad_norm": 56.75, "kl": 0.05132293701171875, "learning_rate": 5e-07, "logits/chosen": -11259260.0, "logits/rejected": 7763451.2, "logps/chosen": -221.47076416015625, "logps/rejected": -205.4283203125, "loss": 0.2821, "rewards/chosen": 0.24142444133758545, "rewards/margins": 1.9433661222457885, "rewards/rejected": -1.701941680908203, "step": 9950 }, { "epoch": 0.5274428219330559, "grad_norm": 38.0, "kl": 0.7033977508544922, "learning_rate": 5e-07, "logits/chosen": -39244272.0, "logits/rejected": -34351093.333333336, "logps/chosen": -230.966748046875, "logps/rejected": -413.5341796875, "loss": 0.3409, "rewards/chosen": 0.13526954650878906, "rewards/margins": 2.436380132039388, "rewards/rejected": -2.301110585530599, "step": 9951 }, { "epoch": 0.5274958259348581, "grad_norm": 43.0, "kl": 0.7211627960205078, "learning_rate": 5e-07, "logits/chosen": -28545017.6, "logits/rejected": -39057018.666666664, "logps/chosen": -323.433447265625, "logps/rejected": -319.5478922526042, "loss": 0.3019, "rewards/chosen": 0.6154004096984863, "rewards/margins": 2.7175662676493326, "rewards/rejected": -2.102165857950846, "step": 9952 }, { "epoch": 0.5275488299366602, "grad_norm": 38.75, "kl": 0.7561931610107422, "learning_rate": 5e-07, "logits/chosen": -22222427.2, "logits/rejected": -73107168.0, "logps/chosen": -154.17296142578124, "logps/rejected": -411.6124674479167, "loss": 0.3174, "rewards/chosen": 0.23463222980499268, "rewards/margins": 2.7303475300470987, "rewards/rejected": -2.495715300242106, "step": 9953 }, { "epoch": 0.5276018339384624, "grad_norm": 39.25, "kl": 0.20515060424804688, "learning_rate": 5e-07, "logits/chosen": 28723536.0, "logits/rejected": -17956270.0, "logps/chosen": -913.32275390625, "logps/rejected": -200.5029296875, "loss": 0.1973, "rewards/chosen": 1.7001597881317139, "rewards/margins": 3.2461389303207397, "rewards/rejected": -1.5459791421890259, "step": 9954 }, { "epoch": 0.5276548379402645, "grad_norm": 40.5, "kl": 0.864410400390625, "learning_rate": 5e-07, "logits/chosen": -7001582.0, "logits/rejected": -23098722.0, "logps/chosen": -282.52777099609375, "logps/rejected": -303.9737854003906, "loss": 0.2472, "rewards/chosen": 0.8001939654350281, "rewards/margins": 2.9749805331230164, "rewards/rejected": -2.1747865676879883, "step": 9955 }, { "epoch": 0.5277078419420667, "grad_norm": 55.75, "kl": 1.4096336364746094, "learning_rate": 5e-07, "logits/chosen": -32727562.666666668, "logits/rejected": -41138233.6, "logps/chosen": -449.3203125, "logps/rejected": -219.774267578125, "loss": 0.2458, "rewards/chosen": 0.10321097572644551, "rewards/margins": 2.481911232074102, "rewards/rejected": -2.3787002563476562, "step": 9956 }, { "epoch": 0.5277608459438687, "grad_norm": 55.75, "kl": 0.6329307556152344, "learning_rate": 5e-07, "logits/chosen": -19364658.285714287, "logits/rejected": -81089888.0, "logps/chosen": -227.90815080915178, "logps/rejected": -636.4253540039062, "loss": 0.5183, "rewards/chosen": -0.29211507524762836, "rewards/margins": 3.0322684219905307, "rewards/rejected": -3.324383497238159, "step": 9957 }, { "epoch": 0.5278138499456709, "grad_norm": 43.0, "kl": 0.09846305847167969, "learning_rate": 5e-07, "logits/chosen": 9969490.0, "logits/rejected": -83015024.0, "logps/chosen": -95.75941467285156, "logps/rejected": -381.6119079589844, "loss": 0.3104, "rewards/chosen": 0.646821916103363, "rewards/margins": 2.1206101775169373, "rewards/rejected": -1.4737882614135742, "step": 9958 }, { "epoch": 0.527866853947473, "grad_norm": 65.5, "kl": 1.7131052017211914, "learning_rate": 5e-07, "logits/chosen": -7579073.6, "logits/rejected": -10789556.0, "logps/chosen": -448.825927734375, "logps/rejected": -164.86439005533853, "loss": 0.3351, "rewards/chosen": 0.979587459564209, "rewards/margins": 1.841537857055664, "rewards/rejected": -0.8619503974914551, "step": 9959 }, { "epoch": 0.5279198579492752, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11236150.4, "logits/rejected": -6264787.333333333, "logps/chosen": -341.3712890625, "logps/rejected": -316.58848063151044, "loss": 0.337, "rewards/chosen": 0.3978787660598755, "rewards/margins": 2.7126251459121704, "rewards/rejected": -2.314746379852295, "step": 9960 }, { "epoch": 0.5279728619510773, "grad_norm": 40.75, "kl": 0.6742887496948242, "learning_rate": 5e-07, "logits/chosen": -18172322.0, "logits/rejected": -15899298.666666666, "logps/chosen": -152.44970703125, "logps/rejected": -305.06313069661456, "loss": 0.2542, "rewards/chosen": 0.2985743284225464, "rewards/margins": 2.0649711688359575, "rewards/rejected": -1.7663968404134114, "step": 9961 }, { "epoch": 0.5280258659528795, "grad_norm": 61.75, "kl": 2.6640195846557617, "learning_rate": 5e-07, "logits/chosen": -3488469.6, "logits/rejected": -25074829.333333332, "logps/chosen": -132.205517578125, "logps/rejected": -290.73093668619794, "loss": 0.339, "rewards/chosen": 0.6103449821472168, "rewards/margins": 2.42231871287028, "rewards/rejected": -1.8119737307230632, "step": 9962 }, { "epoch": 0.5280788699546816, "grad_norm": 55.25, "kl": 0.8548755645751953, "learning_rate": 5e-07, "logits/chosen": -22675728.0, "logits/rejected": -8482430.0, "logps/chosen": -649.6710815429688, "logps/rejected": -76.97178649902344, "loss": 0.2924, "rewards/chosen": 1.076545000076294, "rewards/margins": 1.9751546382904053, "rewards/rejected": -0.8986096382141113, "step": 9963 }, { "epoch": 0.5281318739564838, "grad_norm": 52.25, "kl": 0.2897605895996094, "learning_rate": 5e-07, "logits/chosen": -5483773.5, "logits/rejected": -45025644.0, "logps/chosen": -257.950439453125, "logps/rejected": -648.2471313476562, "loss": 0.2189, "rewards/chosen": 1.0806567668914795, "rewards/margins": 3.862135648727417, "rewards/rejected": -2.7814788818359375, "step": 9964 }, { "epoch": 0.5281848779582858, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68690832.0, "logits/rejected": -8306157.333333333, "logps/chosen": -436.6415100097656, "logps/rejected": -553.2769775390625, "loss": 0.2955, "rewards/chosen": 0.16391143202781677, "rewards/margins": 1.7634696265061696, "rewards/rejected": -1.5995581944783528, "step": 9965 }, { "epoch": 0.528237881960088, "grad_norm": 63.0, "kl": 3.8066368103027344, "learning_rate": 5e-07, "logits/chosen": -12199586.666666666, "logits/rejected": -15416544.0, "logps/chosen": -502.9458821614583, "logps/rejected": -376.930908203125, "loss": 0.3941, "rewards/chosen": 0.5018366972605387, "rewards/margins": 4.272842327753703, "rewards/rejected": -3.771005630493164, "step": 9966 }, { "epoch": 0.5282908859618901, "grad_norm": 50.25, "kl": 2.672056198120117, "learning_rate": 5e-07, "logits/chosen": -55584554.666666664, "logits/rejected": -38338324.0, "logps/chosen": -388.8055826822917, "logps/rejected": -350.97418212890625, "loss": 0.3668, "rewards/chosen": 0.8072663942972819, "rewards/margins": 2.2806245485941568, "rewards/rejected": -1.473358154296875, "step": 9967 }, { "epoch": 0.5283438899636923, "grad_norm": 51.75, "kl": 3.4194979667663574, "learning_rate": 5e-07, "logits/chosen": 9503208.57142857, "logits/rejected": -83906352.0, "logps/chosen": -233.39179338727678, "logps/rejected": -522.0086669921875, "loss": 0.3807, "rewards/chosen": 0.7008108411516462, "rewards/margins": 4.9581107412065775, "rewards/rejected": -4.257299900054932, "step": 9968 }, { "epoch": 0.5283968939654944, "grad_norm": 44.0, "kl": 1.0932998657226562, "learning_rate": 5e-07, "logits/chosen": -8644318.0, "logits/rejected": -27227650.0, "logps/chosen": -136.8524627685547, "logps/rejected": -451.50677490234375, "loss": 0.273, "rewards/chosen": 0.5329298973083496, "rewards/margins": 2.5484726428985596, "rewards/rejected": -2.01554274559021, "step": 9969 }, { "epoch": 0.5284498979672966, "grad_norm": 56.75, "kl": 0.18083953857421875, "learning_rate": 5e-07, "logits/chosen": -27371221.333333332, "logits/rejected": -41410022.4, "logps/chosen": -136.6704305013021, "logps/rejected": -478.97080078125, "loss": 0.3402, "rewards/chosen": -0.2059656778971354, "rewards/margins": 1.2850237210591633, "rewards/rejected": -1.4909893989562988, "step": 9970 }, { "epoch": 0.5285029019690987, "grad_norm": 45.5, "kl": 0.23636293411254883, "learning_rate": 5e-07, "logits/chosen": -29459640.0, "logits/rejected": -26602422.0, "logps/chosen": -210.8697509765625, "logps/rejected": -192.38705444335938, "loss": 0.3375, "rewards/chosen": -0.05298256874084473, "rewards/margins": 1.897074580192566, "rewards/rejected": -1.9500571489334106, "step": 9971 }, { "epoch": 0.5285559059709009, "grad_norm": 47.75, "kl": 0.318695068359375, "learning_rate": 5e-07, "logits/chosen": -91604688.0, "logits/rejected": -8814827.0, "logps/chosen": -572.0487670898438, "logps/rejected": -160.73916625976562, "loss": 0.1876, "rewards/chosen": 1.054836630821228, "rewards/margins": 3.7967132329940796, "rewards/rejected": -2.7418766021728516, "step": 9972 }, { "epoch": 0.5286089099727029, "grad_norm": 37.25, "kl": 2.419316291809082, "learning_rate": 5e-07, "logits/chosen": -11540064.0, "logits/rejected": -962206.6, "logps/chosen": -333.2978108723958, "logps/rejected": -302.3076171875, "loss": 0.2301, "rewards/chosen": 1.227719783782959, "rewards/margins": 2.8825966835021974, "rewards/rejected": -1.6548768997192382, "step": 9973 }, { "epoch": 0.5286619139745051, "grad_norm": 74.5, "kl": 2.0377197265625, "learning_rate": 5e-07, "logits/chosen": -6834043.2, "logits/rejected": -30214664.0, "logps/chosen": -415.67685546875, "logps/rejected": -407.1445719401042, "loss": 0.3727, "rewards/chosen": 0.15040345191955568, "rewards/margins": 2.7293565591176354, "rewards/rejected": -2.5789531071980796, "step": 9974 }, { "epoch": 0.5287149179763072, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44432144.0, "logits/rejected": -20987528.0, "logps/chosen": -379.7565002441406, "logps/rejected": -440.87042236328125, "loss": 0.2446, "rewards/chosen": 0.3779814839363098, "rewards/margins": 3.642046272754669, "rewards/rejected": -3.2640647888183594, "step": 9975 }, { "epoch": 0.5287679219781094, "grad_norm": 50.75, "kl": 1.7424678802490234, "learning_rate": 5e-07, "logits/chosen": -39507146.666666664, "logits/rejected": -25398052.0, "logps/chosen": -351.8346761067708, "logps/rejected": -439.7452392578125, "loss": 0.3929, "rewards/chosen": 0.18575785557428995, "rewards/margins": 4.569688538710277, "rewards/rejected": -4.383930683135986, "step": 9976 }, { "epoch": 0.5288209259799115, "grad_norm": 55.0, "kl": 4.184577941894531, "learning_rate": 5e-07, "logits/chosen": -22615494.4, "logits/rejected": -190467157.33333334, "logps/chosen": -455.5884765625, "logps/rejected": -607.4076741536459, "loss": 0.3942, "rewards/chosen": 0.6032627105712891, "rewards/margins": 3.052944628397624, "rewards/rejected": -2.4496819178263345, "step": 9977 }, { "epoch": 0.5288739299817136, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29859130.666666668, "logits/rejected": 9249921.6, "logps/chosen": -309.3738606770833, "logps/rejected": -250.4323974609375, "loss": 0.2335, "rewards/chosen": 0.8380574385325114, "rewards/margins": 2.7910325209299724, "rewards/rejected": -1.952975082397461, "step": 9978 }, { "epoch": 0.5289269339835158, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50333232.0, "logits/rejected": -46283008.0, "logps/chosen": -298.74570719401044, "logps/rejected": -319.86151123046875, "loss": 0.3578, "rewards/chosen": 0.21056771278381348, "rewards/margins": 2.660045862197876, "rewards/rejected": -2.4494781494140625, "step": 9979 }, { "epoch": 0.5289799379853178, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -33681848.0, "logps/rejected": -317.44537353515625, "loss": 0.1576, "rewards/rejected": -1.8752102851867676, "step": 9980 }, { "epoch": 0.52903294198712, "grad_norm": 40.0, "kl": 0.8457317352294922, "learning_rate": 5e-07, "logits/chosen": -3596163.6, "logits/rejected": -27302557.333333332, "logps/chosen": -170.61705322265624, "logps/rejected": -478.1315511067708, "loss": 0.3257, "rewards/chosen": 0.1731808066368103, "rewards/margins": 2.903744606177012, "rewards/rejected": -2.7305637995402017, "step": 9981 }, { "epoch": 0.5290859459889221, "grad_norm": 35.5, "kl": 0.6585006713867188, "learning_rate": 5e-07, "logits/chosen": -3006005.0, "logits/rejected": -52318389.333333336, "logps/chosen": -121.9717529296875, "logps/rejected": -495.3270670572917, "loss": 0.3337, "rewards/chosen": 0.1083902359008789, "rewards/margins": 3.061158243815104, "rewards/rejected": -2.952768007914225, "step": 9982 }, { "epoch": 0.5291389499907243, "grad_norm": 43.25, "kl": 1.2741775512695312, "learning_rate": 5e-07, "logits/chosen": -22745884.8, "logits/rejected": -14038985.333333334, "logps/chosen": -191.28743896484374, "logps/rejected": -247.8153279622396, "loss": 0.294, "rewards/chosen": 0.5156521320343017, "rewards/margins": 3.4037667433420813, "rewards/rejected": -2.88811461130778, "step": 9983 }, { "epoch": 0.5291919539925264, "grad_norm": 69.0, "kl": 0.2669219970703125, "learning_rate": 5e-07, "logits/chosen": -40783760.0, "logits/rejected": -23405058.0, "logps/chosen": -304.1426188151042, "logps/rejected": -219.25999450683594, "loss": 0.4237, "rewards/chosen": 0.27397892872492474, "rewards/margins": 1.1822278300921123, "rewards/rejected": -0.9082489013671875, "step": 9984 }, { "epoch": 0.5292449579943286, "grad_norm": 88.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26916960.0, "logits/rejected": -9614022.666666666, "logps/chosen": -297.0107421875, "logps/rejected": -109.82810465494792, "loss": 0.3532, "rewards/chosen": -0.022703558206558228, "rewards/margins": 0.8791745603084564, "rewards/rejected": -0.9018781185150146, "step": 9985 }, { "epoch": 0.5292979619961307, "grad_norm": 39.75, "kl": 1.3871650695800781, "learning_rate": 5e-07, "logits/chosen": -6135671.5, "logits/rejected": -25264568.0, "logps/chosen": -302.5005187988281, "logps/rejected": -252.6643524169922, "loss": 0.3115, "rewards/chosen": 0.582817554473877, "rewards/margins": 2.5685510635375977, "rewards/rejected": -1.9857335090637207, "step": 9986 }, { "epoch": 0.5293509659979329, "grad_norm": 45.75, "kl": 2.038808822631836, "learning_rate": 5e-07, "logits/chosen": 20212572.8, "logits/rejected": -16181272.0, "logps/chosen": -170.83876953125, "logps/rejected": -650.1850992838541, "loss": 0.3318, "rewards/chosen": 0.36004390716552737, "rewards/margins": 3.749250793457031, "rewards/rejected": -3.389206886291504, "step": 9987 }, { "epoch": 0.5294039699997349, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12765900.0, "logits/rejected": -36733660.8, "logps/chosen": -390.898193359375, "logps/rejected": -387.65615234375, "loss": 0.2578, "rewards/chosen": -0.032815560698509216, "rewards/margins": 2.462483587861061, "rewards/rejected": -2.49529914855957, "step": 9988 }, { "epoch": 0.5294569740015371, "grad_norm": 44.25, "kl": 1.0030956268310547, "learning_rate": 5e-07, "logits/chosen": -22262008.0, "logits/rejected": -11495390.4, "logps/chosen": -270.0107014973958, "logps/rejected": -390.7380615234375, "loss": 0.1861, "rewards/chosen": 1.2509066263834636, "rewards/margins": 3.8645994822184244, "rewards/rejected": -2.613692855834961, "step": 9989 }, { "epoch": 0.5295099780033392, "grad_norm": 59.5, "kl": 0.39852142333984375, "learning_rate": 5e-07, "logits/chosen": -21711500.0, "logits/rejected": -1165310.5, "logps/chosen": -471.2025146484375, "logps/rejected": -169.87718200683594, "loss": 0.2817, "rewards/chosen": 0.3897644281387329, "rewards/margins": 2.3530489206314087, "rewards/rejected": -1.9632844924926758, "step": 9990 }, { "epoch": 0.5295629820051414, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53294741.333333336, "logits/rejected": -75847334.4, "logps/chosen": -271.05291748046875, "logps/rejected": -613.646484375, "loss": 0.2276, "rewards/chosen": 0.05839132269223531, "rewards/margins": 2.783500804503759, "rewards/rejected": -2.7251094818115233, "step": 9991 }, { "epoch": 0.5296159860069435, "grad_norm": 49.0, "kl": 6.887317657470703, "learning_rate": 5e-07, "logits/chosen": -63547108.0, "logits/rejected": -27810332.0, "logps/chosen": -1068.275390625, "logps/rejected": -276.5792541503906, "loss": 0.2302, "rewards/chosen": 2.1789231300354004, "rewards/margins": 4.306149959564209, "rewards/rejected": -2.1272268295288086, "step": 9992 }, { "epoch": 0.5296689900087457, "grad_norm": 48.0, "kl": 3.887568473815918, "learning_rate": 5e-07, "logits/chosen": -13703597.333333334, "logits/rejected": -9794307.0, "logps/chosen": -200.82171630859375, "logps/rejected": -381.1678466796875, "loss": 0.3428, "rewards/chosen": 1.002810796101888, "rewards/margins": 2.5298783381779986, "rewards/rejected": -1.5270675420761108, "step": 9993 }, { "epoch": 0.5297219940105478, "grad_norm": 50.25, "kl": 0.7085418701171875, "learning_rate": 5e-07, "logits/chosen": -22302244.0, "logits/rejected": -74864032.0, "logps/chosen": -473.6925048828125, "logps/rejected": -331.4031982421875, "loss": 0.2322, "rewards/chosen": 0.5476961135864258, "rewards/margins": 3.0738465785980225, "rewards/rejected": -2.5261504650115967, "step": 9994 }, { "epoch": 0.52977499801235, "grad_norm": 52.0, "kl": 1.172062873840332, "learning_rate": 5e-07, "logits/chosen": -40559901.333333336, "logits/rejected": -33167592.0, "logps/chosen": -328.86334228515625, "logps/rejected": -607.2051391601562, "loss": 0.386, "rewards/chosen": 0.1136467953523, "rewards/margins": 4.692263414462407, "rewards/rejected": -4.578616619110107, "step": 9995 }, { "epoch": 0.529828002014152, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25415018.666666668, "logits/rejected": -27359865.6, "logps/chosen": -292.810302734375, "logps/rejected": -290.311669921875, "loss": 0.2196, "rewards/chosen": 0.6140299638112386, "rewards/margins": 2.519635279973348, "rewards/rejected": -1.9056053161621094, "step": 9996 }, { "epoch": 0.5298810060159542, "grad_norm": 122.0, "kl": 0.40558624267578125, "learning_rate": 5e-07, "logits/chosen": 6029742.0, "logits/rejected": -24027224.0, "logps/chosen": -467.6556701660156, "logps/rejected": -281.75689697265625, "loss": 0.2479, "rewards/chosen": 1.1505331993103027, "rewards/margins": 3.275705575942993, "rewards/rejected": -2.1251723766326904, "step": 9997 }, { "epoch": 0.5299340100177563, "grad_norm": 40.25, "kl": 0.8742923736572266, "learning_rate": 5e-07, "logits/chosen": -21687432.0, "logits/rejected": 1289786.0, "logps/chosen": -358.88262939453125, "logps/rejected": -240.88497924804688, "loss": 0.2695, "rewards/chosen": 1.1776925325393677, "rewards/margins": 2.6208068132400513, "rewards/rejected": -1.4431142807006836, "step": 9998 }, { "epoch": 0.5299870140195585, "grad_norm": 50.75, "kl": 0.9539985656738281, "learning_rate": 5e-07, "logits/chosen": -17550644.0, "logits/rejected": -12084786.666666666, "logps/chosen": -297.6439208984375, "logps/rejected": -356.4012044270833, "loss": 0.2669, "rewards/chosen": 0.14231568574905396, "rewards/margins": 2.0439343253771467, "rewards/rejected": -1.9016186396280925, "step": 9999 }, { "epoch": 0.5300400180213606, "grad_norm": 46.25, "kl": 0.5202417373657227, "learning_rate": 5e-07, "logits/chosen": -41961280.0, "logits/rejected": 44234372.0, "logps/chosen": -295.9639892578125, "logps/rejected": -261.4198913574219, "loss": 0.2699, "rewards/chosen": 0.6520745158195496, "rewards/margins": 2.471380650997162, "rewards/rejected": -1.8193061351776123, "step": 10000 }, { "epoch": 0.5300930220231628, "grad_norm": 49.25, "kl": 1.6548776626586914, "learning_rate": 5e-07, "logits/chosen": -100014504.0, "logits/rejected": -21295100.0, "logps/chosen": -296.8710632324219, "logps/rejected": -286.9469909667969, "loss": 0.3862, "rewards/chosen": -0.11752153933048248, "rewards/margins": 2.09713576734066, "rewards/rejected": -2.2146573066711426, "step": 10001 }, { "epoch": 0.5301460260249649, "grad_norm": 37.0, "kl": 1.985025405883789, "learning_rate": 5e-07, "logits/chosen": 2781420.0, "logits/rejected": -42942169.6, "logps/chosen": -154.55675252278647, "logps/rejected": -181.69119873046876, "loss": 0.3019, "rewards/chosen": 0.24342588583628336, "rewards/margins": 2.6550824562708537, "rewards/rejected": -2.4116565704345705, "step": 10002 }, { "epoch": 0.530199030026767, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81533509.33333333, "logits/rejected": -22696916.8, "logps/chosen": -457.74462890625, "logps/rejected": -291.2984619140625, "loss": 0.2423, "rewards/chosen": 1.084675709406535, "rewards/margins": 2.7596169630686442, "rewards/rejected": -1.6749412536621093, "step": 10003 }, { "epoch": 0.5302520340285691, "grad_norm": 78.5, "kl": 0.3285980224609375, "learning_rate": 5e-07, "logits/chosen": -53997424.0, "logits/rejected": -6769035.5, "logps/chosen": -431.2788899739583, "logps/rejected": -122.58747863769531, "loss": 0.3878, "rewards/chosen": 0.21126302083333334, "rewards/margins": 2.4654813607533774, "rewards/rejected": -2.254218339920044, "step": 10004 }, { "epoch": 0.5303050380303713, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23481584.0, "logits/rejected": -84580160.0, "logps/chosen": -320.771484375, "logps/rejected": -447.83355712890625, "loss": 0.2414, "rewards/chosen": 0.640341579914093, "rewards/margins": 3.3857093453407288, "rewards/rejected": -2.7453677654266357, "step": 10005 }, { "epoch": 0.5303580420321734, "grad_norm": 38.25, "kl": 0.7801284790039062, "learning_rate": 5e-07, "logits/chosen": -17026822.0, "logits/rejected": -61419244.0, "logps/chosen": -251.82110595703125, "logps/rejected": -290.0089416503906, "loss": 0.2703, "rewards/chosen": 0.14820855855941772, "rewards/margins": 3.322709023952484, "rewards/rejected": -3.1745004653930664, "step": 10006 }, { "epoch": 0.5304110460339756, "grad_norm": 49.0, "kl": 0.16198348999023438, "learning_rate": 5e-07, "logits/chosen": -40209842.666666664, "logits/rejected": -2592836.8, "logps/chosen": -488.6156412760417, "logps/rejected": -136.55380859375, "loss": 0.2621, "rewards/chosen": 0.717231273651123, "rewards/margins": 2.5897889137268066, "rewards/rejected": -1.8725576400756836, "step": 10007 }, { "epoch": 0.5304640500357777, "grad_norm": 41.25, "kl": 0.2837991714477539, "learning_rate": 5e-07, "logits/chosen": -55862021.333333336, "logits/rejected": -24966814.4, "logps/chosen": -118.64856974283855, "logps/rejected": -541.490966796875, "loss": 0.2419, "rewards/chosen": 0.47957738240559894, "rewards/margins": 2.647314580281576, "rewards/rejected": -2.1677371978759767, "step": 10008 }, { "epoch": 0.5305170540375799, "grad_norm": 35.75, "kl": 0.5242233276367188, "learning_rate": 5e-07, "logits/chosen": -52050064.0, "logits/rejected": -18218528.0, "logps/chosen": -169.41688028971353, "logps/rejected": -232.431396484375, "loss": 0.2339, "rewards/chosen": 0.3498680194218953, "rewards/margins": 2.4545963366826378, "rewards/rejected": -2.1047283172607423, "step": 10009 }, { "epoch": 0.530570058039382, "grad_norm": 47.5, "kl": 4.378444671630859, "learning_rate": 5e-07, "logits/chosen": -39623408.0, "logits/rejected": -23484448.0, "logps/chosen": -726.427001953125, "logps/rejected": -222.065234375, "loss": 0.3045, "rewards/chosen": 1.0910993417104085, "rewards/margins": 2.1841146310170494, "rewards/rejected": -1.0930152893066407, "step": 10010 }, { "epoch": 0.5306230620411841, "grad_norm": 70.0, "kl": 2.1663994789123535, "learning_rate": 5e-07, "logits/chosen": -25227854.0, "logits/rejected": -33574112.0, "logps/chosen": -351.12982177734375, "logps/rejected": -551.352294921875, "loss": 0.3077, "rewards/chosen": 0.4432070553302765, "rewards/margins": 2.5309245884418488, "rewards/rejected": -2.0877175331115723, "step": 10011 }, { "epoch": 0.5306760660429862, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45603370.666666664, "logits/rejected": -41939491.2, "logps/chosen": -398.7853597005208, "logps/rejected": -429.22880859375, "loss": 0.1876, "rewards/chosen": 0.8696635564168295, "rewards/margins": 3.3961615880330407, "rewards/rejected": -2.526498031616211, "step": 10012 }, { "epoch": 0.5307290700447884, "grad_norm": 33.0, "kl": 1.1134529113769531, "learning_rate": 5e-07, "logits/chosen": -5242242.0, "logits/rejected": -29887644.0, "logps/chosen": -94.37138366699219, "logps/rejected": -301.96978759765625, "loss": 0.287, "rewards/chosen": -0.009478230029344559, "rewards/margins": 4.422011714428663, "rewards/rejected": -4.431489944458008, "step": 10013 }, { "epoch": 0.5307820740465905, "grad_norm": 39.25, "kl": 0.6757698059082031, "learning_rate": 5e-07, "logits/chosen": -8257498.666666667, "logits/rejected": -2045176.0, "logps/chosen": -200.2452392578125, "logps/rejected": -119.98634033203125, "loss": 0.2368, "rewards/chosen": 0.9251721700032552, "rewards/margins": 2.343984349568685, "rewards/rejected": -1.4188121795654296, "step": 10014 }, { "epoch": 0.5308350780483927, "grad_norm": 42.5, "kl": 1.2689399719238281, "learning_rate": 5e-07, "logits/chosen": -19847930.0, "logits/rejected": -31064000.0, "logps/chosen": -182.2245330810547, "logps/rejected": -229.37445068359375, "loss": 0.3115, "rewards/chosen": 0.12324409931898117, "rewards/margins": 2.108606867492199, "rewards/rejected": -1.9853627681732178, "step": 10015 }, { "epoch": 0.5308880820501948, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54860636.0, "logits/rejected": -41178176.0, "logps/chosen": -481.45147705078125, "logps/rejected": -479.2738444010417, "loss": 0.2202, "rewards/chosen": -0.2262725830078125, "rewards/margins": 2.360621611277262, "rewards/rejected": -2.5868941942850747, "step": 10016 }, { "epoch": 0.530941086051997, "grad_norm": 50.75, "kl": 0.011383056640625, "learning_rate": 5e-07, "logits/chosen": 2135140.0, "logits/rejected": -52517226.666666664, "logps/chosen": -258.926123046875, "logps/rejected": -355.6170247395833, "loss": 0.2923, "rewards/chosen": 0.5661257743835449, "rewards/margins": 2.3718365669250487, "rewards/rejected": -1.805710792541504, "step": 10017 }, { "epoch": 0.530994090053799, "grad_norm": 42.5, "kl": 1.6151800155639648, "learning_rate": 5e-07, "logits/chosen": -21093753.6, "logits/rejected": -30335405.333333332, "logps/chosen": -188.55791015625, "logps/rejected": -359.7974446614583, "loss": 0.3029, "rewards/chosen": 0.6485314846038819, "rewards/margins": 3.087877162297567, "rewards/rejected": -2.439345677693685, "step": 10018 }, { "epoch": 0.5310470940556012, "grad_norm": 50.5, "kl": 3.2672958374023438, "learning_rate": 5e-07, "logits/chosen": -54619648.0, "logits/rejected": -59607664.0, "logps/chosen": -440.56728515625, "logps/rejected": -545.5576171875, "loss": 0.294, "rewards/chosen": 0.8721231460571289, "rewards/margins": 3.5358823776245116, "rewards/rejected": -2.663759231567383, "step": 10019 }, { "epoch": 0.5311000980574033, "grad_norm": 68.0, "kl": 0.6897048950195312, "learning_rate": 5e-07, "logits/chosen": -4288030.4, "logits/rejected": -28958322.666666668, "logps/chosen": -518.54423828125, "logps/rejected": -176.96647135416666, "loss": 0.3059, "rewards/chosen": 0.6334081649780273, "rewards/margins": 2.1688927968343097, "rewards/rejected": -1.5354846318562825, "step": 10020 }, { "epoch": 0.5311531020592055, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39511488.0, "logits/rejected": -46641192.0, "logps/chosen": -318.3314514160156, "logps/rejected": -427.2418212890625, "loss": 0.2596, "rewards/chosen": 0.5189220309257507, "rewards/margins": 2.5638315081596375, "rewards/rejected": -2.0449094772338867, "step": 10021 }, { "epoch": 0.5312061060610076, "grad_norm": 34.5, "kl": 1.1452207565307617, "learning_rate": 5e-07, "logits/chosen": -6155718.5, "logits/rejected": -44863248.0, "logps/chosen": -144.75270080566406, "logps/rejected": -308.78399658203125, "loss": 0.2844, "rewards/chosen": 0.4135199785232544, "rewards/margins": 2.9032739400863647, "rewards/rejected": -2.4897539615631104, "step": 10022 }, { "epoch": 0.5312591100628098, "grad_norm": 39.0, "kl": 4.891188621520996, "learning_rate": 5e-07, "logits/chosen": -28870272.0, "logits/rejected": -18787900.0, "logps/chosen": -472.192333984375, "logps/rejected": -355.6923828125, "loss": 0.2624, "rewards/chosen": 1.4745463371276855, "rewards/margins": 3.988427702585856, "rewards/rejected": -2.5138813654581704, "step": 10023 }, { "epoch": 0.5313121140646119, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -118844712.0, "logits/rejected": -12183173.714285715, "logps/chosen": -639.61181640625, "logps/rejected": -347.6029575892857, "loss": 0.1249, "rewards/chosen": 0.852325439453125, "rewards/margins": 3.426872525896345, "rewards/rejected": -2.57454708644322, "step": 10024 }, { "epoch": 0.5313651180664141, "grad_norm": 57.75, "kl": 3.030181884765625, "learning_rate": 5e-07, "logits/chosen": -52950629.333333336, "logits/rejected": -36863481.6, "logps/chosen": -348.0161539713542, "logps/rejected": -258.1751708984375, "loss": 0.3599, "rewards/chosen": 0.04578756292661031, "rewards/margins": 1.070752849181493, "rewards/rejected": -1.0249652862548828, "step": 10025 }, { "epoch": 0.5314181220682161, "grad_norm": 44.5, "kl": 0.38758087158203125, "learning_rate": 5e-07, "logits/chosen": -18100516.0, "logits/rejected": -13045540.8, "logps/chosen": -152.4107869466146, "logps/rejected": -211.67626953125, "loss": 0.263, "rewards/chosen": 0.31371094783147174, "rewards/margins": 2.2080459316571552, "rewards/rejected": -1.8943349838256835, "step": 10026 }, { "epoch": 0.5314711260700183, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29291349.333333332, "logits/rejected": 4808559.2, "logps/chosen": -434.8798828125, "logps/rejected": -388.43115234375, "loss": 0.2213, "rewards/chosen": 0.9484639962514242, "rewards/margins": 3.846524222691854, "rewards/rejected": -2.8980602264404296, "step": 10027 }, { "epoch": 0.5315241300718204, "grad_norm": 39.5, "kl": 0.1270923614501953, "learning_rate": 5e-07, "logits/chosen": 7713164.8, "logits/rejected": -42255922.666666664, "logps/chosen": -127.393115234375, "logps/rejected": -278.010009765625, "loss": 0.3034, "rewards/chosen": 0.5047872543334961, "rewards/margins": 2.77486956914266, "rewards/rejected": -2.2700823148091636, "step": 10028 }, { "epoch": 0.5315771340736225, "grad_norm": 57.25, "kl": 1.7782402038574219, "learning_rate": 5e-07, "logits/chosen": -9590456.0, "logits/rejected": -65075642.666666664, "logps/chosen": -248.8367919921875, "logps/rejected": -499.33251953125, "loss": 0.3441, "rewards/chosen": 0.42641444206237794, "rewards/margins": 3.1526374340057375, "rewards/rejected": -2.7262229919433594, "step": 10029 }, { "epoch": 0.5316301380754247, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66853568.0, "logits/rejected": -54493514.666666664, "logps/chosen": -1173.8414306640625, "logps/rejected": -491.6978759765625, "loss": 0.1634, "rewards/chosen": 1.275579810142517, "rewards/margins": 3.7035545110702515, "rewards/rejected": -2.4279747009277344, "step": 10030 }, { "epoch": 0.5316831420772268, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19096282.666666668, "logits/rejected": -7426937.6, "logps/chosen": -408.123779296875, "logps/rejected": -300.081640625, "loss": 0.2199, "rewards/chosen": 1.022234598795573, "rewards/margins": 2.7560280481974284, "rewards/rejected": -1.7337934494018554, "step": 10031 }, { "epoch": 0.531736146079029, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106805712.0, "logits/rejected": -50308229.333333336, "logps/chosen": -311.3055419921875, "logps/rejected": -590.44482421875, "loss": 0.154, "rewards/chosen": 0.4987274408340454, "rewards/margins": 3.586420019467672, "rewards/rejected": -3.0876925786336265, "step": 10032 }, { "epoch": 0.531789150080831, "grad_norm": 34.5, "kl": 0.9538493156433105, "learning_rate": 5e-07, "logits/chosen": -1924500.5, "logits/rejected": -43737411.2, "logps/chosen": -153.10148111979166, "logps/rejected": -231.581005859375, "loss": 0.2245, "rewards/chosen": 0.7516036033630371, "rewards/margins": 2.740742015838623, "rewards/rejected": -1.989138412475586, "step": 10033 }, { "epoch": 0.5318421540826332, "grad_norm": 51.0, "kl": 2.7140274047851562, "learning_rate": 5e-07, "logits/chosen": -17633868.0, "logits/rejected": 2785024.5, "logps/chosen": -380.48974609375, "logps/rejected": -99.08497619628906, "loss": 0.3435, "rewards/chosen": 0.6588617960611979, "rewards/margins": 3.008502165476481, "rewards/rejected": -2.349640369415283, "step": 10034 }, { "epoch": 0.5318951580844353, "grad_norm": 48.5, "kl": 0.7682819366455078, "learning_rate": 5e-07, "logits/chosen": -56403296.0, "logits/rejected": -24640590.4, "logps/chosen": -239.37674967447916, "logps/rejected": -223.0761474609375, "loss": 0.302, "rewards/chosen": -0.13901889324188232, "rewards/margins": 1.5617766618728637, "rewards/rejected": -1.700795555114746, "step": 10035 }, { "epoch": 0.5319481620862375, "grad_norm": 68.5, "kl": 1.5166168212890625, "learning_rate": 5e-07, "logits/chosen": 11269793.0, "logits/rejected": -72000752.0, "logps/chosen": -455.47406005859375, "logps/rejected": -414.3247375488281, "loss": 0.2345, "rewards/chosen": 1.1351134777069092, "rewards/margins": 3.1808509826660156, "rewards/rejected": -2.0457375049591064, "step": 10036 }, { "epoch": 0.5320011660880396, "grad_norm": 55.5, "kl": 3.153689384460449, "learning_rate": 5e-07, "logits/chosen": -23777186.0, "logits/rejected": -7942445.5, "logps/chosen": -511.57464599609375, "logps/rejected": -256.2358703613281, "loss": 0.229, "rewards/chosen": 1.7680636644363403, "rewards/margins": 3.7703269720077515, "rewards/rejected": -2.002263307571411, "step": 10037 }, { "epoch": 0.5320541700898418, "grad_norm": 43.25, "kl": 0.8880825042724609, "learning_rate": 5e-07, "logits/chosen": -33658148.0, "logits/rejected": -1802554.0, "logps/chosen": -375.9619140625, "logps/rejected": -226.64671325683594, "loss": 0.1873, "rewards/chosen": 1.4043068885803223, "rewards/margins": 4.065121412277222, "rewards/rejected": -2.6608145236968994, "step": 10038 }, { "epoch": 0.5321071740916439, "grad_norm": 51.25, "kl": 2.396524429321289, "learning_rate": 5e-07, "logits/chosen": -49399955.2, "logits/rejected": -6199538.0, "logps/chosen": -541.9984375, "logps/rejected": -94.72993977864583, "loss": 0.3046, "rewards/chosen": 0.5249083042144775, "rewards/margins": 3.146333614985148, "rewards/rejected": -2.6214253107706704, "step": 10039 }, { "epoch": 0.5321601780934461, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3132256.0, "logits/rejected": -23826440.0, "logps/chosen": -225.59039306640625, "logps/rejected": -330.742431640625, "loss": 0.2159, "rewards/chosen": -0.3433540463447571, "rewards/margins": 2.4058645764986673, "rewards/rejected": -2.7492186228434243, "step": 10040 }, { "epoch": 0.5322131820952481, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68174714.66666667, "logits/rejected": -9214411.0, "logps/chosen": -404.5176595052083, "logps/rejected": -61.838375091552734, "loss": 0.3969, "rewards/chosen": 0.10994913180669148, "rewards/margins": 1.9644253452618916, "rewards/rejected": -1.8544762134552002, "step": 10041 }, { "epoch": 0.5322661860970503, "grad_norm": 47.75, "kl": 0.6787185668945312, "learning_rate": 5e-07, "logits/chosen": -42530346.666666664, "logits/rejected": -24737326.4, "logps/chosen": -357.5758463541667, "logps/rejected": -377.4715087890625, "loss": 0.2014, "rewards/chosen": 0.5955708821614584, "rewards/margins": 3.44307492574056, "rewards/rejected": -2.8475040435791015, "step": 10042 }, { "epoch": 0.5323191900988524, "grad_norm": 67.5, "kl": 0.5171241760253906, "learning_rate": 5e-07, "logits/chosen": -19279278.0, "logits/rejected": -8321881.0, "logps/chosen": -343.6754150390625, "logps/rejected": -255.052978515625, "loss": 0.3291, "rewards/chosen": 0.37737780809402466, "rewards/margins": 1.616399347782135, "rewards/rejected": -1.2390215396881104, "step": 10043 }, { "epoch": 0.5323721941006546, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10448456.666666666, "logits/rejected": -2925008.0, "logps/chosen": -128.2093302408854, "logps/rejected": -665.0974609375, "loss": 0.1642, "rewards/chosen": 0.9082156817118326, "rewards/margins": 4.376760069529215, "rewards/rejected": -3.4685443878173827, "step": 10044 }, { "epoch": 0.5324251981024567, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57129792.0, "logits/rejected": 1115829.3333333333, "logps/chosen": -300.95947265625, "logps/rejected": -233.28715006510416, "loss": 0.3754, "rewards/chosen": 0.24979887008666993, "rewards/margins": 1.362604554494222, "rewards/rejected": -1.112805684407552, "step": 10045 }, { "epoch": 0.5324782021042589, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1940316.75, "logits/rejected": -37913154.666666664, "logps/chosen": -106.06597900390625, "logps/rejected": -427.5352376302083, "loss": 0.2476, "rewards/chosen": -0.25686272978782654, "rewards/margins": 1.9983907441298165, "rewards/rejected": -2.255253473917643, "step": 10046 }, { "epoch": 0.532531206106061, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22677438.0, "logits/rejected": -21642002.0, "logps/chosen": -355.2740173339844, "logps/rejected": -221.15374755859375, "loss": 0.2522, "rewards/chosen": 0.7644439935684204, "rewards/margins": 2.7853938341140747, "rewards/rejected": -2.0209498405456543, "step": 10047 }, { "epoch": 0.5325842101078632, "grad_norm": 38.5, "kl": 2.5696964263916016, "learning_rate": 5e-07, "logits/chosen": -25401232.0, "logits/rejected": -14028353.0, "logps/chosen": -316.5965169270833, "logps/rejected": -216.77667236328125, "loss": 0.3179, "rewards/chosen": 0.953404426574707, "rewards/margins": 2.985220193862915, "rewards/rejected": -2.031815767288208, "step": 10048 }, { "epoch": 0.5326372141096652, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39681048.0, "logits/rejected": -34090582.4, "logps/chosen": -282.62255859375, "logps/rejected": -555.40810546875, "loss": 0.2263, "rewards/chosen": 0.17943725983301798, "rewards/margins": 3.3365463296572364, "rewards/rejected": -3.1571090698242186, "step": 10049 }, { "epoch": 0.5326902181114674, "grad_norm": 61.0, "kl": 0.7363357543945312, "learning_rate": 5e-07, "logits/chosen": -23882770.0, "logits/rejected": -44643712.0, "logps/chosen": -287.62261962890625, "logps/rejected": -276.45489501953125, "loss": 0.3132, "rewards/chosen": 0.18883728981018066, "rewards/margins": 2.13216233253479, "rewards/rejected": -1.9433250427246094, "step": 10050 }, { "epoch": 0.5327432221132695, "grad_norm": 57.75, "kl": 3.1503610610961914, "learning_rate": 5e-07, "logits/chosen": -14353068.0, "logits/rejected": -18454280.0, "logps/chosen": -291.422607421875, "logps/rejected": -348.19818115234375, "loss": 0.2798, "rewards/chosen": 1.2878282864888508, "rewards/margins": 3.790470441182454, "rewards/rejected": -2.5026421546936035, "step": 10051 }, { "epoch": 0.5327962261150717, "grad_norm": 44.25, "kl": 0.657958984375, "learning_rate": 5e-07, "logits/chosen": -31042053.333333332, "logits/rejected": -71829088.0, "logps/chosen": -247.95263671875, "logps/rejected": -662.4412231445312, "loss": 0.3247, "rewards/chosen": 0.48529966672261554, "rewards/margins": 3.2191863854726157, "rewards/rejected": -2.73388671875, "step": 10052 }, { "epoch": 0.5328492301168738, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20188158.666666668, "logits/rejected": -8958095.2, "logps/chosen": -319.17384847005206, "logps/rejected": -175.8101806640625, "loss": 0.2426, "rewards/chosen": 0.20308717091878256, "rewards/margins": 2.5074326833089193, "rewards/rejected": -2.3043455123901366, "step": 10053 }, { "epoch": 0.532902234118676, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3514440.5, "logits/rejected": -19304240.0, "logps/chosen": -130.0654754638672, "logps/rejected": -282.71493094308033, "loss": 0.1636, "rewards/chosen": -0.1515243500471115, "rewards/margins": 2.5335369684866498, "rewards/rejected": -2.6850613185337613, "step": 10054 }, { "epoch": 0.5329552381204781, "grad_norm": 63.0, "kl": 1.8478050231933594, "learning_rate": 5e-07, "logits/chosen": -9734101.6, "logits/rejected": -118742389.33333333, "logps/chosen": -354.98779296875, "logps/rejected": -562.1328125, "loss": 0.1859, "rewards/chosen": 1.2990010261535645, "rewards/margins": 3.9397633552551268, "rewards/rejected": -2.6407623291015625, "step": 10055 }, { "epoch": 0.5330082421222803, "grad_norm": 39.0, "kl": 2.9863739013671875, "learning_rate": 5e-07, "logits/chosen": -12599989.0, "logits/rejected": -45392064.0, "logps/chosen": -188.38330078125, "logps/rejected": -429.5748291015625, "loss": 0.3454, "rewards/chosen": 0.21604615449905396, "rewards/margins": 2.723997414112091, "rewards/rejected": -2.507951259613037, "step": 10056 }, { "epoch": 0.5330612461240823, "grad_norm": 40.75, "kl": 0.5250644683837891, "learning_rate": 5e-07, "logits/chosen": -1639429.5, "logits/rejected": -17554289.333333332, "logps/chosen": -78.75550079345703, "logps/rejected": -252.7088419596354, "loss": 0.3168, "rewards/chosen": -0.35433894395828247, "rewards/margins": 1.2317697405815125, "rewards/rejected": -1.586108684539795, "step": 10057 }, { "epoch": 0.5331142501258845, "grad_norm": 56.0, "kl": 2.4064674377441406, "learning_rate": 5e-07, "logits/chosen": -39445641.14285714, "logits/rejected": -44623504.0, "logps/chosen": -240.80728585379464, "logps/rejected": -925.713134765625, "loss": 0.4429, "rewards/chosen": 0.2642104796000889, "rewards/margins": 3.796559112412589, "rewards/rejected": -3.5323486328125, "step": 10058 }, { "epoch": 0.5331672541276866, "grad_norm": 54.75, "kl": 1.5570068359375, "learning_rate": 5e-07, "logits/chosen": -51863552.0, "logits/rejected": -49972869.333333336, "logps/chosen": -323.9603271484375, "logps/rejected": -552.2022298177084, "loss": 0.3202, "rewards/chosen": 0.5360913276672363, "rewards/margins": 2.480404059092204, "rewards/rejected": -1.9443127314249675, "step": 10059 }, { "epoch": 0.5332202581294888, "grad_norm": 51.75, "kl": 4.107200622558594, "learning_rate": 5e-07, "logits/chosen": -22260006.666666668, "logits/rejected": -48775398.4, "logps/chosen": -602.8047688802084, "logps/rejected": -432.565576171875, "loss": 0.1758, "rewards/chosen": 1.9394276936848958, "rewards/margins": 4.617663892110189, "rewards/rejected": -2.678236198425293, "step": 10060 }, { "epoch": 0.5332732621312909, "grad_norm": 48.25, "kl": 2.84149169921875, "learning_rate": 5e-07, "logits/chosen": -12762352.0, "logits/rejected": -27003366.0, "logps/chosen": -216.633056640625, "logps/rejected": -251.376220703125, "loss": 0.3985, "rewards/chosen": 0.3353283405303955, "rewards/margins": 3.074169635772705, "rewards/rejected": -2.7388412952423096, "step": 10061 }, { "epoch": 0.5333262661330931, "grad_norm": 79.0, "kl": 4.829826354980469, "learning_rate": 5e-07, "logits/chosen": -9094694.4, "logits/rejected": -46708906.666666664, "logps/chosen": -306.9482421875, "logps/rejected": -310.1077880859375, "loss": 0.2994, "rewards/chosen": 1.317458438873291, "rewards/margins": 3.048643398284912, "rewards/rejected": -1.731184959411621, "step": 10062 }, { "epoch": 0.5333792701348952, "grad_norm": 55.75, "kl": 1.7094039916992188, "learning_rate": 5e-07, "logits/chosen": -61824640.0, "logits/rejected": 1453803.0, "logps/chosen": -373.65911865234375, "logps/rejected": -375.84454345703125, "loss": 0.2829, "rewards/chosen": 0.5328572392463684, "rewards/margins": 2.620458662509918, "rewards/rejected": -2.08760142326355, "step": 10063 }, { "epoch": 0.5334322741366974, "grad_norm": 44.5, "kl": 0.22721099853515625, "learning_rate": 5e-07, "logits/chosen": -14154465.6, "logits/rejected": -4243108.666666667, "logps/chosen": -361.85537109375, "logps/rejected": -465.7316487630208, "loss": 0.2906, "rewards/chosen": 0.9853928565979004, "rewards/margins": 3.7321115811665857, "rewards/rejected": -2.746718724568685, "step": 10064 }, { "epoch": 0.5334852781384994, "grad_norm": 39.5, "kl": 5.27390193939209, "learning_rate": 5e-07, "logits/chosen": -22711446.4, "logits/rejected": 4180350.0, "logps/chosen": -291.7501708984375, "logps/rejected": -65.47363789876302, "loss": 0.3472, "rewards/chosen": 0.9676074028015137, "rewards/margins": 2.5178700129191083, "rewards/rejected": -1.5502626101175945, "step": 10065 }, { "epoch": 0.5335382821403016, "grad_norm": 49.0, "kl": 0.8749771118164062, "learning_rate": 5e-07, "logits/chosen": -17695069.333333332, "logits/rejected": -68019264.0, "logps/chosen": -339.8113199869792, "logps/rejected": -325.698876953125, "loss": 0.2354, "rewards/chosen": 0.7088118394215902, "rewards/margins": 2.4254656632741294, "rewards/rejected": -1.716653823852539, "step": 10066 }, { "epoch": 0.5335912861421037, "grad_norm": 42.5, "kl": 0.004871368408203125, "learning_rate": 5e-07, "logits/chosen": -35061024.0, "logits/rejected": -40657168.0, "logps/chosen": -321.29656982421875, "logps/rejected": -401.1802062988281, "loss": 0.2255, "rewards/chosen": 0.9117685556411743, "rewards/margins": 2.9322065114974976, "rewards/rejected": -2.0204379558563232, "step": 10067 }, { "epoch": 0.5336442901439059, "grad_norm": 63.25, "kl": 2.2171106338500977, "learning_rate": 5e-07, "logits/chosen": -33510018.666666668, "logits/rejected": -18210956.0, "logps/chosen": -316.3617350260417, "logps/rejected": -118.92570495605469, "loss": 0.3707, "rewards/chosen": 0.5673206647237142, "rewards/margins": 2.6019512017567954, "rewards/rejected": -2.034630537033081, "step": 10068 }, { "epoch": 0.533697294145708, "grad_norm": 57.75, "kl": 0.27740478515625, "learning_rate": 5e-07, "logits/chosen": -46232915.2, "logits/rejected": -52239136.0, "logps/chosen": -437.224951171875, "logps/rejected": -232.030517578125, "loss": 0.3036, "rewards/chosen": 0.3931134223937988, "rewards/margins": 3.2812541007995604, "rewards/rejected": -2.8881406784057617, "step": 10069 }, { "epoch": 0.5337502981475102, "grad_norm": 52.75, "kl": 1.264103889465332, "learning_rate": 5e-07, "logits/chosen": -18920436.0, "logits/rejected": -138972688.0, "logps/chosen": -285.2941487630208, "logps/rejected": -286.6011962890625, "loss": 0.3338, "rewards/chosen": 0.7644868691762289, "rewards/margins": 2.1983500321706138, "rewards/rejected": -1.4338631629943848, "step": 10070 }, { "epoch": 0.5338033021493123, "grad_norm": 49.0, "kl": 1.3139915466308594, "learning_rate": 5e-07, "logits/chosen": -11328893.333333334, "logits/rejected": -31169376.0, "logps/chosen": -315.65576171875, "logps/rejected": -35.650115966796875, "loss": 0.4107, "rewards/chosen": 0.7348731358846029, "rewards/margins": 1.1132021447022757, "rewards/rejected": -0.37832900881767273, "step": 10071 }, { "epoch": 0.5338563061511145, "grad_norm": 58.75, "kl": 1.7649164199829102, "learning_rate": 5e-07, "logits/chosen": -23609282.666666668, "logits/rejected": -23690054.0, "logps/chosen": -388.7584635416667, "logps/rejected": -104.03731536865234, "loss": 0.4759, "rewards/chosen": 0.11443877220153809, "rewards/margins": 0.9160938858985901, "rewards/rejected": -0.801655113697052, "step": 10072 }, { "epoch": 0.5339093101529165, "grad_norm": 53.0, "kl": 0.16359519958496094, "learning_rate": 5e-07, "logits/chosen": -41649472.0, "logits/rejected": -1778924.2, "logps/chosen": -258.15625, "logps/rejected": -268.554443359375, "loss": 0.3104, "rewards/chosen": 0.1866193413734436, "rewards/margins": 1.8301368355751038, "rewards/rejected": -1.6435174942016602, "step": 10073 }, { "epoch": 0.5339623141547187, "grad_norm": 48.75, "kl": 1.1831302642822266, "learning_rate": 5e-07, "logits/chosen": -26624640.0, "logits/rejected": -15593644.0, "logps/chosen": -383.380859375, "logps/rejected": -199.2120361328125, "loss": 0.3214, "rewards/chosen": 0.37660789489746094, "rewards/margins": 2.0922956466674805, "rewards/rejected": -1.7156877517700195, "step": 10074 }, { "epoch": 0.5340153181565208, "grad_norm": 58.25, "kl": 2.842266082763672, "learning_rate": 5e-07, "logits/chosen": -24104960.0, "logits/rejected": -5340781.5, "logps/chosen": -388.9141031901042, "logps/rejected": -109.97238159179688, "loss": 0.3803, "rewards/chosen": 0.3868749141693115, "rewards/margins": 3.892970323562622, "rewards/rejected": -3.5060954093933105, "step": 10075 }, { "epoch": 0.534068322158323, "grad_norm": 43.0, "kl": 2.234142303466797, "learning_rate": 5e-07, "logits/chosen": -45823088.0, "logits/rejected": -41100355.2, "logps/chosen": -957.052734375, "logps/rejected": -315.192138671875, "loss": 0.2114, "rewards/chosen": 1.3972320556640625, "rewards/margins": 3.2910697937011717, "rewards/rejected": -1.8938377380371094, "step": 10076 }, { "epoch": 0.5341213261601251, "grad_norm": 40.25, "kl": 1.2975387573242188, "learning_rate": 5e-07, "logits/chosen": -32726268.8, "logits/rejected": -21650310.666666668, "logps/chosen": -298.3940185546875, "logps/rejected": -357.5594482421875, "loss": 0.2385, "rewards/chosen": 1.018308734893799, "rewards/margins": 3.509008502960205, "rewards/rejected": -2.4906997680664062, "step": 10077 }, { "epoch": 0.5341743301619273, "grad_norm": 49.0, "kl": 0.6784267425537109, "learning_rate": 5e-07, "logits/chosen": -14472808.0, "logits/rejected": -38249344.0, "logps/chosen": -329.9295247395833, "logps/rejected": -327.817919921875, "loss": 0.2376, "rewards/chosen": 0.5713876883188883, "rewards/margins": 2.6801897207895915, "rewards/rejected": -2.1088020324707033, "step": 10078 }, { "epoch": 0.5342273341637294, "grad_norm": 40.25, "kl": 1.2824649810791016, "learning_rate": 5e-07, "logits/chosen": -32237254.4, "logits/rejected": -37922242.666666664, "logps/chosen": -179.38564453125, "logps/rejected": -451.2007649739583, "loss": 0.3424, "rewards/chosen": 0.22685465812683106, "rewards/margins": 3.370186185836792, "rewards/rejected": -3.143331527709961, "step": 10079 }, { "epoch": 0.5342803381655314, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44547045.333333336, "logits/rejected": -25991408.0, "logps/chosen": -299.9707438151042, "logps/rejected": -231.45517578125, "loss": 0.3183, "rewards/chosen": 0.2199190855026245, "rewards/margins": 1.5627363920211792, "rewards/rejected": -1.3428173065185547, "step": 10080 }, { "epoch": 0.5343333421673336, "grad_norm": 43.75, "kl": 0.3434638977050781, "learning_rate": 5e-07, "logits/chosen": 6230206.5, "logits/rejected": -62113801.14285714, "logps/chosen": -65.79893493652344, "logps/rejected": -323.2207728794643, "loss": 0.1743, "rewards/chosen": 0.6441749930381775, "rewards/margins": 3.0357820051056996, "rewards/rejected": -2.391607012067522, "step": 10081 }, { "epoch": 0.5343863461691357, "grad_norm": 49.25, "kl": 1.1877613067626953, "learning_rate": 5e-07, "logits/chosen": -13198584.0, "logits/rejected": 2174039.0, "logps/chosen": -214.7668253580729, "logps/rejected": -202.65115356445312, "loss": 0.3541, "rewards/chosen": 0.6648481289545695, "rewards/margins": 1.9055234591166177, "rewards/rejected": -1.2406753301620483, "step": 10082 }, { "epoch": 0.5344393501709379, "grad_norm": 62.0, "kl": 0.04645538330078125, "learning_rate": 5e-07, "logits/chosen": 107548160.0, "logits/rejected": -8783244.0, "logps/chosen": -305.4364929199219, "logps/rejected": -245.3909149169922, "loss": 0.3414, "rewards/chosen": 0.12202168256044388, "rewards/margins": 1.7226178720593452, "rewards/rejected": -1.6005961894989014, "step": 10083 }, { "epoch": 0.53449235417274, "grad_norm": 44.5, "kl": 0.06658172607421875, "learning_rate": 5e-07, "logits/chosen": -36578760.0, "logits/rejected": -58908632.0, "logps/chosen": -595.3600463867188, "logps/rejected": -492.7741394042969, "loss": 0.234, "rewards/chosen": 0.7326675653457642, "rewards/margins": 3.414943814277649, "rewards/rejected": -2.6822762489318848, "step": 10084 }, { "epoch": 0.5345453581745422, "grad_norm": 43.75, "kl": 0.3760519027709961, "learning_rate": 5e-07, "logits/chosen": -22348556.8, "logits/rejected": -7725841.333333333, "logps/chosen": -181.65660400390624, "logps/rejected": -240.62544759114584, "loss": 0.3768, "rewards/chosen": 0.15655171871185303, "rewards/margins": 1.458152174949646, "rewards/rejected": -1.301600456237793, "step": 10085 }, { "epoch": 0.5345983621763443, "grad_norm": 46.0, "kl": 1.4669570922851562, "learning_rate": 5e-07, "logits/chosen": -4788411.333333333, "logits/rejected": -19502028.8, "logps/chosen": -146.5418497721354, "logps/rejected": -306.524365234375, "loss": 0.3109, "rewards/chosen": 0.032803475856781006, "rewards/margins": 1.9535428404808044, "rewards/rejected": -1.9207393646240234, "step": 10086 }, { "epoch": 0.5346513661781465, "grad_norm": 50.5, "kl": 2.518294334411621, "learning_rate": 5e-07, "logits/chosen": -53723718.4, "logits/rejected": -12242925.333333334, "logps/chosen": -272.06943359375, "logps/rejected": -283.1820882161458, "loss": 0.3313, "rewards/chosen": 0.38234331607818606, "rewards/margins": 3.7315295775731405, "rewards/rejected": -3.3491862614949546, "step": 10087 }, { "epoch": 0.5347043701799485, "grad_norm": 64.5, "kl": 0.6727924346923828, "learning_rate": 5e-07, "logits/chosen": -22533420.0, "logits/rejected": -38107440.0, "logps/chosen": -167.1557159423828, "logps/rejected": -347.058349609375, "loss": 0.4269, "rewards/chosen": -0.4182058572769165, "rewards/margins": 1.3579925298690796, "rewards/rejected": -1.776198387145996, "step": 10088 }, { "epoch": 0.5347573741817507, "grad_norm": 44.0, "kl": 1.2572860717773438, "learning_rate": 5e-07, "logits/chosen": -39372474.666666664, "logits/rejected": -36225472.0, "logps/chosen": -571.2062174479166, "logps/rejected": -176.28419189453126, "loss": 0.2704, "rewards/chosen": 0.8698212305704752, "rewards/margins": 2.8054012934366863, "rewards/rejected": -1.935580062866211, "step": 10089 }, { "epoch": 0.5348103781835528, "grad_norm": 63.5, "kl": 5.404985427856445, "learning_rate": 5e-07, "logits/chosen": -18648291.2, "logits/rejected": -18344504.0, "logps/chosen": -440.772119140625, "logps/rejected": -538.2707926432291, "loss": 0.3249, "rewards/chosen": 1.1868733406066894, "rewards/margins": 4.073547903696696, "rewards/rejected": -2.8866745630900064, "step": 10090 }, { "epoch": 0.534863382185355, "grad_norm": 43.5, "kl": 0.4289703369140625, "learning_rate": 5e-07, "logits/chosen": -15991915.2, "logits/rejected": -2926042.0, "logps/chosen": -140.872509765625, "logps/rejected": -181.5265096028646, "loss": 0.3438, "rewards/chosen": 0.35407099723815916, "rewards/margins": 2.1374244848887125, "rewards/rejected": -1.7833534876505535, "step": 10091 }, { "epoch": 0.5349163861871571, "grad_norm": 66.5, "kl": 0.015716552734375, "learning_rate": 5e-07, "logits/chosen": -55714675.2, "logits/rejected": -1973598.3333333333, "logps/chosen": -628.63154296875, "logps/rejected": -174.3854777018229, "loss": 0.3328, "rewards/chosen": 0.24214906692504884, "rewards/margins": 2.5992741902669274, "rewards/rejected": -2.3571251233418784, "step": 10092 }, { "epoch": 0.5349693901889593, "grad_norm": 44.25, "kl": 0.5034980773925781, "learning_rate": 5e-07, "logits/chosen": -17870618.666666668, "logits/rejected": -12693203.2, "logps/chosen": -583.2119140625, "logps/rejected": -238.280615234375, "loss": 0.2047, "rewards/chosen": 1.3856892585754395, "rewards/margins": 3.094943332672119, "rewards/rejected": -1.7092540740966797, "step": 10093 }, { "epoch": 0.5350223941907614, "grad_norm": 54.25, "kl": 1.5652580261230469, "learning_rate": 5e-07, "logits/chosen": -12276732.0, "logits/rejected": -19367390.666666668, "logps/chosen": -190.79376220703125, "logps/rejected": -254.81880696614584, "loss": 0.231, "rewards/chosen": 1.6204307079315186, "rewards/margins": 3.288080294926961, "rewards/rejected": -1.6676495869954426, "step": 10094 }, { "epoch": 0.5350753981925636, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10231580.0, "logits/rejected": 38458963.2, "logps/chosen": -238.6827392578125, "logps/rejected": -319.2112548828125, "loss": 0.2579, "rewards/chosen": 0.2513923645019531, "rewards/margins": 2.6702993392944334, "rewards/rejected": -2.4189069747924803, "step": 10095 }, { "epoch": 0.5351284021943656, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9470956.0, "logits/rejected": -9485716.8, "logps/chosen": -409.8499348958333, "logps/rejected": -296.5127685546875, "loss": 0.2181, "rewards/chosen": 0.5332099994023641, "rewards/margins": 3.0568040927251183, "rewards/rejected": -2.523594093322754, "step": 10096 }, { "epoch": 0.5351814061961678, "grad_norm": 63.25, "kl": 0.459991455078125, "learning_rate": 5e-07, "logits/chosen": -14408476.0, "logits/rejected": 2447228.0, "logps/chosen": -225.9010213216146, "logps/rejected": -108.86863708496094, "loss": 0.3794, "rewards/chosen": 0.34397558371225995, "rewards/margins": 2.030480662981669, "rewards/rejected": -1.6865050792694092, "step": 10097 }, { "epoch": 0.5352344101979699, "grad_norm": 54.0, "kl": 1.1245203018188477, "learning_rate": 5e-07, "logits/chosen": 16161638.4, "logits/rejected": 98795861.33333333, "logps/chosen": -292.266357421875, "logps/rejected": -537.4063720703125, "loss": 0.3377, "rewards/chosen": 0.31233141422271726, "rewards/margins": 2.384894108772278, "rewards/rejected": -2.0725626945495605, "step": 10098 }, { "epoch": 0.5352874141997721, "grad_norm": 46.75, "kl": 0.174774169921875, "learning_rate": 5e-07, "logits/chosen": 12238861.6, "logits/rejected": 723168.0, "logps/chosen": -189.5722900390625, "logps/rejected": -253.72578938802084, "loss": 0.3903, "rewards/chosen": 0.11515545845031738, "rewards/margins": 1.2364641030629475, "rewards/rejected": -1.1213086446126301, "step": 10099 }, { "epoch": 0.5353404182015742, "grad_norm": 39.75, "kl": 0.9052276611328125, "learning_rate": 5e-07, "logits/chosen": -14467542.4, "logits/rejected": -31873400.0, "logps/chosen": -161.43814697265626, "logps/rejected": -337.53395589192706, "loss": 0.3061, "rewards/chosen": 0.5220773696899415, "rewards/margins": 3.2585733413696287, "rewards/rejected": -2.7364959716796875, "step": 10100 }, { "epoch": 0.5353934222033764, "grad_norm": 63.0, "kl": 1.0755844116210938, "learning_rate": 5e-07, "logits/chosen": -8679518.666666666, "logits/rejected": -28090329.6, "logps/chosen": -179.344970703125, "logps/rejected": -435.440380859375, "loss": 0.3189, "rewards/chosen": -0.3993949890136719, "rewards/margins": 1.5736188888549805, "rewards/rejected": -1.9730138778686523, "step": 10101 }, { "epoch": 0.5354464262051785, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54023720.0, "logits/rejected": -24885046.0, "logps/chosen": -336.93035888671875, "logps/rejected": -328.51129150390625, "loss": 0.3252, "rewards/chosen": 0.29781800508499146, "rewards/margins": 1.793422281742096, "rewards/rejected": -1.4956042766571045, "step": 10102 }, { "epoch": 0.5354994302069807, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8686926.0, "logits/rejected": -68511146.66666667, "logps/chosen": -198.98617553710938, "logps/rejected": -497.1900227864583, "loss": 0.2733, "rewards/chosen": -0.046149443835020065, "rewards/margins": 1.5712782231469948, "rewards/rejected": -1.617427666982015, "step": 10103 }, { "epoch": 0.5355524342087827, "grad_norm": 49.75, "kl": 0.5607538223266602, "learning_rate": 5e-07, "logits/chosen": -20461218.0, "logits/rejected": -11037544.0, "logps/chosen": -271.2060241699219, "logps/rejected": -287.44976806640625, "loss": 0.2811, "rewards/chosen": 0.7261195182800293, "rewards/margins": 2.376896381378174, "rewards/rejected": -1.6507768630981445, "step": 10104 }, { "epoch": 0.5356054382105849, "grad_norm": 44.25, "kl": 1.3484992980957031, "learning_rate": 5e-07, "logits/chosen": -50258080.0, "logits/rejected": -73511973.33333333, "logps/chosen": -532.3531494140625, "logps/rejected": -306.1173909505208, "loss": 0.2144, "rewards/chosen": 0.8080413937568665, "rewards/margins": 3.1780967513720193, "rewards/rejected": -2.370055357615153, "step": 10105 }, { "epoch": 0.535658442212387, "grad_norm": 51.25, "kl": 1.1111106872558594, "learning_rate": 5e-07, "logits/chosen": 5171636.0, "logits/rejected": 12300296.0, "logps/chosen": -113.22992706298828, "logps/rejected": -189.6302693684896, "loss": 0.2119, "rewards/chosen": 1.2939748764038086, "rewards/margins": 2.9893798828125, "rewards/rejected": -1.6954050064086914, "step": 10106 }, { "epoch": 0.5357114462141892, "grad_norm": 59.25, "kl": 2.325521469116211, "learning_rate": 5e-07, "logits/chosen": -9528049.142857144, "logits/rejected": -18083820.0, "logps/chosen": -279.196044921875, "logps/rejected": -409.8851623535156, "loss": 0.3557, "rewards/chosen": 0.7443737302507673, "rewards/margins": 2.8416973863329207, "rewards/rejected": -2.0973236560821533, "step": 10107 }, { "epoch": 0.5357644502159913, "grad_norm": 45.0, "kl": 0.10564422607421875, "learning_rate": 5e-07, "logits/chosen": -23653930.0, "logits/rejected": -13091040.0, "logps/chosen": -166.26336669921875, "logps/rejected": -209.4211883544922, "loss": 0.2854, "rewards/chosen": 0.4099479913711548, "rewards/margins": 2.0594512224197388, "rewards/rejected": -1.649503231048584, "step": 10108 }, { "epoch": 0.5358174542177935, "grad_norm": 64.5, "kl": 1.3150520324707031, "learning_rate": 5e-07, "logits/chosen": 61939833.6, "logits/rejected": 1905655.8333333333, "logps/chosen": -358.2621826171875, "logps/rejected": -211.22979736328125, "loss": 0.4126, "rewards/chosen": -0.1947791814804077, "rewards/margins": 1.9932698170344034, "rewards/rejected": -2.188048998514811, "step": 10109 }, { "epoch": 0.5358704582195956, "grad_norm": 44.5, "kl": 1.9045305252075195, "learning_rate": 5e-07, "logits/chosen": -45550892.0, "logits/rejected": -46177936.0, "logps/chosen": -306.0121765136719, "logps/rejected": -342.61260986328125, "loss": 0.3463, "rewards/chosen": 0.42781299352645874, "rewards/margins": 1.973949134349823, "rewards/rejected": -1.5461361408233643, "step": 10110 }, { "epoch": 0.5359234622213978, "grad_norm": 53.75, "kl": 2.696316719055176, "learning_rate": 5e-07, "logits/chosen": -35019304.0, "logits/rejected": -12439073.0, "logps/chosen": -422.51214599609375, "logps/rejected": -157.54266357421875, "loss": 0.345, "rewards/chosen": 0.39036211371421814, "rewards/margins": 2.0194221436977386, "rewards/rejected": -1.6290600299835205, "step": 10111 }, { "epoch": 0.5359764662231998, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9438690.666666666, "logits/rejected": -101937728.0, "logps/chosen": -165.2622273763021, "logps/rejected": -336.2783935546875, "loss": 0.2182, "rewards/chosen": 0.5677623748779297, "rewards/margins": 3.4727237701416014, "rewards/rejected": -2.9049613952636717, "step": 10112 }, { "epoch": 0.536029470225002, "grad_norm": 51.25, "kl": 0.47231292724609375, "learning_rate": 5e-07, "logits/chosen": -27628396.0, "logits/rejected": -44887452.0, "logps/chosen": -444.1905517578125, "logps/rejected": -397.62664794921875, "loss": 0.2742, "rewards/chosen": 0.22008666396141052, "rewards/margins": 3.018667072057724, "rewards/rejected": -2.7985804080963135, "step": 10113 }, { "epoch": 0.5360824742268041, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52663082.666666664, "logits/rejected": -12548676.8, "logps/chosen": -250.62967936197916, "logps/rejected": -218.670361328125, "loss": 0.3669, "rewards/chosen": 0.16524916887283325, "rewards/margins": 1.261819851398468, "rewards/rejected": -1.0965706825256347, "step": 10114 }, { "epoch": 0.5361354782286063, "grad_norm": 47.75, "kl": 0.1742839813232422, "learning_rate": 5e-07, "logits/chosen": -19415436.0, "logits/rejected": -14443685.333333334, "logps/chosen": -191.80899047851562, "logps/rejected": -232.7352091471354, "loss": 0.3147, "rewards/chosen": -0.3517608642578125, "rewards/margins": 1.1212561925252278, "rewards/rejected": -1.4730170567830403, "step": 10115 }, { "epoch": 0.5361884822304084, "grad_norm": 59.5, "kl": 2.5849742889404297, "learning_rate": 5e-07, "logits/chosen": -1075939.8, "logits/rejected": -72655653.33333333, "logps/chosen": -279.9529052734375, "logps/rejected": -411.2439778645833, "loss": 0.337, "rewards/chosen": 0.4533065319061279, "rewards/margins": 3.1655874411265055, "rewards/rejected": -2.7122809092203775, "step": 10116 }, { "epoch": 0.5362414862322106, "grad_norm": 53.0, "kl": 0.3964080810546875, "learning_rate": 5e-07, "logits/chosen": -23999145.6, "logits/rejected": -40587101.333333336, "logps/chosen": -349.9728515625, "logps/rejected": -371.8498942057292, "loss": 0.3007, "rewards/chosen": 0.6125000953674317, "rewards/margins": 2.443702761332194, "rewards/rejected": -1.8312026659647624, "step": 10117 }, { "epoch": 0.5362944902340127, "grad_norm": 62.0, "kl": 4.201225757598877, "learning_rate": 5e-07, "logits/chosen": 3072906.8571428573, "logits/rejected": -27909276.0, "logps/chosen": -210.79593331473214, "logps/rejected": -346.7127685546875, "loss": 0.4646, "rewards/chosen": 0.36984058788844515, "rewards/margins": 3.8179974215371266, "rewards/rejected": -3.4481568336486816, "step": 10118 }, { "epoch": 0.5363474942358148, "grad_norm": 57.5, "kl": 0.4177284240722656, "learning_rate": 5e-07, "logits/chosen": -77356106.66666667, "logits/rejected": -14801164.0, "logps/chosen": -313.355712890625, "logps/rejected": -336.47412109375, "loss": 0.424, "rewards/chosen": -0.06821797291437785, "rewards/margins": 2.2715182503064475, "rewards/rejected": -2.339736223220825, "step": 10119 }, { "epoch": 0.5364004982376169, "grad_norm": 75.5, "kl": 6.8214263916015625, "learning_rate": 5e-07, "logits/chosen": -6798911.2, "logits/rejected": -57015173.333333336, "logps/chosen": -349.620458984375, "logps/rejected": -545.55517578125, "loss": 0.3177, "rewards/chosen": 1.2381329536437988, "rewards/margins": 4.486989180246988, "rewards/rejected": -3.24885622660319, "step": 10120 }, { "epoch": 0.5364535022394191, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5996170.5, "logits/rejected": -12384708.57142857, "logps/chosen": -304.1583557128906, "logps/rejected": -179.03412737165178, "loss": 0.2001, "rewards/chosen": 0.38894960284233093, "rewards/margins": 2.405452664409365, "rewards/rejected": -2.016503061567034, "step": 10121 }, { "epoch": 0.5365065062412212, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37471050.666666664, "logits/rejected": 32129724.8, "logps/chosen": -417.590576171875, "logps/rejected": -372.57412109375, "loss": 0.2244, "rewards/chosen": 1.0556127230326335, "rewards/margins": 3.3569471041361494, "rewards/rejected": -2.3013343811035156, "step": 10122 }, { "epoch": 0.5365595102430234, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -97538392.0, "logits/rejected": -12197602.666666666, "logps/chosen": -331.1661376953125, "logps/rejected": -334.5217692057292, "loss": 0.2865, "rewards/chosen": -0.5156257748603821, "rewards/margins": 1.3608694275220234, "rewards/rejected": -1.8764952023824055, "step": 10123 }, { "epoch": 0.5366125142448255, "grad_norm": 34.75, "kl": 0.4513578414916992, "learning_rate": 5e-07, "logits/chosen": 1856156.0, "logits/rejected": -7264633.333333333, "logps/chosen": -328.0630187988281, "logps/rejected": -461.5928548177083, "loss": 0.138, "rewards/chosen": 0.9265979528427124, "rewards/margins": 3.989162802696228, "rewards/rejected": -3.0625648498535156, "step": 10124 }, { "epoch": 0.5366655182466277, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1031250.0, "logits/rejected": 8830448.0, "logps/chosen": -94.62168884277344, "logps/rejected": -280.587158203125, "loss": 0.2668, "rewards/chosen": 0.17177486419677734, "rewards/margins": 2.0797629356384277, "rewards/rejected": -1.9079880714416504, "step": 10125 }, { "epoch": 0.5367185222484298, "grad_norm": 40.5, "kl": 4.115973472595215, "learning_rate": 5e-07, "logits/chosen": -10140399.2, "logits/rejected": -20173154.666666668, "logps/chosen": -167.76033935546874, "logps/rejected": -172.0050048828125, "loss": 0.2907, "rewards/chosen": 0.8267321586608887, "rewards/margins": 2.6915961901346845, "rewards/rejected": -1.8648640314737956, "step": 10126 }, { "epoch": 0.536771526250232, "grad_norm": 42.75, "kl": 1.77386474609375, "learning_rate": 5e-07, "logits/chosen": -3629354.6666666665, "logits/rejected": -11732122.4, "logps/chosen": -250.63981119791666, "logps/rejected": -261.52216796875, "loss": 0.2831, "rewards/chosen": 0.7938133080800375, "rewards/margins": 2.7357532342274986, "rewards/rejected": -1.941939926147461, "step": 10127 }, { "epoch": 0.536824530252034, "grad_norm": 39.5, "kl": 0.36815643310546875, "learning_rate": 5e-07, "logits/chosen": 5607915.333333333, "logits/rejected": -51955964.8, "logps/chosen": -380.083740234375, "logps/rejected": -336.305126953125, "loss": 0.2259, "rewards/chosen": 1.317884922027588, "rewards/margins": 3.39805269241333, "rewards/rejected": -2.080167770385742, "step": 10128 }, { "epoch": 0.5368775342538362, "grad_norm": 51.5, "kl": 0.4931793212890625, "learning_rate": 5e-07, "logits/chosen": -13614206.666666666, "logits/rejected": 2001540.0, "logps/chosen": -294.90875244140625, "logps/rejected": -456.165869140625, "loss": 0.2256, "rewards/chosen": 0.5252619981765747, "rewards/margins": 2.937868618965149, "rewards/rejected": -2.412606620788574, "step": 10129 }, { "epoch": 0.5369305382556383, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65247946.666666664, "logits/rejected": -5400799.2, "logps/chosen": -596.6838785807291, "logps/rejected": -315.483447265625, "loss": 0.1822, "rewards/chosen": 0.7784688472747803, "rewards/margins": 3.812451696395874, "rewards/rejected": -3.033982849121094, "step": 10130 }, { "epoch": 0.5369835422574404, "grad_norm": 56.5, "kl": 1.658803939819336, "learning_rate": 5e-07, "logits/chosen": -66401530.666666664, "logits/rejected": -41564444.8, "logps/chosen": -1261.1484375, "logps/rejected": -223.2201171875, "loss": 0.1437, "rewards/chosen": 1.7565511067708333, "rewards/margins": 4.303084500630696, "rewards/rejected": -2.546533393859863, "step": 10131 }, { "epoch": 0.5370365462592426, "grad_norm": 26.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4317843.0, "logits/rejected": -16261442.285714285, "logps/chosen": -41.972320556640625, "logps/rejected": -187.83755929129464, "loss": 0.1438, "rewards/chosen": 0.1177162155508995, "rewards/margins": 2.4778048364179477, "rewards/rejected": -2.360088620867048, "step": 10132 }, { "epoch": 0.5370895502610447, "grad_norm": 57.0, "kl": 0.8601970672607422, "learning_rate": 5e-07, "logits/chosen": -4517350.4, "logits/rejected": -1586802.0, "logps/chosen": -376.0186279296875, "logps/rejected": -295.8124186197917, "loss": 0.341, "rewards/chosen": 0.5632625102996827, "rewards/margins": 1.8269733270009358, "rewards/rejected": -1.2637108167012532, "step": 10133 }, { "epoch": 0.5371425542628468, "grad_norm": 72.0, "kl": 0.962554931640625, "learning_rate": 5e-07, "logits/chosen": -58738976.0, "logps/chosen": -268.9555969238281, "loss": 0.389, "rewards/chosen": 0.6437171101570129, "step": 10134 }, { "epoch": 0.5371955582646489, "grad_norm": 40.75, "kl": 0.5928182601928711, "learning_rate": 5e-07, "logits/chosen": -10794747.2, "logits/rejected": -43138445.333333336, "logps/chosen": -195.6802490234375, "logps/rejected": -276.86224365234375, "loss": 0.2387, "rewards/chosen": 1.2500082015991212, "rewards/margins": 2.809399223327637, "rewards/rejected": -1.5593910217285156, "step": 10135 }, { "epoch": 0.5372485622664511, "grad_norm": 41.0, "kl": 1.449737548828125, "learning_rate": 5e-07, "logits/chosen": -63789834.666666664, "logits/rejected": -35506624.0, "logps/chosen": -972.741943359375, "logps/rejected": -391.7082763671875, "loss": 0.1124, "rewards/chosen": 2.1783601442972818, "rewards/margins": 5.095432631174724, "rewards/rejected": -2.9170724868774416, "step": 10136 }, { "epoch": 0.5373015662682532, "grad_norm": 43.5, "kl": 0.17156219482421875, "learning_rate": 5e-07, "logits/chosen": -92022192.0, "logits/rejected": -26880848.0, "logps/chosen": -255.35809326171875, "logps/rejected": -433.34564208984375, "loss": 0.3034, "rewards/chosen": 0.14719486236572266, "rewards/margins": 2.392899513244629, "rewards/rejected": -2.2457046508789062, "step": 10137 }, { "epoch": 0.5373545702700554, "grad_norm": 53.25, "kl": 4.135103225708008, "learning_rate": 5e-07, "logits/chosen": -19319254.4, "logits/rejected": 21583362.666666668, "logps/chosen": -246.89404296875, "logps/rejected": -401.2096354166667, "loss": 0.3493, "rewards/chosen": 0.7263809204101562, "rewards/margins": 2.948047415415446, "rewards/rejected": -2.2216664950052896, "step": 10138 }, { "epoch": 0.5374075742718575, "grad_norm": 43.5, "kl": 2.483531951904297, "learning_rate": 5e-07, "logits/chosen": 7446120.0, "logits/rejected": -40183200.0, "logps/chosen": -295.0856018066406, "logps/rejected": -306.00323486328125, "loss": 0.2676, "rewards/chosen": 0.6056057214736938, "rewards/margins": 2.985609014829, "rewards/rejected": -2.380003293355306, "step": 10139 }, { "epoch": 0.5374605782736597, "grad_norm": 50.0, "kl": 1.5608940124511719, "learning_rate": 5e-07, "logits/chosen": -9270872.0, "logits/rejected": -40942442.666666664, "logps/chosen": -118.5671875, "logps/rejected": -349.2303873697917, "loss": 0.3445, "rewards/chosen": 0.19984855651855468, "rewards/margins": 2.6315168380737304, "rewards/rejected": -2.431668281555176, "step": 10140 }, { "epoch": 0.5375135822754618, "grad_norm": 43.75, "kl": 2.7605724334716797, "learning_rate": 5e-07, "logits/chosen": -4583726.0, "logits/rejected": -15811598.0, "logps/chosen": -189.03448486328125, "logps/rejected": -303.02618408203125, "loss": 0.3123, "rewards/chosen": 0.9164146780967712, "rewards/margins": 2.7824646830558777, "rewards/rejected": -1.8660500049591064, "step": 10141 }, { "epoch": 0.537566586277264, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6028061.333333333, "logits/rejected": -8347823.2, "logps/chosen": -291.01186116536456, "logps/rejected": -320.3330322265625, "loss": 0.3183, "rewards/chosen": 0.2942003806432088, "rewards/margins": 1.5983230193456013, "rewards/rejected": -1.3041226387023925, "step": 10142 }, { "epoch": 0.537619590279066, "grad_norm": 29.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15147797.333333334, "logits/rejected": -47998467.2, "logps/chosen": -59.700174967447914, "logps/rejected": -371.030126953125, "loss": 0.189, "rewards/chosen": 0.4959107240041097, "rewards/margins": 3.3510752518971763, "rewards/rejected": -2.8551645278930664, "step": 10143 }, { "epoch": 0.5376725942808682, "grad_norm": 38.5, "kl": 2.415821075439453, "learning_rate": 5e-07, "logits/chosen": -24270325.333333332, "logits/rejected": -13490212.8, "logps/chosen": -419.0936686197917, "logps/rejected": -200.71507568359374, "loss": 0.2335, "rewards/chosen": 1.0463231404622395, "rewards/margins": 2.9203011830647787, "rewards/rejected": -1.873978042602539, "step": 10144 }, { "epoch": 0.5377255982826703, "grad_norm": 31.0, "kl": 3.013424873352051, "learning_rate": 5e-07, "logits/chosen": -8770454.0, "logits/rejected": 12084774.0, "logps/chosen": -365.47802734375, "logps/rejected": -167.06072998046875, "loss": 0.2985, "rewards/chosen": 0.8097882270812988, "rewards/margins": 3.157278060913086, "rewards/rejected": -2.347489833831787, "step": 10145 }, { "epoch": 0.5377786022844725, "grad_norm": 47.0, "kl": 2.4016571044921875, "learning_rate": 5e-07, "logits/chosen": -30794952.0, "logits/rejected": -31003154.0, "logps/chosen": -859.2046508789062, "logps/rejected": -295.2451477050781, "loss": 0.2523, "rewards/chosen": 1.5513877868652344, "rewards/margins": 3.491562604904175, "rewards/rejected": -1.9401748180389404, "step": 10146 }, { "epoch": 0.5378316062862746, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11663881.333333334, "logits/rejected": 2344329.8, "logps/chosen": -256.1933186848958, "logps/rejected": -436.001904296875, "loss": 0.245, "rewards/chosen": 0.5327433347702026, "rewards/margins": 2.4831366300582887, "rewards/rejected": -1.950393295288086, "step": 10147 }, { "epoch": 0.5378846102880768, "grad_norm": 60.25, "kl": 1.5220775604248047, "learning_rate": 5e-07, "logits/chosen": -62301476.571428575, "logits/rejected": -11732449.0, "logps/chosen": -317.812255859375, "logps/rejected": -521.1236572265625, "loss": 0.509, "rewards/chosen": -0.09323057106563024, "rewards/margins": 1.8548102719443185, "rewards/rejected": -1.9480408430099487, "step": 10148 }, { "epoch": 0.5379376142898789, "grad_norm": 42.75, "kl": 2.1905670166015625, "learning_rate": 5e-07, "logits/chosen": -17903978.666666668, "logits/rejected": -33844979.2, "logps/chosen": -563.6178385416666, "logps/rejected": -329.771142578125, "loss": 0.2142, "rewards/chosen": 1.0846703847249348, "rewards/margins": 3.3468201955159502, "rewards/rejected": -2.2621498107910156, "step": 10149 }, { "epoch": 0.537990618291681, "grad_norm": 41.5, "kl": 1.3530349731445312, "learning_rate": 5e-07, "logits/chosen": -20694952.0, "logits/rejected": -38713264.0, "logps/chosen": -332.85174560546875, "logps/rejected": -356.9649658203125, "loss": 0.2576, "rewards/chosen": 0.5455307960510254, "rewards/margins": 2.631352424621582, "rewards/rejected": -2.0858216285705566, "step": 10150 }, { "epoch": 0.5380436222934831, "grad_norm": 32.0, "kl": 0.5096588134765625, "learning_rate": 5e-07, "logits/chosen": -22955029.333333332, "logits/rejected": -57286873.6, "logps/chosen": -123.66597493489583, "logps/rejected": -400.125146484375, "loss": 0.2669, "rewards/chosen": -0.03877640018860499, "rewards/margins": 2.477752492328485, "rewards/rejected": -2.5165288925170897, "step": 10151 }, { "epoch": 0.5380966262952853, "grad_norm": 41.5, "kl": 0.48381614685058594, "learning_rate": 5e-07, "logits/chosen": -65921296.0, "logits/rejected": -13343754.666666666, "logps/chosen": -348.0000305175781, "logps/rejected": -125.53322347005208, "loss": 0.2354, "rewards/chosen": 0.3248092830181122, "rewards/margins": 2.1720800896485644, "rewards/rejected": -1.8472708066304524, "step": 10152 }, { "epoch": 0.5381496302970874, "grad_norm": 59.5, "kl": 0.5944128036499023, "learning_rate": 5e-07, "logits/chosen": -28733125.333333332, "logits/rejected": -15153598.0, "logps/chosen": -305.6001790364583, "logps/rejected": -229.34439086914062, "loss": 0.3573, "rewards/chosen": 0.2502807180086772, "rewards/margins": 3.152964929739634, "rewards/rejected": -2.902684211730957, "step": 10153 }, { "epoch": 0.5382026342988896, "grad_norm": 85.0, "kl": 6.6317138671875, "learning_rate": 5e-07, "logits/chosen": -20455729.333333332, "logits/rejected": 76411296.0, "logps/chosen": -559.884521484375, "logps/rejected": -368.0364074707031, "loss": 0.3819, "rewards/chosen": 0.8364795049031576, "rewards/margins": 3.5105624993642173, "rewards/rejected": -2.6740829944610596, "step": 10154 }, { "epoch": 0.5382556383006917, "grad_norm": 59.5, "kl": 0.8980350494384766, "learning_rate": 5e-07, "logits/chosen": -77894712.0, "logits/rejected": -3090600.0, "logps/chosen": -313.7783203125, "logps/rejected": -659.2874145507812, "loss": 0.279, "rewards/chosen": 0.3981206715106964, "rewards/margins": 2.9948408901691437, "rewards/rejected": -2.5967202186584473, "step": 10155 }, { "epoch": 0.5383086423024939, "grad_norm": 36.75, "kl": 4.4144816398620605, "learning_rate": 5e-07, "logits/chosen": -3480141.3333333335, "logits/rejected": -26776836.0, "logps/chosen": -434.6081136067708, "logps/rejected": -318.68023681640625, "loss": 0.3422, "rewards/chosen": 1.1698204676310222, "rewards/margins": 3.1861919562021894, "rewards/rejected": -2.016371488571167, "step": 10156 }, { "epoch": 0.538361646304296, "grad_norm": 49.25, "kl": 2.1071996688842773, "learning_rate": 5e-07, "logits/chosen": -63919316.0, "logits/rejected": -19743080.0, "logps/chosen": -1013.583984375, "logps/rejected": -518.2435913085938, "loss": 0.1694, "rewards/chosen": 1.6440376043319702, "rewards/margins": 4.301663279533386, "rewards/rejected": -2.657625675201416, "step": 10157 }, { "epoch": 0.5384146503060981, "grad_norm": 48.5, "kl": 0.6490440368652344, "learning_rate": 5e-07, "logits/chosen": -16325786.666666666, "logits/rejected": -10167032.8, "logps/chosen": -518.8037109375, "logps/rejected": -95.27373657226562, "loss": 0.1947, "rewards/chosen": 1.4015086491902669, "rewards/margins": 3.027599843343099, "rewards/rejected": -1.626091194152832, "step": 10158 }, { "epoch": 0.5384676543079002, "grad_norm": 46.5, "kl": 1.9240970611572266, "learning_rate": 5e-07, "logits/chosen": -22534608.0, "logits/rejected": -87193000.0, "logps/chosen": -162.0944620768229, "logps/rejected": -493.4034423828125, "loss": 0.4294, "rewards/chosen": 0.005890144035220146, "rewards/margins": 2.1213266979902983, "rewards/rejected": -2.115436553955078, "step": 10159 }, { "epoch": 0.5385206583097024, "grad_norm": 37.25, "kl": 0.0016632080078125, "learning_rate": 5e-07, "logits/chosen": -16207822.666666666, "logits/rejected": -32887616.0, "logps/chosen": -216.0460408528646, "logps/rejected": -369.0533935546875, "loss": 0.2422, "rewards/chosen": -0.011642595132191976, "rewards/margins": 2.4473946134249367, "rewards/rejected": -2.459037208557129, "step": 10160 }, { "epoch": 0.5385736623115045, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11720198.0, "logits/rejected": -5025890.0, "logps/chosen": -280.74322509765625, "logps/rejected": -486.0186767578125, "loss": 0.1743, "rewards/chosen": 1.292675495147705, "rewards/margins": 3.738450765609741, "rewards/rejected": -2.445775270462036, "step": 10161 }, { "epoch": 0.5386266663133067, "grad_norm": 40.75, "kl": 1.3614096641540527, "learning_rate": 5e-07, "logits/chosen": 11953787.0, "logits/rejected": -22679626.666666668, "logps/chosen": -308.43292236328125, "logps/rejected": -340.82958984375, "loss": 0.2371, "rewards/chosen": -0.12290513515472412, "rewards/margins": 2.391335447629293, "rewards/rejected": -2.514240582784017, "step": 10162 }, { "epoch": 0.5386796703151088, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24319629.333333332, "logits/rejected": -2587611.2, "logps/chosen": -355.6559651692708, "logps/rejected": -242.1544921875, "loss": 0.2512, "rewards/chosen": 1.011354128519694, "rewards/margins": 2.5696015993754067, "rewards/rejected": -1.5582474708557128, "step": 10163 }, { "epoch": 0.538732674316911, "grad_norm": 59.5, "kl": 1.4831418991088867, "learning_rate": 5e-07, "logits/chosen": -26817602.666666668, "logits/rejected": -6741921.0, "logps/chosen": -396.1490071614583, "logps/rejected": -160.6120147705078, "loss": 0.3746, "rewards/chosen": 0.36117251714070636, "rewards/margins": 2.43693478902181, "rewards/rejected": -2.0757622718811035, "step": 10164 }, { "epoch": 0.538785678318713, "grad_norm": 54.5, "kl": 1.2278900146484375, "learning_rate": 5e-07, "logits/chosen": -19663652.0, "logits/rejected": -22350672.0, "logps/chosen": -247.36126708984375, "logps/rejected": -466.0538635253906, "loss": 0.3251, "rewards/chosen": 0.5384736061096191, "rewards/margins": 3.416774272918701, "rewards/rejected": -2.878300666809082, "step": 10165 }, { "epoch": 0.5388386823205152, "grad_norm": 102.0, "kl": 0.11048126220703125, "learning_rate": 5e-07, "logits/chosen": 3490284.75, "logits/rejected": 3524374.5, "logps/chosen": -233.70205688476562, "logps/rejected": -676.9007568359375, "loss": 0.2902, "rewards/chosen": 0.518267810344696, "rewards/margins": 3.7441765666007996, "rewards/rejected": -3.2259087562561035, "step": 10166 }, { "epoch": 0.5388916863223173, "grad_norm": 45.75, "kl": 0.9583854675292969, "learning_rate": 5e-07, "logits/chosen": -24976848.0, "logits/rejected": -26346137.6, "logps/chosen": -316.08473714192706, "logps/rejected": -315.9505859375, "loss": 0.1714, "rewards/chosen": 0.8708415031433105, "rewards/margins": 3.661449337005615, "rewards/rejected": -2.7906078338623046, "step": 10167 }, { "epoch": 0.5389446903241195, "grad_norm": 85.0, "kl": 0.4810447692871094, "learning_rate": 5e-07, "logits/chosen": -2233309.4285714286, "logits/rejected": -10316564.0, "logps/chosen": -255.0386962890625, "logps/rejected": -356.50250244140625, "loss": 0.4985, "rewards/chosen": -0.16762505258832658, "rewards/margins": 1.8522113731929235, "rewards/rejected": -2.01983642578125, "step": 10168 }, { "epoch": 0.5389976943259216, "grad_norm": 56.5, "kl": 2.1675939559936523, "learning_rate": 5e-07, "logits/chosen": -12575657.6, "logits/rejected": -1895545.3333333333, "logps/chosen": -258.50361328125, "logps/rejected": -189.90218098958334, "loss": 0.3953, "rewards/chosen": 0.5691877841949463, "rewards/margins": 1.7226172924041747, "rewards/rejected": -1.1534295082092285, "step": 10169 }, { "epoch": 0.5390506983277238, "grad_norm": 44.25, "kl": 0.029333114624023438, "learning_rate": 5e-07, "logits/chosen": 55185856.0, "logits/rejected": -13692754.285714285, "logps/chosen": -37.95807647705078, "logps/rejected": -236.82345145089286, "loss": 0.2141, "rewards/chosen": -0.09623413532972336, "rewards/margins": 2.001184922243868, "rewards/rejected": -2.097419057573591, "step": 10170 }, { "epoch": 0.5391037023295259, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6094786.0, "logits/rejected": -28803957.333333332, "logps/chosen": -108.74385070800781, "logps/rejected": -410.8356119791667, "loss": 0.2017, "rewards/chosen": 0.3989822268486023, "rewards/margins": 2.3400740822156267, "rewards/rejected": -1.9410918553670247, "step": 10171 }, { "epoch": 0.5391567063313281, "grad_norm": 44.5, "kl": 1.6009845733642578, "learning_rate": 5e-07, "logits/chosen": -10998272.8, "logits/rejected": -21681838.666666668, "logps/chosen": -239.2943359375, "logps/rejected": -278.3401285807292, "loss": 0.3584, "rewards/chosen": 0.4024641513824463, "rewards/margins": 1.916451374689738, "rewards/rejected": -1.5139872233072917, "step": 10172 }, { "epoch": 0.5392097103331301, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60158906.666666664, "logits/rejected": -19412728.0, "logps/chosen": -427.74169921875, "logps/rejected": -276.39130859375, "loss": 0.2379, "rewards/chosen": 0.11363271872202556, "rewards/margins": 2.490379659334818, "rewards/rejected": -2.376746940612793, "step": 10173 }, { "epoch": 0.5392627143349323, "grad_norm": 52.75, "kl": 0.8789119720458984, "learning_rate": 5e-07, "logits/chosen": -54223992.0, "logits/rejected": -6009991.0, "logps/chosen": -334.0440979003906, "logps/rejected": -194.92893981933594, "loss": 0.3137, "rewards/chosen": 0.7242558002471924, "rewards/margins": 2.1304818391799927, "rewards/rejected": -1.4062260389328003, "step": 10174 }, { "epoch": 0.5393157183367344, "grad_norm": 60.5, "kl": 1.3383417129516602, "learning_rate": 5e-07, "logits/chosen": -6591518.0, "logits/rejected": 134144140.8, "logps/chosen": -135.3116455078125, "logps/rejected": -158.535888671875, "loss": 0.2629, "rewards/chosen": 0.9711844126383463, "rewards/margins": 2.5707091013590495, "rewards/rejected": -1.599524688720703, "step": 10175 }, { "epoch": 0.5393687223385366, "grad_norm": 44.0, "kl": 1.8809185028076172, "learning_rate": 5e-07, "logits/chosen": -11544281.0, "logits/rejected": -20816894.0, "logps/chosen": -371.20953369140625, "logps/rejected": -324.0640869140625, "loss": 0.2057, "rewards/chosen": 1.584359884262085, "rewards/margins": 3.914577007293701, "rewards/rejected": -2.330217123031616, "step": 10176 }, { "epoch": 0.5394217263403387, "grad_norm": 45.25, "kl": 0.3093223571777344, "learning_rate": 5e-07, "logits/chosen": -65219704.0, "logits/rejected": -25457714.666666668, "logps/chosen": -581.3535766601562, "logps/rejected": -333.8459065755208, "loss": 0.0994, "rewards/chosen": 1.935333251953125, "rewards/margins": 4.9066871007283535, "rewards/rejected": -2.971353848775228, "step": 10177 }, { "epoch": 0.5394747303421409, "grad_norm": 52.5, "kl": 2.9706339836120605, "learning_rate": 5e-07, "logits/chosen": -15295440.0, "logits/rejected": -19507376.0, "logps/chosen": -255.89962332589286, "logps/rejected": -134.6243133544922, "loss": 0.4773, "rewards/chosen": 0.21548012324741908, "rewards/margins": 1.893600617136274, "rewards/rejected": -1.678120493888855, "step": 10178 }, { "epoch": 0.539527734343943, "grad_norm": 46.5, "kl": 2.272531509399414, "learning_rate": 5e-07, "logits/chosen": -15858256.0, "logits/rejected": -28274358.0, "logps/chosen": -284.5775960286458, "logps/rejected": -232.65684509277344, "loss": 0.3251, "rewards/chosen": 0.9651761849721273, "rewards/margins": 2.745105584462484, "rewards/rejected": -1.7799293994903564, "step": 10179 }, { "epoch": 0.5395807383457452, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22169870.666666668, "logits/rejected": -12804676.8, "logps/chosen": -331.6009928385417, "logps/rejected": -220.828173828125, "loss": 0.2937, "rewards/chosen": 0.5419066747029623, "rewards/margins": 2.4370957692464192, "rewards/rejected": -1.895189094543457, "step": 10180 }, { "epoch": 0.5396337423475472, "grad_norm": 43.5, "kl": 0.7204132080078125, "learning_rate": 5e-07, "logits/chosen": -26431546.666666668, "logits/rejected": -32317292.8, "logps/chosen": -392.632080078125, "logps/rejected": -326.856494140625, "loss": 0.2033, "rewards/chosen": 1.5006672541300456, "rewards/margins": 3.116404406229655, "rewards/rejected": -1.6157371520996093, "step": 10181 }, { "epoch": 0.5396867463493493, "grad_norm": 52.0, "kl": 2.6004905700683594, "learning_rate": 5e-07, "logits/chosen": -62035820.8, "logits/rejected": -34431552.0, "logps/chosen": -361.156103515625, "logps/rejected": -452.6169840494792, "loss": 0.3317, "rewards/chosen": 0.4016390800476074, "rewards/margins": 4.330886618296305, "rewards/rejected": -3.9292475382486978, "step": 10182 }, { "epoch": 0.5397397503511515, "grad_norm": 71.0, "kl": 0.00653076171875, "learning_rate": 5e-07, "logits/chosen": -12508541.0, "logits/rejected": 34797912.0, "logps/chosen": -202.92486572265625, "logps/rejected": -354.4799499511719, "loss": 0.3594, "rewards/chosen": 0.37492403388023376, "rewards/margins": 1.3164661228656769, "rewards/rejected": -0.9415420889854431, "step": 10183 }, { "epoch": 0.5397927543529536, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28199360.0, "logits/rejected": -57520115.2, "logps/chosen": -165.00253295898438, "logps/rejected": -335.7894775390625, "loss": 0.2408, "rewards/chosen": 0.34164631366729736, "rewards/margins": 2.608376622200012, "rewards/rejected": -2.266730308532715, "step": 10184 }, { "epoch": 0.5398457583547558, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55226752.0, "logits/rejected": -51588754.28571428, "logps/chosen": -331.9270935058594, "logps/rejected": -479.95448521205356, "loss": 0.146, "rewards/chosen": 0.47219544649124146, "rewards/margins": 2.8364927683557783, "rewards/rejected": -2.364297321864537, "step": 10185 }, { "epoch": 0.5398987623565579, "grad_norm": 56.25, "kl": 0.3287181854248047, "learning_rate": 5e-07, "logits/chosen": 26344138.0, "logits/rejected": -4861498.285714285, "logps/chosen": -77.77896881103516, "logps/rejected": -293.0576171875, "loss": 0.238, "rewards/chosen": 0.3104301393032074, "rewards/margins": 2.0416885316371918, "rewards/rejected": -1.7312583923339844, "step": 10186 }, { "epoch": 0.5399517663583601, "grad_norm": 68.0, "kl": 1.9062471389770508, "learning_rate": 5e-07, "logits/chosen": -48765477.333333336, "logits/rejected": 5909772.0, "logps/chosen": -370.678466796875, "logps/rejected": -48.010257720947266, "loss": 0.3514, "rewards/chosen": 0.6702247460683187, "rewards/margins": 1.5778581698735556, "rewards/rejected": -0.9076334238052368, "step": 10187 }, { "epoch": 0.5400047703601621, "grad_norm": 57.75, "kl": 0.9042611122131348, "learning_rate": 5e-07, "logits/chosen": -37587802.666666664, "logits/rejected": 3622922.5, "logps/chosen": -339.00872802734375, "logps/rejected": -114.12451934814453, "loss": 0.4458, "rewards/chosen": 0.13429304957389832, "rewards/margins": 1.5739993751049042, "rewards/rejected": -1.4397063255310059, "step": 10188 }, { "epoch": 0.5400577743619643, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24557280.0, "logits/rejected": -31687474.0, "logps/chosen": -431.775634765625, "logps/rejected": -174.11415100097656, "loss": 0.3239, "rewards/chosen": 0.03344458341598511, "rewards/margins": 1.9642197489738464, "rewards/rejected": -1.9307751655578613, "step": 10189 }, { "epoch": 0.5401107783637664, "grad_norm": 58.25, "kl": 1.2778348922729492, "learning_rate": 5e-07, "logits/chosen": -41391689.6, "logits/rejected": -17148153.333333332, "logps/chosen": -455.51357421875, "logps/rejected": -254.9801228841146, "loss": 0.2412, "rewards/chosen": 1.2661455154418946, "rewards/margins": 3.2342743555704754, "rewards/rejected": -1.9681288401285808, "step": 10190 }, { "epoch": 0.5401637823655686, "grad_norm": 58.5, "kl": 1.3591861724853516, "learning_rate": 5e-07, "logits/chosen": -1614220.8, "logits/rejected": -31175997.333333332, "logps/chosen": -465.82734375, "logps/rejected": -309.7642415364583, "loss": 0.2963, "rewards/chosen": 0.6476371765136719, "rewards/margins": 3.2986669222513836, "rewards/rejected": -2.6510297457377114, "step": 10191 }, { "epoch": 0.5402167863673707, "grad_norm": 58.75, "kl": 2.045551300048828, "learning_rate": 5e-07, "logits/chosen": -28365608.0, "logits/rejected": -11318744.0, "logps/chosen": -290.2712809244792, "logps/rejected": -270.0060729980469, "loss": 0.3513, "rewards/chosen": 0.5973705450693766, "rewards/margins": 2.525872508684794, "rewards/rejected": -1.9285019636154175, "step": 10192 }, { "epoch": 0.5402697903691729, "grad_norm": 43.25, "kl": 0.16181182861328125, "learning_rate": 5e-07, "logits/chosen": -29822574.0, "logits/rejected": -16520076.0, "logps/chosen": -350.28790283203125, "logps/rejected": -289.48272705078125, "loss": 0.2779, "rewards/chosen": 0.43496134877204895, "rewards/margins": 3.161390572786331, "rewards/rejected": -2.7264292240142822, "step": 10193 }, { "epoch": 0.540322794370975, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64409587.2, "logits/rejected": -18323468.0, "logps/chosen": -277.023193359375, "logps/rejected": -306.62603759765625, "loss": 0.3311, "rewards/chosen": 0.25439844131469724, "rewards/margins": 2.3252119382222496, "rewards/rejected": -2.0708134969075522, "step": 10194 }, { "epoch": 0.5403757983727772, "grad_norm": 51.5, "kl": 0.16257667541503906, "learning_rate": 5e-07, "logits/chosen": -54891064.0, "logits/rejected": -5008926.5, "logps/chosen": -314.5438232421875, "logps/rejected": -158.73690795898438, "loss": 0.3583, "rewards/chosen": 0.5068348050117493, "rewards/margins": 1.467427372932434, "rewards/rejected": -0.9605925679206848, "step": 10195 }, { "epoch": 0.5404288023745792, "grad_norm": 49.25, "kl": 0.9099273681640625, "learning_rate": 5e-07, "logits/chosen": -25053006.0, "logits/rejected": -41667320.0, "logps/chosen": -184.1580047607422, "logps/rejected": -92.51614379882812, "loss": 0.3978, "rewards/chosen": 0.01793476939201355, "rewards/margins": 1.1093719899654388, "rewards/rejected": -1.0914372205734253, "step": 10196 }, { "epoch": 0.5404818063763814, "grad_norm": 48.75, "kl": 0.2988128662109375, "learning_rate": 5e-07, "logits/chosen": -3198748.6666666665, "logits/rejected": -12067351.2, "logps/chosen": -173.49434407552084, "logps/rejected": -332.9687255859375, "loss": 0.3135, "rewards/chosen": -0.004624565442403157, "rewards/margins": 2.2281091610590615, "rewards/rejected": -2.232733726501465, "step": 10197 }, { "epoch": 0.5405348103781835, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10071558.0, "logits/rejected": -45065540.0, "logps/chosen": -306.15765380859375, "logps/rejected": -439.080810546875, "loss": 0.1884, "rewards/chosen": 0.9907352924346924, "rewards/margins": 3.8835103511810303, "rewards/rejected": -2.892775058746338, "step": 10198 }, { "epoch": 0.5405878143799857, "grad_norm": 48.0, "kl": 0.06998062133789062, "learning_rate": 5e-07, "logits/chosen": -21345131.2, "logits/rejected": 27338104.0, "logps/chosen": -148.0177001953125, "logps/rejected": -305.9680582682292, "loss": 0.3172, "rewards/chosen": 0.33692958354949953, "rewards/margins": 2.241890533765157, "rewards/rejected": -1.9049609502156575, "step": 10199 }, { "epoch": 0.5406408183817878, "grad_norm": 52.25, "kl": 1.3131446838378906, "learning_rate": 5e-07, "logits/chosen": -69454282.66666667, "logits/rejected": -31286899.2, "logps/chosen": -552.266357421875, "logps/rejected": -362.295947265625, "loss": 0.2861, "rewards/chosen": 0.17715760072072348, "rewards/margins": 2.095625122388204, "rewards/rejected": -1.9184675216674805, "step": 10200 }, { "epoch": 0.54069382238359, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15117124.0, "logits/rejected": -19774563.2, "logps/chosen": -131.7377726236979, "logps/rejected": -433.96748046875, "loss": 0.241, "rewards/chosen": -0.11841724316279094, "rewards/margins": 2.8069688042004906, "rewards/rejected": -2.9253860473632813, "step": 10201 }, { "epoch": 0.5407468263853921, "grad_norm": 60.0, "kl": 2.1329774856567383, "learning_rate": 5e-07, "logits/chosen": -17720630.4, "logits/rejected": -18255876.0, "logps/chosen": -509.85517578125, "logps/rejected": -244.99055989583334, "loss": 0.2838, "rewards/chosen": 0.7476256370544434, "rewards/margins": 3.6706259727478026, "rewards/rejected": -2.9230003356933594, "step": 10202 }, { "epoch": 0.5407998303871943, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29880468.0, "logits/rejected": -11348741.333333334, "logps/chosen": -305.9687194824219, "logps/rejected": -162.20929972330728, "loss": 0.2347, "rewards/chosen": 0.004746079444885254, "rewards/margins": 2.237592339515686, "rewards/rejected": -2.232846260070801, "step": 10203 }, { "epoch": 0.5408528343889963, "grad_norm": 43.25, "kl": 0.6773109436035156, "learning_rate": 5e-07, "logits/chosen": -4103048.0, "logits/rejected": 26797993.6, "logps/chosen": -135.64297485351562, "logps/rejected": -428.223583984375, "loss": 0.3007, "rewards/chosen": 0.5574854612350464, "rewards/margins": 2.047917342185974, "rewards/rejected": -1.4904318809509278, "step": 10204 }, { "epoch": 0.5409058383907985, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45229814.85714286, "logits/rejected": -38882068.0, "logps/chosen": -333.56944056919644, "logps/rejected": -449.05059814453125, "loss": 0.4182, "rewards/chosen": 0.4373506818498884, "rewards/margins": 2.2415438209261214, "rewards/rejected": -1.804193139076233, "step": 10205 }, { "epoch": 0.5409588423926006, "grad_norm": 36.25, "kl": 0.4600381851196289, "learning_rate": 5e-07, "logits/chosen": -8148509.333333333, "logits/rejected": -42963094.4, "logps/chosen": -121.59670003255208, "logps/rejected": -312.1036865234375, "loss": 0.2776, "rewards/chosen": 0.371186097462972, "rewards/margins": 2.1558037122090656, "rewards/rejected": -1.7846176147460937, "step": 10206 }, { "epoch": 0.5410118463944028, "grad_norm": 55.75, "kl": 0.5609331130981445, "learning_rate": 5e-07, "logits/chosen": -6925507.2, "logits/rejected": -9262273.333333334, "logps/chosen": -325.666796875, "logps/rejected": -286.6860758463542, "loss": 0.3609, "rewards/chosen": 0.2149430274963379, "rewards/margins": 2.188671525319417, "rewards/rejected": -1.9737284978230794, "step": 10207 }, { "epoch": 0.5410648503962049, "grad_norm": 58.75, "kl": 3.033109664916992, "learning_rate": 5e-07, "logits/chosen": -23995272.0, "logits/rejected": -30792874.666666668, "logps/chosen": -237.083251953125, "logps/rejected": -210.57303873697916, "loss": 0.351, "rewards/chosen": 0.5869465827941894, "rewards/margins": 2.8066871325174967, "rewards/rejected": -2.219740549723307, "step": 10208 }, { "epoch": 0.5411178543980071, "grad_norm": 41.75, "kl": 0.03936004638671875, "learning_rate": 5e-07, "logits/chosen": -30543618.666666668, "logits/rejected": -69858617.6, "logps/chosen": -308.5520426432292, "logps/rejected": -371.4601318359375, "loss": 0.1926, "rewards/chosen": 0.85835067431132, "rewards/margins": 3.035813824335734, "rewards/rejected": -2.177463150024414, "step": 10209 }, { "epoch": 0.5411708583998092, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25524261.333333332, "logits/rejected": -16078630.4, "logps/chosen": -324.6051839192708, "logps/rejected": -256.1297607421875, "loss": 0.1275, "rewards/chosen": 2.034422238667806, "rewards/margins": 4.427447827657064, "rewards/rejected": -2.393025588989258, "step": 10210 }, { "epoch": 0.5412238624016114, "grad_norm": 70.5, "kl": 1.655322551727295, "learning_rate": 5e-07, "logits/chosen": -4807276.0, "logits/rejected": -5595367.333333333, "logps/chosen": -225.088232421875, "logps/rejected": -112.32558186848958, "loss": 0.4297, "rewards/chosen": 0.260407543182373, "rewards/margins": 0.7774186293284098, "rewards/rejected": -0.5170110861460367, "step": 10211 }, { "epoch": 0.5412768664034134, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23148594.666666668, "logits/rejected": -74186003.2, "logps/chosen": -276.89654541015625, "logps/rejected": -340.0851318359375, "loss": 0.2097, "rewards/chosen": 0.5095911820729574, "rewards/margins": 3.259682544072469, "rewards/rejected": -2.7500913619995115, "step": 10212 }, { "epoch": 0.5413298704052156, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41415418.666666664, "logits/rejected": -21798283.2, "logps/chosen": -243.73291015625, "logps/rejected": -243.226904296875, "loss": 0.1885, "rewards/chosen": 1.0147695541381836, "rewards/margins": 3.3317975997924805, "rewards/rejected": -2.317028045654297, "step": 10213 }, { "epoch": 0.5413828744070177, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1186166.0, "logits/rejected": -43849920.0, "logps/chosen": -185.63136291503906, "logps/rejected": -545.15087890625, "loss": 0.3053, "rewards/chosen": 0.15112078189849854, "rewards/margins": 2.4143816232681274, "rewards/rejected": -2.263260841369629, "step": 10214 }, { "epoch": 0.5414358784088199, "grad_norm": 55.25, "kl": 1.7595014572143555, "learning_rate": 5e-07, "logits/chosen": -45434140.8, "logits/rejected": -37276549.333333336, "logps/chosen": -491.610107421875, "logps/rejected": -498.7141927083333, "loss": 0.299, "rewards/chosen": 0.6050447463989258, "rewards/margins": 2.8858980178833007, "rewards/rejected": -2.280853271484375, "step": 10215 }, { "epoch": 0.541488882410622, "grad_norm": 41.25, "kl": 0.7534723281860352, "learning_rate": 5e-07, "logits/chosen": -6952134.5, "logits/rejected": -11050766.0, "logps/chosen": -174.46517944335938, "logps/rejected": -215.1527099609375, "loss": 0.3257, "rewards/chosen": 0.544985294342041, "rewards/margins": 2.050278663635254, "rewards/rejected": -1.505293369293213, "step": 10216 }, { "epoch": 0.5415418864124242, "grad_norm": 53.75, "kl": 2.4121932983398438, "learning_rate": 5e-07, "logits/chosen": -34197126.4, "logits/rejected": -24390693.333333332, "logps/chosen": -451.354443359375, "logps/rejected": -325.77036539713544, "loss": 0.2703, "rewards/chosen": 0.6603890895843506, "rewards/margins": 3.31436759630839, "rewards/rejected": -2.6539785067240396, "step": 10217 }, { "epoch": 0.5415948904142263, "grad_norm": 45.0, "kl": 1.4896793365478516, "learning_rate": 5e-07, "logits/chosen": -49764736.0, "logits/rejected": -33148088.0, "logps/chosen": -501.790576171875, "logps/rejected": -347.8678385416667, "loss": 0.3181, "rewards/chosen": 0.9532876968383789, "rewards/margins": 3.512993907928467, "rewards/rejected": -2.559706211090088, "step": 10218 }, { "epoch": 0.5416478944160285, "grad_norm": 52.5, "kl": 1.6980648040771484, "learning_rate": 5e-07, "logits/chosen": -69724800.0, "logits/rejected": 11848733.333333334, "logps/chosen": -499.2843017578125, "logps/rejected": -253.7445068359375, "loss": 0.2178, "rewards/chosen": 1.5974105596542358, "rewards/margins": 2.9353946447372437, "rewards/rejected": -1.3379840850830078, "step": 10219 }, { "epoch": 0.5417008984178305, "grad_norm": 54.5, "kl": 2.9916019439697266, "learning_rate": 5e-07, "logits/chosen": -16731542.4, "logits/rejected": -6297860.666666667, "logps/chosen": -236.1815673828125, "logps/rejected": -165.56612141927084, "loss": 0.4396, "rewards/chosen": 0.4112089157104492, "rewards/margins": 1.5981466452280682, "rewards/rejected": -1.186937729517619, "step": 10220 }, { "epoch": 0.5417539024196327, "grad_norm": 55.0, "kl": 6.172995567321777, "learning_rate": 5e-07, "logits/chosen": -34230044.0, "logits/rejected": -25722292.0, "logps/chosen": -245.4609375, "logps/rejected": -170.10415649414062, "loss": 0.3866, "rewards/chosen": 0.5228291749954224, "rewards/margins": 2.6017967462539673, "rewards/rejected": -2.078967571258545, "step": 10221 }, { "epoch": 0.5418069064214348, "grad_norm": 45.25, "kl": 0.7765159606933594, "learning_rate": 5e-07, "logits/chosen": -65005152.0, "logits/rejected": -16437890.666666666, "logps/chosen": -594.99130859375, "logps/rejected": -448.1858723958333, "loss": 0.2344, "rewards/chosen": 0.9029947280883789, "rewards/margins": 4.237246131896972, "rewards/rejected": -3.3342514038085938, "step": 10222 }, { "epoch": 0.541859910423237, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18889842.0, "logits/rejected": -56938250.666666664, "logps/chosen": -289.1053466796875, "logps/rejected": -443.1134033203125, "loss": 0.2673, "rewards/chosen": 0.2382354736328125, "rewards/margins": 2.037590980529785, "rewards/rejected": -1.7993555068969727, "step": 10223 }, { "epoch": 0.5419129144250391, "grad_norm": 45.25, "kl": 2.0609092712402344, "learning_rate": 5e-07, "logits/chosen": -47907045.333333336, "logits/rejected": -38667897.6, "logps/chosen": -828.9137369791666, "logps/rejected": -331.8121826171875, "loss": 0.1438, "rewards/chosen": 2.35205078125, "rewards/margins": 4.289896774291992, "rewards/rejected": -1.9378459930419922, "step": 10224 }, { "epoch": 0.5419659184268413, "grad_norm": 58.25, "kl": 2.2874927520751953, "learning_rate": 5e-07, "logits/chosen": -9517229.714285715, "logits/rejected": -17567490.0, "logps/chosen": -272.8991176060268, "logps/rejected": -947.5299682617188, "loss": 0.3273, "rewards/chosen": 0.856807027544294, "rewards/margins": 4.740407024111066, "rewards/rejected": -3.8835999965667725, "step": 10225 }, { "epoch": 0.5420189224286434, "grad_norm": 83.5, "kl": 2.8535003662109375, "learning_rate": 5e-07, "logits/chosen": -45592512.0, "logits/rejected": -10781502.666666666, "logps/chosen": -439.311572265625, "logps/rejected": -775.8372395833334, "loss": 0.3618, "rewards/chosen": 0.4928278923034668, "rewards/margins": 2.8617396354675293, "rewards/rejected": -2.3689117431640625, "step": 10226 }, { "epoch": 0.5420719264304455, "grad_norm": 42.5, "kl": 0.008647918701171875, "learning_rate": 5e-07, "logits/chosen": -20588678.4, "logits/rejected": -46087125.333333336, "logps/chosen": -200.1039306640625, "logps/rejected": -562.4668375651041, "loss": 0.357, "rewards/chosen": -0.15884357690811157, "rewards/margins": 2.7711971004803977, "rewards/rejected": -2.9300406773885093, "step": 10227 }, { "epoch": 0.5421249304322476, "grad_norm": 48.0, "kl": 3.05023193359375, "learning_rate": 5e-07, "logits/chosen": -45647382.85714286, "logits/rejected": 30817838.0, "logps/chosen": -174.80583844866072, "logps/rejected": -324.784912109375, "loss": 0.4558, "rewards/chosen": 0.30450756209237234, "rewards/margins": 1.0883576955114092, "rewards/rejected": -0.7838501334190369, "step": 10228 }, { "epoch": 0.5421779344340498, "grad_norm": 38.5, "kl": 0.6594200134277344, "learning_rate": 5e-07, "logits/chosen": -14428120.0, "logits/rejected": -12539565.333333334, "logps/chosen": -459.1177062988281, "logps/rejected": -202.96514892578125, "loss": 0.164, "rewards/chosen": 0.9310181140899658, "rewards/margins": 3.6550397078196206, "rewards/rejected": -2.724021593729655, "step": 10229 }, { "epoch": 0.5422309384358519, "grad_norm": 61.75, "kl": 1.601999282836914, "learning_rate": 5e-07, "logits/chosen": -29202524.0, "logits/rejected": 46137296.0, "logps/chosen": -137.50469970703125, "logps/rejected": -262.6108093261719, "loss": 0.3419, "rewards/chosen": 0.10987721383571625, "rewards/margins": 2.2749873250722885, "rewards/rejected": -2.1651101112365723, "step": 10230 }, { "epoch": 0.542283942437654, "grad_norm": 52.5, "kl": 0.3644065856933594, "learning_rate": 5e-07, "logits/chosen": 148575136.0, "logits/rejected": -18378713.333333332, "logps/chosen": -325.2978515625, "logps/rejected": -146.0552978515625, "loss": 0.2787, "rewards/chosen": 0.9833297729492188, "rewards/margins": 2.1540610790252686, "rewards/rejected": -1.1707313060760498, "step": 10231 }, { "epoch": 0.5423369464394562, "grad_norm": 50.25, "kl": 1.3873443603515625, "learning_rate": 5e-07, "logits/chosen": -18263756.0, "logits/rejected": -28471763.2, "logps/chosen": -284.7593180338542, "logps/rejected": -567.18671875, "loss": 0.2317, "rewards/chosen": 0.9976942539215088, "rewards/margins": 3.5245325565338135, "rewards/rejected": -2.5268383026123047, "step": 10232 }, { "epoch": 0.5423899504412583, "grad_norm": 44.75, "kl": 0.9275937080383301, "learning_rate": 5e-07, "logits/chosen": -14516460.8, "logits/rejected": -23777362.666666668, "logps/chosen": -239.495556640625, "logps/rejected": -276.51015218098956, "loss": 0.3188, "rewards/chosen": 0.40055389404296876, "rewards/margins": 2.898816458384196, "rewards/rejected": -2.498262564341227, "step": 10233 }, { "epoch": 0.5424429544430605, "grad_norm": 49.0, "kl": 0.36810779571533203, "learning_rate": 5e-07, "logits/chosen": -5621292.666666667, "logits/rejected": -26085491.2, "logps/chosen": -92.55197143554688, "logps/rejected": -133.80360107421876, "loss": 0.3713, "rewards/chosen": -0.2983254591623942, "rewards/margins": 1.1819119294484455, "rewards/rejected": -1.4802373886108398, "step": 10234 }, { "epoch": 0.5424959584448625, "grad_norm": 78.0, "kl": 2.858783721923828, "learning_rate": 5e-07, "logits/chosen": -68444812.8, "logits/rejected": -38866696.0, "logps/chosen": -590.41806640625, "logps/rejected": -410.0879313151042, "loss": 0.3434, "rewards/chosen": 0.9184119224548339, "rewards/margins": 2.3999468803405763, "rewards/rejected": -1.4815349578857422, "step": 10235 }, { "epoch": 0.5425489624466647, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3616968.0, "logits/rejected": -25093208.0, "logps/chosen": -334.54852294921875, "logps/rejected": -360.9296875, "loss": 0.253, "rewards/chosen": 0.3259872496128082, "rewards/margins": 2.0960436562697096, "rewards/rejected": -1.7700564066569011, "step": 10236 }, { "epoch": 0.5426019664484668, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1423718.375, "logits/rejected": -52655280.0, "logps/chosen": -91.4007568359375, "logps/rejected": -348.8267415364583, "loss": 0.1561, "rewards/chosen": 0.38646307587623596, "rewards/margins": 3.165347327788671, "rewards/rejected": -2.778884251912435, "step": 10237 }, { "epoch": 0.542654970450269, "grad_norm": 49.75, "kl": 0.740966796875, "learning_rate": 5e-07, "logits/chosen": 15587934.666666666, "logits/rejected": -3494216.0, "logps/chosen": -451.0334065755208, "logps/rejected": -145.49039306640626, "loss": 0.1711, "rewards/chosen": 1.1284406979878743, "rewards/margins": 3.4574667294820145, "rewards/rejected": -2.3290260314941404, "step": 10238 }, { "epoch": 0.5427079744520711, "grad_norm": 41.25, "kl": 0.5114059448242188, "learning_rate": 5e-07, "logits/chosen": -18751984.0, "logits/rejected": -16370380.8, "logps/chosen": -213.90657552083334, "logps/rejected": -244.2904296875, "loss": 0.307, "rewards/chosen": 0.27481818199157715, "rewards/margins": 2.0907351970672607, "rewards/rejected": -1.8159170150756836, "step": 10239 }, { "epoch": 0.5427609784538733, "grad_norm": 66.5, "kl": 3.0339202880859375, "learning_rate": 5e-07, "logits/chosen": -119526592.0, "logits/rejected": 35456372.0, "logps/chosen": -477.29193115234375, "logps/rejected": -445.982666015625, "loss": 0.251, "rewards/chosen": 1.1255815029144287, "rewards/margins": 2.5938172340393066, "rewards/rejected": -1.468235731124878, "step": 10240 }, { "epoch": 0.5428139824556754, "grad_norm": 42.75, "kl": 1.5494155883789062, "learning_rate": 5e-07, "logits/chosen": -36577988.0, "logits/rejected": -31093082.666666668, "logps/chosen": -270.7821044921875, "logps/rejected": -308.4056396484375, "loss": 0.1817, "rewards/chosen": 1.1566787958145142, "rewards/margins": 3.8048715194066367, "rewards/rejected": -2.6481927235921225, "step": 10241 }, { "epoch": 0.5428669864574776, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10173063.333333334, "logits/rejected": -16276944.0, "logps/chosen": -350.0918782552083, "logps/rejected": -255.636181640625, "loss": 0.1853, "rewards/chosen": 0.5576812823613485, "rewards/margins": 3.2946460803349815, "rewards/rejected": -2.736964797973633, "step": 10242 }, { "epoch": 0.5429199904592796, "grad_norm": 45.5, "kl": 2.013652801513672, "learning_rate": 5e-07, "logits/chosen": -22481852.8, "logits/rejected": -31509650.666666668, "logps/chosen": -204.23916015625, "logps/rejected": -291.719970703125, "loss": 0.3002, "rewards/chosen": 0.8655920028686523, "rewards/margins": 2.525928497314453, "rewards/rejected": -1.6603364944458008, "step": 10243 }, { "epoch": 0.5429729944610818, "grad_norm": 45.25, "kl": 1.893035888671875, "learning_rate": 5e-07, "logits/chosen": -28315900.0, "logits/rejected": -48391188.0, "logps/chosen": -224.50738525390625, "logps/rejected": -329.03558349609375, "loss": 0.3061, "rewards/chosen": 0.6363551020622253, "rewards/margins": 2.3056522011756897, "rewards/rejected": -1.6692970991134644, "step": 10244 }, { "epoch": 0.5430259984628839, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34038760.0, "logits/rejected": -29237200.0, "logps/chosen": -147.5750732421875, "logps/rejected": -514.9349975585938, "loss": 0.2679, "rewards/chosen": 0.10764852166175842, "rewards/margins": 2.923708587884903, "rewards/rejected": -2.8160600662231445, "step": 10245 }, { "epoch": 0.5430790024646861, "grad_norm": 39.5, "kl": 0.7583637237548828, "learning_rate": 5e-07, "logits/chosen": 7798612.0, "logits/rejected": -36502121.6, "logps/chosen": -58.87554931640625, "logps/rejected": -303.3395263671875, "loss": 0.27, "rewards/chosen": 0.41665637493133545, "rewards/margins": 2.443699526786804, "rewards/rejected": -2.0270431518554686, "step": 10246 }, { "epoch": 0.5431320064664882, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36065240.0, "logits/rejected": 1957127.1666666667, "logps/chosen": -324.28326416015625, "logps/rejected": -137.18888346354166, "loss": 0.4254, "rewards/chosen": -0.019932560622692108, "rewards/margins": 0.4591212545831998, "rewards/rejected": -0.4790538152058919, "step": 10247 }, { "epoch": 0.5431850104682904, "grad_norm": 31.5, "kl": 1.191152572631836, "learning_rate": 5e-07, "logits/chosen": 5507803.0, "logits/rejected": 2601879.4, "logps/chosen": -55.08678181966146, "logps/rejected": -104.6730712890625, "loss": 0.3086, "rewards/chosen": 0.08237989743550618, "rewards/margins": 1.7418281396230062, "rewards/rejected": -1.6594482421875, "step": 10248 }, { "epoch": 0.5432380144700925, "grad_norm": 41.0, "kl": 1.5342493057250977, "learning_rate": 5e-07, "logits/chosen": -38073704.0, "logits/rejected": -8457794.0, "logps/chosen": -194.1583251953125, "logps/rejected": -242.99560546875, "loss": 0.3626, "rewards/chosen": 0.49150538444519043, "rewards/margins": 2.525744676589966, "rewards/rejected": -2.0342392921447754, "step": 10249 }, { "epoch": 0.5432910184718946, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19659248.0, "logits/rejected": -13138518.666666666, "logps/chosen": -243.621142578125, "logps/rejected": -328.2792154947917, "loss": 0.2822, "rewards/chosen": 0.42861018180847166, "rewards/margins": 3.6240453879038492, "rewards/rejected": -3.1954352060953775, "step": 10250 }, { "epoch": 0.5433440224736967, "grad_norm": 39.0, "kl": 1.0424938201904297, "learning_rate": 5e-07, "logits/chosen": 1468719.0, "logits/rejected": -23071609.6, "logps/chosen": -31.130889892578125, "logps/rejected": -165.86866455078126, "loss": 0.341, "rewards/chosen": -0.37045125166575116, "rewards/margins": 1.3831366618474323, "rewards/rejected": -1.7535879135131835, "step": 10251 }, { "epoch": 0.5433970264754989, "grad_norm": 48.25, "kl": 0.16422653198242188, "learning_rate": 5e-07, "logits/chosen": -48079269.333333336, "logits/rejected": -7559363.2, "logps/chosen": -396.5039876302083, "logps/rejected": -209.7371337890625, "loss": 0.2764, "rewards/chosen": 0.38123472531636554, "rewards/margins": 2.3180436929066977, "rewards/rejected": -1.936808967590332, "step": 10252 }, { "epoch": 0.543450030477301, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24410904.0, "logits/rejected": -13145180.0, "logps/chosen": -376.6099853515625, "logps/rejected": -231.66419982910156, "loss": 0.2748, "rewards/chosen": 0.4450351595878601, "rewards/margins": 3.0833117365837097, "rewards/rejected": -2.6382765769958496, "step": 10253 }, { "epoch": 0.5435030344791032, "grad_norm": 46.0, "kl": 0.7696418762207031, "learning_rate": 5e-07, "logits/chosen": -12598045.6, "logits/rejected": -8548972.666666666, "logps/chosen": -268.632421875, "logps/rejected": -222.59806315104166, "loss": 0.3903, "rewards/chosen": -0.06751210689544677, "rewards/margins": 2.0136023441950477, "rewards/rejected": -2.0811144510904946, "step": 10254 }, { "epoch": 0.5435560384809053, "grad_norm": 55.5, "kl": 1.853921890258789, "learning_rate": 5e-07, "logits/chosen": -26836968.0, "logits/rejected": -34940496.0, "logps/chosen": -328.8314615885417, "logps/rejected": -431.86041259765625, "loss": 0.2622, "rewards/chosen": 1.1054445902506511, "rewards/margins": 3.7040194670359297, "rewards/rejected": -2.5985748767852783, "step": 10255 }, { "epoch": 0.5436090424827075, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48107268.0, "logits/rejected": -51560176.0, "logps/chosen": -171.6624755859375, "logps/rejected": -517.7780151367188, "loss": 0.2055, "rewards/chosen": 0.5784189701080322, "rewards/margins": 3.7691900730133057, "rewards/rejected": -3.1907711029052734, "step": 10256 }, { "epoch": 0.5436620464845096, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61650922.666666664, "logits/rejected": -8516310.4, "logps/chosen": -438.6048583984375, "logps/rejected": -518.797216796875, "loss": 0.2413, "rewards/chosen": 0.3914591471354167, "rewards/margins": 2.3959044138590495, "rewards/rejected": -2.004445266723633, "step": 10257 }, { "epoch": 0.5437150504863117, "grad_norm": 33.25, "kl": 0.341799259185791, "learning_rate": 5e-07, "logits/chosen": -28003036.0, "logits/rejected": -12996764.0, "logps/chosen": -149.5222625732422, "logps/rejected": -179.7046915690104, "loss": 0.2421, "rewards/chosen": -0.06603008508682251, "rewards/margins": 1.907386561234792, "rewards/rejected": -1.9734166463216145, "step": 10258 }, { "epoch": 0.5437680544881138, "grad_norm": 44.25, "kl": 3.534176826477051, "learning_rate": 5e-07, "logits/chosen": -31110613.333333332, "logits/rejected": -87361496.0, "logps/chosen": -481.4820963541667, "logps/rejected": -257.6583557128906, "loss": 0.4117, "rewards/chosen": 0.6346549193064371, "rewards/margins": 3.1112541357676187, "rewards/rejected": -2.4765992164611816, "step": 10259 }, { "epoch": 0.543821058489916, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21243498.666666668, "logits/rejected": -34231110.4, "logps/chosen": -74.59679158528645, "logps/rejected": -229.019287109375, "loss": 0.2676, "rewards/chosen": -0.036333719889322914, "rewards/margins": 1.9844415028889972, "rewards/rejected": -2.0207752227783202, "step": 10260 }, { "epoch": 0.5438740624917181, "grad_norm": 34.75, "kl": 0.09939384460449219, "learning_rate": 5e-07, "logits/chosen": -38003176.0, "logits/rejected": -19698714.0, "logps/chosen": -125.33061981201172, "logps/rejected": -235.92413330078125, "loss": 0.2275, "rewards/chosen": 0.8908705711364746, "rewards/margins": 3.3893003463745117, "rewards/rejected": -2.498429775238037, "step": 10261 }, { "epoch": 0.5439270664935203, "grad_norm": 38.75, "kl": 0.37888526916503906, "learning_rate": 5e-07, "logits/chosen": -53538280.0, "logits/rejected": 161304448.0, "logps/chosen": -182.81793212890625, "logps/rejected": -327.55169677734375, "loss": 0.2655, "rewards/chosen": 0.7245585322380066, "rewards/margins": 2.288477838039398, "rewards/rejected": -1.5639193058013916, "step": 10262 }, { "epoch": 0.5439800704953224, "grad_norm": 55.25, "kl": 2.515225410461426, "learning_rate": 5e-07, "logits/chosen": -19661789.714285713, "logits/rejected": -43700276.0, "logps/chosen": -269.2344273158482, "logps/rejected": -554.3848266601562, "loss": 0.3472, "rewards/chosen": 0.8845298630850655, "rewards/margins": 4.669509206499372, "rewards/rejected": -3.7849793434143066, "step": 10263 }, { "epoch": 0.5440330744971246, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -171182250.66666666, "logits/rejected": -11392168.0, "logps/chosen": -398.0728352864583, "logps/rejected": -323.9401123046875, "loss": 0.2454, "rewards/chosen": 0.2606635093688965, "rewards/margins": 2.3658480644226074, "rewards/rejected": -2.105184555053711, "step": 10264 }, { "epoch": 0.5440860784989267, "grad_norm": 57.0, "kl": 1.5062313079833984, "learning_rate": 5e-07, "logits/chosen": -17046313.333333332, "logits/rejected": -52887209.6, "logps/chosen": -230.54923502604166, "logps/rejected": -480.9263671875, "loss": 0.3821, "rewards/chosen": -0.21101915836334229, "rewards/margins": 1.6958217382431031, "rewards/rejected": -1.9068408966064454, "step": 10265 }, { "epoch": 0.5441390825007288, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22960600.0, "logits/rejected": -4778820.4, "logps/chosen": -355.8954671223958, "logps/rejected": -173.2421875, "loss": 0.1555, "rewards/chosen": 1.4339148203531902, "rewards/margins": 4.265055720011393, "rewards/rejected": -2.831140899658203, "step": 10266 }, { "epoch": 0.5441920865025309, "grad_norm": 46.5, "kl": 0.07095527648925781, "learning_rate": 5e-07, "logits/chosen": -13836049.0, "logits/rejected": -28721412.0, "logps/chosen": -239.90240478515625, "logps/rejected": -361.92498779296875, "loss": 0.258, "rewards/chosen": 0.46786755323410034, "rewards/margins": 2.960236966609955, "rewards/rejected": -2.4923694133758545, "step": 10267 }, { "epoch": 0.5442450905043331, "grad_norm": 41.25, "kl": 1.0351171493530273, "learning_rate": 5e-07, "logits/chosen": -62584772.0, "logits/rejected": -26549440.0, "logps/chosen": -552.6817626953125, "logps/rejected": -333.1891784667969, "loss": 0.1856, "rewards/chosen": 1.489087462425232, "rewards/margins": 4.569077849388123, "rewards/rejected": -3.0799903869628906, "step": 10268 }, { "epoch": 0.5442980945061352, "grad_norm": 32.5, "kl": 0.960148811340332, "learning_rate": 5e-07, "logits/chosen": -23840682.666666668, "logits/rejected": 50890496.0, "logps/chosen": -186.9276326497396, "logps/rejected": -597.90888671875, "loss": 0.2729, "rewards/chosen": -0.13823224107424417, "rewards/margins": 2.57497195204099, "rewards/rejected": -2.713204193115234, "step": 10269 }, { "epoch": 0.5443510985079374, "grad_norm": 61.0, "kl": 1.926142692565918, "learning_rate": 5e-07, "logits/chosen": -43949560.0, "logits/rejected": -1580117.0, "logps/chosen": -444.2580159505208, "logps/rejected": -266.55804443359375, "loss": 0.3725, "rewards/chosen": 0.32834388812383014, "rewards/margins": 3.965599556763967, "rewards/rejected": -3.6372556686401367, "step": 10270 }, { "epoch": 0.5444041025097395, "grad_norm": 47.75, "kl": 5.332996368408203, "learning_rate": 5e-07, "logits/chosen": -26893424.0, "logits/rejected": -5982606.5, "logps/chosen": -540.58056640625, "logps/rejected": -77.08842468261719, "loss": 0.3599, "rewards/chosen": 0.9230947494506836, "rewards/margins": 3.5190460681915283, "rewards/rejected": -2.5959513187408447, "step": 10271 }, { "epoch": 0.5444571065115417, "grad_norm": 47.25, "kl": 0.8282558917999268, "learning_rate": 5e-07, "logits/chosen": 127344688.0, "logits/rejected": -50741436.0, "logps/chosen": -248.29306030273438, "logps/rejected": -463.579833984375, "loss": 0.2887, "rewards/chosen": 0.06356102228164673, "rewards/margins": 3.3166165947914124, "rewards/rejected": -3.2530555725097656, "step": 10272 }, { "epoch": 0.5445101105133437, "grad_norm": 36.0, "kl": 0.407501220703125, "learning_rate": 5e-07, "logits/chosen": -23102957.333333332, "logits/rejected": -35946483.2, "logps/chosen": -228.99214680989584, "logps/rejected": -486.97060546875, "loss": 0.2255, "rewards/chosen": 0.23227806886037192, "rewards/margins": 3.063956650098165, "rewards/rejected": -2.831678581237793, "step": 10273 }, { "epoch": 0.5445631145151459, "grad_norm": 55.75, "kl": 0.691070556640625, "learning_rate": 5e-07, "logits/chosen": -16063985.0, "logits/rejected": -19424718.0, "logps/chosen": -148.5115966796875, "logps/rejected": -184.7001495361328, "loss": 0.3724, "rewards/chosen": 0.2181028425693512, "rewards/margins": 1.2837906181812286, "rewards/rejected": -1.0656877756118774, "step": 10274 }, { "epoch": 0.544616118516948, "grad_norm": 91.0, "kl": 1.6926860809326172, "learning_rate": 5e-07, "logits/chosen": -73109920.0, "logits/rejected": -29958848.0, "logps/chosen": -391.2138977050781, "logps/rejected": -186.04129028320312, "loss": 0.3325, "rewards/chosen": 0.17045392096042633, "rewards/margins": 2.580409422516823, "rewards/rejected": -2.4099555015563965, "step": 10275 }, { "epoch": 0.5446691225187502, "grad_norm": 47.75, "kl": 1.5838127136230469, "learning_rate": 5e-07, "logits/chosen": -18597810.0, "logits/rejected": -11792573.0, "logps/chosen": -242.86390686035156, "logps/rejected": -168.2550506591797, "loss": 0.3489, "rewards/chosen": 0.2913726270198822, "rewards/margins": 1.7746311128139496, "rewards/rejected": -1.4832584857940674, "step": 10276 }, { "epoch": 0.5447221265205523, "grad_norm": 54.75, "kl": 1.4966144561767578, "learning_rate": 5e-07, "logits/chosen": -88478739.2, "logits/rejected": 3167119.6666666665, "logps/chosen": -396.0181640625, "logps/rejected": -87.79750569661458, "loss": 0.3069, "rewards/chosen": 0.6933572292327881, "rewards/margins": 2.701365868250529, "rewards/rejected": -2.0080086390177407, "step": 10277 }, { "epoch": 0.5447751305223545, "grad_norm": 28.75, "kl": 0.8877615928649902, "learning_rate": 5e-07, "logits/chosen": -19211259.2, "logits/rejected": -47235536.0, "logps/chosen": -155.825, "logps/rejected": -375.6728108723958, "loss": 0.237, "rewards/chosen": 1.0630913734436036, "rewards/margins": 3.6607352256774903, "rewards/rejected": -2.5976438522338867, "step": 10278 }, { "epoch": 0.5448281345241566, "grad_norm": 62.25, "kl": 0.9082412719726562, "learning_rate": 5e-07, "logits/chosen": -5897450.5, "logits/rejected": -12290604.0, "logps/chosen": -502.04400634765625, "logps/rejected": -447.8171081542969, "loss": 0.2973, "rewards/chosen": 0.5817672610282898, "rewards/margins": 2.6824750304222107, "rewards/rejected": -2.100707769393921, "step": 10279 }, { "epoch": 0.5448811385259588, "grad_norm": 52.25, "kl": 3.356194496154785, "learning_rate": 5e-07, "logits/chosen": -13619537.6, "logits/rejected": 5791654.0, "logps/chosen": -266.3367431640625, "logps/rejected": -162.46497599283853, "loss": 0.3021, "rewards/chosen": 1.2208446502685546, "rewards/margins": 2.8609285990397133, "rewards/rejected": -1.640083948771159, "step": 10280 }, { "epoch": 0.5449341425277608, "grad_norm": 58.0, "kl": 0.9279003143310547, "learning_rate": 5e-07, "logits/chosen": -14631454.0, "logits/rejected": -21993190.0, "logps/chosen": -151.1905517578125, "logps/rejected": -184.92013549804688, "loss": 0.4251, "rewards/chosen": -0.1530582457780838, "rewards/margins": 1.2767688482999802, "rewards/rejected": -1.429827094078064, "step": 10281 }, { "epoch": 0.5449871465295629, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9985472.666666666, "logits/rejected": -38481302.4, "logps/chosen": -178.12540690104166, "logps/rejected": -192.92750244140626, "loss": 0.2081, "rewards/chosen": 0.32859055201212567, "rewards/margins": 3.311895720163981, "rewards/rejected": -2.9833051681518556, "step": 10282 }, { "epoch": 0.5450401505313651, "grad_norm": 56.75, "kl": 0.71710205078125, "learning_rate": 5e-07, "logits/chosen": 22472986.0, "logits/rejected": -31748866.0, "logps/chosen": -336.8744201660156, "logps/rejected": -501.22625732421875, "loss": 0.268, "rewards/chosen": 0.2586601972579956, "rewards/margins": 3.05034339427948, "rewards/rejected": -2.7916831970214844, "step": 10283 }, { "epoch": 0.5450931545331672, "grad_norm": 46.0, "kl": 0.4971427917480469, "learning_rate": 5e-07, "logits/chosen": -30888042.666666668, "logits/rejected": -67943795.2, "logps/chosen": -431.649658203125, "logps/rejected": -377.5770263671875, "loss": 0.1944, "rewards/chosen": 0.8373235066731771, "rewards/margins": 3.8913382848103844, "rewards/rejected": -3.054014778137207, "step": 10284 }, { "epoch": 0.5451461585349694, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13476128.0, "logits/rejected": -6980883.5, "logps/chosen": -160.05148315429688, "logps/rejected": -158.06124877929688, "loss": 0.2996, "rewards/chosen": 0.16205808520317078, "rewards/margins": 2.4777385890483856, "rewards/rejected": -2.315680503845215, "step": 10285 }, { "epoch": 0.5451991625367715, "grad_norm": 52.25, "kl": 0.3036994934082031, "learning_rate": 5e-07, "logits/chosen": -46151644.8, "logits/rejected": -57240016.0, "logps/chosen": -389.0896240234375, "logps/rejected": -289.6979166666667, "loss": 0.2395, "rewards/chosen": 1.1632862091064453, "rewards/margins": 2.790634473164876, "rewards/rejected": -1.627348264058431, "step": 10286 }, { "epoch": 0.5452521665385737, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21345542.666666668, "logits/rejected": -14160678.4, "logps/chosen": -323.4794921875, "logps/rejected": -283.92607421875, "loss": 0.2725, "rewards/chosen": 0.42291259765625, "rewards/margins": 2.3342031478881835, "rewards/rejected": -1.9112905502319335, "step": 10287 }, { "epoch": 0.5453051705403757, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35214400.0, "logits/rejected": -24625952.0, "logps/chosen": -298.196533203125, "logps/rejected": -336.09023030598956, "loss": 0.2711, "rewards/chosen": 0.5678474307060242, "rewards/margins": 2.296116689840953, "rewards/rejected": -1.7282692591349285, "step": 10288 }, { "epoch": 0.5453581745421779, "grad_norm": 49.25, "kl": 1.4617013931274414, "learning_rate": 5e-07, "logits/chosen": -27043584.0, "logits/rejected": -1138093.3333333333, "logps/chosen": -263.34013671875, "logps/rejected": -147.17095947265625, "loss": 0.3277, "rewards/chosen": 0.4521904468536377, "rewards/margins": 2.4448428630828856, "rewards/rejected": -1.992652416229248, "step": 10289 }, { "epoch": 0.54541117854398, "grad_norm": 42.0, "kl": 5.990932464599609, "learning_rate": 5e-07, "logits/chosen": -10286985.0, "logits/rejected": -7138256.5, "logps/chosen": -1504.6810302734375, "logps/rejected": -105.87179565429688, "loss": 0.2305, "rewards/chosen": 2.5870683193206787, "rewards/margins": 4.471007466316223, "rewards/rejected": -1.8839391469955444, "step": 10290 }, { "epoch": 0.5454641825457822, "grad_norm": 48.25, "kl": 2.151937484741211, "learning_rate": 5e-07, "logits/chosen": -11343060.0, "logits/rejected": -24999824.0, "logps/chosen": -175.4835205078125, "logps/rejected": -286.03973388671875, "loss": 0.3592, "rewards/chosen": 0.49248796701431274, "rewards/margins": 1.9547147154808044, "rewards/rejected": -1.4622267484664917, "step": 10291 }, { "epoch": 0.5455171865475843, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55362128.0, "logits/rejected": -29465024.0, "logps/chosen": -1046.07861328125, "logps/rejected": -317.58095296223956, "loss": 0.1479, "rewards/chosen": 1.5122191905975342, "rewards/margins": 4.093962589899698, "rewards/rejected": -2.5817433993021646, "step": 10292 }, { "epoch": 0.5455701905493865, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64840696.0, "logits/rejected": -33510510.0, "logps/chosen": -249.8138427734375, "logps/rejected": -258.92132568359375, "loss": 0.3288, "rewards/chosen": 0.0589752197265625, "rewards/margins": 1.7184795141220093, "rewards/rejected": -1.6595042943954468, "step": 10293 }, { "epoch": 0.5456231945511886, "grad_norm": 70.0, "kl": 8.121757507324219, "learning_rate": 5e-07, "logits/chosen": -30111920.0, "logps/chosen": -380.6864318847656, "loss": 0.4549, "rewards/chosen": 1.0320484638214111, "step": 10294 }, { "epoch": 0.5456761985529908, "grad_norm": 104.5, "kl": 0.824213981628418, "learning_rate": 5e-07, "logits/chosen": -23540698.666666668, "logits/rejected": 708950.5, "logps/chosen": -268.81435139973956, "logps/rejected": -239.6756134033203, "loss": 0.3873, "rewards/chosen": 0.3061808745066325, "rewards/margins": 1.9182750384012859, "rewards/rejected": -1.6120941638946533, "step": 10295 }, { "epoch": 0.5457292025547928, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -121942968.0, "logits/rejected": -28402650.666666668, "logps/chosen": -356.973876953125, "logps/rejected": -355.638427734375, "loss": 0.2703, "rewards/chosen": 0.015454098582267761, "rewards/margins": 1.6554655681053798, "rewards/rejected": -1.640011469523112, "step": 10296 }, { "epoch": 0.545782206556595, "grad_norm": 62.25, "kl": 3.2834434509277344, "learning_rate": 5e-07, "logits/chosen": -32967876.57142857, "logits/rejected": -141203232.0, "logps/chosen": -566.9822474888393, "logps/rejected": -357.92633056640625, "loss": 0.2877, "rewards/chosen": 1.2755000250680106, "rewards/margins": 3.2563502447945734, "rewards/rejected": -1.9808502197265625, "step": 10297 }, { "epoch": 0.5458352105583971, "grad_norm": 60.5, "kl": 1.4874591827392578, "learning_rate": 5e-07, "logits/chosen": -14283686.666666666, "logits/rejected": -78166656.0, "logps/chosen": -494.1044108072917, "logps/rejected": -324.1338806152344, "loss": 0.3473, "rewards/chosen": 0.5919491052627563, "rewards/margins": 2.1844723224639893, "rewards/rejected": -1.592523217201233, "step": 10298 }, { "epoch": 0.5458882145601993, "grad_norm": 46.25, "kl": 4.120235443115234, "learning_rate": 5e-07, "logits/chosen": -14324784.0, "logits/rejected": -23897966.0, "logps/chosen": -206.6357879638672, "logps/rejected": -273.29608154296875, "loss": 0.331, "rewards/chosen": 0.5450602769851685, "rewards/margins": 2.205370545387268, "rewards/rejected": -1.6603102684020996, "step": 10299 }, { "epoch": 0.5459412185620014, "grad_norm": 45.0, "kl": 0.08287239074707031, "learning_rate": 5e-07, "logits/chosen": -10771130.0, "logits/rejected": -8008667.5, "logps/chosen": -318.9122009277344, "logps/rejected": -144.37855529785156, "loss": 0.2303, "rewards/chosen": 0.706692099571228, "rewards/margins": 3.7419837713241577, "rewards/rejected": -3.0352916717529297, "step": 10300 }, { "epoch": 0.5459942225638036, "grad_norm": 50.25, "kl": 0.4395256042480469, "learning_rate": 5e-07, "logits/chosen": -16103904.0, "logits/rejected": -20689954.666666668, "logps/chosen": -208.33935546875, "logps/rejected": -286.24853515625, "loss": 0.3331, "rewards/chosen": 0.2772336959838867, "rewards/margins": 2.2995226542154947, "rewards/rejected": -2.022288958231608, "step": 10301 }, { "epoch": 0.5460472265656057, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9822188.0, "logits/rejected": -61175516.0, "logps/chosen": -161.02931213378906, "logps/rejected": -282.1403503417969, "loss": 0.3097, "rewards/chosen": 0.00587482750415802, "rewards/margins": 2.039134219288826, "rewards/rejected": -2.033259391784668, "step": 10302 }, { "epoch": 0.5461002305674079, "grad_norm": 51.75, "kl": 1.1608161926269531, "learning_rate": 5e-07, "logits/chosen": -14549869.714285715, "logits/rejected": -3772962.0, "logps/chosen": -322.32578822544644, "logps/rejected": -88.91108703613281, "loss": 0.3443, "rewards/chosen": 0.7050235612051827, "rewards/margins": 2.5094615561621527, "rewards/rejected": -1.8044379949569702, "step": 10303 }, { "epoch": 0.5461532345692099, "grad_norm": 51.75, "kl": 0.5033931732177734, "learning_rate": 5e-07, "logits/chosen": -73066704.0, "logits/rejected": -30202540.0, "logps/chosen": -190.48219299316406, "logps/rejected": -277.6069030761719, "loss": 0.3154, "rewards/chosen": -0.09252339601516724, "rewards/margins": 2.348573625087738, "rewards/rejected": -2.4410970211029053, "step": 10304 }, { "epoch": 0.5462062385710121, "grad_norm": 60.5, "kl": 1.5933761596679688, "learning_rate": 5e-07, "logits/chosen": -10569826.4, "logits/rejected": -34144320.0, "logps/chosen": -310.4211669921875, "logps/rejected": -476.21044921875, "loss": 0.2509, "rewards/chosen": 0.8104192733764648, "rewards/margins": 4.508436393737793, "rewards/rejected": -3.698017120361328, "step": 10305 }, { "epoch": 0.5462592425728142, "grad_norm": 45.75, "kl": 2.344766616821289, "learning_rate": 5e-07, "logits/chosen": 13555594.666666666, "logits/rejected": -12098464.0, "logps/chosen": -68.55774434407552, "logps/rejected": -261.3234619140625, "loss": 0.3329, "rewards/chosen": -0.22568054993947348, "rewards/margins": 2.6203090588251747, "rewards/rejected": -2.8459896087646483, "step": 10306 }, { "epoch": 0.5463122465746164, "grad_norm": 50.5, "kl": 2.1661720275878906, "learning_rate": 5e-07, "logits/chosen": -16872.0, "logits/rejected": 3958423.5, "logps/chosen": -276.57232666015625, "logps/rejected": -345.41357421875, "loss": 0.3615, "rewards/chosen": -0.12304116040468216, "rewards/margins": 2.2792620584368706, "rewards/rejected": -2.4023032188415527, "step": 10307 }, { "epoch": 0.5463652505764185, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -122491392.0, "logits/rejected": -40557098.666666664, "logps/chosen": -707.3358154296875, "logps/rejected": -273.696533203125, "loss": 0.1936, "rewards/chosen": 0.5207778811454773, "rewards/margins": 2.738629996776581, "rewards/rejected": -2.2178521156311035, "step": 10308 }, { "epoch": 0.5464182545782207, "grad_norm": 75.5, "kl": 2.2208099365234375, "learning_rate": 5e-07, "logits/chosen": 21047216.0, "logits/rejected": -24040138.666666668, "logps/chosen": -433.48564453125, "logps/rejected": -461.3877360026042, "loss": 0.3391, "rewards/chosen": 0.30025634765625, "rewards/margins": 3.4853956858317057, "rewards/rejected": -3.1851393381754556, "step": 10309 }, { "epoch": 0.5464712585800228, "grad_norm": 57.75, "kl": 0.728607177734375, "learning_rate": 5e-07, "logits/chosen": -4831465.6, "logits/rejected": -17148288.0, "logps/chosen": -291.3998046875, "logps/rejected": -163.44393920898438, "loss": 0.3816, "rewards/chosen": 0.2726428508758545, "rewards/margins": 1.3426576455434163, "rewards/rejected": -1.0700147946675618, "step": 10310 }, { "epoch": 0.546524262581825, "grad_norm": 49.25, "kl": 0.9487380981445312, "learning_rate": 5e-07, "logits/chosen": -18800912.0, "logits/rejected": -3672364.8, "logps/chosen": -285.212890625, "logps/rejected": -297.3241455078125, "loss": 0.3015, "rewards/chosen": 0.44515403111775714, "rewards/margins": 2.641373284657796, "rewards/rejected": -2.196219253540039, "step": 10311 }, { "epoch": 0.546577266583627, "grad_norm": 62.5, "kl": 1.7361717224121094, "learning_rate": 5e-07, "logits/chosen": 17703764.0, "logits/rejected": -6541752.5, "logps/chosen": -458.0707702636719, "logps/rejected": -231.63885498046875, "loss": 0.3003, "rewards/chosen": 0.6068222522735596, "rewards/margins": 2.691477060317993, "rewards/rejected": -2.0846548080444336, "step": 10312 }, { "epoch": 0.5466302705854292, "grad_norm": 22.875, "kl": 1.1365585327148438, "learning_rate": 5e-07, "logits/chosen": -1126340.5, "logits/rejected": -20578256.0, "logps/chosen": -224.50936889648438, "logps/rejected": -348.56890869140625, "loss": 0.2043, "rewards/chosen": 1.1259509325027466, "rewards/margins": 4.261988997459412, "rewards/rejected": -3.136038064956665, "step": 10313 }, { "epoch": 0.5466832745872313, "grad_norm": 59.25, "kl": 1.8741378784179688, "learning_rate": 5e-07, "logits/chosen": -49540773.333333336, "logits/rejected": 271212.8125, "logps/chosen": -530.981201171875, "logps/rejected": -90.93260192871094, "loss": 0.4056, "rewards/chosen": 0.20988313357035318, "rewards/margins": 2.56374184290568, "rewards/rejected": -2.353858709335327, "step": 10314 }, { "epoch": 0.5467362785890335, "grad_norm": 46.0, "kl": 2.160097122192383, "learning_rate": 5e-07, "logits/chosen": -33744656.0, "logits/rejected": -3354014.5, "logps/chosen": -322.8551330566406, "logps/rejected": -289.04888916015625, "loss": 0.3053, "rewards/chosen": 0.3831297755241394, "rewards/margins": 2.1655072569847107, "rewards/rejected": -1.7823774814605713, "step": 10315 }, { "epoch": 0.5467892825908356, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -331335.5, "logits/rejected": -20940120.0, "logps/chosen": -41.89775848388672, "logps/rejected": -235.4499715169271, "loss": 0.2257, "rewards/chosen": 0.6273972392082214, "rewards/margins": 2.919700006643931, "rewards/rejected": -2.2923027674357095, "step": 10316 }, { "epoch": 0.5468422865926378, "grad_norm": 43.25, "kl": 0.9225711822509766, "learning_rate": 5e-07, "logits/chosen": -15065022.4, "logits/rejected": -71200042.66666667, "logps/chosen": -261.372900390625, "logps/rejected": -102.69722493489583, "loss": 0.2807, "rewards/chosen": 0.8367094039916992, "rewards/margins": 2.635517183939616, "rewards/rejected": -1.7988077799479167, "step": 10317 }, { "epoch": 0.5468952905944399, "grad_norm": 54.75, "kl": 1.593958854675293, "learning_rate": 5e-07, "logits/chosen": -47233056.0, "logits/rejected": 8786513.333333334, "logps/chosen": -206.1659423828125, "logps/rejected": -217.77498372395834, "loss": 0.2761, "rewards/chosen": 0.7707038402557373, "rewards/margins": 2.927537329991659, "rewards/rejected": -2.1568334897359214, "step": 10318 }, { "epoch": 0.5469482945962421, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68864346.66666667, "logits/rejected": -737056.8, "logps/chosen": -579.062744140625, "logps/rejected": -440.444189453125, "loss": 0.2484, "rewards/chosen": 1.0967906316121419, "rewards/margins": 3.6105948766072586, "rewards/rejected": -2.513804244995117, "step": 10319 }, { "epoch": 0.5470012985980441, "grad_norm": 95.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 49805116.0, "logits/rejected": 731759.4285714285, "logps/chosen": -621.7955322265625, "logps/rejected": -257.03909737723217, "loss": 0.1932, "rewards/chosen": 0.823138415813446, "rewards/margins": 2.4893758382116045, "rewards/rejected": -1.6662374223981584, "step": 10320 }, { "epoch": 0.5470543025998463, "grad_norm": 42.0, "kl": 0.8244647979736328, "learning_rate": 5e-07, "logits/chosen": 8673111.0, "logits/rejected": -24955052.0, "logps/chosen": -82.08430480957031, "logps/rejected": -301.1907653808594, "loss": 0.3063, "rewards/chosen": 0.39750564098358154, "rewards/margins": 2.0577688217163086, "rewards/rejected": -1.660263180732727, "step": 10321 }, { "epoch": 0.5471073066016484, "grad_norm": 44.75, "kl": 2.3063583374023438, "learning_rate": 5e-07, "logits/chosen": -50852044.8, "logits/rejected": -57442256.0, "logps/chosen": -166.365966796875, "logps/rejected": -557.3787434895834, "loss": 0.3457, "rewards/chosen": 0.3877244234085083, "rewards/margins": 2.5858375469843544, "rewards/rejected": -2.198113123575846, "step": 10322 }, { "epoch": 0.5471603106034506, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8749576.0, "logits/rejected": -18722481.333333332, "logps/chosen": -378.5846252441406, "logps/rejected": -295.2527669270833, "loss": 0.2066, "rewards/chosen": 0.0060520172119140625, "rewards/margins": 2.4385476112365723, "rewards/rejected": -2.432495594024658, "step": 10323 }, { "epoch": 0.5472133146052527, "grad_norm": 50.75, "kl": 2.337299346923828, "learning_rate": 5e-07, "logits/chosen": -15428141.333333334, "logits/rejected": -15382752.0, "logps/chosen": -407.6617024739583, "logps/rejected": -481.92978515625, "loss": 0.2228, "rewards/chosen": 1.0673166910807292, "rewards/margins": 4.189919344584148, "rewards/rejected": -3.122602653503418, "step": 10324 }, { "epoch": 0.5472663186070549, "grad_norm": 44.75, "kl": 1.538747787475586, "learning_rate": 5e-07, "logits/chosen": -15655473.6, "logits/rejected": -45160426.666666664, "logps/chosen": -430.2671875, "logps/rejected": -536.8053792317709, "loss": 0.2701, "rewards/chosen": 0.7523616313934326, "rewards/margins": 3.725044266382853, "rewards/rejected": -2.9726826349894204, "step": 10325 }, { "epoch": 0.547319322608857, "grad_norm": 37.5, "kl": 1.3235588073730469, "learning_rate": 5e-07, "logits/chosen": -16796352.0, "logits/rejected": -35477220.0, "logps/chosen": -295.66180419921875, "logps/rejected": -377.6606140136719, "loss": 0.2794, "rewards/chosen": 0.5656589865684509, "rewards/margins": 3.301384389400482, "rewards/rejected": -2.7357254028320312, "step": 10326 }, { "epoch": 0.5473723266106592, "grad_norm": 47.5, "kl": 2.18131160736084, "learning_rate": 5e-07, "logits/chosen": -15144292.8, "logits/rejected": -7021794.666666667, "logps/chosen": -136.67900390625, "logps/rejected": -86.291259765625, "loss": 0.4612, "rewards/chosen": -0.02932872772216797, "rewards/margins": 1.0023670355478924, "rewards/rejected": -1.0316957632700603, "step": 10327 }, { "epoch": 0.5474253306124612, "grad_norm": 96.5, "kl": 2.7710647583007812, "learning_rate": 5e-07, "logits/chosen": -48528288.0, "logits/rejected": -1392483.0, "logps/chosen": -496.1670328776042, "logps/rejected": -73.33683776855469, "loss": 0.3421, "rewards/chosen": 0.970344066619873, "rewards/margins": 2.080211877822876, "rewards/rejected": -1.109867811203003, "step": 10328 }, { "epoch": 0.5474783346142634, "grad_norm": 37.0, "kl": 1.3163642883300781, "learning_rate": 5e-07, "logits/chosen": -26173636.0, "logits/rejected": -16909706.0, "logps/chosen": -151.77175903320312, "logps/rejected": -143.14840698242188, "loss": 0.4217, "rewards/chosen": -0.10851383209228516, "rewards/margins": 1.1128108501434326, "rewards/rejected": -1.2213246822357178, "step": 10329 }, { "epoch": 0.5475313386160655, "grad_norm": 39.5, "kl": 1.6741600036621094, "learning_rate": 5e-07, "logits/chosen": -38090669.333333336, "logits/rejected": -17246674.0, "logps/chosen": -244.4268595377604, "logps/rejected": -383.13568115234375, "loss": 0.3419, "rewards/chosen": 0.629207452138265, "rewards/margins": 2.819310744603475, "rewards/rejected": -2.19010329246521, "step": 10330 }, { "epoch": 0.5475843426178677, "grad_norm": 49.25, "kl": 0.42877197265625, "learning_rate": 5e-07, "logits/chosen": -19312776.0, "logits/rejected": -3879293.6666666665, "logps/chosen": -240.38466796875, "logps/rejected": -190.40142822265625, "loss": 0.2841, "rewards/chosen": 0.6484253883361817, "rewards/margins": 2.6292519251505535, "rewards/rejected": -1.9808265368143718, "step": 10331 }, { "epoch": 0.5476373466196698, "grad_norm": 42.75, "kl": 3.458209991455078, "learning_rate": 5e-07, "logits/chosen": -36282748.0, "logits/rejected": -9023868.0, "logps/chosen": -659.8935546875, "logps/rejected": -212.72235107421875, "loss": 0.2517, "rewards/chosen": 1.2224712371826172, "rewards/margins": 4.01062536239624, "rewards/rejected": -2.788154125213623, "step": 10332 }, { "epoch": 0.5476903506214719, "grad_norm": 69.5, "kl": 1.2556915283203125, "learning_rate": 5e-07, "logits/chosen": -58377520.0, "logits/rejected": -53102728.0, "logps/chosen": -452.8340250651042, "logps/rejected": -365.7691955566406, "loss": 0.3932, "rewards/chosen": 0.28681743144989014, "rewards/margins": 2.017222046852112, "rewards/rejected": -1.7304046154022217, "step": 10333 }, { "epoch": 0.5477433546232741, "grad_norm": 35.75, "kl": 0.13909530639648438, "learning_rate": 5e-07, "logits/chosen": -203436.0, "logits/rejected": -23407172.0, "logps/chosen": -142.26905822753906, "logps/rejected": -259.11468505859375, "loss": 0.2432, "rewards/chosen": 0.571685254573822, "rewards/margins": 2.8408841490745544, "rewards/rejected": -2.2691988945007324, "step": 10334 }, { "epoch": 0.5477963586250761, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32420386.666666668, "logits/rejected": -29145539.2, "logps/chosen": -176.20979817708334, "logps/rejected": -263.6375, "loss": 0.389, "rewards/chosen": -0.6596855322519938, "rewards/margins": 1.0626935799916586, "rewards/rejected": -1.7223791122436523, "step": 10335 }, { "epoch": 0.5478493626268783, "grad_norm": 69.5, "kl": 2.1397743225097656, "learning_rate": 5e-07, "logits/chosen": -53524298.666666664, "logits/rejected": -9349075.0, "logps/chosen": -384.7872314453125, "logps/rejected": -326.5813903808594, "loss": 0.2494, "rewards/chosen": 1.3778978983561199, "rewards/margins": 4.062761704126994, "rewards/rejected": -2.684863805770874, "step": 10336 }, { "epoch": 0.5479023666286804, "grad_norm": 41.0, "kl": 1.6564922332763672, "learning_rate": 5e-07, "logits/chosen": -250307.625, "logits/rejected": -31972182.0, "logps/chosen": -241.555908203125, "logps/rejected": -326.2198791503906, "loss": 0.3049, "rewards/chosen": 0.14449501037597656, "rewards/margins": 2.840531826019287, "rewards/rejected": -2.6960368156433105, "step": 10337 }, { "epoch": 0.5479553706304826, "grad_norm": 50.0, "kl": 3.2546157836914062, "learning_rate": 5e-07, "logits/chosen": -186080.3125, "logits/rejected": -397478.75, "logps/chosen": -586.980712890625, "logps/rejected": -158.8978271484375, "loss": 0.2305, "rewards/chosen": 1.4265587329864502, "rewards/margins": 3.137016177177429, "rewards/rejected": -1.710457444190979, "step": 10338 }, { "epoch": 0.5480083746322847, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16485813.333333334, "logits/rejected": -10859787.2, "logps/chosen": -544.2052408854166, "logps/rejected": -480.172216796875, "loss": 0.1544, "rewards/chosen": 1.2951242129007976, "rewards/margins": 4.2657392183939615, "rewards/rejected": -2.970615005493164, "step": 10339 }, { "epoch": 0.5480613786340869, "grad_norm": 40.0, "kl": 1.3341827392578125, "learning_rate": 5e-07, "logits/chosen": -56694076.0, "logits/rejected": -4879698.0, "logps/chosen": -143.67857360839844, "logps/rejected": -277.6966552734375, "loss": 0.2137, "rewards/chosen": 0.04818287491798401, "rewards/margins": 2.857901324828466, "rewards/rejected": -2.809718449910482, "step": 10340 }, { "epoch": 0.548114382635889, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55732864.0, "logits/rejected": -24142683.2, "logps/chosen": -416.1060791015625, "logps/rejected": -249.277294921875, "loss": 0.2903, "rewards/chosen": 0.08056233823299408, "rewards/margins": 2.1065392822027205, "rewards/rejected": -2.0259769439697264, "step": 10341 }, { "epoch": 0.5481673866376912, "grad_norm": 67.0, "kl": 1.650075912475586, "learning_rate": 5e-07, "logits/chosen": -40002214.4, "logits/rejected": -34584370.666666664, "logps/chosen": -362.7971923828125, "logps/rejected": -600.4037272135416, "loss": 0.3078, "rewards/chosen": 0.39529540538787844, "rewards/margins": 3.0672365268071493, "rewards/rejected": -2.671941121419271, "step": 10342 }, { "epoch": 0.5482203906394932, "grad_norm": 39.25, "kl": 0.2574462890625, "learning_rate": 5e-07, "logits/chosen": -3922961.5, "logits/rejected": 5147589.0, "logps/chosen": -160.017822265625, "logps/rejected": -264.51678466796875, "loss": 0.187, "rewards/chosen": 1.0577452182769775, "rewards/margins": 3.4996750354766846, "rewards/rejected": -2.441929817199707, "step": 10343 }, { "epoch": 0.5482733946412954, "grad_norm": 47.75, "kl": 0.0081024169921875, "learning_rate": 5e-07, "logits/chosen": -21276256.0, "logits/rejected": -48104438.4, "logps/chosen": -410.83154296875, "logps/rejected": -410.68017578125, "loss": 0.2084, "rewards/chosen": 1.0291656653086345, "rewards/margins": 3.192195717493693, "rewards/rejected": -2.1630300521850585, "step": 10344 }, { "epoch": 0.5483263986430975, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43822888.0, "logits/rejected": 15730880.0, "logps/chosen": -194.55238342285156, "logps/rejected": -189.30594308035714, "loss": 0.1987, "rewards/chosen": -0.0035049438010901213, "rewards/margins": 1.8027006694714405, "rewards/rejected": -1.8062056132725306, "step": 10345 }, { "epoch": 0.5483794026448997, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7805837.333333333, "logits/rejected": -64465638.4, "logps/chosen": -441.6259765625, "logps/rejected": -307.28212890625, "loss": 0.2236, "rewards/chosen": 0.7536580562591553, "rewards/margins": 3.1298673152923584, "rewards/rejected": -2.376209259033203, "step": 10346 }, { "epoch": 0.5484324066467018, "grad_norm": 74.5, "kl": 0.1057891845703125, "learning_rate": 5e-07, "logits/chosen": 8484738.0, "logits/rejected": -16574073.0, "logps/chosen": -432.1953125, "logps/rejected": -281.397705078125, "loss": 0.2804, "rewards/chosen": 0.233140766620636, "rewards/margins": 2.6571388840675354, "rewards/rejected": -2.4239981174468994, "step": 10347 }, { "epoch": 0.548485410648504, "grad_norm": 63.75, "kl": 7.0253496170043945, "learning_rate": 5e-07, "logits/chosen": -6602314.4, "logits/rejected": -828756.0, "logps/chosen": -640.480517578125, "logps/rejected": -164.584716796875, "loss": 0.346, "rewards/chosen": 1.365126132965088, "rewards/margins": 3.1377968788146973, "rewards/rejected": -1.7726707458496094, "step": 10348 }, { "epoch": 0.5485384146503061, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -157975072.0, "logits/rejected": -18336020.0, "logps/chosen": -519.6519165039062, "logps/rejected": -321.0449625651042, "loss": 0.1772, "rewards/chosen": 1.0746017694473267, "rewards/margins": 2.9855335156122846, "rewards/rejected": -1.9109317461649578, "step": 10349 }, { "epoch": 0.5485914186521083, "grad_norm": 58.25, "kl": 1.2274398803710938, "learning_rate": 5e-07, "logits/chosen": -25720604.0, "logits/rejected": -15143096.0, "logps/chosen": -454.4468994140625, "logps/rejected": -171.7806599934896, "loss": 0.2601, "rewards/chosen": 0.6470824480056763, "rewards/margins": 2.270610610644023, "rewards/rejected": -1.6235281626383464, "step": 10350 }, { "epoch": 0.5486444226539103, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89042016.0, "logits/rejected": -16284889.6, "logps/chosen": -705.716552734375, "logps/rejected": -299.0591796875, "loss": 0.2581, "rewards/chosen": 0.05361632506052653, "rewards/margins": 2.074173347155253, "rewards/rejected": -2.0205570220947267, "step": 10351 }, { "epoch": 0.5486974266557125, "grad_norm": 48.0, "kl": 0.6378955841064453, "learning_rate": 5e-07, "logits/chosen": -54906218.666666664, "logits/rejected": -8189834.4, "logps/chosen": -51.30987040201823, "logps/rejected": -193.46090087890624, "loss": 0.3115, "rewards/chosen": 0.01741097867488861, "rewards/margins": 1.6052126258611679, "rewards/rejected": -1.5878016471862793, "step": 10352 }, { "epoch": 0.5487504306575146, "grad_norm": 46.0, "kl": 2.742781639099121, "learning_rate": 5e-07, "logits/chosen": -59449386.666666664, "logits/rejected": -22810177.6, "logps/chosen": -1181.57470703125, "logps/rejected": -347.986181640625, "loss": 0.1783, "rewards/chosen": 1.3146179517110188, "rewards/margins": 3.822679932912191, "rewards/rejected": -2.508061981201172, "step": 10353 }, { "epoch": 0.5488034346593168, "grad_norm": 42.0, "kl": 0.9964237213134766, "learning_rate": 5e-07, "logits/chosen": -12923458.666666666, "logits/rejected": -17819473.6, "logps/chosen": -162.34487915039062, "logps/rejected": -222.6872802734375, "loss": 0.2697, "rewards/chosen": 0.5089883804321289, "rewards/margins": 2.588872718811035, "rewards/rejected": -2.079884338378906, "step": 10354 }, { "epoch": 0.5488564386611189, "grad_norm": 52.25, "kl": 2.616887092590332, "learning_rate": 5e-07, "logits/chosen": -7194750.285714285, "logits/rejected": -97787384.0, "logps/chosen": -213.74009486607142, "logps/rejected": -666.1962890625, "loss": 0.4303, "rewards/chosen": 0.459716728755406, "rewards/margins": 1.990624955722264, "rewards/rejected": -1.530908226966858, "step": 10355 }, { "epoch": 0.5489094426629211, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55764787.2, "logits/rejected": -34793370.666666664, "logps/chosen": -283.454833984375, "logps/rejected": -368.4501546223958, "loss": 0.377, "rewards/chosen": 0.008392789959907531, "rewards/margins": 1.9944305211305617, "rewards/rejected": -1.9860377311706543, "step": 10356 }, { "epoch": 0.5489624466647232, "grad_norm": 49.0, "kl": 1.8195610046386719, "learning_rate": 5e-07, "logits/chosen": -20856448.0, "logits/rejected": -16621606.0, "logps/chosen": -224.57548014322916, "logps/rejected": -225.75103759765625, "loss": 0.3925, "rewards/chosen": 0.25517765680948895, "rewards/margins": 3.8433803717295327, "rewards/rejected": -3.588202714920044, "step": 10357 }, { "epoch": 0.5490154506665254, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2613921.6, "logits/rejected": -18471032.0, "logps/chosen": -113.089306640625, "logps/rejected": -293.5823160807292, "loss": 0.3617, "rewards/chosen": -0.3258363723754883, "rewards/margins": 3.9368608474731444, "rewards/rejected": -4.262697219848633, "step": 10358 }, { "epoch": 0.5490684546683274, "grad_norm": 36.5, "kl": 1.4378852844238281, "learning_rate": 5e-07, "logits/chosen": 1013884.25, "logits/rejected": -22968018.0, "logps/chosen": -238.8337860107422, "logps/rejected": -190.846923828125, "loss": 0.2517, "rewards/chosen": 1.2421101331710815, "rewards/margins": 2.984996199607849, "rewards/rejected": -1.7428860664367676, "step": 10359 }, { "epoch": 0.5491214586701296, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4218630.0, "logits/rejected": -39407688.0, "logps/chosen": -502.0567626953125, "logps/rejected": -543.7598876953125, "loss": 0.2708, "rewards/chosen": 0.41529229283332825, "rewards/margins": 2.9655082523822784, "rewards/rejected": -2.55021595954895, "step": 10360 }, { "epoch": 0.5491744626719317, "grad_norm": 42.75, "kl": 0.5708684921264648, "learning_rate": 5e-07, "logits/chosen": -35765034.666666664, "logits/rejected": -219501.4, "logps/chosen": -228.2002970377604, "logps/rejected": -132.0433837890625, "loss": 0.2616, "rewards/chosen": 0.33914045492808026, "rewards/margins": 2.4876269737879433, "rewards/rejected": -2.148486518859863, "step": 10361 }, { "epoch": 0.5492274666737339, "grad_norm": 63.5, "kl": 0.9437408447265625, "learning_rate": 5e-07, "logits/chosen": 155375530.66666666, "logits/rejected": -39407904.0, "logps/chosen": -715.6809895833334, "logps/rejected": -384.203271484375, "loss": 0.3099, "rewards/chosen": 0.393035888671875, "rewards/margins": 2.165790557861328, "rewards/rejected": -1.7727546691894531, "step": 10362 }, { "epoch": 0.549280470675536, "grad_norm": 55.0, "kl": 0.6864380836486816, "learning_rate": 5e-07, "logits/chosen": -34160704.0, "logits/rejected": -57062389.333333336, "logps/chosen": -209.4116943359375, "logps/rejected": -149.1726277669271, "loss": 0.3623, "rewards/chosen": 0.5324887275695801, "rewards/margins": 1.4776235262552897, "rewards/rejected": -0.9451347986857096, "step": 10363 }, { "epoch": 0.5493334746773382, "grad_norm": 49.0, "kl": 3.4403305053710938, "learning_rate": 5e-07, "logits/chosen": -19685892.0, "logits/rejected": -12265661.0, "logps/chosen": -157.7427775065104, "logps/rejected": -281.4990234375, "loss": 0.3816, "rewards/chosen": 0.5246424277623495, "rewards/margins": 2.8142160971959433, "rewards/rejected": -2.2895736694335938, "step": 10364 }, { "epoch": 0.5493864786791403, "grad_norm": 47.5, "kl": 1.0424613952636719, "learning_rate": 5e-07, "logits/chosen": -35142352.0, "logits/rejected": -40832740.0, "logps/chosen": -311.8931579589844, "logps/rejected": -471.63885498046875, "loss": 0.2599, "rewards/chosen": 0.5561972856521606, "rewards/margins": 3.0000187158584595, "rewards/rejected": -2.443821430206299, "step": 10365 }, { "epoch": 0.5494394826809424, "grad_norm": 24.375, "kl": 1.8574085235595703, "learning_rate": 5e-07, "logits/chosen": 6031270.0, "logits/rejected": -22078908.0, "logps/chosen": -41.59224319458008, "logps/rejected": -372.4781494140625, "loss": 0.1906, "rewards/chosen": 0.35683098435401917, "rewards/margins": 3.2708782255649567, "rewards/rejected": -2.9140472412109375, "step": 10366 }, { "epoch": 0.5494924866827445, "grad_norm": 52.75, "kl": 1.0481452941894531, "learning_rate": 5e-07, "logits/chosen": -15559737.6, "logits/rejected": 1690632.6666666667, "logps/chosen": -228.8152587890625, "logps/rejected": -236.09171549479166, "loss": 0.3634, "rewards/chosen": 0.14836690425872803, "rewards/margins": 2.0680667320887247, "rewards/rejected": -1.9196998278299968, "step": 10367 }, { "epoch": 0.5495454906845467, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106179184.0, "logits/rejected": -8090442.285714285, "logps/chosen": -418.4759216308594, "logps/rejected": -252.34950474330358, "loss": 0.2257, "rewards/chosen": -0.0521697998046875, "rewards/margins": 1.7169066837855749, "rewards/rejected": -1.7690764835902624, "step": 10368 }, { "epoch": 0.5495984946863488, "grad_norm": 51.75, "kl": 0.2584037780761719, "learning_rate": 5e-07, "logits/chosen": 2722291.1666666665, "logits/rejected": -27585651.2, "logps/chosen": -100.68953450520833, "logps/rejected": -481.63310546875, "loss": 0.3275, "rewards/chosen": -0.19636104504267374, "rewards/margins": 2.2590501030286156, "rewards/rejected": -2.4554111480712892, "step": 10369 }, { "epoch": 0.549651498688151, "grad_norm": 37.0, "kl": 0.0963592529296875, "learning_rate": 5e-07, "logits/chosen": -37121613.333333336, "logits/rejected": -11979247.2, "logps/chosen": -193.9547119140625, "logps/rejected": -317.13701171875, "loss": 0.2751, "rewards/chosen": 0.03665288289388021, "rewards/margins": 2.206910832722982, "rewards/rejected": -2.1702579498291015, "step": 10370 }, { "epoch": 0.5497045026899531, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57739514.666666664, "logits/rejected": -47179852.8, "logps/chosen": -279.1683349609375, "logps/rejected": -365.2345458984375, "loss": 0.2404, "rewards/chosen": 0.32492677370707196, "rewards/margins": 2.48516313234965, "rewards/rejected": -2.160236358642578, "step": 10371 }, { "epoch": 0.5497575066917553, "grad_norm": 57.0, "kl": 1.653855323791504, "learning_rate": 5e-07, "logits/chosen": -68387594.66666667, "logits/rejected": -49608643.2, "logps/chosen": -229.99149576822916, "logps/rejected": -190.412451171875, "loss": 0.2944, "rewards/chosen": 0.29678891102472943, "rewards/margins": 1.9095576564470929, "rewards/rejected": -1.6127687454223634, "step": 10372 }, { "epoch": 0.5498105106935574, "grad_norm": 52.75, "kl": 0.159332275390625, "learning_rate": 5e-07, "logits/chosen": -30369482.666666668, "logits/rejected": -50767548.0, "logps/chosen": -376.7145182291667, "logps/rejected": -469.8638000488281, "loss": 0.2871, "rewards/chosen": 0.6184768676757812, "rewards/margins": 3.921534776687622, "rewards/rejected": -3.303057909011841, "step": 10373 }, { "epoch": 0.5498635146953595, "grad_norm": 60.5, "kl": 3.7913055419921875, "learning_rate": 5e-07, "logits/chosen": 3011197.2, "logits/rejected": -11685836.0, "logps/chosen": -39.82887573242188, "logps/rejected": -723.4860026041666, "loss": 0.3322, "rewards/chosen": 0.5837454795837402, "rewards/margins": 2.5353344281514483, "rewards/rejected": -1.9515889485677083, "step": 10374 }, { "epoch": 0.5499165186971616, "grad_norm": 39.75, "kl": 1.032358169555664, "learning_rate": 5e-07, "logits/chosen": -35865362.666666664, "logits/rejected": -32689964.8, "logps/chosen": -139.35992431640625, "logps/rejected": -276.9478271484375, "loss": 0.3109, "rewards/chosen": 0.07200686633586884, "rewards/margins": 1.613724872469902, "rewards/rejected": -1.5417180061340332, "step": 10375 }, { "epoch": 0.5499695226989638, "grad_norm": 44.25, "kl": 0.2825031280517578, "learning_rate": 5e-07, "logits/chosen": -44269373.333333336, "logits/rejected": -48626048.0, "logps/chosen": -210.23421223958334, "logps/rejected": -503.9974609375, "loss": 0.2487, "rewards/chosen": 0.12275238831837972, "rewards/margins": 2.4649814685185754, "rewards/rejected": -2.3422290802001955, "step": 10376 }, { "epoch": 0.5500225267007659, "grad_norm": 60.75, "kl": 4.30499267578125, "learning_rate": 5e-07, "logits/chosen": -17205012.0, "logits/rejected": -8943152.0, "logps/chosen": -365.6018880208333, "logps/rejected": -84.66117858886719, "loss": 0.3948, "rewards/chosen": 0.5909158786137899, "rewards/margins": 2.592268427213033, "rewards/rejected": -2.001352548599243, "step": 10377 }, { "epoch": 0.5500755307025681, "grad_norm": 59.25, "kl": 4.259038925170898, "learning_rate": 5e-07, "logits/chosen": -13455824.0, "logits/rejected": -10353690.666666666, "logps/chosen": -571.511181640625, "logps/rejected": -325.15268961588544, "loss": 0.3667, "rewards/chosen": 0.735032320022583, "rewards/margins": 2.6977574825286865, "rewards/rejected": -1.9627251625061035, "step": 10378 }, { "epoch": 0.5501285347043702, "grad_norm": 57.75, "kl": 0.7791271209716797, "learning_rate": 5e-07, "logits/chosen": -47834069.333333336, "logits/rejected": -12361180.8, "logps/chosen": -761.932373046875, "logps/rejected": -398.85166015625, "loss": 0.2554, "rewards/chosen": 0.798797607421875, "rewards/margins": 2.708692741394043, "rewards/rejected": -1.909895133972168, "step": 10379 }, { "epoch": 0.5501815387061724, "grad_norm": 48.5, "kl": 0.8195457458496094, "learning_rate": 5e-07, "logits/chosen": 1157025.6, "logits/rejected": -46659040.0, "logps/chosen": -256.2125732421875, "logps/rejected": -386.6321614583333, "loss": 0.2906, "rewards/chosen": 0.4618197441101074, "rewards/margins": 3.450076198577881, "rewards/rejected": -2.9882564544677734, "step": 10380 }, { "epoch": 0.5502345427079744, "grad_norm": 56.75, "kl": 1.9391841888427734, "learning_rate": 5e-07, "logits/chosen": -48078944.0, "logits/rejected": -3387639.5, "logps/chosen": -244.81277901785714, "logps/rejected": -148.819091796875, "loss": 0.4296, "rewards/chosen": 0.37068428312029156, "rewards/margins": 1.9739680460521152, "rewards/rejected": -1.6032837629318237, "step": 10381 }, { "epoch": 0.5502875467097766, "grad_norm": 36.5, "kl": 2.2387657165527344, "learning_rate": 5e-07, "logits/chosen": -8561844.0, "logits/rejected": -43845604.571428575, "logps/chosen": -54.290771484375, "logps/rejected": -443.8213588169643, "loss": 0.2157, "rewards/chosen": -0.14605216681957245, "rewards/margins": 2.179144780550684, "rewards/rejected": -2.3251969473702565, "step": 10382 }, { "epoch": 0.5503405507115787, "grad_norm": 52.75, "kl": 1.281646728515625, "learning_rate": 5e-07, "logits/chosen": -37510432.0, "logits/rejected": -16197124.0, "logps/chosen": -304.77275390625, "logps/rejected": -207.91363525390625, "loss": 0.3122, "rewards/chosen": 0.6815695285797119, "rewards/margins": 2.3346790790557863, "rewards/rejected": -1.6531095504760742, "step": 10383 }, { "epoch": 0.5503935547133808, "grad_norm": 51.25, "kl": 2.9651918411254883, "learning_rate": 5e-07, "logits/chosen": -19283289.333333332, "logits/rejected": -1291104.5, "logps/chosen": -187.6993204752604, "logps/rejected": -146.40061950683594, "loss": 0.4892, "rewards/chosen": -0.184807817141215, "rewards/margins": 3.6091413100560508, "rewards/rejected": -3.7939491271972656, "step": 10384 }, { "epoch": 0.550446558715183, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5364012.8, "logits/rejected": -15776648.0, "logps/chosen": -373.67626953125, "logps/rejected": -283.72808837890625, "loss": 0.2976, "rewards/chosen": 0.5786407947540283, "rewards/margins": 3.1277008215586344, "rewards/rejected": -2.549060026804606, "step": 10385 }, { "epoch": 0.5504995627169851, "grad_norm": 49.5, "kl": 1.049468994140625, "learning_rate": 5e-07, "logits/chosen": -23381126.4, "logits/rejected": -21771538.666666668, "logps/chosen": -313.36669921875, "logps/rejected": -393.4078369140625, "loss": 0.3938, "rewards/chosen": -0.05152984857559204, "rewards/margins": 2.7353538235028587, "rewards/rejected": -2.7868836720784507, "step": 10386 }, { "epoch": 0.5505525667187873, "grad_norm": 51.25, "kl": 1.158137321472168, "learning_rate": 5e-07, "logits/chosen": -37199286.85714286, "logits/rejected": -31269892.0, "logps/chosen": -431.8626185825893, "logps/rejected": -914.2962646484375, "loss": 0.3626, "rewards/chosen": 0.6342969621930804, "rewards/margins": 4.849989005497524, "rewards/rejected": -4.215692043304443, "step": 10387 }, { "epoch": 0.5506055707205894, "grad_norm": 37.75, "kl": 0.47646522521972656, "learning_rate": 5e-07, "logits/chosen": 3167206.0, "logits/rejected": -16037630.4, "logps/chosen": -165.09290568033853, "logps/rejected": -237.87080078125, "loss": 0.2448, "rewards/chosen": 0.7111450036366781, "rewards/margins": 2.433666976292928, "rewards/rejected": -1.72252197265625, "step": 10388 }, { "epoch": 0.5506585747223915, "grad_norm": 54.0, "kl": 2.3873863220214844, "learning_rate": 5e-07, "logits/chosen": -38096890.666666664, "logits/rejected": -25736700.0, "logps/chosen": -311.0609944661458, "logps/rejected": -223.23301696777344, "loss": 0.3141, "rewards/chosen": 0.7992729345957438, "rewards/margins": 3.5027281443277993, "rewards/rejected": -2.7034552097320557, "step": 10389 }, { "epoch": 0.5507115787241936, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48364348.0, "logits/rejected": -27170965.333333332, "logps/chosen": -357.17926025390625, "logps/rejected": -396.0205891927083, "loss": 0.223, "rewards/chosen": 0.07012863457202911, "rewards/margins": 2.1435006807247796, "rewards/rejected": -2.0733720461527505, "step": 10390 }, { "epoch": 0.5507645827259958, "grad_norm": 49.5, "kl": 1.2292404174804688, "learning_rate": 5e-07, "logits/chosen": -37413395.2, "logits/rejected": -26585824.0, "logps/chosen": -348.0660400390625, "logps/rejected": -260.3436686197917, "loss": 0.3417, "rewards/chosen": 0.2574197292327881, "rewards/margins": 2.9564426581064858, "rewards/rejected": -2.6990229288736978, "step": 10391 }, { "epoch": 0.5508175867277979, "grad_norm": 57.5, "kl": 0.09119415283203125, "learning_rate": 5e-07, "logits/chosen": -10473195.0, "logits/rejected": -15049092.0, "logps/chosen": -197.41036987304688, "logps/rejected": -557.951171875, "loss": 0.3437, "rewards/chosen": -0.016749948263168335, "rewards/margins": 1.9793063700199127, "rewards/rejected": -1.996056318283081, "step": 10392 }, { "epoch": 0.5508705907296001, "grad_norm": 41.25, "kl": 1.1341361999511719, "learning_rate": 5e-07, "logits/chosen": -42237941.333333336, "logits/rejected": -19145051.2, "logps/chosen": -414.6931966145833, "logps/rejected": -239.3843994140625, "loss": 0.2581, "rewards/chosen": 0.11212056875228882, "rewards/margins": 2.801389443874359, "rewards/rejected": -2.68926887512207, "step": 10393 }, { "epoch": 0.5509235947314022, "grad_norm": 40.5, "kl": 1.0149917602539062, "learning_rate": 5e-07, "logits/chosen": 280572.625, "logits/rejected": -17115900.0, "logps/chosen": -130.98695373535156, "logps/rejected": -451.12701416015625, "loss": 0.3121, "rewards/chosen": 0.34580913186073303, "rewards/margins": 2.5570150911808014, "rewards/rejected": -2.2112059593200684, "step": 10394 }, { "epoch": 0.5509765987332044, "grad_norm": 44.0, "kl": 4.326824188232422, "learning_rate": 5e-07, "logits/chosen": -37277420.8, "logits/rejected": -30595653.333333332, "logps/chosen": -642.451708984375, "logps/rejected": -222.99763997395834, "loss": 0.1753, "rewards/chosen": 1.6639244079589843, "rewards/margins": 4.174817911783854, "rewards/rejected": -2.5108935038248696, "step": 10395 }, { "epoch": 0.5510296027350065, "grad_norm": 56.0, "kl": 2.1670608520507812, "learning_rate": 5e-07, "logits/chosen": -50295744.0, "logits/rejected": -7635361.6, "logps/chosen": -840.5381673177084, "logps/rejected": -122.615869140625, "loss": 0.2128, "rewards/chosen": 1.2919713656107585, "rewards/margins": 3.127428976694743, "rewards/rejected": -1.8354576110839844, "step": 10396 }, { "epoch": 0.5510826067368086, "grad_norm": 54.0, "kl": 2.600879669189453, "learning_rate": 5e-07, "logits/chosen": -23024541.333333332, "logits/rejected": -84706120.0, "logps/chosen": -211.08199055989584, "logps/rejected": -839.8636474609375, "loss": 0.3336, "rewards/chosen": 0.7195959091186523, "rewards/margins": 2.8667852878570557, "rewards/rejected": -2.1471893787384033, "step": 10397 }, { "epoch": 0.5511356107386107, "grad_norm": 36.5, "kl": 1.1958379745483398, "learning_rate": 5e-07, "logits/chosen": -17264221.333333332, "logits/rejected": -12150403.2, "logps/chosen": -134.27962239583334, "logps/rejected": -157.19840087890626, "loss": 0.246, "rewards/chosen": 0.3913865884145101, "rewards/margins": 2.515749533971151, "rewards/rejected": -2.1243629455566406, "step": 10398 }, { "epoch": 0.5511886147404129, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 82519104.0, "logits/rejected": -10357738.666666666, "logps/chosen": -489.030224609375, "logps/rejected": -386.337646484375, "loss": 0.386, "rewards/chosen": -0.07652863264083862, "rewards/margins": 1.607722516854604, "rewards/rejected": -1.6842511494954426, "step": 10399 }, { "epoch": 0.551241618742215, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 639593.3333333334, "logits/rejected": -26043825.6, "logps/chosen": -316.9974365234375, "logps/rejected": -260.906005859375, "loss": 0.3437, "rewards/chosen": -0.3075537880261739, "rewards/margins": 1.6758442680040997, "rewards/rejected": -1.9833980560302735, "step": 10400 }, { "epoch": 0.5512946227440172, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53705124.0, "logits/rejected": -15126812.0, "logps/chosen": -436.109130859375, "logps/rejected": -266.14202880859375, "loss": 0.374, "rewards/chosen": -0.059375010430812836, "rewards/margins": 1.5399461165070534, "rewards/rejected": -1.5993211269378662, "step": 10401 }, { "epoch": 0.5513476267458193, "grad_norm": 37.5, "kl": 2.1358470916748047, "learning_rate": 5e-07, "logits/chosen": 5995160.0, "logits/rejected": -29119386.666666668, "logps/chosen": -213.6475067138672, "logps/rejected": -493.371826171875, "loss": 0.2402, "rewards/chosen": 0.6423171162605286, "rewards/margins": 3.1351734598477683, "rewards/rejected": -2.4928563435872397, "step": 10402 }, { "epoch": 0.5514006307476215, "grad_norm": 44.5, "kl": 1.6195926666259766, "learning_rate": 5e-07, "logits/chosen": -29001258.666666668, "logits/rejected": -29880288.0, "logps/chosen": -104.72682698567708, "logps/rejected": -215.189453125, "loss": 0.2941, "rewards/chosen": 0.1594650149345398, "rewards/margins": 2.06210173368454, "rewards/rejected": -1.90263671875, "step": 10403 }, { "epoch": 0.5514536347494235, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 641353.0, "logits/rejected": -18117742.0, "logps/chosen": -322.5812072753906, "logps/rejected": -577.2236328125, "loss": 0.2406, "rewards/chosen": 0.30762290954589844, "rewards/margins": 3.492954730987549, "rewards/rejected": -3.1853318214416504, "step": 10404 }, { "epoch": 0.5515066387512257, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25780077.333333332, "logits/rejected": -69107129.6, "logps/chosen": -255.0435587565104, "logps/rejected": -462.364794921875, "loss": 0.2552, "rewards/chosen": 0.33732275168100995, "rewards/margins": 2.6837230126063027, "rewards/rejected": -2.346400260925293, "step": 10405 }, { "epoch": 0.5515596427530278, "grad_norm": 71.0, "kl": 4.472291946411133, "learning_rate": 5e-07, "logits/chosen": -46903429.333333336, "logits/rejected": 6767743.2, "logps/chosen": -264.91823323567706, "logps/rejected": -85.53214721679687, "loss": 0.3924, "rewards/chosen": 0.8466461499532064, "rewards/margins": 1.868003304799398, "rewards/rejected": -1.0213571548461915, "step": 10406 }, { "epoch": 0.55161264675483, "grad_norm": 34.25, "kl": 1.8180809020996094, "learning_rate": 5e-07, "logits/chosen": -3796532.25, "logits/rejected": -8428970.0, "logps/chosen": -76.54583740234375, "logps/rejected": -211.44686889648438, "loss": 0.3518, "rewards/chosen": -0.1279887706041336, "rewards/margins": 2.1247835606336594, "rewards/rejected": -2.252772331237793, "step": 10407 }, { "epoch": 0.5516656507566321, "grad_norm": 36.5, "kl": 0.34621429443359375, "learning_rate": 5e-07, "logits/chosen": -9704761.333333334, "logits/rejected": -22987828.0, "logps/chosen": -131.33387247721353, "logps/rejected": -329.27783203125, "loss": 0.3433, "rewards/chosen": 0.26106321811676025, "rewards/margins": 3.819302201271057, "rewards/rejected": -3.558238983154297, "step": 10408 }, { "epoch": 0.5517186547584343, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89123936.0, "logits/rejected": -27143318.85714286, "logps/chosen": -517.482177734375, "logps/rejected": -453.08834402901783, "loss": 0.1343, "rewards/chosen": 0.754443347454071, "rewards/margins": 3.7210462178502763, "rewards/rejected": -2.9666028703962053, "step": 10409 }, { "epoch": 0.5517716587602364, "grad_norm": 31.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8898460.0, "logits/rejected": -56258660.571428575, "logps/chosen": -152.13607788085938, "logps/rejected": -593.6964285714286, "loss": 0.1631, "rewards/chosen": -0.08544617146253586, "rewards/margins": 3.321784750691482, "rewards/rejected": -3.407230922154018, "step": 10410 }, { "epoch": 0.5518246627620386, "grad_norm": 57.0, "kl": 2.9176559448242188, "learning_rate": 5e-07, "logits/chosen": -27277232.0, "logits/rejected": -50054826.666666664, "logps/chosen": -262.318994140625, "logps/rejected": -369.75390625, "loss": 0.3921, "rewards/chosen": 0.1683577060699463, "rewards/margins": 2.5109960397084556, "rewards/rejected": -2.3426383336385093, "step": 10411 }, { "epoch": 0.5518776667638406, "grad_norm": 60.0, "kl": 1.642099380493164, "learning_rate": 5e-07, "logits/chosen": 9645892.0, "logits/rejected": -38164380.0, "logps/chosen": -412.383544921875, "logps/rejected": -342.711669921875, "loss": 0.2659, "rewards/chosen": 0.9559848308563232, "rewards/margins": 2.5630892515182495, "rewards/rejected": -1.6071044206619263, "step": 10412 }, { "epoch": 0.5519306707656428, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13839040.0, "logits/rejected": -26796336.0, "logps/chosen": -293.88494873046875, "logps/rejected": -352.7593994140625, "loss": 0.1783, "rewards/chosen": -0.0854591429233551, "rewards/margins": 3.102981815735499, "rewards/rejected": -3.188440958658854, "step": 10413 }, { "epoch": 0.5519836747674449, "grad_norm": 57.75, "kl": 0.8731269836425781, "learning_rate": 5e-07, "logits/chosen": -21385941.333333332, "logits/rejected": -30428137.6, "logps/chosen": -426.7809651692708, "logps/rejected": -290.4884765625, "loss": 0.3151, "rewards/chosen": 0.2077465057373047, "rewards/margins": 1.8109474182128906, "rewards/rejected": -1.603200912475586, "step": 10414 }, { "epoch": 0.5520366787692471, "grad_norm": 38.0, "kl": 1.3000869750976562, "learning_rate": 5e-07, "logits/chosen": 2437604.2, "logits/rejected": -26703458.666666668, "logps/chosen": -154.07889404296876, "logps/rejected": -391.0748697916667, "loss": 0.3192, "rewards/chosen": 0.37453570365905764, "rewards/margins": 2.9515313943227133, "rewards/rejected": -2.5769956906636557, "step": 10415 }, { "epoch": 0.5520896827710492, "grad_norm": 45.0, "kl": 3.7333240509033203, "learning_rate": 5e-07, "logits/chosen": 9265801.0, "logits/rejected": -28421008.0, "logps/chosen": -86.25778198242188, "logps/rejected": -871.7097778320312, "loss": 0.2322, "rewards/chosen": 1.0354681015014648, "rewards/margins": 5.546804428100586, "rewards/rejected": -4.511336326599121, "step": 10416 }, { "epoch": 0.5521426867728514, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -217348.0, "logits/rejected": -46895520.0, "logps/chosen": -449.1868896484375, "logps/rejected": -502.7438557942708, "loss": 0.185, "rewards/chosen": 0.9016144275665283, "rewards/margins": 3.1951919396718345, "rewards/rejected": -2.293577512105306, "step": 10417 }, { "epoch": 0.5521956907746535, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58497860.0, "logits/rejected": -22920672.0, "logps/chosen": -331.9073486328125, "logps/rejected": -346.3049011230469, "loss": 0.2501, "rewards/chosen": 0.35429880023002625, "rewards/margins": 3.2237682044506073, "rewards/rejected": -2.869469404220581, "step": 10418 }, { "epoch": 0.5522486947764557, "grad_norm": 56.5, "kl": 0.5657501220703125, "learning_rate": 5e-07, "logits/chosen": -37051114.666666664, "logits/rejected": -42281796.0, "logps/chosen": -334.7295328776042, "logps/rejected": -221.03115844726562, "loss": 0.4066, "rewards/chosen": 0.3044211069742839, "rewards/margins": 1.2402342955271404, "rewards/rejected": -0.9358131885528564, "step": 10419 }, { "epoch": 0.5523016987782577, "grad_norm": 62.0, "kl": 0.36914634704589844, "learning_rate": 5e-07, "logits/chosen": -27384604.8, "logits/rejected": -25714378.666666668, "logps/chosen": -204.929296875, "logps/rejected": -345.6267496744792, "loss": 0.3313, "rewards/chosen": 0.47301478385925294, "rewards/margins": 2.111948919296265, "rewards/rejected": -1.6389341354370117, "step": 10420 }, { "epoch": 0.5523547027800599, "grad_norm": 41.25, "kl": 2.9568262100219727, "learning_rate": 5e-07, "logits/chosen": -54601064.0, "logits/rejected": -21316504.0, "logps/chosen": -292.0577087402344, "logps/rejected": -280.0977783203125, "loss": 0.2635, "rewards/chosen": 0.47224730253219604, "rewards/margins": 3.27472847700119, "rewards/rejected": -2.802481174468994, "step": 10421 }, { "epoch": 0.552407706781862, "grad_norm": 44.75, "kl": 0.10523223876953125, "learning_rate": 5e-07, "logits/chosen": -11831138.0, "logits/rejected": -11716946.0, "logps/chosen": -266.0072937011719, "logps/rejected": -282.59552001953125, "loss": 0.2602, "rewards/chosen": 0.4708581864833832, "rewards/margins": 2.85725674033165, "rewards/rejected": -2.3863985538482666, "step": 10422 }, { "epoch": 0.5524607107836642, "grad_norm": 69.0, "kl": 1.0235710144042969, "learning_rate": 5e-07, "logits/chosen": -45350390.4, "logits/rejected": -1579423.1666666667, "logps/chosen": -503.77958984375, "logps/rejected": -69.71220397949219, "loss": 0.3059, "rewards/chosen": 0.8247014999389648, "rewards/margins": 2.3226175626118977, "rewards/rejected": -1.4979160626729329, "step": 10423 }, { "epoch": 0.5525137147854663, "grad_norm": 55.25, "kl": 2.623859405517578, "learning_rate": 5e-07, "logits/chosen": -32203683.2, "logits/rejected": -6045962.0, "logps/chosen": -266.070068359375, "logps/rejected": -131.86895751953125, "loss": 0.2856, "rewards/chosen": 0.7684615135192872, "rewards/margins": 3.5053380012512205, "rewards/rejected": -2.7368764877319336, "step": 10424 }, { "epoch": 0.5525667187872685, "grad_norm": 47.25, "kl": 1.0005273818969727, "learning_rate": 5e-07, "logits/chosen": -38536736.0, "logits/rejected": -11171694.0, "logps/chosen": -201.0274169921875, "logps/rejected": -120.66272989908855, "loss": 0.3605, "rewards/chosen": 0.4048454284667969, "rewards/margins": 2.027940622965495, "rewards/rejected": -1.623095194498698, "step": 10425 }, { "epoch": 0.5526197227890706, "grad_norm": 44.75, "kl": 1.9836769104003906, "learning_rate": 5e-07, "logits/chosen": -5051363.0, "logits/rejected": -34921888.0, "logps/chosen": -268.36279296875, "logps/rejected": -374.8861999511719, "loss": 0.295, "rewards/chosen": 1.1456689834594727, "rewards/margins": 2.638493776321411, "rewards/rejected": -1.4928247928619385, "step": 10426 }, { "epoch": 0.5526727267908728, "grad_norm": 63.0, "kl": 0.4799461364746094, "learning_rate": 5e-07, "logits/chosen": -10069428.8, "logits/rejected": -8150693.333333333, "logps/chosen": -272.71259765625, "logps/rejected": -254.89449055989584, "loss": 0.34, "rewards/chosen": 0.43437681198120115, "rewards/margins": 1.8852079073588053, "rewards/rejected": -1.4508310953776042, "step": 10427 }, { "epoch": 0.5527257307926748, "grad_norm": 76.0, "kl": 0.7834596633911133, "learning_rate": 5e-07, "logits/chosen": 13918905.333333334, "logits/rejected": -16067379.2, "logps/chosen": -116.31151326497395, "logps/rejected": -240.7806884765625, "loss": 0.4069, "rewards/chosen": 0.3930378754933675, "rewards/margins": 1.074087651570638, "rewards/rejected": -0.6810497760772705, "step": 10428 }, { "epoch": 0.552778734794477, "grad_norm": 52.5, "kl": 0.0136260986328125, "learning_rate": 5e-07, "logits/chosen": -2554889.0, "logits/rejected": -39060966.4, "logps/chosen": -195.15254720052084, "logps/rejected": -219.493408203125, "loss": 0.3206, "rewards/chosen": 0.5588236649831136, "rewards/margins": 1.6001021226247154, "rewards/rejected": -1.0412784576416017, "step": 10429 }, { "epoch": 0.5528317387962791, "grad_norm": 47.25, "kl": 0.9428367614746094, "learning_rate": 5e-07, "logits/chosen": -18006656.0, "logits/rejected": -1766188.0, "logps/chosen": -278.0608825683594, "logps/rejected": -358.9617919921875, "loss": 0.3344, "rewards/chosen": 0.13160064816474915, "rewards/margins": 1.5498071014881134, "rewards/rejected": -1.4182064533233643, "step": 10430 }, { "epoch": 0.5528847427980813, "grad_norm": 45.25, "kl": 0.6872196197509766, "learning_rate": 5e-07, "logits/chosen": -12890583.0, "logits/rejected": -61744612.0, "logps/chosen": -246.47415161132812, "logps/rejected": -291.23431396484375, "loss": 0.27, "rewards/chosen": 0.5081119537353516, "rewards/margins": 2.4871981143951416, "rewards/rejected": -1.97908616065979, "step": 10431 }, { "epoch": 0.5529377467998834, "grad_norm": 47.75, "kl": 0.3202228546142578, "learning_rate": 5e-07, "logits/chosen": -35618612.0, "logits/rejected": -2864016.75, "logps/chosen": -204.46949768066406, "logps/rejected": -177.945068359375, "loss": 0.3653, "rewards/chosen": 0.24928519129753113, "rewards/margins": 1.303467482328415, "rewards/rejected": -1.0541822910308838, "step": 10432 }, { "epoch": 0.5529907508016856, "grad_norm": 51.5, "kl": 2.167675018310547, "learning_rate": 5e-07, "logits/chosen": -24409088.0, "logits/rejected": -54729284.0, "logps/chosen": -404.2328287760417, "logps/rejected": -781.4238891601562, "loss": 0.3215, "rewards/chosen": 0.7512814203898112, "rewards/margins": 4.563061157862346, "rewards/rejected": -3.811779737472534, "step": 10433 }, { "epoch": 0.5530437548034877, "grad_norm": 37.5, "kl": 1.5606956481933594, "learning_rate": 5e-07, "logits/chosen": -6948136.666666667, "logits/rejected": -5630629.2, "logps/chosen": -158.40508015950522, "logps/rejected": -290.5828857421875, "loss": 0.1831, "rewards/chosen": 1.01126233736674, "rewards/margins": 3.4266859213511154, "rewards/rejected": -2.415423583984375, "step": 10434 }, { "epoch": 0.5530967588052897, "grad_norm": 43.25, "kl": 0.12357139587402344, "learning_rate": 5e-07, "logits/chosen": 5850386.0, "logits/rejected": -69189312.0, "logps/chosen": -110.55828857421875, "logps/rejected": -303.9906982421875, "loss": 0.3389, "rewards/chosen": -0.4715828100840251, "rewards/margins": 1.3418699105580647, "rewards/rejected": -1.8134527206420898, "step": 10435 }, { "epoch": 0.5531497628070919, "grad_norm": 39.75, "kl": 0.5927925109863281, "learning_rate": 5e-07, "logits/chosen": -125535200.0, "logits/rejected": -33813491.2, "logps/chosen": -252.67582194010416, "logps/rejected": -336.94267578125, "loss": 0.1904, "rewards/chosen": 1.1288224061330159, "rewards/margins": 3.25272110303243, "rewards/rejected": -2.1238986968994142, "step": 10436 }, { "epoch": 0.553202766808894, "grad_norm": 60.25, "kl": 1.17706298828125, "learning_rate": 5e-07, "logits/chosen": -74498976.0, "logits/rejected": 8863358.0, "logps/chosen": -492.26251220703125, "logps/rejected": -196.4439697265625, "loss": 0.3153, "rewards/chosen": 0.6100350618362427, "rewards/margins": 2.246261715888977, "rewards/rejected": -1.6362266540527344, "step": 10437 }, { "epoch": 0.5532557708106962, "grad_norm": 56.25, "kl": 3.5949630737304688, "learning_rate": 5e-07, "logits/chosen": -11474149.333333334, "logits/rejected": -31139686.4, "logps/chosen": -574.18408203125, "logps/rejected": -387.7305419921875, "loss": 0.2206, "rewards/chosen": 1.7561330795288086, "rewards/margins": 3.8480545043945313, "rewards/rejected": -2.0919214248657227, "step": 10438 }, { "epoch": 0.5533087748124983, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19397824.0, "logits/rejected": -18847044.0, "logps/chosen": -414.1677551269531, "logps/rejected": -151.02279663085938, "loss": 0.2888, "rewards/chosen": 0.5960525274276733, "rewards/margins": 2.0344685316085815, "rewards/rejected": -1.4384160041809082, "step": 10439 }, { "epoch": 0.5533617788143005, "grad_norm": 66.0, "kl": 1.0530433654785156, "learning_rate": 5e-07, "logits/chosen": -25123420.8, "logits/rejected": -61304661.333333336, "logps/chosen": -435.58671875, "logps/rejected": -250.50276692708334, "loss": 0.3508, "rewards/chosen": 0.3038495063781738, "rewards/margins": 2.1103119532267254, "rewards/rejected": -1.8064624468485515, "step": 10440 }, { "epoch": 0.5534147828161026, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6370712.0, "logits/rejected": -42549034.666666664, "logps/chosen": -217.45401000976562, "logps/rejected": -405.5555826822917, "loss": 0.218, "rewards/chosen": 0.565180242061615, "rewards/margins": 2.471555173397064, "rewards/rejected": -1.9063749313354492, "step": 10441 }, { "epoch": 0.5534677868179048, "grad_norm": 53.5, "kl": 4.193183898925781, "learning_rate": 5e-07, "logits/chosen": -38227322.666666664, "logits/rejected": 1840887.0, "logps/chosen": -321.2471516927083, "logps/rejected": -450.19635009765625, "loss": 0.3642, "rewards/chosen": 0.67942214012146, "rewards/margins": 2.789743185043335, "rewards/rejected": -2.110321044921875, "step": 10442 }, { "epoch": 0.5535207908197068, "grad_norm": 44.0, "kl": 1.0369138717651367, "learning_rate": 5e-07, "logits/chosen": 305419.2, "logits/rejected": -25973632.0, "logps/chosen": -194.13995361328125, "logps/rejected": -277.8318277994792, "loss": 0.356, "rewards/chosen": 0.11269204616546631, "rewards/margins": 2.57951435248057, "rewards/rejected": -2.466822306315104, "step": 10443 }, { "epoch": 0.553573794821509, "grad_norm": 52.5, "kl": 3.1323165893554688, "learning_rate": 5e-07, "logits/chosen": -39573424.0, "logits/rejected": -34869084.0, "logps/chosen": -595.9470825195312, "logps/rejected": -370.1893005371094, "loss": 0.2308, "rewards/chosen": 1.4324171543121338, "rewards/margins": 3.7469232082366943, "rewards/rejected": -2.3145060539245605, "step": 10444 }, { "epoch": 0.5536267988233111, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11595528.0, "logits/rejected": -16947833.333333332, "logps/chosen": -700.19775390625, "logps/rejected": -327.52341715494794, "loss": 0.2031, "rewards/chosen": 1.306176781654358, "rewards/margins": 3.5761133432388306, "rewards/rejected": -2.2699365615844727, "step": 10445 }, { "epoch": 0.5536798028251133, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -40423256.0, "logps/rejected": -321.21063232421875, "loss": 0.1037, "rewards/rejected": -2.7309584617614746, "step": 10446 }, { "epoch": 0.5537328068269154, "grad_norm": 37.25, "kl": 0.5471305847167969, "learning_rate": 5e-07, "logits/chosen": 593377.9166666666, "logits/rejected": -7773308.8, "logps/chosen": -38.813201904296875, "logps/rejected": -240.669384765625, "loss": 0.239, "rewards/chosen": 0.59522545337677, "rewards/margins": 2.4969363451004027, "rewards/rejected": -1.9017108917236327, "step": 10447 }, { "epoch": 0.5537858108287176, "grad_norm": 59.5, "kl": 0.5189456939697266, "learning_rate": 5e-07, "logits/chosen": -12756812.8, "logits/rejected": -21466622.666666668, "logps/chosen": -327.9411376953125, "logps/rejected": -325.50954182942706, "loss": 0.3544, "rewards/chosen": 0.0353607177734375, "rewards/margins": 2.8998480478922524, "rewards/rejected": -2.864487330118815, "step": 10448 }, { "epoch": 0.5538388148305197, "grad_norm": 50.0, "kl": 5.444303512573242, "learning_rate": 5e-07, "logits/chosen": -4381526.666666667, "logits/rejected": -42712728.0, "logps/chosen": -181.62337239583334, "logps/rejected": -340.76202392578125, "loss": 0.3769, "rewards/chosen": 1.0512040456136067, "rewards/margins": 2.5808515151341753, "rewards/rejected": -1.5296474695205688, "step": 10449 }, { "epoch": 0.5538918188323219, "grad_norm": 50.5, "kl": 1.276824951171875, "learning_rate": 5e-07, "logits/chosen": -25516308.0, "logits/rejected": -15844876.0, "logps/chosen": -271.4994812011719, "logps/rejected": -427.25830078125, "loss": 0.3453, "rewards/chosen": 0.11789263784885406, "rewards/margins": 1.7540617138147354, "rewards/rejected": -1.6361690759658813, "step": 10450 }, { "epoch": 0.5539448228341239, "grad_norm": 44.25, "kl": 2.1643295288085938, "learning_rate": 5e-07, "logits/chosen": 11712013.333333334, "logits/rejected": -24399468.8, "logps/chosen": -129.25032552083334, "logps/rejected": -383.112060546875, "loss": 0.2928, "rewards/chosen": 0.01877949635187785, "rewards/margins": 2.3332015315691628, "rewards/rejected": -2.314422035217285, "step": 10451 }, { "epoch": 0.5539978268359261, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26113253.333333332, "logits/rejected": -48888131.2, "logps/chosen": -159.4578857421875, "logps/rejected": -429.2392578125, "loss": 0.2677, "rewards/chosen": -0.24419709046681723, "rewards/margins": 2.4161752621332804, "rewards/rejected": -2.6603723526000977, "step": 10452 }, { "epoch": 0.5540508308377282, "grad_norm": 46.25, "kl": 0.3951120376586914, "learning_rate": 5e-07, "logits/chosen": -108505.875, "logits/rejected": -212769.33333333334, "logps/chosen": -206.0355682373047, "logps/rejected": -305.9993896484375, "loss": 0.2269, "rewards/chosen": 1.404498815536499, "rewards/margins": 3.025484800338745, "rewards/rejected": -1.620985984802246, "step": 10453 }, { "epoch": 0.5541038348395304, "grad_norm": 55.0, "kl": 0.8849639892578125, "learning_rate": 5e-07, "logits/chosen": -31958547.2, "logits/rejected": -23208920.0, "logps/chosen": -647.506640625, "logps/rejected": -327.14145914713544, "loss": 0.2569, "rewards/chosen": 0.9515478134155273, "rewards/margins": 3.026361020406087, "rewards/rejected": -2.07481320699056, "step": 10454 }, { "epoch": 0.5541568388413325, "grad_norm": 47.25, "kl": 0.3322563171386719, "learning_rate": 5e-07, "logits/chosen": -54836403.2, "logits/rejected": -6132282.0, "logps/chosen": -309.6566162109375, "logps/rejected": -235.77811686197916, "loss": 0.317, "rewards/chosen": 0.1934809923171997, "rewards/margins": 2.6462270657221474, "rewards/rejected": -2.4527460734049478, "step": 10455 }, { "epoch": 0.5542098428431347, "grad_norm": 64.0, "kl": 2.3701705932617188, "learning_rate": 5e-07, "logits/chosen": -77910890.66666667, "logits/rejected": -28842778.0, "logps/chosen": -599.8446451822916, "logps/rejected": -303.1858825683594, "loss": 0.2715, "rewards/chosen": 1.0740127563476562, "rewards/margins": 3.376694679260254, "rewards/rejected": -2.3026819229125977, "step": 10456 }, { "epoch": 0.5542628468449368, "grad_norm": 48.5, "kl": 1.2074203491210938, "learning_rate": 5e-07, "logits/chosen": -112066000.0, "logits/rejected": -39131576.0, "logps/chosen": -248.34902954101562, "logps/rejected": -371.92034912109375, "loss": 0.3031, "rewards/chosen": 0.28161391615867615, "rewards/margins": 2.348561614751816, "rewards/rejected": -2.0669476985931396, "step": 10457 }, { "epoch": 0.554315850846739, "grad_norm": 52.75, "kl": 1.869150161743164, "learning_rate": 5e-07, "logits/chosen": -31250803.2, "logits/rejected": -28780928.0, "logps/chosen": -295.9048095703125, "logps/rejected": -179.5621337890625, "loss": 0.3463, "rewards/chosen": 0.2956662893295288, "rewards/margins": 2.684157919883728, "rewards/rejected": -2.388491630554199, "step": 10458 }, { "epoch": 0.554368854848541, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64519818.666666664, "logits/rejected": -33379475.2, "logps/chosen": -271.7204996744792, "logps/rejected": -327.71552734375, "loss": 0.2867, "rewards/chosen": -0.22828012704849243, "rewards/margins": 1.8971585631370544, "rewards/rejected": -2.125438690185547, "step": 10459 }, { "epoch": 0.5544218588503432, "grad_norm": 47.0, "kl": 0.5024147033691406, "learning_rate": 5e-07, "logits/chosen": -18992818.666666668, "logits/rejected": -2413850.0, "logps/chosen": -216.0739542643229, "logps/rejected": -361.580078125, "loss": 0.3447, "rewards/chosen": 0.5986471970876058, "rewards/margins": 2.408132036526998, "rewards/rejected": -1.809484839439392, "step": 10460 }, { "epoch": 0.5544748628521453, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38209685.333333336, "logits/rejected": 117584.5, "logps/chosen": -254.94120279947916, "logps/rejected": -215.9898681640625, "loss": 0.4124, "rewards/chosen": 0.04590180516242981, "rewards/margins": 1.5616550743579865, "rewards/rejected": -1.5157532691955566, "step": 10461 }, { "epoch": 0.5545278668539475, "grad_norm": 40.75, "kl": 0.04736518859863281, "learning_rate": 5e-07, "logits/chosen": -11643416.0, "logits/rejected": -45853779.2, "logps/chosen": -235.63909912109375, "logps/rejected": -327.781689453125, "loss": 0.2176, "rewards/chosen": 0.5398521820704142, "rewards/margins": 3.165124742190043, "rewards/rejected": -2.6252725601196287, "step": 10462 }, { "epoch": 0.5545808708557496, "grad_norm": 66.5, "kl": 3.629023551940918, "learning_rate": 5e-07, "logits/chosen": -2356560.8571428573, "logits/rejected": -1916391.5, "logps/chosen": -498.2460239955357, "logps/rejected": -51.39885711669922, "loss": 0.387, "rewards/chosen": 0.8402384349278041, "rewards/margins": 1.8048067518642972, "rewards/rejected": -0.9645683169364929, "step": 10463 }, { "epoch": 0.5546338748575518, "grad_norm": 42.0, "kl": 2.9799728393554688, "learning_rate": 5e-07, "logits/chosen": -13431248.0, "logits/rejected": -27459212.0, "logps/chosen": -369.9836120605469, "logps/rejected": -338.63018798828125, "loss": 0.2556, "rewards/chosen": 0.9181610345840454, "rewards/margins": 4.092018246650696, "rewards/rejected": -3.1738572120666504, "step": 10464 }, { "epoch": 0.5546868788593539, "grad_norm": 48.25, "kl": 3.2189483642578125, "learning_rate": 5e-07, "logits/chosen": -13556304.0, "logits/rejected": -10103872.0, "logps/chosen": -318.1977945963542, "logps/rejected": -284.30230712890625, "loss": 0.425, "rewards/chosen": 0.516690174738566, "rewards/margins": 2.979506174723307, "rewards/rejected": -2.462815999984741, "step": 10465 }, { "epoch": 0.554739882861156, "grad_norm": 65.0, "kl": 4.4409027099609375, "learning_rate": 5e-07, "logits/chosen": -21297702.0, "logits/rejected": -38910024.0, "logps/chosen": -431.41552734375, "logps/rejected": -248.39755249023438, "loss": 0.1973, "rewards/chosen": 1.7386016845703125, "rewards/margins": 4.117861747741699, "rewards/rejected": -2.3792600631713867, "step": 10466 }, { "epoch": 0.5547928868629581, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45061556.0, "logits/rejected": -21728976.0, "logps/chosen": -377.19476318359375, "logps/rejected": -388.00746372767856, "loss": 0.2636, "rewards/chosen": -0.03486938402056694, "rewards/margins": 1.9520844875701837, "rewards/rejected": -1.9869538715907507, "step": 10467 }, { "epoch": 0.5548458908647603, "grad_norm": 54.75, "kl": 2.576035499572754, "learning_rate": 5e-07, "logits/chosen": -34039216.0, "logits/rejected": -38621075.2, "logps/chosen": -245.45953369140625, "logps/rejected": -446.0833984375, "loss": 0.3147, "rewards/chosen": 0.7351193428039551, "rewards/margins": 3.0492146492004393, "rewards/rejected": -2.314095306396484, "step": 10468 }, { "epoch": 0.5548988948665624, "grad_norm": 38.0, "kl": 0.06249237060546875, "learning_rate": 5e-07, "logits/chosen": -18205882.666666668, "logits/rejected": -20034660.8, "logps/chosen": -210.8951619466146, "logps/rejected": -228.223828125, "loss": 0.2831, "rewards/chosen": 0.23468685150146484, "rewards/margins": 1.8305689811706543, "rewards/rejected": -1.5958821296691894, "step": 10469 }, { "epoch": 0.5549518988683646, "grad_norm": 40.5, "kl": 0.386749267578125, "learning_rate": 5e-07, "logits/chosen": -25763072.0, "logits/rejected": 473749.25, "logps/chosen": -232.53976440429688, "logps/rejected": -267.64520263671875, "loss": 0.2817, "rewards/chosen": 0.969408392906189, "rewards/margins": 2.5216327905654907, "rewards/rejected": -1.5522243976593018, "step": 10470 }, { "epoch": 0.5550049028701667, "grad_norm": 43.5, "kl": 1.4714946746826172, "learning_rate": 5e-07, "logits/chosen": -22471589.333333332, "logits/rejected": -42488844.0, "logps/chosen": -207.80122884114584, "logps/rejected": -860.4789428710938, "loss": 0.3144, "rewards/chosen": 0.510938803354899, "rewards/margins": 6.263082186381022, "rewards/rejected": -5.752143383026123, "step": 10471 }, { "epoch": 0.5550579068719689, "grad_norm": 52.25, "kl": 1.0968856811523438, "learning_rate": 5e-07, "logits/chosen": -15044714.0, "logits/rejected": -50217156.0, "logps/chosen": -346.8921813964844, "logps/rejected": -532.078857421875, "loss": 0.226, "rewards/chosen": 0.8631057739257812, "rewards/margins": 3.3616859912872314, "rewards/rejected": -2.49858021736145, "step": 10472 }, { "epoch": 0.555110910873771, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52917092.0, "logits/rejected": -38333988.0, "logps/chosen": -433.5159606933594, "logps/rejected": -564.735595703125, "loss": 0.3315, "rewards/chosen": 0.033021554350852966, "rewards/margins": 3.2272385209798813, "rewards/rejected": -3.1942169666290283, "step": 10473 }, { "epoch": 0.5551639148755732, "grad_norm": 75.5, "kl": 7.197403907775879, "learning_rate": 5e-07, "logits/chosen": -29538940.0, "logps/chosen": -444.60443115234375, "loss": 0.4061, "rewards/chosen": 1.1507375240325928, "step": 10474 }, { "epoch": 0.5552169188773752, "grad_norm": 59.5, "kl": 5.575319290161133, "learning_rate": 5e-07, "logits/chosen": -39123696.0, "logits/rejected": -17074060.0, "logps/chosen": -288.9151204427083, "logps/rejected": -572.4580078125, "loss": 0.3916, "rewards/chosen": 0.8108081817626953, "rewards/margins": 3.2846405506134033, "rewards/rejected": -2.473832368850708, "step": 10475 }, { "epoch": 0.5552699228791774, "grad_norm": 63.0, "kl": 3.52960205078125, "learning_rate": 5e-07, "logits/chosen": -4062147.3333333335, "logits/rejected": -31930796.8, "logps/chosen": -403.0801188151042, "logps/rejected": -372.810888671875, "loss": 0.2468, "rewards/chosen": 1.0120759010314941, "rewards/margins": 3.5895503044128416, "rewards/rejected": -2.5774744033813475, "step": 10476 }, { "epoch": 0.5553229268809795, "grad_norm": 41.0, "kl": 0.23659515380859375, "learning_rate": 5e-07, "logits/chosen": -15718579.2, "logits/rejected": 10629280.666666666, "logps/chosen": -157.212109375, "logps/rejected": -612.3905843098959, "loss": 0.3205, "rewards/chosen": 0.13445335626602173, "rewards/margins": 3.1191163659095764, "rewards/rejected": -2.9846630096435547, "step": 10477 }, { "epoch": 0.5553759308827817, "grad_norm": 47.25, "kl": 1.2694110870361328, "learning_rate": 5e-07, "logits/chosen": -20386380.8, "logits/rejected": -33321376.0, "logps/chosen": -179.5532470703125, "logps/rejected": -336.532470703125, "loss": 0.321, "rewards/chosen": 0.22105660438537597, "rewards/margins": 3.945267343521118, "rewards/rejected": -3.724210739135742, "step": 10478 }, { "epoch": 0.5554289348845838, "grad_norm": 64.5, "kl": 2.1536788940429688, "learning_rate": 5e-07, "logits/chosen": -29979530.666666668, "logits/rejected": 3554994.5, "logps/chosen": -318.5826822916667, "logps/rejected": -282.2970275878906, "loss": 0.4833, "rewards/chosen": 0.03665955364704132, "rewards/margins": 1.2911848574876785, "rewards/rejected": -1.2545253038406372, "step": 10479 }, { "epoch": 0.555481938886386, "grad_norm": 50.75, "kl": 0.6565685272216797, "learning_rate": 5e-07, "logits/chosen": -39126784.0, "logits/rejected": -5806474.0, "logps/chosen": -189.21978759765625, "logps/rejected": -223.4153289794922, "loss": 0.3246, "rewards/chosen": 0.1347297728061676, "rewards/margins": 2.0454900562763214, "rewards/rejected": -1.9107602834701538, "step": 10480 }, { "epoch": 0.555534942888188, "grad_norm": 44.25, "kl": 1.9276561737060547, "learning_rate": 5e-07, "logits/chosen": -68775718.4, "logits/rejected": -54754288.0, "logps/chosen": -147.31826171875, "logps/rejected": -182.2132771809896, "loss": 0.4008, "rewards/chosen": 0.1161004662513733, "rewards/margins": 1.8566317518552145, "rewards/rejected": -1.740531285603841, "step": 10481 }, { "epoch": 0.5555879468899902, "grad_norm": 57.0, "kl": 2.018064498901367, "learning_rate": 5e-07, "logits/chosen": -23869972.0, "logits/rejected": -5840890.5, "logps/chosen": -329.02850341796875, "logps/rejected": -353.0999755859375, "loss": 0.3091, "rewards/chosen": 0.07068353146314621, "rewards/margins": 2.933495096862316, "rewards/rejected": -2.86281156539917, "step": 10482 }, { "epoch": 0.5556409508917923, "grad_norm": 52.5, "kl": 0.3587913513183594, "learning_rate": 5e-07, "logits/chosen": -29666876.0, "logits/rejected": -3577684.5, "logps/chosen": -312.0187072753906, "logps/rejected": -275.3840637207031, "loss": 0.3813, "rewards/chosen": 0.06347092986106873, "rewards/margins": 1.2905778586864471, "rewards/rejected": -1.2271069288253784, "step": 10483 }, { "epoch": 0.5556939548935945, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6988804.666666667, "logits/rejected": -37449948.8, "logps/chosen": -98.71408081054688, "logps/rejected": -234.57568359375, "loss": 0.2013, "rewards/chosen": 0.4587983290354411, "rewards/margins": 3.000783077875773, "rewards/rejected": -2.541984748840332, "step": 10484 }, { "epoch": 0.5557469588953966, "grad_norm": 53.25, "kl": 2.5492420196533203, "learning_rate": 5e-07, "logits/chosen": -17046464.0, "logits/rejected": 176439370.66666666, "logps/chosen": -244.9401611328125, "logps/rejected": -52.42145792643229, "loss": 0.4118, "rewards/chosen": 0.5814781665802002, "rewards/margins": 1.0883247931798299, "rewards/rejected": -0.5068466265996298, "step": 10485 }, { "epoch": 0.5557999628971987, "grad_norm": 46.0, "kl": 3.799367904663086, "learning_rate": 5e-07, "logits/chosen": -25460054.0, "logits/rejected": -46422624.0, "logps/chosen": -383.8692321777344, "logps/rejected": -517.7838541666666, "loss": 0.1738, "rewards/chosen": 2.15594220161438, "rewards/margins": 4.415765682856241, "rewards/rejected": -2.259823481241862, "step": 10486 }, { "epoch": 0.5558529668990009, "grad_norm": 25.625, "kl": 1.7663650512695312, "learning_rate": 5e-07, "logits/chosen": 20194.515625, "logits/rejected": -49026288.0, "logps/chosen": -167.5034637451172, "logps/rejected": -307.7145182291667, "loss": 0.1588, "rewards/chosen": 0.9890571236610413, "rewards/margins": 3.9915678302447, "rewards/rejected": -3.0025107065836587, "step": 10487 }, { "epoch": 0.555905970900803, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13821300.0, "logits/rejected": -2809270.25, "logps/chosen": -471.67626953125, "logps/rejected": -116.0185775756836, "loss": 0.4085, "rewards/chosen": -0.38072431087493896, "rewards/margins": 0.9694629907608032, "rewards/rejected": -1.3501873016357422, "step": 10488 }, { "epoch": 0.5559589749026052, "grad_norm": 49.25, "kl": 0.7816696166992188, "learning_rate": 5e-07, "logits/chosen": -190746464.0, "logits/rejected": -6289798.0, "logps/chosen": -1962.544189453125, "logps/rejected": -346.0142008463542, "loss": 0.216, "rewards/chosen": 1.5480256080627441, "rewards/margins": 3.638869603474935, "rewards/rejected": -2.090843995412191, "step": 10489 }, { "epoch": 0.5560119789044072, "grad_norm": 51.25, "kl": 1.2937698364257812, "learning_rate": 5e-07, "logits/chosen": -83832656.0, "logits/rejected": -15276034.0, "logps/chosen": -476.32623291015625, "logps/rejected": -580.259765625, "loss": 0.3187, "rewards/chosen": 0.358529657125473, "rewards/margins": 2.3593893945217133, "rewards/rejected": -2.0008597373962402, "step": 10490 }, { "epoch": 0.5560649829062094, "grad_norm": 30.75, "kl": 3.143573760986328, "learning_rate": 5e-07, "logits/chosen": -10141958.666666666, "logits/rejected": -41067116.8, "logps/chosen": -246.0960896809896, "logps/rejected": -424.90380859375, "loss": 0.191, "rewards/chosen": 1.5529967943827312, "rewards/margins": 4.422564665476481, "rewards/rejected": -2.86956787109375, "step": 10491 }, { "epoch": 0.5561179869080115, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11885201.6, "logits/rejected": -16106689.333333334, "logps/chosen": -325.222802734375, "logps/rejected": -576.2984619140625, "loss": 0.3398, "rewards/chosen": 0.3235626220703125, "rewards/margins": 2.6071228981018066, "rewards/rejected": -2.283560276031494, "step": 10492 }, { "epoch": 0.5561709909098137, "grad_norm": 42.25, "kl": 0.8996315002441406, "learning_rate": 5e-07, "logits/chosen": -31309580.8, "logits/rejected": -12044618.666666666, "logps/chosen": -146.57452392578125, "logps/rejected": -452.8048095703125, "loss": 0.3875, "rewards/chosen": 0.17434605360031127, "rewards/margins": 2.647426370779673, "rewards/rejected": -2.473080317179362, "step": 10493 }, { "epoch": 0.5562239949116158, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3752841.5, "logits/rejected": -11514770.666666666, "logps/chosen": -421.64300537109375, "logps/rejected": -372.8081461588542, "loss": 0.157, "rewards/chosen": 1.427882432937622, "rewards/margins": 3.6342238585154214, "rewards/rejected": -2.2063414255777993, "step": 10494 }, { "epoch": 0.556276998913418, "grad_norm": 58.5, "kl": 1.721583366394043, "learning_rate": 5e-07, "logits/chosen": -55349043.2, "logits/rejected": -6725602.0, "logps/chosen": -475.808203125, "logps/rejected": -236.872314453125, "loss": 0.3244, "rewards/chosen": 0.6224200248718261, "rewards/margins": 2.39704008102417, "rewards/rejected": -1.7746200561523438, "step": 10495 }, { "epoch": 0.5563300029152201, "grad_norm": 62.75, "kl": 1.9780731201171875, "learning_rate": 5e-07, "logits/chosen": -32305401.6, "logits/rejected": -42621701.333333336, "logps/chosen": -341.973974609375, "logps/rejected": -344.679443359375, "loss": 0.3719, "rewards/chosen": 0.3272500514984131, "rewards/margins": 2.6699945608774818, "rewards/rejected": -2.342744509379069, "step": 10496 }, { "epoch": 0.5563830069170222, "grad_norm": 43.0, "kl": 1.4168415069580078, "learning_rate": 5e-07, "logits/chosen": -24094600.0, "logits/rejected": -22413636.0, "logps/chosen": -197.30369567871094, "logps/rejected": -402.513671875, "loss": 0.2384, "rewards/chosen": 0.8785179853439331, "rewards/margins": 3.064813256263733, "rewards/rejected": -2.1862952709198, "step": 10497 }, { "epoch": 0.5564360109188243, "grad_norm": 48.25, "kl": 1.8600006103515625, "learning_rate": 5e-07, "logits/chosen": -76748464.0, "logits/rejected": -7435064.0, "logps/chosen": -654.9600219726562, "logps/rejected": -190.11007690429688, "loss": 0.3105, "rewards/chosen": 0.5535740256309509, "rewards/margins": 2.61975234746933, "rewards/rejected": -2.066178321838379, "step": 10498 }, { "epoch": 0.5564890149206265, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24615898.666666668, "logits/rejected": -34627008.0, "logps/chosen": -174.75069173177084, "logps/rejected": -387.881201171875, "loss": 0.196, "rewards/chosen": 0.6229512691497803, "rewards/margins": 3.4176909923553467, "rewards/rejected": -2.7947397232055664, "step": 10499 }, { "epoch": 0.5565420189224286, "grad_norm": 45.5, "kl": 0.647705078125, "learning_rate": 5e-07, "logits/chosen": -52097488.0, "logits/rejected": 1292850.0, "logps/chosen": -388.26910400390625, "logps/rejected": -322.08685302734375, "loss": 0.2588, "rewards/chosen": 0.48870086669921875, "rewards/margins": 3.1307854652404785, "rewards/rejected": -2.6420845985412598, "step": 10500 }, { "epoch": 0.5565950229242308, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4067503.3333333335, "logits/rejected": -52333024.0, "logps/chosen": -287.9677734375, "logps/rejected": -451.9703674316406, "loss": 0.2931, "rewards/chosen": 0.522629459698995, "rewards/margins": 3.6245307524998984, "rewards/rejected": -3.1019012928009033, "step": 10501 }, { "epoch": 0.5566480269260329, "grad_norm": 50.25, "kl": 0.7472476959228516, "learning_rate": 5e-07, "logits/chosen": -71562470.4, "logits/rejected": -17276316.0, "logps/chosen": -323.2166748046875, "logps/rejected": -225.54913330078125, "loss": 0.2165, "rewards/chosen": 0.9750492095947265, "rewards/margins": 4.088959884643555, "rewards/rejected": -3.113910675048828, "step": 10502 }, { "epoch": 0.5567010309278351, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38772693.333333336, "logits/rejected": -35460720.0, "logps/chosen": -246.86248779296875, "logps/rejected": -138.68251037597656, "loss": 0.3641, "rewards/chosen": 0.248376727104187, "rewards/margins": 2.886250376701355, "rewards/rejected": -2.637873649597168, "step": 10503 }, { "epoch": 0.5567540349296372, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17501560.0, "logits/rejected": -51549429.333333336, "logps/chosen": -226.63243103027344, "logps/rejected": -386.0486653645833, "loss": 0.1761, "rewards/chosen": 0.2866760492324829, "rewards/margins": 2.6969931523005166, "rewards/rejected": -2.4103171030680337, "step": 10504 }, { "epoch": 0.5568070389314393, "grad_norm": 53.25, "kl": 1.2488746643066406, "learning_rate": 5e-07, "logits/chosen": -72116632.0, "logits/rejected": -40573760.0, "logps/chosen": -424.49493408203125, "logps/rejected": -437.7481689453125, "loss": 0.3143, "rewards/chosen": 0.39560627937316895, "rewards/margins": 2.31242835521698, "rewards/rejected": -1.916822075843811, "step": 10505 }, { "epoch": 0.5568600429332414, "grad_norm": 48.0, "kl": 1.1923809051513672, "learning_rate": 5e-07, "logits/chosen": -16557176.0, "logits/rejected": -32615202.0, "logps/chosen": -268.493896484375, "logps/rejected": -190.79786682128906, "loss": 0.3824, "rewards/chosen": -0.1222776472568512, "rewards/margins": 1.5217402875423431, "rewards/rejected": -1.6440179347991943, "step": 10506 }, { "epoch": 0.5569130469350436, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66876992.0, "logits/rejected": -82124576.0, "logps/chosen": -338.3212193080357, "logps/rejected": -320.26171875, "loss": 0.4926, "rewards/chosen": -0.15968969890049525, "rewards/margins": 1.453466500554766, "rewards/rejected": -1.6131561994552612, "step": 10507 }, { "epoch": 0.5569660509368457, "grad_norm": 56.25, "kl": 0.4414691925048828, "learning_rate": 5e-07, "logits/chosen": -6241832.0, "logits/rejected": 2118636.8, "logps/chosen": -57.921529134114586, "logps/rejected": -228.4043212890625, "loss": 0.3304, "rewards/chosen": -0.008769363164901733, "rewards/margins": 1.5244530260562896, "rewards/rejected": -1.5332223892211914, "step": 10508 }, { "epoch": 0.5570190549386479, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20612140.0, "logits/rejected": -25665788.0, "logps/chosen": -204.47430419921875, "logps/rejected": -270.25128173828125, "loss": 0.3413, "rewards/chosen": -0.40764084458351135, "rewards/margins": 2.3650480210781097, "rewards/rejected": -2.772688865661621, "step": 10509 }, { "epoch": 0.55707205894045, "grad_norm": 57.0, "kl": 0.2330303192138672, "learning_rate": 5e-07, "logits/chosen": -57022112.0, "logits/rejected": -4902731.2, "logps/chosen": -444.3170166015625, "logps/rejected": -150.26341552734374, "loss": 0.3066, "rewards/chosen": 0.348297119140625, "rewards/margins": 1.6909589767456055, "rewards/rejected": -1.3426618576049805, "step": 10510 }, { "epoch": 0.5571250629422522, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6552902.0, "logits/rejected": 73076629.33333333, "logps/chosen": -235.14755249023438, "logps/rejected": -312.9480387369792, "loss": 0.2667, "rewards/chosen": 0.05752144753932953, "rewards/margins": 1.8177387962738674, "rewards/rejected": -1.7602173487345378, "step": 10511 }, { "epoch": 0.5571780669440543, "grad_norm": 49.0, "kl": 2.4368038177490234, "learning_rate": 5e-07, "logits/chosen": -4461340.5, "logits/rejected": -28518840.0, "logps/chosen": -307.70684814453125, "logps/rejected": -474.1938171386719, "loss": 0.2588, "rewards/chosen": 0.9029372334480286, "rewards/margins": 3.74165803194046, "rewards/rejected": -2.8387207984924316, "step": 10512 }, { "epoch": 0.5572310709458564, "grad_norm": 55.75, "kl": 1.3107414245605469, "learning_rate": 5e-07, "logits/chosen": -21262398.666666668, "logits/rejected": -7374365.0, "logps/chosen": -399.3324381510417, "logps/rejected": -85.28255462646484, "loss": 0.2676, "rewards/chosen": 0.9129695892333984, "rewards/margins": 3.478394031524658, "rewards/rejected": -2.5654244422912598, "step": 10513 }, { "epoch": 0.5572840749476585, "grad_norm": 45.5, "kl": 0.9881019592285156, "learning_rate": 5e-07, "logits/chosen": -25960424.0, "logits/rejected": -44790384.0, "logps/chosen": -448.4086608886719, "logps/rejected": -219.62973022460938, "loss": 0.2935, "rewards/chosen": 0.7150870561599731, "rewards/margins": 2.685243010520935, "rewards/rejected": -1.970155954360962, "step": 10514 }, { "epoch": 0.5573370789494607, "grad_norm": 56.25, "kl": 3.126955509185791, "learning_rate": 5e-07, "logits/chosen": -29745605.333333332, "logits/rejected": 3675785.25, "logps/chosen": -296.1513264973958, "logps/rejected": -80.01927185058594, "loss": 0.4777, "rewards/chosen": 0.15979242324829102, "rewards/margins": 0.9851086735725403, "rewards/rejected": -0.8253162503242493, "step": 10515 }, { "epoch": 0.5573900829512628, "grad_norm": 59.75, "kl": 0.5333023071289062, "learning_rate": 5e-07, "logits/chosen": -21637971.2, "logits/rejected": -17823370.666666668, "logps/chosen": -513.859375, "logps/rejected": -307.46238199869794, "loss": 0.3161, "rewards/chosen": 0.4651947021484375, "rewards/margins": 2.3606958389282227, "rewards/rejected": -1.8955011367797852, "step": 10516 }, { "epoch": 0.557443086953065, "grad_norm": 50.25, "kl": 0.022260665893554688, "learning_rate": 5e-07, "logits/chosen": -21544276.0, "logits/rejected": -20803244.0, "logps/chosen": -221.2334747314453, "logps/rejected": -272.2867431640625, "loss": 0.2834, "rewards/chosen": -0.16849595308303833, "rewards/margins": 1.3071699341138203, "rewards/rejected": -1.4756658871968586, "step": 10517 }, { "epoch": 0.5574960909548671, "grad_norm": 47.25, "kl": 1.1652450561523438, "learning_rate": 5e-07, "logits/chosen": -20748456.0, "logits/rejected": -20909896.0, "logps/chosen": -257.8467102050781, "logps/rejected": -339.68701171875, "loss": 0.172, "rewards/chosen": 1.6406677961349487, "rewards/margins": 4.094837307929993, "rewards/rejected": -2.454169511795044, "step": 10518 }, { "epoch": 0.5575490949566693, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21573268.0, "logits/rejected": -28542707.2, "logps/chosen": -274.2474365234375, "logps/rejected": -353.5852294921875, "loss": 0.2253, "rewards/chosen": 0.2950297196706136, "rewards/margins": 3.0757622559865316, "rewards/rejected": -2.780732536315918, "step": 10519 }, { "epoch": 0.5576020989584713, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51071865.6, "logits/rejected": -22730909.333333332, "logps/chosen": -350.830322265625, "logps/rejected": -474.2869466145833, "loss": 0.2825, "rewards/chosen": 0.43579726219177245, "rewards/margins": 3.942066717147827, "rewards/rejected": -3.5062694549560547, "step": 10520 }, { "epoch": 0.5576551029602735, "grad_norm": 55.0, "kl": 0.1032867431640625, "learning_rate": 5e-07, "logits/chosen": -80111616.0, "logits/rejected": -13866004.0, "logps/chosen": -473.501953125, "logps/rejected": -259.87249755859375, "loss": 0.241, "rewards/chosen": 0.5309661626815796, "rewards/margins": 3.277643322944641, "rewards/rejected": -2.7466771602630615, "step": 10521 }, { "epoch": 0.5577081069620756, "grad_norm": 58.5, "kl": 1.719949722290039, "learning_rate": 5e-07, "logits/chosen": -2338153.5, "logits/rejected": -23426526.0, "logps/chosen": -201.72390747070312, "logps/rejected": -688.2071533203125, "loss": 0.2321, "rewards/chosen": 1.0568755865097046, "rewards/margins": 3.287969946861267, "rewards/rejected": -2.2310943603515625, "step": 10522 }, { "epoch": 0.5577611109638778, "grad_norm": 48.5, "kl": 2.076584815979004, "learning_rate": 5e-07, "logits/chosen": -10775803.42857143, "logits/rejected": -7599172.0, "logps/chosen": -442.54366629464283, "logps/rejected": -65.03514099121094, "loss": 0.4089, "rewards/chosen": 0.6271249226161412, "rewards/margins": 2.634475128991263, "rewards/rejected": -2.007350206375122, "step": 10523 }, { "epoch": 0.5578141149656799, "grad_norm": 50.25, "kl": 3.6008434295654297, "learning_rate": 5e-07, "logits/chosen": -86576256.0, "logits/rejected": -25603168.0, "logps/chosen": -955.1297607421875, "logps/rejected": -366.95770263671875, "loss": 0.2441, "rewards/chosen": 1.4102667570114136, "rewards/margins": 3.5679620504379272, "rewards/rejected": -2.1576952934265137, "step": 10524 }, { "epoch": 0.5578671189674821, "grad_norm": 37.0, "kl": 1.3789396286010742, "learning_rate": 5e-07, "logits/chosen": -4028880.25, "logits/rejected": -24584590.0, "logps/chosen": -143.5187225341797, "logps/rejected": -253.7567901611328, "loss": 0.2971, "rewards/chosen": 0.08439421653747559, "rewards/margins": 4.037546873092651, "rewards/rejected": -3.953152656555176, "step": 10525 }, { "epoch": 0.5579201229692842, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -729789.0, "logits/rejected": -10875035.0, "logps/chosen": -199.39649963378906, "logps/rejected": -390.52899169921875, "loss": 0.3714, "rewards/chosen": -0.5479589700698853, "rewards/margins": 1.5206378698349, "rewards/rejected": -2.068596839904785, "step": 10526 }, { "epoch": 0.5579731269710864, "grad_norm": 46.25, "kl": 0.8999347686767578, "learning_rate": 5e-07, "logits/chosen": -9584356.0, "logits/rejected": -36399308.0, "logps/chosen": -126.17649841308594, "logps/rejected": -227.7678985595703, "loss": 0.3594, "rewards/chosen": -0.11956749856472015, "rewards/margins": 2.271933689713478, "rewards/rejected": -2.3915011882781982, "step": 10527 }, { "epoch": 0.5580261309728884, "grad_norm": 53.5, "kl": 7.089834213256836, "learning_rate": 5e-07, "logits/chosen": -8087404.8, "logits/rejected": -11997528.0, "logps/chosen": -534.7716796875, "logps/rejected": -284.68670654296875, "loss": 0.3474, "rewards/chosen": 1.2864237785339356, "rewards/margins": 3.28597141901652, "rewards/rejected": -1.9995476404825847, "step": 10528 }, { "epoch": 0.5580791349746906, "grad_norm": 56.0, "kl": 0.6299896240234375, "learning_rate": 5e-07, "logits/chosen": -13600122.666666666, "logits/rejected": -15493206.0, "logps/chosen": -309.91392008463544, "logps/rejected": -87.08699035644531, "loss": 0.4014, "rewards/chosen": 0.31482475996017456, "rewards/margins": 2.330523431301117, "rewards/rejected": -2.0156986713409424, "step": 10529 }, { "epoch": 0.5581321389764927, "grad_norm": 20.875, "kl": 1.7725181579589844, "learning_rate": 5e-07, "logits/chosen": 4258400.666666667, "logits/rejected": 15828532.8, "logps/chosen": -912.45947265625, "logps/rejected": -747.798388671875, "loss": 0.156, "rewards/chosen": 1.4480106035868328, "rewards/margins": 5.673626677195231, "rewards/rejected": -4.225616073608398, "step": 10530 }, { "epoch": 0.5581851429782949, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22422068.0, "logits/rejected": -6973826.666666667, "logps/chosen": -181.3913116455078, "logps/rejected": -277.8609619140625, "loss": 0.281, "rewards/chosen": -0.23791007697582245, "rewards/margins": 1.4019899715979893, "rewards/rejected": -1.6399000485738118, "step": 10531 }, { "epoch": 0.558238146980097, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11546312.0, "logits/rejected": -23574880.0, "logps/chosen": -265.22308349609375, "logps/rejected": -415.0636901855469, "loss": 0.2449, "rewards/chosen": 0.5155439376831055, "rewards/margins": 3.1661508083343506, "rewards/rejected": -2.650606870651245, "step": 10532 }, { "epoch": 0.5582911509818992, "grad_norm": 49.5, "kl": 2.153903007507324, "learning_rate": 5e-07, "logits/chosen": -40276292.0, "logits/rejected": -18957424.0, "logps/chosen": -280.87567138671875, "logps/rejected": -228.79409790039062, "loss": 0.2633, "rewards/chosen": 0.7873103618621826, "rewards/margins": 2.859196662902832, "rewards/rejected": -2.0718863010406494, "step": 10533 }, { "epoch": 0.5583441549837013, "grad_norm": 44.75, "kl": 0.515960693359375, "learning_rate": 5e-07, "logits/chosen": -21853030.0, "logits/rejected": -25869184.0, "logps/chosen": -358.83905029296875, "logps/rejected": -242.7231903076172, "loss": 0.3135, "rewards/chosen": 0.6515592336654663, "rewards/margins": 2.117772936820984, "rewards/rejected": -1.4662137031555176, "step": 10534 }, { "epoch": 0.5583971589855034, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20116893.333333332, "logits/rejected": -50662076.8, "logps/chosen": -169.7491658528646, "logps/rejected": -220.484521484375, "loss": 0.2899, "rewards/chosen": 0.03658244510491689, "rewards/margins": 1.7850035419066745, "rewards/rejected": -1.7484210968017577, "step": 10535 }, { "epoch": 0.5584501629873055, "grad_norm": 44.25, "kl": 1.1657180786132812, "learning_rate": 5e-07, "logits/chosen": 529871.6666666666, "logits/rejected": -9285913.6, "logps/chosen": -220.10603841145834, "logps/rejected": -334.0875, "loss": 0.2405, "rewards/chosen": 0.4907408555348714, "rewards/margins": 2.6951483567555745, "rewards/rejected": -2.204407501220703, "step": 10536 }, { "epoch": 0.5585031669891076, "grad_norm": 74.5, "kl": 2.626922607421875, "learning_rate": 5e-07, "logits/chosen": -32069856.0, "logits/rejected": -15107065.333333334, "logps/chosen": -651.95146484375, "logps/rejected": -259.013427734375, "loss": 0.3666, "rewards/chosen": 0.36697189807891845, "rewards/margins": 2.295143214861552, "rewards/rejected": -1.9281713167826335, "step": 10537 }, { "epoch": 0.5585561709909098, "grad_norm": 57.25, "kl": 2.569626808166504, "learning_rate": 5e-07, "logits/chosen": -11544542.0, "logits/rejected": -26503394.666666668, "logps/chosen": -135.20147705078125, "logps/rejected": -254.4178263346354, "loss": 0.2716, "rewards/chosen": 0.4601762592792511, "rewards/margins": 2.1701667606830597, "rewards/rejected": -1.7099905014038086, "step": 10538 }, { "epoch": 0.5586091749927119, "grad_norm": 52.0, "kl": 0.6272773742675781, "learning_rate": 5e-07, "logits/chosen": -5851864.0, "logits/rejected": -8901552.0, "logps/chosen": -226.75424194335938, "logps/rejected": -159.74066162109375, "loss": 0.3566, "rewards/chosen": 0.30163657665252686, "rewards/margins": 1.6043983697891235, "rewards/rejected": -1.3027617931365967, "step": 10539 }, { "epoch": 0.5586621789945141, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58248981.333333336, "logits/rejected": -91818432.0, "logps/chosen": -666.9999593098959, "logps/rejected": -452.5342102050781, "loss": 0.2939, "rewards/chosen": 0.7757658958435059, "rewards/margins": 3.1745269298553467, "rewards/rejected": -2.398761034011841, "step": 10540 }, { "epoch": 0.5587151829963162, "grad_norm": 51.0, "kl": 0.8144874572753906, "learning_rate": 5e-07, "logits/chosen": -26843593.6, "logits/rejected": -2029065.1666666667, "logps/chosen": -423.04599609375, "logps/rejected": -115.44639078776042, "loss": 0.2555, "rewards/chosen": 1.0791610717773437, "rewards/margins": 3.1415897051493324, "rewards/rejected": -2.062428633371989, "step": 10541 }, { "epoch": 0.5587681869981184, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44544650.666666664, "logits/rejected": -5443816.8, "logps/chosen": -322.2207438151042, "logps/rejected": -155.223583984375, "loss": 0.2966, "rewards/chosen": 1.0579091707865398, "rewards/margins": 2.15028559366862, "rewards/rejected": -1.09237642288208, "step": 10542 }, { "epoch": 0.5588211909999204, "grad_norm": 50.5, "kl": 0.018972396850585938, "learning_rate": 5e-07, "logits/chosen": -29340108.0, "logits/rejected": -16011741.0, "logps/chosen": -301.3641357421875, "logps/rejected": -373.1796569824219, "loss": 0.2971, "rewards/chosen": 0.5274953842163086, "rewards/margins": 2.4282774925231934, "rewards/rejected": -1.9007821083068848, "step": 10543 }, { "epoch": 0.5588741950017226, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9294536.666666666, "logits/rejected": -52436544.0, "logps/chosen": -208.12451171875, "logps/rejected": -223.5044708251953, "loss": 0.3517, "rewards/chosen": 0.43471165498097736, "rewards/margins": 2.3494951327641806, "rewards/rejected": -1.9147834777832031, "step": 10544 }, { "epoch": 0.5589271990035247, "grad_norm": 55.75, "kl": 0.7550582885742188, "learning_rate": 5e-07, "logits/chosen": -62170784.0, "logits/rejected": 6739680.0, "logps/chosen": -527.4827473958334, "logps/rejected": -355.960986328125, "loss": 0.2401, "rewards/chosen": 0.8281229337056478, "rewards/margins": 2.8289125760396323, "rewards/rejected": -2.0007896423339844, "step": 10545 }, { "epoch": 0.5589802030053269, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17796846.0, "logits/rejected": -29167258.0, "logps/chosen": -255.6782684326172, "logps/rejected": -348.87359619140625, "loss": 0.2308, "rewards/chosen": 0.8535141348838806, "rewards/margins": 3.0112492442131042, "rewards/rejected": -2.1577351093292236, "step": 10546 }, { "epoch": 0.559033207007129, "grad_norm": 55.5, "kl": 0.7347011566162109, "learning_rate": 5e-07, "logits/chosen": -36262160.0, "logits/rejected": 44671896.0, "logps/chosen": -230.9678466796875, "logps/rejected": -134.35107421875, "loss": 0.4376, "rewards/chosen": 0.001894479990005493, "rewards/margins": 0.7453627645969391, "rewards/rejected": -0.7434682846069336, "step": 10547 }, { "epoch": 0.5590862110089312, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51092096.0, "logits/rejected": -26224546.666666668, "logps/chosen": -358.5067138671875, "logps/rejected": -290.85459391276044, "loss": 0.2059, "rewards/chosen": 0.04254608601331711, "rewards/margins": 2.359673631687959, "rewards/rejected": -2.317127545674642, "step": 10548 }, { "epoch": 0.5591392150107333, "grad_norm": 52.25, "kl": 0.3462333679199219, "learning_rate": 5e-07, "logits/chosen": -61609032.0, "logits/rejected": -7591993.0, "logps/chosen": -423.5771484375, "logps/rejected": -138.3045654296875, "loss": 0.3259, "rewards/chosen": 0.6361801028251648, "rewards/margins": 1.6621468663215637, "rewards/rejected": -1.025966763496399, "step": 10549 }, { "epoch": 0.5591922190125355, "grad_norm": 58.0, "kl": 1.4062843322753906, "learning_rate": 5e-07, "logits/chosen": -69217632.0, "logits/rejected": -22289472.0, "logps/chosen": -358.257861328125, "logps/rejected": -269.23956298828125, "loss": 0.2935, "rewards/chosen": 0.47318272590637206, "rewards/margins": 3.7902053674062093, "rewards/rejected": -3.3170226414998374, "step": 10550 }, { "epoch": 0.5592452230143375, "grad_norm": 53.75, "kl": 2.3239078521728516, "learning_rate": 5e-07, "logits/chosen": -22287446.0, "logits/rejected": -31880110.0, "logps/chosen": -215.10202026367188, "logps/rejected": -294.904541015625, "loss": 0.3084, "rewards/chosen": 1.0840836763381958, "rewards/margins": 2.4775609970092773, "rewards/rejected": -1.3934773206710815, "step": 10551 }, { "epoch": 0.5592982270161397, "grad_norm": 50.0, "kl": 0.8512077331542969, "learning_rate": 5e-07, "logits/chosen": 14335550.0, "logits/rejected": -15921696.0, "logps/chosen": -188.18869018554688, "logps/rejected": -591.7548828125, "loss": 0.2514, "rewards/chosen": 0.761343240737915, "rewards/margins": 3.939995765686035, "rewards/rejected": -3.17865252494812, "step": 10552 }, { "epoch": 0.5593512310179418, "grad_norm": 46.75, "kl": 1.049112319946289, "learning_rate": 5e-07, "logits/chosen": -14472699.0, "logits/rejected": -60879412.0, "logps/chosen": -181.8893585205078, "logps/rejected": -488.400146484375, "loss": 0.3128, "rewards/chosen": 0.16254273056983948, "rewards/margins": 2.569465070962906, "rewards/rejected": -2.4069223403930664, "step": 10553 }, { "epoch": 0.559404235019744, "grad_norm": 61.0, "kl": 2.733302116394043, "learning_rate": 5e-07, "logits/chosen": -30373674.666666668, "logits/rejected": -35870512.0, "logps/chosen": -286.29164632161456, "logps/rejected": -238.4622802734375, "loss": 0.4223, "rewards/chosen": 0.17862993478775024, "rewards/margins": 3.020849049091339, "rewards/rejected": -2.842219114303589, "step": 10554 }, { "epoch": 0.5594572390215461, "grad_norm": 57.25, "kl": 0.6744194030761719, "learning_rate": 5e-07, "logits/chosen": -22519514.666666668, "logits/rejected": -20298818.0, "logps/chosen": -298.517578125, "logps/rejected": -467.0382080078125, "loss": 0.4578, "rewards/chosen": -0.30451633532842, "rewards/margins": 3.1410298148790994, "rewards/rejected": -3.4455461502075195, "step": 10555 }, { "epoch": 0.5595102430233483, "grad_norm": 65.5, "kl": 1.827162742614746, "learning_rate": 5e-07, "logits/chosen": -35268582.4, "logits/rejected": -50134160.0, "logps/chosen": -192.74847412109375, "logps/rejected": -596.8489583333334, "loss": 0.2692, "rewards/chosen": 0.7765775203704834, "rewards/margins": 3.9532040437062586, "rewards/rejected": -3.176626523335775, "step": 10556 }, { "epoch": 0.5595632470251504, "grad_norm": 48.5, "kl": 1.3917884826660156, "learning_rate": 5e-07, "logits/chosen": -59894240.0, "logits/rejected": -6838524.8, "logps/chosen": -324.748046875, "logps/rejected": -173.04996337890626, "loss": 0.3507, "rewards/chosen": -0.020041386286417644, "rewards/margins": 1.235619815190633, "rewards/rejected": -1.2556612014770507, "step": 10557 }, { "epoch": 0.5596162510269526, "grad_norm": 35.0, "kl": 1.119405746459961, "learning_rate": 5e-07, "logits/chosen": 13632602.0, "logits/rejected": -16902650.666666668, "logps/chosen": -133.84580993652344, "logps/rejected": -353.6542561848958, "loss": 0.2087, "rewards/chosen": 0.4412159025669098, "rewards/margins": 2.8669895231723785, "rewards/rejected": -2.4257736206054688, "step": 10558 }, { "epoch": 0.5596692550287546, "grad_norm": 71.0, "kl": 1.7537498474121094, "learning_rate": 5e-07, "logits/chosen": -82393034.66666667, "logits/rejected": 1548140.2, "logps/chosen": -803.099609375, "logps/rejected": -94.6376708984375, "loss": 0.2799, "rewards/chosen": 1.3963074684143066, "rewards/margins": 2.5341847419738768, "rewards/rejected": -1.1378772735595704, "step": 10559 }, { "epoch": 0.5597222590305568, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5842950.0, "logits/rejected": -50747373.71428572, "logps/chosen": -38.30546569824219, "logps/rejected": -269.3059779575893, "loss": 0.2217, "rewards/chosen": 0.1986236572265625, "rewards/margins": 1.9098542077200753, "rewards/rejected": -1.7112305504935128, "step": 10560 }, { "epoch": 0.5597752630323589, "grad_norm": 36.0, "kl": 1.0212936401367188, "learning_rate": 5e-07, "logits/chosen": -171526.25, "logits/rejected": -23906061.333333332, "logps/chosen": -42.32841491699219, "logps/rejected": -400.922119140625, "loss": 0.2603, "rewards/chosen": 0.09903478622436523, "rewards/margins": 2.562203566233317, "rewards/rejected": -2.4631687800089517, "step": 10561 }, { "epoch": 0.5598282670341611, "grad_norm": 38.0, "kl": 2.864988327026367, "learning_rate": 5e-07, "logits/chosen": -20985458.666666668, "logits/rejected": -21317806.0, "logps/chosen": -282.62359619140625, "logps/rejected": -308.171630859375, "loss": 0.3803, "rewards/chosen": 0.4620491663614909, "rewards/margins": 3.132103125254313, "rewards/rejected": -2.6700539588928223, "step": 10562 }, { "epoch": 0.5598812710359632, "grad_norm": 45.75, "kl": 2.3435916900634766, "learning_rate": 5e-07, "logits/chosen": -4782578.666666667, "logits/rejected": -30163478.4, "logps/chosen": -141.62186686197916, "logps/rejected": -476.64033203125, "loss": 0.2494, "rewards/chosen": 0.6467258532842001, "rewards/margins": 3.452983101209005, "rewards/rejected": -2.8062572479248047, "step": 10563 }, { "epoch": 0.5599342750377654, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53305557.333333336, "logits/rejected": -449669.3, "logps/chosen": -368.5834147135417, "logps/rejected": -276.4828125, "loss": 0.2158, "rewards/chosen": 0.5717787742614746, "rewards/margins": 2.8823454856872557, "rewards/rejected": -2.310566711425781, "step": 10564 }, { "epoch": 0.5599872790395675, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10647702.0, "logits/rejected": -11520001.6, "logps/chosen": -466.6634521484375, "logps/rejected": -290.49658203125, "loss": 0.2224, "rewards/chosen": 0.9516774813334147, "rewards/margins": 3.0247569719950356, "rewards/rejected": -2.073079490661621, "step": 10565 }, { "epoch": 0.5600402830413697, "grad_norm": 43.25, "kl": 1.3206796646118164, "learning_rate": 5e-07, "logits/chosen": -14667000.0, "logits/rejected": -34283926.4, "logps/chosen": -186.7626749674479, "logps/rejected": -334.788330078125, "loss": 0.2756, "rewards/chosen": 0.48392311731974286, "rewards/margins": 2.0268109957377116, "rewards/rejected": -1.5428878784179687, "step": 10566 }, { "epoch": 0.5600932870431717, "grad_norm": 51.25, "kl": 2.5228185653686523, "learning_rate": 5e-07, "logits/chosen": 231085.14285714287, "logits/rejected": -5607980.5, "logps/chosen": -291.2830287388393, "logps/rejected": -74.50654602050781, "loss": 0.3959, "rewards/chosen": 0.5697193826947894, "rewards/margins": 2.6805790151868547, "rewards/rejected": -2.1108596324920654, "step": 10567 }, { "epoch": 0.5601462910449739, "grad_norm": 100.0, "kl": 1.4573707580566406, "learning_rate": 5e-07, "logits/chosen": -19178256.0, "logits/rejected": -45481450.666666664, "logps/chosen": -701.6060546875, "logps/rejected": -412.475830078125, "loss": 0.3365, "rewards/chosen": 0.4126945972442627, "rewards/margins": 2.784065548578898, "rewards/rejected": -2.3713709513346353, "step": 10568 }, { "epoch": 0.560199295046776, "grad_norm": 45.5, "kl": 2.0097217559814453, "learning_rate": 5e-07, "logits/chosen": -26073560.0, "logits/rejected": -7780160.666666667, "logps/chosen": -379.30205078125, "logps/rejected": -337.73781331380206, "loss": 0.3625, "rewards/chosen": 0.5766328334808349, "rewards/margins": 2.878832197189331, "rewards/rejected": -2.302199363708496, "step": 10569 }, { "epoch": 0.5602522990485782, "grad_norm": 44.25, "kl": 0.6749057769775391, "learning_rate": 5e-07, "logits/chosen": -47619285.333333336, "logits/rejected": -23205056.0, "logps/chosen": -258.8822428385417, "logps/rejected": -160.71058349609376, "loss": 0.3076, "rewards/chosen": 0.25386786460876465, "rewards/margins": 1.5427241802215577, "rewards/rejected": -1.288856315612793, "step": 10570 }, { "epoch": 0.5603053030503803, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25145960.0, "logits/rejected": -24326281.6, "logps/chosen": -305.0174560546875, "logps/rejected": -268.9057373046875, "loss": 0.2249, "rewards/chosen": 0.6715655326843262, "rewards/margins": 2.827598476409912, "rewards/rejected": -2.156032943725586, "step": 10571 }, { "epoch": 0.5603583070521825, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45211253.333333336, "logits/rejected": -26854166.4, "logps/chosen": -357.1507161458333, "logps/rejected": -350.10732421875, "loss": 0.2585, "rewards/chosen": 0.2070260445276896, "rewards/margins": 2.3349353233973185, "rewards/rejected": -2.127909278869629, "step": 10572 }, { "epoch": 0.5604113110539846, "grad_norm": 52.25, "kl": 0.8847217559814453, "learning_rate": 5e-07, "logits/chosen": -27709542.4, "logits/rejected": -29232896.0, "logps/chosen": -240.67470703125, "logps/rejected": -285.3499348958333, "loss": 0.4181, "rewards/chosen": 0.07929906845092774, "rewards/margins": 1.5827229181925457, "rewards/rejected": -1.503423849741618, "step": 10573 }, { "epoch": 0.5604643150557868, "grad_norm": 30.875, "kl": 1.6784591674804688, "learning_rate": 5e-07, "logits/chosen": 6619633.333333333, "logits/rejected": -27744524.8, "logps/chosen": -291.6864420572917, "logps/rejected": -282.91337890625, "loss": 0.1496, "rewards/chosen": 1.347413698832194, "rewards/margins": 3.8706419626871744, "rewards/rejected": -2.5232282638549806, "step": 10574 }, { "epoch": 0.5605173190575888, "grad_norm": 69.0, "kl": 4.773011207580566, "learning_rate": 5e-07, "logits/chosen": -9968950.0, "logits/rejected": -21284700.0, "logps/chosen": -499.0041097005208, "logps/rejected": -175.81715393066406, "loss": 0.3356, "rewards/chosen": 1.0833070278167725, "rewards/margins": 2.777269959449768, "rewards/rejected": -1.6939629316329956, "step": 10575 }, { "epoch": 0.560570323059391, "grad_norm": 53.25, "kl": 0.564178466796875, "learning_rate": 5e-07, "logits/chosen": -31708748.8, "logits/rejected": -58185482.666666664, "logps/chosen": -345.615087890625, "logps/rejected": -264.28448486328125, "loss": 0.309, "rewards/chosen": 0.5001486301422119, "rewards/margins": 3.354265387852987, "rewards/rejected": -2.854116757710775, "step": 10576 }, { "epoch": 0.5606233270611931, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86477472.0, "logits/rejected": -22316164.0, "logps/chosen": -368.4396057128906, "logps/rejected": -352.0611267089844, "loss": 0.2333, "rewards/chosen": 0.8171468377113342, "rewards/margins": 3.1036437153816223, "rewards/rejected": -2.286496877670288, "step": 10577 }, { "epoch": 0.5606763310629953, "grad_norm": 41.75, "kl": 0.7275390625, "learning_rate": 5e-07, "logits/chosen": -42716760.0, "logits/rejected": -13292116.0, "logps/chosen": -281.45188395182294, "logps/rejected": -265.008984375, "loss": 0.2566, "rewards/chosen": 0.3524765173594157, "rewards/margins": 2.3481359640757242, "rewards/rejected": -1.9956594467163087, "step": 10578 }, { "epoch": 0.5607293350647974, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28701342.0, "logits/rejected": -32492298.0, "logps/chosen": -330.4610290527344, "logps/rejected": -476.0132141113281, "loss": 0.2836, "rewards/chosen": 0.4652973413467407, "rewards/margins": 2.2840908765792847, "rewards/rejected": -1.818793535232544, "step": 10579 }, { "epoch": 0.5607823390665996, "grad_norm": 62.25, "kl": 0.7687149047851562, "learning_rate": 5e-07, "logits/chosen": -22362736.0, "logits/rejected": -12596008.0, "logps/chosen": -286.1379638671875, "logps/rejected": -249.70638020833334, "loss": 0.4212, "rewards/chosen": -0.15620802640914916, "rewards/margins": 1.122885239124298, "rewards/rejected": -1.2790932655334473, "step": 10580 }, { "epoch": 0.5608353430684017, "grad_norm": 31.875, "kl": 1.2187862396240234, "learning_rate": 5e-07, "logits/chosen": -23876824.0, "logits/rejected": -23128436.0, "logps/chosen": -336.107421875, "logps/rejected": -330.6659851074219, "loss": 0.182, "rewards/chosen": 1.3200167417526245, "rewards/margins": 3.98261296749115, "rewards/rejected": -2.6625962257385254, "step": 10581 }, { "epoch": 0.5608883470702039, "grad_norm": 66.5, "kl": 3.8936405181884766, "learning_rate": 5e-07, "logits/chosen": -8621860.0, "logits/rejected": 24414980.0, "logps/chosen": -197.68892415364584, "logps/rejected": -458.21075439453125, "loss": 0.3989, "rewards/chosen": 0.6627038319905599, "rewards/margins": 2.2065507968266806, "rewards/rejected": -1.5438469648361206, "step": 10582 }, { "epoch": 0.5609413510720059, "grad_norm": 38.25, "kl": 1.3018693923950195, "learning_rate": 5e-07, "logits/chosen": -14524314.0, "logits/rejected": -1961386.5, "logps/chosen": -112.78326416015625, "logps/rejected": -272.40887451171875, "loss": 0.338, "rewards/chosen": 0.3700639605522156, "rewards/margins": 2.4727277159690857, "rewards/rejected": -2.10266375541687, "step": 10583 }, { "epoch": 0.5609943550738081, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49542832.0, "logits/rejected": -24764840.0, "logps/chosen": -557.5182495117188, "logps/rejected": -366.8141682942708, "loss": 0.1595, "rewards/chosen": 0.3603256344795227, "rewards/margins": 3.0944307446479797, "rewards/rejected": -2.734105110168457, "step": 10584 }, { "epoch": 0.5610473590756102, "grad_norm": 66.0, "kl": 0.048274993896484375, "learning_rate": 5e-07, "logits/chosen": 40249197.333333336, "logits/rejected": -16684870.4, "logps/chosen": -1076.6092122395833, "logps/rejected": -259.5037841796875, "loss": 0.1915, "rewards/chosen": 0.9511373043060303, "rewards/margins": 3.2040711879730224, "rewards/rejected": -2.252933883666992, "step": 10585 }, { "epoch": 0.5611003630774123, "grad_norm": 52.25, "kl": 3.507089614868164, "learning_rate": 5e-07, "logits/chosen": -26156524.8, "logits/rejected": -46816133.333333336, "logps/chosen": -394.7113525390625, "logps/rejected": -478.954833984375, "loss": 0.2818, "rewards/chosen": 1.1264379501342774, "rewards/margins": 3.577742703755697, "rewards/rejected": -2.4513047536214194, "step": 10586 }, { "epoch": 0.5611533670792145, "grad_norm": 66.0, "kl": 1.8574085235595703, "learning_rate": 5e-07, "logits/chosen": -31418441.14285714, "logits/rejected": -24078960.0, "logps/chosen": -540.2724609375, "logps/rejected": -251.5177001953125, "loss": 0.406, "rewards/chosen": 0.46423074177333284, "rewards/margins": 2.4529697554452077, "rewards/rejected": -1.988739013671875, "step": 10587 }, { "epoch": 0.5612063710810166, "grad_norm": 76.5, "kl": 4.605098724365234, "learning_rate": 5e-07, "logits/chosen": -17648142.4, "logits/rejected": 2725679.6666666665, "logps/chosen": -480.543310546875, "logps/rejected": -170.93400065104166, "loss": 0.2962, "rewards/chosen": 1.1678847312927245, "rewards/margins": 3.3931750297546386, "rewards/rejected": -2.225290298461914, "step": 10588 }, { "epoch": 0.5612593750828188, "grad_norm": 41.0, "kl": 1.1026840209960938, "learning_rate": 5e-07, "logits/chosen": -26457540.0, "logits/rejected": -37928988.0, "logps/chosen": -418.3377380371094, "logps/rejected": -301.53912353515625, "loss": 0.2143, "rewards/chosen": 1.336188554763794, "rewards/margins": 3.8840525150299072, "rewards/rejected": -2.5478639602661133, "step": 10589 }, { "epoch": 0.5613123790846208, "grad_norm": 50.25, "kl": 0.631627082824707, "learning_rate": 5e-07, "logits/chosen": -22751124.57142857, "logits/rejected": -1789887.375, "logps/chosen": -208.85675920758928, "logps/rejected": -90.96150207519531, "loss": 0.4827, "rewards/chosen": -0.1618828092302595, "rewards/margins": 3.17479259627206, "rewards/rejected": -3.3366754055023193, "step": 10590 }, { "epoch": 0.561365383086423, "grad_norm": 63.25, "kl": 1.470083236694336, "learning_rate": 5e-07, "logits/chosen": -59740544.0, "logits/rejected": -28513648.0, "logps/chosen": -295.3741861979167, "logps/rejected": -346.9384765625, "loss": 0.3481, "rewards/chosen": 0.6827417214711508, "rewards/margins": 2.2516256173451743, "rewards/rejected": -1.5688838958740234, "step": 10591 }, { "epoch": 0.5614183870882251, "grad_norm": 82.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21844549.333333332, "logits/rejected": -10038952.0, "logps/chosen": -255.1757609049479, "logps/rejected": -421.68720703125, "loss": 0.2244, "rewards/chosen": 0.6612386306126913, "rewards/margins": 3.0857410033543906, "rewards/rejected": -2.424502372741699, "step": 10592 }, { "epoch": 0.5614713910900273, "grad_norm": 50.0, "kl": 0.10493183135986328, "learning_rate": 5e-07, "logits/chosen": -37662666.666666664, "logits/rejected": -21013264.0, "logps/chosen": -229.16455078125, "logps/rejected": -468.872216796875, "loss": 0.3212, "rewards/chosen": -0.5389469067255656, "rewards/margins": 1.822700889905294, "rewards/rejected": -2.3616477966308596, "step": 10593 }, { "epoch": 0.5615243950918294, "grad_norm": 49.25, "kl": 1.3408546447753906, "learning_rate": 5e-07, "logits/chosen": -26515177.6, "logits/rejected": -25567952.0, "logps/chosen": -397.7996337890625, "logps/rejected": -312.43896484375, "loss": 0.2504, "rewards/chosen": 0.7889244079589843, "rewards/margins": 3.5741836547851564, "rewards/rejected": -2.785259246826172, "step": 10594 }, { "epoch": 0.5615773990936316, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61447872.0, "logits/rejected": -8193819.333333333, "logps/chosen": -255.014990234375, "logps/rejected": -150.04777018229166, "loss": 0.3533, "rewards/chosen": -0.13518457412719725, "rewards/margins": 3.1546917279561364, "rewards/rejected": -3.2898763020833335, "step": 10595 }, { "epoch": 0.5616304030954337, "grad_norm": 42.25, "kl": 8.864761352539062, "learning_rate": 5e-07, "logits/chosen": -20383286.4, "logits/rejected": -11055588.0, "logps/chosen": -520.347265625, "logps/rejected": -340.1364339192708, "loss": 0.1694, "rewards/chosen": 2.231705093383789, "rewards/margins": 5.906225903828939, "rewards/rejected": -3.67452081044515, "step": 10596 }, { "epoch": 0.5616834070972359, "grad_norm": 56.25, "kl": 4.54254150390625, "learning_rate": 5e-07, "logits/chosen": -14329425.333333334, "logits/rejected": -11921148.0, "logps/chosen": -162.37165323893228, "logps/rejected": -163.04190063476562, "loss": 0.3615, "rewards/chosen": 0.8533804416656494, "rewards/margins": 2.34364914894104, "rewards/rejected": -1.4902687072753906, "step": 10597 }, { "epoch": 0.5617364110990379, "grad_norm": 60.5, "kl": 0.7581310272216797, "learning_rate": 5e-07, "logits/chosen": -44061308.8, "logits/rejected": 8413595.333333334, "logps/chosen": -297.1912109375, "logps/rejected": -447.7360026041667, "loss": 0.3528, "rewards/chosen": 0.2605471134185791, "rewards/margins": 1.8734459400177002, "rewards/rejected": -1.612898826599121, "step": 10598 }, { "epoch": 0.5617894151008401, "grad_norm": 32.25, "kl": 2.0731372833251953, "learning_rate": 5e-07, "logits/chosen": -15937040.0, "logits/rejected": -41572697.6, "logps/chosen": -229.68900553385416, "logps/rejected": -558.5748046875, "loss": 0.1644, "rewards/chosen": 0.9672460556030273, "rewards/margins": 4.139152336120605, "rewards/rejected": -3.171906280517578, "step": 10599 }, { "epoch": 0.5618424191026422, "grad_norm": 46.75, "kl": 1.033585548400879, "learning_rate": 5e-07, "logits/chosen": -47830048.0, "logits/rejected": -24683678.4, "logps/chosen": -245.89444986979166, "logps/rejected": -187.8797119140625, "loss": 0.2125, "rewards/chosen": 0.8322494029998779, "rewards/margins": 2.9264087200164797, "rewards/rejected": -2.0941593170166017, "step": 10600 }, { "epoch": 0.5618954231044444, "grad_norm": 41.75, "kl": 2.4208145141601562, "learning_rate": 5e-07, "logits/chosen": -46764904.0, "logits/rejected": -41170572.0, "logps/chosen": -358.9853515625, "logps/rejected": -673.0328979492188, "loss": 0.3312, "rewards/chosen": 0.40916702151298523, "rewards/margins": 3.7656480967998505, "rewards/rejected": -3.3564810752868652, "step": 10601 }, { "epoch": 0.5619484271062465, "grad_norm": 47.75, "kl": 4.043172836303711, "learning_rate": 5e-07, "logits/chosen": -25980706.666666668, "logits/rejected": -19471044.0, "logps/chosen": -214.18489583333334, "logps/rejected": -395.52130126953125, "loss": 0.4112, "rewards/chosen": 0.5597021579742432, "rewards/margins": 2.035367488861084, "rewards/rejected": -1.4756653308868408, "step": 10602 }, { "epoch": 0.5620014311080487, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19582382.4, "logits/rejected": -1141844.0, "logps/chosen": -282.731689453125, "logps/rejected": -173.81498209635416, "loss": 0.4109, "rewards/chosen": 0.15088956356048583, "rewards/margins": 1.5485278844833374, "rewards/rejected": -1.3976383209228516, "step": 10603 }, { "epoch": 0.5620544351098508, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7935673.0, "logits/rejected": -35113556.571428575, "logps/chosen": -98.17186737060547, "logps/rejected": -297.42714146205356, "loss": 0.1745, "rewards/chosen": -0.37349167466163635, "rewards/margins": 1.7549927788121358, "rewards/rejected": -2.128484453473772, "step": 10604 }, { "epoch": 0.562107439111653, "grad_norm": 47.75, "kl": 0.5637168884277344, "learning_rate": 5e-07, "logits/chosen": -44132794.666666664, "logits/rejected": -61294388.0, "logps/chosen": -298.6858317057292, "logps/rejected": -574.0079956054688, "loss": 0.3018, "rewards/chosen": 0.541917641957601, "rewards/margins": 3.99811585744222, "rewards/rejected": -3.456198215484619, "step": 10605 }, { "epoch": 0.562160443113455, "grad_norm": 27.75, "kl": 0.7256011962890625, "learning_rate": 5e-07, "logits/chosen": 5011613.5, "logits/rejected": -38002890.666666664, "logps/chosen": -15.542255401611328, "logps/rejected": -259.8111165364583, "loss": 0.1486, "rewards/chosen": 1.0090441703796387, "rewards/margins": 3.5873878796895347, "rewards/rejected": -2.578343709309896, "step": 10606 }, { "epoch": 0.5622134471152572, "grad_norm": 42.5, "kl": 1.0205039978027344, "learning_rate": 5e-07, "logits/chosen": -32362889.6, "logits/rejected": -18901138.666666668, "logps/chosen": -347.11044921875, "logps/rejected": -243.2031453450521, "loss": 0.2702, "rewards/chosen": 0.9773358345031739, "rewards/margins": 3.129896386464437, "rewards/rejected": -2.152560551961263, "step": 10607 }, { "epoch": 0.5622664511170593, "grad_norm": 66.0, "kl": 4.545102119445801, "learning_rate": 5e-07, "logits/chosen": -26396305.6, "logits/rejected": -9240737.333333334, "logps/chosen": -596.4876953125, "logps/rejected": -373.3236490885417, "loss": 0.3699, "rewards/chosen": 0.9464821815490723, "rewards/margins": 2.075141191482544, "rewards/rejected": -1.1286590099334717, "step": 10608 }, { "epoch": 0.5623194551188615, "grad_norm": 59.5, "kl": 0.36826324462890625, "learning_rate": 5e-07, "logits/chosen": -71019434.66666667, "logits/rejected": 7683929.6, "logps/chosen": -376.1083170572917, "logps/rejected": -239.86845703125, "loss": 0.2619, "rewards/chosen": 0.22316590944925943, "rewards/margins": 2.2419588247934974, "rewards/rejected": -2.018792915344238, "step": 10609 }, { "epoch": 0.5623724591206636, "grad_norm": 68.5, "kl": 0.4961223602294922, "learning_rate": 5e-07, "logits/chosen": -27846556.0, "logits/rejected": 122612432.0, "logps/chosen": -346.25018310546875, "logps/rejected": -189.32012939453125, "loss": 0.3566, "rewards/chosen": 0.49023115634918213, "rewards/margins": 1.3925939202308655, "rewards/rejected": -0.9023627638816833, "step": 10610 }, { "epoch": 0.5624254631224658, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37391424.0, "logits/rejected": -18853218.666666668, "logps/chosen": -113.37141418457031, "logps/rejected": -310.3890787760417, "loss": 0.2031, "rewards/chosen": 0.2088308334350586, "rewards/margins": 2.3018528620402017, "rewards/rejected": -2.093022028605143, "step": 10611 }, { "epoch": 0.5624784671242679, "grad_norm": 50.25, "kl": 0.09459304809570312, "learning_rate": 5e-07, "logits/chosen": -8326800.0, "logits/rejected": -11163299.0, "logps/chosen": -268.55306570870533, "logps/rejected": -166.97348022460938, "loss": 0.3423, "rewards/chosen": 0.6141700744628906, "rewards/margins": 2.4814246892929077, "rewards/rejected": -1.867254614830017, "step": 10612 }, { "epoch": 0.56253147112607, "grad_norm": 30.125, "kl": 2.8656253814697266, "learning_rate": 5e-07, "logits/chosen": 31640035.2, "logits/rejected": -22231957.333333332, "logps/chosen": -58.74586791992188, "logps/rejected": -352.1855061848958, "loss": 0.377, "rewards/chosen": 0.5041949272155761, "rewards/margins": 2.649005921681722, "rewards/rejected": -2.144810994466146, "step": 10613 }, { "epoch": 0.5625844751278721, "grad_norm": 64.0, "kl": 1.3438749313354492, "learning_rate": 5e-07, "logits/chosen": -13784333.333333334, "logits/rejected": -29156434.0, "logps/chosen": -297.7503662109375, "logps/rejected": -615.456787109375, "loss": 0.3061, "rewards/chosen": 0.6572239796320597, "rewards/margins": 2.8880498806635537, "rewards/rejected": -2.230825901031494, "step": 10614 }, { "epoch": 0.5626374791296743, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5381283.5, "logits/rejected": -34420632.0, "logps/chosen": -29.473756790161133, "logps/rejected": -409.3505859375, "loss": 0.18, "rewards/chosen": -0.034424491226673126, "rewards/margins": 3.6034462973475456, "rewards/rejected": -3.6378707885742188, "step": 10615 }, { "epoch": 0.5626904831314764, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9220680.0, "logits/rejected": -48915610.666666664, "logps/chosen": -276.36798095703125, "logps/rejected": -324.29180908203125, "loss": 0.235, "rewards/chosen": 0.26466214656829834, "rewards/margins": 2.1076724131902056, "rewards/rejected": -1.8430102666219075, "step": 10616 }, { "epoch": 0.5627434871332786, "grad_norm": 29.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3785667.25, "logits/rejected": -34238122.666666664, "logps/chosen": -243.23036193847656, "logps/rejected": -420.260498046875, "loss": 0.1371, "rewards/chosen": 0.8711792230606079, "rewards/margins": 3.749083479245504, "rewards/rejected": -2.877904256184896, "step": 10617 }, { "epoch": 0.5627964911350807, "grad_norm": 52.75, "kl": 3.2154312133789062, "learning_rate": 5e-07, "logits/chosen": -4950273.666666667, "logits/rejected": -49494320.0, "logps/chosen": -313.6415201822917, "logps/rejected": -386.82855224609375, "loss": 0.2219, "rewards/chosen": 1.278960386912028, "rewards/margins": 3.5890175501505537, "rewards/rejected": -2.3100571632385254, "step": 10618 }, { "epoch": 0.5628494951368829, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42440941.333333336, "logits/rejected": -20698515.2, "logps/chosen": -770.089111328125, "logps/rejected": -319.7419189453125, "loss": 0.2067, "rewards/chosen": 1.6777598063151042, "rewards/margins": 3.139535395304362, "rewards/rejected": -1.4617755889892579, "step": 10619 }, { "epoch": 0.562902499138685, "grad_norm": 25.5, "kl": 0.1246337890625, "learning_rate": 5e-07, "logits/chosen": 13072508.0, "logits/rejected": -34736842.666666664, "logps/chosen": -67.18160247802734, "logps/rejected": -350.9836018880208, "loss": 0.1449, "rewards/chosen": 0.26130563020706177, "rewards/margins": 3.871622860431671, "rewards/rejected": -3.6103172302246094, "step": 10620 }, { "epoch": 0.5629555031404871, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 43372232.0, "logits/rejected": -31857680.0, "logps/chosen": -198.87801106770834, "logps/rejected": -490.987890625, "loss": 0.2141, "rewards/chosen": 0.34012651443481445, "rewards/margins": 3.5293461799621584, "rewards/rejected": -3.189219665527344, "step": 10621 }, { "epoch": 0.5630085071422892, "grad_norm": 58.0, "kl": 0.7880916595458984, "learning_rate": 5e-07, "logits/chosen": -56146762.666666664, "logits/rejected": -16178014.0, "logps/chosen": -511.5770670572917, "logps/rejected": -128.8339080810547, "loss": 0.374, "rewards/chosen": 0.42921586831410724, "rewards/margins": 2.7186617453893027, "rewards/rejected": -2.2894458770751953, "step": 10622 }, { "epoch": 0.5630615111440914, "grad_norm": 48.25, "kl": 3.112039566040039, "learning_rate": 5e-07, "logits/chosen": -4332873.333333333, "logits/rejected": -2922091.0, "logps/chosen": -535.206787109375, "logps/rejected": -233.3074951171875, "loss": 0.2971, "rewards/chosen": 0.8028670152028402, "rewards/margins": 2.9838789304097495, "rewards/rejected": -2.181011915206909, "step": 10623 }, { "epoch": 0.5631145151458935, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20651858.666666668, "logits/rejected": -26534616.0, "logps/chosen": -344.1232503255208, "logps/rejected": -145.36497802734374, "loss": 0.2698, "rewards/chosen": 0.3597035805384318, "rewards/margins": 2.006058637301127, "rewards/rejected": -1.6463550567626952, "step": 10624 }, { "epoch": 0.5631675191476957, "grad_norm": 26.5, "kl": 5.228555679321289, "learning_rate": 5e-07, "logits/chosen": -13805313.6, "logits/rejected": -127938805.33333333, "logps/chosen": -71.76552734375, "logps/rejected": -436.4171549479167, "loss": 0.3559, "rewards/chosen": 0.782445764541626, "rewards/margins": 3.582851266860962, "rewards/rejected": -2.800405502319336, "step": 10625 }, { "epoch": 0.5632205231494978, "grad_norm": 42.75, "kl": 0.27909183502197266, "learning_rate": 5e-07, "logits/chosen": -17574562.0, "logits/rejected": -23377192.0, "logps/chosen": -161.40731811523438, "logps/rejected": -286.3841552734375, "loss": 0.2181, "rewards/chosen": 0.709675669670105, "rewards/margins": 3.121862769126892, "rewards/rejected": -2.412187099456787, "step": 10626 }, { "epoch": 0.5632735271513, "grad_norm": 47.75, "kl": 0.32775115966796875, "learning_rate": 5e-07, "logits/chosen": -31805036.8, "logits/rejected": -27099946.666666668, "logps/chosen": -328.860400390625, "logps/rejected": -431.4092203776042, "loss": 0.2518, "rewards/chosen": 0.661109209060669, "rewards/margins": 4.2910161813100185, "rewards/rejected": -3.629906972249349, "step": 10627 }, { "epoch": 0.563326531153102, "grad_norm": 54.75, "kl": 1.666330337524414, "learning_rate": 5e-07, "logits/chosen": -17570468.0, "logits/rejected": -16965588.0, "logps/chosen": -241.97572326660156, "logps/rejected": -358.1632080078125, "loss": 0.2786, "rewards/chosen": 0.4295860826969147, "rewards/margins": 2.9802633821964264, "rewards/rejected": -2.5506772994995117, "step": 10628 }, { "epoch": 0.5633795351549042, "grad_norm": 43.25, "kl": 3.5561904907226562, "learning_rate": 5e-07, "logits/chosen": -58458598.4, "logits/rejected": -5476960.333333333, "logps/chosen": -332.5062255859375, "logps/rejected": -213.30291748046875, "loss": 0.2669, "rewards/chosen": 1.1530345916748046, "rewards/margins": 3.578232987721761, "rewards/rejected": -2.4251983960469565, "step": 10629 }, { "epoch": 0.5634325391567063, "grad_norm": 43.75, "kl": 1.372262954711914, "learning_rate": 5e-07, "logits/chosen": -26738030.4, "logits/rejected": -19606152.0, "logps/chosen": -207.999072265625, "logps/rejected": -279.088134765625, "loss": 0.3918, "rewards/chosen": 0.1282922625541687, "rewards/margins": 2.3655727903048196, "rewards/rejected": -2.237280527750651, "step": 10630 }, { "epoch": 0.5634855431585085, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25182854.4, "logits/rejected": -28684792.0, "logps/chosen": -340.0264892578125, "logps/rejected": -275.7073974609375, "loss": 0.2511, "rewards/chosen": 0.9043127059936523, "rewards/margins": 2.827307955423991, "rewards/rejected": -1.9229952494303386, "step": 10631 }, { "epoch": 0.5635385471603106, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8525145.333333334, "logits/rejected": -5145978.4, "logps/chosen": -318.9775797526042, "logps/rejected": -216.53603515625, "loss": 0.3227, "rewards/chosen": -0.14603952566782633, "rewards/margins": 1.7062460501988728, "rewards/rejected": -1.8522855758666992, "step": 10632 }, { "epoch": 0.5635915511621128, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17461182.666666668, "logits/rejected": -9692947.2, "logps/chosen": -303.34326171875, "logps/rejected": -286.5781494140625, "loss": 0.2746, "rewards/chosen": 0.07533824443817139, "rewards/margins": 2.454078936576843, "rewards/rejected": -2.378740692138672, "step": 10633 }, { "epoch": 0.5636445551639149, "grad_norm": 46.0, "kl": 0.40326881408691406, "learning_rate": 5e-07, "logits/chosen": -49130896.0, "logits/rejected": -47737973.333333336, "logps/chosen": -242.0471923828125, "logps/rejected": -272.5327962239583, "loss": 0.3194, "rewards/chosen": 0.1842833638191223, "rewards/margins": 3.059776731332143, "rewards/rejected": -2.875493367513021, "step": 10634 }, { "epoch": 0.5636975591657171, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13018714.666666666, "logits/rejected": -36537032.0, "logps/chosen": -366.8171793619792, "logps/rejected": -409.80975341796875, "loss": 0.2995, "rewards/chosen": 0.649100144704183, "rewards/margins": 2.5084508260091147, "rewards/rejected": -1.8593506813049316, "step": 10635 }, { "epoch": 0.5637505631675191, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12437238.0, "logits/rejected": -9655466.0, "logps/chosen": -281.1176452636719, "logps/rejected": -221.18717447916666, "loss": 0.2312, "rewards/chosen": 0.640484631061554, "rewards/margins": 2.59384948015213, "rewards/rejected": -1.9533648490905762, "step": 10636 }, { "epoch": 0.5638035671693212, "grad_norm": 31.0, "kl": 1.5921669006347656, "learning_rate": 5e-07, "logits/chosen": 1932322.5, "logits/rejected": -4356800.333333333, "logps/chosen": -103.19712829589844, "logps/rejected": -61.47693379720052, "loss": 0.3315, "rewards/chosen": -0.16103707253932953, "rewards/margins": 0.9893336842457454, "rewards/rejected": -1.150370756785075, "step": 10637 }, { "epoch": 0.5638565711711234, "grad_norm": 39.5, "kl": 0.442230224609375, "learning_rate": 5e-07, "logits/chosen": -17528232.0, "logits/rejected": -13888906.0, "logps/chosen": -741.8263549804688, "logps/rejected": -176.6610107421875, "loss": 0.154, "rewards/chosen": 1.8683292865753174, "rewards/margins": 4.292210340499878, "rewards/rejected": -2.4238810539245605, "step": 10638 }, { "epoch": 0.5639095751729255, "grad_norm": 42.5, "kl": 1.7598190307617188, "learning_rate": 5e-07, "logits/chosen": -33063074.666666668, "logits/rejected": -12961203.2, "logps/chosen": -221.72196451822916, "logps/rejected": -185.5708984375, "loss": 0.2452, "rewards/chosen": 0.8380883534749349, "rewards/margins": 2.905753262837728, "rewards/rejected": -2.067664909362793, "step": 10639 }, { "epoch": 0.5639625791747277, "grad_norm": 60.25, "kl": 6.311725616455078, "learning_rate": 5e-07, "logits/chosen": -34846272.0, "logits/rejected": -33759088.0, "logps/chosen": -763.0837890625, "logps/rejected": -146.9465128580729, "loss": 0.2654, "rewards/chosen": 1.7273977279663086, "rewards/margins": 2.4255547205607098, "rewards/rejected": -0.698156992594401, "step": 10640 }, { "epoch": 0.5640155831765298, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7570854.0, "logits/rejected": -69387624.0, "logps/chosen": -245.43533325195312, "logps/rejected": -556.9920043945312, "loss": 0.2326, "rewards/chosen": 0.6947300434112549, "rewards/margins": 3.6432642936706543, "rewards/rejected": -2.9485342502593994, "step": 10641 }, { "epoch": 0.564068587178332, "grad_norm": 68.5, "kl": 1.1598358154296875, "learning_rate": 5e-07, "logits/chosen": -11161910.0, "logits/rejected": -69191072.0, "logps/chosen": -397.5914611816406, "logps/rejected": -427.0666198730469, "loss": 0.3164, "rewards/chosen": 0.24648666381835938, "rewards/margins": 2.3266289234161377, "rewards/rejected": -2.0801422595977783, "step": 10642 }, { "epoch": 0.564121591180134, "grad_norm": 56.25, "kl": 0.9047470092773438, "learning_rate": 5e-07, "logits/chosen": 34211237.333333336, "logits/rejected": -25103332.0, "logps/chosen": -134.11180623372397, "logps/rejected": -360.8717346191406, "loss": 0.3855, "rewards/chosen": 0.5092122157414755, "rewards/margins": 1.4375676115353904, "rewards/rejected": -0.9283553957939148, "step": 10643 }, { "epoch": 0.5641745951819362, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24301834.666666668, "logits/rejected": -86898707.2, "logps/chosen": -488.3246256510417, "logps/rejected": -242.377392578125, "loss": 0.2937, "rewards/chosen": -0.10111795862515767, "rewards/margins": 1.7380092004934948, "rewards/rejected": -1.8391271591186524, "step": 10644 }, { "epoch": 0.5642275991837383, "grad_norm": 51.5, "kl": 4.01799201965332, "learning_rate": 5e-07, "logits/chosen": -48648504.0, "logits/rejected": -37377860.0, "logps/chosen": -494.2084655761719, "logps/rejected": -323.94866943359375, "loss": 0.2467, "rewards/chosen": 0.8507720828056335, "rewards/margins": 3.3760239481925964, "rewards/rejected": -2.525251865386963, "step": 10645 }, { "epoch": 0.5642806031855405, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29422576.0, "logits/rejected": -18429613.333333332, "logps/chosen": -256.0323486328125, "logps/rejected": -379.4536946614583, "loss": 0.317, "rewards/chosen": 0.2924065113067627, "rewards/margins": 2.3714368661244714, "rewards/rejected": -2.0790303548177085, "step": 10646 }, { "epoch": 0.5643336071873426, "grad_norm": 39.75, "kl": 0.8233509063720703, "learning_rate": 5e-07, "logits/chosen": 2628431.0, "logits/rejected": -18340416.0, "logps/chosen": -114.56413269042969, "logps/rejected": -281.65228271484375, "loss": 0.2795, "rewards/chosen": 0.27159708738327026, "rewards/margins": 2.662953555583954, "rewards/rejected": -2.3913564682006836, "step": 10647 }, { "epoch": 0.5643866111891448, "grad_norm": 45.25, "kl": 3.7636966705322266, "learning_rate": 5e-07, "logits/chosen": -3281890.5, "logits/rejected": -14779291.0, "logps/chosen": -191.0419464111328, "logps/rejected": -315.33819580078125, "loss": 0.2885, "rewards/chosen": 0.49661046266555786, "rewards/margins": 2.755014955997467, "rewards/rejected": -2.258404493331909, "step": 10648 }, { "epoch": 0.5644396151909469, "grad_norm": 63.0, "kl": 1.7788925170898438, "learning_rate": 5e-07, "logits/chosen": -37165833.6, "logits/rejected": -30154781.333333332, "logps/chosen": -753.401171875, "logps/rejected": -304.4788818359375, "loss": 0.3105, "rewards/chosen": 0.8255529403686523, "rewards/margins": 3.138530413309733, "rewards/rejected": -2.3129774729410806, "step": 10649 }, { "epoch": 0.5644926191927491, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17070946.0, "logits/rejected": -22900226.666666668, "logps/chosen": -211.20989990234375, "logps/rejected": -407.3285319010417, "loss": 0.1922, "rewards/chosen": 0.5076446533203125, "rewards/margins": 2.8135124842325845, "rewards/rejected": -2.305867830912272, "step": 10650 }, { "epoch": 0.5645456231945511, "grad_norm": 45.25, "kl": 0.58453369140625, "learning_rate": 5e-07, "logits/chosen": -25586066.0, "logits/rejected": -13368836.0, "logps/chosen": -483.7039794921875, "logps/rejected": -273.3642578125, "loss": 0.2311, "rewards/chosen": 0.8657395839691162, "rewards/margins": 3.5323078632354736, "rewards/rejected": -2.6665682792663574, "step": 10651 }, { "epoch": 0.5645986271963533, "grad_norm": 58.25, "kl": 1.255904197692871, "learning_rate": 5e-07, "logits/chosen": -58280998.4, "logits/rejected": -14911473.333333334, "logps/chosen": -399.5615966796875, "logps/rejected": -141.04717000325522, "loss": 0.3206, "rewards/chosen": 0.8409564971923829, "rewards/margins": 1.425606695810954, "rewards/rejected": -0.5846501986185709, "step": 10652 }, { "epoch": 0.5646516311981554, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5521411.5, "logits/rejected": -23550666.666666668, "logps/chosen": -571.830810546875, "logps/rejected": -232.5883585611979, "loss": 0.2565, "rewards/chosen": 1.3792802095413208, "rewards/margins": 2.6859510342280073, "rewards/rejected": -1.3066708246866863, "step": 10653 }, { "epoch": 0.5647046351999576, "grad_norm": 77.0, "kl": 2.506317138671875, "learning_rate": 5e-07, "logits/chosen": 16530557.0, "logps/chosen": -309.59613037109375, "loss": 0.4083, "rewards/chosen": 0.6939588785171509, "step": 10654 }, { "epoch": 0.5647576392017597, "grad_norm": 75.5, "kl": 0.41057586669921875, "learning_rate": 5e-07, "logits/chosen": -65763642.666666664, "logits/rejected": -16451958.4, "logps/chosen": -592.1739908854166, "logps/rejected": -353.0880859375, "loss": 0.2742, "rewards/chosen": 0.9081563949584961, "rewards/margins": 2.5428691864013673, "rewards/rejected": -1.6347127914428712, "step": 10655 }, { "epoch": 0.5648106432035619, "grad_norm": 55.25, "kl": 1.55718994140625, "learning_rate": 5e-07, "logits/chosen": -29386005.333333332, "logits/rejected": 6399811.2, "logps/chosen": -518.1682942708334, "logps/rejected": -194.4309326171875, "loss": 0.1816, "rewards/chosen": 1.0920227368672688, "rewards/margins": 2.897574456532796, "rewards/rejected": -1.8055517196655273, "step": 10656 }, { "epoch": 0.564863647205364, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18797026.0, "logits/rejected": -27792677.333333332, "logps/chosen": -144.4805145263672, "logps/rejected": -337.37636311848956, "loss": 0.2175, "rewards/chosen": -0.012843996286392212, "rewards/margins": 2.3752146661281586, "rewards/rejected": -2.388058662414551, "step": 10657 }, { "epoch": 0.5649166512071662, "grad_norm": 68.0, "kl": 0.3643302917480469, "learning_rate": 5e-07, "logits/chosen": 131413976.0, "logits/rejected": -18939820.0, "logps/chosen": -297.4226989746094, "logps/rejected": -95.24828338623047, "loss": 0.2983, "rewards/chosen": 0.32379022240638733, "rewards/margins": 2.0782555639743805, "rewards/rejected": -1.7544653415679932, "step": 10658 }, { "epoch": 0.5649696552089682, "grad_norm": 47.5, "kl": 0.6436004638671875, "learning_rate": 5e-07, "logits/chosen": -23254672.0, "logits/rejected": -21453484.8, "logps/chosen": -371.3808186848958, "logps/rejected": -383.706494140625, "loss": 0.2502, "rewards/chosen": 0.04480844736099243, "rewards/margins": 2.399696981906891, "rewards/rejected": -2.3548885345458985, "step": 10659 }, { "epoch": 0.5650226592107704, "grad_norm": 80.5, "kl": 2.5234031677246094, "learning_rate": 5e-07, "logits/chosen": 21675957.333333332, "logits/rejected": -27672920.0, "logps/chosen": -690.7897135416666, "logps/rejected": -385.33001708984375, "loss": 0.3868, "rewards/chosen": 0.7039709091186523, "rewards/margins": 2.778470516204834, "rewards/rejected": -2.0744996070861816, "step": 10660 }, { "epoch": 0.5650756632125725, "grad_norm": 44.0, "kl": 1.4527053833007812, "learning_rate": 5e-07, "logits/chosen": -25906278.4, "logits/rejected": -42886960.0, "logps/chosen": -305.1522705078125, "logps/rejected": -433.4359944661458, "loss": 0.3202, "rewards/chosen": 0.44356460571289064, "rewards/margins": 3.423049799601237, "rewards/rejected": -2.979485193888346, "step": 10661 }, { "epoch": 0.5651286672143747, "grad_norm": 51.5, "kl": 0.37766265869140625, "learning_rate": 5e-07, "logits/chosen": -59880112.0, "logits/rejected": -14846555.0, "logps/chosen": -340.2229309082031, "logps/rejected": -197.25149536132812, "loss": 0.2798, "rewards/chosen": 0.7076210379600525, "rewards/margins": 2.305048644542694, "rewards/rejected": -1.5974276065826416, "step": 10662 }, { "epoch": 0.5651816712161768, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44929908.0, "logits/rejected": -25895018.666666668, "logps/chosen": -441.5223388671875, "logps/rejected": -343.1250813802083, "loss": 0.1989, "rewards/chosen": 0.488638311624527, "rewards/margins": 2.869251320759455, "rewards/rejected": -2.3806130091349282, "step": 10663 }, { "epoch": 0.565234675217979, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5985676.666666667, "logits/rejected": -4065492.0, "logps/chosen": -294.611328125, "logps/rejected": -223.087158203125, "loss": 0.2691, "rewards/chosen": 0.3733050028483073, "rewards/margins": 1.9705910364786785, "rewards/rejected": -1.5972860336303711, "step": 10664 }, { "epoch": 0.5652876792197811, "grad_norm": 59.75, "kl": 0.9749393463134766, "learning_rate": 5e-07, "logits/chosen": 125159520.0, "logits/rejected": -42413188.0, "logps/chosen": -385.42138671875, "logps/rejected": -586.2713623046875, "loss": 0.3284, "rewards/chosen": 0.1844337433576584, "rewards/margins": 2.1806245297193527, "rewards/rejected": -1.9961907863616943, "step": 10665 }, { "epoch": 0.5653406832215833, "grad_norm": 50.25, "kl": 2.242307662963867, "learning_rate": 5e-07, "logits/chosen": -30308090.666666668, "logits/rejected": -11337566.0, "logps/chosen": -341.1841634114583, "logps/rejected": -193.01040649414062, "loss": 0.331, "rewards/chosen": 0.6067474683125814, "rewards/margins": 3.171662171681722, "rewards/rejected": -2.5649147033691406, "step": 10666 }, { "epoch": 0.5653936872233853, "grad_norm": 73.0, "kl": 0.7919044494628906, "learning_rate": 5e-07, "logits/chosen": -27787813.333333332, "logits/rejected": -604975.6875, "logps/chosen": -319.58689371744794, "logps/rejected": -225.5498046875, "loss": 0.3582, "rewards/chosen": 0.403048833211263, "rewards/margins": 2.1199143727620444, "rewards/rejected": -1.7168655395507812, "step": 10667 }, { "epoch": 0.5654466912251875, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4069535.0, "logits/rejected": -26372923.42857143, "logps/chosen": -27.479446411132812, "logps/rejected": -292.3888462611607, "loss": 0.2119, "rewards/chosen": 0.4669555723667145, "rewards/margins": 2.2680400907993317, "rewards/rejected": -1.8010845184326172, "step": 10668 }, { "epoch": 0.5654996952269896, "grad_norm": 47.25, "kl": 2.1512985229492188, "learning_rate": 5e-07, "logits/chosen": -23612634.666666668, "logits/rejected": -47065708.0, "logps/chosen": -449.246826171875, "logps/rejected": -451.7816162109375, "loss": 0.2284, "rewards/chosen": 1.2556740442911785, "rewards/margins": 3.407607237497966, "rewards/rejected": -2.151933193206787, "step": 10669 }, { "epoch": 0.5655526992287918, "grad_norm": 45.0, "kl": 4.6009979248046875, "learning_rate": 5e-07, "logits/chosen": -25717648.0, "logits/rejected": -19442585.333333332, "logps/chosen": -286.521923828125, "logps/rejected": -299.6111653645833, "loss": 0.35, "rewards/chosen": 0.0896488904953003, "rewards/margins": 1.854316242535909, "rewards/rejected": -1.7646673520406086, "step": 10670 }, { "epoch": 0.5656057032305939, "grad_norm": 38.75, "kl": 1.6145744323730469, "learning_rate": 5e-07, "logits/chosen": -48963830.4, "logits/rejected": -12130020.0, "logps/chosen": -510.13515625, "logps/rejected": -160.76933797200522, "loss": 0.2628, "rewards/chosen": 1.137942123413086, "rewards/margins": 3.555577405293783, "rewards/rejected": -2.4176352818806968, "step": 10671 }, { "epoch": 0.5656587072323961, "grad_norm": 41.0, "kl": 0.17539596557617188, "learning_rate": 5e-07, "logits/chosen": -12198037.333333334, "logits/rejected": -17944012.8, "logps/chosen": -191.09566243489584, "logps/rejected": -322.7135986328125, "loss": 0.2784, "rewards/chosen": -0.23131662607192993, "rewards/margins": 2.3407822966575624, "rewards/rejected": -2.5720989227294924, "step": 10672 }, { "epoch": 0.5657117112341982, "grad_norm": 51.75, "kl": 0.6259613037109375, "learning_rate": 5e-07, "logits/chosen": -16409194.666666666, "logits/rejected": -20550920.0, "logps/chosen": -487.2918701171875, "logps/rejected": -384.0699462890625, "loss": 0.2597, "rewards/chosen": 0.9858142534891764, "rewards/margins": 4.888671080271403, "rewards/rejected": -3.9028568267822266, "step": 10673 }, { "epoch": 0.5657647152360004, "grad_norm": 48.25, "kl": 0.5626678466796875, "learning_rate": 5e-07, "logits/chosen": -50788688.0, "logits/rejected": -44206400.0, "logps/chosen": -306.31826171875, "logps/rejected": -188.00618489583334, "loss": 0.2637, "rewards/chosen": 0.8039815902709961, "rewards/margins": 2.393575159708659, "rewards/rejected": -1.5895935694376628, "step": 10674 }, { "epoch": 0.5658177192378024, "grad_norm": 52.75, "kl": 0.5935850143432617, "learning_rate": 5e-07, "logits/chosen": -4725119.0, "logits/rejected": -7692884.5, "logps/chosen": -712.244384765625, "logps/rejected": -192.76109313964844, "loss": 0.2989, "rewards/chosen": 0.35728949308395386, "rewards/margins": 2.06697815656662, "rewards/rejected": -1.709688663482666, "step": 10675 }, { "epoch": 0.5658707232396046, "grad_norm": 65.0, "kl": 0.9564971923828125, "learning_rate": 5e-07, "logits/chosen": -51851619.2, "logits/rejected": 76835253.33333333, "logps/chosen": -314.487255859375, "logps/rejected": -433.06103515625, "loss": 0.3068, "rewards/chosen": 0.5372321128845214, "rewards/margins": 2.8742164293924963, "rewards/rejected": -2.336984316507975, "step": 10676 }, { "epoch": 0.5659237272414067, "grad_norm": 55.5, "kl": 1.0521297454833984, "learning_rate": 5e-07, "logits/chosen": -99120633.6, "logits/rejected": -19771744.0, "logps/chosen": -324.524267578125, "logps/rejected": -375.1373697916667, "loss": 0.2579, "rewards/chosen": 0.8469844818115234, "rewards/margins": 3.0009087880452476, "rewards/rejected": -2.153924306233724, "step": 10677 }, { "epoch": 0.5659767312432089, "grad_norm": 54.75, "kl": 3.891537666320801, "learning_rate": 5e-07, "logits/chosen": -47135024.0, "logits/rejected": -22878938.666666668, "logps/chosen": -376.688427734375, "logps/rejected": -465.153564453125, "loss": 0.2822, "rewards/chosen": 0.9705570220947266, "rewards/margins": 3.3503247578938806, "rewards/rejected": -2.379767735799154, "step": 10678 }, { "epoch": 0.566029735245011, "grad_norm": 77.5, "kl": 3.4661598205566406, "learning_rate": 5e-07, "logits/chosen": -59047730.28571428, "logits/rejected": -47750220.0, "logps/chosen": -501.57425362723217, "logps/rejected": -67.82589721679688, "loss": 0.2869, "rewards/chosen": 1.2245024272373743, "rewards/margins": 1.4669424572161265, "rewards/rejected": -0.24244002997875214, "step": 10679 }, { "epoch": 0.5660827392468132, "grad_norm": 58.75, "kl": 1.0978012084960938, "learning_rate": 5e-07, "logits/chosen": 11993956.0, "logits/rejected": -25689072.0, "logps/chosen": -233.004150390625, "logps/rejected": -314.99973551432294, "loss": 0.4334, "rewards/chosen": -0.16683868169784546, "rewards/margins": 1.4687060952186584, "rewards/rejected": -1.635544776916504, "step": 10680 }, { "epoch": 0.5661357432486153, "grad_norm": 62.75, "kl": 0.36881256103515625, "learning_rate": 5e-07, "logits/chosen": -48092336.0, "logits/rejected": -9613695.0, "logps/chosen": -248.83116149902344, "logps/rejected": -230.73690795898438, "loss": 0.2204, "rewards/chosen": 0.7627156972885132, "rewards/margins": 3.8043562173843384, "rewards/rejected": -3.041640520095825, "step": 10681 }, { "epoch": 0.5661887472504175, "grad_norm": 71.0, "kl": 0.855316162109375, "learning_rate": 5e-07, "logits/chosen": -77911526.4, "logits/rejected": -38210101.333333336, "logps/chosen": -466.602880859375, "logps/rejected": -313.0749104817708, "loss": 0.3081, "rewards/chosen": 0.32730755805969236, "rewards/margins": 2.196206267674764, "rewards/rejected": -1.8688987096150715, "step": 10682 }, { "epoch": 0.5662417512522195, "grad_norm": 64.0, "kl": 5.477357864379883, "learning_rate": 5e-07, "logits/chosen": -10250648.0, "logits/rejected": -35073300.0, "logps/chosen": -321.7695719401042, "logps/rejected": -408.7422790527344, "loss": 0.2385, "rewards/chosen": 1.4835422833760579, "rewards/margins": 5.3789487679799395, "rewards/rejected": -3.895406484603882, "step": 10683 }, { "epoch": 0.5662947552540217, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1090402.0, "logits/rejected": -31608658.285714287, "logps/chosen": -49.30107879638672, "logps/rejected": -196.71576799665178, "loss": 0.2183, "rewards/chosen": -0.4476150572299957, "rewards/margins": 1.9240647937570299, "rewards/rejected": -2.3716798509870256, "step": 10684 }, { "epoch": 0.5663477592558238, "grad_norm": 37.75, "kl": 2.8532638549804688, "learning_rate": 5e-07, "logits/chosen": -41651576.0, "logits/rejected": -34608008.0, "logps/chosen": -167.17274475097656, "logps/rejected": -327.3767395019531, "loss": 0.4117, "rewards/chosen": -0.20773839950561523, "rewards/margins": 1.7643253803253174, "rewards/rejected": -1.9720637798309326, "step": 10685 }, { "epoch": 0.566400763257626, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48954640.0, "logits/rejected": -11618530.4, "logps/chosen": -341.502197265625, "logps/rejected": -153.532763671875, "loss": 0.2967, "rewards/chosen": 0.19104107220967612, "rewards/margins": 1.6785399278004964, "rewards/rejected": -1.4874988555908204, "step": 10686 }, { "epoch": 0.5664537672594281, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15788362.666666666, "logits/rejected": -8066985.6, "logps/chosen": -278.662109375, "logps/rejected": -287.2925048828125, "loss": 0.2403, "rewards/chosen": 0.589621901512146, "rewards/margins": 2.4200006246566774, "rewards/rejected": -1.8303787231445312, "step": 10687 }, { "epoch": 0.5665067712612302, "grad_norm": 45.75, "kl": 1.3141841888427734, "learning_rate": 5e-07, "logits/chosen": -13832306.0, "logits/rejected": -20810898.0, "logps/chosen": -211.8056182861328, "logps/rejected": -201.55545043945312, "loss": 0.298, "rewards/chosen": 0.6379919052124023, "rewards/margins": 2.2651243209838867, "rewards/rejected": -1.6271324157714844, "step": 10688 }, { "epoch": 0.5665597752630324, "grad_norm": 78.5, "kl": 3.0538101196289062, "learning_rate": 5e-07, "logits/chosen": -75025094.4, "logits/rejected": 3909287.3333333335, "logps/chosen": -688.06533203125, "logps/rejected": -147.4889119466146, "loss": 0.317, "rewards/chosen": 1.1937477111816406, "rewards/margins": 2.2188881556193034, "rewards/rejected": -1.0251404444376628, "step": 10689 }, { "epoch": 0.5666127792648344, "grad_norm": 38.5, "kl": 3.142240524291992, "learning_rate": 5e-07, "logits/chosen": -16093342.4, "logits/rejected": -13655245.333333334, "logps/chosen": -231.07265625, "logps/rejected": -192.3932902018229, "loss": 0.2821, "rewards/chosen": 1.0513654708862306, "rewards/margins": 3.1467494328816734, "rewards/rejected": -2.095383961995443, "step": 10690 }, { "epoch": 0.5666657832666366, "grad_norm": 47.25, "kl": 2.6532554626464844, "learning_rate": 5e-07, "logits/chosen": -2400894.5, "logits/rejected": -38600040.0, "logps/chosen": -577.6890869140625, "logps/rejected": -269.91748046875, "loss": 0.3131, "rewards/chosen": 1.4714343547821045, "rewards/margins": 2.737550735473633, "rewards/rejected": -1.2661163806915283, "step": 10691 }, { "epoch": 0.5667187872684387, "grad_norm": 37.75, "kl": 1.910226821899414, "learning_rate": 5e-07, "logits/chosen": -21095120.0, "logits/rejected": -5920872.0, "logps/chosen": -193.18844604492188, "logps/rejected": -187.60592651367188, "loss": 0.2193, "rewards/chosen": 0.9087762832641602, "rewards/margins": 3.185786724090576, "rewards/rejected": -2.277010440826416, "step": 10692 }, { "epoch": 0.5667717912702409, "grad_norm": 49.0, "kl": 0.18824005126953125, "learning_rate": 5e-07, "logits/chosen": -22071594.666666668, "logits/rejected": 20949163.2, "logps/chosen": -163.0985107421875, "logps/rejected": -300.880908203125, "loss": 0.2763, "rewards/chosen": 0.12709579865137735, "rewards/margins": 2.5568893472353618, "rewards/rejected": -2.4297935485839846, "step": 10693 }, { "epoch": 0.566824795272043, "grad_norm": 45.25, "kl": 0.770721435546875, "learning_rate": 5e-07, "logits/chosen": -70167248.0, "logits/rejected": -42142598.4, "logps/chosen": -223.4070027669271, "logps/rejected": -266.07822265625, "loss": 0.2514, "rewards/chosen": 0.3801875114440918, "rewards/margins": 2.1329440116882323, "rewards/rejected": -1.7527565002441405, "step": 10694 }, { "epoch": 0.5668777992738452, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54205480.0, "logits/rejected": -63896544.0, "logps/chosen": -503.99200439453125, "logps/rejected": -353.0128173828125, "loss": 0.1938, "rewards/chosen": 1.1266378164291382, "rewards/margins": 3.4489318132400513, "rewards/rejected": -2.322293996810913, "step": 10695 }, { "epoch": 0.5669308032756473, "grad_norm": 46.75, "kl": 2.4389591217041016, "learning_rate": 5e-07, "logits/chosen": -30923667.2, "logits/rejected": -46837488.0, "logps/chosen": -253.3076904296875, "logps/rejected": -344.231201171875, "loss": 0.3379, "rewards/chosen": 0.3381075620651245, "rewards/margins": 2.477965490023295, "rewards/rejected": -2.1398579279581704, "step": 10696 }, { "epoch": 0.5669838072774495, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5278034.0, "logits/rejected": -28666740.0, "logps/chosen": -274.713623046875, "logps/rejected": -325.2760009765625, "loss": 0.2302, "rewards/chosen": 0.3588164448738098, "rewards/margins": 3.601363956928253, "rewards/rejected": -3.2425475120544434, "step": 10697 }, { "epoch": 0.5670368112792515, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57603968.0, "logits/rejected": -17477899.2, "logps/chosen": -511.4473470052083, "logps/rejected": -488.50888671875, "loss": 0.2067, "rewards/chosen": 0.8753214677174886, "rewards/margins": 3.468554385503133, "rewards/rejected": -2.5932329177856444, "step": 10698 }, { "epoch": 0.5670898152810537, "grad_norm": 60.5, "kl": 0.41759204864501953, "learning_rate": 5e-07, "logits/chosen": -12942476.0, "logits/rejected": -40188332.8, "logps/chosen": -307.8425699869792, "logps/rejected": -613.77275390625, "loss": 0.1941, "rewards/chosen": 0.3508668343226115, "rewards/margins": 3.876415769259135, "rewards/rejected": -3.5255489349365234, "step": 10699 }, { "epoch": 0.5671428192828558, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10187498.666666666, "logits/rejected": -21142555.2, "logps/chosen": -468.4659016927083, "logps/rejected": -326.393798828125, "loss": 0.2914, "rewards/chosen": -0.11178792516390483, "rewards/margins": 2.201592125495275, "rewards/rejected": -2.3133800506591795, "step": 10700 }, { "epoch": 0.567195823284658, "grad_norm": 29.375, "kl": 1.875701904296875, "learning_rate": 5e-07, "logits/chosen": 453175.4, "logits/rejected": -38838680.0, "logps/chosen": -164.407080078125, "logps/rejected": -358.7996012369792, "loss": 0.3358, "rewards/chosen": 0.5092998504638672, "rewards/margins": 2.840840752919515, "rewards/rejected": -2.331540902455648, "step": 10701 }, { "epoch": 0.5672488272864601, "grad_norm": 43.75, "kl": 1.4352798461914062, "learning_rate": 5e-07, "logits/chosen": -7311311.0, "logits/rejected": -26680704.0, "logps/chosen": -307.46636962890625, "logps/rejected": -184.5688018798828, "loss": 0.3566, "rewards/chosen": 0.17292450368404388, "rewards/margins": 2.2123355716466904, "rewards/rejected": -2.0394110679626465, "step": 10702 }, { "epoch": 0.5673018312882623, "grad_norm": 40.25, "kl": 0.6875782012939453, "learning_rate": 5e-07, "logits/chosen": -12722928.0, "logits/rejected": 15991070.666666666, "logps/chosen": -139.67584228515625, "logps/rejected": -353.7320149739583, "loss": 0.2364, "rewards/chosen": 1.1057218313217163, "rewards/margins": 2.8776828845341997, "rewards/rejected": -1.7719610532124836, "step": 10703 }, { "epoch": 0.5673548352900644, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22166160.0, "logits/rejected": -22985708.0, "logps/chosen": -122.64765930175781, "logps/rejected": -293.495361328125, "loss": 0.3328, "rewards/chosen": 0.09156012535095215, "rewards/margins": 1.7116715908050537, "rewards/rejected": -1.6201114654541016, "step": 10704 }, { "epoch": 0.5674078392918666, "grad_norm": 74.5, "kl": 2.307598114013672, "learning_rate": 5e-07, "logits/chosen": -36251476.0, "logits/rejected": 21524654.0, "logps/chosen": -306.1629333496094, "logps/rejected": -206.1725311279297, "loss": 0.2704, "rewards/chosen": 0.8796734809875488, "rewards/margins": 2.7318758964538574, "rewards/rejected": -1.8522024154663086, "step": 10705 }, { "epoch": 0.5674608432936686, "grad_norm": 69.0, "kl": 2.847991943359375, "learning_rate": 5e-07, "logits/chosen": -29495573.333333332, "logits/rejected": -2421857.0, "logps/chosen": -199.88321940104166, "logps/rejected": -154.417431640625, "loss": 0.2619, "rewards/chosen": 0.6734107335408529, "rewards/margins": 2.644078000386556, "rewards/rejected": -1.970667266845703, "step": 10706 }, { "epoch": 0.5675138472954708, "grad_norm": 69.0, "kl": 0.37775421142578125, "learning_rate": 5e-07, "logits/chosen": -3919459.2, "logits/rejected": -128136629.33333333, "logps/chosen": -410.38046875, "logps/rejected": -401.3703206380208, "loss": 0.1981, "rewards/chosen": 1.2976386070251464, "rewards/margins": 3.4701725959777834, "rewards/rejected": -2.1725339889526367, "step": 10707 }, { "epoch": 0.5675668512972729, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15854014.0, "logits/rejected": -25089285.333333332, "logps/chosen": -153.18414306640625, "logps/rejected": -220.30415852864584, "loss": 0.283, "rewards/chosen": -0.09589463472366333, "rewards/margins": 2.002613882223765, "rewards/rejected": -2.0985085169474282, "step": 10708 }, { "epoch": 0.5676198552990751, "grad_norm": 32.0, "kl": 1.0054206848144531, "learning_rate": 5e-07, "logits/chosen": -25984966.0, "logits/rejected": -16648565.333333334, "logps/chosen": -160.1234130859375, "logps/rejected": -526.1572672526041, "loss": 0.1761, "rewards/chosen": 0.10619869828224182, "rewards/margins": 3.4118907352288566, "rewards/rejected": -3.3056920369466147, "step": 10709 }, { "epoch": 0.5676728593008772, "grad_norm": 56.0, "kl": 0.4279937744140625, "learning_rate": 5e-07, "logits/chosen": -62920248.0, "logits/rejected": 8554018.0, "logps/chosen": -374.0189514160156, "logps/rejected": -628.9441528320312, "loss": 0.1681, "rewards/chosen": 1.214818000793457, "rewards/margins": 3.8965861797332764, "rewards/rejected": -2.6817681789398193, "step": 10710 }, { "epoch": 0.5677258633026794, "grad_norm": 48.0, "kl": 1.3218460083007812, "learning_rate": 5e-07, "logits/chosen": -34420501.333333336, "logits/rejected": -11181796.8, "logps/chosen": -430.52685546875, "logps/rejected": -97.8290771484375, "loss": 0.198, "rewards/chosen": 1.5876749356587727, "rewards/margins": 2.9704601605733236, "rewards/rejected": -1.3827852249145507, "step": 10711 }, { "epoch": 0.5677788673044815, "grad_norm": 38.75, "kl": 1.7910423278808594, "learning_rate": 5e-07, "logits/chosen": -43690304.0, "logits/rejected": -40902420.0, "logps/chosen": -121.50154876708984, "logps/rejected": -236.65512084960938, "loss": 0.2794, "rewards/chosen": 0.862838864326477, "rewards/margins": 2.748215675354004, "rewards/rejected": -1.8853768110275269, "step": 10712 }, { "epoch": 0.5678318713062837, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24795780.8, "logits/rejected": -21899933.333333332, "logps/chosen": -360.942578125, "logps/rejected": -342.7276204427083, "loss": 0.3349, "rewards/chosen": 0.22108125686645508, "rewards/margins": 2.2016005516052246, "rewards/rejected": -1.9805192947387695, "step": 10713 }, { "epoch": 0.5678848753080857, "grad_norm": 49.25, "kl": 0.6272735595703125, "learning_rate": 5e-07, "logits/chosen": -8227878.0, "logits/rejected": -744475.25, "logps/chosen": -279.12892659505206, "logps/rejected": -107.05261993408203, "loss": 0.354, "rewards/chosen": 0.3755679527918498, "rewards/margins": 2.6284745136896768, "rewards/rejected": -2.252906560897827, "step": 10714 }, { "epoch": 0.5679378793098879, "grad_norm": 49.25, "kl": 2.8681821823120117, "learning_rate": 5e-07, "logits/chosen": -2505395.2, "logits/rejected": -27705266.666666668, "logps/chosen": -95.7423583984375, "logps/rejected": -165.91553751627603, "loss": 0.3548, "rewards/chosen": 0.5036767482757568, "rewards/margins": 2.4045944372812906, "rewards/rejected": -1.900917689005534, "step": 10715 }, { "epoch": 0.56799088331169, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28696352.0, "logits/rejected": -40354700.0, "logps/chosen": -133.47900390625, "logps/rejected": -645.8173217773438, "loss": 0.2734, "rewards/chosen": 0.27507033944129944, "rewards/margins": 2.643045574426651, "rewards/rejected": -2.3679752349853516, "step": 10716 }, { "epoch": 0.5680438873134922, "grad_norm": 65.0, "kl": 3.7731680870056152, "learning_rate": 5e-07, "logits/chosen": -36801416.0, "logps/chosen": -337.7718200683594, "loss": 0.4228, "rewards/chosen": 0.7407810688018799, "step": 10717 }, { "epoch": 0.5680968913152943, "grad_norm": 21.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5989866.5, "logits/rejected": -45837786.666666664, "logps/chosen": -187.03126525878906, "logps/rejected": -487.9806315104167, "loss": 0.1433, "rewards/chosen": 1.2299765348434448, "rewards/margins": 4.046472748120626, "rewards/rejected": -2.816496213277181, "step": 10718 }, { "epoch": 0.5681498953170965, "grad_norm": 21.625, "kl": 1.0205373764038086, "learning_rate": 5e-07, "logits/chosen": 69025075.2, "logits/rejected": 1727718.5, "logps/chosen": -32.732861328125, "logps/rejected": -77.55397033691406, "loss": 0.3365, "rewards/chosen": 0.45093836784362795, "rewards/margins": 2.3365761280059814, "rewards/rejected": -1.8856377601623535, "step": 10719 }, { "epoch": 0.5682028993188986, "grad_norm": 40.5, "kl": 1.041132926940918, "learning_rate": 5e-07, "logits/chosen": -16969702.666666668, "logits/rejected": -29740880.0, "logps/chosen": -186.85107421875, "logps/rejected": -349.375537109375, "loss": 0.2746, "rewards/chosen": 0.26995118459065753, "rewards/margins": 1.7933708508809407, "rewards/rejected": -1.5234196662902832, "step": 10720 }, { "epoch": 0.5682559033207008, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33708364.0, "logits/rejected": -24573693.333333332, "logps/chosen": -303.28692626953125, "logps/rejected": -217.9548136393229, "loss": 0.3007, "rewards/chosen": 1.363904595375061, "rewards/margins": 2.6683211723963423, "rewards/rejected": -1.304416577021281, "step": 10721 }, { "epoch": 0.5683089073225028, "grad_norm": 46.5, "kl": 0.004618644714355469, "learning_rate": 5e-07, "logits/chosen": 3246415.0, "logits/rejected": -9445682.4, "logps/chosen": -351.89453125, "logps/rejected": -225.6687744140625, "loss": 0.1771, "rewards/chosen": 1.6445536613464355, "rewards/margins": 3.6611844062805177, "rewards/rejected": -2.016630744934082, "step": 10722 }, { "epoch": 0.568361911324305, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27320233.6, "logits/rejected": -52209344.0, "logps/chosen": -410.37265625, "logps/rejected": -476.8121337890625, "loss": 0.2882, "rewards/chosen": 0.4834275722503662, "rewards/margins": 2.868706432978312, "rewards/rejected": -2.385278860727946, "step": 10723 }, { "epoch": 0.5684149153261071, "grad_norm": 56.0, "kl": 1.8463668823242188, "learning_rate": 5e-07, "logits/chosen": -21794340.0, "logits/rejected": -42671824.0, "logps/chosen": -229.52857971191406, "logps/rejected": -341.29046630859375, "loss": 0.3066, "rewards/chosen": 0.02920818328857422, "rewards/margins": 3.01157283782959, "rewards/rejected": -2.9823646545410156, "step": 10724 }, { "epoch": 0.5684679193279093, "grad_norm": 32.5, "kl": 3.421588897705078, "learning_rate": 5e-07, "logits/chosen": -29324416.0, "logits/rejected": -4185329.6666666665, "logps/chosen": -256.9775390625, "logps/rejected": -66.68995157877605, "loss": 0.3353, "rewards/chosen": 0.7761946678161621, "rewards/margins": 2.172933832804362, "rewards/rejected": -1.3967391649882, "step": 10725 }, { "epoch": 0.5685209233297114, "grad_norm": 66.5, "kl": 0.051219940185546875, "learning_rate": 5e-07, "logits/chosen": -30044809.14285714, "logits/rejected": -42560512.0, "logps/chosen": -328.00048828125, "logps/rejected": -857.096435546875, "loss": 0.3353, "rewards/chosen": 0.543123585837228, "rewards/margins": 6.122103078024728, "rewards/rejected": -5.5789794921875, "step": 10726 }, { "epoch": 0.5685739273315136, "grad_norm": 42.5, "kl": 3.950359344482422, "learning_rate": 5e-07, "logits/chosen": -5333820.8, "logits/rejected": -4189138.6666666665, "logps/chosen": -110.18001708984374, "logps/rejected": -121.2864990234375, "loss": 0.4782, "rewards/chosen": 0.14898048639297484, "rewards/margins": 1.1957624236742657, "rewards/rejected": -1.0467819372812908, "step": 10727 }, { "epoch": 0.5686269313333157, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31933766.0, "logits/rejected": -46748412.0, "logps/chosen": -551.6640625, "logps/rejected": -465.4932861328125, "loss": 0.2149, "rewards/chosen": 0.742785632610321, "rewards/margins": 3.538667142391205, "rewards/rejected": -2.795881509780884, "step": 10728 }, { "epoch": 0.5686799353351178, "grad_norm": 35.0, "kl": 0.1981964111328125, "learning_rate": 5e-07, "logits/chosen": -9740456.0, "logits/rejected": -31470425.6, "logps/chosen": -196.9124755859375, "logps/rejected": -266.3972412109375, "loss": 0.2219, "rewards/chosen": 0.497151255607605, "rewards/margins": 2.8407341718673704, "rewards/rejected": -2.3435829162597654, "step": 10729 }, { "epoch": 0.5687329393369199, "grad_norm": 43.0, "kl": 5.225773811340332, "learning_rate": 5e-07, "logits/chosen": -12826482.4, "logits/rejected": -10092695.333333334, "logps/chosen": -285.1178466796875, "logps/rejected": -122.31508382161458, "loss": 0.3145, "rewards/chosen": 1.2424446105957032, "rewards/margins": 2.8968783378601075, "rewards/rejected": -1.6544337272644043, "step": 10730 }, { "epoch": 0.5687859433387221, "grad_norm": 41.75, "kl": 0.1594219207763672, "learning_rate": 5e-07, "logits/chosen": -21080928.0, "logits/rejected": -9010378.0, "logps/chosen": -403.10687255859375, "logps/rejected": -77.44090270996094, "loss": 0.2774, "rewards/chosen": 1.066036581993103, "rewards/margins": 2.3092154264450073, "rewards/rejected": -1.2431788444519043, "step": 10731 }, { "epoch": 0.5688389473405242, "grad_norm": 66.0, "kl": 4.709552764892578, "learning_rate": 5e-07, "logits/chosen": -16569145.6, "logits/rejected": -13356717.333333334, "logps/chosen": -370.42333984375, "logps/rejected": -379.537841796875, "loss": 0.2761, "rewards/chosen": 1.0740812301635743, "rewards/margins": 3.0983131408691404, "rewards/rejected": -2.0242319107055664, "step": 10732 }, { "epoch": 0.5688919513423264, "grad_norm": 71.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19580472.0, "logits/rejected": -4064900.3333333335, "logps/chosen": -503.71650390625, "logps/rejected": -96.45137532552083, "loss": 0.3313, "rewards/chosen": 0.6745329380035401, "rewards/margins": 2.1776806990305584, "rewards/rejected": -1.5031477610270183, "step": 10733 }, { "epoch": 0.5689449553441285, "grad_norm": 62.0, "kl": 3.0270614624023438, "learning_rate": 5e-07, "logits/chosen": -1359382.75, "logits/rejected": -24453838.0, "logps/chosen": -207.79972330729166, "logps/rejected": -298.15130615234375, "loss": 0.4177, "rewards/chosen": 0.4034458001454671, "rewards/margins": 2.3268055518468223, "rewards/rejected": -1.923359751701355, "step": 10734 }, { "epoch": 0.5689979593459307, "grad_norm": 68.0, "kl": 0.39806365966796875, "learning_rate": 5e-07, "logits/chosen": -16830780.0, "logits/rejected": 28250028.0, "logps/chosen": -323.7057698567708, "logps/rejected": -877.8842163085938, "loss": 0.2663, "rewards/chosen": 0.7369815508524576, "rewards/margins": 3.215533892313639, "rewards/rejected": -2.4785523414611816, "step": 10735 }, { "epoch": 0.5690509633477328, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58250515.2, "logits/rejected": -55348149.333333336, "logps/chosen": -907.808203125, "logps/rejected": -462.47265625, "loss": 0.2688, "rewards/chosen": 0.5820816040039063, "rewards/margins": 3.7979387919108074, "rewards/rejected": -3.215857187906901, "step": 10736 }, { "epoch": 0.569103967349535, "grad_norm": 47.75, "kl": 3.262340545654297, "learning_rate": 5e-07, "logits/chosen": -87443832.0, "logits/rejected": -29721690.0, "logps/chosen": -1144.884521484375, "logps/rejected": -299.3526916503906, "loss": 0.3417, "rewards/chosen": 0.5096391439437866, "rewards/margins": 2.7080386877059937, "rewards/rejected": -2.198399543762207, "step": 10737 }, { "epoch": 0.569156971351337, "grad_norm": 51.75, "kl": 0.13269805908203125, "learning_rate": 5e-07, "logits/chosen": -43946357.333333336, "logits/rejected": -11782112.0, "logps/chosen": -441.9478352864583, "logps/rejected": -158.74942016601562, "loss": 0.3476, "rewards/chosen": 0.4687119722366333, "rewards/margins": 2.545906901359558, "rewards/rejected": -2.077194929122925, "step": 10738 }, { "epoch": 0.5692099753531391, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36933416.0, "logits/rejected": -29204090.0, "logps/chosen": -238.54049682617188, "logps/rejected": -362.00115966796875, "loss": 0.3577, "rewards/chosen": -0.21173031628131866, "rewards/margins": 1.719293162226677, "rewards/rejected": -1.9310234785079956, "step": 10739 }, { "epoch": 0.5692629793549413, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29785240.0, "logits/rejected": -11701884.0, "logps/chosen": -158.32376098632812, "logps/rejected": -359.3191324869792, "loss": 0.2821, "rewards/chosen": -0.08315429091453552, "rewards/margins": 1.825099378824234, "rewards/rejected": -1.9082536697387695, "step": 10740 }, { "epoch": 0.5693159833567434, "grad_norm": 41.75, "kl": 2.0364341735839844, "learning_rate": 5e-07, "logits/chosen": -49608821.333333336, "logits/rejected": -15041900.8, "logps/chosen": -764.6268717447916, "logps/rejected": -226.629736328125, "loss": 0.2607, "rewards/chosen": 1.4775686264038086, "rewards/margins": 2.9495230674743653, "rewards/rejected": -1.4719544410705567, "step": 10741 }, { "epoch": 0.5693689873585456, "grad_norm": 42.75, "kl": 3.569314956665039, "learning_rate": 5e-07, "logits/chosen": -15459476.0, "logits/rejected": -26507688.0, "logps/chosen": -568.4725952148438, "logps/rejected": -178.80738830566406, "loss": 0.2484, "rewards/chosen": 1.772984504699707, "rewards/margins": 3.0460795164108276, "rewards/rejected": -1.2730950117111206, "step": 10742 }, { "epoch": 0.5694219913603477, "grad_norm": 42.75, "kl": 3.655590057373047, "learning_rate": 5e-07, "logits/chosen": -34472138.666666664, "logits/rejected": -58758152.0, "logps/chosen": -183.52803548177084, "logps/rejected": -506.82916259765625, "loss": 0.4973, "rewards/chosen": -0.16078693668047586, "rewards/margins": 2.4415277739365897, "rewards/rejected": -2.6023147106170654, "step": 10743 }, { "epoch": 0.5694749953621498, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9404588.0, "logits/rejected": -47931312.0, "logps/chosen": -272.7469482421875, "logps/rejected": -269.1358337402344, "loss": 0.3048, "rewards/chosen": -0.001137852668762207, "rewards/margins": 2.7475370168685913, "rewards/rejected": -2.7486748695373535, "step": 10744 }, { "epoch": 0.5695279993639519, "grad_norm": 71.5, "kl": 2.377826690673828, "learning_rate": 5e-07, "logits/chosen": -34696720.0, "logits/rejected": -11299376.0, "logps/chosen": -315.8096400669643, "logps/rejected": -318.7837829589844, "loss": 0.4315, "rewards/chosen": 0.29373598098754883, "rewards/margins": 1.9942669868469238, "rewards/rejected": -1.700531005859375, "step": 10745 }, { "epoch": 0.5695810033657541, "grad_norm": 57.0, "kl": 0.6178665161132812, "learning_rate": 5e-07, "logits/chosen": -84335632.0, "logits/rejected": -1928047.375, "logps/chosen": -366.6990051269531, "logps/rejected": -112.39443969726562, "loss": 0.2662, "rewards/chosen": 0.7361389398574829, "rewards/margins": 2.5050315856933594, "rewards/rejected": -1.7688926458358765, "step": 10746 }, { "epoch": 0.5696340073675562, "grad_norm": 52.25, "kl": 0.9692230224609375, "learning_rate": 5e-07, "logits/chosen": -75646544.0, "logits/rejected": -28425316.0, "logps/chosen": -629.408203125, "logps/rejected": -515.221435546875, "loss": 0.2061, "rewards/chosen": 1.2046905755996704, "rewards/margins": 3.16031277179718, "rewards/rejected": -1.9556221961975098, "step": 10747 }, { "epoch": 0.5696870113693584, "grad_norm": 56.25, "kl": 4.340991973876953, "learning_rate": 5e-07, "logits/chosen": -51449829.333333336, "logits/rejected": -29894080.0, "logps/chosen": -380.7337239583333, "logps/rejected": -261.812060546875, "loss": 0.2391, "rewards/chosen": 1.1885898907979329, "rewards/margins": 3.5315285046895344, "rewards/rejected": -2.3429386138916017, "step": 10748 }, { "epoch": 0.5697400153711605, "grad_norm": 80.5, "kl": 3.077145576477051, "learning_rate": 5e-07, "logits/chosen": 33282168.0, "logits/rejected": -11385172.0, "logps/chosen": -759.543212890625, "logps/rejected": -391.5950927734375, "loss": 0.4127, "rewards/chosen": 0.48764363924662274, "rewards/margins": 1.583432952562968, "rewards/rejected": -1.0957893133163452, "step": 10749 }, { "epoch": 0.5697930193729627, "grad_norm": 45.25, "kl": 3.0421199798583984, "learning_rate": 5e-07, "logits/chosen": -52391365.333333336, "logits/rejected": -24174664.0, "logps/chosen": -410.0804443359375, "logps/rejected": -313.36787109375, "loss": 0.2583, "rewards/chosen": 0.37067023913065594, "rewards/margins": 2.2107241789499916, "rewards/rejected": -1.8400539398193358, "step": 10750 }, { "epoch": 0.5698460233747648, "grad_norm": 46.5, "kl": 0.017104148864746094, "learning_rate": 5e-07, "logits/chosen": 3541316.0, "logits/rejected": -32246576.0, "logps/chosen": -79.83980814615886, "logps/rejected": -211.5131103515625, "loss": 0.2211, "rewards/chosen": 1.0544355710347493, "rewards/margins": 3.1197100003560383, "rewards/rejected": -2.0652744293212892, "step": 10751 }, { "epoch": 0.569899027376567, "grad_norm": 92.0, "kl": 1.5377578735351562, "learning_rate": 5e-07, "logits/chosen": -44456425.6, "logits/rejected": -42083120.0, "logps/chosen": -460.17451171875, "logps/rejected": -293.25917561848956, "loss": 0.355, "rewards/chosen": 0.34446351528167723, "rewards/margins": 2.254496105511983, "rewards/rejected": -1.910032590230306, "step": 10752 }, { "epoch": 0.569952031378369, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34385472.0, "logits/rejected": -23991704.0, "logps/chosen": -345.68402099609375, "logps/rejected": -291.91522216796875, "loss": 0.2788, "rewards/chosen": 0.7018014788627625, "rewards/margins": 2.418108880519867, "rewards/rejected": -1.7163074016571045, "step": 10753 }, { "epoch": 0.5700050353801712, "grad_norm": 57.75, "kl": 2.556285858154297, "learning_rate": 5e-07, "logits/chosen": -30327316.57142857, "logits/rejected": -3022014.25, "logps/chosen": -451.77744838169644, "logps/rejected": -77.67564392089844, "loss": 0.2957, "rewards/chosen": 1.1625659125191825, "rewards/margins": 3.867555584226336, "rewards/rejected": -2.7049896717071533, "step": 10754 }, { "epoch": 0.5700580393819733, "grad_norm": 68.5, "kl": 1.33392333984375, "learning_rate": 5e-07, "logits/chosen": -63132810.666666664, "logits/rejected": 1648510.25, "logps/chosen": -416.6265869140625, "logps/rejected": -138.0205078125, "loss": 0.3314, "rewards/chosen": 0.6894930203755697, "rewards/margins": 2.041053613026937, "rewards/rejected": -1.3515605926513672, "step": 10755 }, { "epoch": 0.5701110433837755, "grad_norm": 57.0, "kl": 0.31084442138671875, "learning_rate": 5e-07, "logits/chosen": -22557454.0, "logps/chosen": -297.782958984375, "loss": 0.3694, "rewards/chosen": 0.7286895513534546, "step": 10756 }, { "epoch": 0.5701640473855776, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25376672.0, "logits/rejected": -13425970.0, "logps/chosen": -299.6871643066406, "logps/rejected": -142.1014404296875, "loss": 0.3768, "rewards/chosen": 0.16754093766212463, "rewards/margins": 1.0896000564098358, "rewards/rejected": -0.9220591187477112, "step": 10757 }, { "epoch": 0.5702170513873798, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17306800.0, "logits/rejected": -9976441.0, "logps/chosen": -311.3059387207031, "logps/rejected": -254.98922729492188, "loss": 0.2574, "rewards/chosen": 0.5890140533447266, "rewards/margins": 2.823498487472534, "rewards/rejected": -2.2344844341278076, "step": 10758 }, { "epoch": 0.5702700553891819, "grad_norm": 48.25, "kl": 0.04494476318359375, "learning_rate": 5e-07, "logits/chosen": -27077198.0, "logits/rejected": -11755532.0, "logps/chosen": -331.58197021484375, "logps/rejected": -185.49215698242188, "loss": 0.2338, "rewards/chosen": 0.8958951234817505, "rewards/margins": 2.681813955307007, "rewards/rejected": -1.7859188318252563, "step": 10759 }, { "epoch": 0.570323059390984, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7318243.333333333, "logits/rejected": -14677750.4, "logps/chosen": -146.18400065104166, "logps/rejected": -154.27623291015624, "loss": 0.3147, "rewards/chosen": 0.0600807269414266, "rewards/margins": 1.4164854129155475, "rewards/rejected": -1.356404685974121, "step": 10760 }, { "epoch": 0.5703760633927861, "grad_norm": 56.75, "kl": 1.7855348587036133, "learning_rate": 5e-07, "logits/chosen": -26566204.8, "logits/rejected": -138693173.33333334, "logps/chosen": -192.0311279296875, "logps/rejected": -478.8892008463542, "loss": 0.398, "rewards/chosen": 0.06968417167663574, "rewards/margins": 2.2304094473520912, "rewards/rejected": -2.1607252756754556, "step": 10761 }, { "epoch": 0.5704290673945883, "grad_norm": 50.25, "kl": 0.4271430969238281, "learning_rate": 5e-07, "logits/chosen": -32029342.0, "logits/rejected": 39437760.0, "logps/chosen": -622.3008422851562, "logps/rejected": -384.895263671875, "loss": 0.2209, "rewards/chosen": 0.9881615042686462, "rewards/margins": 3.3797301650047302, "rewards/rejected": -2.391568660736084, "step": 10762 }, { "epoch": 0.5704820713963904, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1804833.8333333333, "logits/rejected": -7143632.0, "logps/chosen": -99.51084391276042, "logps/rejected": -173.7090087890625, "loss": 0.2941, "rewards/chosen": 0.42333634694417316, "rewards/margins": 1.7280201276143392, "rewards/rejected": -1.304683780670166, "step": 10763 }, { "epoch": 0.5705350753981926, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -104178776.0, "logits/rejected": -40156909.333333336, "logps/chosen": -485.30645751953125, "logps/rejected": -289.2423909505208, "loss": 0.1783, "rewards/chosen": 0.8315094113349915, "rewards/margins": 3.2570628921190896, "rewards/rejected": -2.425553480784098, "step": 10764 }, { "epoch": 0.5705880793999947, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29095484.0, "logits/rejected": -39888330.666666664, "logps/chosen": -367.1569519042969, "logps/rejected": -535.5396728515625, "loss": 0.1774, "rewards/chosen": -0.11377564072608948, "rewards/margins": 2.9463522533575692, "rewards/rejected": -3.0601278940836587, "step": 10765 }, { "epoch": 0.5706410834017969, "grad_norm": 27.125, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -27485610.0, "logps/rejected": -302.076416015625, "loss": 0.1107, "rewards/rejected": -2.8460869789123535, "step": 10766 }, { "epoch": 0.570694087403599, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34877061.333333336, "logits/rejected": -31178230.4, "logps/chosen": -170.86273193359375, "logps/rejected": -183.39921875, "loss": 0.2853, "rewards/chosen": 0.5542713403701782, "rewards/margins": 1.7572102785110473, "rewards/rejected": -1.202938938140869, "step": 10767 }, { "epoch": 0.5707470914054011, "grad_norm": 44.0, "kl": 1.237290382385254, "learning_rate": 5e-07, "logits/chosen": -20330484.0, "logits/rejected": -87224624.0, "logps/chosen": -313.86419677734375, "logps/rejected": -565.0242919921875, "loss": 0.2273, "rewards/chosen": 0.7656691074371338, "rewards/margins": 2.8209149837493896, "rewards/rejected": -2.055245876312256, "step": 10768 }, { "epoch": 0.5708000954072032, "grad_norm": 39.25, "kl": 2.6085853576660156, "learning_rate": 5e-07, "logits/chosen": -43475801.6, "logits/rejected": -16315832.0, "logps/chosen": -442.855615234375, "logps/rejected": -688.9187825520834, "loss": 0.2665, "rewards/chosen": 1.2452059745788575, "rewards/margins": 4.683308378855387, "rewards/rejected": -3.43810240427653, "step": 10769 }, { "epoch": 0.5708530994090054, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37771037.333333336, "logits/rejected": -13129594.4, "logps/chosen": -267.17917887369794, "logps/rejected": -401.411962890625, "loss": 0.171, "rewards/chosen": 0.682939608891805, "rewards/margins": 3.998919375737508, "rewards/rejected": -3.315979766845703, "step": 10770 }, { "epoch": 0.5709061034108075, "grad_norm": 48.25, "kl": 3.920318603515625, "learning_rate": 5e-07, "logits/chosen": -33947418.666666664, "logits/rejected": -14549182.0, "logps/chosen": -448.924072265625, "logps/rejected": -197.607666015625, "loss": 0.3193, "rewards/chosen": 0.7093241214752197, "rewards/margins": 2.8016531467437744, "rewards/rejected": -2.0923290252685547, "step": 10771 }, { "epoch": 0.5709591074126097, "grad_norm": 58.25, "kl": 0.32987403869628906, "learning_rate": 5e-07, "logits/chosen": 11720953.6, "logits/rejected": -63031861.333333336, "logps/chosen": -332.99990234375, "logps/rejected": -121.00881958007812, "loss": 0.3226, "rewards/chosen": 0.6316878795623779, "rewards/margins": 1.9630938371022542, "rewards/rejected": -1.3314059575398762, "step": 10772 }, { "epoch": 0.5710121114144118, "grad_norm": 63.75, "kl": 2.3227920532226562, "learning_rate": 5e-07, "logits/chosen": 41751936.0, "logits/rejected": 13053160.0, "logps/chosen": -341.2329406738281, "logps/rejected": -189.007080078125, "loss": 0.3917, "rewards/chosen": 0.3341568112373352, "rewards/margins": 0.950330376625061, "rewards/rejected": -0.6161735653877258, "step": 10773 }, { "epoch": 0.571065115416214, "grad_norm": 75.5, "kl": 0.052520751953125, "learning_rate": 5e-07, "logits/chosen": -48692896.0, "logits/rejected": -19490984.0, "logps/chosen": -552.2550048828125, "logps/rejected": -283.1780090332031, "loss": 0.2749, "rewards/chosen": 0.6823623776435852, "rewards/margins": 2.2271193861961365, "rewards/rejected": -1.5447570085525513, "step": 10774 }, { "epoch": 0.571118119418016, "grad_norm": 67.5, "kl": 0.09505081176757812, "learning_rate": 5e-07, "logits/chosen": -59430950.4, "logits/rejected": -30433336.0, "logps/chosen": -470.6736328125, "logps/rejected": -309.6031901041667, "loss": 0.3684, "rewards/chosen": -0.2569964647293091, "rewards/margins": 3.092373220125834, "rewards/rejected": -3.349369684855143, "step": 10775 }, { "epoch": 0.5711711234198182, "grad_norm": 50.25, "kl": 0.3485260009765625, "learning_rate": 5e-07, "logits/chosen": 3209124.75, "logits/rejected": 66454912.0, "logps/chosen": -113.0278091430664, "logps/rejected": -245.0528767903646, "loss": 0.2208, "rewards/chosen": 0.6960231065750122, "rewards/margins": 2.536325256029765, "rewards/rejected": -1.8403021494547527, "step": 10776 }, { "epoch": 0.5712241274216203, "grad_norm": 68.0, "kl": 0.14497756958007812, "learning_rate": 5e-07, "logits/chosen": -68496250.66666667, "logits/rejected": -23586382.0, "logps/chosen": -242.03324381510416, "logps/rejected": -207.9296875, "loss": 0.3803, "rewards/chosen": 0.11288544535636902, "rewards/margins": 2.7182924449443817, "rewards/rejected": -2.6054069995880127, "step": 10777 }, { "epoch": 0.5712771314234225, "grad_norm": 44.25, "kl": 0.7225685119628906, "learning_rate": 5e-07, "logits/chosen": -84336582.4, "logits/rejected": -27121317.333333332, "logps/chosen": -304.788671875, "logps/rejected": -165.32023111979166, "loss": 0.2698, "rewards/chosen": 0.6889474868774415, "rewards/margins": 2.780340003967285, "rewards/rejected": -2.0913925170898438, "step": 10778 }, { "epoch": 0.5713301354252246, "grad_norm": 46.5, "kl": 3.5495986938476562, "learning_rate": 5e-07, "logits/chosen": -23449700.0, "logits/rejected": -39967888.0, "logps/chosen": -386.3811950683594, "logps/rejected": -357.346923828125, "loss": 0.2527, "rewards/chosen": 0.9798194766044617, "rewards/margins": 3.8175856471061707, "rewards/rejected": -2.837766170501709, "step": 10779 }, { "epoch": 0.5713831394270268, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30510237.333333332, "logits/rejected": -21645001.6, "logps/chosen": -435.2758382161458, "logps/rejected": -377.612548828125, "loss": 0.184, "rewards/chosen": 0.46343080202738446, "rewards/margins": 3.3853428045908607, "rewards/rejected": -2.9219120025634764, "step": 10780 }, { "epoch": 0.5714361434288289, "grad_norm": 55.0, "kl": 1.7024669647216797, "learning_rate": 5e-07, "logits/chosen": -30612985.6, "logits/rejected": -10819572.0, "logps/chosen": -367.41005859375, "logps/rejected": -209.1563924153646, "loss": 0.4029, "rewards/chosen": 0.13024283647537233, "rewards/margins": 1.44023832877477, "rewards/rejected": -1.3099954922993977, "step": 10781 }, { "epoch": 0.5714891474306311, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43143434.666666664, "logits/rejected": -68646672.0, "logps/chosen": -317.55934651692706, "logps/rejected": -582.5682373046875, "loss": 0.3355, "rewards/chosen": 0.34574000040690106, "rewards/margins": 3.0354038874308267, "rewards/rejected": -2.689663887023926, "step": 10782 }, { "epoch": 0.5715421514324331, "grad_norm": 51.75, "kl": 1.2620134353637695, "learning_rate": 5e-07, "logits/chosen": -25760177.6, "logits/rejected": 14584452.0, "logps/chosen": -294.732568359375, "logps/rejected": -657.89599609375, "loss": 0.349, "rewards/chosen": 0.17037124633789064, "rewards/margins": 4.664451789855957, "rewards/rejected": -4.494080543518066, "step": 10783 }, { "epoch": 0.5715951554342353, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23893898.0, "logits/rejected": -22230161.333333332, "logps/chosen": -238.9756317138672, "logps/rejected": -223.68145751953125, "loss": 0.2797, "rewards/chosen": 0.3078346252441406, "rewards/margins": 1.7039809226989746, "rewards/rejected": -1.396146297454834, "step": 10784 }, { "epoch": 0.5716481594360374, "grad_norm": 51.25, "kl": 0.11198234558105469, "learning_rate": 5e-07, "logits/chosen": -4121764.0, "logits/rejected": 5451968.0, "logps/chosen": -395.654052734375, "logps/rejected": -287.432958984375, "loss": 0.2541, "rewards/chosen": 0.3832143147786458, "rewards/margins": 2.6017379124959312, "rewards/rejected": -2.2185235977172852, "step": 10785 }, { "epoch": 0.5717011634378396, "grad_norm": 31.875, "kl": 1.5458621978759766, "learning_rate": 5e-07, "logits/chosen": -10642219.0, "logits/rejected": -27628704.0, "logps/chosen": -790.500244140625, "logps/rejected": -282.8895670572917, "loss": 0.0847, "rewards/chosen": 1.9352699518203735, "rewards/margins": 4.566487272580465, "rewards/rejected": -2.6312173207600913, "step": 10786 }, { "epoch": 0.5717541674396417, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41221256.0, "logits/rejected": -74504128.0, "logps/chosen": -296.9078674316406, "logps/rejected": -183.5363006591797, "loss": 0.372, "rewards/chosen": -0.25736430287361145, "rewards/margins": 1.4812929332256317, "rewards/rejected": -1.7386572360992432, "step": 10787 }, { "epoch": 0.5718071714414439, "grad_norm": 57.0, "kl": 0.7089767456054688, "learning_rate": 5e-07, "logits/chosen": -7593988.0, "logits/rejected": -11727090.4, "logps/chosen": -272.37229410807294, "logps/rejected": -250.9004150390625, "loss": 0.2432, "rewards/chosen": 0.12143046657244365, "rewards/margins": 2.8755213479201, "rewards/rejected": -2.7540908813476563, "step": 10788 }, { "epoch": 0.571860175443246, "grad_norm": 45.25, "kl": 0.40935230255126953, "learning_rate": 5e-07, "logits/chosen": -46737589.333333336, "logits/rejected": -33485056.0, "logps/chosen": -393.907470703125, "logps/rejected": -491.4228515625, "loss": 0.2154, "rewards/chosen": 1.2059179941813152, "rewards/margins": 3.448962656656901, "rewards/rejected": -2.2430446624755858, "step": 10789 }, { "epoch": 0.571913179445048, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9252038.4, "logits/rejected": -37932410.666666664, "logps/chosen": -745.21513671875, "logps/rejected": -383.6436767578125, "loss": 0.3154, "rewards/chosen": 0.48728718757629397, "rewards/margins": 3.275803550084432, "rewards/rejected": -2.788516362508138, "step": 10790 }, { "epoch": 0.5719661834468502, "grad_norm": 51.5, "kl": 1.175262451171875, "learning_rate": 5e-07, "logits/chosen": -64249557.333333336, "logits/rejected": -22977792.0, "logps/chosen": -395.4564615885417, "logps/rejected": -236.5443115234375, "loss": 0.296, "rewards/chosen": 0.4227554400761922, "rewards/margins": 1.9406885226567585, "rewards/rejected": -1.5179330825805664, "step": 10791 }, { "epoch": 0.5720191874486523, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40019322.666666664, "logits/rejected": -12188805.6, "logps/chosen": -272.3849690755208, "logps/rejected": -190.2080322265625, "loss": 0.2802, "rewards/chosen": 0.24277091026306152, "rewards/margins": 2.4291638851165773, "rewards/rejected": -2.186392974853516, "step": 10792 }, { "epoch": 0.5720721914504545, "grad_norm": 51.0, "kl": 0.19382858276367188, "learning_rate": 5e-07, "logits/chosen": -59872080.0, "logits/rejected": -10520178.0, "logps/chosen": -209.784423828125, "logps/rejected": -277.5211486816406, "loss": 0.2742, "rewards/chosen": 0.23058585822582245, "rewards/margins": 2.5000277012586594, "rewards/rejected": -2.269441843032837, "step": 10793 }, { "epoch": 0.5721251954522566, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10652938.0, "logits/rejected": -11585260.0, "logps/chosen": -81.67747497558594, "logps/rejected": -318.71217854817706, "loss": 0.2504, "rewards/chosen": -0.520398736000061, "rewards/margins": 1.5474826892217002, "rewards/rejected": -2.067881425221761, "step": 10794 }, { "epoch": 0.5721781994540588, "grad_norm": 30.625, "kl": 3.798884391784668, "learning_rate": 5e-07, "logits/chosen": 951721.5625, "logits/rejected": 3223610.25, "logps/chosen": -105.24520874023438, "logps/rejected": -317.163818359375, "loss": 0.3006, "rewards/chosen": 0.3339155316352844, "rewards/margins": 3.2499383091926575, "rewards/rejected": -2.916022777557373, "step": 10795 }, { "epoch": 0.5722312034558609, "grad_norm": 44.75, "kl": 1.09625244140625, "learning_rate": 5e-07, "logits/chosen": -30751653.333333332, "logits/rejected": -25203547.2, "logps/chosen": -281.3162841796875, "logps/rejected": -443.628271484375, "loss": 0.1872, "rewards/chosen": 0.561518390973409, "rewards/margins": 3.4545279582341513, "rewards/rejected": -2.893009567260742, "step": 10796 }, { "epoch": 0.5722842074576631, "grad_norm": 38.75, "kl": 1.2420463562011719, "learning_rate": 5e-07, "logits/chosen": -22655920.0, "logits/rejected": -43666650.666666664, "logps/chosen": -85.845361328125, "logps/rejected": -380.6486409505208, "loss": 0.3036, "rewards/chosen": 0.33054659366607664, "rewards/margins": 2.885140951474508, "rewards/rejected": -2.554594357808431, "step": 10797 }, { "epoch": 0.5723372114594651, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4468408.0, "logits/rejected": -58183466.666666664, "logps/chosen": -766.119580078125, "logps/rejected": -339.13287353515625, "loss": 0.2437, "rewards/chosen": 1.185302734375, "rewards/margins": 3.355827967325846, "rewards/rejected": -2.170525232950846, "step": 10798 }, { "epoch": 0.5723902154612673, "grad_norm": 47.25, "kl": 3.239386558532715, "learning_rate": 5e-07, "logits/chosen": -39696912.0, "logits/rejected": -21718718.666666668, "logps/chosen": -227.13779296875, "logps/rejected": -235.0322469075521, "loss": 0.3962, "rewards/chosen": 0.2865553379058838, "rewards/margins": 1.6369012673695882, "rewards/rejected": -1.3503459294637044, "step": 10799 }, { "epoch": 0.5724432194630694, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1520494.0, "logits/rejected": -53137904.0, "logps/chosen": -168.4355926513672, "logps/rejected": -444.4797058105469, "loss": 0.2482, "rewards/chosen": 0.30394449830055237, "rewards/margins": 3.0170630514621735, "rewards/rejected": -2.713118553161621, "step": 10800 }, { "epoch": 0.5724962234648716, "grad_norm": 53.75, "kl": 3.338348388671875, "learning_rate": 5e-07, "logits/chosen": 9435628.8, "logits/rejected": -25778829.333333332, "logps/chosen": -260.2410400390625, "logps/rejected": -284.1956380208333, "loss": 0.3338, "rewards/chosen": 0.617486047744751, "rewards/margins": 3.6089970747629803, "rewards/rejected": -2.991511027018229, "step": 10801 }, { "epoch": 0.5725492274666737, "grad_norm": 48.25, "kl": 1.1730308532714844, "learning_rate": 5e-07, "logits/chosen": -43055058.666666664, "logits/rejected": -2583054.8, "logps/chosen": -436.2080485026042, "logps/rejected": -227.84443359375, "loss": 0.1702, "rewards/chosen": 1.1797200838724773, "rewards/margins": 3.350014273325602, "rewards/rejected": -2.170294189453125, "step": 10802 }, { "epoch": 0.5726022314684759, "grad_norm": 62.25, "kl": 1.1393299102783203, "learning_rate": 5e-07, "logits/chosen": -15025366.4, "logits/rejected": -4499448.666666667, "logps/chosen": -232.9042236328125, "logps/rejected": -217.18212890625, "loss": 0.3358, "rewards/chosen": 0.152940833568573, "rewards/margins": 2.5938432256380715, "rewards/rejected": -2.4409023920694985, "step": 10803 }, { "epoch": 0.572655235470278, "grad_norm": 71.5, "kl": 2.2429723739624023, "learning_rate": 5e-07, "logits/chosen": -38448554.666666664, "logits/rejected": -59596212.0, "logps/chosen": -430.8881022135417, "logps/rejected": -319.7981872558594, "loss": 0.2741, "rewards/chosen": 1.0269138018290203, "rewards/margins": 4.0448958079020185, "rewards/rejected": -3.017982006072998, "step": 10804 }, { "epoch": 0.5727082394720802, "grad_norm": 65.5, "kl": 2.898634910583496, "learning_rate": 5e-07, "logits/chosen": 23777050.666666668, "logits/rejected": -62891088.0, "logps/chosen": -682.4425862630209, "logps/rejected": -432.70892333984375, "loss": 0.4181, "rewards/chosen": 0.35727469126383465, "rewards/margins": 2.683126131693522, "rewards/rejected": -2.3258514404296875, "step": 10805 }, { "epoch": 0.5727612434738822, "grad_norm": 46.25, "kl": 0.2617607116699219, "learning_rate": 5e-07, "logits/chosen": -41334588.0, "logits/rejected": -13544390.857142856, "logps/chosen": -428.2629089355469, "logps/rejected": -318.86879185267856, "loss": 0.235, "rewards/chosen": -0.2397308349609375, "rewards/margins": 1.6190028871808733, "rewards/rejected": -1.8587337221418108, "step": 10806 }, { "epoch": 0.5728142474756844, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24695932.8, "logits/rejected": -2329416.0, "logps/chosen": -216.2464111328125, "logps/rejected": -103.54073079427083, "loss": 0.3141, "rewards/chosen": 0.12892738580703736, "rewards/margins": 3.2972648898760477, "rewards/rejected": -3.1683375040690103, "step": 10807 }, { "epoch": 0.5728672514774865, "grad_norm": 40.0, "kl": 1.287400245666504, "learning_rate": 5e-07, "logits/chosen": -11567497.333333334, "logits/rejected": -18962032.0, "logps/chosen": -231.0722859700521, "logps/rejected": -85.11315307617187, "loss": 0.2985, "rewards/chosen": 0.3027978340784709, "rewards/margins": 1.842201368014018, "rewards/rejected": -1.539403533935547, "step": 10808 }, { "epoch": 0.5729202554792887, "grad_norm": 55.25, "kl": 4.392295837402344, "learning_rate": 5e-07, "logits/chosen": -40435280.0, "logits/rejected": -18090500.0, "logps/chosen": -487.2912109375, "logps/rejected": -254.0605672200521, "loss": 0.3795, "rewards/chosen": 1.0350802421569825, "rewards/margins": 2.412538496653239, "rewards/rejected": -1.3774582544962566, "step": 10809 }, { "epoch": 0.5729732594810908, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19033916.8, "logits/rejected": -23334064.0, "logps/chosen": -277.2987548828125, "logps/rejected": -189.4593505859375, "loss": 0.3268, "rewards/chosen": 0.5276927471160888, "rewards/margins": 2.039927275975545, "rewards/rejected": -1.5122345288594563, "step": 10810 }, { "epoch": 0.573026263482893, "grad_norm": 51.5, "kl": 0.03332710266113281, "learning_rate": 5e-07, "logits/chosen": -26940957.333333332, "logits/rejected": -14592526.4, "logps/chosen": -141.78024291992188, "logps/rejected": -211.201171875, "loss": 0.2302, "rewards/chosen": 0.49290696779886883, "rewards/margins": 2.587517754236857, "rewards/rejected": -2.0946107864379884, "step": 10811 }, { "epoch": 0.5730792674846951, "grad_norm": 65.5, "kl": 1.2096328735351562, "learning_rate": 5e-07, "logits/chosen": -5330296.0, "logits/rejected": -18086569.6, "logps/chosen": -211.74104817708334, "logps/rejected": -342.9568603515625, "loss": 0.2677, "rewards/chosen": 0.6160868803660074, "rewards/margins": 3.03881770769755, "rewards/rejected": -2.422730827331543, "step": 10812 }, { "epoch": 0.5731322714864973, "grad_norm": 46.5, "kl": 0.6331253051757812, "learning_rate": 5e-07, "logits/chosen": -25153290.0, "logits/rejected": -69138664.0, "logps/chosen": -661.427734375, "logps/rejected": -279.60968017578125, "loss": 0.1923, "rewards/chosen": 1.2897627353668213, "rewards/margins": 3.5882272720336914, "rewards/rejected": -2.29846453666687, "step": 10813 }, { "epoch": 0.5731852754882993, "grad_norm": 47.75, "kl": 2.4493579864501953, "learning_rate": 5e-07, "logits/chosen": -11273584.8, "logits/rejected": -44110770.666666664, "logps/chosen": -148.4322021484375, "logps/rejected": -488.9655354817708, "loss": 0.375, "rewards/chosen": 0.4098963737487793, "rewards/margins": 2.0858750343322754, "rewards/rejected": -1.675978660583496, "step": 10814 }, { "epoch": 0.5732382794901015, "grad_norm": 35.0, "kl": 1.8925342559814453, "learning_rate": 5e-07, "logits/chosen": -10848288.0, "logits/rejected": -4950115.0, "logps/chosen": -204.6351318359375, "logps/rejected": -152.25320434570312, "loss": 0.3987, "rewards/chosen": -0.4511148929595947, "rewards/margins": 1.7814970016479492, "rewards/rejected": -2.232611894607544, "step": 10815 }, { "epoch": 0.5732912834919036, "grad_norm": 42.25, "kl": 1.0081443786621094, "learning_rate": 5e-07, "logits/chosen": -21740697.6, "logits/rejected": -33541098.666666668, "logps/chosen": -144.38619384765624, "logps/rejected": -331.01906331380206, "loss": 0.3707, "rewards/chosen": 0.17390496730804444, "rewards/margins": 2.4755875984827678, "rewards/rejected": -2.301682631174723, "step": 10816 }, { "epoch": 0.5733442874937058, "grad_norm": 67.5, "kl": 0.6891555786132812, "learning_rate": 5e-07, "logits/chosen": -7910918.4, "logits/rejected": -11072991.333333334, "logps/chosen": -274.1080322265625, "logps/rejected": -250.04583740234375, "loss": 0.4466, "rewards/chosen": 0.4660125732421875, "rewards/margins": 0.5148981670538584, "rewards/rejected": -0.04888559381167094, "step": 10817 }, { "epoch": 0.5733972914955079, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13776804.0, "logits/rejected": -12049776.0, "logps/chosen": -264.1430969238281, "logps/rejected": -402.361083984375, "loss": 0.285, "rewards/chosen": 0.32868319749832153, "rewards/margins": 2.277140200138092, "rewards/rejected": -1.9484570026397705, "step": 10818 }, { "epoch": 0.5734502954973101, "grad_norm": 42.75, "kl": 0.2923431396484375, "learning_rate": 5e-07, "logits/chosen": -62674360.0, "logits/rejected": -12655497.0, "logps/chosen": -403.03656005859375, "logps/rejected": -293.7702941894531, "loss": 0.2095, "rewards/chosen": 0.9222536087036133, "rewards/margins": 3.4923667907714844, "rewards/rejected": -2.570113182067871, "step": 10819 }, { "epoch": 0.5735032994991122, "grad_norm": 44.0, "kl": 1.189565658569336, "learning_rate": 5e-07, "logits/chosen": -13666663.0, "logits/rejected": 12662575.0, "logps/chosen": -276.65081787109375, "logps/rejected": -445.545654296875, "loss": 0.2186, "rewards/chosen": 0.825927734375, "rewards/margins": 3.7754602432250977, "rewards/rejected": -2.9495325088500977, "step": 10820 }, { "epoch": 0.5735563035009144, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44403925.333333336, "logits/rejected": -47735046.4, "logps/chosen": -284.765625, "logps/rejected": -224.8958984375, "loss": 0.277, "rewards/chosen": 0.3656868537267049, "rewards/margins": 1.8680428107579548, "rewards/rejected": -1.50235595703125, "step": 10821 }, { "epoch": 0.5736093075027164, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8976674.0, "logits/rejected": -39291366.4, "logps/chosen": -344.9477945963542, "logps/rejected": -424.475537109375, "loss": 0.2317, "rewards/chosen": 0.38580119609832764, "rewards/margins": 2.887151598930359, "rewards/rejected": -2.5013504028320312, "step": 10822 }, { "epoch": 0.5736623115045186, "grad_norm": 58.25, "kl": 1.9946022033691406, "learning_rate": 5e-07, "logits/chosen": -43094240.0, "logits/rejected": -18453090.666666668, "logps/chosen": -346.05478515625, "logps/rejected": -327.09141031901044, "loss": 0.3579, "rewards/chosen": 0.2007589817047119, "rewards/margins": 2.5810126145680745, "rewards/rejected": -2.380253632863363, "step": 10823 }, { "epoch": 0.5737153155063207, "grad_norm": 46.0, "kl": 1.4359664916992188, "learning_rate": 5e-07, "logits/chosen": 9259743.0, "logits/rejected": -27169307.42857143, "logps/chosen": -554.8516235351562, "logps/rejected": -282.90426199776783, "loss": 0.1071, "rewards/chosen": 2.002471923828125, "rewards/margins": 4.212410381862096, "rewards/rejected": -2.2099384580339705, "step": 10824 }, { "epoch": 0.5737683195081229, "grad_norm": 54.0, "kl": 0.5197944641113281, "learning_rate": 5e-07, "logits/chosen": -14598630.4, "logits/rejected": -17137330.666666668, "logps/chosen": -247.2066650390625, "logps/rejected": -155.23505655924478, "loss": 0.2763, "rewards/chosen": 0.7516611576080322, "rewards/margins": 2.5857083161671954, "rewards/rejected": -1.8340471585591633, "step": 10825 }, { "epoch": 0.573821323509925, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82912224.0, "logits/rejected": 10341648.0, "logps/chosen": -399.8580810546875, "logps/rejected": -345.9009195963542, "loss": 0.3088, "rewards/chosen": 0.3710321426391602, "rewards/margins": 2.504229132334391, "rewards/rejected": -2.133196989695231, "step": 10826 }, { "epoch": 0.5738743275117272, "grad_norm": 49.75, "kl": 5.622772216796875, "learning_rate": 5e-07, "logits/chosen": -17023781.714285713, "logits/rejected": 11362258.0, "logps/chosen": -300.04195731026783, "logps/rejected": -314.2626953125, "loss": 0.4773, "rewards/chosen": 0.66504853112357, "rewards/margins": 2.221255830356053, "rewards/rejected": -1.556207299232483, "step": 10827 }, { "epoch": 0.5739273315135293, "grad_norm": 60.0, "kl": 2.833629608154297, "learning_rate": 5e-07, "logits/chosen": -40484141.71428572, "logits/rejected": 4568035.0, "logps/chosen": -328.42152622767856, "logps/rejected": -40.425209045410156, "loss": 0.3975, "rewards/chosen": 0.8889711924961635, "rewards/margins": 0.9445429385772773, "rewards/rejected": -0.055571746081113815, "step": 10828 }, { "epoch": 0.5739803355153315, "grad_norm": 47.25, "kl": 2.116196632385254, "learning_rate": 5e-07, "logits/chosen": -24979422.4, "logits/rejected": 5524856.666666667, "logps/chosen": -243.3833984375, "logps/rejected": -208.9049275716146, "loss": 0.3095, "rewards/chosen": 0.7911478996276855, "rewards/margins": 2.2400915145874025, "rewards/rejected": -1.4489436149597168, "step": 10829 }, { "epoch": 0.5740333395171335, "grad_norm": 57.25, "kl": 1.2334823608398438, "learning_rate": 5e-07, "logits/chosen": 8682162.666666666, "logits/rejected": -32842960.0, "logps/chosen": -424.5313720703125, "logps/rejected": -511.03427734375, "loss": 0.1958, "rewards/chosen": 0.5976933638254801, "rewards/margins": 3.3279664198557533, "rewards/rejected": -2.7302730560302733, "step": 10830 }, { "epoch": 0.5740863435189357, "grad_norm": 62.0, "kl": 1.0844717025756836, "learning_rate": 5e-07, "logits/chosen": -27407488.0, "logits/rejected": -6437184.0, "logps/chosen": -550.0254516601562, "logps/rejected": -394.11309814453125, "loss": 0.2902, "rewards/chosen": 0.7674127817153931, "rewards/margins": 2.741620421409607, "rewards/rejected": -1.9742076396942139, "step": 10831 }, { "epoch": 0.5741393475207378, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21638411.2, "logits/rejected": -29451784.0, "logps/chosen": -191.411083984375, "logps/rejected": -329.59027099609375, "loss": 0.3305, "rewards/chosen": 0.18474565744400023, "rewards/margins": 2.6663874268531798, "rewards/rejected": -2.4816417694091797, "step": 10832 }, { "epoch": 0.57419235152254, "grad_norm": 49.5, "kl": 3.2335205078125, "learning_rate": 5e-07, "logits/chosen": -3496926.25, "logits/rejected": -9861944.0, "logps/chosen": -441.5977478027344, "logps/rejected": -135.4359130859375, "loss": 0.3125, "rewards/chosen": 0.2735549807548523, "rewards/margins": 2.2195865511894226, "rewards/rejected": -1.9460315704345703, "step": 10833 }, { "epoch": 0.5742453555243421, "grad_norm": 72.0, "kl": 1.173253059387207, "learning_rate": 5e-07, "logits/chosen": -61111449.6, "logits/rejected": -11368804.0, "logps/chosen": -441.02548828125, "logps/rejected": -339.0194905598958, "loss": 0.3309, "rewards/chosen": 0.48371429443359376, "rewards/margins": 3.148148250579834, "rewards/rejected": -2.6644339561462402, "step": 10834 }, { "epoch": 0.5742983595261443, "grad_norm": 53.5, "kl": 0.23699188232421875, "learning_rate": 5e-07, "logits/chosen": -39473925.333333336, "logits/rejected": -44000265.6, "logps/chosen": -362.0120849609375, "logps/rejected": -518.06005859375, "loss": 0.2528, "rewards/chosen": 0.32481157779693604, "rewards/margins": 2.550616478919983, "rewards/rejected": -2.225804901123047, "step": 10835 }, { "epoch": 0.5743513635279464, "grad_norm": 47.75, "kl": 0.4691009521484375, "learning_rate": 5e-07, "logits/chosen": -24325368.0, "logits/rejected": -40541800.0, "logps/chosen": -354.0438232421875, "logps/rejected": -471.682861328125, "loss": 0.2828, "rewards/chosen": 0.4188787341117859, "rewards/margins": 2.858486831188202, "rewards/rejected": -2.439608097076416, "step": 10836 }, { "epoch": 0.5744043675297485, "grad_norm": 45.5, "kl": 1.9422225952148438, "learning_rate": 5e-07, "logits/chosen": -25812402.0, "logits/rejected": -62392064.0, "logps/chosen": -284.097900390625, "logps/rejected": -183.9354451497396, "loss": 0.2874, "rewards/chosen": 0.3160633146762848, "rewards/margins": 1.925813267628352, "rewards/rejected": -1.6097499529520671, "step": 10837 }, { "epoch": 0.5744573715315506, "grad_norm": 44.25, "kl": 1.8069095611572266, "learning_rate": 5e-07, "logits/chosen": -31213491.2, "logits/rejected": -14181520.0, "logps/chosen": -265.5072265625, "logps/rejected": -361.1371256510417, "loss": 0.2689, "rewards/chosen": 1.1856669425964355, "rewards/margins": 2.8824283917744955, "rewards/rejected": -1.6967614491780598, "step": 10838 }, { "epoch": 0.5745103755333527, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45911450.666666664, "logits/rejected": -25269334.4, "logps/chosen": -494.3139241536458, "logps/rejected": -190.8418212890625, "loss": 0.2906, "rewards/chosen": 0.25629274050394696, "rewards/margins": 1.6942007223765057, "rewards/rejected": -1.4379079818725586, "step": 10839 }, { "epoch": 0.5745633795351549, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -143307696.0, "logits/rejected": -2572554.5, "logps/chosen": -211.9827423095703, "logps/rejected": -161.4136199951172, "loss": 0.2009, "rewards/chosen": 0.7366833686828613, "rewards/margins": 3.7442216873168945, "rewards/rejected": -3.007538318634033, "step": 10840 }, { "epoch": 0.574616383536957, "grad_norm": 35.5, "kl": 0.8429813385009766, "learning_rate": 5e-07, "logits/chosen": 9539353.333333334, "logits/rejected": -49550.2, "logps/chosen": -178.9509073893229, "logps/rejected": -232.9515380859375, "loss": 0.2583, "rewards/chosen": 0.9647078514099121, "rewards/margins": 2.3061777114868165, "rewards/rejected": -1.3414698600769044, "step": 10841 }, { "epoch": 0.5746693875387592, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33129922.0, "logits/rejected": 6116503.5, "logps/chosen": -211.43600463867188, "logps/rejected": -191.8804931640625, "loss": 0.3276, "rewards/chosen": 0.8920660018920898, "rewards/margins": 1.9102205038070679, "rewards/rejected": -1.018154501914978, "step": 10842 }, { "epoch": 0.5747223915405613, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18117124.8, "logits/rejected": -4292681.333333333, "logps/chosen": -272.3890869140625, "logps/rejected": -393.34130859375, "loss": 0.3358, "rewards/chosen": 0.1695024013519287, "rewards/margins": 2.683975839614868, "rewards/rejected": -2.5144734382629395, "step": 10843 }, { "epoch": 0.5747753955423635, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14949912.0, "logits/rejected": -13528936.0, "logps/chosen": -408.8377990722656, "logps/rejected": -206.50482177734375, "loss": 0.3546, "rewards/chosen": -0.07937563955783844, "rewards/margins": 1.5709684938192368, "rewards/rejected": -1.6503441333770752, "step": 10844 }, { "epoch": 0.5748283995441655, "grad_norm": 53.0, "kl": 3.2038345336914062, "learning_rate": 5e-07, "logits/chosen": -30564790.0, "logits/rejected": 751051.0, "logps/chosen": -298.34979248046875, "logps/rejected": -183.28518676757812, "loss": 0.314, "rewards/chosen": 0.6049783229827881, "rewards/margins": 1.9883817434310913, "rewards/rejected": -1.3834034204483032, "step": 10845 }, { "epoch": 0.5748814035459677, "grad_norm": 52.25, "kl": 1.5585460662841797, "learning_rate": 5e-07, "logits/chosen": -21580337.6, "logits/rejected": -37258381.333333336, "logps/chosen": -307.6365478515625, "logps/rejected": -356.8702799479167, "loss": 0.266, "rewards/chosen": 0.7223014831542969, "rewards/margins": 3.6156946818033853, "rewards/rejected": -2.8933931986490884, "step": 10846 }, { "epoch": 0.5749344075477698, "grad_norm": 55.25, "kl": 1.1243724822998047, "learning_rate": 5e-07, "logits/chosen": -26433664.0, "logits/rejected": -8019599.0, "logps/chosen": -304.0726013183594, "logps/rejected": -119.13434600830078, "loss": 0.3195, "rewards/chosen": 0.3241207003593445, "rewards/margins": 2.69533509016037, "rewards/rejected": -2.3712143898010254, "step": 10847 }, { "epoch": 0.574987411549572, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6752417.0, "logits/rejected": -19323618.666666668, "logps/chosen": -69.47657012939453, "logps/rejected": -395.7169596354167, "loss": 0.2668, "rewards/chosen": -0.25259217619895935, "rewards/margins": 1.4658709267775218, "rewards/rejected": -1.7184631029764812, "step": 10848 }, { "epoch": 0.5750404155513741, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5039427.0, "logits/rejected": -15175540.57142857, "logps/chosen": -82.83341979980469, "logps/rejected": -506.68422154017856, "loss": 0.2139, "rewards/chosen": -1.1072815656661987, "rewards/margins": 1.9069535221372331, "rewards/rejected": -3.014235087803432, "step": 10849 }, { "epoch": 0.5750934195531763, "grad_norm": 44.75, "kl": 3.5967321395874023, "learning_rate": 5e-07, "logits/chosen": -13697295.0, "logits/rejected": -18215878.0, "logps/chosen": -221.186279296875, "logps/rejected": -266.1688537597656, "loss": 0.2643, "rewards/chosen": 0.924086332321167, "rewards/margins": 3.225783348083496, "rewards/rejected": -2.301697015762329, "step": 10850 }, { "epoch": 0.5751464235549784, "grad_norm": 59.5, "kl": 0.12406730651855469, "learning_rate": 5e-07, "logits/chosen": -28061033.6, "logits/rejected": -38204616.0, "logps/chosen": -367.35771484375, "logps/rejected": -204.35262044270834, "loss": 0.2762, "rewards/chosen": 0.5989551067352294, "rewards/margins": 2.610677989323934, "rewards/rejected": -2.0117228825887046, "step": 10851 }, { "epoch": 0.5751994275567806, "grad_norm": 47.5, "kl": 0.9394378662109375, "learning_rate": 5e-07, "logits/chosen": -31552069.333333332, "logits/rejected": -28748288.0, "logps/chosen": -207.6069132486979, "logps/rejected": -306.8819580078125, "loss": 0.2494, "rewards/chosen": 0.593292236328125, "rewards/margins": 2.310409927368164, "rewards/rejected": -1.717117691040039, "step": 10852 }, { "epoch": 0.5752524315585826, "grad_norm": 40.25, "kl": 6.315252304077148, "learning_rate": 5e-07, "logits/chosen": -22055320.0, "logits/rejected": -20408194.666666668, "logps/chosen": -236.4314697265625, "logps/rejected": -286.2696126302083, "loss": 0.2669, "rewards/chosen": 1.4752711296081542, "rewards/margins": 2.9110058784484862, "rewards/rejected": -1.435734748840332, "step": 10853 }, { "epoch": 0.5753054355603848, "grad_norm": 44.25, "kl": 1.219583511352539, "learning_rate": 5e-07, "logits/chosen": -13051078.4, "logits/rejected": -30004448.0, "logps/chosen": -345.268994140625, "logps/rejected": -288.4862060546875, "loss": 0.2174, "rewards/chosen": 1.0213871002197266, "rewards/margins": 3.8675969441731772, "rewards/rejected": -2.8462098439534507, "step": 10854 }, { "epoch": 0.5753584395621869, "grad_norm": 55.5, "kl": 2.7299022674560547, "learning_rate": 5e-07, "logits/chosen": -14210365.333333334, "logits/rejected": -49178630.4, "logps/chosen": -352.5114339192708, "logps/rejected": -270.592333984375, "loss": 0.2602, "rewards/chosen": 0.633735179901123, "rewards/margins": 2.151337718963623, "rewards/rejected": -1.5176025390625, "step": 10855 }, { "epoch": 0.5754114435639891, "grad_norm": 48.0, "kl": 0.10723876953125, "learning_rate": 5e-07, "logits/chosen": -14544956.8, "logits/rejected": -36196330.666666664, "logps/chosen": -284.594677734375, "logps/rejected": -174.9177449544271, "loss": 0.2848, "rewards/chosen": 0.37523193359375, "rewards/margins": 3.331862004597982, "rewards/rejected": -2.956630071004232, "step": 10856 }, { "epoch": 0.5754644475657912, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66913476.0, "logits/rejected": -29958976.0, "logps/chosen": -107.95863342285156, "logps/rejected": -339.92640904017856, "loss": 0.1872, "rewards/chosen": 0.329476922750473, "rewards/margins": 2.4529171202863966, "rewards/rejected": -2.1234401975359236, "step": 10857 }, { "epoch": 0.5755174515675934, "grad_norm": 64.5, "kl": 3.1248550415039062, "learning_rate": 5e-07, "logits/chosen": -46250355.2, "logits/rejected": -34012666.666666664, "logps/chosen": -413.9677734375, "logps/rejected": -193.07088216145834, "loss": 0.4729, "rewards/chosen": 0.01655837893486023, "rewards/margins": 0.38774141669273376, "rewards/rejected": -0.37118303775787354, "step": 10858 }, { "epoch": 0.5755704555693955, "grad_norm": 44.25, "kl": 3.8564834594726562, "learning_rate": 5e-07, "logits/chosen": -18592206.85714286, "logits/rejected": 1318503.5, "logps/chosen": -158.48793247767858, "logps/rejected": -158.81570434570312, "loss": 0.3573, "rewards/chosen": 0.8561116627284459, "rewards/margins": 2.2990255526133945, "rewards/rejected": -1.4429138898849487, "step": 10859 }, { "epoch": 0.5756234595711976, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18444006.666666668, "logits/rejected": -29277596.8, "logps/chosen": -178.40045166015625, "logps/rejected": -153.44180908203126, "loss": 0.2693, "rewards/chosen": 0.3543039560317993, "rewards/margins": 2.128555512428284, "rewards/rejected": -1.7742515563964845, "step": 10860 }, { "epoch": 0.5756764635729997, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13622480.0, "logits/rejected": -10445071.2, "logps/chosen": -423.5001627604167, "logps/rejected": -438.30107421875, "loss": 0.3204, "rewards/chosen": 0.15719095865885416, "rewards/margins": 1.7096602121988933, "rewards/rejected": -1.552469253540039, "step": 10861 }, { "epoch": 0.5757294675748019, "grad_norm": 44.75, "kl": 0.09045791625976562, "learning_rate": 5e-07, "logits/chosen": -9782132.0, "logits/rejected": -26005040.0, "logps/chosen": -315.32073974609375, "logps/rejected": -421.1148986816406, "loss": 0.2331, "rewards/chosen": 0.8419455289840698, "rewards/margins": 3.1353472471237183, "rewards/rejected": -2.2934017181396484, "step": 10862 }, { "epoch": 0.575782471576604, "grad_norm": 41.0, "kl": 2.359940528869629, "learning_rate": 5e-07, "logits/chosen": -20148174.0, "logits/rejected": -27762696.0, "logps/chosen": -248.33828735351562, "logps/rejected": -339.1108703613281, "loss": 0.2442, "rewards/chosen": 0.6053231954574585, "rewards/margins": 3.0506116151809692, "rewards/rejected": -2.4452884197235107, "step": 10863 }, { "epoch": 0.5758354755784062, "grad_norm": 35.75, "kl": 0.18034744262695312, "learning_rate": 5e-07, "logits/chosen": 12578138.0, "logits/rejected": -34010213.333333336, "logps/chosen": -42.78617858886719, "logps/rejected": -417.2476399739583, "loss": 0.2084, "rewards/chosen": 0.5885820388793945, "rewards/margins": 2.851473331451416, "rewards/rejected": -2.2628912925720215, "step": 10864 }, { "epoch": 0.5758884795802083, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44030448.0, "logits/rejected": -34330004.0, "logps/chosen": -256.2267150878906, "logps/rejected": -492.9580383300781, "loss": 0.2616, "rewards/chosen": 0.17632831633090973, "rewards/margins": 3.1440261751413345, "rewards/rejected": -2.967697858810425, "step": 10865 }, { "epoch": 0.5759414835820105, "grad_norm": 34.25, "kl": 2.5551109313964844, "learning_rate": 5e-07, "logits/chosen": 7357596.0, "logits/rejected": -46518793.6, "logps/chosen": -120.64707438151042, "logps/rejected": -446.959912109375, "loss": 0.2367, "rewards/chosen": 0.666595458984375, "rewards/margins": 3.0113622665405275, "rewards/rejected": -2.3447668075561525, "step": 10866 }, { "epoch": 0.5759944875838126, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17095422.0, "logits/rejected": -56233882.666666664, "logps/chosen": -459.4251708984375, "logps/rejected": -239.39518229166666, "loss": 0.2429, "rewards/chosen": 0.06876528263092041, "rewards/margins": 2.3169354995091758, "rewards/rejected": -2.2481702168782554, "step": 10867 }, { "epoch": 0.5760474915856147, "grad_norm": 70.5, "kl": 2.064962387084961, "learning_rate": 5e-07, "logits/chosen": -20681902.4, "logits/rejected": -63763594.666666664, "logps/chosen": -312.781396484375, "logps/rejected": -360.1796468098958, "loss": 0.3907, "rewards/chosen": 0.40804176330566405, "rewards/margins": 1.7076058864593506, "rewards/rejected": -1.2995641231536865, "step": 10868 }, { "epoch": 0.5761004955874168, "grad_norm": 45.0, "kl": 0.6354293823242188, "learning_rate": 5e-07, "logits/chosen": -20720108.8, "logits/rejected": -38629680.0, "logps/chosen": -562.313916015625, "logps/rejected": -331.8835856119792, "loss": 0.1863, "rewards/chosen": 1.541346549987793, "rewards/margins": 3.574378172556559, "rewards/rejected": -2.033031622568766, "step": 10869 }, { "epoch": 0.576153499589219, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25685500.8, "logits/rejected": -4960203.333333333, "logps/chosen": -266.4767578125, "logps/rejected": -442.6673583984375, "loss": 0.3598, "rewards/chosen": -0.09546059370040894, "rewards/margins": 2.733780841032664, "rewards/rejected": -2.8292414347330728, "step": 10870 }, { "epoch": 0.5762065035910211, "grad_norm": 94.5, "kl": 7.442041397094727, "learning_rate": 5e-07, "logits/chosen": -22638480.0, "logits/rejected": -2346136.0, "logps/chosen": -387.2137044270833, "logps/rejected": -85.46585845947266, "loss": 0.444, "rewards/chosen": 0.8102885087331136, "rewards/margins": 2.640259106953939, "rewards/rejected": -1.8299705982208252, "step": 10871 }, { "epoch": 0.5762595075928233, "grad_norm": 34.25, "kl": 2.318695068359375, "learning_rate": 5e-07, "logits/chosen": -11426784.0, "logits/rejected": -29409852.0, "logps/chosen": -376.5546875, "logps/rejected": -386.2838134765625, "loss": 0.3491, "rewards/chosen": 1.056729861668178, "rewards/margins": 3.0778542246137346, "rewards/rejected": -2.0211243629455566, "step": 10872 }, { "epoch": 0.5763125115946254, "grad_norm": 51.75, "kl": 3.3175411224365234, "learning_rate": 5e-07, "logits/chosen": -25038692.57142857, "logits/rejected": -14983786.0, "logps/chosen": -333.0556640625, "logps/rejected": -434.796630859375, "loss": 0.4959, "rewards/chosen": 0.09871890715190343, "rewards/margins": 2.6284125106675282, "rewards/rejected": -2.529693603515625, "step": 10873 }, { "epoch": 0.5763655155964276, "grad_norm": 56.0, "kl": 2.9532394409179688, "learning_rate": 5e-07, "logits/chosen": -70337322.66666667, "logits/rejected": -8338059.0, "logps/chosen": -350.4833984375, "logps/rejected": -381.4648742675781, "loss": 0.3724, "rewards/chosen": 0.6586539347966512, "rewards/margins": 2.522155006726583, "rewards/rejected": -1.8635010719299316, "step": 10874 }, { "epoch": 0.5764185195982297, "grad_norm": 56.25, "kl": 2.577198028564453, "learning_rate": 5e-07, "logits/chosen": 434988.9, "logits/rejected": -12871981.333333334, "logps/chosen": -487.031884765625, "logps/rejected": -705.19873046875, "loss": 0.2729, "rewards/chosen": 0.833198070526123, "rewards/margins": 3.802693525950114, "rewards/rejected": -2.9694954554239907, "step": 10875 }, { "epoch": 0.5764715236000318, "grad_norm": 49.0, "kl": 0.2081775665283203, "learning_rate": 5e-07, "logits/chosen": -27181646.0, "logits/rejected": -750357.625, "logps/chosen": -363.0559387207031, "logps/rejected": -146.97232055664062, "loss": 0.3158, "rewards/chosen": 0.059828683733940125, "rewards/margins": 2.275355026125908, "rewards/rejected": -2.2155263423919678, "step": 10876 }, { "epoch": 0.5765245276018339, "grad_norm": 63.0, "kl": 0.45792388916015625, "learning_rate": 5e-07, "logits/chosen": -29457120.0, "logits/rejected": -46776416.0, "logps/chosen": -231.7156219482422, "logps/rejected": -557.5286865234375, "loss": 0.2425, "rewards/chosen": 0.33590754866600037, "rewards/margins": 3.2853971421718597, "rewards/rejected": -2.9494895935058594, "step": 10877 }, { "epoch": 0.5765775316036361, "grad_norm": 42.0, "kl": 3.6377792358398438, "learning_rate": 5e-07, "logits/chosen": -32552707.2, "logits/rejected": -2798392.6666666665, "logps/chosen": -234.216943359375, "logps/rejected": -119.16905721028645, "loss": 0.295, "rewards/chosen": 0.719627571105957, "rewards/margins": 2.6156401952107746, "rewards/rejected": -1.8960126241048176, "step": 10878 }, { "epoch": 0.5766305356054382, "grad_norm": 38.25, "kl": 0.2275228500366211, "learning_rate": 5e-07, "logits/chosen": -21594770.0, "logits/rejected": -11744646.666666666, "logps/chosen": -147.4810791015625, "logps/rejected": -402.7156168619792, "loss": 0.1588, "rewards/chosen": 0.47954466938972473, "rewards/margins": 3.057416627804438, "rewards/rejected": -2.5778719584147134, "step": 10879 }, { "epoch": 0.5766835396072404, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2862029.75, "logits/rejected": -54251589.333333336, "logps/chosen": -445.7061767578125, "logps/rejected": -491.6253255208333, "loss": 0.156, "rewards/chosen": 1.5787216424942017, "rewards/margins": 3.743801395098368, "rewards/rejected": -2.1650797526041665, "step": 10880 }, { "epoch": 0.5767365436090425, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8774974.0, "logits/rejected": -52280784.0, "logps/chosen": -267.4005432128906, "logps/rejected": -279.3679504394531, "loss": 0.3343, "rewards/chosen": 0.20608064532279968, "rewards/margins": 1.7589806020259857, "rewards/rejected": -1.552899956703186, "step": 10881 }, { "epoch": 0.5767895476108447, "grad_norm": 33.25, "kl": 1.154855728149414, "learning_rate": 5e-07, "logits/chosen": -21642541.333333332, "logits/rejected": -22768420.8, "logps/chosen": -218.30025227864584, "logps/rejected": -418.6724609375, "loss": 0.1775, "rewards/chosen": 0.8905728658040365, "rewards/margins": 3.1710803349812826, "rewards/rejected": -2.280507469177246, "step": 10882 }, { "epoch": 0.5768425516126467, "grad_norm": 40.0, "kl": 0.9151506423950195, "learning_rate": 5e-07, "logits/chosen": -37303520.0, "logits/rejected": -12892711.0, "logps/chosen": -277.2091979980469, "logps/rejected": -193.947509765625, "loss": 0.2958, "rewards/chosen": 0.5706284046173096, "rewards/margins": 2.9433891773223877, "rewards/rejected": -2.372760772705078, "step": 10883 }, { "epoch": 0.5768955556144489, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12906557.0, "logits/rejected": -24732224.0, "logps/chosen": -54.83123016357422, "logps/rejected": -369.45211356026783, "loss": 0.2274, "rewards/chosen": -0.5261203646659851, "rewards/margins": 1.5798279472759793, "rewards/rejected": -2.1059483119419644, "step": 10884 }, { "epoch": 0.576948559616251, "grad_norm": 33.0, "kl": 2.586872100830078, "learning_rate": 5e-07, "logits/chosen": -5776230.0, "logits/rejected": -27165270.0, "logps/chosen": -566.99462890625, "logps/rejected": -239.8152313232422, "loss": 0.3281, "rewards/chosen": 0.6725884675979614, "rewards/margins": 2.234005331993103, "rewards/rejected": -1.5614168643951416, "step": 10885 }, { "epoch": 0.5770015636180532, "grad_norm": 50.75, "kl": 2.3388633728027344, "learning_rate": 5e-07, "logits/chosen": -20536618.666666668, "logits/rejected": 10266894.0, "logps/chosen": -233.95699055989584, "logps/rejected": -315.15277099609375, "loss": 0.3639, "rewards/chosen": 0.7618043422698975, "rewards/margins": 2.268019199371338, "rewards/rejected": -1.5062148571014404, "step": 10886 }, { "epoch": 0.5770545676198553, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16530053.0, "logits/rejected": 3332051.0, "logps/chosen": -294.5520935058594, "logps/rejected": -216.92122395833334, "loss": 0.2376, "rewards/chosen": 0.5977863073348999, "rewards/margins": 2.383858561515808, "rewards/rejected": -1.7860722541809082, "step": 10887 }, { "epoch": 0.5771075716216575, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8159178.0, "logits/rejected": -28117116.8, "logps/chosen": -204.33182779947916, "logps/rejected": -341.337255859375, "loss": 0.1729, "rewards/chosen": 0.7468918164571127, "rewards/margins": 3.2273304303487143, "rewards/rejected": -2.4804386138916015, "step": 10888 }, { "epoch": 0.5771605756234596, "grad_norm": 65.0, "kl": 0.18732452392578125, "learning_rate": 5e-07, "logits/chosen": -33936514.666666664, "logits/rejected": -40846896.0, "logps/chosen": -496.33203125, "logps/rejected": -437.81390380859375, "loss": 0.3655, "rewards/chosen": 0.15719654162724814, "rewards/margins": 2.941944142182668, "rewards/rejected": -2.78474760055542, "step": 10889 }, { "epoch": 0.5772135796252617, "grad_norm": 41.75, "kl": 2.3064727783203125, "learning_rate": 5e-07, "logits/chosen": -70217817.6, "logits/rejected": -25684314.666666668, "logps/chosen": -387.95947265625, "logps/rejected": -303.6458333333333, "loss": 0.3444, "rewards/chosen": 0.9226817131042481, "rewards/margins": 2.543930912017822, "rewards/rejected": -1.6212491989135742, "step": 10890 }, { "epoch": 0.5772665836270638, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5052592.666666667, "logits/rejected": -18974352.0, "logps/chosen": -96.29395548502605, "logps/rejected": -219.3912841796875, "loss": 0.2855, "rewards/chosen": -0.14413978656133017, "rewards/margins": 2.1095345298449195, "rewards/rejected": -2.25367431640625, "step": 10891 }, { "epoch": 0.5773195876288659, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65714613.333333336, "logits/rejected": -3768556.4, "logps/chosen": -345.9274088541667, "logps/rejected": -408.599169921875, "loss": 0.2467, "rewards/chosen": 0.08822836478551228, "rewards/margins": 2.9976754625638327, "rewards/rejected": -2.9094470977783202, "step": 10892 }, { "epoch": 0.5773725916306681, "grad_norm": 59.5, "kl": 0.1581287384033203, "learning_rate": 5e-07, "logits/chosen": -22615948.8, "logits/rejected": -12780830.666666666, "logps/chosen": -390.4362548828125, "logps/rejected": -382.4287923177083, "loss": 0.316, "rewards/chosen": 0.2573408603668213, "rewards/margins": 2.86101876894633, "rewards/rejected": -2.6036779085795083, "step": 10893 }, { "epoch": 0.5774255956324702, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57007046.4, "logits/rejected": 241737.91666666666, "logps/chosen": -388.0984130859375, "logps/rejected": -161.845703125, "loss": 0.3541, "rewards/chosen": 0.6065991401672364, "rewards/margins": 1.3239648818969727, "rewards/rejected": -0.7173657417297363, "step": 10894 }, { "epoch": 0.5774785996342724, "grad_norm": 89.0, "kl": 7.261627197265625, "learning_rate": 5e-07, "logits/chosen": -29428378.666666668, "logits/rejected": -19830424.0, "logps/chosen": -707.5362955729166, "logps/rejected": -323.640380859375, "loss": 0.418, "rewards/chosen": 1.0190059343973796, "rewards/margins": 2.4654720226923628, "rewards/rejected": -1.446466088294983, "step": 10895 }, { "epoch": 0.5775316036360745, "grad_norm": 43.75, "kl": 5.854217529296875, "learning_rate": 5e-07, "logits/chosen": -35196761.6, "logits/rejected": -11442768.0, "logps/chosen": -394.517626953125, "logps/rejected": -149.80318196614584, "loss": 0.2897, "rewards/chosen": 1.4880802154541015, "rewards/margins": 3.0070030212402346, "rewards/rejected": -1.5189228057861328, "step": 10896 }, { "epoch": 0.5775846076378767, "grad_norm": 41.0, "kl": 0.1456012725830078, "learning_rate": 5e-07, "logits/chosen": -38889876.0, "logits/rejected": -11515681.333333334, "logps/chosen": -748.5994873046875, "logps/rejected": -167.12789916992188, "loss": 0.1911, "rewards/chosen": 1.5480026006698608, "rewards/margins": 3.345030188560486, "rewards/rejected": -1.797027587890625, "step": 10897 }, { "epoch": 0.5776376116396787, "grad_norm": 46.5, "kl": 0.6251258850097656, "learning_rate": 5e-07, "logits/chosen": -26048662.4, "logits/rejected": -52865402.666666664, "logps/chosen": -190.07020263671876, "logps/rejected": -452.9716796875, "loss": 0.2209, "rewards/chosen": 0.9285702705383301, "rewards/margins": 4.119983514149984, "rewards/rejected": -3.191413243611654, "step": 10898 }, { "epoch": 0.5776906156414809, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36439664.0, "logits/rejected": 26321434.0, "logps/chosen": -268.3853352864583, "logps/rejected": -259.4220275878906, "loss": 0.3961, "rewards/chosen": 0.36973877747853595, "rewards/margins": 1.0932561953862507, "rewards/rejected": -0.7235174179077148, "step": 10899 }, { "epoch": 0.577743619643283, "grad_norm": 43.25, "kl": 0.040863037109375, "learning_rate": 5e-07, "logits/chosen": -47705504.0, "logits/rejected": -16268296.0, "logps/chosen": -422.8503112792969, "logps/rejected": -442.2522277832031, "loss": 0.2559, "rewards/chosen": 0.5870607495307922, "rewards/margins": 2.934254229068756, "rewards/rejected": -2.347193479537964, "step": 10900 }, { "epoch": 0.5777966236450852, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7719185.0, "logits/rejected": -16472582.857142856, "logps/chosen": -680.153564453125, "logps/rejected": -296.84458705357144, "loss": 0.1819, "rewards/chosen": 1.7645447254180908, "rewards/margins": 3.4406217506953647, "rewards/rejected": -1.676077025277274, "step": 10901 }, { "epoch": 0.5778496276468873, "grad_norm": 41.75, "kl": 0.893829345703125, "learning_rate": 5e-07, "logits/chosen": -33614889.6, "logits/rejected": -51349194.666666664, "logps/chosen": -211.7412353515625, "logps/rejected": -410.4187825520833, "loss": 0.3785, "rewards/chosen": -0.25090005397796633, "rewards/margins": 3.052420194943746, "rewards/rejected": -3.3033202489217124, "step": 10902 }, { "epoch": 0.5779026316486895, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8921075.0, "logits/rejected": 149925237.33333334, "logps/chosen": -401.3598937988281, "logps/rejected": -269.746337890625, "loss": 0.1729, "rewards/chosen": 1.152439832687378, "rewards/margins": 3.0450135072072344, "rewards/rejected": -1.8925736745198567, "step": 10903 }, { "epoch": 0.5779556356504916, "grad_norm": 62.75, "kl": 0.8826227188110352, "learning_rate": 5e-07, "logits/chosen": 28866760.0, "logits/rejected": -14627083.2, "logps/chosen": -396.0517578125, "logps/rejected": -184.06663818359374, "loss": 0.2996, "rewards/chosen": 0.2533927957216899, "rewards/margins": 1.966335109869639, "rewards/rejected": -1.7129423141479492, "step": 10904 }, { "epoch": 0.5780086396522938, "grad_norm": 40.75, "kl": 0.37783050537109375, "learning_rate": 5e-07, "logits/chosen": -42540940.0, "logits/rejected": -38400660.0, "logps/chosen": -560.362060546875, "logps/rejected": -381.00048828125, "loss": 0.2503, "rewards/chosen": 0.8578495383262634, "rewards/margins": 3.3748039603233337, "rewards/rejected": -2.5169544219970703, "step": 10905 }, { "epoch": 0.5780616436540958, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35890776.0, "logits/rejected": -32407686.0, "logps/chosen": -267.89007568359375, "logps/rejected": -345.66094970703125, "loss": 0.2084, "rewards/chosen": 0.6854511499404907, "rewards/margins": 3.72845995426178, "rewards/rejected": -3.043008804321289, "step": 10906 }, { "epoch": 0.578114647655898, "grad_norm": 36.0, "kl": 2.0075387954711914, "learning_rate": 5e-07, "logits/chosen": 1747446.0, "logits/rejected": -9831090.0, "logps/chosen": -288.96075439453125, "logps/rejected": -226.51849365234375, "loss": 0.3634, "rewards/chosen": 0.8817004561424255, "rewards/margins": 1.6053648591041565, "rewards/rejected": -0.723664402961731, "step": 10907 }, { "epoch": 0.5781676516577001, "grad_norm": 75.5, "kl": 1.8061752319335938, "learning_rate": 5e-07, "logits/chosen": 12233904.8, "logits/rejected": 21707816.0, "logps/chosen": -105.85565185546875, "logps/rejected": -256.64117431640625, "loss": 0.4078, "rewards/chosen": 0.1472449541091919, "rewards/margins": 1.1159714301427206, "rewards/rejected": -0.9687264760335287, "step": 10908 }, { "epoch": 0.5782206556595023, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1662350.25, "logits/rejected": -34639800.0, "logps/chosen": -130.8402099609375, "logps/rejected": -555.6124267578125, "loss": 0.1917, "rewards/chosen": 0.8018472790718079, "rewards/margins": 3.974643290042877, "rewards/rejected": -3.1727960109710693, "step": 10909 }, { "epoch": 0.5782736596613044, "grad_norm": 58.75, "kl": 0.3935880661010742, "learning_rate": 5e-07, "logits/chosen": -48799018.666666664, "logits/rejected": 393319.0, "logps/chosen": -265.146240234375, "logps/rejected": -324.700146484375, "loss": 0.3408, "rewards/chosen": 0.11348596215248108, "rewards/margins": 1.2348669588565826, "rewards/rejected": -1.1213809967041015, "step": 10910 }, { "epoch": 0.5783266636631066, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83211104.0, "logits/rejected": -27442472.0, "logps/chosen": -629.9205932617188, "logps/rejected": -472.6255696614583, "loss": 0.1591, "rewards/chosen": 0.9722229242324829, "rewards/margins": 3.8854687611262, "rewards/rejected": -2.9132458368937173, "step": 10911 }, { "epoch": 0.5783796676649087, "grad_norm": 50.75, "kl": 2.6027984619140625, "learning_rate": 5e-07, "logits/chosen": -26162307.2, "logits/rejected": -36819712.0, "logps/chosen": -417.186181640625, "logps/rejected": -362.4198811848958, "loss": 0.2863, "rewards/chosen": 1.2016304016113282, "rewards/margins": 3.079283841451009, "rewards/rejected": -1.877653439839681, "step": 10912 }, { "epoch": 0.5784326716667109, "grad_norm": 52.75, "kl": 4.081230163574219, "learning_rate": 5e-07, "logits/chosen": -20371844.57142857, "logits/rejected": -66099808.0, "logps/chosen": -346.337646484375, "logps/rejected": -221.72720336914062, "loss": 0.2955, "rewards/chosen": 1.4721975326538086, "rewards/margins": 3.043131113052368, "rewards/rejected": -1.5709335803985596, "step": 10913 }, { "epoch": 0.5784856756685129, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15172172.0, "logits/rejected": -31439945.14285714, "logps/chosen": -330.5965576171875, "logps/rejected": -297.8125, "loss": 0.1475, "rewards/chosen": 0.7860962152481079, "rewards/margins": 3.4211166075297763, "rewards/rejected": -2.6350203922816684, "step": 10914 }, { "epoch": 0.5785386796703151, "grad_norm": 72.0, "kl": 0.13446044921875, "learning_rate": 5e-07, "logits/chosen": -19955008.0, "logits/rejected": -10707304.666666666, "logps/chosen": -275.50830078125, "logps/rejected": -347.2126871744792, "loss": 0.4099, "rewards/chosen": -0.2488095760345459, "rewards/margins": 1.528657039006551, "rewards/rejected": -1.777466615041097, "step": 10915 }, { "epoch": 0.5785916836721172, "grad_norm": 63.25, "kl": 0.162353515625, "learning_rate": 5e-07, "logits/chosen": -37211644.8, "logits/rejected": -7867348.0, "logps/chosen": -359.64892578125, "logps/rejected": -243.67793782552084, "loss": 0.2846, "rewards/chosen": 0.7657500267028808, "rewards/margins": 2.1336963335673014, "rewards/rejected": -1.3679463068644206, "step": 10916 }, { "epoch": 0.5786446876739194, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2389475.5, "logits/rejected": -22271440.0, "logps/chosen": -249.45654296875, "logps/rejected": -136.5054443359375, "loss": 0.2386, "rewards/chosen": 0.5962401231129965, "rewards/margins": 2.6162093003590905, "rewards/rejected": -2.019969177246094, "step": 10917 }, { "epoch": 0.5786976916757215, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95450600.0, "logits/rejected": -34404742.85714286, "logps/chosen": -172.23675537109375, "logps/rejected": -407.33677455357144, "loss": 0.1307, "rewards/chosen": 0.06143493577837944, "rewards/margins": 3.9626266472041607, "rewards/rejected": -3.9011917114257812, "step": 10918 }, { "epoch": 0.5787506956775237, "grad_norm": 61.0, "kl": 0.6643829345703125, "learning_rate": 5e-07, "logits/chosen": -15311595.2, "logits/rejected": -20580645.333333332, "logps/chosen": -319.1525390625, "logps/rejected": -335.0957845052083, "loss": 0.3338, "rewards/chosen": 0.40521607398986814, "rewards/margins": 2.5482362270355225, "rewards/rejected": -2.1430201530456543, "step": 10919 }, { "epoch": 0.5788036996793258, "grad_norm": 46.5, "kl": 1.3793296813964844, "learning_rate": 5e-07, "logits/chosen": -189490336.0, "logits/rejected": -18625312.0, "logps/chosen": -1447.6669921875, "logps/rejected": -333.94370524088544, "loss": 0.1454, "rewards/chosen": 1.27043616771698, "rewards/margins": 4.205360611279806, "rewards/rejected": -2.9349244435628257, "step": 10920 }, { "epoch": 0.578856703681128, "grad_norm": 59.5, "kl": 0.5088653564453125, "learning_rate": 5e-07, "logits/chosen": 950314.0, "logits/rejected": -9291646.0, "logps/chosen": -239.572021484375, "logps/rejected": -394.889892578125, "loss": 0.3422, "rewards/chosen": -0.33632031083106995, "rewards/margins": 2.495772212743759, "rewards/rejected": -2.832092523574829, "step": 10921 }, { "epoch": 0.57890970768293, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26538118.0, "logits/rejected": -13672114.285714285, "logps/chosen": -218.19166564941406, "logps/rejected": -220.76747349330358, "loss": 0.1311, "rewards/chosen": 1.2515701055526733, "rewards/margins": 3.6927363021033153, "rewards/rejected": -2.441166196550642, "step": 10922 }, { "epoch": 0.5789627116847322, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56588800.0, "logits/rejected": -48988533.333333336, "logps/chosen": -305.652734375, "logps/rejected": -326.307373046875, "loss": 0.3124, "rewards/chosen": 0.3388259172439575, "rewards/margins": 2.4644861777623497, "rewards/rejected": -2.125660260518392, "step": 10923 }, { "epoch": 0.5790157156865343, "grad_norm": 45.25, "kl": 0.8710479736328125, "learning_rate": 5e-07, "logits/chosen": -29974714.666666668, "logits/rejected": -24543008.0, "logps/chosen": -294.94818115234375, "logps/rejected": -400.602197265625, "loss": 0.1778, "rewards/chosen": 1.3455684979756672, "rewards/margins": 3.1907554944356282, "rewards/rejected": -1.845186996459961, "step": 10924 }, { "epoch": 0.5790687196883365, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19867192.0, "logits/rejected": 340745.15, "logps/chosen": -138.57035319010416, "logps/rejected": -164.72027587890625, "loss": 0.3202, "rewards/chosen": -0.2680509090423584, "rewards/margins": 1.4818254947662353, "rewards/rejected": -1.7498764038085937, "step": 10925 }, { "epoch": 0.5791217236901386, "grad_norm": 38.75, "kl": 0.5964469909667969, "learning_rate": 5e-07, "logits/chosen": -23144229.333333332, "logits/rejected": -60715782.4, "logps/chosen": -289.4940185546875, "logps/rejected": -407.1583984375, "loss": 0.2141, "rewards/chosen": 1.523450215657552, "rewards/margins": 3.4713765462239583, "rewards/rejected": -1.9479263305664063, "step": 10926 }, { "epoch": 0.5791747276919408, "grad_norm": 54.5, "kl": 1.1298713684082031, "learning_rate": 5e-07, "logits/chosen": -38761546.666666664, "logits/rejected": -19303496.0, "logps/chosen": -345.9632161458333, "logps/rejected": -627.9722290039062, "loss": 0.3026, "rewards/chosen": 0.5436152617136637, "rewards/margins": 3.7980708281199136, "rewards/rejected": -3.25445556640625, "step": 10927 }, { "epoch": 0.5792277316937429, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62804876.0, "logits/rejected": -23392324.57142857, "logps/chosen": -398.701416015625, "logps/rejected": -189.73006766183036, "loss": 0.1464, "rewards/chosen": 0.44631654024124146, "rewards/margins": 2.978925117424556, "rewards/rejected": -2.5326085771833147, "step": 10928 }, { "epoch": 0.5792807356955451, "grad_norm": 35.25, "kl": 2.3579254150390625, "learning_rate": 5e-07, "logits/chosen": -2754700.5, "logits/rejected": 28296034.0, "logps/chosen": -48.53827667236328, "logps/rejected": -238.34011840820312, "loss": 0.3126, "rewards/chosen": 0.29120731353759766, "rewards/margins": 2.1596109867095947, "rewards/rejected": -1.868403673171997, "step": 10929 }, { "epoch": 0.5793337396973471, "grad_norm": 44.75, "kl": 1.7891674041748047, "learning_rate": 5e-07, "logits/chosen": -11852167.2, "logits/rejected": -966933.5, "logps/chosen": -179.0370849609375, "logps/rejected": -112.32269287109375, "loss": 0.3863, "rewards/chosen": 0.2661633253097534, "rewards/margins": 2.006623085339864, "rewards/rejected": -1.7404597600301106, "step": 10930 }, { "epoch": 0.5793867436991493, "grad_norm": 45.25, "kl": 2.9422407150268555, "learning_rate": 5e-07, "logits/chosen": -9043582.0, "logits/rejected": -59206936.0, "logps/chosen": -224.46099853515625, "logps/rejected": -367.02545166015625, "loss": 0.3134, "rewards/chosen": 0.7034014860788981, "rewards/margins": 4.39516297976176, "rewards/rejected": -3.6917614936828613, "step": 10931 }, { "epoch": 0.5794397477009514, "grad_norm": 70.5, "kl": 0.06750869750976562, "learning_rate": 5e-07, "logits/chosen": -48957980.0, "logits/rejected": -18630090.0, "logps/chosen": -304.683349609375, "logps/rejected": -328.52227783203125, "loss": 0.3342, "rewards/chosen": 0.2220727950334549, "rewards/margins": 1.5288271456956863, "rewards/rejected": -1.3067543506622314, "step": 10932 }, { "epoch": 0.5794927517027536, "grad_norm": 59.0, "kl": 1.0115976333618164, "learning_rate": 5e-07, "logits/chosen": -16167241.6, "logits/rejected": 2425698.0, "logps/chosen": -195.3493896484375, "logps/rejected": -66.16626485188802, "loss": 0.4455, "rewards/chosen": -0.14458069801330567, "rewards/margins": 1.1664763927459716, "rewards/rejected": -1.3110570907592773, "step": 10933 }, { "epoch": 0.5795457557045557, "grad_norm": 47.5, "kl": 0.6155567169189453, "learning_rate": 5e-07, "logits/chosen": -16402153.6, "logits/rejected": 87263504.0, "logps/chosen": -341.8459716796875, "logps/rejected": -676.1517333984375, "loss": 0.2515, "rewards/chosen": 0.8808878898620606, "rewards/margins": 4.5369849840799965, "rewards/rejected": -3.656097094217936, "step": 10934 }, { "epoch": 0.5795987597063579, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14934872.0, "logits/rejected": -38882419.2, "logps/chosen": -170.07425944010416, "logps/rejected": -268.795703125, "loss": 0.2502, "rewards/chosen": 0.2440017064412435, "rewards/margins": 2.5258647282918294, "rewards/rejected": -2.2818630218505858, "step": 10935 }, { "epoch": 0.57965176370816, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21007370.0, "logits/rejected": -18002986.666666668, "logps/chosen": -354.12188720703125, "logps/rejected": -231.9721476236979, "loss": 0.2254, "rewards/chosen": 0.33433836698532104, "rewards/margins": 2.1453242500623064, "rewards/rejected": -1.8109858830769856, "step": 10936 }, { "epoch": 0.5797047677099622, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15129160.0, "logits/rejected": -29198104.0, "logps/chosen": -140.48512268066406, "logps/rejected": -284.9808654785156, "loss": 0.3554, "rewards/chosen": -0.07307635247707367, "rewards/margins": 1.5655299574136734, "rewards/rejected": -1.638606309890747, "step": 10937 }, { "epoch": 0.5797577717117642, "grad_norm": 70.0, "kl": 4.759471893310547, "learning_rate": 5e-07, "logits/chosen": 27325078.4, "logits/rejected": -21239192.0, "logps/chosen": -644.617724609375, "logps/rejected": -101.76171875, "loss": 0.33, "rewards/chosen": 0.7018681049346924, "rewards/margins": 2.5560849984486897, "rewards/rejected": -1.8542168935139973, "step": 10938 }, { "epoch": 0.5798107757135664, "grad_norm": 61.25, "kl": 1.8210453987121582, "learning_rate": 5e-07, "logits/chosen": 25326658.666666668, "logits/rejected": -66519784.0, "logps/chosen": -433.2662760416667, "logps/rejected": -279.71136474609375, "loss": 0.377, "rewards/chosen": 0.4720723628997803, "rewards/margins": 2.7997031211853027, "rewards/rejected": -2.3276307582855225, "step": 10939 }, { "epoch": 0.5798637797153685, "grad_norm": 44.5, "kl": 1.3654708862304688, "learning_rate": 5e-07, "logits/chosen": -22436773.333333332, "logits/rejected": -28243923.2, "logps/chosen": -148.98729451497397, "logps/rejected": -334.1589111328125, "loss": 0.3188, "rewards/chosen": 0.18036290009816489, "rewards/margins": 1.933400543530782, "rewards/rejected": -1.7530376434326171, "step": 10940 }, { "epoch": 0.5799167837171706, "grad_norm": 54.25, "kl": 2.8274078369140625, "learning_rate": 5e-07, "logits/chosen": -23063378.285714287, "logits/rejected": 836433.125, "logps/chosen": -308.84151785714283, "logps/rejected": -57.13987350463867, "loss": 0.4145, "rewards/chosen": 0.4420114244733538, "rewards/margins": 2.454613276890346, "rewards/rejected": -2.012601852416992, "step": 10941 }, { "epoch": 0.5799697877189728, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4802323.6, "logits/rejected": 16991440.0, "logps/chosen": -415.54970703125, "logps/rejected": -594.0023193359375, "loss": 0.2068, "rewards/chosen": 1.13415584564209, "rewards/margins": 4.260702069600423, "rewards/rejected": -3.1265462239583335, "step": 10942 }, { "epoch": 0.5800227917207749, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64460272.0, "logits/rejected": -41305977.6, "logps/chosen": -334.15488688151044, "logps/rejected": -397.62646484375, "loss": 0.2537, "rewards/chosen": 0.35605470339457196, "rewards/margins": 2.712709824244181, "rewards/rejected": -2.3566551208496094, "step": 10943 }, { "epoch": 0.5800757957225771, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1618302.0, "logits/rejected": -15111952.0, "logps/chosen": -204.16910400390626, "logps/rejected": -222.209228515625, "loss": 0.3324, "rewards/chosen": 0.2504566669464111, "rewards/margins": 2.8557353178660074, "rewards/rejected": -2.605278650919596, "step": 10944 }, { "epoch": 0.5801287997243791, "grad_norm": 58.25, "kl": 0.027868270874023438, "learning_rate": 5e-07, "logits/chosen": 12034080.0, "logits/rejected": -36325852.0, "logps/chosen": -395.6728515625, "logps/rejected": -202.97479248046875, "loss": 0.3303, "rewards/chosen": 0.41702595353126526, "rewards/margins": 1.841551810503006, "rewards/rejected": -1.4245258569717407, "step": 10945 }, { "epoch": 0.5801818037261813, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1351744.0, "logits/rejected": -52540723.2, "logps/chosen": -57.96233113606771, "logps/rejected": -352.6181640625, "loss": 0.2503, "rewards/chosen": 0.19098206361134848, "rewards/margins": 2.2799648364384972, "rewards/rejected": -2.0889827728271486, "step": 10946 }, { "epoch": 0.5802348077279834, "grad_norm": 49.0, "kl": 1.2964630126953125, "learning_rate": 5e-07, "logits/chosen": -54054137.6, "logits/rejected": -8312378.666666667, "logps/chosen": -320.0699462890625, "logps/rejected": -214.29329427083334, "loss": 0.3045, "rewards/chosen": 0.4674523830413818, "rewards/margins": 2.949827845891317, "rewards/rejected": -2.482375462849935, "step": 10947 }, { "epoch": 0.5802878117297856, "grad_norm": 52.75, "kl": 0.8897609710693359, "learning_rate": 5e-07, "logits/chosen": -41808156.8, "logits/rejected": -6001496.0, "logps/chosen": -350.044970703125, "logps/rejected": -434.1630045572917, "loss": 0.3438, "rewards/chosen": 0.35479786396026614, "rewards/margins": 2.5609970331192016, "rewards/rejected": -2.2061991691589355, "step": 10948 }, { "epoch": 0.5803408157315877, "grad_norm": 90.5, "kl": 7.145445823669434, "learning_rate": 5e-07, "logits/chosen": -39271014.4, "logits/rejected": -46609546.666666664, "logps/chosen": -716.0384765625, "logps/rejected": -245.57889811197916, "loss": 0.3585, "rewards/chosen": 1.2564599990844727, "rewards/margins": 3.2781179428100584, "rewards/rejected": -2.021657943725586, "step": 10949 }, { "epoch": 0.5803938197333899, "grad_norm": 40.0, "kl": 1.6499366760253906, "learning_rate": 5e-07, "logits/chosen": 11212721.333333334, "logits/rejected": -30810156.8, "logps/chosen": -196.19677734375, "logps/rejected": -329.3464111328125, "loss": 0.2326, "rewards/chosen": 0.6225912570953369, "rewards/margins": 3.042260980606079, "rewards/rejected": -2.4196697235107423, "step": 10950 }, { "epoch": 0.580446823735192, "grad_norm": 99.5, "kl": 1.2946624755859375, "learning_rate": 5e-07, "logits/chosen": -43102908.8, "logits/rejected": -13090400.0, "logps/chosen": -562.384912109375, "logps/rejected": -263.5670979817708, "loss": 0.1975, "rewards/chosen": 1.3228071212768555, "rewards/margins": 4.0818228403727215, "rewards/rejected": -2.7590157190958657, "step": 10951 }, { "epoch": 0.5804998277369942, "grad_norm": 48.75, "kl": 0.7082958221435547, "learning_rate": 5e-07, "logits/chosen": -13299490.0, "logits/rejected": -12334739.0, "logps/chosen": -321.0419921875, "logps/rejected": -138.7538299560547, "loss": 0.2657, "rewards/chosen": 1.2401642799377441, "rewards/margins": 2.7165942192077637, "rewards/rejected": -1.4764299392700195, "step": 10952 }, { "epoch": 0.5805528317387962, "grad_norm": 56.5, "kl": 1.7922630310058594, "learning_rate": 5e-07, "logits/chosen": -38788684.8, "logits/rejected": -5649614.666666667, "logps/chosen": -267.48056640625, "logps/rejected": -258.3756103515625, "loss": 0.3768, "rewards/chosen": 0.247727370262146, "rewards/margins": 2.271474496523539, "rewards/rejected": -2.023747126261393, "step": 10953 }, { "epoch": 0.5806058357405984, "grad_norm": 47.0, "kl": 1.791421890258789, "learning_rate": 5e-07, "logits/chosen": -32953331.2, "logits/rejected": -21364508.0, "logps/chosen": -364.6616943359375, "logps/rejected": -382.812255859375, "loss": 0.2749, "rewards/chosen": 0.7027293682098389, "rewards/margins": 3.1479105790456137, "rewards/rejected": -2.445181210835775, "step": 10954 }, { "epoch": 0.5806588397424005, "grad_norm": 38.25, "kl": 0.6964855194091797, "learning_rate": 5e-07, "logits/chosen": -12942186.0, "logits/rejected": -25807556.0, "logps/chosen": -373.542724609375, "logps/rejected": -428.34918212890625, "loss": 0.1392, "rewards/chosen": 1.7276722192764282, "rewards/margins": 4.326302647590637, "rewards/rejected": -2.598630428314209, "step": 10955 }, { "epoch": 0.5807118437442027, "grad_norm": 51.25, "kl": 4.424569129943848, "learning_rate": 5e-07, "logits/chosen": -40688378.666666664, "logits/rejected": -13807582.0, "logps/chosen": -201.83251953125, "logps/rejected": -295.8458557128906, "loss": 0.4573, "rewards/chosen": 0.25223976373672485, "rewards/margins": 2.6291432976722717, "rewards/rejected": -2.376903533935547, "step": 10956 }, { "epoch": 0.5807648477460048, "grad_norm": 33.75, "kl": 0.53076171875, "learning_rate": 5e-07, "logits/chosen": -53339880.0, "logits/rejected": -10192752.0, "logps/chosen": -223.03372192382812, "logps/rejected": -347.8970947265625, "loss": 0.187, "rewards/chosen": -0.22309723496437073, "rewards/margins": 2.711209545532862, "rewards/rejected": -2.934306780497233, "step": 10957 }, { "epoch": 0.580817851747807, "grad_norm": 52.0, "kl": 0.7210254669189453, "learning_rate": 5e-07, "logits/chosen": -59824677.333333336, "logits/rejected": -44549708.0, "logps/chosen": -333.0517578125, "logps/rejected": -85.5477523803711, "loss": 0.3141, "rewards/chosen": 0.6369623343149821, "rewards/margins": 2.6355638901392617, "rewards/rejected": -1.9986015558242798, "step": 10958 }, { "epoch": 0.5808708557496091, "grad_norm": 44.0, "kl": 1.0599365234375, "learning_rate": 5e-07, "logits/chosen": -44233656.0, "logits/rejected": -19644753.6, "logps/chosen": -309.97878011067706, "logps/rejected": -158.137939453125, "loss": 0.3014, "rewards/chosen": 0.49883874257405597, "rewards/margins": 1.7709308942159017, "rewards/rejected": -1.2720921516418457, "step": 10959 }, { "epoch": 0.5809238597514113, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 57854640.0, "logits/rejected": -28207612.0, "logps/chosen": -676.7880859375, "logps/rejected": -400.27728271484375, "loss": 0.2559, "rewards/chosen": 0.6457068920135498, "rewards/margins": 3.070636510848999, "rewards/rejected": -2.424929618835449, "step": 10960 }, { "epoch": 0.5809768637532133, "grad_norm": 48.75, "kl": 0.5445175170898438, "learning_rate": 5e-07, "logits/chosen": -42770320.0, "logits/rejected": -19632648.0, "logps/chosen": -265.9091491699219, "logps/rejected": -285.77886962890625, "loss": 0.317, "rewards/chosen": 0.39733582735061646, "rewards/margins": 1.9935484528541565, "rewards/rejected": -1.59621262550354, "step": 10961 }, { "epoch": 0.5810298677550155, "grad_norm": 41.25, "kl": 1.0243968963623047, "learning_rate": 5e-07, "logits/chosen": -20343382.0, "logits/rejected": -30584036.0, "logps/chosen": -364.8281555175781, "logps/rejected": -321.2994384765625, "loss": 0.261, "rewards/chosen": 0.5827089548110962, "rewards/margins": 2.7039605379104614, "rewards/rejected": -2.1212515830993652, "step": 10962 }, { "epoch": 0.5810828717568176, "grad_norm": 50.5, "kl": 1.0512943267822266, "learning_rate": 5e-07, "logits/chosen": -25335877.333333332, "logits/rejected": -86520832.0, "logps/chosen": -299.1253255208333, "logps/rejected": -775.5226440429688, "loss": 0.3757, "rewards/chosen": 0.19902515411376953, "rewards/margins": 3.4327287673950195, "rewards/rejected": -3.23370361328125, "step": 10963 }, { "epoch": 0.5811358757586198, "grad_norm": 52.75, "kl": 1.3491363525390625, "learning_rate": 5e-07, "logits/chosen": -54466412.8, "logits/rejected": -24165306.666666668, "logps/chosen": -299.1141845703125, "logps/rejected": -185.91861979166666, "loss": 0.3449, "rewards/chosen": 0.5850908279418945, "rewards/margins": 1.8615913709004719, "rewards/rejected": -1.2765005429585774, "step": 10964 }, { "epoch": 0.5811888797604219, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9167796.666666666, "logits/rejected": -5105322.8, "logps/chosen": -204.28472900390625, "logps/rejected": -283.73525390625, "loss": 0.2266, "rewards/chosen": 0.3376798629760742, "rewards/margins": 2.9005769729614257, "rewards/rejected": -2.5628971099853515, "step": 10965 }, { "epoch": 0.5812418837622241, "grad_norm": 44.5, "kl": 0.7242107391357422, "learning_rate": 5e-07, "logits/chosen": 948066.8, "logits/rejected": -24352874.666666668, "logps/chosen": -51.61964111328125, "logps/rejected": -350.727294921875, "loss": 0.3438, "rewards/chosen": 0.1886391520500183, "rewards/margins": 2.9777251760164893, "rewards/rejected": -2.789086023966471, "step": 10966 }, { "epoch": 0.5812948877640262, "grad_norm": 51.5, "kl": 1.6994199752807617, "learning_rate": 5e-07, "logits/chosen": -21526537.6, "logits/rejected": -34832866.666666664, "logps/chosen": -186.4943115234375, "logps/rejected": -330.6796061197917, "loss": 0.3771, "rewards/chosen": 0.3083677053451538, "rewards/margins": 1.9282636404037476, "rewards/rejected": -1.6198959350585938, "step": 10967 }, { "epoch": 0.5813478917658284, "grad_norm": 59.5, "kl": 0.33090782165527344, "learning_rate": 5e-07, "logits/chosen": 36867976.0, "logits/rejected": -22305064.0, "logps/chosen": -412.5922444661458, "logps/rejected": -535.3787109375, "loss": 0.2289, "rewards/chosen": 0.9700052738189697, "rewards/margins": 4.485773324966431, "rewards/rejected": -3.515768051147461, "step": 10968 }, { "epoch": 0.5814008957676304, "grad_norm": 43.0, "kl": 3.3069610595703125, "learning_rate": 5e-07, "logits/chosen": -20789069.333333332, "logits/rejected": -14181900.8, "logps/chosen": -286.94626871744794, "logps/rejected": -246.903662109375, "loss": 0.2585, "rewards/chosen": 0.4079963763554891, "rewards/margins": 2.2586772998174034, "rewards/rejected": -1.8506809234619142, "step": 10969 }, { "epoch": 0.5814538997694326, "grad_norm": 47.75, "kl": 0.4104499816894531, "learning_rate": 5e-07, "logits/chosen": -12838227.2, "logits/rejected": -17169381.333333332, "logps/chosen": -356.970263671875, "logps/rejected": -202.3861083984375, "loss": 0.3358, "rewards/chosen": 0.4139823913574219, "rewards/margins": 1.8978254000345867, "rewards/rejected": -1.4838430086771648, "step": 10970 }, { "epoch": 0.5815069037712347, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32779390.0, "logits/rejected": -5004216.0, "logps/chosen": -426.13616943359375, "logps/rejected": -319.77734375, "loss": 0.4476, "rewards/chosen": -0.6734862923622131, "rewards/margins": 1.5399715304374695, "rewards/rejected": -2.2134578227996826, "step": 10971 }, { "epoch": 0.5815599077730369, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55426608.0, "logits/rejected": -51293376.0, "logps/chosen": -252.15313720703125, "logps/rejected": -449.642822265625, "loss": 0.2796, "rewards/chosen": 0.4287574887275696, "rewards/margins": 2.371975004673004, "rewards/rejected": -1.9432175159454346, "step": 10972 }, { "epoch": 0.581612911774839, "grad_norm": 53.75, "kl": 1.6851463317871094, "learning_rate": 5e-07, "logits/chosen": -52392416.0, "logits/rejected": -27100374.0, "logps/chosen": -424.8325500488281, "logps/rejected": -303.38446044921875, "loss": 0.3707, "rewards/chosen": 0.17114810645580292, "rewards/margins": 1.873974248766899, "rewards/rejected": -1.7028261423110962, "step": 10973 }, { "epoch": 0.5816659157766412, "grad_norm": 23.75, "kl": 2.9805068969726562, "learning_rate": 5e-07, "logits/chosen": -38131314.666666664, "logits/rejected": -38117292.8, "logps/chosen": -1027.616455078125, "logps/rejected": -415.61005859375, "loss": 0.118, "rewards/chosen": 2.754150390625, "rewards/margins": 5.333085632324218, "rewards/rejected": -2.578935241699219, "step": 10974 }, { "epoch": 0.5817189197784433, "grad_norm": 47.0, "kl": 0.7999114990234375, "learning_rate": 5e-07, "logits/chosen": -7602880.5, "logits/rejected": -14274595.0, "logps/chosen": -213.90231323242188, "logps/rejected": -151.95278930664062, "loss": 0.3364, "rewards/chosen": 0.05624869465827942, "rewards/margins": 1.8117847740650177, "rewards/rejected": -1.7555360794067383, "step": 10975 }, { "epoch": 0.5817719237802454, "grad_norm": 62.0, "kl": 1.8599176406860352, "learning_rate": 5e-07, "logits/chosen": -34065141.333333336, "logits/rejected": 4685075.5, "logps/chosen": -151.69474283854166, "logps/rejected": -32.2223014831543, "loss": 0.4572, "rewards/chosen": 0.3414916197458903, "rewards/margins": 1.4276262919108074, "rewards/rejected": -1.086134672164917, "step": 10976 }, { "epoch": 0.5818249277820475, "grad_norm": 56.5, "kl": 0.24909305572509766, "learning_rate": 5e-07, "logits/chosen": -66323507.2, "logits/rejected": -6099400.0, "logps/chosen": -431.77998046875, "logps/rejected": -454.1394449869792, "loss": 0.3759, "rewards/chosen": 0.24342286586761475, "rewards/margins": 1.960428277651469, "rewards/rejected": -1.7170054117838542, "step": 10977 }, { "epoch": 0.5818779317838497, "grad_norm": 35.25, "kl": 4.457859992980957, "learning_rate": 5e-07, "logits/chosen": -17745276.0, "logits/rejected": -51607804.0, "logps/chosen": -562.056396484375, "logps/rejected": -523.1180419921875, "loss": 0.2342, "rewards/chosen": 1.3252911567687988, "rewards/margins": 3.7147767543792725, "rewards/rejected": -2.3894855976104736, "step": 10978 }, { "epoch": 0.5819309357856518, "grad_norm": 65.0, "kl": 0.62322998046875, "learning_rate": 5e-07, "logits/chosen": -47531579.428571425, "logits/rejected": -13560304.0, "logps/chosen": -283.93369838169644, "logps/rejected": -55.72673034667969, "loss": 0.2925, "rewards/chosen": 0.9159674644470215, "rewards/margins": 2.814487338066101, "rewards/rejected": -1.8985198736190796, "step": 10979 }, { "epoch": 0.581983939787454, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39591084.0, "logits/rejected": -23407965.333333332, "logps/chosen": -295.7802429199219, "logps/rejected": -352.7618408203125, "loss": 0.201, "rewards/chosen": 0.7285804748535156, "rewards/margins": 2.5022246042887373, "rewards/rejected": -1.7736441294352214, "step": 10980 }, { "epoch": 0.5820369437892561, "grad_norm": 56.0, "kl": 0.7429981231689453, "learning_rate": 5e-07, "logits/chosen": -6371332.666666667, "logits/rejected": -17723824.0, "logps/chosen": -136.65877278645834, "logps/rejected": -123.17718505859375, "loss": 0.2473, "rewards/chosen": 0.16863789161046347, "rewards/margins": 2.470099874337514, "rewards/rejected": -2.3014619827270506, "step": 10981 }, { "epoch": 0.5820899477910583, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76916115.2, "logits/rejected": -83043349.33333333, "logps/chosen": -337.2935791015625, "logps/rejected": -387.5113118489583, "loss": 0.469, "rewards/chosen": -0.923765754699707, "rewards/margins": 1.8653469721476235, "rewards/rejected": -2.7891127268473306, "step": 10982 }, { "epoch": 0.5821429517928604, "grad_norm": 45.25, "kl": 0.9158782958984375, "learning_rate": 5e-07, "logits/chosen": -3680148.0, "logits/rejected": 1482588.3333333333, "logps/chosen": -184.02860107421876, "logps/rejected": -176.68216959635416, "loss": 0.2567, "rewards/chosen": 0.9986419677734375, "rewards/margins": 2.766234874725342, "rewards/rejected": -1.7675929069519043, "step": 10983 }, { "epoch": 0.5821959557946625, "grad_norm": 45.25, "kl": 0.6467247009277344, "learning_rate": 5e-07, "logits/chosen": -48003637.333333336, "logits/rejected": -18331731.2, "logps/chosen": -351.4419759114583, "logps/rejected": -363.5793701171875, "loss": 0.2745, "rewards/chosen": 0.4702141284942627, "rewards/margins": 2.271663522720337, "rewards/rejected": -1.8014493942260743, "step": 10984 }, { "epoch": 0.5822489597964646, "grad_norm": 46.0, "kl": 4.661733627319336, "learning_rate": 5e-07, "logits/chosen": 25778662.4, "logits/rejected": -17555685.333333332, "logps/chosen": -284.1498779296875, "logps/rejected": -316.85251871744794, "loss": 0.3018, "rewards/chosen": 1.0598992347717284, "rewards/margins": 3.005133215586344, "rewards/rejected": -1.945233980814616, "step": 10985 }, { "epoch": 0.5823019637982668, "grad_norm": 53.25, "kl": 2.5861968994140625, "learning_rate": 5e-07, "logits/chosen": -57203360.0, "logits/rejected": -55360698.666666664, "logps/chosen": -826.33994140625, "logps/rejected": -297.1536051432292, "loss": 0.212, "rewards/chosen": 1.946490478515625, "rewards/margins": 3.830266507466634, "rewards/rejected": -1.883776028951009, "step": 10986 }, { "epoch": 0.5823549678000689, "grad_norm": 54.75, "kl": 3.549560546875, "learning_rate": 5e-07, "logits/chosen": -50184784.0, "logits/rejected": -33380066.0, "logps/chosen": -781.0963745117188, "logps/rejected": -288.77984619140625, "loss": 0.2585, "rewards/chosen": 1.209967851638794, "rewards/margins": 3.3179280757904053, "rewards/rejected": -2.1079602241516113, "step": 10987 }, { "epoch": 0.5824079718018711, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94345941.33333333, "logits/rejected": 13726105.6, "logps/chosen": -498.4973958333333, "logps/rejected": -234.570458984375, "loss": 0.3326, "rewards/chosen": -0.001997878154118856, "rewards/margins": 1.568418190876643, "rewards/rejected": -1.5704160690307618, "step": 10988 }, { "epoch": 0.5824609758036732, "grad_norm": 65.0, "kl": 0.5157852172851562, "learning_rate": 5e-07, "logits/chosen": -3930904.0, "logits/rejected": -7311831.0, "logps/chosen": -372.8534342447917, "logps/rejected": -182.0128631591797, "loss": 0.3802, "rewards/chosen": 0.719781239827474, "rewards/margins": 1.1152940293153129, "rewards/rejected": -0.39551278948783875, "step": 10989 }, { "epoch": 0.5825139798054754, "grad_norm": 43.25, "kl": 1.2571420669555664, "learning_rate": 5e-07, "logits/chosen": 459864.8, "logits/rejected": -39417778.666666664, "logps/chosen": -218.40322265625, "logps/rejected": -295.8270263671875, "loss": 0.2545, "rewards/chosen": 0.8112860679626465, "rewards/margins": 2.7127049128214518, "rewards/rejected": -1.9014188448588054, "step": 10990 }, { "epoch": 0.5825669838072774, "grad_norm": 49.0, "kl": 0.7353801727294922, "learning_rate": 5e-07, "logits/chosen": -17765536.0, "logits/rejected": -14218015.0, "logps/chosen": -232.98216247558594, "logps/rejected": -65.14571380615234, "loss": 0.3888, "rewards/chosen": 0.39371317625045776, "rewards/margins": 0.9928436279296875, "rewards/rejected": -0.5991304516792297, "step": 10991 }, { "epoch": 0.5826199878090795, "grad_norm": 45.25, "kl": 2.3130264282226562, "learning_rate": 5e-07, "logits/chosen": -51346728.0, "logits/rejected": -50279824.0, "logps/chosen": -318.58349609375, "logps/rejected": -340.11212158203125, "loss": 0.2041, "rewards/chosen": 0.4508316218852997, "rewards/margins": 2.3255588710308075, "rewards/rejected": -1.8747272491455078, "step": 10992 }, { "epoch": 0.5826729918108817, "grad_norm": 65.0, "kl": 1.5550460815429688, "learning_rate": 5e-07, "logits/chosen": -9399153.6, "logits/rejected": -27981322.666666668, "logps/chosen": -255.90888671875, "logps/rejected": -508.2352294921875, "loss": 0.2836, "rewards/chosen": 0.9389409065246582, "rewards/margins": 2.931169096628825, "rewards/rejected": -1.9922281901041667, "step": 10993 }, { "epoch": 0.5827259958126838, "grad_norm": 46.5, "kl": 3.409116744995117, "learning_rate": 5e-07, "logits/chosen": -42232880.0, "logits/rejected": -1062907.75, "logps/chosen": -415.97265625, "logps/rejected": -91.06477864583333, "loss": 0.3292, "rewards/chosen": 0.8017922401428222, "rewards/margins": 2.3383313179016114, "rewards/rejected": -1.536539077758789, "step": 10994 }, { "epoch": 0.582778999814486, "grad_norm": 52.5, "kl": 0.6314735412597656, "learning_rate": 5e-07, "logits/chosen": -15393396.0, "logits/rejected": -31370364.0, "logps/chosen": -155.9351806640625, "logps/rejected": -338.0472717285156, "loss": 0.3012, "rewards/chosen": 0.6166771054267883, "rewards/margins": 2.169757068157196, "rewards/rejected": -1.5530799627304077, "step": 10995 }, { "epoch": 0.5828320038162881, "grad_norm": 54.5, "kl": 1.6911802291870117, "learning_rate": 5e-07, "logits/chosen": -61424857.6, "logits/rejected": -34700642.666666664, "logps/chosen": -418.00712890625, "logps/rejected": -318.4792887369792, "loss": 0.303, "rewards/chosen": 0.8447094917297363, "rewards/margins": 2.3057050069173175, "rewards/rejected": -1.4609955151875813, "step": 10996 }, { "epoch": 0.5828850078180903, "grad_norm": 48.75, "kl": 3.3858413696289062, "learning_rate": 5e-07, "logits/chosen": -33333050.666666668, "logits/rejected": -32881657.6, "logps/chosen": -331.61016845703125, "logps/rejected": -222.589794921875, "loss": 0.2049, "rewards/chosen": 0.8865052858988444, "rewards/margins": 3.376690069834391, "rewards/rejected": -2.490184783935547, "step": 10997 }, { "epoch": 0.5829380118198924, "grad_norm": 45.0, "kl": 1.1389694213867188, "learning_rate": 5e-07, "logits/chosen": -7761174.0, "logits/rejected": 4565645.0, "logps/chosen": -280.7847900390625, "logps/rejected": -80.68428802490234, "loss": 0.3495, "rewards/chosen": 0.3819415867328644, "rewards/margins": 2.2117317020893097, "rewards/rejected": -1.8297901153564453, "step": 10998 }, { "epoch": 0.5829910158216945, "grad_norm": 53.0, "kl": 2.1431331634521484, "learning_rate": 5e-07, "logits/chosen": -39452736.0, "logits/rejected": -23153760.0, "logps/chosen": -520.863720703125, "logps/rejected": -391.266357421875, "loss": 0.3284, "rewards/chosen": 0.6641491889953614, "rewards/margins": 2.5270694732666015, "rewards/rejected": -1.8629202842712402, "step": 10999 }, { "epoch": 0.5830440198234966, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5240607.2, "logits/rejected": -19565600.0, "logps/chosen": -478.07568359375, "logps/rejected": -252.60406494140625, "loss": 0.3596, "rewards/chosen": 0.3406115770339966, "rewards/margins": 1.7249596516291301, "rewards/rejected": -1.3843480745951335, "step": 11000 }, { "epoch": 0.5830970238252988, "grad_norm": 43.25, "kl": 1.2282981872558594, "learning_rate": 5e-07, "logits/chosen": -26870952.0, "logits/rejected": -8033862.4, "logps/chosen": -313.70343017578125, "logps/rejected": -119.4447021484375, "loss": 0.2135, "rewards/chosen": 0.8298481305440267, "rewards/margins": 2.9968969662984213, "rewards/rejected": -2.1670488357543944, "step": 11001 }, { "epoch": 0.5831500278271009, "grad_norm": 36.75, "kl": 4.38604736328125, "learning_rate": 5e-07, "logits/chosen": -14576596.0, "logits/rejected": -23547513.6, "logps/chosen": -88.21828206380208, "logps/rejected": -311.472412109375, "loss": 0.2557, "rewards/chosen": 0.6876383622487386, "rewards/margins": 3.887679942448934, "rewards/rejected": -3.200041580200195, "step": 11002 }, { "epoch": 0.5832030318289031, "grad_norm": 37.25, "kl": 0.08750534057617188, "learning_rate": 5e-07, "logits/chosen": -85062860.8, "logits/rejected": -47069712.0, "logps/chosen": -201.52078857421876, "logps/rejected": -257.0910237630208, "loss": 0.2854, "rewards/chosen": 0.5565561771392822, "rewards/margins": 2.8648960908253986, "rewards/rejected": -2.3083399136861167, "step": 11003 }, { "epoch": 0.5832560358307052, "grad_norm": 61.5, "kl": 0.3752613067626953, "learning_rate": 5e-07, "logits/chosen": -52729002.666666664, "logits/rejected": -21195996.0, "logps/chosen": -342.5845947265625, "logps/rejected": -220.02865600585938, "loss": 0.4223, "rewards/chosen": 0.22048848867416382, "rewards/margins": 0.9964345693588257, "rewards/rejected": -0.7759460806846619, "step": 11004 }, { "epoch": 0.5833090398325074, "grad_norm": 72.5, "kl": 0.2926979064941406, "learning_rate": 5e-07, "logits/chosen": -48230784.0, "logits/rejected": -4241790.4, "logps/chosen": -334.0528971354167, "logps/rejected": -101.00225830078125, "loss": 0.252, "rewards/chosen": 0.7781539758046468, "rewards/margins": 2.7979163964589437, "rewards/rejected": -2.019762420654297, "step": 11005 }, { "epoch": 0.5833620438343095, "grad_norm": 47.25, "kl": 4.026571273803711, "learning_rate": 5e-07, "logits/chosen": -51019139.2, "logits/rejected": -36637152.0, "logps/chosen": -459.2697265625, "logps/rejected": -420.0799967447917, "loss": 0.2475, "rewards/chosen": 1.1304399490356445, "rewards/margins": 2.9172282536824543, "rewards/rejected": -1.7867883046468098, "step": 11006 }, { "epoch": 0.5834150478361116, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70961224.0, "logits/rejected": -29246602.0, "logps/chosen": -609.4804077148438, "logps/rejected": -237.9954376220703, "loss": 0.334, "rewards/chosen": 0.007659919559955597, "rewards/margins": 2.08525563031435, "rewards/rejected": -2.0775957107543945, "step": 11007 }, { "epoch": 0.5834680518379137, "grad_norm": 53.5, "kl": 0.7550621032714844, "learning_rate": 5e-07, "logits/chosen": -25972480.0, "logits/rejected": -51404144.0, "logps/chosen": -317.1683872767857, "logps/rejected": -327.65167236328125, "loss": 0.3667, "rewards/chosen": 0.5431274005344936, "rewards/margins": 2.6740447112492154, "rewards/rejected": -2.1309173107147217, "step": 11008 }, { "epoch": 0.5835210558397159, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17812691.2, "logits/rejected": -13292253.333333334, "logps/chosen": -198.345556640625, "logps/rejected": -127.64317830403645, "loss": 0.2963, "rewards/chosen": 0.2628387451171875, "rewards/margins": 3.6528553009033202, "rewards/rejected": -3.390016555786133, "step": 11009 }, { "epoch": 0.583574059841518, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1591159.25, "logits/rejected": -31474994.285714287, "logps/chosen": -113.640869140625, "logps/rejected": -232.9459228515625, "loss": 0.1845, "rewards/chosen": -0.5858787894248962, "rewards/margins": 1.6161413107599532, "rewards/rejected": -2.2020201001848494, "step": 11010 }, { "epoch": 0.5836270638433202, "grad_norm": 54.75, "kl": 0.6418228149414062, "learning_rate": 5e-07, "logits/chosen": -36056776.0, "logits/rejected": -19580790.0, "logps/chosen": -360.6171875, "logps/rejected": -198.68423461914062, "loss": 0.3564, "rewards/chosen": -0.046939849853515625, "rewards/margins": 1.7658882141113281, "rewards/rejected": -1.8128280639648438, "step": 11011 }, { "epoch": 0.5836800678451223, "grad_norm": 49.0, "kl": 0.8614044189453125, "learning_rate": 5e-07, "logits/chosen": -59555270.4, "logits/rejected": -39507221.333333336, "logps/chosen": -277.977490234375, "logps/rejected": -467.7769775390625, "loss": 0.3521, "rewards/chosen": 0.2532601833343506, "rewards/margins": 2.8261218547821043, "rewards/rejected": -2.572861671447754, "step": 11012 }, { "epoch": 0.5837330718469245, "grad_norm": 55.25, "kl": 2.3626022338867188, "learning_rate": 5e-07, "logits/chosen": -34056013.333333336, "logits/rejected": 55752612.0, "logps/chosen": -437.6306966145833, "logps/rejected": -141.22109985351562, "loss": 0.3997, "rewards/chosen": 0.5877099831899008, "rewards/margins": 2.1007707913716636, "rewards/rejected": -1.5130608081817627, "step": 11013 }, { "epoch": 0.5837860758487265, "grad_norm": 53.5, "kl": 0.21063518524169922, "learning_rate": 5e-07, "logits/chosen": -42007352.0, "logits/rejected": -36658074.666666664, "logps/chosen": -955.5618286132812, "logps/rejected": -293.3900553385417, "loss": 0.1858, "rewards/chosen": 0.8263580203056335, "rewards/margins": 3.1685959696769714, "rewards/rejected": -2.342237949371338, "step": 11014 }, { "epoch": 0.5838390798505287, "grad_norm": 69.0, "kl": 2.0883407592773438, "learning_rate": 5e-07, "logits/chosen": 7735718.4, "logits/rejected": -19971640.0, "logps/chosen": -302.8419921875, "logps/rejected": -417.8057047526042, "loss": 0.3702, "rewards/chosen": -0.04749870300292969, "rewards/margins": 3.2120946248372397, "rewards/rejected": -3.2595933278401694, "step": 11015 }, { "epoch": 0.5838920838523308, "grad_norm": 52.0, "kl": 0.5975418090820312, "learning_rate": 5e-07, "logits/chosen": -11271332.8, "logits/rejected": -66691178.666666664, "logps/chosen": -330.2933349609375, "logps/rejected": -380.0299479166667, "loss": 0.2892, "rewards/chosen": 0.9630126953125, "rewards/margins": 3.2718302408854165, "rewards/rejected": -2.3088175455729165, "step": 11016 }, { "epoch": 0.583945087854133, "grad_norm": 38.5, "kl": 3.6411914825439453, "learning_rate": 5e-07, "logits/chosen": -33738262.4, "logits/rejected": 1736856.6666666667, "logps/chosen": -161.6433349609375, "logps/rejected": -217.7854207356771, "loss": 0.3514, "rewards/chosen": 0.7859031677246093, "rewards/margins": 2.677864106496175, "rewards/rejected": -1.8919609387715657, "step": 11017 }, { "epoch": 0.5839980918559351, "grad_norm": 53.75, "kl": 2.6355209350585938, "learning_rate": 5e-07, "logits/chosen": -33220224.0, "logits/rejected": -25794202.0, "logps/chosen": -256.33457438151044, "logps/rejected": -248.0703582763672, "loss": 0.4045, "rewards/chosen": 0.34169328212738037, "rewards/margins": 1.6635621786117554, "rewards/rejected": -1.321868896484375, "step": 11018 }, { "epoch": 0.5840510958577373, "grad_norm": 47.75, "kl": 0.4878978729248047, "learning_rate": 5e-07, "logits/chosen": -64194186.666666664, "logits/rejected": -18722900.8, "logps/chosen": -521.1283365885416, "logps/rejected": -233.42060546875, "loss": 0.1816, "rewards/chosen": 1.4183154106140137, "rewards/margins": 3.4044116020202635, "rewards/rejected": -1.98609619140625, "step": 11019 }, { "epoch": 0.5841040998595394, "grad_norm": 52.25, "kl": 3.3883705139160156, "learning_rate": 5e-07, "logits/chosen": -75276096.0, "logits/rejected": 62937177.6, "logps/chosen": -537.4894612630209, "logps/rejected": -303.784033203125, "loss": 0.1643, "rewards/chosen": 1.235070784886678, "rewards/margins": 3.284558852513631, "rewards/rejected": -2.049488067626953, "step": 11020 }, { "epoch": 0.5841571038613416, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19421892.0, "logits/rejected": -7525756.0, "logps/chosen": -104.6077880859375, "logps/rejected": -200.69014485677084, "loss": 0.215, "rewards/chosen": 0.0621742308139801, "rewards/margins": 2.4754129151503244, "rewards/rejected": -2.4132386843363443, "step": 11021 }, { "epoch": 0.5842101078631436, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24031924.0, "logits/rejected": -35952189.333333336, "logps/chosen": -535.69580078125, "logps/rejected": -345.1913248697917, "loss": 0.2217, "rewards/chosen": 0.9195800423622131, "rewards/margins": 3.145148495833079, "rewards/rejected": -2.2255684534708657, "step": 11022 }, { "epoch": 0.5842631118649458, "grad_norm": 51.75, "kl": 0.7885017395019531, "learning_rate": 5e-07, "logits/chosen": -28615334.4, "logits/rejected": -23256304.0, "logps/chosen": -306.2700927734375, "logps/rejected": -250.07755533854166, "loss": 0.4037, "rewards/chosen": -0.05075350999832153, "rewards/margins": 1.6736899256706237, "rewards/rejected": -1.7244434356689453, "step": 11023 }, { "epoch": 0.5843161158667479, "grad_norm": 35.25, "kl": 0.9667129516601562, "learning_rate": 5e-07, "logits/chosen": -10114708.0, "logits/rejected": -53400640.0, "logps/chosen": -275.3416748046875, "logps/rejected": -365.3184509277344, "loss": 0.2824, "rewards/chosen": 0.5277599692344666, "rewards/margins": 3.0877737402915955, "rewards/rejected": -2.560013771057129, "step": 11024 }, { "epoch": 0.5843691198685501, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96725077.33333333, "logits/rejected": -27601094.4, "logps/chosen": -410.7205810546875, "logps/rejected": -362.360107421875, "loss": 0.2187, "rewards/chosen": 0.6631459395090739, "rewards/margins": 2.794794193903605, "rewards/rejected": -2.1316482543945314, "step": 11025 }, { "epoch": 0.5844221238703522, "grad_norm": 28.5, "kl": 1.4799785614013672, "learning_rate": 5e-07, "logits/chosen": 10877966.0, "logits/rejected": -53074069.333333336, "logps/chosen": -262.8223571777344, "logps/rejected": -626.7019449869791, "loss": 0.1124, "rewards/chosen": 1.3061977624893188, "rewards/margins": 5.645265221595764, "rewards/rejected": -4.339067459106445, "step": 11026 }, { "epoch": 0.5844751278721544, "grad_norm": 31.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19611824.0, "logits/rejected": -20946909.333333332, "logps/chosen": -250.6006317138672, "logps/rejected": -359.4930826822917, "loss": 0.1687, "rewards/chosen": 0.19062307476997375, "rewards/margins": 2.9454181492328644, "rewards/rejected": -2.7547950744628906, "step": 11027 }, { "epoch": 0.5845281318739565, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5444620.0, "logits/rejected": -45100004.0, "logps/chosen": -260.86285400390625, "logps/rejected": -253.93682861328125, "loss": 0.3284, "rewards/chosen": 0.035406485199928284, "rewards/margins": 1.660388246178627, "rewards/rejected": -1.6249817609786987, "step": 11028 }, { "epoch": 0.5845811358757587, "grad_norm": 45.5, "kl": 0.5022411346435547, "learning_rate": 5e-07, "logits/chosen": -40997152.0, "logits/rejected": -2877759.2, "logps/chosen": -511.3604329427083, "logps/rejected": -555.770556640625, "loss": 0.1732, "rewards/chosen": 1.4572113355000813, "rewards/margins": 3.5894085248311356, "rewards/rejected": -2.1321971893310545, "step": 11029 }, { "epoch": 0.5846341398775607, "grad_norm": 49.25, "kl": 2.247304916381836, "learning_rate": 5e-07, "logits/chosen": -9608766.0, "logits/rejected": -28580704.0, "logps/chosen": -183.44844563802084, "logps/rejected": -277.08453369140625, "loss": 0.4222, "rewards/chosen": 0.13677183787027994, "rewards/margins": 2.636098543802897, "rewards/rejected": -2.499326705932617, "step": 11030 }, { "epoch": 0.5846871438793629, "grad_norm": 45.5, "kl": 2.816884994506836, "learning_rate": 5e-07, "logits/chosen": -7195430.0, "logits/rejected": -18039072.0, "logps/chosen": -188.99122619628906, "logps/rejected": -403.5611877441406, "loss": 0.3085, "rewards/chosen": 0.6254676580429077, "rewards/margins": 2.262071132659912, "rewards/rejected": -1.6366034746170044, "step": 11031 }, { "epoch": 0.584740147881165, "grad_norm": 44.25, "kl": 1.0119714736938477, "learning_rate": 5e-07, "logits/chosen": -28213168.0, "logits/rejected": -47515804.0, "logps/chosen": -438.7324625651042, "logps/rejected": -457.5798645019531, "loss": 0.3427, "rewards/chosen": 0.4907561143239339, "rewards/margins": 2.8324645360310874, "rewards/rejected": -2.3417084217071533, "step": 11032 }, { "epoch": 0.5847931518829672, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23887069.333333332, "logits/rejected": -46148313.6, "logps/chosen": -371.6394856770833, "logps/rejected": -578.921923828125, "loss": 0.2023, "rewards/chosen": 0.1313054362932841, "rewards/margins": 3.5189801494280495, "rewards/rejected": -3.3876747131347655, "step": 11033 }, { "epoch": 0.5848461558847693, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21716361.333333332, "logits/rejected": -19678374.4, "logps/chosen": -298.0045166015625, "logps/rejected": -326.3959228515625, "loss": 0.3149, "rewards/chosen": 0.18660837411880493, "rewards/margins": 1.5935535073280334, "rewards/rejected": -1.4069451332092284, "step": 11034 }, { "epoch": 0.5848991598865715, "grad_norm": 42.75, "kl": 0.8239879608154297, "learning_rate": 5e-07, "logits/chosen": 2026629.625, "logits/rejected": -65100397.71428572, "logps/chosen": -91.4046859741211, "logps/rejected": -141.9025617327009, "loss": 0.2857, "rewards/chosen": 0.3788414001464844, "rewards/margins": 1.6154605320521764, "rewards/rejected": -1.236619131905692, "step": 11035 }, { "epoch": 0.5849521638883736, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29246240.0, "logits/rejected": -43196824.0, "logps/chosen": -299.10662841796875, "logps/rejected": -204.9142608642578, "loss": 0.3466, "rewards/chosen": -0.004430785775184631, "rewards/margins": 1.6155024617910385, "rewards/rejected": -1.6199332475662231, "step": 11036 }, { "epoch": 0.5850051678901758, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27750460.0, "logits/rejected": -36013716.0, "logps/chosen": -151.6295166015625, "logps/rejected": -384.36614990234375, "loss": 0.2354, "rewards/chosen": 0.399787575006485, "rewards/margins": 3.318984180688858, "rewards/rejected": -2.919196605682373, "step": 11037 }, { "epoch": 0.5850581718919778, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51252736.0, "logits/rejected": -21506793.6, "logps/chosen": -507.9673665364583, "logps/rejected": -146.5290771484375, "loss": 0.3277, "rewards/chosen": 0.26389211416244507, "rewards/margins": 1.3922734618186952, "rewards/rejected": -1.12838134765625, "step": 11038 }, { "epoch": 0.58511117589378, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20267408.0, "logits/rejected": -11424165.714285715, "logps/chosen": -415.90960693359375, "logps/rejected": -256.9330357142857, "loss": 0.1734, "rewards/chosen": 1.537023901939392, "rewards/margins": 3.792613250868661, "rewards/rejected": -2.255589348929269, "step": 11039 }, { "epoch": 0.5851641798955821, "grad_norm": 41.25, "kl": 2.9207301139831543, "learning_rate": 5e-07, "logits/chosen": -6401872.0, "logits/rejected": -15915767.0, "logps/chosen": -62.76214599609375, "logps/rejected": -306.66510009765625, "loss": 0.4567, "rewards/chosen": 0.24943395455678305, "rewards/margins": 1.4199516375859578, "rewards/rejected": -1.1705176830291748, "step": 11040 }, { "epoch": 0.5852171838973843, "grad_norm": 49.5, "kl": 2.3126888275146484, "learning_rate": 5e-07, "logits/chosen": -24205052.0, "logps/chosen": -350.0730285644531, "loss": 0.3329, "rewards/chosen": 1.127153754234314, "step": 11041 }, { "epoch": 0.5852701878991864, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52597188.0, "logits/rejected": -33356773.333333332, "logps/chosen": -150.12835693359375, "logps/rejected": -522.7537027994791, "loss": 0.1989, "rewards/chosen": -0.21858444809913635, "rewards/margins": 2.928132305542628, "rewards/rejected": -3.146716753641764, "step": 11042 }, { "epoch": 0.5853231919009885, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74878841.6, "logits/rejected": -30259477.333333332, "logps/chosen": -859.6341796875, "logps/rejected": -213.423828125, "loss": 0.2627, "rewards/chosen": 0.9552594184875488, "rewards/margins": 4.383315499623617, "rewards/rejected": -3.428056081136068, "step": 11043 }, { "epoch": 0.5853761959027907, "grad_norm": 48.75, "kl": 1.5844268798828125, "learning_rate": 5e-07, "logits/chosen": 4941227.0, "logits/rejected": 3816957.5, "logps/chosen": -323.9534912109375, "logps/rejected": -65.7858657836914, "loss": 0.3491, "rewards/chosen": 0.327478289604187, "rewards/margins": 2.180862069129944, "rewards/rejected": -1.8533837795257568, "step": 11044 }, { "epoch": 0.5854291999045927, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16199838.0, "logits/rejected": -25371952.0, "logps/chosen": -237.94390869140625, "logps/rejected": -169.4987589518229, "loss": 0.2707, "rewards/chosen": 0.38901710510253906, "rewards/margins": 1.9754077593485515, "rewards/rejected": -1.5863906542460124, "step": 11045 }, { "epoch": 0.5854822039063949, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38861522.666666664, "logits/rejected": 288915.6, "logps/chosen": -316.9080810546875, "logps/rejected": -159.94554443359374, "loss": 0.3209, "rewards/chosen": -0.017084757486979168, "rewards/margins": 1.6154839833577472, "rewards/rejected": -1.6325687408447265, "step": 11046 }, { "epoch": 0.585535207908197, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71248240.0, "logits/rejected": -23550458.666666668, "logps/chosen": -341.8685302734375, "logps/rejected": -308.3942057291667, "loss": 0.2667, "rewards/chosen": -0.48262864351272583, "rewards/margins": 2.032505532105764, "rewards/rejected": -2.5151341756184897, "step": 11047 }, { "epoch": 0.5855882119099992, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8851180.0, "logits/rejected": -36394560.0, "logps/chosen": -48.9908447265625, "logps/rejected": -335.96787806919644, "loss": 0.1891, "rewards/chosen": -0.16190719604492188, "rewards/margins": 2.130554744175502, "rewards/rejected": -2.292461940220424, "step": 11048 }, { "epoch": 0.5856412159118013, "grad_norm": 59.75, "kl": 0.7133560180664062, "learning_rate": 5e-07, "logits/chosen": -49802072.0, "logits/rejected": 69972.625, "logps/chosen": -303.333984375, "logps/rejected": -390.0057678222656, "loss": 0.2772, "rewards/chosen": 0.776333212852478, "rewards/margins": 2.3154358863830566, "rewards/rejected": -1.5391026735305786, "step": 11049 }, { "epoch": 0.5856942199136035, "grad_norm": 57.5, "kl": 1.0096664428710938, "learning_rate": 5e-07, "logits/chosen": -6000784.666666667, "logits/rejected": 19925366.4, "logps/chosen": -403.978759765625, "logps/rejected": -671.980419921875, "loss": 0.2143, "rewards/chosen": 0.429698904355367, "rewards/margins": 3.1484098037083945, "rewards/rejected": -2.7187108993530273, "step": 11050 }, { "epoch": 0.5857472239154056, "grad_norm": 53.0, "kl": 0.3130035400390625, "learning_rate": 5e-07, "logits/chosen": -23045410.666666668, "logits/rejected": -10657750.4, "logps/chosen": -274.0799560546875, "logps/rejected": -194.4376953125, "loss": 0.2907, "rewards/chosen": -0.24988911549250284, "rewards/margins": 2.6145876606305443, "rewards/rejected": -2.864476776123047, "step": 11051 }, { "epoch": 0.5858002279172078, "grad_norm": 51.75, "kl": 1.7885589599609375, "learning_rate": 5e-07, "logits/chosen": -11307072.0, "logits/rejected": -26573659.2, "logps/chosen": -234.2623087565104, "logps/rejected": -261.107470703125, "loss": 0.2644, "rewards/chosen": 0.4947058359781901, "rewards/margins": 2.526066462198893, "rewards/rejected": -2.031360626220703, "step": 11052 }, { "epoch": 0.5858532319190098, "grad_norm": 51.25, "kl": 2.3157691955566406, "learning_rate": 5e-07, "logits/chosen": -51955056.0, "logits/rejected": -28034808.0, "logps/chosen": -176.95993041992188, "logps/rejected": -223.05950927734375, "loss": 0.3072, "rewards/chosen": 0.6962578892707825, "rewards/margins": 1.964596688747406, "rewards/rejected": -1.2683387994766235, "step": 11053 }, { "epoch": 0.585906235920812, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43722472.0, "logits/rejected": -18279769.14285714, "logps/chosen": -673.3165283203125, "logps/rejected": -315.19248744419644, "loss": 0.1814, "rewards/chosen": -0.130615234375, "rewards/margins": 1.9723990304129466, "rewards/rejected": -2.1030142647879466, "step": 11054 }, { "epoch": 0.5859592399226141, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13637138.666666666, "logits/rejected": -8104225.6, "logps/chosen": -191.28719075520834, "logps/rejected": -407.2295166015625, "loss": 0.2659, "rewards/chosen": 0.20879389842351279, "rewards/margins": 2.6078150471051535, "rewards/rejected": -2.3990211486816406, "step": 11055 }, { "epoch": 0.5860122439244163, "grad_norm": 38.5, "kl": 2.5283203125, "learning_rate": 5e-07, "logits/chosen": -12265896.0, "logits/rejected": -42092376.0, "logps/chosen": -519.0465698242188, "logps/rejected": -334.3907063802083, "loss": 0.1475, "rewards/chosen": 1.7023075819015503, "rewards/margins": 3.8899957736333213, "rewards/rejected": -2.187688191731771, "step": 11056 }, { "epoch": 0.5860652479262184, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28998644.0, "logits/rejected": -19523012.0, "logps/chosen": -182.3836669921875, "logps/rejected": -312.14052327473956, "loss": 0.259, "rewards/chosen": -0.35003662109375, "rewards/margins": 2.0407260258992515, "rewards/rejected": -2.3907626469930015, "step": 11057 }, { "epoch": 0.5861182519280206, "grad_norm": 40.75, "kl": 1.270864486694336, "learning_rate": 5e-07, "logits/chosen": -43750170.666666664, "logits/rejected": 7813872.0, "logps/chosen": -478.3990885416667, "logps/rejected": -474.076220703125, "loss": 0.1944, "rewards/chosen": 0.6665008862813314, "rewards/margins": 3.300316651662191, "rewards/rejected": -2.6338157653808594, "step": 11058 }, { "epoch": 0.5861712559298227, "grad_norm": 58.0, "kl": 0.6323127746582031, "learning_rate": 5e-07, "logits/chosen": -558450.4, "logits/rejected": -63026416.0, "logps/chosen": -223.403173828125, "logps/rejected": -477.9342447916667, "loss": 0.4466, "rewards/chosen": -0.23608217239379883, "rewards/margins": 0.8964811007181804, "rewards/rejected": -1.1325632731119792, "step": 11059 }, { "epoch": 0.5862242599316249, "grad_norm": 59.5, "kl": 0.4546833038330078, "learning_rate": 5e-07, "logits/chosen": -44317922.666666664, "logits/rejected": 64832556.0, "logps/chosen": -423.9354654947917, "logps/rejected": -374.6217041015625, "loss": 0.3392, "rewards/chosen": 0.44281919797261554, "rewards/margins": 2.6350303490956626, "rewards/rejected": -2.192211151123047, "step": 11060 }, { "epoch": 0.5862772639334269, "grad_norm": 42.25, "kl": 1.1654586791992188, "learning_rate": 5e-07, "logits/chosen": -40748217.6, "logits/rejected": -55532981.333333336, "logps/chosen": -229.935546875, "logps/rejected": -134.3554890950521, "loss": 0.3265, "rewards/chosen": 0.5403464317321778, "rewards/margins": 2.572465705871582, "rewards/rejected": -2.0321192741394043, "step": 11061 }, { "epoch": 0.5863302679352291, "grad_norm": 38.0, "kl": 0.5851497650146484, "learning_rate": 5e-07, "logits/chosen": -26287222.0, "logits/rejected": -8189093.0, "logps/chosen": -288.47857666015625, "logps/rejected": -101.20177459716797, "loss": 0.3379, "rewards/chosen": 0.46984371542930603, "rewards/margins": 1.7812115848064423, "rewards/rejected": -1.3113678693771362, "step": 11062 }, { "epoch": 0.5863832719370312, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -159582944.0, "logits/rejected": -8063792.8, "logps/chosen": -61.75328063964844, "logps/rejected": -382.764501953125, "loss": 0.2275, "rewards/chosen": 0.7341102759043375, "rewards/margins": 2.8057853857676185, "rewards/rejected": -2.071675109863281, "step": 11063 }, { "epoch": 0.5864362759388334, "grad_norm": 46.75, "kl": 2.745999336242676, "learning_rate": 5e-07, "logits/chosen": 6109931.5, "logits/rejected": -42235976.0, "logps/chosen": -220.11898803710938, "logps/rejected": -451.9468994140625, "loss": 0.3335, "rewards/chosen": 0.25885188579559326, "rewards/margins": 3.080762267112732, "rewards/rejected": -2.8219103813171387, "step": 11064 }, { "epoch": 0.5864892799406355, "grad_norm": 34.5, "kl": 1.73187255859375, "learning_rate": 5e-07, "logits/chosen": -12729800.8, "logits/rejected": -40930584.0, "logps/chosen": -219.0444580078125, "logps/rejected": -550.1927490234375, "loss": 0.3077, "rewards/chosen": 0.47280058860778806, "rewards/margins": 4.100443538029989, "rewards/rejected": -3.6276429494222007, "step": 11065 }, { "epoch": 0.5865422839424377, "grad_norm": 35.5, "kl": 0.31688690185546875, "learning_rate": 5e-07, "logits/chosen": -44327696.0, "logits/rejected": -1642434.3, "logps/chosen": -208.9708455403646, "logps/rejected": -237.3086181640625, "loss": 0.2361, "rewards/chosen": 0.7746554215749105, "rewards/margins": 2.74733894666036, "rewards/rejected": -1.9726835250854493, "step": 11066 }, { "epoch": 0.5865952879442398, "grad_norm": 53.5, "kl": 2.4566116333007812, "learning_rate": 5e-07, "logits/chosen": -40527244.0, "logits/rejected": 7719017.0, "logps/chosen": -164.5560760498047, "logps/rejected": -421.33038330078125, "loss": 0.3023, "rewards/chosen": 0.2108008861541748, "rewards/margins": 2.4627952575683594, "rewards/rejected": -2.2519943714141846, "step": 11067 }, { "epoch": 0.586648291946042, "grad_norm": 41.5, "kl": 1.7721633911132812, "learning_rate": 5e-07, "logits/chosen": -38308368.0, "logits/rejected": -16937321.6, "logps/chosen": -400.9036458333333, "logps/rejected": -474.5837890625, "loss": 0.2639, "rewards/chosen": -0.0024016102155049643, "rewards/margins": 4.079144410292308, "rewards/rejected": -4.081546020507813, "step": 11068 }, { "epoch": 0.586701295947844, "grad_norm": 46.25, "kl": 0.27950096130371094, "learning_rate": 5e-07, "logits/chosen": -20049740.0, "logits/rejected": -11266353.333333334, "logps/chosen": -408.19952392578125, "logps/rejected": -423.9525553385417, "loss": 0.1771, "rewards/chosen": 0.12115556001663208, "rewards/margins": 2.7909722129503884, "rewards/rejected": -2.6698166529337564, "step": 11069 }, { "epoch": 0.5867542999496462, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51983756.8, "logits/rejected": -61739349.333333336, "logps/chosen": -409.77509765625, "logps/rejected": -435.014404296875, "loss": 0.2701, "rewards/chosen": 0.6787481784820557, "rewards/margins": 3.092453686396281, "rewards/rejected": -2.413705507914225, "step": 11070 }, { "epoch": 0.5868073039514483, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41056052.0, "logits/rejected": -15082128.0, "logps/chosen": -351.7869873046875, "logps/rejected": -180.23365783691406, "loss": 0.3013, "rewards/chosen": 0.305767685174942, "rewards/margins": 2.3104638159275055, "rewards/rejected": -2.0046961307525635, "step": 11071 }, { "epoch": 0.5868603079532505, "grad_norm": 53.75, "kl": 0.41942596435546875, "learning_rate": 5e-07, "logits/chosen": -40607888.0, "logits/rejected": -36592640.0, "logps/chosen": -746.61669921875, "logps/rejected": -442.6131896972656, "loss": 0.2456, "rewards/chosen": 0.7975151538848877, "rewards/margins": 3.333256244659424, "rewards/rejected": -2.535741090774536, "step": 11072 }, { "epoch": 0.5869133119550526, "grad_norm": 78.5, "kl": 2.7236547470092773, "learning_rate": 5e-07, "logits/chosen": -53887472.0, "logits/rejected": -51657328.0, "logps/chosen": -275.64524332682294, "logps/rejected": -217.80709838867188, "loss": 0.2607, "rewards/chosen": 1.2325884501139324, "rewards/margins": 2.298893253008525, "rewards/rejected": -1.0663048028945923, "step": 11073 }, { "epoch": 0.5869663159568548, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31482297.6, "logits/rejected": -21268906.666666668, "logps/chosen": -460.176953125, "logps/rejected": -247.32828776041666, "loss": 0.2322, "rewards/chosen": 0.6985842704772949, "rewards/margins": 3.9315128644307453, "rewards/rejected": -3.2329285939534507, "step": 11074 }, { "epoch": 0.5870193199586569, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20206860.0, "logits/rejected": -24262442.0, "logps/chosen": -260.34259033203125, "logps/rejected": -378.3280334472656, "loss": 0.2467, "rewards/chosen": 0.6009795665740967, "rewards/margins": 2.8534393310546875, "rewards/rejected": -2.252459764480591, "step": 11075 }, { "epoch": 0.587072323960459, "grad_norm": 33.5, "kl": 0.8926277160644531, "learning_rate": 5e-07, "logits/chosen": -14655656.0, "logits/rejected": -49019552.0, "logps/chosen": -147.41097005208334, "logps/rejected": -458.7888671875, "loss": 0.2623, "rewards/chosen": -0.4539855321248372, "rewards/margins": 2.570905558268229, "rewards/rejected": -3.0248910903930666, "step": 11076 }, { "epoch": 0.5871253279622611, "grad_norm": 27.125, "kl": 0.5019149780273438, "learning_rate": 5e-07, "logits/chosen": -32376920.0, "logits/rejected": -44378432.0, "logps/chosen": -562.17529296875, "logps/rejected": -480.2532552083333, "loss": 0.133, "rewards/chosen": 1.528476595878601, "rewards/margins": 4.566341598828634, "rewards/rejected": -3.0378650029500327, "step": 11077 }, { "epoch": 0.5871783319640633, "grad_norm": 70.5, "kl": 2.1175460815429688, "learning_rate": 5e-07, "logits/chosen": -51896405.333333336, "logits/rejected": -12458586.0, "logps/chosen": -624.2266845703125, "logps/rejected": -257.92535400390625, "loss": 0.3722, "rewards/chosen": 0.7687917550404867, "rewards/margins": 4.60392959912618, "rewards/rejected": -3.8351378440856934, "step": 11078 }, { "epoch": 0.5872313359658654, "grad_norm": 60.25, "kl": 0.1362457275390625, "learning_rate": 5e-07, "logits/chosen": -48126890.666666664, "logits/rejected": -12189116.8, "logps/chosen": -354.042236328125, "logps/rejected": -268.94580078125, "loss": 0.2666, "rewards/chosen": 0.1161298155784607, "rewards/margins": 2.498610055446625, "rewards/rejected": -2.382480239868164, "step": 11079 }, { "epoch": 0.5872843399676676, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13798653.0, "logits/rejected": -20741638.0, "logps/chosen": -189.60536193847656, "logps/rejected": -245.4903564453125, "loss": 0.2621, "rewards/chosen": 0.24436531960964203, "rewards/margins": 2.9353052228689194, "rewards/rejected": -2.6909399032592773, "step": 11080 }, { "epoch": 0.5873373439694697, "grad_norm": 24.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80263688.0, "logits/rejected": -8995077.714285715, "logps/chosen": -778.5527954101562, "logps/rejected": -225.47176688058036, "loss": 0.0609, "rewards/chosen": 2.840100049972534, "rewards/margins": 5.783310106822423, "rewards/rejected": -2.9432100568498885, "step": 11081 }, { "epoch": 0.5873903479712719, "grad_norm": 47.0, "kl": 1.8561115264892578, "learning_rate": 5e-07, "logits/chosen": -20645218.666666668, "logits/rejected": -23404308.0, "logps/chosen": -239.12520345052084, "logps/rejected": -558.5255737304688, "loss": 0.2928, "rewards/chosen": 1.042661984761556, "rewards/margins": 3.154995759328206, "rewards/rejected": -2.1123337745666504, "step": 11082 }, { "epoch": 0.587443351973074, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 470658.25, "logits/rejected": -42140541.333333336, "logps/chosen": -35.514041900634766, "logps/rejected": -479.8667805989583, "loss": 0.2287, "rewards/chosen": -0.2071990966796875, "rewards/margins": 2.644465128580729, "rewards/rejected": -2.8516642252604165, "step": 11083 }, { "epoch": 0.5874963559748761, "grad_norm": 55.75, "kl": 0.08134078979492188, "learning_rate": 5e-07, "logits/chosen": -20347456.0, "logits/rejected": -30575808.0, "logps/chosen": -283.1013916015625, "logps/rejected": -389.740234375, "loss": 0.3693, "rewards/chosen": 0.06359658241271973, "rewards/margins": 1.9154226144154867, "rewards/rejected": -1.8518260320027669, "step": 11084 }, { "epoch": 0.5875493599766782, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32186256.0, "logits/rejected": -9360352.0, "logps/chosen": -340.6038818359375, "logps/rejected": -250.86279296875, "loss": 0.3009, "rewards/chosen": 0.42328818639119464, "rewards/margins": 1.8478742917378743, "rewards/rejected": -1.4245861053466797, "step": 11085 }, { "epoch": 0.5876023639784804, "grad_norm": 59.75, "kl": 3.409674644470215, "learning_rate": 5e-07, "logits/chosen": -44214154.666666664, "logits/rejected": -22917004.0, "logps/chosen": -417.2706298828125, "logps/rejected": -500.30523681640625, "loss": 0.2993, "rewards/chosen": 0.9728646278381348, "rewards/margins": 5.568577289581299, "rewards/rejected": -4.595712661743164, "step": 11086 }, { "epoch": 0.5876553679802825, "grad_norm": 45.25, "kl": 2.8706893920898438, "learning_rate": 5e-07, "logits/chosen": -2888145.2, "logits/rejected": -1454863.5, "logps/chosen": -125.7978515625, "logps/rejected": -175.3860066731771, "loss": 0.43, "rewards/chosen": 0.03746490478515625, "rewards/margins": 2.0228862444559734, "rewards/rejected": -1.9854213396708171, "step": 11087 }, { "epoch": 0.5877083719820847, "grad_norm": 30.25, "kl": 1.2985286712646484, "learning_rate": 5e-07, "logits/chosen": 1150532.5, "logits/rejected": -10456787.0, "logps/chosen": -60.15380859375, "logps/rejected": -421.6480712890625, "loss": 0.2161, "rewards/chosen": 1.0639151334762573, "rewards/margins": 3.778349757194519, "rewards/rejected": -2.7144346237182617, "step": 11088 }, { "epoch": 0.5877613759838868, "grad_norm": 39.25, "kl": 0.9376411437988281, "learning_rate": 5e-07, "logits/chosen": -16244040.0, "logits/rejected": -27041932.8, "logps/chosen": -338.1039632161458, "logps/rejected": -229.572509765625, "loss": 0.2059, "rewards/chosen": 1.0119982560475667, "rewards/margins": 2.8580797990163167, "rewards/rejected": -1.84608154296875, "step": 11089 }, { "epoch": 0.587814379985689, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21255840.0, "logits/rejected": -19595049.333333332, "logps/chosen": -239.23402404785156, "logps/rejected": -255.59027099609375, "loss": 0.1882, "rewards/chosen": 0.8275619745254517, "rewards/margins": 3.1442846854527793, "rewards/rejected": -2.3167227109273276, "step": 11090 }, { "epoch": 0.587867383987491, "grad_norm": 40.25, "kl": 3.769268035888672, "learning_rate": 5e-07, "logits/chosen": 1604161.5, "logits/rejected": -47804032.0, "logps/chosen": -213.97882080078125, "logps/rejected": -237.72786458333334, "loss": 0.2234, "rewards/chosen": 1.3976891040802002, "rewards/margins": 2.9212682247161865, "rewards/rejected": -1.5235791206359863, "step": 11091 }, { "epoch": 0.5879203879892931, "grad_norm": 57.75, "kl": 2.4749908447265625, "learning_rate": 5e-07, "logits/chosen": 46372234.666666664, "logits/rejected": -6173917.0, "logps/chosen": -552.839111328125, "logps/rejected": -190.49417114257812, "loss": 0.3762, "rewards/chosen": 0.8096958796183268, "rewards/margins": 3.700824419657389, "rewards/rejected": -2.8911285400390625, "step": 11092 }, { "epoch": 0.5879733919910953, "grad_norm": 39.0, "kl": 1.5581855773925781, "learning_rate": 5e-07, "logits/chosen": 6984634.666666667, "logits/rejected": 13469902.4, "logps/chosen": -47.924560546875, "logps/rejected": -330.054150390625, "loss": 0.3271, "rewards/chosen": -0.03235328197479248, "rewards/margins": 1.833721661567688, "rewards/rejected": -1.8660749435424804, "step": 11093 }, { "epoch": 0.5880263959928974, "grad_norm": 46.75, "kl": 0.034506797790527344, "learning_rate": 5e-07, "logits/chosen": -13094502.0, "logits/rejected": -125585680.0, "logps/chosen": -142.05532836914062, "logps/rejected": -509.7644348144531, "loss": 0.2484, "rewards/chosen": 0.8919844031333923, "rewards/margins": 2.475121557712555, "rewards/rejected": -1.5831371545791626, "step": 11094 }, { "epoch": 0.5880793999946996, "grad_norm": 38.0, "kl": 1.2817316055297852, "learning_rate": 5e-07, "logits/chosen": -9528328.0, "logits/rejected": -57423961.6, "logps/chosen": -255.12799072265625, "logps/rejected": -243.96943359375, "loss": 0.2253, "rewards/chosen": 1.229906400044759, "rewards/margins": 3.324193127950032, "rewards/rejected": -2.0942867279052733, "step": 11095 }, { "epoch": 0.5881324039965017, "grad_norm": 42.5, "kl": 1.5562820434570312, "learning_rate": 5e-07, "logits/chosen": -41461252.0, "logits/rejected": -43393380.0, "logps/chosen": -318.9991455078125, "logps/rejected": -468.9407958984375, "loss": 0.2186, "rewards/chosen": 0.9902616739273071, "rewards/margins": 3.3912242650985718, "rewards/rejected": -2.4009625911712646, "step": 11096 }, { "epoch": 0.5881854079983039, "grad_norm": 37.25, "kl": 0.6357612609863281, "learning_rate": 5e-07, "logits/chosen": 908660.375, "logits/rejected": -20442434.0, "logps/chosen": -129.022705078125, "logps/rejected": -255.7877960205078, "loss": 0.2827, "rewards/chosen": 0.1389995962381363, "rewards/margins": 2.7521830946207047, "rewards/rejected": -2.6131834983825684, "step": 11097 }, { "epoch": 0.588238412000106, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37209464.0, "logits/rejected": -4667593.6, "logps/chosen": -370.8251139322917, "logps/rejected": -197.96278076171876, "loss": 0.3111, "rewards/chosen": 0.14302422602971396, "rewards/margins": 1.7145624836285909, "rewards/rejected": -1.571538257598877, "step": 11098 }, { "epoch": 0.5882914160019082, "grad_norm": 36.25, "kl": 0.27690887451171875, "learning_rate": 5e-07, "logits/chosen": 2907711.0, "logits/rejected": -44514045.333333336, "logps/chosen": -191.2734130859375, "logps/rejected": -228.07515462239584, "loss": 0.2698, "rewards/chosen": 0.675034761428833, "rewards/margins": 2.7801636854807534, "rewards/rejected": -2.1051289240519204, "step": 11099 }, { "epoch": 0.5883444200037102, "grad_norm": 66.5, "kl": 0.9146060943603516, "learning_rate": 5e-07, "logits/chosen": 511823.0, "logits/rejected": -46017640.0, "logps/chosen": -269.85821533203125, "logps/rejected": -693.349853515625, "loss": 0.1989, "rewards/chosen": 0.8198217749595642, "rewards/margins": 3.6389493346214294, "rewards/rejected": -2.8191275596618652, "step": 11100 }, { "epoch": 0.5883974240055124, "grad_norm": 32.75, "kl": 0.29050445556640625, "learning_rate": 5e-07, "logits/chosen": -154407376.0, "logits/rejected": -20944489.14285714, "logps/chosen": -160.29473876953125, "logps/rejected": -183.58721051897322, "loss": 0.1681, "rewards/chosen": 0.22127990424633026, "rewards/margins": 2.4119947403669357, "rewards/rejected": -2.1907148361206055, "step": 11101 }, { "epoch": 0.5884504280073145, "grad_norm": 45.5, "kl": 0.5821475982666016, "learning_rate": 5e-07, "logits/chosen": -25008858.666666668, "logits/rejected": -12117924.0, "logps/chosen": -247.77020263671875, "logps/rejected": -491.99554443359375, "loss": 0.2999, "rewards/chosen": 0.576331377029419, "rewards/margins": 3.291327714920044, "rewards/rejected": -2.714996337890625, "step": 11102 }, { "epoch": 0.5885034320091167, "grad_norm": 53.75, "kl": 4.491922378540039, "learning_rate": 5e-07, "logits/chosen": -33918552.0, "logits/rejected": -80361792.0, "logps/chosen": -299.34328206380206, "logps/rejected": -541.9545288085938, "loss": 0.4435, "rewards/chosen": 0.3255879084269206, "rewards/margins": 2.1126195589701333, "rewards/rejected": -1.787031650543213, "step": 11103 }, { "epoch": 0.5885564360109188, "grad_norm": 60.75, "kl": 0.7020053863525391, "learning_rate": 5e-07, "logits/chosen": -44016996.0, "logits/rejected": -15138812.0, "logps/chosen": -332.07147216796875, "logps/rejected": -378.98077392578125, "loss": 0.3763, "rewards/chosen": -0.0010155588388442993, "rewards/margins": 1.3145328611135483, "rewards/rejected": -1.3155484199523926, "step": 11104 }, { "epoch": 0.588609440012721, "grad_norm": 36.0, "kl": 1.0769424438476562, "learning_rate": 5e-07, "logits/chosen": -55804296.0, "logits/rejected": 33764032.0, "logps/chosen": -480.3554382324219, "logps/rejected": -591.040771484375, "loss": 0.2097, "rewards/chosen": 0.9410544633865356, "rewards/margins": 4.851768612861633, "rewards/rejected": -3.9107141494750977, "step": 11105 }, { "epoch": 0.5886624440145231, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21569944.0, "logits/rejected": -21543880.0, "logps/chosen": -154.97503662109375, "logps/rejected": -554.37255859375, "loss": 0.2306, "rewards/chosen": 0.5075733065605164, "rewards/margins": 3.371057689189911, "rewards/rejected": -2.8634843826293945, "step": 11106 }, { "epoch": 0.5887154480163252, "grad_norm": 36.25, "kl": 0.7428054809570312, "learning_rate": 5e-07, "logits/chosen": -81984616.0, "logits/rejected": -14539129.0, "logps/chosen": -222.48179626464844, "logps/rejected": -695.1367797851562, "loss": 0.2356, "rewards/chosen": 0.3758777678012848, "rewards/margins": 4.123974233865738, "rewards/rejected": -3.748096466064453, "step": 11107 }, { "epoch": 0.5887684520181273, "grad_norm": 54.25, "kl": 0.6053924560546875, "learning_rate": 5e-07, "logits/chosen": -56824217.6, "logits/rejected": -52175520.0, "logps/chosen": -411.962353515625, "logps/rejected": -372.1024576822917, "loss": 0.284, "rewards/chosen": 0.4425073146820068, "rewards/margins": 3.199555285771688, "rewards/rejected": -2.757047971089681, "step": 11108 }, { "epoch": 0.5888214560199295, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19241542.666666668, "logits/rejected": -58229612.8, "logps/chosen": -298.0507405598958, "logps/rejected": -429.910302734375, "loss": 0.1803, "rewards/chosen": 0.7989049752553304, "rewards/margins": 3.628252204259237, "rewards/rejected": -2.8293472290039063, "step": 11109 }, { "epoch": 0.5888744600217316, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88781194.66666667, "logits/rejected": -29691078.4, "logps/chosen": -469.8834635416667, "logps/rejected": -401.6032958984375, "loss": 0.3139, "rewards/chosen": 0.24122214317321777, "rewards/margins": 1.8207004070281982, "rewards/rejected": -1.5794782638549805, "step": 11110 }, { "epoch": 0.5889274640235338, "grad_norm": 39.0, "kl": 2.5319881439208984, "learning_rate": 5e-07, "logits/chosen": -7170212.0, "logits/rejected": -8033532.8, "logps/chosen": -220.5136515299479, "logps/rejected": -382.102099609375, "loss": 0.2019, "rewards/chosen": 0.817083994547526, "rewards/margins": 2.953532854715983, "rewards/rejected": -2.136448860168457, "step": 11111 }, { "epoch": 0.5889804680253359, "grad_norm": 52.0, "kl": 1.9424009323120117, "learning_rate": 5e-07, "logits/chosen": -867326.8571428572, "logits/rejected": -5988065.5, "logps/chosen": -159.97893415178572, "logps/rejected": -84.72176361083984, "loss": 0.4953, "rewards/chosen": -0.07256638152258736, "rewards/margins": 2.0092109526906694, "rewards/rejected": -2.081777334213257, "step": 11112 }, { "epoch": 0.5890334720271381, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 91529576.0, "logits/rejected": -42180528.0, "logps/chosen": -857.225830078125, "logps/rejected": -405.041259765625, "loss": 0.1665, "rewards/chosen": 0.27508240938186646, "rewards/margins": 3.2937254707018533, "rewards/rejected": -3.018643061319987, "step": 11113 }, { "epoch": 0.5890864760289402, "grad_norm": 43.5, "kl": 1.1155166625976562, "learning_rate": 5e-07, "logits/chosen": -15960553.0, "logits/rejected": -32072872.0, "logps/chosen": -300.6690979003906, "logps/rejected": -276.8797912597656, "loss": 0.2995, "rewards/chosen": 0.9507638216018677, "rewards/margins": 2.604660749435425, "rewards/rejected": -1.6538969278335571, "step": 11114 }, { "epoch": 0.5891394800307423, "grad_norm": 45.75, "kl": 0.006336212158203125, "learning_rate": 5e-07, "logits/chosen": 3672997.75, "logits/rejected": -31454164.57142857, "logps/chosen": -17.740745544433594, "logps/rejected": -322.99148995535717, "loss": 0.191, "rewards/chosen": 0.07444343715906143, "rewards/margins": 2.365685491689614, "rewards/rejected": -2.2912420545305525, "step": 11115 }, { "epoch": 0.5891924840325444, "grad_norm": 50.75, "kl": 0.79559326171875, "learning_rate": 5e-07, "logits/chosen": 47598000.0, "logits/rejected": -48537113.6, "logps/chosen": -430.5171305338542, "logps/rejected": -377.701025390625, "loss": 0.1624, "rewards/chosen": 1.0152665774027507, "rewards/margins": 3.4370137850443525, "rewards/rejected": -2.4217472076416016, "step": 11116 }, { "epoch": 0.5892454880343466, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4406674.0, "logits/rejected": -25360236.0, "logps/chosen": -177.5485076904297, "logps/rejected": -173.7175750732422, "loss": 0.3498, "rewards/chosen": -0.2983408570289612, "rewards/margins": 1.668904960155487, "rewards/rejected": -1.9672458171844482, "step": 11117 }, { "epoch": 0.5892984920361487, "grad_norm": 56.25, "kl": 3.101848602294922, "learning_rate": 5e-07, "logits/chosen": -21538876.0, "logits/rejected": -33708888.0, "logps/chosen": -112.9869384765625, "logps/rejected": -242.40542602539062, "loss": 0.274, "rewards/chosen": 0.7094601392745972, "rewards/margins": 2.332990050315857, "rewards/rejected": -1.6235299110412598, "step": 11118 }, { "epoch": 0.5893514960379509, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22913114.0, "logits/rejected": -24669868.0, "logps/chosen": -204.51876831054688, "logps/rejected": -276.082275390625, "loss": 0.2087, "rewards/chosen": 0.7799269556999207, "rewards/margins": 3.3683285117149353, "rewards/rejected": -2.5884015560150146, "step": 11119 }, { "epoch": 0.589404500039753, "grad_norm": 69.0, "kl": 0.8845329284667969, "learning_rate": 5e-07, "logits/chosen": -24224270.0, "logits/rejected": -31281184.0, "logps/chosen": -522.4486083984375, "logps/rejected": -216.69476318359375, "loss": 0.2316, "rewards/chosen": 0.8432220816612244, "rewards/margins": 3.0874552130699158, "rewards/rejected": -2.2442331314086914, "step": 11120 }, { "epoch": 0.5894575040415552, "grad_norm": 51.5, "kl": 1.9086122512817383, "learning_rate": 5e-07, "logits/chosen": 597657.1666666666, "logits/rejected": -8314872.5, "logps/chosen": -221.47623697916666, "logps/rejected": -111.74104309082031, "loss": 0.2941, "rewards/chosen": 0.9164742628733317, "rewards/margins": 2.359270970026652, "rewards/rejected": -1.4427967071533203, "step": 11121 }, { "epoch": 0.5895105080433573, "grad_norm": 68.0, "kl": 1.2215156555175781, "learning_rate": 5e-07, "logits/chosen": -104122336.0, "logits/rejected": -15306700.0, "logps/chosen": -682.609375, "logps/rejected": -214.991455078125, "loss": 0.3721, "rewards/chosen": 0.181365966796875, "rewards/margins": 1.4673843383789062, "rewards/rejected": -1.2860183715820312, "step": 11122 }, { "epoch": 0.5895635120451594, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15644883.0, "logits/rejected": -23421422.0, "logps/chosen": -105.05040740966797, "logps/rejected": -424.5357666015625, "loss": 0.2341, "rewards/chosen": 0.43231430649757385, "rewards/margins": 3.464635044336319, "rewards/rejected": -3.032320737838745, "step": 11123 }, { "epoch": 0.5896165160469615, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48648640.0, "logits/rejected": -21376948.0, "logps/chosen": -283.4363708496094, "logps/rejected": -398.5311686197917, "loss": 0.2126, "rewards/chosen": 0.15235061943531036, "rewards/margins": 2.2575615594784417, "rewards/rejected": -2.1052109400431314, "step": 11124 }, { "epoch": 0.5896695200487637, "grad_norm": 51.25, "kl": 2.576770782470703, "learning_rate": 5e-07, "logits/chosen": -52443912.0, "logps/chosen": -230.70819091796875, "loss": 0.5045, "rewards/chosen": 0.2734639048576355, "step": 11125 }, { "epoch": 0.5897225240505658, "grad_norm": 58.5, "kl": 0.3477439880371094, "learning_rate": 5e-07, "logits/chosen": -23348480.0, "logits/rejected": -29758838.0, "logps/chosen": -223.39453125, "logps/rejected": -262.1276550292969, "loss": 0.3974, "rewards/chosen": -0.007689118385314941, "rewards/margins": 2.5738357305526733, "rewards/rejected": -2.5815248489379883, "step": 11126 }, { "epoch": 0.589775528052368, "grad_norm": 49.25, "kl": 1.334646224975586, "learning_rate": 5e-07, "logits/chosen": -7136234.0, "logits/rejected": -14675298.666666666, "logps/chosen": -138.0051727294922, "logps/rejected": -556.1904296875, "loss": 0.1819, "rewards/chosen": 2.3665599822998047, "rewards/margins": 4.261832555135091, "rewards/rejected": -1.8952725728352864, "step": 11127 }, { "epoch": 0.5898285320541701, "grad_norm": 56.25, "kl": 2.1524105072021484, "learning_rate": 5e-07, "logits/chosen": -12053338.0, "logits/rejected": -32244972.0, "logps/chosen": -199.87362670898438, "logps/rejected": -247.83123779296875, "loss": 0.3228, "rewards/chosen": 0.6161131262779236, "rewards/margins": 2.8772887587547302, "rewards/rejected": -2.2611756324768066, "step": 11128 }, { "epoch": 0.5898815360559723, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45059280.0, "logits/rejected": -76004394.66666667, "logps/chosen": -399.8402587890625, "logps/rejected": -253.97078450520834, "loss": 0.314, "rewards/chosen": 0.26130692958831786, "rewards/margins": 2.9669245163599647, "rewards/rejected": -2.705617586771647, "step": 11129 }, { "epoch": 0.5899345400577743, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31353634.666666668, "logits/rejected": 9538507.2, "logps/chosen": -329.0347900390625, "logps/rejected": -414.033203125, "loss": 0.2253, "rewards/chosen": 1.0845515727996826, "rewards/margins": 3.2090739727020265, "rewards/rejected": -2.124522399902344, "step": 11130 }, { "epoch": 0.5899875440595765, "grad_norm": 52.75, "kl": 0.3635978698730469, "learning_rate": 5e-07, "logits/chosen": -8590551.0, "logits/rejected": -47251764.0, "logps/chosen": -235.70777893066406, "logps/rejected": -573.7034912109375, "loss": 0.2856, "rewards/chosen": 0.4729280471801758, "rewards/margins": 2.765486240386963, "rewards/rejected": -2.292558193206787, "step": 11131 }, { "epoch": 0.5900405480613786, "grad_norm": 60.75, "kl": 1.587799072265625, "learning_rate": 5e-07, "logits/chosen": -31895397.333333332, "logits/rejected": -49964968.0, "logps/chosen": -351.123046875, "logps/rejected": -338.2106628417969, "loss": 0.4066, "rewards/chosen": 0.4242049853006999, "rewards/margins": 1.925496021906535, "rewards/rejected": -1.501291036605835, "step": 11132 }, { "epoch": 0.5900935520631808, "grad_norm": 52.25, "kl": 3.380923271179199, "learning_rate": 5e-07, "logits/chosen": -15055137.333333334, "logits/rejected": 16827203.2, "logps/chosen": -1196.0360514322917, "logps/rejected": -339.494482421875, "loss": 0.2532, "rewards/chosen": 1.0808162689208984, "rewards/margins": 2.755611801147461, "rewards/rejected": -1.6747955322265624, "step": 11133 }, { "epoch": 0.5901465560649829, "grad_norm": 46.25, "kl": 0.25292205810546875, "learning_rate": 5e-07, "logits/chosen": -32395514.666666668, "logits/rejected": -12313060.8, "logps/chosen": -297.1582438151042, "logps/rejected": -145.765185546875, "loss": 0.2472, "rewards/chosen": 0.6752395629882812, "rewards/margins": 2.4991954803466796, "rewards/rejected": -1.8239559173583983, "step": 11134 }, { "epoch": 0.5901995600667851, "grad_norm": 52.5, "kl": 1.6995792388916016, "learning_rate": 5e-07, "logits/chosen": -39766708.0, "logits/rejected": -22054880.0, "logps/chosen": -297.3525085449219, "logps/rejected": -299.2300720214844, "loss": 0.2935, "rewards/chosen": 0.3694305419921875, "rewards/margins": 2.2476704120635986, "rewards/rejected": -1.8782398700714111, "step": 11135 }, { "epoch": 0.5902525640685872, "grad_norm": 50.75, "kl": 0.15373992919921875, "learning_rate": 5e-07, "logits/chosen": -16713456.0, "logits/rejected": -25893612.8, "logps/chosen": -373.5990397135417, "logps/rejected": -276.643798828125, "loss": 0.2655, "rewards/chosen": 0.29451855023701984, "rewards/margins": 2.1650051911671957, "rewards/rejected": -1.8704866409301757, "step": 11136 }, { "epoch": 0.5903055680703894, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30548576.0, "logits/rejected": -32326899.2, "logps/chosen": -351.5845133463542, "logps/rejected": -404.0295166015625, "loss": 0.2543, "rewards/chosen": 0.3305193583170573, "rewards/margins": 2.738156763712565, "rewards/rejected": -2.4076374053955076, "step": 11137 }, { "epoch": 0.5903585720721914, "grad_norm": 54.0, "kl": 1.27056884765625, "learning_rate": 5e-07, "logits/chosen": -110824793.6, "logits/rejected": -39081717.333333336, "logps/chosen": -1091.112890625, "logps/rejected": -368.1995849609375, "loss": 0.22, "rewards/chosen": 2.0400970458984373, "rewards/margins": 4.064940579732259, "rewards/rejected": -2.0248435338338218, "step": 11138 }, { "epoch": 0.5904115760739936, "grad_norm": 40.75, "kl": 0.352142333984375, "learning_rate": 5e-07, "logits/chosen": -63288536.0, "logits/rejected": -34496149.333333336, "logps/chosen": -305.1384582519531, "logps/rejected": -282.0597737630208, "loss": 0.2339, "rewards/chosen": 0.33701249957084656, "rewards/margins": 2.568723887205124, "rewards/rejected": -2.2317113876342773, "step": 11139 }, { "epoch": 0.5904645800757957, "grad_norm": 55.0, "kl": 1.9276847839355469, "learning_rate": 5e-07, "logits/chosen": -22926342.4, "logits/rejected": -5446668.0, "logps/chosen": -484.0646484375, "logps/rejected": -90.90066528320312, "loss": 0.3138, "rewards/chosen": 1.0799489974975587, "rewards/margins": 2.1593932469685875, "rewards/rejected": -1.0794442494710286, "step": 11140 }, { "epoch": 0.5905175840775979, "grad_norm": 54.25, "kl": 0.9942378997802734, "learning_rate": 5e-07, "logits/chosen": -36485517.71428572, "logits/rejected": -12058696.0, "logps/chosen": -214.44320242745536, "logps/rejected": -130.2398681640625, "loss": 0.4165, "rewards/chosen": 0.26526919433048796, "rewards/margins": 3.3990568263190135, "rewards/rejected": -3.1337876319885254, "step": 11141 }, { "epoch": 0.5905705880794, "grad_norm": 45.25, "kl": 0.43055152893066406, "learning_rate": 5e-07, "logits/chosen": -303704.0, "logits/rejected": -20322674.0, "logps/chosen": -340.87286376953125, "logps/rejected": -252.75003051757812, "loss": 0.2183, "rewards/chosen": 1.2270219326019287, "rewards/margins": 3.0927798748016357, "rewards/rejected": -1.865757942199707, "step": 11142 }, { "epoch": 0.5906235920812021, "grad_norm": 33.75, "kl": 1.4352455139160156, "learning_rate": 5e-07, "logits/chosen": -23693338.0, "logits/rejected": -21456622.0, "logps/chosen": -279.5696105957031, "logps/rejected": -550.27734375, "loss": 0.1735, "rewards/chosen": 1.1005204916000366, "rewards/margins": 4.579827189445496, "rewards/rejected": -3.479306697845459, "step": 11143 }, { "epoch": 0.5906765960830043, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27470972.0, "logits/rejected": -8169476.0, "logps/chosen": -470.3582763671875, "logps/rejected": -257.8967590332031, "loss": 0.2762, "rewards/chosen": 0.5743721127510071, "rewards/margins": 2.2345033288002014, "rewards/rejected": -1.6601312160491943, "step": 11144 }, { "epoch": 0.5907296000848063, "grad_norm": 49.25, "kl": 5.860141754150391, "learning_rate": 5e-07, "logits/chosen": -15272771.2, "logits/rejected": -13418013.333333334, "logps/chosen": -271.7676025390625, "logps/rejected": -146.9199015299479, "loss": 0.3467, "rewards/chosen": 0.5069299697875976, "rewards/margins": 2.3020740509033204, "rewards/rejected": -1.7951440811157227, "step": 11145 }, { "epoch": 0.5907826040866085, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81531808.0, "logits/rejected": -37551504.0, "logps/chosen": -563.4196166992188, "logps/rejected": -324.79986572265625, "loss": 0.1301, "rewards/chosen": 1.718348741531372, "rewards/margins": 3.951428174972534, "rewards/rejected": -2.233079433441162, "step": 11146 }, { "epoch": 0.5908356080884106, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84671096.0, "logits/rejected": -20832397.333333332, "logps/chosen": -381.700439453125, "logps/rejected": -308.5667724609375, "loss": 0.2032, "rewards/chosen": 0.2252655029296875, "rewards/margins": 2.7657179832458496, "rewards/rejected": -2.540452480316162, "step": 11147 }, { "epoch": 0.5908886120902128, "grad_norm": 40.75, "kl": 0.9046077728271484, "learning_rate": 5e-07, "logits/chosen": -10649134.0, "logits/rejected": -7725408.0, "logps/chosen": -276.44171142578125, "logps/rejected": -264.43182373046875, "loss": 0.2475, "rewards/chosen": 0.991828441619873, "rewards/margins": 2.7283132076263428, "rewards/rejected": -1.7364847660064697, "step": 11148 }, { "epoch": 0.5909416160920149, "grad_norm": 49.5, "kl": 2.4531402587890625, "learning_rate": 5e-07, "logits/chosen": -30699952.0, "logits/rejected": -30636646.0, "logps/chosen": -270.6741638183594, "logps/rejected": -190.4109344482422, "loss": 0.3056, "rewards/chosen": 0.3933466970920563, "rewards/margins": 2.7151979506015778, "rewards/rejected": -2.3218512535095215, "step": 11149 }, { "epoch": 0.5909946200938171, "grad_norm": 25.875, "kl": 0.99847412109375, "learning_rate": 5e-07, "logits/chosen": 1705603.6666666667, "logits/rejected": -44360438.4, "logps/chosen": -219.756591796875, "logps/rejected": -279.703857421875, "loss": 0.1861, "rewards/chosen": 1.7832492192586262, "rewards/margins": 3.6471465428670244, "rewards/rejected": -1.8638973236083984, "step": 11150 }, { "epoch": 0.5910476240956192, "grad_norm": 35.25, "kl": 0.23028135299682617, "learning_rate": 5e-07, "logits/chosen": -38001704.0, "logits/rejected": -20044354.666666668, "logps/chosen": -865.5848388671875, "logps/rejected": -149.43535359700522, "loss": 0.1229, "rewards/chosen": 1.8130066394805908, "rewards/margins": 4.167521079381308, "rewards/rejected": -2.3545144399007163, "step": 11151 }, { "epoch": 0.5911006280974214, "grad_norm": 40.0, "kl": 0.1888103485107422, "learning_rate": 5e-07, "logits/chosen": -20242808.0, "logits/rejected": -2597502.25, "logps/chosen": -344.69091796875, "logps/rejected": -165.38949584960938, "loss": 0.272, "rewards/chosen": 0.9863793253898621, "rewards/margins": 2.301393210887909, "rewards/rejected": -1.3150138854980469, "step": 11152 }, { "epoch": 0.5911536320992234, "grad_norm": 54.5, "kl": 1.8148040771484375, "learning_rate": 5e-07, "logits/chosen": 2885715.0, "logits/rejected": -9599213.0, "logps/chosen": -306.5915832519531, "logps/rejected": -262.3440856933594, "loss": 0.3757, "rewards/chosen": -0.1295224279165268, "rewards/margins": 2.140640154480934, "rewards/rejected": -2.270162582397461, "step": 11153 }, { "epoch": 0.5912066361010256, "grad_norm": 27.625, "kl": 1.3991432189941406, "learning_rate": 5e-07, "logits/chosen": -2873247.0, "logits/rejected": -20930864.0, "logps/chosen": -928.0983276367188, "logps/rejected": -175.14249674479166, "loss": 0.1652, "rewards/chosen": 2.4235475063323975, "rewards/margins": 4.034664074579875, "rewards/rejected": -1.6111165682474773, "step": 11154 }, { "epoch": 0.5912596401028277, "grad_norm": 35.0, "kl": 0.661468505859375, "learning_rate": 5e-07, "logits/chosen": -29550634.666666668, "logits/rejected": -26879750.4, "logps/chosen": -313.64182535807294, "logps/rejected": -588.534326171875, "loss": 0.2186, "rewards/chosen": 0.8670082092285156, "rewards/margins": 4.233664321899414, "rewards/rejected": -3.3666561126708983, "step": 11155 }, { "epoch": 0.5913126441046299, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72399237.33333333, "logits/rejected": -11910249.6, "logps/chosen": -291.57989501953125, "logps/rejected": -140.6973388671875, "loss": 0.23, "rewards/chosen": 0.8418407440185547, "rewards/margins": 2.6228105545043947, "rewards/rejected": -1.7809698104858398, "step": 11156 }, { "epoch": 0.591365648106432, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10033602.0, "logits/rejected": -21088473.14285714, "logps/chosen": -287.0819091796875, "logps/rejected": -262.1246337890625, "loss": 0.1509, "rewards/chosen": 2.0076844692230225, "rewards/margins": 4.1521996429988315, "rewards/rejected": -2.144515173775809, "step": 11157 }, { "epoch": 0.5914186521082342, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14596337.333333334, "logits/rejected": -31677334.4, "logps/chosen": -220.45638020833334, "logps/rejected": -322.91552734375, "loss": 0.224, "rewards/chosen": 0.5747477213541666, "rewards/margins": 2.6244908014933266, "rewards/rejected": -2.04974308013916, "step": 11158 }, { "epoch": 0.5914716561100363, "grad_norm": 49.5, "kl": 3.22698974609375, "learning_rate": 5e-07, "logits/chosen": -14260529.6, "logits/rejected": -42838656.0, "logps/chosen": -142.81605224609376, "logps/rejected": -533.923828125, "loss": 0.4182, "rewards/chosen": 0.17817507982254027, "rewards/margins": 2.116238701343536, "rewards/rejected": -1.938063621520996, "step": 11159 }, { "epoch": 0.5915246601118385, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55617258.666666664, "logits/rejected": 3898639.2, "logps/chosen": -623.3808186848959, "logps/rejected": -168.8498046875, "loss": 0.2226, "rewards/chosen": 1.0735626220703125, "rewards/margins": 2.93028564453125, "rewards/rejected": -1.8567230224609375, "step": 11160 }, { "epoch": 0.5915776641136405, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68418208.0, "logits/rejected": -70035334.4, "logps/chosen": -709.1348470052084, "logps/rejected": -205.0015625, "loss": 0.1599, "rewards/chosen": 1.5597802797953289, "rewards/margins": 4.102188650767008, "rewards/rejected": -2.5424083709716796, "step": 11161 }, { "epoch": 0.5916306681154427, "grad_norm": 36.25, "kl": 0.46512413024902344, "learning_rate": 5e-07, "logits/chosen": -25048284.0, "logits/rejected": -23824768.0, "logps/chosen": -193.82925415039062, "logps/rejected": -379.8545735677083, "loss": 0.2566, "rewards/chosen": 0.83475261926651, "rewards/margins": 2.451928675174713, "rewards/rejected": -1.6171760559082031, "step": 11162 }, { "epoch": 0.5916836721172448, "grad_norm": 63.5, "kl": 2.4577903747558594, "learning_rate": 5e-07, "logits/chosen": -6800713.0, "logits/rejected": -14140042.0, "logps/chosen": -243.3518829345703, "logps/rejected": -146.78631591796875, "loss": 0.3506, "rewards/chosen": 0.4295949339866638, "rewards/margins": 1.6306487917900085, "rewards/rejected": -1.2010538578033447, "step": 11163 }, { "epoch": 0.591736676119047, "grad_norm": 55.25, "kl": 0.6106367111206055, "learning_rate": 5e-07, "logits/chosen": -49777578.666666664, "logits/rejected": -76739161.6, "logps/chosen": -508.3008219401042, "logps/rejected": -544.140625, "loss": 0.2274, "rewards/chosen": 0.6446837584177653, "rewards/margins": 2.533014980951945, "rewards/rejected": -1.8883312225341797, "step": 11164 }, { "epoch": 0.5917896801208491, "grad_norm": 28.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7254632.0, "logits/rejected": -20465728.0, "logps/chosen": -384.4253845214844, "logps/rejected": -326.09340122767856, "loss": 0.0884, "rewards/chosen": 1.7057465314865112, "rewards/margins": 4.728493026324681, "rewards/rejected": -3.0227464948381697, "step": 11165 }, { "epoch": 0.5918426841226513, "grad_norm": 59.0, "kl": 0.9515705108642578, "learning_rate": 5e-07, "logits/chosen": -60380985.6, "logits/rejected": -19353918.666666668, "logps/chosen": -203.87132568359374, "logps/rejected": -327.3610432942708, "loss": 0.3809, "rewards/chosen": 0.202705979347229, "rewards/margins": 1.8091554244359334, "rewards/rejected": -1.6064494450887044, "step": 11166 }, { "epoch": 0.5918956881244534, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2642919.0, "logits/rejected": -29820316.0, "logps/chosen": -252.44439697265625, "logps/rejected": -346.9162902832031, "loss": 0.3129, "rewards/chosen": -0.1397686004638672, "rewards/margins": 2.735219955444336, "rewards/rejected": -2.874988555908203, "step": 11167 }, { "epoch": 0.5919486921262556, "grad_norm": 61.75, "kl": 1.2288970947265625, "learning_rate": 5e-07, "logits/chosen": -72378310.4, "logits/rejected": 14852952.0, "logps/chosen": -494.40224609375, "logps/rejected": -144.6997273763021, "loss": 0.3834, "rewards/chosen": 0.6067192077636718, "rewards/margins": 1.8377487182617187, "rewards/rejected": -1.2310295104980469, "step": 11168 }, { "epoch": 0.5920016961280576, "grad_norm": 44.5, "kl": 1.7615432739257812, "learning_rate": 5e-07, "logits/chosen": -31854992.0, "logits/rejected": -19726558.666666668, "logps/chosen": -322.973828125, "logps/rejected": -250.84037272135416, "loss": 0.2229, "rewards/chosen": 1.0207173347473144, "rewards/margins": 3.3401011149088538, "rewards/rejected": -2.3193837801615396, "step": 11169 }, { "epoch": 0.5920547001298598, "grad_norm": 38.0, "kl": 1.4979209899902344, "learning_rate": 5e-07, "logits/chosen": 10138616.0, "logits/rejected": -20001573.333333332, "logps/chosen": -31.06389617919922, "logps/rejected": -459.2869873046875, "loss": 0.186, "rewards/chosen": 0.5402604341506958, "rewards/margins": 3.1999897559483848, "rewards/rejected": -2.659729321797689, "step": 11170 }, { "epoch": 0.5921077041316619, "grad_norm": 45.5, "kl": 1.1425418853759766, "learning_rate": 5e-07, "logits/chosen": -29121914.666666668, "logits/rejected": 11220974.0, "logps/chosen": -178.23551432291666, "logps/rejected": -550.4572143554688, "loss": 0.3731, "rewards/chosen": 0.20898024241129556, "rewards/margins": 3.5476979414621987, "rewards/rejected": -3.3387176990509033, "step": 11171 }, { "epoch": 0.5921607081334641, "grad_norm": 45.25, "kl": 1.4806671142578125, "learning_rate": 5e-07, "logits/chosen": -22278328.0, "logits/rejected": -27041062.4, "logps/chosen": -209.23685709635416, "logps/rejected": -269.601416015625, "loss": 0.2126, "rewards/chosen": 0.5146691004435221, "rewards/margins": 2.930706278483073, "rewards/rejected": -2.416037178039551, "step": 11172 }, { "epoch": 0.5922137121352662, "grad_norm": 43.75, "kl": 4.917819023132324, "learning_rate": 5e-07, "logits/chosen": -47646201.6, "logits/rejected": -3611924.0, "logps/chosen": -666.74765625, "logps/rejected": -268.8640950520833, "loss": 0.3243, "rewards/chosen": 1.0266963005065919, "rewards/margins": 3.3907875378926597, "rewards/rejected": -2.364091237386068, "step": 11173 }, { "epoch": 0.5922667161370684, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35161904.0, "logits/rejected": -38069632.0, "logps/chosen": -431.16058349609375, "logps/rejected": -414.27813720703125, "loss": 0.2534, "rewards/chosen": 0.43285274505615234, "rewards/margins": 3.1551897525787354, "rewards/rejected": -2.722337007522583, "step": 11174 }, { "epoch": 0.5923197201388705, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40642556.0, "logits/rejected": -16819372.0, "logps/chosen": -275.4422302246094, "logps/rejected": -267.41015625, "loss": 0.2148, "rewards/chosen": 0.8279056549072266, "rewards/margins": 3.938532829284668, "rewards/rejected": -3.1106271743774414, "step": 11175 }, { "epoch": 0.5923727241406727, "grad_norm": 43.75, "kl": 0.6373977661132812, "learning_rate": 5e-07, "logits/chosen": -82252517.33333333, "logits/rejected": -8043073.6, "logps/chosen": -460.8438313802083, "logps/rejected": -272.9943603515625, "loss": 0.3158, "rewards/chosen": -0.07368979851404826, "rewards/margins": 1.66214413245519, "rewards/rejected": -1.7358339309692383, "step": 11176 }, { "epoch": 0.5924257281424747, "grad_norm": 83.5, "kl": 1.3520889282226562, "learning_rate": 5e-07, "logits/chosen": -32751136.0, "logits/rejected": -74605.75, "logps/chosen": -298.97959391276044, "logps/rejected": -332.614892578125, "loss": 0.3256, "rewards/chosen": 0.31911730766296387, "rewards/margins": 1.5466725826263428, "rewards/rejected": -1.227555274963379, "step": 11177 }, { "epoch": 0.5924787321442769, "grad_norm": 47.0, "kl": 6.028108596801758, "learning_rate": 5e-07, "logits/chosen": -24903650.666666668, "logits/rejected": 1704682.0, "logps/chosen": -566.5098876953125, "logps/rejected": -410.9254150390625, "loss": 0.3144, "rewards/chosen": 1.5239640871683757, "rewards/margins": 3.224521080652873, "rewards/rejected": -1.700556993484497, "step": 11178 }, { "epoch": 0.592531736146079, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17611138.0, "logits/rejected": 62353552.0, "logps/chosen": -112.58111572265625, "logps/rejected": -449.2633972167969, "loss": 0.3024, "rewards/chosen": 0.21470804512500763, "rewards/margins": 2.157565787434578, "rewards/rejected": -1.9428577423095703, "step": 11179 }, { "epoch": 0.5925847401478812, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51246040.0, "logits/rejected": -33512504.0, "logps/chosen": -306.46636962890625, "logps/rejected": -341.7316080729167, "loss": 0.2452, "rewards/chosen": 0.6868057250976562, "rewards/margins": 2.405704816182454, "rewards/rejected": -1.718899091084798, "step": 11180 }, { "epoch": 0.5926377441496833, "grad_norm": 45.5, "kl": 2.369291305541992, "learning_rate": 5e-07, "logits/chosen": -32971842.0, "logits/rejected": 26887412.0, "logps/chosen": -287.3297119140625, "logps/rejected": -295.6568908691406, "loss": 0.29, "rewards/chosen": 0.6186496615409851, "rewards/margins": 2.058082640171051, "rewards/rejected": -1.439432978630066, "step": 11181 }, { "epoch": 0.5926907481514855, "grad_norm": 42.5, "kl": 1.3069915771484375, "learning_rate": 5e-07, "logits/chosen": -21673496.0, "logits/rejected": -10507752.0, "logps/chosen": -523.828857421875, "logps/rejected": -260.546826171875, "loss": 0.2341, "rewards/chosen": 0.7616740067799886, "rewards/margins": 2.756632502873739, "rewards/rejected": -1.99495849609375, "step": 11182 }, { "epoch": 0.5927437521532876, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24396096.0, "logits/rejected": -8260105.6, "logps/chosen": -507.6829427083333, "logps/rejected": -204.1322265625, "loss": 0.2433, "rewards/chosen": 0.20532532533009848, "rewards/margins": 2.3346469958623253, "rewards/rejected": -2.1293216705322267, "step": 11183 }, { "epoch": 0.5927967561550898, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55973317.333333336, "logits/rejected": -29982092.8, "logps/chosen": -939.2550455729166, "logps/rejected": -458.756591796875, "loss": 0.1904, "rewards/chosen": 1.2300994396209717, "rewards/margins": 3.6935052394866945, "rewards/rejected": -2.463405799865723, "step": 11184 }, { "epoch": 0.5928497601568918, "grad_norm": 35.75, "kl": 0.9147300720214844, "learning_rate": 5e-07, "logits/chosen": -30615634.666666668, "logits/rejected": -37578316.8, "logps/chosen": -287.9785970052083, "logps/rejected": -343.4065673828125, "loss": 0.2052, "rewards/chosen": 0.8908398946126302, "rewards/margins": 3.459266026814779, "rewards/rejected": -2.5684261322021484, "step": 11185 }, { "epoch": 0.592902764158694, "grad_norm": 46.0, "kl": 0.0587921142578125, "learning_rate": 5e-07, "logits/chosen": -61289344.0, "logits/rejected": -17511144.0, "logps/chosen": -380.73602294921875, "logps/rejected": -277.46734619140625, "loss": 0.2436, "rewards/chosen": 0.5616927742958069, "rewards/margins": 3.1460371613502502, "rewards/rejected": -2.5843443870544434, "step": 11186 }, { "epoch": 0.5929557681604961, "grad_norm": 51.75, "kl": 0.5235681533813477, "learning_rate": 5e-07, "logits/chosen": -11008587.2, "logits/rejected": -38285506.666666664, "logps/chosen": -173.2733154296875, "logps/rejected": -614.9867350260416, "loss": 0.2602, "rewards/chosen": 0.4830780982971191, "rewards/margins": 4.579518731435139, "rewards/rejected": -4.0964406331380205, "step": 11187 }, { "epoch": 0.5930087721622983, "grad_norm": 44.75, "kl": 2.9744701385498047, "learning_rate": 5e-07, "logits/chosen": -9843180.57142857, "logits/rejected": -55584504.0, "logps/chosen": -277.0653773716518, "logps/rejected": -269.0133361816406, "loss": 0.4166, "rewards/chosen": 0.6027300698416573, "rewards/margins": 2.746852431978498, "rewards/rejected": -2.144122362136841, "step": 11188 }, { "epoch": 0.5930617761641004, "grad_norm": 54.75, "kl": 1.3575897216796875, "learning_rate": 5e-07, "logits/chosen": -18046620.0, "logits/rejected": 607064.6875, "logps/chosen": -706.5826416015625, "logps/rejected": -82.4910888671875, "loss": 0.2238, "rewards/chosen": 1.2013095617294312, "rewards/margins": 2.9589556455612183, "rewards/rejected": -1.757646083831787, "step": 11189 }, { "epoch": 0.5931147801659026, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20656140.0, "logits/rejected": -30060074.666666668, "logps/chosen": -204.84280395507812, "logps/rejected": -403.73046875, "loss": 0.1977, "rewards/chosen": 0.31226846575737, "rewards/margins": 2.576609343290329, "rewards/rejected": -2.264340877532959, "step": 11190 }, { "epoch": 0.5931677841677047, "grad_norm": 55.75, "kl": 1.8727874755859375, "learning_rate": 5e-07, "logits/chosen": -18532098.666666668, "logits/rejected": -6825863.0, "logps/chosen": -433.3550618489583, "logps/rejected": -209.40367126464844, "loss": 0.3592, "rewards/chosen": 0.8808917999267578, "rewards/margins": 2.2233734130859375, "rewards/rejected": -1.3424816131591797, "step": 11191 }, { "epoch": 0.5932207881695069, "grad_norm": 42.5, "kl": 1.7304582595825195, "learning_rate": 5e-07, "logits/chosen": 10569315.2, "logits/rejected": -88862581.33333333, "logps/chosen": -106.1985595703125, "logps/rejected": -180.23624674479166, "loss": 0.4337, "rewards/chosen": 0.10985673666000366, "rewards/margins": 1.1503886898358662, "rewards/rejected": -1.0405319531758626, "step": 11192 }, { "epoch": 0.5932737921713089, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69332659.2, "logits/rejected": -15786089.333333334, "logps/chosen": -566.58955078125, "logps/rejected": -248.5914510091146, "loss": 0.3238, "rewards/chosen": 0.3821081638336182, "rewards/margins": 2.0927592436472575, "rewards/rejected": -1.7106510798136394, "step": 11193 }, { "epoch": 0.593326796173111, "grad_norm": 52.5, "kl": 0.9293651580810547, "learning_rate": 5e-07, "logits/chosen": -10160808.0, "logits/rejected": -41254000.0, "logps/chosen": -531.1133422851562, "logps/rejected": -275.38014729817706, "loss": 0.133, "rewards/chosen": 1.8333282470703125, "rewards/margins": 3.8649023373921714, "rewards/rejected": -2.031574090321859, "step": 11194 }, { "epoch": 0.5933798001749132, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55704256.0, "logits/rejected": -22675104.0, "logps/chosen": -627.1578369140625, "logps/rejected": -285.07352120535717, "loss": 0.1646, "rewards/chosen": -0.16887207329273224, "rewards/margins": 2.617833216275488, "rewards/rejected": -2.78670528956822, "step": 11195 }, { "epoch": 0.5934328041767153, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31060146.0, "logits/rejected": -50611853.71428572, "logps/chosen": -286.9032897949219, "logps/rejected": -307.878662109375, "loss": 0.2036, "rewards/chosen": 0.303598016500473, "rewards/margins": 2.2667992668492456, "rewards/rejected": -1.9632012503487724, "step": 11196 }, { "epoch": 0.5934858081785175, "grad_norm": 39.5, "kl": 2.052978515625, "learning_rate": 5e-07, "logits/chosen": -19884674.0, "logits/rejected": -36876048.0, "logps/chosen": -216.1321258544922, "logps/rejected": -400.836669921875, "loss": 0.3293, "rewards/chosen": 0.5265373587608337, "rewards/margins": 2.0806578993797302, "rewards/rejected": -1.5541205406188965, "step": 11197 }, { "epoch": 0.5935388121803196, "grad_norm": 64.5, "kl": 2.1303787231445312, "learning_rate": 5e-07, "logits/chosen": 35860202.666666664, "logits/rejected": -42169740.0, "logps/chosen": -418.3048909505208, "logps/rejected": -424.27203369140625, "loss": 0.4078, "rewards/chosen": 0.14607277512550354, "rewards/margins": 3.791609674692154, "rewards/rejected": -3.6455368995666504, "step": 11198 }, { "epoch": 0.5935918161821218, "grad_norm": 45.25, "kl": 0.8775177001953125, "learning_rate": 5e-07, "logits/chosen": -37283402.666666664, "logits/rejected": -30119112.0, "logps/chosen": -261.0606689453125, "logps/rejected": -405.09375, "loss": 0.2916, "rewards/chosen": 0.8171812693277994, "rewards/margins": 3.1184629599253335, "rewards/rejected": -2.301281690597534, "step": 11199 }, { "epoch": 0.5936448201839238, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45375541.333333336, "logits/rejected": -17609929.6, "logps/chosen": -500.3212890625, "logps/rejected": -238.448291015625, "loss": 0.2754, "rewards/chosen": 0.36459048589070636, "rewards/margins": 2.2590236981709797, "rewards/rejected": -1.8944332122802734, "step": 11200 }, { "epoch": 0.593697824185726, "grad_norm": 44.75, "kl": 0.2008056640625, "learning_rate": 5e-07, "logits/chosen": -37962432.0, "logits/rejected": -57521510.4, "logps/chosen": -352.2266438802083, "logps/rejected": -554.904345703125, "loss": 0.2215, "rewards/chosen": 0.5089170138041178, "rewards/margins": 2.8196942965189615, "rewards/rejected": -2.310777282714844, "step": 11201 }, { "epoch": 0.5937508281875281, "grad_norm": 53.0, "kl": 3.1175384521484375, "learning_rate": 5e-07, "logits/chosen": -27541866.666666668, "logits/rejected": -31665664.0, "logps/chosen": -504.6124267578125, "logps/rejected": -161.94921875, "loss": 0.2811, "rewards/chosen": 1.438944657643636, "rewards/margins": 3.4001895984013872, "rewards/rejected": -1.9612449407577515, "step": 11202 }, { "epoch": 0.5938038321893303, "grad_norm": 38.25, "kl": 0.08259868621826172, "learning_rate": 5e-07, "logits/chosen": -21454602.0, "logits/rejected": -33129780.0, "logps/chosen": -211.514404296875, "logps/rejected": -269.03765869140625, "loss": 0.2602, "rewards/chosen": 0.43320268392562866, "rewards/margins": 2.962606370449066, "rewards/rejected": -2.5294036865234375, "step": 11203 }, { "epoch": 0.5938568361911324, "grad_norm": 93.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20559505.333333332, "logits/rejected": -23214241.6, "logps/chosen": -264.8824869791667, "logps/rejected": -209.90390625, "loss": 0.2742, "rewards/chosen": 0.6617557605107626, "rewards/margins": 2.0383360942204796, "rewards/rejected": -1.3765803337097169, "step": 11204 }, { "epoch": 0.5939098401929346, "grad_norm": 74.5, "kl": 3.351696014404297, "learning_rate": 5e-07, "logits/chosen": -62421897.14285714, "logits/rejected": -16685087.0, "logps/chosen": -537.0912388392857, "logps/rejected": -141.35366821289062, "loss": 0.3464, "rewards/chosen": 0.9514132227216449, "rewards/margins": 1.553459414413997, "rewards/rejected": -0.6020461916923523, "step": 11205 }, { "epoch": 0.5939628441947367, "grad_norm": 58.75, "kl": 2.6938629150390625, "learning_rate": 5e-07, "logits/chosen": -22550762.666666668, "logits/rejected": -49323408.0, "logps/chosen": -345.6092529296875, "logps/rejected": -186.46876525878906, "loss": 0.3902, "rewards/chosen": 0.55348801612854, "rewards/margins": 1.3588423132896423, "rewards/rejected": -0.8053542971611023, "step": 11206 }, { "epoch": 0.5940158481965389, "grad_norm": 27.375, "kl": 2.275485038757324, "learning_rate": 5e-07, "logits/chosen": -27176.5, "logits/rejected": -34374296.0, "logps/chosen": -103.39278411865234, "logps/rejected": -291.35040283203125, "loss": 0.2368, "rewards/chosen": 0.6984536647796631, "rewards/margins": 3.433380603790283, "rewards/rejected": -2.73492693901062, "step": 11207 }, { "epoch": 0.5940688521983409, "grad_norm": 41.75, "kl": 0.5360202789306641, "learning_rate": 5e-07, "logits/chosen": -1445759.25, "logits/rejected": -30190956.0, "logps/chosen": -84.46135711669922, "logps/rejected": -478.0517272949219, "loss": 0.1819, "rewards/chosen": 1.0167417526245117, "rewards/margins": 4.124398946762085, "rewards/rejected": -3.1076571941375732, "step": 11208 }, { "epoch": 0.5941218562001431, "grad_norm": 32.25, "kl": 4.3859710693359375, "learning_rate": 5e-07, "logits/chosen": -26977008.0, "logits/rejected": -43330380.0, "logps/chosen": -263.39162190755206, "logps/rejected": -436.7880554199219, "loss": 0.3458, "rewards/chosen": 0.8677364190419515, "rewards/margins": 4.2788301308949785, "rewards/rejected": -3.4110937118530273, "step": 11209 }, { "epoch": 0.5941748602019452, "grad_norm": 47.5, "kl": 0.19865798950195312, "learning_rate": 5e-07, "logits/chosen": -41997980.0, "logits/rejected": -31359181.333333332, "logps/chosen": -323.7798767089844, "logps/rejected": -270.6781819661458, "loss": 0.243, "rewards/chosen": 1.044464111328125, "rewards/margins": 2.5620020230611162, "rewards/rejected": -1.5175379117329915, "step": 11210 }, { "epoch": 0.5942278642037474, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -104196736.0, "logits/rejected": -16055574.4, "logps/chosen": -724.34912109375, "logps/rejected": -224.7933837890625, "loss": 0.2394, "rewards/chosen": 0.6355977058410645, "rewards/margins": 2.4225508689880373, "rewards/rejected": -1.7869531631469726, "step": 11211 }, { "epoch": 0.5942808682055495, "grad_norm": 49.5, "kl": 0.045952796936035156, "learning_rate": 5e-07, "logits/chosen": -38889833.6, "logits/rejected": -36135757.333333336, "logps/chosen": -328.6141845703125, "logps/rejected": -402.5300699869792, "loss": 0.3475, "rewards/chosen": 0.15098658800125123, "rewards/margins": 2.444614326953888, "rewards/rejected": -2.2936277389526367, "step": 11212 }, { "epoch": 0.5943338722073517, "grad_norm": 46.25, "kl": 1.914337158203125, "learning_rate": 5e-07, "logits/chosen": -58585248.0, "logits/rejected": -11217281.333333334, "logps/chosen": -450.88818359375, "logps/rejected": -194.63541666666666, "loss": 0.2659, "rewards/chosen": 0.6357402801513672, "rewards/margins": 2.2697733243306475, "rewards/rejected": -1.6340330441792805, "step": 11213 }, { "epoch": 0.5943868762091538, "grad_norm": 49.75, "kl": 2.2606945037841797, "learning_rate": 5e-07, "logits/chosen": -37433190.4, "logits/rejected": -25567618.666666668, "logps/chosen": -289.6948974609375, "logps/rejected": -162.05178833007812, "loss": 0.4012, "rewards/chosen": -0.17818634510040282, "rewards/margins": 2.2957497358322145, "rewards/rejected": -2.473936080932617, "step": 11214 }, { "epoch": 0.594439880210956, "grad_norm": 48.75, "kl": 1.7501964569091797, "learning_rate": 5e-07, "logits/chosen": -21020640.0, "logits/rejected": -147024640.0, "logps/chosen": -250.52179827008928, "logps/rejected": -265.15093994140625, "loss": 0.3301, "rewards/chosen": 0.8789535249982562, "rewards/margins": 2.2271743501935686, "rewards/rejected": -1.3482208251953125, "step": 11215 }, { "epoch": 0.594492884212758, "grad_norm": 23.75, "kl": 5.11978816986084, "learning_rate": 5e-07, "logits/chosen": -26862098.666666668, "logits/rejected": -64441683.2, "logps/chosen": -351.029541015625, "logps/rejected": -128.00157470703124, "loss": 0.2252, "rewards/chosen": 1.539777119954427, "rewards/margins": 4.348524030049642, "rewards/rejected": -2.808746910095215, "step": 11216 }, { "epoch": 0.5945458882145602, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92941216.0, "logits/rejected": -27731771.42857143, "logps/chosen": -425.89117431640625, "logps/rejected": -345.99686104910717, "loss": 0.1331, "rewards/chosen": 0.633343517780304, "rewards/margins": 3.2049224972724915, "rewards/rejected": -2.5715789794921875, "step": 11217 }, { "epoch": 0.5945988922163623, "grad_norm": 54.0, "kl": 9.618651390075684, "learning_rate": 5e-07, "logits/chosen": -40188901.333333336, "logits/rejected": -34551990.4, "logps/chosen": -1397.0592447916667, "logps/rejected": -270.75341796875, "loss": 0.1553, "rewards/chosen": 2.898735682169596, "rewards/margins": 5.537886301676432, "rewards/rejected": -2.639150619506836, "step": 11218 }, { "epoch": 0.5946518962181645, "grad_norm": 42.25, "kl": 1.6403942108154297, "learning_rate": 5e-07, "logits/chosen": -4383714.4, "logits/rejected": -55612650.666666664, "logps/chosen": -649.761181640625, "logps/rejected": -270.0826009114583, "loss": 0.2143, "rewards/chosen": 1.4108027458190917, "rewards/margins": 3.348493480682373, "rewards/rejected": -1.9376907348632812, "step": 11219 }, { "epoch": 0.5947049002199666, "grad_norm": 38.75, "kl": 1.2530174255371094, "learning_rate": 5e-07, "logits/chosen": -17093723.2, "logits/rejected": -52234021.333333336, "logps/chosen": -184.4413330078125, "logps/rejected": -213.4244588216146, "loss": 0.3116, "rewards/chosen": 0.4855154037475586, "rewards/margins": 3.4772308349609373, "rewards/rejected": -2.991715431213379, "step": 11220 }, { "epoch": 0.5947579042217688, "grad_norm": 47.75, "kl": 5.202088356018066, "learning_rate": 5e-07, "logits/chosen": -22752770.285714287, "logits/rejected": -51513328.0, "logps/chosen": -177.95802525111608, "logps/rejected": -84.40328216552734, "loss": 0.3945, "rewards/chosen": 0.781914302280971, "rewards/margins": 2.154846736363002, "rewards/rejected": -1.3729324340820312, "step": 11221 }, { "epoch": 0.5948109082235709, "grad_norm": 63.25, "kl": 0.09322738647460938, "learning_rate": 5e-07, "logits/chosen": 7467959.333333333, "logits/rejected": -41841580.8, "logps/chosen": -150.39918009440103, "logps/rejected": -347.062109375, "loss": 0.2189, "rewards/chosen": 0.37889862060546875, "rewards/margins": 2.685210037231445, "rewards/rejected": -2.3063114166259764, "step": 11222 }, { "epoch": 0.594863912225373, "grad_norm": 46.25, "kl": 2.6748905181884766, "learning_rate": 5e-07, "logits/chosen": -25088944.0, "logits/rejected": -23667501.333333332, "logps/chosen": -396.468505859375, "logps/rejected": -647.35205078125, "loss": 0.3051, "rewards/chosen": 0.6911735534667969, "rewards/margins": 3.6666997273763022, "rewards/rejected": -2.9755261739095054, "step": 11223 }, { "epoch": 0.5949169162271751, "grad_norm": 55.5, "kl": 0.142852783203125, "learning_rate": 5e-07, "logits/chosen": -32231260.8, "logits/rejected": -20897446.666666668, "logps/chosen": -390.3432861328125, "logps/rejected": -218.12127685546875, "loss": 0.3516, "rewards/chosen": 0.33983821868896485, "rewards/margins": 1.7109411875406901, "rewards/rejected": -1.3711029688517253, "step": 11224 }, { "epoch": 0.5949699202289773, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43207232.0, "logits/rejected": -32264178.666666668, "logps/chosen": -229.96966552734375, "logps/rejected": -314.9697672526042, "loss": 0.2074, "rewards/chosen": 0.8968605399131775, "rewards/margins": 3.198551595211029, "rewards/rejected": -2.3016910552978516, "step": 11225 }, { "epoch": 0.5950229242307794, "grad_norm": 40.0, "kl": 1.8844528198242188, "learning_rate": 5e-07, "logits/chosen": -23310338.666666668, "logits/rejected": -14631382.4, "logps/chosen": -216.714111328125, "logps/rejected": -383.7115234375, "loss": 0.25, "rewards/chosen": -0.031196850041548412, "rewards/margins": 2.867434818049272, "rewards/rejected": -2.8986316680908204, "step": 11226 }, { "epoch": 0.5950759282325816, "grad_norm": 39.5, "kl": 0.14798927307128906, "learning_rate": 5e-07, "logits/chosen": -22583966.4, "logits/rejected": -39038362.666666664, "logps/chosen": -134.8197021484375, "logps/rejected": -147.9041951497396, "loss": 0.3272, "rewards/chosen": 0.11911454200744628, "rewards/margins": 2.856536213556925, "rewards/rejected": -2.737421671549479, "step": 11227 }, { "epoch": 0.5951289322343837, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9706284.0, "logits/rejected": -31521364.0, "logps/chosen": -246.07704162597656, "logps/rejected": -427.69091796875, "loss": 0.1914, "rewards/chosen": 1.1295098066329956, "rewards/margins": 3.9320133924484253, "rewards/rejected": -2.8025035858154297, "step": 11228 }, { "epoch": 0.5951819362361859, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47396636.0, "logits/rejected": -2529782.6666666665, "logps/chosen": -153.47171020507812, "logps/rejected": -302.3519287109375, "loss": 0.2115, "rewards/chosen": 0.9308362007141113, "rewards/margins": 2.51858647664388, "rewards/rejected": -1.5877502759297688, "step": 11229 }, { "epoch": 0.595234940237988, "grad_norm": 38.75, "kl": 0.5038461685180664, "learning_rate": 5e-07, "logits/chosen": -18373284.8, "logits/rejected": 2811887.0, "logps/chosen": -162.318310546875, "logps/rejected": -125.75437418619792, "loss": 0.3629, "rewards/chosen": 0.06503387093544007, "rewards/margins": 2.3224423309167226, "rewards/rejected": -2.2574084599812827, "step": 11230 }, { "epoch": 0.5952879442397901, "grad_norm": 63.5, "kl": 1.4060163497924805, "learning_rate": 5e-07, "logits/chosen": -54831080.0, "logits/rejected": -1264964.0, "logps/chosen": -839.0621948242188, "logps/rejected": -55.092926025390625, "loss": 0.2617, "rewards/chosen": 0.7793557643890381, "rewards/margins": 2.507762312889099, "rewards/rejected": -1.728406548500061, "step": 11231 }, { "epoch": 0.5953409482415922, "grad_norm": 41.75, "kl": 1.1050338745117188, "learning_rate": 5e-07, "logits/chosen": -26133804.8, "logits/rejected": 907912.5, "logps/chosen": -223.3405517578125, "logps/rejected": -60.618326822916664, "loss": 0.3998, "rewards/chosen": -0.03487812578678131, "rewards/margins": 1.742472458879153, "rewards/rejected": -1.7773505846659343, "step": 11232 }, { "epoch": 0.5953939522433944, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -970677.125, "logits/rejected": -17839384.0, "logps/chosen": -48.335975646972656, "logps/rejected": -320.59250895182294, "loss": 0.2425, "rewards/chosen": 0.13743790984153748, "rewards/margins": 2.2697462340195975, "rewards/rejected": -2.13230832417806, "step": 11233 }, { "epoch": 0.5954469562451965, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5143595.0, "logits/rejected": -52170336.0, "logps/chosen": -223.54151916503906, "logps/rejected": -497.5639343261719, "loss": 0.2616, "rewards/chosen": 0.6444287300109863, "rewards/margins": 3.0576529502868652, "rewards/rejected": -2.413224220275879, "step": 11234 }, { "epoch": 0.5954999602469987, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1146480.5, "logits/rejected": -57566982.4, "logps/chosen": -124.91483561197917, "logps/rejected": -391.4336181640625, "loss": 0.2067, "rewards/chosen": 0.5069086949030558, "rewards/margins": 3.0130968968073524, "rewards/rejected": -2.5061882019042967, "step": 11235 }, { "epoch": 0.5955529642488008, "grad_norm": 44.5, "kl": 0.9641075134277344, "learning_rate": 5e-07, "logits/chosen": -12662608.8, "logits/rejected": -25794928.0, "logps/chosen": -114.43310546875, "logps/rejected": -251.1169230143229, "loss": 0.3265, "rewards/chosen": 0.1863029956817627, "rewards/margins": 3.07366992632548, "rewards/rejected": -2.8873669306437173, "step": 11236 }, { "epoch": 0.595605968250603, "grad_norm": 48.25, "kl": 0.2678565979003906, "learning_rate": 5e-07, "logits/chosen": -22683948.8, "logits/rejected": -15096861.333333334, "logps/chosen": -224.9157470703125, "logps/rejected": -151.10919189453125, "loss": 0.278, "rewards/chosen": 0.8884719848632813, "rewards/margins": 2.5208022435506185, "rewards/rejected": -1.6323302586873372, "step": 11237 }, { "epoch": 0.595658972252405, "grad_norm": 37.25, "kl": 1.500274658203125, "learning_rate": 5e-07, "logits/chosen": 19146261.333333332, "logits/rejected": -5151142.4, "logps/chosen": -975.2945149739584, "logps/rejected": -256.980859375, "loss": 0.1716, "rewards/chosen": 1.9734880129496257, "rewards/margins": 4.368422285715739, "rewards/rejected": -2.3949342727661134, "step": 11238 }, { "epoch": 0.5957119762542072, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4817322.0, "logits/rejected": -14980057.142857144, "logps/chosen": -47.2406120300293, "logps/rejected": -304.89662388392856, "loss": 0.2637, "rewards/chosen": -0.18586578965187073, "rewards/margins": 1.758373690502984, "rewards/rejected": -1.9442394801548548, "step": 11239 }, { "epoch": 0.5957649802560093, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28386898.0, "logits/rejected": -14145649.142857144, "logps/chosen": -372.6403503417969, "logps/rejected": -376.9822474888393, "loss": 0.1451, "rewards/chosen": 0.28174135088920593, "rewards/margins": 3.0311857674803053, "rewards/rejected": -2.7494444165910994, "step": 11240 }, { "epoch": 0.5958179842578115, "grad_norm": 71.0, "kl": 1.5962066650390625, "learning_rate": 5e-07, "logits/chosen": -25680722.666666668, "logits/rejected": -49576376.0, "logps/chosen": -340.8060302734375, "logps/rejected": -265.3147277832031, "loss": 0.3395, "rewards/chosen": 0.4292287826538086, "rewards/margins": 3.6682987213134766, "rewards/rejected": -3.239069938659668, "step": 11241 }, { "epoch": 0.5958709882596136, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33055565.333333332, "logits/rejected": -31123724.8, "logps/chosen": -136.78656005859375, "logps/rejected": -270.18115234375, "loss": 0.2433, "rewards/chosen": 0.7395137945810953, "rewards/margins": 2.3953277746836346, "rewards/rejected": -1.655813980102539, "step": 11242 }, { "epoch": 0.5959239922614158, "grad_norm": 37.25, "kl": 1.5897903442382812, "learning_rate": 5e-07, "logits/chosen": -24656368.0, "logits/rejected": -19291206.4, "logps/chosen": -462.0907389322917, "logps/rejected": -151.521337890625, "loss": 0.1918, "rewards/chosen": 1.036759376525879, "rewards/margins": 3.207193946838379, "rewards/rejected": -2.1704345703125, "step": 11243 }, { "epoch": 0.5959769962632179, "grad_norm": 31.5, "kl": 6.400178909301758, "learning_rate": 5e-07, "logits/chosen": -9376611.42857143, "logits/rejected": -64849768.0, "logps/chosen": -318.3251953125, "logps/rejected": -494.659423828125, "loss": 0.4254, "rewards/chosen": 0.8064567702157157, "rewards/margins": 4.807345117841448, "rewards/rejected": -4.000888347625732, "step": 11244 }, { "epoch": 0.59603000026502, "grad_norm": 31.625, "kl": 2.0273122787475586, "learning_rate": 5e-07, "logits/chosen": -32743696.0, "logits/rejected": -35449974.4, "logps/chosen": -445.178466796875, "logps/rejected": -442.913818359375, "loss": 0.1135, "rewards/chosen": 1.0801867643992107, "rewards/margins": 4.968458477656046, "rewards/rejected": -3.888271713256836, "step": 11245 }, { "epoch": 0.5960830042668221, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10053260.0, "logits/rejected": -7200794.666666667, "logps/chosen": -262.67791748046875, "logps/rejected": -451.2259114583333, "loss": 0.2117, "rewards/chosen": 1.1748939752578735, "rewards/margins": 3.3133986393610635, "rewards/rejected": -2.13850466410319, "step": 11246 }, { "epoch": 0.5961360082686242, "grad_norm": 68.0, "kl": 0.033447265625, "learning_rate": 5e-07, "logits/chosen": -33672264.0, "logits/rejected": -6108391.5, "logps/chosen": -368.1437683105469, "logps/rejected": -451.1976013183594, "loss": 0.2651, "rewards/chosen": 0.6900825500488281, "rewards/margins": 3.0979971885681152, "rewards/rejected": -2.407914638519287, "step": 11247 }, { "epoch": 0.5961890122704264, "grad_norm": 62.25, "kl": 0.5192470550537109, "learning_rate": 5e-07, "logits/chosen": -31852613.333333332, "logits/rejected": 786635.75, "logps/chosen": -458.2788899739583, "logps/rejected": -96.928955078125, "loss": 0.3182, "rewards/chosen": 0.47355345884958905, "rewards/margins": 3.353981296221415, "rewards/rejected": -2.880427837371826, "step": 11248 }, { "epoch": 0.5962420162722285, "grad_norm": 91.5, "kl": 3.446125030517578, "learning_rate": 5e-07, "logits/chosen": 25048932.0, "logits/rejected": -8648709.0, "logps/chosen": -450.94219970703125, "logps/rejected": -90.50434875488281, "loss": 0.3778, "rewards/chosen": 0.7376102209091187, "rewards/margins": 1.121282547712326, "rewards/rejected": -0.3836723268032074, "step": 11249 }, { "epoch": 0.5962950202740307, "grad_norm": 53.75, "kl": 0.41182708740234375, "learning_rate": 5e-07, "logits/chosen": -12961416.0, "logits/rejected": -13639868.0, "logps/chosen": -323.4405029296875, "logps/rejected": -329.92498779296875, "loss": 0.2633, "rewards/chosen": 0.7572636127471923, "rewards/margins": 3.0420798778533937, "rewards/rejected": -2.284816265106201, "step": 11250 }, { "epoch": 0.5963480242758328, "grad_norm": 52.25, "kl": 2.2265968322753906, "learning_rate": 5e-07, "logits/chosen": -18924016.0, "logits/rejected": -6508396.0, "logps/chosen": -305.9640380859375, "logps/rejected": -309.5853678385417, "loss": 0.3384, "rewards/chosen": 0.5135597229003906, "rewards/margins": 1.9018308957417807, "rewards/rejected": -1.38827117284139, "step": 11251 }, { "epoch": 0.596401028277635, "grad_norm": 42.0, "kl": 1.053558349609375, "learning_rate": 5e-07, "logits/chosen": -12214981.0, "logits/rejected": -76836104.0, "logps/chosen": -144.59092712402344, "logps/rejected": -534.262451171875, "loss": 0.3337, "rewards/chosen": -0.2646772563457489, "rewards/margins": 3.231806069612503, "rewards/rejected": -3.496483325958252, "step": 11252 }, { "epoch": 0.596454032279437, "grad_norm": 55.5, "kl": 3.3589324951171875, "learning_rate": 5e-07, "logits/chosen": -8812336.0, "logits/rejected": -1082681.5, "logps/chosen": -207.90884399414062, "logps/rejected": -330.421630859375, "loss": 0.3629, "rewards/chosen": 0.5006385445594788, "rewards/margins": 2.112923800945282, "rewards/rejected": -1.6122852563858032, "step": 11253 }, { "epoch": 0.5965070362812392, "grad_norm": 56.75, "kl": 2.352354049682617, "learning_rate": 5e-07, "logits/chosen": -19873094.666666668, "logits/rejected": -14445227.0, "logps/chosen": -212.08744303385416, "logps/rejected": -228.10922241210938, "loss": 0.3398, "rewards/chosen": 0.5377340316772461, "rewards/margins": 2.667635917663574, "rewards/rejected": -2.129901885986328, "step": 11254 }, { "epoch": 0.5965600402830413, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60065260.8, "logits/rejected": -20179381.333333332, "logps/chosen": -233.3205322265625, "logps/rejected": -384.9423421223958, "loss": 0.3758, "rewards/chosen": -0.2962256669998169, "rewards/margins": 3.1173917849858603, "rewards/rejected": -3.4136174519856772, "step": 11255 }, { "epoch": 0.5966130442848435, "grad_norm": 46.0, "kl": 1.0838470458984375, "learning_rate": 5e-07, "logits/chosen": -52106502.4, "logits/rejected": -51959221.333333336, "logps/chosen": -830.32490234375, "logps/rejected": -393.0211181640625, "loss": 0.2444, "rewards/chosen": 1.0149259567260742, "rewards/margins": 4.15888532002767, "rewards/rejected": -3.143959363301595, "step": 11256 }, { "epoch": 0.5966660482866456, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39233456.0, "logits/rejected": -62987176.0, "logps/chosen": -399.7484130859375, "logps/rejected": -618.1473999023438, "loss": 0.2446, "rewards/chosen": 0.274514764547348, "rewards/margins": 3.1901274621486664, "rewards/rejected": -2.9156126976013184, "step": 11257 }, { "epoch": 0.5967190522884478, "grad_norm": 28.75, "kl": 3.0041351318359375, "learning_rate": 5e-07, "logits/chosen": 14059094.0, "logits/rejected": -37915138.28571428, "logps/chosen": -42.14769744873047, "logps/rejected": -223.24201311383928, "loss": 0.1773, "rewards/chosen": -0.10697708278894424, "rewards/margins": 2.0828863946454867, "rewards/rejected": -2.189863477434431, "step": 11258 }, { "epoch": 0.5967720562902499, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32731980.0, "logits/rejected": -22785032.0, "logps/chosen": -236.17486572265625, "logps/rejected": -422.2459411621094, "loss": 0.253, "rewards/chosen": 0.36003628373146057, "rewards/margins": 3.2435041964054108, "rewards/rejected": -2.88346791267395, "step": 11259 }, { "epoch": 0.5968250602920521, "grad_norm": 41.75, "kl": 0.1026296615600586, "learning_rate": 5e-07, "logits/chosen": -29559126.0, "logits/rejected": -20011824.0, "logps/chosen": -95.89728546142578, "logps/rejected": -167.01510620117188, "loss": 0.3224, "rewards/chosen": 0.23377324640750885, "rewards/margins": 2.022295728325844, "rewards/rejected": -1.788522481918335, "step": 11260 }, { "epoch": 0.5968780642938541, "grad_norm": 36.25, "kl": 0.4464759826660156, "learning_rate": 5e-07, "logits/chosen": -70703466.66666667, "logits/rejected": -20058835.2, "logps/chosen": -114.07810465494792, "logps/rejected": -298.8718017578125, "loss": 0.2365, "rewards/chosen": 0.04759800930817922, "rewards/margins": 3.1068223287661874, "rewards/rejected": -3.059224319458008, "step": 11261 }, { "epoch": 0.5969310682956563, "grad_norm": 58.0, "kl": 0.06256484985351562, "learning_rate": 5e-07, "logits/chosen": -58513504.0, "logits/rejected": -24935233.6, "logps/chosen": -543.3732096354166, "logps/rejected": -330.4658447265625, "loss": 0.323, "rewards/chosen": 0.16524505615234375, "rewards/margins": 1.8254987716674804, "rewards/rejected": -1.6602537155151367, "step": 11262 }, { "epoch": 0.5969840722974584, "grad_norm": 39.0, "kl": 4.093730926513672, "learning_rate": 5e-07, "logits/chosen": -37617225.6, "logits/rejected": -15943042.666666666, "logps/chosen": -328.5380615234375, "logps/rejected": -164.9777628580729, "loss": 0.3011, "rewards/chosen": 1.0911730766296386, "rewards/margins": 3.0636346181233725, "rewards/rejected": -1.9724615414937336, "step": 11263 }, { "epoch": 0.5970370762992606, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57132569.6, "logits/rejected": -61485760.0, "logps/chosen": -364.4076171875, "logps/rejected": -554.8116861979166, "loss": 0.1956, "rewards/chosen": 1.1150884628295898, "rewards/margins": 3.6034480730692544, "rewards/rejected": -2.4883596102396646, "step": 11264 }, { "epoch": 0.5970900803010627, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56535240.0, "logits/rejected": -7828952.0, "logps/chosen": -568.7447509765625, "logps/rejected": -354.86474609375, "loss": 0.1985, "rewards/chosen": -0.14385832846164703, "rewards/margins": 2.6338904152313867, "rewards/rejected": -2.7777487436930337, "step": 11265 }, { "epoch": 0.5971430843028649, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64089956.0, "logits/rejected": -19859472.0, "logps/chosen": -437.6857604980469, "logps/rejected": -318.01318359375, "loss": 0.1246, "rewards/chosen": -0.372048944234848, "rewards/margins": 2.4472597454275404, "rewards/rejected": -2.8193086896623885, "step": 11266 }, { "epoch": 0.597196088304667, "grad_norm": 67.0, "kl": 0.9865741729736328, "learning_rate": 5e-07, "logits/chosen": -17819840.0, "logits/rejected": -8959673.0, "logps/chosen": -179.99819946289062, "logps/rejected": -128.71189880371094, "loss": 0.3869, "rewards/chosen": -0.1513446867465973, "rewards/margins": 1.2480809390544891, "rewards/rejected": -1.3994256258010864, "step": 11267 }, { "epoch": 0.5972490923064692, "grad_norm": 55.0, "kl": 1.864518165588379, "learning_rate": 5e-07, "logits/chosen": -14092682.666666666, "logits/rejected": -26054632.0, "logps/chosen": -280.5937906901042, "logps/rejected": -269.39532470703125, "loss": 0.3106, "rewards/chosen": 0.6869215170542399, "rewards/margins": 2.199058453241984, "rewards/rejected": -1.5121369361877441, "step": 11268 }, { "epoch": 0.5973020963082712, "grad_norm": 49.25, "kl": 0.5982437133789062, "learning_rate": 5e-07, "logits/chosen": -41532757.333333336, "logits/rejected": 1621694.0, "logps/chosen": -297.22906494140625, "logps/rejected": -214.778564453125, "loss": 0.2721, "rewards/chosen": 0.35202280680338544, "rewards/margins": 2.0767492930094402, "rewards/rejected": -1.7247264862060547, "step": 11269 }, { "epoch": 0.5973551003100734, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53251734.4, "logits/rejected": -9279650.0, "logps/chosen": -250.51591796875, "logps/rejected": -220.15958658854166, "loss": 0.3094, "rewards/chosen": 0.4732964992523193, "rewards/margins": 2.071307675043742, "rewards/rejected": -1.5980111757914226, "step": 11270 }, { "epoch": 0.5974081043118755, "grad_norm": 48.25, "kl": 0.5215854644775391, "learning_rate": 5e-07, "logits/chosen": -19546268.8, "logits/rejected": 5930252.0, "logps/chosen": -245.0929931640625, "logps/rejected": -170.69974772135416, "loss": 0.3295, "rewards/chosen": 0.4344949245452881, "rewards/margins": 2.230956029891968, "rewards/rejected": -1.7964611053466797, "step": 11271 }, { "epoch": 0.5974611083136777, "grad_norm": 41.25, "kl": 0.9535741806030273, "learning_rate": 5e-07, "logits/chosen": -2233419.0, "logits/rejected": -38071404.0, "logps/chosen": -200.59498596191406, "logps/rejected": -421.57427978515625, "loss": 0.2344, "rewards/chosen": 0.398487389087677, "rewards/margins": 3.1359151005744934, "rewards/rejected": -2.7374277114868164, "step": 11272 }, { "epoch": 0.5975141123154798, "grad_norm": 31.75, "kl": 0.06648635864257812, "learning_rate": 5e-07, "logits/chosen": -3796322.25, "logits/rejected": -61652176.0, "logps/chosen": -263.8935241699219, "logps/rejected": -567.5341796875, "loss": 0.2205, "rewards/chosen": 0.43814951181411743, "rewards/margins": 4.0447545647621155, "rewards/rejected": -3.606605052947998, "step": 11273 }, { "epoch": 0.597567116317282, "grad_norm": 59.0, "kl": 3.792008876800537, "learning_rate": 5e-07, "logits/chosen": 13577395.2, "logits/rejected": -29267874.666666668, "logps/chosen": -606.474658203125, "logps/rejected": -269.02459716796875, "loss": 0.3709, "rewards/chosen": 1.258596420288086, "rewards/margins": 2.46299409866333, "rewards/rejected": -1.2043976783752441, "step": 11274 }, { "epoch": 0.5976201203190841, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6863710.857142857, "logits/rejected": -26982700.0, "logps/chosen": -370.89048549107144, "logps/rejected": -720.404052734375, "loss": 0.3283, "rewards/chosen": 0.5846399579729352, "rewards/margins": 4.039608172007969, "rewards/rejected": -3.454968214035034, "step": 11275 }, { "epoch": 0.5976731243208863, "grad_norm": 45.0, "kl": 1.2828292846679688, "learning_rate": 5e-07, "logits/chosen": -20210126.4, "logits/rejected": -36068120.0, "logps/chosen": -576.30244140625, "logps/rejected": -363.7349039713542, "loss": 0.2223, "rewards/chosen": 1.1875593185424804, "rewards/margins": 3.2902624130249025, "rewards/rejected": -2.102703094482422, "step": 11276 }, { "epoch": 0.5977261283226883, "grad_norm": 53.25, "kl": 4.974063873291016, "learning_rate": 5e-07, "logits/chosen": -8705394.666666666, "logits/rejected": -49061272.0, "logps/chosen": -215.2968546549479, "logps/rejected": -447.68603515625, "loss": 0.4099, "rewards/chosen": 0.41144080956776935, "rewards/margins": 3.4938458998998008, "rewards/rejected": -3.0824050903320312, "step": 11277 }, { "epoch": 0.5977791323244905, "grad_norm": 41.5, "kl": 3.0461502075195312, "learning_rate": 5e-07, "logits/chosen": -5094668.0, "logits/rejected": -15912064.0, "logps/chosen": -295.8463134765625, "logps/rejected": -257.49945068359375, "loss": 0.2594, "rewards/chosen": 1.2930271625518799, "rewards/margins": 2.798749089241028, "rewards/rejected": -1.505721926689148, "step": 11278 }, { "epoch": 0.5978321363262926, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5256057.333333333, "logits/rejected": -57542630.4, "logps/chosen": -240.9295857747396, "logps/rejected": -475.8609375, "loss": 0.2224, "rewards/chosen": 0.6005765597025553, "rewards/margins": 2.5289874712626137, "rewards/rejected": -1.9284109115600585, "step": 11279 }, { "epoch": 0.5978851403280948, "grad_norm": 35.5, "kl": 3.200969696044922, "learning_rate": 5e-07, "logits/chosen": -5170201.5, "logits/rejected": -13819453.333333334, "logps/chosen": -263.8749694824219, "logps/rejected": -190.68902587890625, "loss": 0.2935, "rewards/chosen": 0.419551283121109, "rewards/margins": 1.8663642307122548, "rewards/rejected": -1.4468129475911458, "step": 11280 }, { "epoch": 0.5979381443298969, "grad_norm": 54.5, "kl": 0.2572669982910156, "learning_rate": 5e-07, "logits/chosen": -30220502.4, "logits/rejected": -76295130.66666667, "logps/chosen": -414.76416015625, "logps/rejected": -587.8352864583334, "loss": 0.2776, "rewards/chosen": 0.36737229824066164, "rewards/margins": 3.5127907832463583, "rewards/rejected": -3.1454184850056968, "step": 11281 }, { "epoch": 0.5979911483316991, "grad_norm": 49.25, "kl": 1.3813018798828125, "learning_rate": 5e-07, "logits/chosen": -4843116.0, "logits/rejected": -31218083.2, "logps/chosen": -316.57843017578125, "logps/rejected": -332.8064697265625, "loss": 0.2036, "rewards/chosen": 1.159016768137614, "rewards/margins": 3.7912261644999186, "rewards/rejected": -2.6322093963623048, "step": 11282 }, { "epoch": 0.5980441523335012, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26505298.0, "logits/rejected": -15339047.0, "logps/chosen": -403.14923095703125, "logps/rejected": -339.424072265625, "loss": 0.2729, "rewards/chosen": 0.313503623008728, "rewards/margins": 2.3925827741622925, "rewards/rejected": -2.0790791511535645, "step": 11283 }, { "epoch": 0.5980971563353034, "grad_norm": 66.0, "kl": 1.9340362548828125, "learning_rate": 5e-07, "logits/chosen": -67019077.333333336, "logits/rejected": -3061867.2, "logps/chosen": -365.7135416666667, "logps/rejected": -322.2692138671875, "loss": 0.2477, "rewards/chosen": 1.3637542724609375, "rewards/margins": 3.0433334350585937, "rewards/rejected": -1.6795791625976562, "step": 11284 }, { "epoch": 0.5981501603371054, "grad_norm": 41.0, "kl": 1.1361732482910156, "learning_rate": 5e-07, "logits/chosen": -26505917.333333332, "logits/rejected": -9802804.0, "logps/chosen": -386.4998372395833, "logps/rejected": -284.187060546875, "loss": 0.1854, "rewards/chosen": 1.9932376543680828, "rewards/margins": 3.583199469248454, "rewards/rejected": -1.5899618148803711, "step": 11285 }, { "epoch": 0.5982031643389076, "grad_norm": 38.5, "kl": 0.7851991653442383, "learning_rate": 5e-07, "logits/chosen": -57529356.0, "logits/rejected": -20405824.0, "logps/chosen": -313.811279296875, "logps/rejected": -325.6244201660156, "loss": 0.2238, "rewards/chosen": 0.8787273168563843, "rewards/margins": 3.7017520666122437, "rewards/rejected": -2.8230247497558594, "step": 11286 }, { "epoch": 0.5982561683407097, "grad_norm": 29.0, "kl": 1.5696277618408203, "learning_rate": 5e-07, "logits/chosen": -7001375.0, "logits/rejected": -18193628.0, "logps/chosen": -103.09591674804688, "logps/rejected": -268.70306396484375, "loss": 0.2606, "rewards/chosen": 0.4441061019897461, "rewards/margins": 3.4623045921325684, "rewards/rejected": -3.0181984901428223, "step": 11287 }, { "epoch": 0.5983091723425119, "grad_norm": 43.0, "kl": 1.4127540588378906, "learning_rate": 5e-07, "logits/chosen": -54513188.0, "logits/rejected": -29677040.0, "logps/chosen": -275.10504150390625, "logps/rejected": -536.0399780273438, "loss": 0.223, "rewards/chosen": 0.7368035912513733, "rewards/margins": 4.070840656757355, "rewards/rejected": -3.3340370655059814, "step": 11288 }, { "epoch": 0.598362176344314, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46502517.333333336, "logits/rejected": -7839997.6, "logps/chosen": -734.88720703125, "logps/rejected": -304.1226806640625, "loss": 0.1655, "rewards/chosen": 1.869767189025879, "rewards/margins": 3.8996137619018554, "rewards/rejected": -2.0298465728759765, "step": 11289 }, { "epoch": 0.5984151803461162, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51253189.333333336, "logits/rejected": -17865824.0, "logps/chosen": -369.1900227864583, "logps/rejected": -337.962939453125, "loss": 0.2225, "rewards/chosen": 0.03076883653799693, "rewards/margins": 2.896454871694247, "rewards/rejected": -2.86568603515625, "step": 11290 }, { "epoch": 0.5984681843479183, "grad_norm": 53.75, "kl": 0.5955896377563477, "learning_rate": 5e-07, "logits/chosen": -10098556.666666666, "logits/rejected": 25194170.0, "logps/chosen": -139.24527994791666, "logps/rejected": -324.0936279296875, "loss": 0.3865, "rewards/chosen": 0.36196847756703693, "rewards/margins": 1.493633468945821, "rewards/rejected": -1.1316649913787842, "step": 11291 }, { "epoch": 0.5985211883497205, "grad_norm": 49.25, "kl": 7.084104537963867, "learning_rate": 5e-07, "logits/chosen": -17876675.2, "logits/rejected": -1484146.3333333333, "logps/chosen": -458.245654296875, "logps/rejected": -100.22355143229167, "loss": 0.2954, "rewards/chosen": 1.5984846115112306, "rewards/margins": 3.9530386288960777, "rewards/rejected": -2.354554017384847, "step": 11292 }, { "epoch": 0.5985741923515225, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6669378.5, "logits/rejected": 3380030.25, "logps/chosen": -393.580810546875, "logps/rejected": -134.15061950683594, "loss": 0.283, "rewards/chosen": 0.8482521772384644, "rewards/margins": 2.5102890729904175, "rewards/rejected": -1.6620368957519531, "step": 11293 }, { "epoch": 0.5986271963533247, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72043984.0, "logits/rejected": -32998848.0, "logps/chosen": -537.8912760416666, "logps/rejected": -267.944873046875, "loss": 0.2703, "rewards/chosen": 0.33403321107228595, "rewards/margins": 2.146207245190938, "rewards/rejected": -1.8121740341186523, "step": 11294 }, { "epoch": 0.5986802003551268, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20609466.666666668, "logits/rejected": -25596689.6, "logps/chosen": -41.62663777669271, "logps/rejected": -188.87857666015626, "loss": 0.2796, "rewards/chosen": 0.222958505153656, "rewards/margins": 2.3593648314476012, "rewards/rejected": -2.1364063262939452, "step": 11295 }, { "epoch": 0.5987332043569289, "grad_norm": 54.25, "kl": 1.6323051452636719, "learning_rate": 5e-07, "logits/chosen": -54852176.0, "logits/rejected": -14176267.2, "logps/chosen": -414.8792724609375, "logps/rejected": -234.31884765625, "loss": 0.3281, "rewards/chosen": 0.262459933757782, "rewards/margins": 1.8324390292167663, "rewards/rejected": -1.5699790954589843, "step": 11296 }, { "epoch": 0.5987862083587311, "grad_norm": 46.5, "kl": 1.1947994232177734, "learning_rate": 5e-07, "logits/chosen": -28634090.666666668, "logits/rejected": -69462248.0, "logps/chosen": -409.7804361979167, "logps/rejected": -526.8640747070312, "loss": 0.2668, "rewards/chosen": 0.8804841041564941, "rewards/margins": 4.750450134277344, "rewards/rejected": -3.8699660301208496, "step": 11297 }, { "epoch": 0.5988392123605332, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13463445.333333334, "logits/rejected": 45972726.4, "logps/chosen": -151.87039184570312, "logps/rejected": -458.42724609375, "loss": 0.1928, "rewards/chosen": 0.5168743928273519, "rewards/margins": 4.216181262334188, "rewards/rejected": -3.699306869506836, "step": 11298 }, { "epoch": 0.5988922163623354, "grad_norm": 54.25, "kl": 0.08359146118164062, "learning_rate": 5e-07, "logits/chosen": -50670341.333333336, "logits/rejected": -14146699.2, "logps/chosen": -380.2881673177083, "logps/rejected": -170.9312744140625, "loss": 0.259, "rewards/chosen": 0.5066914955774943, "rewards/margins": 2.269284383455912, "rewards/rejected": -1.762592887878418, "step": 11299 }, { "epoch": 0.5989452203641374, "grad_norm": 59.25, "kl": 1.1439476013183594, "learning_rate": 5e-07, "logits/chosen": -48407916.8, "logits/rejected": -24026538.666666668, "logps/chosen": -452.91181640625, "logps/rejected": -275.53668212890625, "loss": 0.2474, "rewards/chosen": 0.8106228828430175, "rewards/margins": 3.0456683158874513, "rewards/rejected": -2.2350454330444336, "step": 11300 }, { "epoch": 0.5989982243659396, "grad_norm": 34.0, "kl": 0.9372959136962891, "learning_rate": 5e-07, "logits/chosen": -27301012.0, "logits/rejected": -17417376.0, "logps/chosen": -589.3901977539062, "logps/rejected": -326.2967529296875, "loss": 0.1545, "rewards/chosen": 1.6520031690597534, "rewards/margins": 4.559852957725525, "rewards/rejected": -2.9078497886657715, "step": 11301 }, { "epoch": 0.5990512283677417, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71350856.0, "logits/rejected": -477883.6875, "logps/chosen": -153.9429931640625, "logps/rejected": -267.04034423828125, "loss": 0.1912, "rewards/chosen": 0.8518060445785522, "rewards/margins": 3.94896137714386, "rewards/rejected": -3.0971553325653076, "step": 11302 }, { "epoch": 0.5991042323695439, "grad_norm": 44.0, "kl": 2.020979881286621, "learning_rate": 5e-07, "logits/chosen": -19007968.0, "logits/rejected": -22460701.333333332, "logps/chosen": -206.0447265625, "logps/rejected": -124.0847676595052, "loss": 0.3248, "rewards/chosen": 0.4954689979553223, "rewards/margins": 1.892965857187907, "rewards/rejected": -1.3974968592325847, "step": 11303 }, { "epoch": 0.599157236371346, "grad_norm": 41.25, "kl": 1.3137569427490234, "learning_rate": 5e-07, "logits/chosen": -14663739.2, "logits/rejected": -8396648.0, "logps/chosen": -330.4573486328125, "logps/rejected": -123.43387858072917, "loss": 0.3271, "rewards/chosen": 1.1080745697021483, "rewards/margins": 2.2348588943481444, "rewards/rejected": -1.126784324645996, "step": 11304 }, { "epoch": 0.5992102403731482, "grad_norm": 48.5, "kl": 0.3472404479980469, "learning_rate": 5e-07, "logits/chosen": -48166284.0, "logits/rejected": -19875892.0, "logps/chosen": -397.0640869140625, "logps/rejected": -214.2821807861328, "loss": 0.3034, "rewards/chosen": 0.6436634063720703, "rewards/margins": 1.82072114944458, "rewards/rejected": -1.1770577430725098, "step": 11305 }, { "epoch": 0.5992632443749503, "grad_norm": 60.0, "kl": 4.140026092529297, "learning_rate": 5e-07, "logits/chosen": -28361907.2, "logits/rejected": -31325866.666666668, "logps/chosen": -466.0400390625, "logps/rejected": -294.03509521484375, "loss": 0.3199, "rewards/chosen": 0.724721097946167, "rewards/margins": 3.556096855799357, "rewards/rejected": -2.83137575785319, "step": 11306 }, { "epoch": 0.5993162483767525, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21625522.666666668, "logits/rejected": -51570665.6, "logps/chosen": -271.47137451171875, "logps/rejected": -255.7772216796875, "loss": 0.2417, "rewards/chosen": 0.3250734210014343, "rewards/margins": 2.5299533724784853, "rewards/rejected": -2.204879951477051, "step": 11307 }, { "epoch": 0.5993692523785545, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24270316.0, "logits/rejected": -20147612.0, "logps/chosen": -163.0428466796875, "logps/rejected": -207.43637084960938, "loss": 0.3494, "rewards/chosen": -0.3398071229457855, "rewards/margins": 1.878920704126358, "rewards/rejected": -2.2187278270721436, "step": 11308 }, { "epoch": 0.5994222563803567, "grad_norm": 39.25, "kl": 1.9996452331542969, "learning_rate": 5e-07, "logits/chosen": -14442885.0, "logits/rejected": -11211554.0, "logps/chosen": -149.72396850585938, "logps/rejected": -309.7655029296875, "loss": 0.3481, "rewards/chosen": 0.0261240154504776, "rewards/margins": 2.2087512165308, "rewards/rejected": -2.1826272010803223, "step": 11309 }, { "epoch": 0.5994752603821588, "grad_norm": 36.5, "kl": 1.0656967163085938, "learning_rate": 5e-07, "logits/chosen": 7218217.6, "logits/rejected": -33084314.666666668, "logps/chosen": -178.1228759765625, "logps/rejected": -636.455810546875, "loss": 0.2155, "rewards/chosen": 0.9779319763183594, "rewards/margins": 5.18430773417155, "rewards/rejected": -4.20637575785319, "step": 11310 }, { "epoch": 0.599528264383961, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1139644.0, "logits/rejected": -33868620.8, "logps/chosen": -420.9195963541667, "logps/rejected": -391.318896484375, "loss": 0.2035, "rewards/chosen": 0.2711312770843506, "rewards/margins": 3.2763270854949953, "rewards/rejected": -3.0051958084106447, "step": 11311 }, { "epoch": 0.5995812683857631, "grad_norm": 56.25, "kl": 2.480320930480957, "learning_rate": 5e-07, "logits/chosen": -20354653.333333332, "logits/rejected": -49447068.0, "logps/chosen": -216.5616658528646, "logps/rejected": -243.04446411132812, "loss": 0.3566, "rewards/chosen": 0.7740946610768636, "rewards/margins": 2.213978131612142, "rewards/rejected": -1.4398834705352783, "step": 11312 }, { "epoch": 0.5996342723875653, "grad_norm": 53.75, "kl": 0.4657173156738281, "learning_rate": 5e-07, "logits/chosen": -38071912.0, "logits/rejected": -34435080.0, "logps/chosen": -311.5921936035156, "logps/rejected": -348.31689453125, "loss": 0.1824, "rewards/chosen": 1.1163074970245361, "rewards/margins": 3.672191619873047, "rewards/rejected": -2.5558841228485107, "step": 11313 }, { "epoch": 0.5996872763893674, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 49964328.0, "logits/rejected": -48011402.666666664, "logps/chosen": -544.8331298828125, "logps/rejected": -221.69059244791666, "loss": 0.1638, "rewards/chosen": 0.2787872552871704, "rewards/margins": 2.8733990589777627, "rewards/rejected": -2.5946118036905923, "step": 11314 }, { "epoch": 0.5997402803911696, "grad_norm": 58.0, "kl": 2.7561206817626953, "learning_rate": 5e-07, "logits/chosen": -35426821.333333336, "logits/rejected": -33088898.0, "logps/chosen": -396.6123046875, "logps/rejected": -178.8033447265625, "loss": 0.3926, "rewards/chosen": 0.37164847056070965, "rewards/margins": 2.528045097986857, "rewards/rejected": -2.1563966274261475, "step": 11315 }, { "epoch": 0.5997932843929716, "grad_norm": 135.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69668826.66666667, "logits/rejected": -36119696.0, "logps/chosen": -515.13818359375, "logps/rejected": -263.3848876953125, "loss": 0.2165, "rewards/chosen": 0.7855905691782633, "rewards/margins": 3.046349541346232, "rewards/rejected": -2.260758972167969, "step": 11316 }, { "epoch": 0.5998462883947738, "grad_norm": 59.25, "kl": 7.301719665527344, "learning_rate": 5e-07, "logits/chosen": 5663020.0, "logits/rejected": -13309389.0, "logps/chosen": -291.3594156901042, "logps/rejected": -326.95220947265625, "loss": 0.2926, "rewards/chosen": 1.471510410308838, "rewards/margins": 2.734707832336426, "rewards/rejected": -1.263197422027588, "step": 11317 }, { "epoch": 0.5998992923965759, "grad_norm": 49.0, "kl": 0.092681884765625, "learning_rate": 5e-07, "logits/chosen": -39025588.0, "logits/rejected": -17294968.0, "logps/chosen": -391.67669677734375, "logps/rejected": -191.56817626953125, "loss": 0.3312, "rewards/chosen": 0.2335975617170334, "rewards/margins": 1.7056656330823898, "rewards/rejected": -1.4720680713653564, "step": 11318 }, { "epoch": 0.5999522963983781, "grad_norm": 47.25, "kl": 3.0612144470214844, "learning_rate": 5e-07, "logits/chosen": 11194604.0, "logits/rejected": -20936352.0, "logps/chosen": -358.9831136067708, "logps/rejected": -283.837109375, "loss": 0.2195, "rewards/chosen": 1.450116793314616, "rewards/margins": 3.2604046503702797, "rewards/rejected": -1.810287857055664, "step": 11319 }, { "epoch": 0.6000053004001802, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59341584.0, "logits/rejected": -47244796.0, "logps/chosen": -330.4417419433594, "logps/rejected": -304.49298095703125, "loss": 0.2707, "rewards/chosen": 0.22846338152885437, "rewards/margins": 2.6087200343608856, "rewards/rejected": -2.3802566528320312, "step": 11320 }, { "epoch": 0.6000583044019824, "grad_norm": 46.25, "kl": 0.6677627563476562, "learning_rate": 5e-07, "logits/chosen": -16779722.0, "logits/rejected": -25379266.0, "logps/chosen": -209.04461669921875, "logps/rejected": -402.80853271484375, "loss": 0.2366, "rewards/chosen": 0.5195528268814087, "rewards/margins": 3.428195595741272, "rewards/rejected": -2.9086427688598633, "step": 11321 }, { "epoch": 0.6001113084037845, "grad_norm": 57.25, "kl": 2.2971439361572266, "learning_rate": 5e-07, "logits/chosen": -49747264.0, "logits/rejected": -31733876.0, "logps/chosen": -298.1263122558594, "logps/rejected": -197.93417358398438, "loss": 0.3, "rewards/chosen": 0.8546799421310425, "rewards/margins": 1.7428494691848755, "rewards/rejected": -0.888169527053833, "step": 11322 }, { "epoch": 0.6001643124055867, "grad_norm": 34.25, "kl": 0.7923049926757812, "learning_rate": 5e-07, "logits/chosen": 2867241.0, "logits/rejected": -30421465.6, "logps/chosen": -320.5773518880208, "logps/rejected": -376.677587890625, "loss": 0.1952, "rewards/chosen": 1.545015017191569, "rewards/margins": 4.184119478861491, "rewards/rejected": -2.639104461669922, "step": 11323 }, { "epoch": 0.6002173164073887, "grad_norm": 41.5, "kl": 1.5531501770019531, "learning_rate": 5e-07, "logits/chosen": -22278105.333333332, "logits/rejected": 2042852.75, "logps/chosen": -154.02730305989584, "logps/rejected": -96.27898406982422, "loss": 0.3407, "rewards/chosen": 0.7293708324432373, "rewards/margins": 2.689626693725586, "rewards/rejected": -1.9602558612823486, "step": 11324 }, { "epoch": 0.6002703204091909, "grad_norm": 54.75, "kl": 1.0379562377929688, "learning_rate": 5e-07, "logits/chosen": -53786784.0, "logits/rejected": 10638716.0, "logps/chosen": -350.672119140625, "logps/rejected": -443.5062662760417, "loss": 0.2688, "rewards/chosen": 0.45156126022338866, "rewards/margins": 4.225960699717204, "rewards/rejected": -3.774399439493815, "step": 11325 }, { "epoch": 0.600323324410993, "grad_norm": 34.75, "kl": 0.32787322998046875, "learning_rate": 5e-07, "logits/chosen": 2382738.0, "logits/rejected": -54020528.0, "logps/chosen": -385.3642883300781, "logps/rejected": -463.6370849609375, "loss": 0.2047, "rewards/chosen": 0.9848731756210327, "rewards/margins": 4.371609091758728, "rewards/rejected": -3.3867359161376953, "step": 11326 }, { "epoch": 0.6003763284127952, "grad_norm": 61.25, "kl": 1.0112075805664062, "learning_rate": 5e-07, "logits/chosen": -54431498.666666664, "logits/rejected": -4249350.5, "logps/chosen": -366.05078125, "logps/rejected": -84.56342315673828, "loss": 0.2834, "rewards/chosen": 0.7325793107350668, "rewards/margins": 3.4921592076619468, "rewards/rejected": -2.75957989692688, "step": 11327 }, { "epoch": 0.6004293324145973, "grad_norm": 55.75, "kl": 1.5725364685058594, "learning_rate": 5e-07, "logits/chosen": 66769.875, "logits/rejected": -12202744.0, "logps/chosen": -23.953657150268555, "logps/rejected": -148.03506469726562, "loss": 0.2688, "rewards/chosen": 0.6256049275398254, "rewards/margins": 2.970948278903961, "rewards/rejected": -2.3453433513641357, "step": 11328 }, { "epoch": 0.6004823364163995, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25957210.0, "logits/rejected": -42823784.0, "logps/chosen": -196.49594116210938, "logps/rejected": -404.70379638671875, "loss": 0.2903, "rewards/chosen": 0.4972009062767029, "rewards/margins": 2.524584949016571, "rewards/rejected": -2.027384042739868, "step": 11329 }, { "epoch": 0.6005353404182016, "grad_norm": 48.75, "kl": 3.80706787109375, "learning_rate": 5e-07, "logits/chosen": -16157288.0, "logits/rejected": -10546056.0, "logps/chosen": -249.47894287109375, "logps/rejected": -227.3385009765625, "loss": 0.2895, "rewards/chosen": 0.6648659706115723, "rewards/margins": 3.4680991172790527, "rewards/rejected": -2.8032331466674805, "step": 11330 }, { "epoch": 0.6005883444200038, "grad_norm": 36.25, "kl": 1.5530824661254883, "learning_rate": 5e-07, "logits/chosen": -38090124.8, "logits/rejected": 6997820.666666667, "logps/chosen": -185.33521728515626, "logps/rejected": -724.6151529947916, "loss": 0.2917, "rewards/chosen": 0.3966336250305176, "rewards/margins": 4.721277713775635, "rewards/rejected": -4.324644088745117, "step": 11331 }, { "epoch": 0.6006413484218058, "grad_norm": 45.25, "kl": 0.2113189697265625, "learning_rate": 5e-07, "logits/chosen": -52520672.0, "logits/rejected": -48419168.0, "logps/chosen": -545.4124755859375, "logps/rejected": -311.41074625651044, "loss": 0.1337, "rewards/chosen": 1.2869049310684204, "rewards/margins": 4.146412491798401, "rewards/rejected": -2.8595075607299805, "step": 11332 }, { "epoch": 0.600694352423608, "grad_norm": 37.75, "kl": 1.5974225997924805, "learning_rate": 5e-07, "logits/chosen": -60661072.0, "logits/rejected": -408277.46875, "logps/chosen": -1113.16748046875, "logps/rejected": -174.81494140625, "loss": 0.1551, "rewards/chosen": 2.349977731704712, "rewards/margins": 4.070698499679565, "rewards/rejected": -1.7207207679748535, "step": 11333 }, { "epoch": 0.6007473564254101, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44356524.0, "logits/rejected": -44943845.333333336, "logps/chosen": -368.12982177734375, "logps/rejected": -355.1322021484375, "loss": 0.2026, "rewards/chosen": 0.8981841802597046, "rewards/margins": 3.0867732763290405, "rewards/rejected": -2.188589096069336, "step": 11334 }, { "epoch": 0.6008003604272123, "grad_norm": 49.0, "kl": 2.5395431518554688, "learning_rate": 5e-07, "logits/chosen": -8081790.0, "logits/rejected": 959600.4, "logps/chosen": -744.9052734375, "logps/rejected": -211.359716796875, "loss": 0.2269, "rewards/chosen": 1.5876658757527669, "rewards/margins": 3.410741933186849, "rewards/rejected": -1.823076057434082, "step": 11335 }, { "epoch": 0.6008533644290144, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3328984.0, "logits/rejected": -26878384.0, "logps/chosen": -251.771826171875, "logps/rejected": -131.86885579427084, "loss": 0.3189, "rewards/chosen": 0.31421875953674316, "rewards/margins": 2.527244965235392, "rewards/rejected": -2.213026205698649, "step": 11336 }, { "epoch": 0.6009063684308166, "grad_norm": 56.5, "kl": 1.6971149444580078, "learning_rate": 5e-07, "logits/chosen": 27281920.0, "logits/rejected": -40521708.0, "logps/chosen": -337.0623779296875, "logps/rejected": -229.069580078125, "loss": 0.3257, "rewards/chosen": 0.3102255165576935, "rewards/margins": 2.2950519621372223, "rewards/rejected": -1.9848264455795288, "step": 11337 }, { "epoch": 0.6009593724326187, "grad_norm": 55.0, "kl": 2.1283645629882812, "learning_rate": 5e-07, "logits/chosen": -6178161.6, "logits/rejected": -18793557.333333332, "logps/chosen": -327.193896484375, "logps/rejected": -278.09800211588544, "loss": 0.3278, "rewards/chosen": 0.5748727798461915, "rewards/margins": 2.784365781148275, "rewards/rejected": -2.2094930013020835, "step": 11338 }, { "epoch": 0.6010123764344208, "grad_norm": 68.0, "kl": 3.8916168212890625, "learning_rate": 5e-07, "logits/chosen": 54321491.2, "logits/rejected": -60212554.666666664, "logps/chosen": -520.1890625, "logps/rejected": -343.9680989583333, "loss": 0.3655, "rewards/chosen": 0.24942116737365722, "rewards/margins": 3.340419499079386, "rewards/rejected": -3.090998331705729, "step": 11339 }, { "epoch": 0.6010653804362229, "grad_norm": 40.75, "kl": 1.2364540100097656, "learning_rate": 5e-07, "logits/chosen": -56078346.666666664, "logits/rejected": -17007848.0, "logps/chosen": -483.5552978515625, "logps/rejected": -274.973828125, "loss": 0.1891, "rewards/chosen": 1.0286489327748616, "rewards/margins": 3.3916053613026937, "rewards/rejected": -2.362956428527832, "step": 11340 }, { "epoch": 0.6011183844380251, "grad_norm": 40.25, "kl": 0.20087051391601562, "learning_rate": 5e-07, "logits/chosen": -30525370.0, "logits/rejected": -25536910.0, "logps/chosen": -264.6989440917969, "logps/rejected": -306.3971862792969, "loss": 0.173, "rewards/chosen": 1.3686232566833496, "rewards/margins": 3.9885599613189697, "rewards/rejected": -2.61993670463562, "step": 11341 }, { "epoch": 0.6011713884398272, "grad_norm": 37.0, "kl": 0.7099018096923828, "learning_rate": 5e-07, "logits/chosen": -64072944.0, "logits/rejected": -64282313.14285714, "logps/chosen": -225.98715209960938, "logps/rejected": -421.75802176339283, "loss": 0.1412, "rewards/chosen": 0.09830017387866974, "rewards/margins": 3.077344134449959, "rewards/rejected": -2.979043960571289, "step": 11342 }, { "epoch": 0.6012243924416294, "grad_norm": 42.75, "kl": 0.7345314025878906, "learning_rate": 5e-07, "logits/chosen": -73560901.33333333, "logits/rejected": -69151795.2, "logps/chosen": -667.0445149739584, "logps/rejected": -332.2732666015625, "loss": 0.1534, "rewards/chosen": 1.6363027890523274, "rewards/margins": 4.196994050343831, "rewards/rejected": -2.560691261291504, "step": 11343 }, { "epoch": 0.6012773964434315, "grad_norm": 41.75, "kl": 0.3822917938232422, "learning_rate": 5e-07, "logits/chosen": -14261908.0, "logits/rejected": -6960674.0, "logps/chosen": -111.02099609375, "logps/rejected": -166.09932454427084, "loss": 0.3389, "rewards/chosen": 0.018311887979507446, "rewards/margins": 1.2728756964206696, "rewards/rejected": -1.254563808441162, "step": 11344 }, { "epoch": 0.6013304004452337, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74410560.0, "logits/rejected": -71421241.6, "logps/chosen": -410.2934977213542, "logps/rejected": -151.578125, "loss": 0.2664, "rewards/chosen": 0.1918598214785258, "rewards/margins": 2.6200742761294045, "rewards/rejected": -2.428214454650879, "step": 11345 }, { "epoch": 0.6013834044470358, "grad_norm": 48.25, "kl": 1.2463722229003906, "learning_rate": 5e-07, "logits/chosen": -76880576.0, "logits/rejected": -20682178.666666668, "logps/chosen": -655.0330810546875, "logps/rejected": -282.0508626302083, "loss": 0.1466, "rewards/chosen": 1.3487274646759033, "rewards/margins": 4.1980596383412685, "rewards/rejected": -2.8493321736653647, "step": 11346 }, { "epoch": 0.6014364084488378, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30012500.0, "logits/rejected": -15951340.0, "logps/chosen": -539.6759033203125, "logps/rejected": -241.7420654296875, "loss": 0.3675, "rewards/chosen": -0.11353074014186859, "rewards/margins": 1.6642435938119888, "rewards/rejected": -1.7777743339538574, "step": 11347 }, { "epoch": 0.60148941245064, "grad_norm": 54.5, "kl": 0.000732421875, "learning_rate": 5e-07, "logits/chosen": 400496.0, "logits/rejected": -35783938.666666664, "logps/chosen": -29.14914321899414, "logps/rejected": -321.7069091796875, "loss": 0.2264, "rewards/chosen": 0.5786558389663696, "rewards/margins": 2.417419870694478, "rewards/rejected": -1.8387640317281086, "step": 11348 }, { "epoch": 0.6015424164524421, "grad_norm": 51.5, "kl": 0.2839059829711914, "learning_rate": 5e-07, "logits/chosen": -16552790.0, "logits/rejected": 8370145.0, "logps/chosen": -205.1953887939453, "logps/rejected": -174.4888916015625, "loss": 0.3509, "rewards/chosen": 0.41343918442726135, "rewards/margins": 1.3095246255397797, "rewards/rejected": -0.8960854411125183, "step": 11349 }, { "epoch": 0.6015954204542443, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25772600.0, "logits/rejected": -13331152.0, "logps/chosen": -214.3836669921875, "logps/rejected": -134.81771850585938, "loss": 0.4066, "rewards/chosen": 0.08852019309997558, "rewards/margins": 1.1197139263153075, "rewards/rejected": -1.031193733215332, "step": 11350 }, { "epoch": 0.6016484244560464, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68892072.0, "logits/rejected": -5895795.0, "logps/chosen": -330.08526611328125, "logps/rejected": -151.7672882080078, "loss": 0.4723, "rewards/chosen": -0.8799998760223389, "rewards/margins": 0.5975196361541748, "rewards/rejected": -1.4775195121765137, "step": 11351 }, { "epoch": 0.6017014284578486, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62851424.0, "logits/rejected": -6209678.8, "logps/chosen": -443.9001057942708, "logps/rejected": -210.22216796875, "loss": 0.2443, "rewards/chosen": 0.520965576171875, "rewards/margins": 2.3847917556762694, "rewards/rejected": -1.8638261795043944, "step": 11352 }, { "epoch": 0.6017544324596507, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38908853.333333336, "logits/rejected": -42427596.8, "logps/chosen": -90.91611735026042, "logps/rejected": -574.823193359375, "loss": 0.2606, "rewards/chosen": -0.18096957604090372, "rewards/margins": 2.580304761727651, "rewards/rejected": -2.7612743377685547, "step": 11353 }, { "epoch": 0.6018074364614528, "grad_norm": 51.5, "kl": 0.8353519439697266, "learning_rate": 5e-07, "logits/chosen": -54431512.0, "logits/rejected": 3102617.5, "logps/chosen": -328.22308349609375, "logps/rejected": -357.302490234375, "loss": 0.2584, "rewards/chosen": 0.5067358016967773, "rewards/margins": 2.6510260105133057, "rewards/rejected": -2.1442902088165283, "step": 11354 }, { "epoch": 0.6018604404632549, "grad_norm": 54.25, "kl": 0.6265659332275391, "learning_rate": 5e-07, "logits/chosen": -60969896.0, "logits/rejected": -14114614.0, "logps/chosen": -261.4243469238281, "logps/rejected": -256.92236328125, "loss": 0.3033, "rewards/chosen": 0.6999927759170532, "rewards/margins": 2.1616196632385254, "rewards/rejected": -1.4616268873214722, "step": 11355 }, { "epoch": 0.6019134444650571, "grad_norm": 41.75, "kl": 0.8620109558105469, "learning_rate": 5e-07, "logits/chosen": -24742634.666666668, "logits/rejected": -5034438.0, "logps/chosen": -225.52034505208334, "logps/rejected": -388.7671203613281, "loss": 0.4194, "rewards/chosen": -0.05253412822882334, "rewards/margins": 1.9410824527343113, "rewards/rejected": -1.9936165809631348, "step": 11356 }, { "epoch": 0.6019664484668592, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75018640.0, "logits/rejected": -10991760.0, "logps/chosen": -504.9782409667969, "logps/rejected": -300.7556559244792, "loss": 0.2221, "rewards/chosen": 0.539715588092804, "rewards/margins": 2.2415687044461565, "rewards/rejected": -1.7018531163533528, "step": 11357 }, { "epoch": 0.6020194524686614, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33085698.666666668, "logits/rejected": -37303846.4, "logps/chosen": -444.3152262369792, "logps/rejected": -216.260791015625, "loss": 0.2498, "rewards/chosen": 1.0472880204518635, "rewards/margins": 2.4397726853688555, "rewards/rejected": -1.3924846649169922, "step": 11358 }, { "epoch": 0.6020724564704635, "grad_norm": 36.5, "kl": 0.04888916015625, "learning_rate": 5e-07, "logits/chosen": -16908928.0, "logits/rejected": -39973156.0, "logps/chosen": -388.62078857421875, "logps/rejected": -478.4468078613281, "loss": 0.2488, "rewards/chosen": 0.5065798163414001, "rewards/margins": 3.4699241518974304, "rewards/rejected": -2.9633443355560303, "step": 11359 }, { "epoch": 0.6021254604722657, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26319510.0, "logits/rejected": -13040240.0, "logps/chosen": -450.63421630859375, "logps/rejected": -276.9649658203125, "loss": 0.2088, "rewards/chosen": 0.8383117914199829, "rewards/margins": 3.156349539756775, "rewards/rejected": -2.318037748336792, "step": 11360 }, { "epoch": 0.6021784644740678, "grad_norm": 38.75, "kl": 2.4839324951171875, "learning_rate": 5e-07, "logits/chosen": -26183426.0, "logits/rejected": -14258146.0, "logps/chosen": -375.723388671875, "logps/rejected": -309.73370361328125, "loss": 0.1933, "rewards/chosen": 1.1938939094543457, "rewards/margins": 3.611884355545044, "rewards/rejected": -2.4179904460906982, "step": 11361 }, { "epoch": 0.60223146847587, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71014392.0, "logits/rejected": -63986712.0, "logps/chosen": -334.6844177246094, "logps/rejected": -446.8606872558594, "loss": 0.3369, "rewards/chosen": -0.2536430358886719, "rewards/margins": 1.8779187202453613, "rewards/rejected": -2.131561756134033, "step": 11362 }, { "epoch": 0.602284472477672, "grad_norm": 57.0, "kl": 0.7093353271484375, "learning_rate": 5e-07, "logits/chosen": -49726636.8, "logits/rejected": 29851949.333333332, "logps/chosen": -280.1055419921875, "logps/rejected": -289.72035725911456, "loss": 0.3282, "rewards/chosen": 0.3722971439361572, "rewards/margins": 1.9744471073150636, "rewards/rejected": -1.6021499633789062, "step": 11363 }, { "epoch": 0.6023374764794742, "grad_norm": 36.25, "kl": 0.6447296142578125, "learning_rate": 5e-07, "logits/chosen": -23324442.0, "logits/rejected": -32541142.0, "logps/chosen": -177.18154907226562, "logps/rejected": -225.84046936035156, "loss": 0.2425, "rewards/chosen": 0.6715257167816162, "rewards/margins": 2.612205743789673, "rewards/rejected": -1.9406800270080566, "step": 11364 }, { "epoch": 0.6023904804812763, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63920000.0, "logits/rejected": -24440000.0, "logps/chosen": -355.4234619140625, "logps/rejected": -306.82666015625, "loss": 0.15, "rewards/chosen": 0.5185455679893494, "rewards/margins": 3.401805023352305, "rewards/rejected": -2.8832594553629556, "step": 11365 }, { "epoch": 0.6024434844830785, "grad_norm": 51.0, "kl": 4.13707160949707, "learning_rate": 5e-07, "logits/chosen": -3697232.75, "logits/rejected": -33299094.0, "logps/chosen": -521.1305541992188, "logps/rejected": -173.84889221191406, "loss": 0.2699, "rewards/chosen": 1.3077017068862915, "rewards/margins": 3.07775342464447, "rewards/rejected": -1.7700517177581787, "step": 11366 }, { "epoch": 0.6024964884848806, "grad_norm": 35.0, "kl": 0.8497467041015625, "learning_rate": 5e-07, "logits/chosen": -33568408.0, "logits/rejected": -33888064.0, "logps/chosen": -556.9472045898438, "logps/rejected": -449.10394287109375, "loss": 0.2368, "rewards/chosen": 0.7949573397636414, "rewards/margins": 4.227682292461395, "rewards/rejected": -3.432724952697754, "step": 11367 }, { "epoch": 0.6025494924866828, "grad_norm": 59.0, "kl": 0.28362274169921875, "learning_rate": 5e-07, "logits/chosen": -20781830.0, "logits/rejected": -29297532.0, "logps/chosen": -484.15155029296875, "logps/rejected": -181.38064575195312, "loss": 0.2594, "rewards/chosen": 0.7341720461845398, "rewards/margins": 2.518253266811371, "rewards/rejected": -1.784081220626831, "step": 11368 }, { "epoch": 0.6026024964884849, "grad_norm": 35.75, "kl": 1.1486873626708984, "learning_rate": 5e-07, "logits/chosen": -37454256.0, "logits/rejected": -42028120.0, "logps/chosen": -112.16822052001953, "logps/rejected": -459.7198486328125, "loss": 0.2352, "rewards/chosen": 0.4537666141986847, "rewards/margins": 4.096309453248978, "rewards/rejected": -3.642542839050293, "step": 11369 }, { "epoch": 0.602655500490287, "grad_norm": 47.75, "kl": 1.9096965789794922, "learning_rate": 5e-07, "logits/chosen": -23093201.6, "logits/rejected": -89264928.0, "logps/chosen": -325.601416015625, "logps/rejected": -556.9143473307291, "loss": 0.3191, "rewards/chosen": 0.4928102493286133, "rewards/margins": 3.481595039367676, "rewards/rejected": -2.9887847900390625, "step": 11370 }, { "epoch": 0.6027085044920891, "grad_norm": 63.5, "kl": 0.7182159423828125, "learning_rate": 5e-07, "logits/chosen": -28092096.0, "logits/rejected": 2242690.0, "logps/chosen": -225.0580037434896, "logps/rejected": -445.2239685058594, "loss": 0.4626, "rewards/chosen": 0.013871252536773682, "rewards/margins": 0.7796618342399597, "rewards/rejected": -0.765790581703186, "step": 11371 }, { "epoch": 0.6027615084938913, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5708232.0, "logits/rejected": -17787025.6, "logps/chosen": -217.64851888020834, "logps/rejected": -283.711328125, "loss": 0.2999, "rewards/chosen": 0.7480731805165609, "rewards/margins": 1.9908303101857503, "rewards/rejected": -1.2427571296691895, "step": 11372 }, { "epoch": 0.6028145124956934, "grad_norm": 43.25, "kl": 2.437380313873291, "learning_rate": 5e-07, "logits/chosen": -38133776.0, "logits/rejected": -32028812.8, "logps/chosen": -289.42299397786456, "logps/rejected": -310.98193359375, "loss": 0.2876, "rewards/chosen": 0.18407841523488364, "rewards/margins": 2.069664963086446, "rewards/rejected": -1.8855865478515625, "step": 11373 }, { "epoch": 0.6028675164974956, "grad_norm": 46.0, "kl": 2.143657684326172, "learning_rate": 5e-07, "logits/chosen": -2915030.2, "logits/rejected": -26424485.333333332, "logps/chosen": -543.25107421875, "logps/rejected": -209.4791463216146, "loss": 0.3121, "rewards/chosen": 1.2349616050720216, "rewards/margins": 3.111343797047933, "rewards/rejected": -1.8763821919759114, "step": 11374 }, { "epoch": 0.6029205204992977, "grad_norm": 66.5, "kl": 0.4440269470214844, "learning_rate": 5e-07, "logits/chosen": -27119554.0, "logits/rejected": -6823252.0, "logps/chosen": -315.5908203125, "logps/rejected": -718.2628173828125, "loss": 0.3504, "rewards/chosen": 0.16450674831867218, "rewards/margins": 2.6214686185121536, "rewards/rejected": -2.4569618701934814, "step": 11375 }, { "epoch": 0.6029735245010999, "grad_norm": 44.75, "kl": 2.46234130859375, "learning_rate": 5e-07, "logits/chosen": -42117712.0, "logits/rejected": -76278976.0, "logps/chosen": -219.87509765625, "logps/rejected": -661.2773030598959, "loss": 0.2651, "rewards/chosen": 0.7561293125152588, "rewards/margins": 4.300289519627889, "rewards/rejected": -3.5441602071126304, "step": 11376 }, { "epoch": 0.603026528502902, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56485888.0, "logits/rejected": -33973756.8, "logps/chosen": -398.0716959635417, "logps/rejected": -303.8697265625, "loss": 0.1879, "rewards/chosen": 0.7187576293945312, "rewards/margins": 3.1677762985229494, "rewards/rejected": -2.449018669128418, "step": 11377 }, { "epoch": 0.6030795325047041, "grad_norm": 31.375, "kl": 0.06127357482910156, "learning_rate": 5e-07, "logits/chosen": 3047322.0, "logits/rejected": -7064558.4, "logps/chosen": -430.2125651041667, "logps/rejected": -320.7353271484375, "loss": 0.158, "rewards/chosen": 1.8610820770263672, "rewards/margins": 3.7362781524658204, "rewards/rejected": -1.8751960754394532, "step": 11378 }, { "epoch": 0.6031325365065062, "grad_norm": 42.75, "kl": 2.3097496032714844, "learning_rate": 5e-07, "logits/chosen": 4504208.0, "logits/rejected": -19649273.6, "logps/chosen": -79.5730489095052, "logps/rejected": -242.099072265625, "loss": 0.2359, "rewards/chosen": 0.983012835184733, "rewards/margins": 2.6525499979654947, "rewards/rejected": -1.6695371627807618, "step": 11379 }, { "epoch": 0.6031855405083084, "grad_norm": 40.5, "kl": 3.3788890838623047, "learning_rate": 5e-07, "logits/chosen": -44639216.0, "logits/rejected": -1646695.625, "logps/chosen": -483.916259765625, "logps/rejected": -190.58848571777344, "loss": 0.2882, "rewards/chosen": 1.1962859630584717, "rewards/margins": 2.7181390523910522, "rewards/rejected": -1.5218530893325806, "step": 11380 }, { "epoch": 0.6032385445101105, "grad_norm": 67.5, "kl": 4.256675720214844, "learning_rate": 5e-07, "logits/chosen": -27438921.14285714, "logits/rejected": -8265843.5, "logps/chosen": -322.0149623325893, "logps/rejected": -101.44954681396484, "loss": 0.3624, "rewards/chosen": 0.9229023797171456, "rewards/margins": 4.229298864092145, "rewards/rejected": -3.306396484375, "step": 11381 }, { "epoch": 0.6032915485119127, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85761008.0, "logits/rejected": -31509930.666666668, "logps/chosen": -426.802490234375, "logps/rejected": -365.440185546875, "loss": 0.1853, "rewards/chosen": 0.9045990109443665, "rewards/margins": 3.348015606403351, "rewards/rejected": -2.4434165954589844, "step": 11382 }, { "epoch": 0.6033445525137148, "grad_norm": 58.5, "kl": 0.7846794128417969, "learning_rate": 5e-07, "logits/chosen": -7524568.0, "logits/rejected": -28418336.0, "logps/chosen": -534.171826171875, "logps/rejected": -207.77738444010416, "loss": 0.308, "rewards/chosen": 0.8437908172607422, "rewards/margins": 2.1103885650634764, "rewards/rejected": -1.2665977478027344, "step": 11383 }, { "epoch": 0.603397556515517, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36211621.333333336, "logits/rejected": -24402720.0, "logps/chosen": -316.3916422526042, "logps/rejected": -258.3193359375, "loss": 0.1964, "rewards/chosen": 0.9995106856028239, "rewards/margins": 2.881289021174113, "rewards/rejected": -1.8817783355712892, "step": 11384 }, { "epoch": 0.603450560517319, "grad_norm": 42.75, "kl": 1.8897476196289062, "learning_rate": 5e-07, "logits/chosen": -14874375.0, "logits/rejected": -16210752.0, "logps/chosen": -369.7795104980469, "logps/rejected": -375.38970947265625, "loss": 0.2134, "rewards/chosen": 0.6093273162841797, "rewards/margins": 4.3723673820495605, "rewards/rejected": -3.763040065765381, "step": 11385 }, { "epoch": 0.6035035645191212, "grad_norm": 46.0, "kl": 0.938776969909668, "learning_rate": 5e-07, "logits/chosen": -40847093.333333336, "logits/rejected": -16957851.2, "logps/chosen": -445.100830078125, "logps/rejected": -245.9533203125, "loss": 0.2731, "rewards/chosen": 0.31414387623469037, "rewards/margins": 2.098988465468089, "rewards/rejected": -1.7848445892333984, "step": 11386 }, { "epoch": 0.6035565685209233, "grad_norm": 45.75, "kl": 0.5786819458007812, "learning_rate": 5e-07, "logits/chosen": -14579412.8, "logits/rejected": 3308682.3333333335, "logps/chosen": -108.14898681640625, "logps/rejected": -367.2259928385417, "loss": 0.4012, "rewards/chosen": 0.0346526026725769, "rewards/margins": 2.1428593516349794, "rewards/rejected": -2.1082067489624023, "step": 11387 }, { "epoch": 0.6036095725227255, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40818888.0, "logits/rejected": -22383376.0, "logps/chosen": -218.33050537109375, "logps/rejected": -286.56512451171875, "loss": 0.3299, "rewards/chosen": -0.3504500687122345, "rewards/margins": 2.781253308057785, "rewards/rejected": -3.1317033767700195, "step": 11388 }, { "epoch": 0.6036625765245276, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22971736.0, "logits/rejected": -15604002.666666666, "logps/chosen": -659.82666015625, "logps/rejected": -334.6590169270833, "loss": 0.1433, "rewards/chosen": 1.2049225568771362, "rewards/margins": 4.005214889844259, "rewards/rejected": -2.8002923329671225, "step": 11389 }, { "epoch": 0.6037155805263298, "grad_norm": 57.0, "kl": 3.5618629455566406, "learning_rate": 5e-07, "logits/chosen": -33083740.8, "logits/rejected": -7515927.333333333, "logps/chosen": -394.968896484375, "logps/rejected": -180.57014973958334, "loss": 0.3115, "rewards/chosen": 0.5382813453674317, "rewards/margins": 2.479930623372396, "rewards/rejected": -1.9416492780049641, "step": 11390 }, { "epoch": 0.6037685845281319, "grad_norm": 58.25, "kl": 0.471221923828125, "learning_rate": 5e-07, "logits/chosen": -60308736.0, "logits/rejected": -16733086.4, "logps/chosen": -403.74365234375, "logps/rejected": -308.87314453125, "loss": 0.2985, "rewards/chosen": 0.018738811214764912, "rewards/margins": 1.6837729146083196, "rewards/rejected": -1.6650341033935547, "step": 11391 }, { "epoch": 0.6038215885299341, "grad_norm": 43.75, "kl": 1.1431655883789062, "learning_rate": 5e-07, "logits/chosen": -32822806.0, "logits/rejected": -16833316.0, "logps/chosen": -177.25791931152344, "logps/rejected": -201.2540740966797, "loss": 0.3545, "rewards/chosen": 0.39695224165916443, "rewards/margins": 1.532477468252182, "rewards/rejected": -1.1355252265930176, "step": 11392 }, { "epoch": 0.6038745925317361, "grad_norm": 54.75, "kl": 2.7207107543945312, "learning_rate": 5e-07, "logits/chosen": -70031600.0, "logits/rejected": 7814205.5, "logps/chosen": -217.57466634114584, "logps/rejected": -250.24978637695312, "loss": 0.442, "rewards/chosen": 0.2648759682973226, "rewards/margins": 2.6976729234059653, "rewards/rejected": -2.4327969551086426, "step": 11393 }, { "epoch": 0.6039275965335383, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36608424.0, "logits/rejected": -22791890.666666668, "logps/chosen": -296.4261474609375, "logps/rejected": -310.37526448567706, "loss": 0.1015, "rewards/chosen": 1.2972732782363892, "rewards/margins": 4.34995679060618, "rewards/rejected": -3.0526835123697915, "step": 11394 }, { "epoch": 0.6039806005353404, "grad_norm": 44.25, "kl": 0.0971994400024414, "learning_rate": 5e-07, "logits/chosen": -64947264.0, "logits/rejected": -31966886.4, "logps/chosen": -443.8094075520833, "logps/rejected": -373.434326171875, "loss": 0.2113, "rewards/chosen": 1.0774623552958171, "rewards/margins": 3.041200033823649, "rewards/rejected": -1.963737678527832, "step": 11395 }, { "epoch": 0.6040336045371425, "grad_norm": 52.0, "kl": 0.951629638671875, "learning_rate": 5e-07, "logits/chosen": -65641753.6, "logits/rejected": -3388419.0, "logps/chosen": -632.525830078125, "logps/rejected": -153.7806193033854, "loss": 0.3188, "rewards/chosen": 1.0163095474243165, "rewards/margins": 2.3213483174641927, "rewards/rejected": -1.3050387700398762, "step": 11396 }, { "epoch": 0.6040866085389447, "grad_norm": 40.75, "kl": 1.4235725402832031, "learning_rate": 5e-07, "logits/chosen": -35370322.666666664, "logits/rejected": -29454649.6, "logps/chosen": -239.67447916666666, "logps/rejected": -358.69873046875, "loss": 0.2516, "rewards/chosen": 0.03097686419884364, "rewards/margins": 2.417798801759879, "rewards/rejected": -2.3868219375610353, "step": 11397 }, { "epoch": 0.6041396125407468, "grad_norm": 67.0, "kl": 4.136817932128906, "learning_rate": 5e-07, "logits/chosen": -55161749.333333336, "logits/rejected": -21167384.0, "logps/chosen": -510.3841145833333, "logps/rejected": -287.3355712890625, "loss": 0.2156, "rewards/chosen": 1.5778026580810547, "rewards/margins": 3.1978389739990236, "rewards/rejected": -1.6200363159179687, "step": 11398 }, { "epoch": 0.604192616542549, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 94486688.0, "logits/rejected": -624150.0, "logps/chosen": -425.562060546875, "logps/rejected": -139.11690266927084, "loss": 0.3012, "rewards/chosen": 0.5579864978790283, "rewards/margins": 2.5533045927683515, "rewards/rejected": -1.995318094889323, "step": 11399 }, { "epoch": 0.604245620544351, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48160868.0, "logits/rejected": -22906397.333333332, "logps/chosen": -149.51600646972656, "logps/rejected": -354.2667643229167, "loss": 0.3019, "rewards/chosen": -0.03410034626722336, "rewards/margins": 1.5556133861343067, "rewards/rejected": -1.58971373240153, "step": 11400 }, { "epoch": 0.6042986245461532, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13908308.0, "logits/rejected": -18536150.0, "logps/chosen": -123.25750732421875, "logps/rejected": -182.4007110595703, "loss": 0.3166, "rewards/chosen": 0.045176029205322266, "rewards/margins": 1.943192481994629, "rewards/rejected": -1.8980164527893066, "step": 11401 }, { "epoch": 0.6043516285479553, "grad_norm": 43.5, "kl": 0.4658050537109375, "learning_rate": 5e-07, "logits/chosen": -1280799.5, "logits/rejected": -21915324.0, "logps/chosen": -69.52593994140625, "logps/rejected": -298.148193359375, "loss": 0.3355, "rewards/chosen": 0.2475692480802536, "rewards/margins": 2.1142324656248093, "rewards/rejected": -1.8666632175445557, "step": 11402 }, { "epoch": 0.6044046325497575, "grad_norm": 55.0, "kl": 1.2227764129638672, "learning_rate": 5e-07, "logits/chosen": -12174744.0, "logits/rejected": -43507240.0, "logps/chosen": -335.3401794433594, "logps/rejected": -319.524658203125, "loss": 0.3152, "rewards/chosen": 0.20928898453712463, "rewards/margins": 2.49411478638649, "rewards/rejected": -2.2848258018493652, "step": 11403 }, { "epoch": 0.6044576365515596, "grad_norm": 45.75, "kl": 0.4272298812866211, "learning_rate": 5e-07, "logits/chosen": -29147926.0, "logits/rejected": -25832454.0, "logps/chosen": -241.81329345703125, "logps/rejected": -291.690673828125, "loss": 0.3263, "rewards/chosen": 0.420163094997406, "rewards/margins": 2.0376251339912415, "rewards/rejected": -1.6174620389938354, "step": 11404 }, { "epoch": 0.6045106405533618, "grad_norm": 65.5, "kl": 0.2416973114013672, "learning_rate": 5e-07, "logits/chosen": -35871744.0, "logits/rejected": -1359281.5, "logps/chosen": -366.0118713378906, "logps/rejected": -291.3763834635417, "loss": 0.2345, "rewards/chosen": 0.5018303394317627, "rewards/margins": 2.8995781739552817, "rewards/rejected": -2.397747834523519, "step": 11405 }, { "epoch": 0.6045636445551639, "grad_norm": 50.5, "kl": 2.513092041015625, "learning_rate": 5e-07, "logits/chosen": -30676477.333333332, "logits/rejected": -90443528.0, "logps/chosen": -661.085693359375, "logps/rejected": -491.5349426269531, "loss": 0.3588, "rewards/chosen": 0.692503293355306, "rewards/margins": 5.773969968159993, "rewards/rejected": -5.0814666748046875, "step": 11406 }, { "epoch": 0.6046166485569661, "grad_norm": 45.0, "kl": 0.6694450378417969, "learning_rate": 5e-07, "logits/chosen": -57693420.8, "logits/rejected": -17308414.666666668, "logps/chosen": -295.098583984375, "logps/rejected": -733.3068033854166, "loss": 0.3641, "rewards/chosen": -0.16549222469329833, "rewards/margins": 2.9905460596084597, "rewards/rejected": -3.156038284301758, "step": 11407 }, { "epoch": 0.6046696525587681, "grad_norm": 25.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1011691.3333333334, "logits/rejected": -9644569.6, "logps/chosen": -210.220458984375, "logps/rejected": -374.193115234375, "loss": 0.1572, "rewards/chosen": 0.5752089420954386, "rewards/margins": 4.179912845293681, "rewards/rejected": -3.604703903198242, "step": 11408 }, { "epoch": 0.6047226565605703, "grad_norm": 56.75, "kl": 4.041374206542969, "learning_rate": 5e-07, "logits/chosen": -29941250.666666668, "logits/rejected": 17483094.0, "logps/chosen": -284.1978759765625, "logps/rejected": -315.2761535644531, "loss": 0.4011, "rewards/chosen": 0.5411777098973592, "rewards/margins": 2.92339821656545, "rewards/rejected": -2.382220506668091, "step": 11409 }, { "epoch": 0.6047756605623724, "grad_norm": 58.0, "kl": 3.154420852661133, "learning_rate": 5e-07, "logits/chosen": -23881094.4, "logits/rejected": -19014818.666666668, "logps/chosen": -479.643505859375, "logps/rejected": -263.1995035807292, "loss": 0.1963, "rewards/chosen": 1.2704764366149903, "rewards/margins": 4.59483855565389, "rewards/rejected": -3.3243621190389, "step": 11410 }, { "epoch": 0.6048286645641746, "grad_norm": 48.25, "kl": 0.7963447570800781, "learning_rate": 5e-07, "logits/chosen": -52674297.6, "logits/rejected": -35514037.333333336, "logps/chosen": -590.823046875, "logps/rejected": -439.5934651692708, "loss": 0.3052, "rewards/chosen": 0.6152225494384765, "rewards/margins": 2.5410271644592286, "rewards/rejected": -1.925804615020752, "step": 11411 }, { "epoch": 0.6048816685659767, "grad_norm": 53.0, "kl": 0.15518951416015625, "learning_rate": 5e-07, "logits/chosen": -21036915.2, "logits/rejected": -12038850.666666666, "logps/chosen": -200.1079833984375, "logps/rejected": -171.994140625, "loss": 0.2878, "rewards/chosen": 0.8223758697509765, "rewards/margins": 2.0406154950459796, "rewards/rejected": -1.2182396252950032, "step": 11412 }, { "epoch": 0.6049346725677789, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52261584.0, "logits/rejected": -16128038.4, "logps/chosen": -320.57899983723956, "logps/rejected": -219.869921875, "loss": 0.2361, "rewards/chosen": 0.2306564450263977, "rewards/margins": 2.7897876858711244, "rewards/rejected": -2.5591312408447267, "step": 11413 }, { "epoch": 0.604987676569581, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54741792.0, "logits/rejected": -25763753.6, "logps/chosen": -283.34527587890625, "logps/rejected": -401.4580078125, "loss": 0.2307, "rewards/chosen": -0.042355855305989586, "rewards/margins": 2.8617792765299477, "rewards/rejected": -2.9041351318359374, "step": 11414 }, { "epoch": 0.6050406805713832, "grad_norm": 62.5, "kl": 0.5037193298339844, "learning_rate": 5e-07, "logits/chosen": -42377144.0, "logits/rejected": -49312260.0, "logps/chosen": -380.9840087890625, "logps/rejected": -362.96673583984375, "loss": 0.4123, "rewards/chosen": 0.03264604012171427, "rewards/margins": 1.8861907521883647, "rewards/rejected": -1.8535447120666504, "step": 11415 }, { "epoch": 0.6050936845731852, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78796954.66666667, "logits/rejected": -28169996.8, "logps/chosen": -163.64825439453125, "logps/rejected": -329.85791015625, "loss": 0.2684, "rewards/chosen": -0.29842251539230347, "rewards/margins": 2.3750310063362123, "rewards/rejected": -2.673453521728516, "step": 11416 }, { "epoch": 0.6051466885749874, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41320048.0, "logits/rejected": -22686497.6, "logps/chosen": -269.7671712239583, "logps/rejected": -514.843798828125, "loss": 0.1756, "rewards/chosen": 1.152855396270752, "rewards/margins": 5.173801136016846, "rewards/rejected": -4.020945739746094, "step": 11417 }, { "epoch": 0.6051996925767895, "grad_norm": 43.0, "kl": 0.2123889923095703, "learning_rate": 5e-07, "logits/chosen": -25348314.0, "logits/rejected": -37072533.333333336, "logps/chosen": -418.82635498046875, "logps/rejected": -247.16353352864584, "loss": 0.2264, "rewards/chosen": 0.23463591933250427, "rewards/margins": 2.3754837612311044, "rewards/rejected": -2.1408478418986, "step": 11418 }, { "epoch": 0.6052526965785917, "grad_norm": 46.25, "kl": 2.3933372497558594, "learning_rate": 5e-07, "logits/chosen": -9872067.333333334, "logits/rejected": -6516424.0, "logps/chosen": -492.6338297526042, "logps/rejected": -331.5953674316406, "loss": 0.3258, "rewards/chosen": 0.6712516148885092, "rewards/margins": 3.2523794968922934, "rewards/rejected": -2.581127882003784, "step": 11419 }, { "epoch": 0.6053057005803938, "grad_norm": 29.875, "kl": 1.1497783660888672, "learning_rate": 5e-07, "logits/chosen": -20510738.666666668, "logits/rejected": -35816441.6, "logps/chosen": -33.096483866373696, "logps/rejected": -506.32880859375, "loss": 0.2336, "rewards/chosen": 0.0852479338645935, "rewards/margins": 3.2067159056663512, "rewards/rejected": -3.1214679718017577, "step": 11420 }, { "epoch": 0.605358704582196, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15996576.0, "logits/rejected": -48758470.4, "logps/chosen": -668.417724609375, "logps/rejected": -239.126416015625, "loss": 0.2146, "rewards/chosen": 0.8802114327748617, "rewards/margins": 3.058920462926229, "rewards/rejected": -2.178709030151367, "step": 11421 }, { "epoch": 0.6054117085839981, "grad_norm": 63.5, "kl": 3.254121780395508, "learning_rate": 5e-07, "logits/chosen": -31186349.714285713, "logits/rejected": 2169271.0, "logps/chosen": -309.03390066964283, "logps/rejected": -74.53724670410156, "loss": 0.4503, "rewards/chosen": 0.3541452203478132, "rewards/margins": 2.861626556941441, "rewards/rejected": -2.507481336593628, "step": 11422 }, { "epoch": 0.6054647125858003, "grad_norm": 34.0, "kl": 0.8466739654541016, "learning_rate": 5e-07, "logits/chosen": -15132109.333333334, "logits/rejected": -11961827.2, "logps/chosen": -213.3316853841146, "logps/rejected": -255.3166748046875, "loss": 0.1661, "rewards/chosen": 0.6300857861836752, "rewards/margins": 3.619388802846273, "rewards/rejected": -2.9893030166625976, "step": 11423 }, { "epoch": 0.6055177165876023, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21748604.0, "logits/rejected": -1732537.3333333333, "logps/chosen": -415.7279052734375, "logps/rejected": -312.0533854166667, "loss": 0.228, "rewards/chosen": 1.489943027496338, "rewards/margins": 3.1022849082946777, "rewards/rejected": -1.6123418807983398, "step": 11424 }, { "epoch": 0.6055707205894045, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17237588.0, "logits/rejected": -11142936.0, "logps/chosen": -189.08900451660156, "logps/rejected": -177.30792236328125, "loss": 0.1728, "rewards/chosen": 0.8540927767753601, "rewards/margins": 3.3927032351493835, "rewards/rejected": -2.5386104583740234, "step": 11425 }, { "epoch": 0.6056237245912066, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13400649.0, "logits/rejected": -15398401.333333334, "logps/chosen": -249.23031616210938, "logps/rejected": -317.82720947265625, "loss": 0.2463, "rewards/chosen": -0.6351547241210938, "rewards/margins": 2.3953542709350586, "rewards/rejected": -3.0305089950561523, "step": 11426 }, { "epoch": 0.6056767285930088, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47102544.0, "logits/rejected": -5721406.0, "logps/chosen": -160.172265625, "logps/rejected": -128.74418131510416, "loss": 0.4025, "rewards/chosen": 0.15067298412323, "rewards/margins": 1.0154249588648478, "rewards/rejected": -0.8647519747416178, "step": 11427 }, { "epoch": 0.6057297325948109, "grad_norm": 48.75, "kl": 1.0959110260009766, "learning_rate": 5e-07, "logits/chosen": -7247332.666666667, "logits/rejected": 49361844.0, "logps/chosen": -352.8519694010417, "logps/rejected": -331.845703125, "loss": 0.3432, "rewards/chosen": 0.8283029397328695, "rewards/margins": 2.134235938390096, "rewards/rejected": -1.3059329986572266, "step": 11428 }, { "epoch": 0.6057827365966131, "grad_norm": 42.5, "kl": 0.74981689453125, "learning_rate": 5e-07, "logits/chosen": -10846582.4, "logits/rejected": -21842816.0, "logps/chosen": -174.2275146484375, "logps/rejected": -263.1490885416667, "loss": 0.3252, "rewards/chosen": 0.3280069828033447, "rewards/margins": 3.012673298517863, "rewards/rejected": -2.684666315714518, "step": 11429 }, { "epoch": 0.6058357405984152, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19699344.0, "logits/rejected": -36373680.0, "logps/chosen": -177.3681182861328, "logps/rejected": -363.3475341796875, "loss": 0.3159, "rewards/chosen": -0.07052746415138245, "rewards/margins": 2.1045027673244476, "rewards/rejected": -2.17503023147583, "step": 11430 }, { "epoch": 0.6058887446002174, "grad_norm": 47.5, "kl": 2.1342811584472656, "learning_rate": 5e-07, "logits/chosen": -12344562.0, "logits/rejected": -16575601.0, "logps/chosen": -462.355712890625, "logps/rejected": -399.474365234375, "loss": 0.2039, "rewards/chosen": 0.9970588684082031, "rewards/margins": 3.359016180038452, "rewards/rejected": -2.361957311630249, "step": 11431 }, { "epoch": 0.6059417486020194, "grad_norm": 58.25, "kl": 1.841212272644043, "learning_rate": 5e-07, "logits/chosen": -29248397.333333332, "logits/rejected": -23224968.0, "logps/chosen": -359.9036865234375, "logps/rejected": -284.42681884765625, "loss": 0.3932, "rewards/chosen": 0.5432434479395548, "rewards/margins": 1.9075508515040078, "rewards/rejected": -1.3643074035644531, "step": 11432 }, { "epoch": 0.6059947526038216, "grad_norm": 56.25, "kl": 0.67279052734375, "learning_rate": 5e-07, "logits/chosen": -19319629.333333332, "logits/rejected": -47609702.4, "logps/chosen": -385.4100748697917, "logps/rejected": -326.909912109375, "loss": 0.1782, "rewards/chosen": 0.3837677637736003, "rewards/margins": 4.011232058207194, "rewards/rejected": -3.6274642944335938, "step": 11433 }, { "epoch": 0.6060477566056237, "grad_norm": 55.75, "kl": 3.4404001235961914, "learning_rate": 5e-07, "logits/chosen": -15688274.285714285, "logits/rejected": -61978676.0, "logps/chosen": -235.49555315290178, "logps/rejected": -462.02581787109375, "loss": 0.434, "rewards/chosen": 0.4241448811122349, "rewards/margins": 4.4929987362452914, "rewards/rejected": -4.068853855133057, "step": 11434 }, { "epoch": 0.6061007606074259, "grad_norm": 54.25, "kl": 2.841360092163086, "learning_rate": 5e-07, "logits/chosen": -14269498.285714285, "logits/rejected": -53457788.0, "logps/chosen": -284.0347900390625, "logps/rejected": -698.4700927734375, "loss": 0.3666, "rewards/chosen": 0.7015915598188128, "rewards/margins": 3.871342488697597, "rewards/rejected": -3.169750928878784, "step": 11435 }, { "epoch": 0.606153764609228, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -24461278.0, "logps/rejected": -353.7187805175781, "loss": 0.2319, "rewards/rejected": -1.7263596057891846, "step": 11436 }, { "epoch": 0.6062067686110302, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74552104.0, "logits/rejected": -9688884.0, "logps/chosen": -251.1328125, "logps/rejected": -110.3699951171875, "loss": 0.2723, "rewards/chosen": -0.34210890531539917, "rewards/margins": 2.0332202712694802, "rewards/rejected": -2.3753291765848794, "step": 11437 }, { "epoch": 0.6062597726128323, "grad_norm": 37.5, "kl": 0.9108924865722656, "learning_rate": 5e-07, "logits/chosen": -17151760.0, "logits/rejected": -11142208.0, "logps/chosen": -197.0801798502604, "logps/rejected": -377.574658203125, "loss": 0.3025, "rewards/chosen": -0.04370792706807455, "rewards/margins": 2.3656062285105386, "rewards/rejected": -2.4093141555786133, "step": 11438 }, { "epoch": 0.6063127766146345, "grad_norm": 46.5, "kl": 1.5014209747314453, "learning_rate": 5e-07, "logits/chosen": -22718724.0, "logits/rejected": 35778312.0, "logps/chosen": -622.305419921875, "logps/rejected": -240.19271850585938, "loss": 0.2696, "rewards/chosen": 1.2727288007736206, "rewards/margins": 2.193835198879242, "rewards/rejected": -0.9211063981056213, "step": 11439 }, { "epoch": 0.6063657806164365, "grad_norm": 64.0, "kl": 0.7383518218994141, "learning_rate": 5e-07, "logits/chosen": -21121114.666666668, "logits/rejected": -6658697.6, "logps/chosen": -461.0489908854167, "logps/rejected": -183.81507568359376, "loss": 0.2171, "rewards/chosen": 0.8890721797943115, "rewards/margins": 3.1092084407806397, "rewards/rejected": -2.220136260986328, "step": 11440 }, { "epoch": 0.6064187846182387, "grad_norm": 47.5, "kl": 0.9201469421386719, "learning_rate": 5e-07, "logits/chosen": -26005224.0, "logits/rejected": -64543424.0, "logps/chosen": -225.425048828125, "logps/rejected": -368.5144449869792, "loss": 0.244, "rewards/chosen": 0.8517341613769531, "rewards/margins": 4.056290944417318, "rewards/rejected": -3.2045567830403647, "step": 11441 }, { "epoch": 0.6064717886200408, "grad_norm": 41.75, "kl": 3.3336000442504883, "learning_rate": 5e-07, "logits/chosen": -11388698.666666666, "logits/rejected": -25501100.0, "logps/chosen": -330.2606608072917, "logps/rejected": -502.2320556640625, "loss": 0.345, "rewards/chosen": 0.890620231628418, "rewards/margins": 3.0137746334075928, "rewards/rejected": -2.123154401779175, "step": 11442 }, { "epoch": 0.606524792621843, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40671828.0, "logits/rejected": -6996345.0, "logps/chosen": -429.1117248535156, "logps/rejected": -317.38916015625, "loss": 0.3186, "rewards/chosen": 0.40425968170166016, "rewards/margins": 2.2696386575698853, "rewards/rejected": -1.865378975868225, "step": 11443 }, { "epoch": 0.6065777966236451, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78457712.0, "logits/rejected": -39758390.85714286, "logps/chosen": -684.5152587890625, "logps/rejected": -346.94785853794644, "loss": 0.1133, "rewards/chosen": 1.7918213605880737, "rewards/margins": 4.503067851066589, "rewards/rejected": -2.7112464904785156, "step": 11444 }, { "epoch": 0.6066308006254473, "grad_norm": 25.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12351453.0, "logits/rejected": -26343707.42857143, "logps/chosen": -221.56298828125, "logps/rejected": -358.86875697544644, "loss": 0.1459, "rewards/chosen": 1.2695220708847046, "rewards/margins": 3.8649314641952515, "rewards/rejected": -2.595409393310547, "step": 11445 }, { "epoch": 0.6066838046272494, "grad_norm": 76.5, "kl": 1.9187202453613281, "learning_rate": 5e-07, "logits/chosen": -52353808.0, "logits/rejected": 21273636.0, "logps/chosen": -528.1051432291666, "logps/rejected": -367.69622802734375, "loss": 0.4346, "rewards/chosen": 0.32607144117355347, "rewards/margins": 1.9936687350273132, "rewards/rejected": -1.6675972938537598, "step": 11446 }, { "epoch": 0.6067368086290514, "grad_norm": 85.5, "kl": 0.29810142517089844, "learning_rate": 5e-07, "logits/chosen": -96897984.0, "logits/rejected": -15976809.333333334, "logps/chosen": -477.7724304199219, "logps/rejected": -280.7878011067708, "loss": 0.1841, "rewards/chosen": 0.598400890827179, "rewards/margins": 2.9529056350390115, "rewards/rejected": -2.3545047442118325, "step": 11447 }, { "epoch": 0.6067898126308536, "grad_norm": 43.75, "kl": 0.6221389770507812, "learning_rate": 5e-07, "logits/chosen": -57112488.0, "logits/rejected": -29144396.0, "logps/chosen": -346.9901123046875, "logps/rejected": -268.39141845703125, "loss": 0.2368, "rewards/chosen": 0.6883142590522766, "rewards/margins": 3.0041382908821106, "rewards/rejected": -2.315824031829834, "step": 11448 }, { "epoch": 0.6068428166326557, "grad_norm": 32.75, "kl": 0.34147071838378906, "learning_rate": 5e-07, "logits/chosen": -10532015.333333334, "logits/rejected": -34567440.0, "logps/chosen": -694.1774088541666, "logps/rejected": -399.149609375, "loss": 0.1705, "rewards/chosen": 1.6806902885437012, "rewards/margins": 4.581961345672608, "rewards/rejected": -2.9012710571289064, "step": 11449 }, { "epoch": 0.6068958206344579, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45523072.0, "logits/rejected": -33195600.0, "logps/chosen": -214.0934041341146, "logps/rejected": -397.61484375, "loss": 0.3307, "rewards/chosen": -0.46190404891967773, "rewards/margins": 1.5282532691955566, "rewards/rejected": -1.9901573181152343, "step": 11450 }, { "epoch": 0.60694882463626, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12772300.0, "logits/rejected": -13822720.0, "logps/chosen": -447.599609375, "logps/rejected": -482.09637451171875, "loss": 0.2231, "rewards/chosen": 1.193241834640503, "rewards/margins": 3.562866687774658, "rewards/rejected": -2.3696248531341553, "step": 11451 }, { "epoch": 0.6070018286380622, "grad_norm": 35.0, "kl": 2.2656097412109375, "learning_rate": 5e-07, "logits/chosen": 9561932.0, "logits/rejected": -36720996.0, "logps/chosen": -31.82714080810547, "logps/rejected": -345.4942321777344, "loss": 0.1943, "rewards/chosen": 1.3327518701553345, "rewards/margins": 3.764447331428528, "rewards/rejected": -2.4316954612731934, "step": 11452 }, { "epoch": 0.6070548326398643, "grad_norm": 45.75, "kl": 0.9554824829101562, "learning_rate": 5e-07, "logits/chosen": -8375400.0, "logits/rejected": -17198252.0, "logps/chosen": -327.2402038574219, "logps/rejected": -477.78277587890625, "loss": 0.2391, "rewards/chosen": 0.6503692865371704, "rewards/margins": 3.4565902948379517, "rewards/rejected": -2.8062210083007812, "step": 11453 }, { "epoch": 0.6071078366416665, "grad_norm": 48.25, "kl": 0.26861047744750977, "learning_rate": 5e-07, "logits/chosen": -19104681.6, "logits/rejected": -18638314.666666668, "logps/chosen": -271.1677978515625, "logps/rejected": -317.2380777994792, "loss": 0.3678, "rewards/chosen": 0.27482821941375735, "rewards/margins": 1.9675705671310424, "rewards/rejected": -1.6927423477172852, "step": 11454 }, { "epoch": 0.6071608406434685, "grad_norm": 52.75, "kl": 1.6080951690673828, "learning_rate": 5e-07, "logits/chosen": -19023265.6, "logits/rejected": -32249032.0, "logps/chosen": -196.51854248046874, "logps/rejected": -525.5134684244791, "loss": 0.3801, "rewards/chosen": 0.1945541262626648, "rewards/margins": 2.044238841533661, "rewards/rejected": -1.849684715270996, "step": 11455 }, { "epoch": 0.6072138446452707, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24256688.0, "logits/rejected": -4825798.666666667, "logps/chosen": -283.57919921875, "logps/rejected": -135.45482381184897, "loss": 0.2071, "rewards/chosen": 1.0278087615966798, "rewards/margins": 3.3954298655192057, "rewards/rejected": -2.367621103922526, "step": 11456 }, { "epoch": 0.6072668486470728, "grad_norm": 47.75, "kl": 0.9936122894287109, "learning_rate": 5e-07, "logits/chosen": -23762764.0, "logits/rejected": -22616258.666666668, "logps/chosen": -120.74348449707031, "logps/rejected": -307.8654378255208, "loss": 0.1883, "rewards/chosen": 0.5602075457572937, "rewards/margins": 2.8914111653963723, "rewards/rejected": -2.3312036196390786, "step": 11457 }, { "epoch": 0.607319852648875, "grad_norm": 51.5, "kl": 4.6704864501953125, "learning_rate": 5e-07, "logits/chosen": -19125626.285714287, "logits/rejected": -16700084.0, "logps/chosen": -541.4554268973214, "logps/rejected": -191.3349151611328, "loss": 0.371, "rewards/chosen": 1.091291836329869, "rewards/margins": 2.1387208018984114, "rewards/rejected": -1.0474289655685425, "step": 11458 }, { "epoch": 0.6073728566506771, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4867923.333333333, "logits/rejected": -33590755.2, "logps/chosen": -258.6739095052083, "logps/rejected": -446.8220703125, "loss": 0.1423, "rewards/chosen": 1.398194948832194, "rewards/margins": 4.123631731669108, "rewards/rejected": -2.725436782836914, "step": 11459 }, { "epoch": 0.6074258606524793, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -24914518.0, "logps/rejected": -153.75247192382812, "loss": 0.2793, "rewards/rejected": -1.2114864587783813, "step": 11460 }, { "epoch": 0.6074788646542814, "grad_norm": 30.625, "kl": 1.2058124542236328, "learning_rate": 5e-07, "logits/chosen": -12207937.333333334, "logits/rejected": -98308761.6, "logps/chosen": -178.41756184895834, "logps/rejected": -280.6453857421875, "loss": 0.1652, "rewards/chosen": 0.958317756652832, "rewards/margins": 4.388927268981933, "rewards/rejected": -3.4306095123291014, "step": 11461 }, { "epoch": 0.6075318686560836, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4707024.0, "logits/rejected": -24350754.666666668, "logps/chosen": -141.62413024902344, "logps/rejected": -301.2526041666667, "loss": 0.2145, "rewards/chosen": 0.41054680943489075, "rewards/margins": 2.4718224505583444, "rewards/rejected": -2.0612756411234536, "step": 11462 }, { "epoch": 0.6075848726578856, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44088098.666666664, "logits/rejected": -6888427.2, "logps/chosen": -579.7535807291666, "logps/rejected": -372.08408203125, "loss": 0.2619, "rewards/chosen": 0.7458353042602539, "rewards/margins": 2.4801559448242188, "rewards/rejected": -1.7343206405639648, "step": 11463 }, { "epoch": 0.6076378766596878, "grad_norm": 66.5, "kl": 0.7452621459960938, "learning_rate": 5e-07, "logits/chosen": -55762099.2, "logits/rejected": -95794602.66666667, "logps/chosen": -381.5145263671875, "logps/rejected": -300.8232014973958, "loss": 0.3139, "rewards/chosen": 0.658392333984375, "rewards/margins": 2.6906929651896156, "rewards/rejected": -2.0323006312052407, "step": 11464 }, { "epoch": 0.6076908806614899, "grad_norm": 46.75, "kl": 1.3192825317382812, "learning_rate": 5e-07, "logits/chosen": 8662343.2, "logits/rejected": 4160584.0, "logps/chosen": -288.97578125, "logps/rejected": -619.1737467447916, "loss": 0.2914, "rewards/chosen": 0.9672887802124024, "rewards/margins": 2.7893724123636883, "rewards/rejected": -1.8220836321512859, "step": 11465 }, { "epoch": 0.6077438846632921, "grad_norm": 30.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7489997.5, "logits/rejected": -35208402.28571428, "logps/chosen": -21.739606857299805, "logps/rejected": -303.2467564174107, "loss": 0.1757, "rewards/chosen": 0.08724822849035263, "rewards/margins": 2.5808744143162454, "rewards/rejected": -2.493626185825893, "step": 11466 }, { "epoch": 0.6077968886650942, "grad_norm": 54.25, "kl": 1.2118120193481445, "learning_rate": 5e-07, "logits/chosen": -18957494.4, "logits/rejected": -14904025.333333334, "logps/chosen": -181.4944091796875, "logps/rejected": -320.73895263671875, "loss": 0.3807, "rewards/chosen": 0.09813885688781739, "rewards/margins": 2.1080952167510985, "rewards/rejected": -2.0099563598632812, "step": 11467 }, { "epoch": 0.6078498926668964, "grad_norm": 41.5, "kl": 0.4509868621826172, "learning_rate": 5e-07, "logits/chosen": -71663320.0, "logits/rejected": -8635371.0, "logps/chosen": -168.47703552246094, "logps/rejected": -235.93576049804688, "loss": 0.2521, "rewards/chosen": 0.6667827367782593, "rewards/margins": 2.6494637727737427, "rewards/rejected": -1.9826810359954834, "step": 11468 }, { "epoch": 0.6079028966686985, "grad_norm": 46.75, "kl": 1.7760076522827148, "learning_rate": 5e-07, "logits/chosen": -28433900.8, "logits/rejected": -18123904.0, "logps/chosen": -281.0396484375, "logps/rejected": -337.5710042317708, "loss": 0.2222, "rewards/chosen": 1.6488767623901368, "rewards/margins": 4.15024242401123, "rewards/rejected": -2.5013656616210938, "step": 11469 }, { "epoch": 0.6079559006705006, "grad_norm": 47.25, "kl": 0.912200927734375, "learning_rate": 5e-07, "logits/chosen": -3590077.6666666665, "logits/rejected": -43415059.2, "logps/chosen": -182.86625162760416, "logps/rejected": -301.789306640625, "loss": 0.2423, "rewards/chosen": 0.36356155077616376, "rewards/margins": 2.570745770136515, "rewards/rejected": -2.2071842193603515, "step": 11470 }, { "epoch": 0.6080089046723027, "grad_norm": 47.25, "kl": 1.2023124694824219, "learning_rate": 5e-07, "logits/chosen": -8708716.0, "logits/rejected": 1957760.875, "logps/chosen": -221.1519978841146, "logps/rejected": -91.81039428710938, "loss": 0.3531, "rewards/chosen": 0.5059753259023031, "rewards/margins": 2.345263679822286, "rewards/rejected": -1.839288353919983, "step": 11471 }, { "epoch": 0.6080619086741049, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -540908.1666666666, "logits/rejected": -29809017.6, "logps/chosen": -30.811737060546875, "logps/rejected": -403.058349609375, "loss": 0.169, "rewards/chosen": 1.0076851844787598, "rewards/margins": 3.564177417755127, "rewards/rejected": -2.556492233276367, "step": 11472 }, { "epoch": 0.608114912675907, "grad_norm": 21.125, "kl": 1.2671432495117188, "learning_rate": 5e-07, "logits/chosen": -42515184.0, "logits/rejected": -24546181.333333332, "logps/chosen": -609.2310180664062, "logps/rejected": -338.07838948567706, "loss": 0.0601, "rewards/chosen": 2.4513869285583496, "rewards/margins": 5.72607151667277, "rewards/rejected": -3.2746845881144204, "step": 11473 }, { "epoch": 0.6081679166777092, "grad_norm": 51.5, "kl": 1.9078941345214844, "learning_rate": 5e-07, "logits/chosen": -17188781.714285713, "logits/rejected": -2358289.5, "logps/chosen": -251.79338727678572, "logps/rejected": -82.39697265625, "loss": 0.3405, "rewards/chosen": 0.660667964390346, "rewards/margins": 3.1059210981641496, "rewards/rejected": -2.4452531337738037, "step": 11474 }, { "epoch": 0.6082209206795113, "grad_norm": 50.25, "kl": 2.540005683898926, "learning_rate": 5e-07, "logits/chosen": 28031084.8, "logits/rejected": -22588378.666666668, "logps/chosen": -273.725341796875, "logps/rejected": -273.0813802083333, "loss": 0.363, "rewards/chosen": 0.4848527431488037, "rewards/margins": 2.3753518581390383, "rewards/rejected": -1.8904991149902344, "step": 11475 }, { "epoch": 0.6082739246813135, "grad_norm": 53.25, "kl": 2.0740127563476562, "learning_rate": 5e-07, "logits/chosen": 670520.0, "logits/rejected": -44394886.4, "logps/chosen": -490.3082682291667, "logps/rejected": -460.5189453125, "loss": 0.1943, "rewards/chosen": 1.592665672302246, "rewards/margins": 4.07839298248291, "rewards/rejected": -2.485727310180664, "step": 11476 }, { "epoch": 0.6083269286831156, "grad_norm": 59.0, "kl": 1.0712876319885254, "learning_rate": 5e-07, "logits/chosen": -34553884.0, "logps/chosen": -282.9968566894531, "loss": 0.4934, "rewards/chosen": 0.13220161199569702, "step": 11477 }, { "epoch": 0.6083799326849177, "grad_norm": 42.5, "kl": 0.15822982788085938, "learning_rate": 5e-07, "logits/chosen": -8495473.6, "logits/rejected": -30740562.666666668, "logps/chosen": -165.7752685546875, "logps/rejected": -149.87942504882812, "loss": 0.3247, "rewards/chosen": 0.5145728111267089, "rewards/margins": 1.9421215375264484, "rewards/rejected": -1.4275487263997395, "step": 11478 }, { "epoch": 0.6084329366867198, "grad_norm": 53.5, "kl": 0.2278594970703125, "learning_rate": 5e-07, "logits/chosen": -50450744.0, "logits/rejected": -17266866.0, "logps/chosen": -344.816650390625, "logps/rejected": -219.09573364257812, "loss": 0.3048, "rewards/chosen": -0.02167510986328125, "rewards/margins": 2.4056172370910645, "rewards/rejected": -2.4272923469543457, "step": 11479 }, { "epoch": 0.608485940688522, "grad_norm": 44.0, "kl": 3.3022384643554688, "learning_rate": 5e-07, "logits/chosen": -6808025.333333333, "logits/rejected": -30366649.6, "logps/chosen": -183.445068359375, "logps/rejected": -421.32197265625, "loss": 0.2917, "rewards/chosen": 1.1133670806884766, "rewards/margins": 2.876476860046387, "rewards/rejected": -1.7631097793579102, "step": 11480 }, { "epoch": 0.6085389446903241, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 34759240.0, "logits/rejected": -31204818.0, "logps/chosen": -202.84503173828125, "logps/rejected": -439.5436096191406, "loss": 0.3257, "rewards/chosen": -0.14466814696788788, "rewards/margins": 2.230627492070198, "rewards/rejected": -2.375295639038086, "step": 11481 }, { "epoch": 0.6085919486921263, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18714490.666666668, "logits/rejected": 73423696.0, "logps/chosen": -206.3109130859375, "logps/rejected": -1155.520751953125, "loss": 0.4149, "rewards/chosen": 0.2540263930956523, "rewards/margins": 1.4498516122500102, "rewards/rejected": -1.195825219154358, "step": 11482 }, { "epoch": 0.6086449526939284, "grad_norm": 50.75, "kl": 2.403095245361328, "learning_rate": 5e-07, "logits/chosen": -77221664.0, "logits/rejected": -29315443.2, "logps/chosen": -674.6051839192709, "logps/rejected": -236.63564453125, "loss": 0.2952, "rewards/chosen": 0.026916523774464924, "rewards/margins": 2.2528558929761253, "rewards/rejected": -2.2259393692016602, "step": 11483 }, { "epoch": 0.6086979566957306, "grad_norm": 38.75, "kl": 0.6766433715820312, "learning_rate": 5e-07, "logits/chosen": -27279196.8, "logits/rejected": -15160068.0, "logps/chosen": -498.1171875, "logps/rejected": -157.79434204101562, "loss": 0.2719, "rewards/chosen": 1.2709633827209472, "rewards/margins": 4.072988859812418, "rewards/rejected": -2.802025477091471, "step": 11484 }, { "epoch": 0.6087509606975327, "grad_norm": 40.75, "kl": 2.0006771087646484, "learning_rate": 5e-07, "logits/chosen": -20731214.666666668, "logits/rejected": 20863728.0, "logps/chosen": -202.65008544921875, "logps/rejected": -234.608251953125, "loss": 0.2744, "rewards/chosen": 0.29812419414520264, "rewards/margins": 2.0535767316818236, "rewards/rejected": -1.755452537536621, "step": 11485 }, { "epoch": 0.6088039646993348, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13044915.0, "logits/rejected": 5526154.285714285, "logps/chosen": -31.44720458984375, "logps/rejected": -436.16196986607144, "loss": 0.2332, "rewards/chosen": 0.13852767646312714, "rewards/margins": 2.139906962003027, "rewards/rejected": -2.0013792855398997, "step": 11486 }, { "epoch": 0.6088569687011369, "grad_norm": 45.5, "kl": 0.081634521484375, "learning_rate": 5e-07, "logits/chosen": 1959562.5, "logits/rejected": -19170754.0, "logps/chosen": -271.18328857421875, "logps/rejected": -354.15130615234375, "loss": 0.2948, "rewards/chosen": 0.6941524744033813, "rewards/margins": 2.3676446676254272, "rewards/rejected": -1.673492193222046, "step": 11487 }, { "epoch": 0.6089099727029391, "grad_norm": 38.0, "kl": 0.054604530334472656, "learning_rate": 5e-07, "logits/chosen": -81150122.66666667, "logits/rejected": -23648355.2, "logps/chosen": -223.2467244466146, "logps/rejected": -353.2671875, "loss": 0.2116, "rewards/chosen": 0.9549596309661865, "rewards/margins": 2.9126983165740965, "rewards/rejected": -1.9577386856079102, "step": 11488 }, { "epoch": 0.6089629767047412, "grad_norm": 54.0, "kl": 1.9819717407226562, "learning_rate": 5e-07, "logits/chosen": 76663513.6, "logits/rejected": -37167930.666666664, "logps/chosen": -338.6455810546875, "logps/rejected": -385.6398518880208, "loss": 0.3324, "rewards/chosen": 0.4926711082458496, "rewards/margins": 2.7280879974365235, "rewards/rejected": -2.235416889190674, "step": 11489 }, { "epoch": 0.6090159807065434, "grad_norm": 42.5, "kl": 2.154918670654297, "learning_rate": 5e-07, "logits/chosen": -23481640.0, "logits/rejected": -31127954.0, "logps/chosen": -476.34637451171875, "logps/rejected": -281.11712646484375, "loss": 0.2242, "rewards/chosen": 1.1955478191375732, "rewards/margins": 3.5237252712249756, "rewards/rejected": -2.3281774520874023, "step": 11490 }, { "epoch": 0.6090689847083455, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6834145.6, "logits/rejected": -8923920.0, "logps/chosen": -343.323046875, "logps/rejected": -325.40871175130206, "loss": 0.379, "rewards/chosen": -0.10370612144470215, "rewards/margins": 2.1681101322174072, "rewards/rejected": -2.2718162536621094, "step": 11491 }, { "epoch": 0.6091219887101477, "grad_norm": 36.25, "kl": 0.12518310546875, "learning_rate": 5e-07, "logits/chosen": -29345150.0, "logits/rejected": -19902724.0, "logps/chosen": -536.20361328125, "logps/rejected": -452.47784423828125, "loss": 0.1955, "rewards/chosen": 1.1376502513885498, "rewards/margins": 3.8907852172851562, "rewards/rejected": -2.7531349658966064, "step": 11492 }, { "epoch": 0.6091749927119497, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34677450.666666664, "logits/rejected": -32057216.0, "logps/chosen": -208.9190673828125, "logps/rejected": -282.76708984375, "loss": 0.2256, "rewards/chosen": 0.7191139856974283, "rewards/margins": 3.208161226908366, "rewards/rejected": -2.4890472412109377, "step": 11493 }, { "epoch": 0.6092279967137519, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15695499.2, "logits/rejected": -15544194.666666666, "logps/chosen": -239.06162109375, "logps/rejected": -136.14884440104166, "loss": 0.3232, "rewards/chosen": 0.20369110107421876, "rewards/margins": 2.361608600616455, "rewards/rejected": -2.1579174995422363, "step": 11494 }, { "epoch": 0.609281000715554, "grad_norm": 37.5, "kl": 0.24595069885253906, "learning_rate": 5e-07, "logits/chosen": -18111297.333333332, "logits/rejected": -20933067.2, "logps/chosen": -379.5768229166667, "logps/rejected": -349.19443359375, "loss": 0.1165, "rewards/chosen": 1.8878062566121419, "rewards/margins": 4.523492749532064, "rewards/rejected": -2.635686492919922, "step": 11495 }, { "epoch": 0.6093340047173562, "grad_norm": 52.25, "kl": 0.7215194702148438, "learning_rate": 5e-07, "logits/chosen": -28912064.0, "logits/rejected": -8531252.0, "logps/chosen": -154.02750651041666, "logps/rejected": -160.71343994140625, "loss": 0.4474, "rewards/chosen": 0.08890384435653687, "rewards/margins": 1.2591598629951477, "rewards/rejected": -1.1702560186386108, "step": 11496 }, { "epoch": 0.6093870087191583, "grad_norm": 51.75, "kl": 2.1348228454589844, "learning_rate": 5e-07, "logits/chosen": -48619264.0, "logits/rejected": -17437101.333333332, "logps/chosen": -673.828564453125, "logps/rejected": -222.25545247395834, "loss": 0.3049, "rewards/chosen": 1.2329381942749023, "rewards/margins": 2.7606583277384438, "rewards/rejected": -1.5277201334635417, "step": 11497 }, { "epoch": 0.6094400127209604, "grad_norm": 37.25, "kl": 1.2079534530639648, "learning_rate": 5e-07, "logits/chosen": -32203533.333333332, "logits/rejected": -26587635.2, "logps/chosen": -170.8476359049479, "logps/rejected": -238.63349609375, "loss": 0.2873, "rewards/chosen": 0.18139761686325073, "rewards/margins": 2.0392749667167664, "rewards/rejected": -1.8578773498535157, "step": 11498 }, { "epoch": 0.6094930167227626, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42500501.333333336, "logits/rejected": -21025286.4, "logps/chosen": -247.0650431315104, "logps/rejected": -313.575, "loss": 0.2966, "rewards/chosen": 0.5448547601699829, "rewards/margins": 1.6996604204177856, "rewards/rejected": -1.1548056602478027, "step": 11499 }, { "epoch": 0.6095460207245647, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70229128.0, "logits/rejected": 11479752.0, "logps/chosen": -391.08544921875, "logps/rejected": -108.19669596354167, "loss": 0.1696, "rewards/chosen": 0.8004165887832642, "rewards/margins": 3.1407209634780884, "rewards/rejected": -2.340304374694824, "step": 11500 }, { "epoch": 0.6095990247263668, "grad_norm": 46.75, "kl": 2.728504180908203, "learning_rate": 5e-07, "logits/chosen": -43101485.71428572, "logits/rejected": -19459408.0, "logps/chosen": -373.6483677455357, "logps/rejected": -169.602783203125, "loss": 0.3885, "rewards/chosen": 0.919086183820452, "rewards/margins": 4.296263660703387, "rewards/rejected": -3.3771774768829346, "step": 11501 }, { "epoch": 0.6096520287281689, "grad_norm": 58.5, "kl": 2.0883331298828125, "learning_rate": 5e-07, "logits/chosen": -10296806.0, "logits/rejected": 47999652.0, "logps/chosen": -349.021484375, "logps/rejected": -453.27252197265625, "loss": 0.2806, "rewards/chosen": 1.139837106068929, "rewards/margins": 2.8069513638814287, "rewards/rejected": -1.6671142578125, "step": 11502 }, { "epoch": 0.6097050327299711, "grad_norm": 59.0, "kl": 3.1929588317871094, "learning_rate": 5e-07, "logits/chosen": -29585896.0, "logits/rejected": -2805154.0, "logps/chosen": -151.2236328125, "logps/rejected": -509.7812194824219, "loss": 0.3995, "rewards/chosen": 0.1606408804655075, "rewards/margins": 1.7588369101285934, "rewards/rejected": -1.598196029663086, "step": 11503 }, { "epoch": 0.6097580367317732, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30787674.666666668, "logits/rejected": 279236.0, "logps/chosen": -172.4388427734375, "logps/rejected": -116.7269775390625, "loss": 0.3187, "rewards/chosen": 0.1330927610397339, "rewards/margins": 1.6319990873336792, "rewards/rejected": -1.4989063262939453, "step": 11504 }, { "epoch": 0.6098110407335754, "grad_norm": 37.5, "kl": 0.6708717346191406, "learning_rate": 5e-07, "logits/chosen": -47832672.0, "logits/rejected": -10186640.0, "logps/chosen": -222.45938110351562, "logps/rejected": -266.8416442871094, "loss": 0.2725, "rewards/chosen": 0.26470690965652466, "rewards/margins": 2.7140610814094543, "rewards/rejected": -2.4493541717529297, "step": 11505 }, { "epoch": 0.6098640447353775, "grad_norm": 30.875, "kl": 2.208192825317383, "learning_rate": 5e-07, "logits/chosen": 4828271.0, "logits/rejected": -39806076.8, "logps/chosen": -234.708740234375, "logps/rejected": -427.342529296875, "loss": 0.1724, "rewards/chosen": 1.139916976292928, "rewards/margins": 4.12467687924703, "rewards/rejected": -2.9847599029541017, "step": 11506 }, { "epoch": 0.6099170487371797, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47912616.0, "logits/rejected": -20218084.0, "logps/chosen": -470.6328125, "logps/rejected": -261.22686767578125, "loss": 0.2011, "rewards/chosen": 0.677984356880188, "rewards/margins": 4.174193024635315, "rewards/rejected": -3.496208667755127, "step": 11507 }, { "epoch": 0.6099700527389817, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -527610.6666666666, "logits/rejected": -44524188.8, "logps/chosen": -403.9952799479167, "logps/rejected": -251.853662109375, "loss": 0.2825, "rewards/chosen": 0.30780112743377686, "rewards/margins": 2.2198675870895386, "rewards/rejected": -1.9120664596557617, "step": 11508 }, { "epoch": 0.6100230567407839, "grad_norm": 50.0, "kl": 2.02053165435791, "learning_rate": 5e-07, "logits/chosen": 6680829.0, "logits/rejected": -54330796.0, "logps/chosen": -24.396974563598633, "logps/rejected": -566.6529541015625, "loss": 0.3162, "rewards/chosen": 0.32820039987564087, "rewards/margins": 2.337175667285919, "rewards/rejected": -2.0089752674102783, "step": 11509 }, { "epoch": 0.610076060742586, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15356112.0, "logits/rejected": -40578672.0, "logps/chosen": -139.1751251220703, "logps/rejected": -418.2397054036458, "loss": 0.1873, "rewards/chosen": 0.7728850841522217, "rewards/margins": 3.2885282039642334, "rewards/rejected": -2.5156431198120117, "step": 11510 }, { "epoch": 0.6101290647443882, "grad_norm": 39.25, "kl": 2.375181198120117, "learning_rate": 5e-07, "logits/chosen": -30806521.6, "logits/rejected": -40328506.666666664, "logps/chosen": -274.455859375, "logps/rejected": -878.2223307291666, "loss": 0.3062, "rewards/chosen": 0.48655428886413576, "rewards/margins": 3.3949894746144613, "rewards/rejected": -2.9084351857503257, "step": 11511 }, { "epoch": 0.6101820687461903, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65947240.0, "logits/rejected": -6193836.0, "logps/chosen": -381.7504577636719, "logps/rejected": -140.04979451497397, "loss": 0.2393, "rewards/chosen": 0.18128205835819244, "rewards/margins": 2.3326619615157447, "rewards/rejected": -2.1513799031575522, "step": 11512 }, { "epoch": 0.6102350727479925, "grad_norm": 44.5, "kl": 1.0081901550292969, "learning_rate": 5e-07, "logits/chosen": -80445557.33333333, "logits/rejected": -11726391.2, "logps/chosen": -619.4302978515625, "logps/rejected": -559.00634765625, "loss": 0.1382, "rewards/chosen": 0.9292012850443522, "rewards/margins": 4.428999106089274, "rewards/rejected": -3.499797821044922, "step": 11513 }, { "epoch": 0.6102880767497946, "grad_norm": 46.25, "kl": 2.7548418045043945, "learning_rate": 5e-07, "logits/chosen": -68763648.0, "logits/rejected": -27195178.666666668, "logps/chosen": -348.5978088378906, "logps/rejected": -190.45947265625, "loss": 0.2251, "rewards/chosen": 0.5951126217842102, "rewards/margins": 1.9090359807014465, "rewards/rejected": -1.3139233589172363, "step": 11514 }, { "epoch": 0.6103410807515968, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31991952.0, "logits/rejected": -97229200.0, "logps/chosen": -162.79507446289062, "logps/rejected": -591.6946411132812, "loss": 0.3094, "rewards/chosen": -0.33342379331588745, "rewards/margins": 3.0061972737312317, "rewards/rejected": -3.339621067047119, "step": 11515 }, { "epoch": 0.6103940847533988, "grad_norm": 67.0, "kl": 2.4394845962524414, "learning_rate": 5e-07, "logits/chosen": 29506172.8, "logits/rejected": -2642378.3333333335, "logps/chosen": -296.11826171875, "logps/rejected": -134.64717610677084, "loss": 0.3791, "rewards/chosen": 0.24268751144409179, "rewards/margins": 1.6952717781066895, "rewards/rejected": -1.4525842666625977, "step": 11516 }, { "epoch": 0.610447088755201, "grad_norm": 64.5, "kl": 0.7969951629638672, "learning_rate": 5e-07, "logits/chosen": -9921754.0, "logits/rejected": 8043949.5, "logps/chosen": -376.520263671875, "logps/rejected": -290.4345703125, "loss": 0.244, "rewards/chosen": 0.6768778562545776, "rewards/margins": 3.0612016916275024, "rewards/rejected": -2.384323835372925, "step": 11517 }, { "epoch": 0.6105000927570031, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12170374.0, "logits/rejected": -13095925.0, "logps/chosen": -240.53273010253906, "logps/rejected": -447.4678039550781, "loss": 0.3263, "rewards/chosen": -0.24220772087574005, "rewards/margins": 2.1808960884809494, "rewards/rejected": -2.4231038093566895, "step": 11518 }, { "epoch": 0.6105530967588053, "grad_norm": 45.75, "kl": 1.123291015625, "learning_rate": 5e-07, "logits/chosen": -30052244.0, "logits/rejected": -63715616.0, "logps/chosen": -267.0174560546875, "logps/rejected": -500.1273193359375, "loss": 0.2406, "rewards/chosen": 0.7278299331665039, "rewards/margins": 3.0070998668670654, "rewards/rejected": -2.2792699337005615, "step": 11519 }, { "epoch": 0.6106061007606074, "grad_norm": 49.75, "kl": 1.7689285278320312, "learning_rate": 5e-07, "logits/chosen": -57374648.0, "logits/rejected": 6788017.0, "logps/chosen": -408.8071594238281, "logps/rejected": -158.38951110839844, "loss": 0.3372, "rewards/chosen": 0.1247352659702301, "rewards/margins": 1.7983469069004059, "rewards/rejected": -1.6736116409301758, "step": 11520 }, { "epoch": 0.6106591047624096, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5854075.6, "logits/rejected": -4733169.333333333, "logps/chosen": -88.14366455078125, "logps/rejected": -210.11397298177084, "loss": 0.349, "rewards/chosen": 0.04445629119873047, "rewards/margins": 2.2094867706298826, "rewards/rejected": -2.1650304794311523, "step": 11521 }, { "epoch": 0.6107121087642117, "grad_norm": 35.5, "kl": 0.8194351196289062, "learning_rate": 5e-07, "logits/chosen": -21275794.666666668, "logits/rejected": -24305795.2, "logps/chosen": -258.75653076171875, "logps/rejected": -417.330322265625, "loss": 0.2117, "rewards/chosen": 0.0638262430826823, "rewards/margins": 3.354568354288737, "rewards/rejected": -3.290742111206055, "step": 11522 }, { "epoch": 0.6107651127660139, "grad_norm": 44.25, "kl": 2.1959400177001953, "learning_rate": 5e-07, "logits/chosen": -35483400.0, "logits/rejected": -10197019.0, "logps/chosen": -167.00216674804688, "logps/rejected": -219.18966674804688, "loss": 0.2672, "rewards/chosen": 0.5544924736022949, "rewards/margins": 3.6314799785614014, "rewards/rejected": -3.0769875049591064, "step": 11523 }, { "epoch": 0.6108181167678159, "grad_norm": 43.0, "kl": 1.6969213485717773, "learning_rate": 5e-07, "logits/chosen": -27717284.0, "logits/rejected": -114011072.0, "logps/chosen": -314.6126708984375, "logps/rejected": -377.54437255859375, "loss": 0.2696, "rewards/chosen": 0.7892071604728699, "rewards/margins": 3.016126573085785, "rewards/rejected": -2.226919412612915, "step": 11524 }, { "epoch": 0.6108711207696181, "grad_norm": 52.0, "kl": 1.8341827392578125, "learning_rate": 5e-07, "logits/chosen": 87203923.2, "logits/rejected": -97866410.66666667, "logps/chosen": -210.4839111328125, "logps/rejected": -556.6717122395834, "loss": 0.3147, "rewards/chosen": 0.28496198654174804, "rewards/margins": 2.8687460899353026, "rewards/rejected": -2.5837841033935547, "step": 11525 }, { "epoch": 0.6109241247714202, "grad_norm": 73.0, "kl": 1.0012245178222656, "learning_rate": 5e-07, "logits/chosen": -22303660.0, "logits/rejected": -17184048.0, "logps/chosen": -536.4745279947916, "logps/rejected": -245.98525390625, "loss": 0.2605, "rewards/chosen": 1.212334156036377, "rewards/margins": 2.7721907615661623, "rewards/rejected": -1.5598566055297851, "step": 11526 }, { "epoch": 0.6109771287732224, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64053824.0, "logits/rejected": -41678771.2, "logps/chosen": -329.35451253255206, "logps/rejected": -181.218701171875, "loss": 0.2112, "rewards/chosen": 0.8459767500559489, "rewards/margins": 3.796173588434855, "rewards/rejected": -2.9501968383789063, "step": 11527 }, { "epoch": 0.6110301327750245, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -122644160.0, "logits/rejected": -25708930.666666668, "logps/chosen": -524.8728637695312, "logps/rejected": -233.8586629231771, "loss": 0.2598, "rewards/chosen": 0.01540374755859375, "rewards/margins": 1.8259140650431316, "rewards/rejected": -1.8105103174845378, "step": 11528 }, { "epoch": 0.6110831367768267, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1226606.0, "logits/rejected": -29387049.6, "logps/chosen": -145.29812622070312, "logps/rejected": -397.99365234375, "loss": 0.2001, "rewards/chosen": 0.8135749499003092, "rewards/margins": 3.2020532290140786, "rewards/rejected": -2.3884782791137695, "step": 11529 }, { "epoch": 0.6111361407786288, "grad_norm": 69.5, "kl": 2.092205047607422, "learning_rate": 5e-07, "logits/chosen": 20781169.6, "logits/rejected": -22485997.333333332, "logps/chosen": -276.7278076171875, "logps/rejected": -355.3454182942708, "loss": 0.3987, "rewards/chosen": 0.33130273818969724, "rewards/margins": 1.8904369036356607, "rewards/rejected": -1.5591341654459636, "step": 11530 }, { "epoch": 0.611189144780431, "grad_norm": 45.0, "kl": 2.9821462631225586, "learning_rate": 5e-07, "logits/chosen": -16936868.0, "logits/rejected": -3735057.25, "logps/chosen": -297.26605224609375, "logps/rejected": -201.25515747070312, "loss": 0.2657, "rewards/chosen": 0.713833212852478, "rewards/margins": 2.7050870656967163, "rewards/rejected": -1.9912538528442383, "step": 11531 }, { "epoch": 0.611242148782233, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6064290.0, "logits/rejected": -32220093.333333332, "logps/chosen": -108.6644058227539, "logps/rejected": -379.8276774088542, "loss": 0.174, "rewards/chosen": 0.05783975124359131, "rewards/margins": 3.2164249817530313, "rewards/rejected": -3.15858523050944, "step": 11532 }, { "epoch": 0.6112951527840352, "grad_norm": 32.5, "kl": 1.491480827331543, "learning_rate": 5e-07, "logits/chosen": -777219.5625, "logits/rejected": -30761236.0, "logps/chosen": -66.78599548339844, "logps/rejected": -266.7425231933594, "loss": 0.2885, "rewards/chosen": 0.6525766253471375, "rewards/margins": 2.624145805835724, "rewards/rejected": -1.9715691804885864, "step": 11533 }, { "epoch": 0.6113481567858373, "grad_norm": 37.5, "kl": 2.2088356018066406, "learning_rate": 5e-07, "logits/chosen": -64537971.2, "logits/rejected": -11893026.666666666, "logps/chosen": -145.058154296875, "logps/rejected": -171.41961669921875, "loss": 0.321, "rewards/chosen": 0.44008588790893555, "rewards/margins": 2.290767192840576, "rewards/rejected": -1.8506813049316406, "step": 11534 }, { "epoch": 0.6114011607876395, "grad_norm": 42.75, "kl": 2.1020736694335938, "learning_rate": 5e-07, "logits/chosen": -34598232.0, "logits/rejected": -34106488.0, "logps/chosen": -248.62312825520834, "logps/rejected": -371.5041809082031, "loss": 0.4205, "rewards/chosen": 0.12010228633880615, "rewards/margins": 1.6776043176651, "rewards/rejected": -1.557502031326294, "step": 11535 }, { "epoch": 0.6114541647894416, "grad_norm": 61.25, "kl": 4.21087646484375, "learning_rate": 5e-07, "logits/chosen": -476087.0833333333, "logits/rejected": -39088344.0, "logps/chosen": -191.59114583333334, "logps/rejected": -556.9560546875, "loss": 0.4254, "rewards/chosen": 0.702874501546224, "rewards/margins": 2.6865654786427817, "rewards/rejected": -1.9836909770965576, "step": 11536 }, { "epoch": 0.6115071687912438, "grad_norm": 38.25, "kl": 0.764068603515625, "learning_rate": 5e-07, "logits/chosen": -15252773.0, "logits/rejected": -25238236.0, "logps/chosen": -326.7890625, "logps/rejected": -522.128662109375, "loss": 0.1735, "rewards/chosen": 1.1440010070800781, "rewards/margins": 4.446097373962402, "rewards/rejected": -3.302096366882324, "step": 11537 }, { "epoch": 0.6115601727930459, "grad_norm": 45.5, "kl": 0.13434982299804688, "learning_rate": 5e-07, "logits/chosen": -32643792.0, "logits/rejected": -19626507.2, "logps/chosen": -271.26003011067706, "logps/rejected": -260.650244140625, "loss": 0.2702, "rewards/chosen": -0.03730762004852295, "rewards/margins": 2.1590596437454224, "rewards/rejected": -2.1963672637939453, "step": 11538 }, { "epoch": 0.6116131767948481, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22643416.0, "logits/rejected": -5757120.0, "logps/chosen": -227.65615844726562, "logps/rejected": -233.516357421875, "loss": 0.2526, "rewards/chosen": 0.46394574642181396, "rewards/margins": 2.912314772605896, "rewards/rejected": -2.448369026184082, "step": 11539 }, { "epoch": 0.6116661807966501, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8120268.8, "logits/rejected": -16173392.0, "logps/chosen": -323.073388671875, "logps/rejected": -64.62094116210938, "loss": 0.3198, "rewards/chosen": 0.3536362171173096, "rewards/margins": 2.2119868119557697, "rewards/rejected": -1.8583505948384602, "step": 11540 }, { "epoch": 0.6117191847984523, "grad_norm": 28.375, "kl": 1.536630630493164, "learning_rate": 5e-07, "logits/chosen": -25768352.0, "logits/rejected": -57149032.0, "logps/chosen": -230.33831787109375, "logps/rejected": -380.3272705078125, "loss": 0.172, "rewards/chosen": 1.3431849479675293, "rewards/margins": 4.494305372238159, "rewards/rejected": -3.15112042427063, "step": 11541 }, { "epoch": 0.6117721888002544, "grad_norm": 41.25, "kl": 3.110687255859375, "learning_rate": 5e-07, "logits/chosen": -16013744.0, "logits/rejected": -80881312.0, "logps/chosen": -155.7696533203125, "logps/rejected": -297.3412272135417, "loss": 0.3468, "rewards/chosen": 0.3543436288833618, "rewards/margins": 3.664106265703837, "rewards/rejected": -3.309762636820475, "step": 11542 }, { "epoch": 0.6118251928020566, "grad_norm": 45.75, "kl": 2.7442893981933594, "learning_rate": 5e-07, "logits/chosen": -25025418.666666668, "logits/rejected": -39515168.0, "logps/chosen": -316.6134033203125, "logps/rejected": -173.41969299316406, "loss": 0.3167, "rewards/chosen": 0.6903759638468424, "rewards/margins": 2.7620805899302163, "rewards/rejected": -2.071704626083374, "step": 11543 }, { "epoch": 0.6118781968038587, "grad_norm": 47.0, "kl": 0.23270416259765625, "learning_rate": 5e-07, "logits/chosen": 11062874.0, "logits/rejected": -41881712.0, "logps/chosen": -188.94683837890625, "logps/rejected": -665.0684204101562, "loss": 0.3769, "rewards/chosen": -0.5491185188293457, "rewards/margins": 2.1896541118621826, "rewards/rejected": -2.7387726306915283, "step": 11544 }, { "epoch": 0.6119312008056609, "grad_norm": 50.75, "kl": 0.7325973510742188, "learning_rate": 5e-07, "logits/chosen": -32644413.333333332, "logits/rejected": -37979512.0, "logps/chosen": -293.6200764973958, "logps/rejected": -326.6997375488281, "loss": 0.2603, "rewards/chosen": 1.0284223556518555, "rewards/margins": 3.189361333847046, "rewards/rejected": -2.1609389781951904, "step": 11545 }, { "epoch": 0.611984204807463, "grad_norm": 50.0, "kl": 0.2507591247558594, "learning_rate": 5e-07, "logits/chosen": -13211146.0, "logits/rejected": -7949647.0, "logps/chosen": -267.1064453125, "logps/rejected": -358.256103515625, "loss": 0.282, "rewards/chosen": 0.2279043346643448, "rewards/margins": 3.877791181206703, "rewards/rejected": -3.6498868465423584, "step": 11546 }, { "epoch": 0.6120372088092652, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60756404.0, "logits/rejected": -12426950.666666666, "logps/chosen": -149.53680419921875, "logps/rejected": -310.00376383463544, "loss": 0.1982, "rewards/chosen": 0.5064537525177002, "rewards/margins": 3.229149103164673, "rewards/rejected": -2.7226953506469727, "step": 11547 }, { "epoch": 0.6120902128110672, "grad_norm": 40.75, "kl": 1.7065238952636719, "learning_rate": 5e-07, "logits/chosen": -25791435.2, "logits/rejected": -42532152.0, "logps/chosen": -231.40341796875, "logps/rejected": -197.77714029947916, "loss": 0.341, "rewards/chosen": 0.674653434753418, "rewards/margins": 2.0764678955078124, "rewards/rejected": -1.4018144607543945, "step": 11548 }, { "epoch": 0.6121432168128693, "grad_norm": 58.0, "kl": 0.7419357299804688, "learning_rate": 5e-07, "logits/chosen": -95907264.0, "logits/rejected": -2103932.8, "logps/chosen": -344.6962076822917, "logps/rejected": -524.9546875, "loss": 0.2678, "rewards/chosen": 0.03733144203821818, "rewards/margins": 2.0062809507052104, "rewards/rejected": -1.9689495086669921, "step": 11549 }, { "epoch": 0.6121962208146715, "grad_norm": 50.0, "kl": 1.5879230499267578, "learning_rate": 5e-07, "logits/chosen": -33815488.0, "logits/rejected": -1395334.0, "logps/chosen": -293.8958251953125, "logps/rejected": -95.70055135091145, "loss": 0.358, "rewards/chosen": 0.3290797233581543, "rewards/margins": 2.7889182726542154, "rewards/rejected": -2.459838549296061, "step": 11550 }, { "epoch": 0.6122492248164736, "grad_norm": 66.5, "kl": 1.0751686096191406, "learning_rate": 5e-07, "logits/chosen": 663167.0, "logits/rejected": -41492592.0, "logps/chosen": -480.4283447265625, "logps/rejected": -187.37772623697916, "loss": 0.1901, "rewards/chosen": 0.6576617956161499, "rewards/margins": 3.0745309591293335, "rewards/rejected": -2.4168691635131836, "step": 11551 }, { "epoch": 0.6123022288182758, "grad_norm": 35.25, "kl": 0.7911243438720703, "learning_rate": 5e-07, "logits/chosen": -19311902.0, "logits/rejected": -11589332.0, "logps/chosen": -166.4674530029297, "logps/rejected": -334.55120849609375, "loss": 0.1949, "rewards/chosen": 1.013967514038086, "rewards/margins": 3.5199806690216064, "rewards/rejected": -2.5060131549835205, "step": 11552 }, { "epoch": 0.6123552328200779, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24930016.0, "logits/rejected": -32925648.0, "logps/chosen": -161.7747314453125, "logps/rejected": -144.99052937825522, "loss": 0.336, "rewards/chosen": 0.2526706695556641, "rewards/margins": 2.1371816635131835, "rewards/rejected": -1.8845109939575195, "step": 11553 }, { "epoch": 0.6124082368218801, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1450252.6666666667, "logits/rejected": -63623065.6, "logps/chosen": -55.629486083984375, "logps/rejected": -392.238134765625, "loss": 0.2479, "rewards/chosen": 0.055169363816579185, "rewards/margins": 2.458749838670095, "rewards/rejected": -2.4035804748535154, "step": 11554 }, { "epoch": 0.6124612408236821, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4469078.8, "logits/rejected": -21701085.333333332, "logps/chosen": -240.5693359375, "logps/rejected": -456.0048828125, "loss": 0.3428, "rewards/chosen": 0.2127511739730835, "rewards/margins": 1.9512938896814982, "rewards/rejected": -1.7385427157084148, "step": 11555 }, { "epoch": 0.6125142448254843, "grad_norm": 45.0, "kl": 2.831174850463867, "learning_rate": 5e-07, "logits/chosen": -29999988.0, "logits/rejected": -15402836.0, "logps/chosen": -392.65826416015625, "logps/rejected": -341.8518981933594, "loss": 0.2704, "rewards/chosen": 0.2064802348613739, "rewards/margins": 3.81419774889946, "rewards/rejected": -3.607717514038086, "step": 11556 }, { "epoch": 0.6125672488272864, "grad_norm": 64.0, "kl": 3.85237979888916, "learning_rate": 5e-07, "logits/chosen": -27289075.2, "logits/rejected": -23364520.0, "logps/chosen": -598.411083984375, "logps/rejected": -416.3818359375, "loss": 0.3207, "rewards/chosen": 1.0622876167297364, "rewards/margins": 2.3674715836842855, "rewards/rejected": -1.305183966954549, "step": 11557 }, { "epoch": 0.6126202528290886, "grad_norm": 46.0, "kl": 0.4815502166748047, "learning_rate": 5e-07, "logits/chosen": -6463162.0, "logits/rejected": -42419720.0, "logps/chosen": -184.03672790527344, "logps/rejected": -307.22998046875, "loss": 0.2443, "rewards/chosen": 0.4374386966228485, "rewards/margins": 2.9531693160533905, "rewards/rejected": -2.515730619430542, "step": 11558 }, { "epoch": 0.6126732568308907, "grad_norm": 55.5, "kl": 0.02404022216796875, "learning_rate": 5e-07, "logits/chosen": -158479219.2, "logits/rejected": -11625001.333333334, "logps/chosen": -327.4039306640625, "logps/rejected": -276.6292724609375, "loss": 0.3028, "rewards/chosen": 0.4414547920227051, "rewards/margins": 2.212953472137451, "rewards/rejected": -1.771498680114746, "step": 11559 }, { "epoch": 0.6127262608326929, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26006082.0, "logits/rejected": 3758334.75, "logps/chosen": -265.0654296875, "logps/rejected": -272.2742614746094, "loss": 0.2532, "rewards/chosen": 0.7329910397529602, "rewards/margins": 3.4308062195777893, "rewards/rejected": -2.697815179824829, "step": 11560 }, { "epoch": 0.612779264834495, "grad_norm": 50.25, "kl": 0.285919189453125, "learning_rate": 5e-07, "logits/chosen": -27903380.0, "logits/rejected": 6169171.5, "logps/chosen": -497.17340087890625, "logps/rejected": -213.80746459960938, "loss": 0.2775, "rewards/chosen": 0.7544222474098206, "rewards/margins": 2.1863635182380676, "rewards/rejected": -1.431941270828247, "step": 11561 }, { "epoch": 0.6128322688362972, "grad_norm": 50.25, "kl": 1.0812463760375977, "learning_rate": 5e-07, "logits/chosen": -23232050.0, "logits/rejected": -50004696.0, "logps/chosen": -201.07583618164062, "logps/rejected": -323.0550231933594, "loss": 0.3519, "rewards/chosen": 0.28657275438308716, "rewards/margins": 1.4634411931037903, "rewards/rejected": -1.1768684387207031, "step": 11562 }, { "epoch": 0.6128852728380992, "grad_norm": 99.5, "kl": 2.7791824340820312, "learning_rate": 5e-07, "logits/chosen": -4535422.5, "logits/rejected": -46643808.0, "logps/chosen": -122.7723388671875, "logps/rejected": -418.0662841796875, "loss": 0.2939, "rewards/chosen": 0.7107985615730286, "rewards/margins": 3.0641704201698303, "rewards/rejected": -2.3533718585968018, "step": 11563 }, { "epoch": 0.6129382768399014, "grad_norm": 46.75, "kl": 0.0816192626953125, "learning_rate": 5e-07, "logits/chosen": -26792101.333333332, "logits/rejected": -5252139.5, "logps/chosen": -351.364013671875, "logps/rejected": -128.95533752441406, "loss": 0.3801, "rewards/chosen": 0.4539821147918701, "rewards/margins": 2.438414692878723, "rewards/rejected": -1.984432578086853, "step": 11564 }, { "epoch": 0.6129912808417035, "grad_norm": 50.5, "kl": 0.39826202392578125, "learning_rate": 5e-07, "logits/chosen": -12035830.0, "logits/rejected": -14988760.0, "logps/chosen": -159.3234100341797, "logps/rejected": -317.7921447753906, "loss": 0.2702, "rewards/chosen": 0.6132566332817078, "rewards/margins": 3.0038451552391052, "rewards/rejected": -2.3905885219573975, "step": 11565 }, { "epoch": 0.6130442848435057, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49969392.0, "logits/rejected": -27773596.8, "logps/chosen": -731.7117513020834, "logps/rejected": -288.095361328125, "loss": 0.1872, "rewards/chosen": 1.6700908342997234, "rewards/margins": 3.474598089853923, "rewards/rejected": -1.8045072555541992, "step": 11566 }, { "epoch": 0.6130972888453078, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73210448.0, "logits/rejected": -32137520.0, "logps/chosen": -475.02313232421875, "logps/rejected": -520.8086751302084, "loss": 0.2169, "rewards/chosen": -0.06402283161878586, "rewards/margins": 2.7515295619765916, "rewards/rejected": -2.8155523935953775, "step": 11567 }, { "epoch": 0.61315029284711, "grad_norm": 53.0, "kl": 1.682042121887207, "learning_rate": 5e-07, "logits/chosen": -24620160.0, "logits/rejected": -13685984.0, "logps/chosen": -377.469091796875, "logps/rejected": -136.96180216471353, "loss": 0.3979, "rewards/chosen": 0.6137720584869385, "rewards/margins": 1.1161488930384318, "rewards/rejected": -0.5023768345514933, "step": 11568 }, { "epoch": 0.6132032968489121, "grad_norm": 56.5, "kl": 1.3365592956542969, "learning_rate": 5e-07, "logits/chosen": -9017978.666666666, "logits/rejected": -20232870.4, "logps/chosen": -503.5133056640625, "logps/rejected": -313.4794189453125, "loss": 0.2287, "rewards/chosen": 0.2909444173177083, "rewards/margins": 2.5713828404744468, "rewards/rejected": -2.2804384231567383, "step": 11569 }, { "epoch": 0.6132563008507143, "grad_norm": 131.0, "kl": 6.302165985107422, "learning_rate": 5e-07, "logits/chosen": -48024736.0, "logits/rejected": -55470560.0, "logps/chosen": -480.3978515625, "logps/rejected": -257.53159586588544, "loss": 0.2296, "rewards/chosen": 1.2962718963623048, "rewards/margins": 3.400187905629476, "rewards/rejected": -2.1039160092671714, "step": 11570 }, { "epoch": 0.6133093048525163, "grad_norm": 29.625, "kl": 2.682098388671875, "learning_rate": 5e-07, "logits/chosen": -24016427.2, "logits/rejected": -20484522.666666668, "logps/chosen": -287.5900634765625, "logps/rejected": -374.8812662760417, "loss": 0.2475, "rewards/chosen": 1.2310690879821777, "rewards/margins": 3.3811682065327964, "rewards/rejected": -2.1500991185506186, "step": 11571 }, { "epoch": 0.6133623088543185, "grad_norm": 52.0, "kl": 4.375968933105469, "learning_rate": 5e-07, "logits/chosen": -27833744.0, "logits/rejected": -10539307.0, "logps/chosen": -263.3802490234375, "logps/rejected": -194.34979248046875, "loss": 0.4122, "rewards/chosen": 0.45839544137318927, "rewards/margins": 2.909122188886007, "rewards/rejected": -2.4507267475128174, "step": 11572 }, { "epoch": 0.6134153128561206, "grad_norm": 40.0, "kl": 3.694936752319336, "learning_rate": 5e-07, "logits/chosen": -36156900.0, "logits/rejected": -70791328.0, "logps/chosen": -313.0360107421875, "logps/rejected": -479.79962158203125, "loss": 0.2137, "rewards/chosen": 0.874873697757721, "rewards/margins": 4.021967232227325, "rewards/rejected": -3.1470935344696045, "step": 11573 }, { "epoch": 0.6134683168579228, "grad_norm": 43.5, "kl": 1.3497428894042969, "learning_rate": 5e-07, "logits/chosen": -56562080.0, "logits/rejected": -20510371.2, "logps/chosen": -274.1781819661458, "logps/rejected": -308.8783203125, "loss": 0.236, "rewards/chosen": 0.5479020277659098, "rewards/margins": 2.24172412554423, "rewards/rejected": -1.6938220977783203, "step": 11574 }, { "epoch": 0.6135213208597249, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24756473.6, "logits/rejected": -30803861.333333332, "logps/chosen": -272.00380859375, "logps/rejected": -499.9628092447917, "loss": 0.2962, "rewards/chosen": 0.48305206298828124, "rewards/margins": 2.5782147725423177, "rewards/rejected": -2.0951627095540366, "step": 11575 }, { "epoch": 0.6135743248615271, "grad_norm": 42.5, "kl": 1.5197334289550781, "learning_rate": 5e-07, "logits/chosen": -77854472.0, "logits/rejected": -65470036.0, "logps/chosen": -293.982177734375, "logps/rejected": -248.72943115234375, "loss": 0.2317, "rewards/chosen": 0.872948169708252, "rewards/margins": 3.1399872303009033, "rewards/rejected": -2.2670390605926514, "step": 11576 }, { "epoch": 0.6136273288633292, "grad_norm": 49.25, "kl": 0.7966995239257812, "learning_rate": 5e-07, "logits/chosen": -25075280.0, "logits/rejected": -1997039.5, "logps/chosen": -257.021484375, "logps/rejected": -538.369140625, "loss": 0.3623, "rewards/chosen": 0.49135214941842215, "rewards/margins": 3.7544289316449846, "rewards/rejected": -3.2630767822265625, "step": 11577 }, { "epoch": 0.6136803328651314, "grad_norm": 46.0, "kl": 2.120206832885742, "learning_rate": 5e-07, "logits/chosen": -6130694.857142857, "logits/rejected": -95846192.0, "logps/chosen": -245.43059430803572, "logps/rejected": -427.1348876953125, "loss": 0.3904, "rewards/chosen": 0.6080449649265834, "rewards/margins": 3.2677861281803677, "rewards/rejected": -2.659741163253784, "step": 11578 }, { "epoch": 0.6137333368669334, "grad_norm": 61.75, "kl": 3.067183494567871, "learning_rate": 5e-07, "logits/chosen": -29952555.42857143, "logits/rejected": -24855308.0, "logps/chosen": -206.17986188616072, "logps/rejected": -317.9414367675781, "loss": 0.3924, "rewards/chosen": 0.49038502148219515, "rewards/margins": 3.656940902982439, "rewards/rejected": -3.166555881500244, "step": 11579 }, { "epoch": 0.6137863408687356, "grad_norm": 38.0, "kl": 1.0271415710449219, "learning_rate": 5e-07, "logits/chosen": -16629691.2, "logits/rejected": -51041386.666666664, "logps/chosen": -371.2335205078125, "logps/rejected": -410.9402262369792, "loss": 0.287, "rewards/chosen": 0.8149287223815918, "rewards/margins": 3.744649283091227, "rewards/rejected": -2.9297205607096353, "step": 11580 }, { "epoch": 0.6138393448705377, "grad_norm": 56.5, "kl": 1.6504859924316406, "learning_rate": 5e-07, "logits/chosen": -1528565.0, "logits/rejected": -30345640.0, "logps/chosen": -231.38380432128906, "logps/rejected": -298.149658203125, "loss": 0.18, "rewards/chosen": 1.8226470947265625, "rewards/margins": 3.589964270591736, "rewards/rejected": -1.7673171758651733, "step": 11581 }, { "epoch": 0.6138923488723399, "grad_norm": 42.0, "kl": 0.8480892181396484, "learning_rate": 5e-07, "logits/chosen": -16973862.4, "logits/rejected": -62860768.0, "logps/chosen": -387.0371826171875, "logps/rejected": -956.2874348958334, "loss": 0.2039, "rewards/chosen": 1.0501507759094237, "rewards/margins": 5.272615655263264, "rewards/rejected": -4.222464879353841, "step": 11582 }, { "epoch": 0.613945352874142, "grad_norm": 38.0, "kl": 2.0708112716674805, "learning_rate": 5e-07, "logits/chosen": -5702183.0, "logits/rejected": -8523134.0, "logps/chosen": -118.76579284667969, "logps/rejected": -129.37449645996094, "loss": 0.3155, "rewards/chosen": 0.737625002861023, "rewards/margins": 1.7661837339401245, "rewards/rejected": -1.0285587310791016, "step": 11583 }, { "epoch": 0.6139983568759442, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26111244.0, "logits/rejected": -43551749.333333336, "logps/chosen": -263.2638244628906, "logps/rejected": -371.0249837239583, "loss": 0.2155, "rewards/chosen": 0.2272895872592926, "rewards/margins": 3.0866531431674957, "rewards/rejected": -2.859363555908203, "step": 11584 }, { "epoch": 0.6140513608777463, "grad_norm": 41.0, "kl": 1.1896896362304688, "learning_rate": 5e-07, "logits/chosen": -4885374.5, "logits/rejected": -19326678.0, "logps/chosen": -293.76202392578125, "logps/rejected": -282.94586181640625, "loss": 0.2188, "rewards/chosen": 0.5861706733703613, "rewards/margins": 3.8216500282287598, "rewards/rejected": -3.2354793548583984, "step": 11585 }, { "epoch": 0.6141043648795484, "grad_norm": 61.75, "kl": 1.6945877075195312, "learning_rate": 5e-07, "logits/chosen": -54887290.666666664, "logits/rejected": -2875617.2, "logps/chosen": -483.9152018229167, "logps/rejected": -381.0203369140625, "loss": 0.2206, "rewards/chosen": 0.6156992514928182, "rewards/margins": 2.9656035025914513, "rewards/rejected": -2.349904251098633, "step": 11586 }, { "epoch": 0.6141573688813505, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -128378421.33333333, "logits/rejected": -10585073.6, "logps/chosen": -535.9881998697916, "logps/rejected": -236.95595703125, "loss": 0.2315, "rewards/chosen": 0.9945292472839355, "rewards/margins": 2.717871379852295, "rewards/rejected": -1.7233421325683593, "step": 11587 }, { "epoch": 0.6142103728831527, "grad_norm": 34.5, "kl": 2.7134933471679688, "learning_rate": 5e-07, "logits/chosen": 7321956.5, "logits/rejected": 4429229.0, "logps/chosen": -70.90795135498047, "logps/rejected": -337.7177429199219, "loss": 0.2819, "rewards/chosen": 0.34969308972358704, "rewards/margins": 2.8560307323932648, "rewards/rejected": -2.5063376426696777, "step": 11588 }, { "epoch": 0.6142633768849548, "grad_norm": 41.5, "kl": 0.035526275634765625, "learning_rate": 5e-07, "logits/chosen": -38736005.333333336, "logits/rejected": -12718791.2, "logps/chosen": -116.45438639322917, "logps/rejected": -424.61875, "loss": 0.2214, "rewards/chosen": 0.8789048194885254, "rewards/margins": 3.126929187774658, "rewards/rejected": -2.2480243682861327, "step": 11589 }, { "epoch": 0.614316380886757, "grad_norm": 43.5, "kl": 0.260589599609375, "learning_rate": 5e-07, "logits/chosen": -23116416.0, "logits/rejected": -37395296.0, "logps/chosen": -304.1777648925781, "logps/rejected": -193.30215454101562, "loss": 0.2521, "rewards/chosen": 0.9773147702217102, "rewards/margins": 2.3385555148124695, "rewards/rejected": -1.3612407445907593, "step": 11590 }, { "epoch": 0.6143693848885591, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54984.890625, "logits/rejected": -17070429.714285713, "logps/chosen": -127.25762176513672, "logps/rejected": -451.5001743861607, "loss": 0.1455, "rewards/chosen": 0.7281120419502258, "rewards/margins": 3.789440384932927, "rewards/rejected": -3.061328342982701, "step": 11591 }, { "epoch": 0.6144223888903613, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1044646.625, "logits/rejected": -21568306.0, "logps/chosen": -328.1759033203125, "logps/rejected": -307.1422119140625, "loss": 0.2264, "rewards/chosen": 0.6127617955207825, "rewards/margins": 3.121352732181549, "rewards/rejected": -2.5085909366607666, "step": 11592 }, { "epoch": 0.6144753928921634, "grad_norm": 61.5, "kl": 1.2457809448242188, "learning_rate": 5e-07, "logits/chosen": -20795013.333333332, "logits/rejected": -66179136.0, "logps/chosen": -580.6164957682291, "logps/rejected": -299.23447265625, "loss": 0.2721, "rewards/chosen": 0.708859364191691, "rewards/margins": 2.6167130629221598, "rewards/rejected": -1.9078536987304688, "step": 11593 }, { "epoch": 0.6145283968939655, "grad_norm": 42.75, "kl": 0.69775390625, "learning_rate": 5e-07, "logits/chosen": -20889436.0, "logits/rejected": -38908108.0, "logps/chosen": -251.4817657470703, "logps/rejected": -320.3509521484375, "loss": 0.2854, "rewards/chosen": 0.6661311388015747, "rewards/margins": 2.542927861213684, "rewards/rejected": -1.8767967224121094, "step": 11594 }, { "epoch": 0.6145814008957676, "grad_norm": 46.75, "kl": 0.4748044013977051, "learning_rate": 5e-07, "logits/chosen": -9653665.333333334, "logits/rejected": -24299118.4, "logps/chosen": -281.89129638671875, "logps/rejected": -667.258349609375, "loss": 0.1793, "rewards/chosen": 1.2562393347422283, "rewards/margins": 4.118704430262248, "rewards/rejected": -2.8624650955200197, "step": 11595 }, { "epoch": 0.6146344048975698, "grad_norm": 43.5, "kl": 1.520522117614746, "learning_rate": 5e-07, "logits/chosen": -6028237.0, "logits/rejected": -22520784.0, "logps/chosen": -261.3850402832031, "logps/rejected": -652.47216796875, "loss": 0.3368, "rewards/chosen": -0.21301613748073578, "rewards/margins": 2.6123095899820328, "rewards/rejected": -2.8253257274627686, "step": 11596 }, { "epoch": 0.6146874088993719, "grad_norm": 58.5, "kl": 1.0350966453552246, "learning_rate": 5e-07, "logits/chosen": -33414261.333333332, "logits/rejected": -20412062.4, "logps/chosen": -640.3628743489584, "logps/rejected": -390.504052734375, "loss": 0.2051, "rewards/chosen": 1.3527774810791016, "rewards/margins": 3.7248001098632812, "rewards/rejected": -2.3720226287841797, "step": 11597 }, { "epoch": 0.6147404129011741, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23676484.0, "logits/rejected": -24393648.0, "logps/chosen": -245.63328552246094, "logps/rejected": -344.1739908854167, "loss": 0.206, "rewards/chosen": 0.249635711312294, "rewards/margins": 2.24111828704675, "rewards/rejected": -1.9914825757344563, "step": 11598 }, { "epoch": 0.6147934169029762, "grad_norm": 58.25, "kl": 4.652242660522461, "learning_rate": 5e-07, "logits/chosen": 95929344.0, "logits/rejected": -9859881.0, "logps/chosen": -210.88382393973214, "logps/rejected": -132.3201904296875, "loss": 0.4875, "rewards/chosen": 0.22203048637935094, "rewards/margins": 4.306074466024127, "rewards/rejected": -4.084043979644775, "step": 11599 }, { "epoch": 0.6148464209047783, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7019846.666666667, "logits/rejected": -49282452.0, "logps/chosen": -169.50918579101562, "logps/rejected": -212.97305297851562, "loss": 0.3599, "rewards/chosen": 0.4501802126566569, "rewards/margins": 1.862782875696818, "rewards/rejected": -1.4126026630401611, "step": 11600 }, { "epoch": 0.6148994249065804, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19095718.4, "logits/rejected": -42952330.666666664, "logps/chosen": -251.328271484375, "logps/rejected": -609.5191243489584, "loss": 0.3745, "rewards/chosen": 0.04831973314285278, "rewards/margins": 1.9182366490364076, "rewards/rejected": -1.8699169158935547, "step": 11601 }, { "epoch": 0.6149524289083825, "grad_norm": 97.5, "kl": 0.9298801422119141, "learning_rate": 5e-07, "logits/chosen": 29773286.0, "logits/rejected": -20386658.0, "logps/chosen": -825.7488403320312, "logps/rejected": -723.9066162109375, "loss": 0.2748, "rewards/chosen": 0.09366131573915482, "rewards/margins": 3.92629624158144, "rewards/rejected": -3.832634925842285, "step": 11602 }, { "epoch": 0.6150054329101847, "grad_norm": 54.5, "kl": 1.8129768371582031, "learning_rate": 5e-07, "logits/chosen": 9946936.0, "logits/rejected": -27856825.6, "logps/chosen": -380.5864664713542, "logps/rejected": -366.307666015625, "loss": 0.2529, "rewards/chosen": 0.18930790821711221, "rewards/margins": 2.6726664821306865, "rewards/rejected": -2.4833585739135744, "step": 11603 }, { "epoch": 0.6150584369119868, "grad_norm": 39.0, "kl": 4.518716812133789, "learning_rate": 5e-07, "logits/chosen": -2968835.0, "logits/rejected": -6977853.5, "logps/chosen": -367.7710876464844, "logps/rejected": -225.6216278076172, "loss": 0.2214, "rewards/chosen": 0.5890718102455139, "rewards/margins": 2.523673713207245, "rewards/rejected": -1.934601902961731, "step": 11604 }, { "epoch": 0.615111440913789, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65345600.0, "logits/rejected": -22849120.0, "logps/chosen": -1027.7215576171875, "logps/rejected": -453.3192545572917, "loss": 0.1654, "rewards/chosen": 1.6744590997695923, "rewards/margins": 4.514571785926819, "rewards/rejected": -2.8401126861572266, "step": 11605 }, { "epoch": 0.6151644449155911, "grad_norm": 59.0, "kl": 1.2214889526367188, "learning_rate": 5e-07, "logits/chosen": -54331658.666666664, "logits/rejected": -524304.125, "logps/chosen": -402.5137532552083, "logps/rejected": -151.51824951171875, "loss": 0.3808, "rewards/chosen": 0.5284871260325114, "rewards/margins": 1.6736891667048135, "rewards/rejected": -1.1452020406723022, "step": 11606 }, { "epoch": 0.6152174489173933, "grad_norm": 51.5, "kl": 0.7107162475585938, "learning_rate": 5e-07, "logits/chosen": -25488026.0, "logits/rejected": 1754023.0, "logps/chosen": -353.56378173828125, "logps/rejected": -205.80206298828125, "loss": 0.2388, "rewards/chosen": 0.548173189163208, "rewards/margins": 3.785451889038086, "rewards/rejected": -3.237278699874878, "step": 11607 }, { "epoch": 0.6152704529191954, "grad_norm": 54.25, "kl": 0.5047073364257812, "learning_rate": 5e-07, "logits/chosen": -23208058.666666668, "logits/rejected": -4306221.5, "logps/chosen": -255.83831787109375, "logps/rejected": -139.09161376953125, "loss": 0.3622, "rewards/chosen": 0.1696388522783915, "rewards/margins": 4.1511866847674055, "rewards/rejected": -3.9815478324890137, "step": 11608 }, { "epoch": 0.6153234569209975, "grad_norm": 54.0, "kl": 0.6393260955810547, "learning_rate": 5e-07, "logits/chosen": -53734328.0, "logits/rejected": -28078002.0, "logps/chosen": -353.1051025390625, "logps/rejected": -277.48223876953125, "loss": 0.3044, "rewards/chosen": 0.11194992065429688, "rewards/margins": 2.2536284923553467, "rewards/rejected": -2.14167857170105, "step": 11609 }, { "epoch": 0.6153764609227996, "grad_norm": 33.75, "kl": 0.7025775909423828, "learning_rate": 5e-07, "logits/chosen": -7953777.5, "logits/rejected": -1763010.7142857143, "logps/chosen": -51.18829345703125, "logps/rejected": -296.00467354910717, "loss": 0.2035, "rewards/chosen": 0.8083587884902954, "rewards/margins": 2.589467780930655, "rewards/rejected": -1.78110899244036, "step": 11610 }, { "epoch": 0.6154294649246018, "grad_norm": 45.75, "kl": 0.10909175872802734, "learning_rate": 5e-07, "logits/chosen": -20043113.6, "logits/rejected": -18206406.666666668, "logps/chosen": -227.039111328125, "logps/rejected": -111.56571451822917, "loss": 0.312, "rewards/chosen": 0.46170506477355955, "rewards/margins": 2.350924189885457, "rewards/rejected": -1.8892191251118977, "step": 11611 }, { "epoch": 0.6154824689264039, "grad_norm": 56.0, "kl": 1.0539283752441406, "learning_rate": 5e-07, "logits/chosen": -11781754.0, "logits/rejected": -15305691.0, "logps/chosen": -224.68051147460938, "logps/rejected": -309.00604248046875, "loss": 0.3222, "rewards/chosen": 0.2844082713127136, "rewards/margins": 2.2316723465919495, "rewards/rejected": -1.9472640752792358, "step": 11612 }, { "epoch": 0.6155354729282061, "grad_norm": 40.75, "kl": 0.36841773986816406, "learning_rate": 5e-07, "logits/chosen": -7191392.666666667, "logits/rejected": -24043728.0, "logps/chosen": -254.827880859375, "logps/rejected": -380.738330078125, "loss": 0.1952, "rewards/chosen": 0.7383717695871989, "rewards/margins": 3.092701450983683, "rewards/rejected": -2.3543296813964845, "step": 11613 }, { "epoch": 0.6155884769300082, "grad_norm": 37.75, "kl": 0.3226451873779297, "learning_rate": 5e-07, "logits/chosen": -87822997.33333333, "logits/rejected": -27249616.0, "logps/chosen": -371.2737223307292, "logps/rejected": -400.599365234375, "loss": 0.1996, "rewards/chosen": 0.9152785936991373, "rewards/margins": 3.779704443613688, "rewards/rejected": -2.864425849914551, "step": 11614 }, { "epoch": 0.6156414809318104, "grad_norm": 53.75, "kl": 0.29116249084472656, "learning_rate": 5e-07, "logits/chosen": -49374784.0, "logits/rejected": -2496278.0, "logps/chosen": -276.0910949707031, "logps/rejected": -158.7286173502604, "loss": 0.314, "rewards/chosen": 0.3118278384208679, "rewards/margins": 1.5231968760490417, "rewards/rejected": -1.2113690376281738, "step": 11615 }, { "epoch": 0.6156944849336125, "grad_norm": 49.25, "kl": 5.407867431640625, "learning_rate": 5e-07, "logits/chosen": -34842857.6, "logits/rejected": -25155648.0, "logps/chosen": -239.4731689453125, "logps/rejected": -439.8790690104167, "loss": 0.3778, "rewards/chosen": 0.7325177669525147, "rewards/margins": 2.924488592147827, "rewards/rejected": -2.1919708251953125, "step": 11616 }, { "epoch": 0.6157474889354146, "grad_norm": 38.5, "kl": 1.8068485260009766, "learning_rate": 5e-07, "logits/chosen": -23084584.0, "logits/rejected": -46363946.666666664, "logps/chosen": -365.4609680175781, "logps/rejected": -612.6502685546875, "loss": 0.1173, "rewards/chosen": 2.2968034744262695, "rewards/margins": 5.814056078592936, "rewards/rejected": -3.5172526041666665, "step": 11617 }, { "epoch": 0.6158004929372167, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12520442.0, "logits/rejected": -2614729.5, "logps/chosen": -138.97402954101562, "logps/rejected": -171.25599670410156, "loss": 0.3616, "rewards/chosen": 0.19208325445652008, "rewards/margins": 1.2781195789575577, "rewards/rejected": -1.0860363245010376, "step": 11618 }, { "epoch": 0.6158534969390189, "grad_norm": 41.0, "kl": 3.187286376953125, "learning_rate": 5e-07, "logits/chosen": -13877253.333333334, "logits/rejected": -31842992.0, "logps/chosen": -480.0489908854167, "logps/rejected": -144.8770294189453, "loss": 0.3865, "rewards/chosen": 0.7300659020741781, "rewards/margins": 3.4533918698628745, "rewards/rejected": -2.7233259677886963, "step": 11619 }, { "epoch": 0.615906500940821, "grad_norm": 47.0, "kl": 0.1460399627685547, "learning_rate": 5e-07, "logits/chosen": -51274106.666666664, "logits/rejected": -23429531.2, "logps/chosen": -287.4372151692708, "logps/rejected": -269.690234375, "loss": 0.298, "rewards/chosen": 0.21559657653172812, "rewards/margins": 1.9791982611020404, "rewards/rejected": -1.7636016845703124, "step": 11620 }, { "epoch": 0.6159595049426232, "grad_norm": 40.5, "kl": 0.31098175048828125, "learning_rate": 5e-07, "logits/chosen": -37099154.666666664, "logits/rejected": -25116513.6, "logps/chosen": -219.0587158203125, "logps/rejected": -215.7249755859375, "loss": 0.3128, "rewards/chosen": -0.40232237180074054, "rewards/margins": 2.2580539862314857, "rewards/rejected": -2.6603763580322264, "step": 11621 }, { "epoch": 0.6160125089444253, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40943932.8, "logits/rejected": -5641368.666666667, "logps/chosen": -382.46669921875, "logps/rejected": -93.28551228841145, "loss": 0.4809, "rewards/chosen": -0.3533933639526367, "rewards/margins": 0.572100830078125, "rewards/rejected": -0.9254941940307617, "step": 11622 }, { "epoch": 0.6160655129462275, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35531142.4, "logits/rejected": -47704938.666666664, "logps/chosen": -645.7861328125, "logps/rejected": -282.16796875, "loss": 0.2991, "rewards/chosen": 0.665193510055542, "rewards/margins": 2.8709074179331457, "rewards/rejected": -2.205713907877604, "step": 11623 }, { "epoch": 0.6161185169480295, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37795453.333333336, "logits/rejected": -12026476.0, "logps/chosen": -344.9919840494792, "logps/rejected": -235.406787109375, "loss": 0.3267, "rewards/chosen": 0.04953918854395548, "rewards/margins": 1.5226689378420513, "rewards/rejected": -1.4731297492980957, "step": 11624 }, { "epoch": 0.6161715209498317, "grad_norm": 49.5, "kl": 4.384320259094238, "learning_rate": 5e-07, "logits/chosen": -18748557.333333332, "logits/rejected": 1634769.0, "logps/chosen": -143.59406534830728, "logps/rejected": -68.40318298339844, "loss": 0.4402, "rewards/chosen": 0.546042243639628, "rewards/margins": 2.4226032892862954, "rewards/rejected": -1.8765610456466675, "step": 11625 }, { "epoch": 0.6162245249516338, "grad_norm": 50.0, "kl": 3.4126510620117188, "learning_rate": 5e-07, "logits/chosen": -55184544.0, "logits/rejected": -49102440.0, "logps/chosen": -332.3759765625, "logps/rejected": -466.5904235839844, "loss": 0.3298, "rewards/chosen": 0.2586134076118469, "rewards/margins": 2.9947767853736877, "rewards/rejected": -2.736163377761841, "step": 11626 }, { "epoch": 0.616277528953436, "grad_norm": 25.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6059937.2, "logits/rejected": -12796537.333333334, "logps/chosen": -59.8458740234375, "logps/rejected": -354.4876302083333, "loss": 0.317, "rewards/chosen": 0.05745143890380859, "rewards/margins": 3.59702574412028, "rewards/rejected": -3.539574305216471, "step": 11627 }, { "epoch": 0.6163305329552381, "grad_norm": 52.75, "kl": 0.9508609771728516, "learning_rate": 5e-07, "logits/chosen": -27706932.0, "logits/rejected": -35043640.0, "logps/chosen": -283.7889404296875, "logps/rejected": -382.16326904296875, "loss": 0.2555, "rewards/chosen": 0.4392339885234833, "rewards/margins": 3.1141349971294403, "rewards/rejected": -2.674901008605957, "step": 11628 }, { "epoch": 0.6163835369570403, "grad_norm": 47.0, "kl": 0.7108821868896484, "learning_rate": 5e-07, "logits/chosen": -19921856.0, "logits/rejected": -6436404.5, "logps/chosen": -881.2333984375, "logps/rejected": -201.1146697998047, "loss": 0.2338, "rewards/chosen": 1.2607587575912476, "rewards/margins": 3.2582781314849854, "rewards/rejected": -1.9975193738937378, "step": 11629 }, { "epoch": 0.6164365409588424, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -233614080.0, "logits/rejected": -28269074.666666668, "logps/chosen": -223.9315643310547, "logps/rejected": -457.296142578125, "loss": 0.1721, "rewards/chosen": 0.11928759515285492, "rewards/margins": 2.9226666539907455, "rewards/rejected": -2.8033790588378906, "step": 11630 }, { "epoch": 0.6164895449606446, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32722784.0, "logits/rejected": -61778393.6, "logps/chosen": -184.40547688802084, "logps/rejected": -614.037939453125, "loss": 0.204, "rewards/chosen": 0.5495467980702718, "rewards/margins": 3.7104574044545493, "rewards/rejected": -3.1609106063842773, "step": 11631 }, { "epoch": 0.6165425489624466, "grad_norm": 53.5, "kl": 1.8978195190429688, "learning_rate": 5e-07, "logits/chosen": -28722910.0, "logits/rejected": -6864772.5, "logps/chosen": -255.80047607421875, "logps/rejected": -204.52655029296875, "loss": 0.2878, "rewards/chosen": 0.49639472365379333, "rewards/margins": 2.4689897000789642, "rewards/rejected": -1.972594976425171, "step": 11632 }, { "epoch": 0.6165955529642488, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1581492.3, "logits/rejected": -25872696.0, "logps/chosen": -220.3649658203125, "logps/rejected": -357.151123046875, "loss": 0.3133, "rewards/chosen": 0.42482547760009765, "rewards/margins": 2.352065849304199, "rewards/rejected": -1.9272403717041016, "step": 11633 }, { "epoch": 0.6166485569660509, "grad_norm": 47.0, "kl": 1.6488094329833984, "learning_rate": 5e-07, "logits/chosen": -24815269.333333332, "logits/rejected": -38475334.4, "logps/chosen": -498.587646484375, "logps/rejected": -404.0607421875, "loss": 0.1327, "rewards/chosen": 1.4333251317342122, "rewards/margins": 4.307775433858236, "rewards/rejected": -2.8744503021240235, "step": 11634 }, { "epoch": 0.6167015609678531, "grad_norm": 54.25, "kl": 1.4911727905273438, "learning_rate": 5e-07, "logits/chosen": 8649732.0, "logits/rejected": -11592972.8, "logps/chosen": -462.4710693359375, "logps/rejected": -212.63330078125, "loss": 0.2366, "rewards/chosen": 0.36657023429870605, "rewards/margins": 2.7852519512176515, "rewards/rejected": -2.4186817169189454, "step": 11635 }, { "epoch": 0.6167545649696552, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22489346.0, "logits/rejected": -7632656.0, "logps/chosen": -238.07191467285156, "logps/rejected": -201.08207194010416, "loss": 0.1679, "rewards/chosen": 0.23885956406593323, "rewards/margins": 2.9250317315260568, "rewards/rejected": -2.6861721674601235, "step": 11636 }, { "epoch": 0.6168075689714574, "grad_norm": 50.25, "kl": 1.324554443359375, "learning_rate": 5e-07, "logits/chosen": -55550816.0, "logits/rejected": -19248380.0, "logps/chosen": -351.4344482421875, "logps/rejected": -197.3833770751953, "loss": 0.3003, "rewards/chosen": 0.08164538443088531, "rewards/margins": 2.1264501959085464, "rewards/rejected": -2.044804811477661, "step": 11637 }, { "epoch": 0.6168605729732595, "grad_norm": 44.5, "kl": 1.00146484375, "learning_rate": 5e-07, "logits/chosen": -4773898.8, "logits/rejected": -59873173.333333336, "logps/chosen": -317.68916015625, "logps/rejected": -715.1214192708334, "loss": 0.149, "rewards/chosen": 1.638850784301758, "rewards/margins": 4.998137410481771, "rewards/rejected": -3.359286626180013, "step": 11638 }, { "epoch": 0.6169135769750617, "grad_norm": 40.75, "kl": 1.4331283569335938, "learning_rate": 5e-07, "logits/chosen": -17078325.333333332, "logits/rejected": -8995184.0, "logps/chosen": -156.961669921875, "logps/rejected": -227.7568115234375, "loss": 0.187, "rewards/chosen": 0.9216718673706055, "rewards/margins": 3.1762380599975586, "rewards/rejected": -2.254566192626953, "step": 11639 }, { "epoch": 0.6169665809768637, "grad_norm": 55.5, "kl": 1.9360389709472656, "learning_rate": 5e-07, "logits/chosen": -41812909.71428572, "logits/rejected": -9834286.0, "logps/chosen": -364.7025669642857, "logps/rejected": -86.48233795166016, "loss": 0.3837, "rewards/chosen": 0.5161277907235282, "rewards/margins": 3.79472896030971, "rewards/rejected": -3.2786011695861816, "step": 11640 }, { "epoch": 0.6170195849786659, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 704698.0, "logits/rejected": 12019218.0, "logps/chosen": -251.32090759277344, "logps/rejected": -350.7525329589844, "loss": 0.2994, "rewards/chosen": 0.5649970769882202, "rewards/margins": 2.697962164878845, "rewards/rejected": -2.132965087890625, "step": 11641 }, { "epoch": 0.617072588980468, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60823493.333333336, "logits/rejected": -37145964.8, "logps/chosen": -470.9698079427083, "logps/rejected": -397.12216796875, "loss": 0.2135, "rewards/chosen": 0.40568594137827557, "rewards/margins": 2.9873227516810097, "rewards/rejected": -2.5816368103027343, "step": 11642 }, { "epoch": 0.6171255929822702, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4901321.333333333, "logits/rejected": -43852787.2, "logps/chosen": -172.9580281575521, "logps/rejected": -474.54638671875, "loss": 0.2105, "rewards/chosen": 0.5981042385101318, "rewards/margins": 3.0905463695526123, "rewards/rejected": -2.4924421310424805, "step": 11643 }, { "epoch": 0.6171785969840723, "grad_norm": 59.5, "kl": 1.1214103698730469, "learning_rate": 5e-07, "logits/chosen": 37079976.0, "logits/rejected": -19181132.0, "logps/chosen": -279.26409912109375, "logps/rejected": -188.4100341796875, "loss": 0.3099, "rewards/chosen": 0.15212783217430115, "rewards/margins": 2.1089673936367035, "rewards/rejected": -1.9568395614624023, "step": 11644 }, { "epoch": 0.6172316009858745, "grad_norm": 50.75, "kl": 2.2509422302246094, "learning_rate": 5e-07, "logits/chosen": -26488441.14285714, "logits/rejected": -35527608.0, "logps/chosen": -246.22928292410714, "logps/rejected": -440.04254150390625, "loss": 0.3577, "rewards/chosen": 0.8417317526681083, "rewards/margins": 1.8637959616524833, "rewards/rejected": -1.022064208984375, "step": 11645 }, { "epoch": 0.6172846049876766, "grad_norm": 54.25, "kl": 1.5607681274414062, "learning_rate": 5e-07, "logits/chosen": 8310894.0, "logits/rejected": -45536777.6, "logps/chosen": -135.33570353190103, "logps/rejected": -389.433203125, "loss": 0.2902, "rewards/chosen": 0.16398164629936218, "rewards/margins": 2.3806417644023896, "rewards/rejected": -2.2166601181030274, "step": 11646 }, { "epoch": 0.6173376089894788, "grad_norm": 41.5, "kl": 0.14856910705566406, "learning_rate": 5e-07, "logits/chosen": -33038994.285714287, "logits/rejected": -906338.0, "logps/chosen": -195.17994907924108, "logps/rejected": -204.7978515625, "loss": 0.2586, "rewards/chosen": 1.330073629106794, "rewards/margins": 2.535517965044294, "rewards/rejected": -1.2054443359375, "step": 11647 }, { "epoch": 0.6173906129912808, "grad_norm": 35.25, "kl": 0.3277740478515625, "learning_rate": 5e-07, "logits/chosen": -72359440.0, "logits/rejected": -12916676.0, "logps/chosen": -595.0032552083334, "logps/rejected": -298.055126953125, "loss": 0.1622, "rewards/chosen": 1.176529328028361, "rewards/margins": 3.8178638617197675, "rewards/rejected": -2.6413345336914062, "step": 11648 }, { "epoch": 0.617443616993083, "grad_norm": 31.875, "kl": 0.6485557556152344, "learning_rate": 5e-07, "logits/chosen": -9001571.2, "logits/rejected": -26811461.333333332, "logps/chosen": -229.7901611328125, "logps/rejected": -422.386962890625, "loss": 0.2115, "rewards/chosen": 1.173452663421631, "rewards/margins": 4.029173183441162, "rewards/rejected": -2.8557205200195312, "step": 11649 }, { "epoch": 0.6174966209948851, "grad_norm": 55.0, "kl": 2.3616714477539062, "learning_rate": 5e-07, "logits/chosen": 10519764.0, "logits/rejected": -18271264.0, "logps/chosen": -217.43724060058594, "logps/rejected": -400.875244140625, "loss": 0.3381, "rewards/chosen": -0.04311532527208328, "rewards/margins": 1.5477824732661247, "rewards/rejected": -1.590897798538208, "step": 11650 }, { "epoch": 0.6175496249966872, "grad_norm": 48.5, "kl": 8.251319885253906, "learning_rate": 5e-07, "logits/chosen": -38371766.4, "logits/rejected": -21266650.666666668, "logps/chosen": -692.4630859375, "logps/rejected": -420.3588053385417, "loss": 0.2523, "rewards/chosen": 1.6860023498535157, "rewards/margins": 4.366859372456869, "rewards/rejected": -2.680857022603353, "step": 11651 }, { "epoch": 0.6176026289984894, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92816384.0, "logits/rejected": -13953710.4, "logps/chosen": -486.3216552734375, "logps/rejected": -233.78623046875, "loss": 0.2875, "rewards/chosen": -0.19498443603515625, "rewards/margins": 2.2648588180541993, "rewards/rejected": -2.4598432540893556, "step": 11652 }, { "epoch": 0.6176556330002915, "grad_norm": 46.0, "kl": 2.4298782348632812, "learning_rate": 5e-07, "logits/chosen": -5280391.333333333, "logits/rejected": -21620682.0, "logps/chosen": -208.51021321614584, "logps/rejected": -163.7086944580078, "loss": 0.244, "rewards/chosen": 1.2651252746582031, "rewards/margins": 4.559444427490234, "rewards/rejected": -3.2943191528320312, "step": 11653 }, { "epoch": 0.6177086370020937, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54464864.0, "logits/rejected": 42536665.6, "logps/chosen": -194.99898274739584, "logps/rejected": -301.25341796875, "loss": 0.2069, "rewards/chosen": 0.5528976519902548, "rewards/margins": 2.785567863782247, "rewards/rejected": -2.232670211791992, "step": 11654 }, { "epoch": 0.6177616410038957, "grad_norm": 42.5, "kl": 1.8043994903564453, "learning_rate": 5e-07, "logits/chosen": -27471772.0, "logits/rejected": -4270633.5, "logps/chosen": -350.3060607910156, "logps/rejected": -118.19342803955078, "loss": 0.232, "rewards/chosen": 0.8799505233764648, "rewards/margins": 3.4208502769470215, "rewards/rejected": -2.5408997535705566, "step": 11655 }, { "epoch": 0.6178146450056979, "grad_norm": 52.5, "kl": 2.676028251647949, "learning_rate": 5e-07, "logits/chosen": -10884540.8, "logits/rejected": 3660871.3333333335, "logps/chosen": -125.3178466796875, "logps/rejected": -118.9779052734375, "loss": 0.4699, "rewards/chosen": -0.21643338203430176, "rewards/margins": 1.842016839981079, "rewards/rejected": -2.058450222015381, "step": 11656 }, { "epoch": 0.6178676490075, "grad_norm": 33.5, "kl": 0.8615274429321289, "learning_rate": 5e-07, "logits/chosen": -23561629.333333332, "logits/rejected": -22698076.0, "logps/chosen": -450.8323567708333, "logps/rejected": -215.87376403808594, "loss": 0.1957, "rewards/chosen": 1.8044555981953938, "rewards/margins": 4.2672012646993, "rewards/rejected": -2.4627456665039062, "step": 11657 }, { "epoch": 0.6179206530093022, "grad_norm": 49.25, "kl": 0.5346269607543945, "learning_rate": 5e-07, "logits/chosen": -22761688.0, "logits/rejected": -28724371.2, "logps/chosen": -233.8099365234375, "logps/rejected": -426.217626953125, "loss": 0.3577, "rewards/chosen": -0.2552364269892375, "rewards/margins": 1.3172983249028523, "rewards/rejected": -1.5725347518920898, "step": 11658 }, { "epoch": 0.6179736570111043, "grad_norm": 50.75, "kl": 0.8645868301391602, "learning_rate": 5e-07, "logits/chosen": 5474890.5, "logits/rejected": -12590822.857142856, "logps/chosen": -35.438812255859375, "logps/rejected": -186.71280343191964, "loss": 0.2656, "rewards/chosen": 0.09439430385828018, "rewards/margins": 1.5122231362121445, "rewards/rejected": -1.4178288323538644, "step": 11659 }, { "epoch": 0.6180266610129065, "grad_norm": 45.25, "kl": 0.214202880859375, "learning_rate": 5e-07, "logits/chosen": -5124784.0, "logits/rejected": 53752384.0, "logps/chosen": -268.99163818359375, "logps/rejected": -336.2589111328125, "loss": 0.2248, "rewards/chosen": 0.21347732841968536, "rewards/margins": 2.4030216882626214, "rewards/rejected": -2.189544359842936, "step": 11660 }, { "epoch": 0.6180796650147086, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81534512.0, "logits/rejected": -17969656.0, "logps/chosen": -666.215576171875, "logps/rejected": -237.75066266741072, "loss": 0.1479, "rewards/chosen": 0.694805920124054, "rewards/margins": 3.23587794814791, "rewards/rejected": -2.541072028023856, "step": 11661 }, { "epoch": 0.6181326690165108, "grad_norm": 32.75, "kl": 4.068830490112305, "learning_rate": 5e-07, "logits/chosen": -16821622.4, "logits/rejected": -48595333.333333336, "logps/chosen": -154.4575927734375, "logps/rejected": -547.6837565104166, "loss": 0.26, "rewards/chosen": 1.0559165000915527, "rewards/margins": 5.091795953114827, "rewards/rejected": -4.035879453023274, "step": 11662 }, { "epoch": 0.6181856730183128, "grad_norm": 43.75, "kl": 1.914815902709961, "learning_rate": 5e-07, "logits/chosen": 850017.0833333334, "logits/rejected": -13524520.0, "logps/chosen": -114.41173299153645, "logps/rejected": -117.32449340820312, "loss": 0.3366, "rewards/chosen": 0.5818641980489095, "rewards/margins": 3.215302069981893, "rewards/rejected": -2.6334378719329834, "step": 11663 }, { "epoch": 0.618238677020115, "grad_norm": 44.5, "kl": 1.4230871200561523, "learning_rate": 5e-07, "logits/chosen": -17746737.6, "logits/rejected": -53921877.333333336, "logps/chosen": -158.15262451171876, "logps/rejected": -345.5235188802083, "loss": 0.2906, "rewards/chosen": 0.8404892921447754, "rewards/margins": 3.1147342363993324, "rewards/rejected": -2.274244944254557, "step": 11664 }, { "epoch": 0.6182916810219171, "grad_norm": 86.5, "kl": 1.1726751327514648, "learning_rate": 5e-07, "logits/chosen": -10505128.8, "logits/rejected": -51626629.333333336, "logps/chosen": -445.719921875, "logps/rejected": -431.7770589192708, "loss": 0.2756, "rewards/chosen": 0.8327196121215821, "rewards/margins": 2.6693469365437825, "rewards/rejected": -1.8366273244222004, "step": 11665 }, { "epoch": 0.6183446850237193, "grad_norm": 40.0, "kl": 2.8304367065429688, "learning_rate": 5e-07, "logits/chosen": -13513251.0, "logits/rejected": -7072593.0, "logps/chosen": -152.61221313476562, "logps/rejected": -179.7750244140625, "loss": 0.2798, "rewards/chosen": 0.19529588520526886, "rewards/margins": 3.576657608151436, "rewards/rejected": -3.381361722946167, "step": 11666 }, { "epoch": 0.6183976890255214, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39680580.0, "logits/rejected": -36624340.0, "logps/chosen": -157.1087188720703, "logps/rejected": -277.3876037597656, "loss": 0.3063, "rewards/chosen": 0.013016417622566223, "rewards/margins": 2.4174756556749344, "rewards/rejected": -2.404459238052368, "step": 11667 }, { "epoch": 0.6184506930273236, "grad_norm": 42.25, "kl": 2.104076385498047, "learning_rate": 5e-07, "logits/chosen": -11694401.0, "logits/rejected": 157970048.0, "logps/chosen": -271.2421569824219, "logps/rejected": -348.83380126953125, "loss": 0.2476, "rewards/chosen": 0.6402462124824524, "rewards/margins": 2.952430546283722, "rewards/rejected": -2.3121843338012695, "step": 11668 }, { "epoch": 0.6185036970291257, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7992595.333333333, "logits/rejected": -58221043.2, "logps/chosen": -81.0850321451823, "logps/rejected": -217.079833984375, "loss": 0.2683, "rewards/chosen": 0.70216170946757, "rewards/margins": 2.209483257929484, "rewards/rejected": -1.507321548461914, "step": 11669 }, { "epoch": 0.6185567010309279, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25904717.333333332, "logits/rejected": -38542614.4, "logps/chosen": -338.2720947265625, "logps/rejected": -460.03251953125, "loss": 0.2574, "rewards/chosen": -0.19973297913869223, "rewards/margins": 2.4734834591547648, "rewards/rejected": -2.673216438293457, "step": 11670 }, { "epoch": 0.6186097050327299, "grad_norm": 45.25, "kl": 0.9698600769042969, "learning_rate": 5e-07, "logits/chosen": -19652784.0, "logits/rejected": 351050.3333333333, "logps/chosen": -290.30126953125, "logps/rejected": -137.93187459309897, "loss": 0.3599, "rewards/chosen": 0.2615126848220825, "rewards/margins": 3.3974856615066527, "rewards/rejected": -3.1359729766845703, "step": 11671 }, { "epoch": 0.6186627090345321, "grad_norm": 50.0, "kl": 0.24580001831054688, "learning_rate": 5e-07, "logits/chosen": -24625969.6, "logits/rejected": -30111141.333333332, "logps/chosen": -273.4114990234375, "logps/rejected": -449.9133707682292, "loss": 0.2038, "rewards/chosen": 1.18492431640625, "rewards/margins": 4.221313031514486, "rewards/rejected": -3.036388715108236, "step": 11672 }, { "epoch": 0.6187157130363342, "grad_norm": 52.75, "kl": 2.2820119857788086, "learning_rate": 5e-07, "logits/chosen": -66160512.0, "logits/rejected": -61818757.333333336, "logps/chosen": -288.57587890625, "logps/rejected": -451.5892740885417, "loss": 0.2496, "rewards/chosen": 0.736003065109253, "rewards/margins": 3.7164076010386147, "rewards/rejected": -2.980404535929362, "step": 11673 }, { "epoch": 0.6187687170381364, "grad_norm": 40.75, "kl": 1.5919780731201172, "learning_rate": 5e-07, "logits/chosen": -38214960.0, "logits/rejected": -4396460.333333333, "logps/chosen": -247.02158203125, "logps/rejected": -108.99163818359375, "loss": 0.2393, "rewards/chosen": 1.185866928100586, "rewards/margins": 2.908874766031901, "rewards/rejected": -1.7230078379313152, "step": 11674 }, { "epoch": 0.6188217210399385, "grad_norm": 49.75, "kl": 4.6552734375, "learning_rate": 5e-07, "logits/chosen": -49543264.0, "logits/rejected": -23717645.333333332, "logps/chosen": -519.96259765625, "logps/rejected": -225.07975260416666, "loss": 0.4016, "rewards/chosen": 0.3547271728515625, "rewards/margins": 3.034000778198242, "rewards/rejected": -2.6792736053466797, "step": 11675 }, { "epoch": 0.6188747250417407, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7362117.0, "logits/rejected": -17513968.0, "logps/chosen": -183.16526794433594, "logps/rejected": -344.4678431919643, "loss": 0.1532, "rewards/chosen": 1.4283554553985596, "rewards/margins": 3.6983187198638916, "rewards/rejected": -2.269963264465332, "step": 11676 }, { "epoch": 0.6189277290435428, "grad_norm": 45.25, "kl": 6.431068420410156, "learning_rate": 5e-07, "logits/chosen": -33549642.666666668, "logits/rejected": -11664258.0, "logps/chosen": -472.6572265625, "logps/rejected": -475.3938903808594, "loss": 0.3594, "rewards/chosen": 1.0996028582255046, "rewards/margins": 2.3976924816767378, "rewards/rejected": -1.298089623451233, "step": 11677 }, { "epoch": 0.618980733045345, "grad_norm": 47.25, "kl": 2.971080780029297, "learning_rate": 5e-07, "logits/chosen": -19216960.0, "logits/rejected": -6397031.0, "logps/chosen": -106.19682312011719, "logps/rejected": -527.7056884765625, "loss": 0.2762, "rewards/chosen": 0.3675946891307831, "rewards/margins": 3.852319210767746, "rewards/rejected": -3.484724521636963, "step": 11678 }, { "epoch": 0.619033737047147, "grad_norm": 51.0, "kl": 0.04101371765136719, "learning_rate": 5e-07, "logits/chosen": -31332392.0, "logits/rejected": -7635803.2, "logps/chosen": -234.41507975260416, "logps/rejected": -192.2464599609375, "loss": 0.3797, "rewards/chosen": -0.2060689926147461, "rewards/margins": 1.2593483924865723, "rewards/rejected": -1.4654173851013184, "step": 11679 }, { "epoch": 0.6190867410489492, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6160280.0, "logits/rejected": -52480268.0, "logps/chosen": -39.62684631347656, "logps/rejected": -367.8100280761719, "loss": 0.2509, "rewards/chosen": 0.3980711102485657, "rewards/margins": 2.7491655945777893, "rewards/rejected": -2.3510944843292236, "step": 11680 }, { "epoch": 0.6191397450507513, "grad_norm": 61.5, "kl": 2.627595901489258, "learning_rate": 5e-07, "logits/chosen": -29592976.0, "logits/rejected": -15996065.0, "logps/chosen": -282.6111537388393, "logps/rejected": -263.2396545410156, "loss": 0.4261, "rewards/chosen": 0.570627076285226, "rewards/margins": 1.1424379774502347, "rewards/rejected": -0.5718109011650085, "step": 11681 }, { "epoch": 0.6191927490525535, "grad_norm": 41.0, "kl": 1.5330142974853516, "learning_rate": 5e-07, "logits/chosen": -935805.0, "logits/rejected": -9012636.0, "logps/chosen": -138.432373046875, "logps/rejected": -265.9149475097656, "loss": 0.2454, "rewards/chosen": 0.8309047222137451, "rewards/margins": 3.423295259475708, "rewards/rejected": -2.592390537261963, "step": 11682 }, { "epoch": 0.6192457530543556, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60294712.0, "logits/rejected": -27759152.0, "logps/chosen": -388.04559326171875, "logps/rejected": -225.62788899739584, "loss": 0.2978, "rewards/chosen": -0.6440277099609375, "rewards/margins": 1.0612297058105469, "rewards/rejected": -1.7052574157714844, "step": 11683 }, { "epoch": 0.6192987570561578, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42641092.0, "logits/rejected": -11604289.0, "logps/chosen": -319.1371765136719, "logps/rejected": -167.58627319335938, "loss": 0.3179, "rewards/chosen": 0.41321203112602234, "rewards/margins": 2.4555337131023407, "rewards/rejected": -2.0423216819763184, "step": 11684 }, { "epoch": 0.6193517610579599, "grad_norm": 90.0, "kl": 1.5538330078125, "learning_rate": 5e-07, "logits/chosen": 55070208.0, "logits/rejected": -23845013.333333332, "logps/chosen": -660.9083984375, "logps/rejected": -303.65069580078125, "loss": 0.3711, "rewards/chosen": 0.13519501686096191, "rewards/margins": 1.689571777979533, "rewards/rejected": -1.554376761118571, "step": 11685 }, { "epoch": 0.619404765059762, "grad_norm": 40.0, "kl": 1.0540237426757812, "learning_rate": 5e-07, "logits/chosen": -42801977.6, "logits/rejected": -29536181.333333332, "logps/chosen": -207.569677734375, "logps/rejected": -175.46795654296875, "loss": 0.3592, "rewards/chosen": -0.10127301216125488, "rewards/margins": 4.11062928835551, "rewards/rejected": -4.211902300516765, "step": 11686 }, { "epoch": 0.6194577690615641, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48591748.0, "logits/rejected": -15031013.333333334, "logps/chosen": -167.70953369140625, "logps/rejected": -239.1006062825521, "loss": 0.2073, "rewards/chosen": -0.36720409989356995, "rewards/margins": 2.358309358358383, "rewards/rejected": -2.725513458251953, "step": 11687 }, { "epoch": 0.6195107730633663, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38786389.333333336, "logits/rejected": -69266067.2, "logps/chosen": -468.53759765625, "logps/rejected": -335.92724609375, "loss": 0.1659, "rewards/chosen": 1.2568858464558919, "rewards/margins": 3.4640649159749346, "rewards/rejected": -2.207179069519043, "step": 11688 }, { "epoch": 0.6195637770651684, "grad_norm": 56.25, "kl": 1.3902091979980469, "learning_rate": 5e-07, "logits/chosen": -91009136.0, "logits/rejected": -58113892.0, "logps/chosen": -588.157958984375, "logps/rejected": -430.474365234375, "loss": 0.2717, "rewards/chosen": 0.6839633584022522, "rewards/margins": 3.1473644375801086, "rewards/rejected": -2.4634010791778564, "step": 11689 }, { "epoch": 0.6196167810669706, "grad_norm": 47.5, "kl": 6.534465789794922, "learning_rate": 5e-07, "logits/chosen": -8939900.57142857, "logits/rejected": -144427648.0, "logps/chosen": -204.40562220982142, "logps/rejected": -608.9458618164062, "loss": 0.4734, "rewards/chosen": 0.535642181124006, "rewards/margins": 3.0524940831320624, "rewards/rejected": -2.5168519020080566, "step": 11690 }, { "epoch": 0.6196697850687727, "grad_norm": 36.25, "kl": 1.0604915618896484, "learning_rate": 5e-07, "logits/chosen": -7728089.6, "logits/rejected": -42507040.0, "logps/chosen": -110.1275146484375, "logps/rejected": -495.7008870442708, "loss": 0.2517, "rewards/chosen": 0.6711021900177002, "rewards/margins": 3.6295006910959877, "rewards/rejected": -2.9583985010782876, "step": 11691 }, { "epoch": 0.6197227890705749, "grad_norm": 37.25, "kl": 2.3009796142578125, "learning_rate": 5e-07, "logits/chosen": -2026208.3333333333, "logits/rejected": -7111336.0, "logps/chosen": -299.81585693359375, "logps/rejected": -259.573681640625, "loss": 0.1745, "rewards/chosen": 1.4567540486653645, "rewards/margins": 4.201038297017416, "rewards/rejected": -2.744284248352051, "step": 11692 }, { "epoch": 0.619775793072377, "grad_norm": 59.25, "kl": 5.423133850097656, "learning_rate": 5e-07, "logits/chosen": -49707680.0, "logits/rejected": -17053736.0, "logps/chosen": -363.2484654017857, "logps/rejected": -385.8284912109375, "loss": 0.4704, "rewards/chosen": 0.5720814296177456, "rewards/margins": 2.5722218581608365, "rewards/rejected": -2.000140428543091, "step": 11693 }, { "epoch": 0.6198287970741791, "grad_norm": 38.5, "kl": 0.9637832641601562, "learning_rate": 5e-07, "logits/chosen": -10617532.0, "logits/rejected": -1370049.3333333333, "logps/chosen": -211.2533935546875, "logps/rejected": -406.92041015625, "loss": 0.271, "rewards/chosen": 0.7486531257629394, "rewards/margins": 2.961682987213135, "rewards/rejected": -2.2130298614501953, "step": 11694 }, { "epoch": 0.6198818010759812, "grad_norm": 62.25, "kl": 0.033908843994140625, "learning_rate": 5e-07, "logits/chosen": 7187394.666666667, "logits/rejected": -8909885.0, "logps/chosen": -561.8748779296875, "logps/rejected": -189.0812225341797, "loss": 0.31, "rewards/chosen": 0.9616055488586426, "rewards/margins": 1.9067242741584778, "rewards/rejected": -0.9451187252998352, "step": 11695 }, { "epoch": 0.6199348050777834, "grad_norm": 59.25, "kl": 0.9478988647460938, "learning_rate": 5e-07, "logits/chosen": -46751896.0, "logits/rejected": -38495832.0, "logps/chosen": -420.90826416015625, "logps/rejected": -245.4658660888672, "loss": 0.281, "rewards/chosen": 0.3703209161758423, "rewards/margins": 2.3290486335754395, "rewards/rejected": -1.9587277173995972, "step": 11696 }, { "epoch": 0.6199878090795855, "grad_norm": 63.25, "kl": 0.3439903259277344, "learning_rate": 5e-07, "logits/chosen": -58708736.0, "logits/rejected": -15283712.0, "logps/chosen": -785.1761474609375, "logps/rejected": -386.7106119791667, "loss": 0.2371, "rewards/chosen": 2.0769083499908447, "rewards/margins": 3.9769294261932373, "rewards/rejected": -1.9000210762023926, "step": 11697 }, { "epoch": 0.6200408130813877, "grad_norm": 44.75, "kl": 10.027746200561523, "learning_rate": 5e-07, "logits/chosen": -19394628.0, "logits/rejected": -5452572.5, "logps/chosen": -216.43086751302084, "logps/rejected": -71.40708923339844, "loss": 0.4572, "rewards/chosen": 0.9121060371398926, "rewards/margins": 1.9521698951721191, "rewards/rejected": -1.0400638580322266, "step": 11698 }, { "epoch": 0.6200938170831898, "grad_norm": 42.5, "kl": 2.893604278564453, "learning_rate": 5e-07, "logits/chosen": -8405271.0, "logits/rejected": 624505.875, "logps/chosen": -424.6981201171875, "logps/rejected": -535.023681640625, "loss": 0.1678, "rewards/chosen": 1.4346460103988647, "rewards/margins": 4.902363181114197, "rewards/rejected": -3.467717170715332, "step": 11699 }, { "epoch": 0.6201468210849919, "grad_norm": 46.0, "kl": 1.1293792724609375, "learning_rate": 5e-07, "logits/chosen": -19635496.0, "logits/rejected": -26714074.666666668, "logps/chosen": -389.279931640625, "logps/rejected": -296.61521402994794, "loss": 0.2865, "rewards/chosen": 0.4206707954406738, "rewards/margins": 3.6104097048441566, "rewards/rejected": -3.189738909403483, "step": 11700 }, { "epoch": 0.620199825086794, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40571958.4, "logits/rejected": -66450261.333333336, "logps/chosen": -258.96396484375, "logps/rejected": -921.17626953125, "loss": 0.2911, "rewards/chosen": 0.15702362060546876, "rewards/margins": 5.901136271158855, "rewards/rejected": -5.744112650553386, "step": 11701 }, { "epoch": 0.6202528290885961, "grad_norm": 51.25, "kl": 2.146770477294922, "learning_rate": 5e-07, "logits/chosen": 371761.0, "logits/rejected": -25485168.0, "logps/chosen": -448.39276123046875, "logps/rejected": -349.4381103515625, "loss": 0.1685, "rewards/chosen": 1.417720079421997, "rewards/margins": 3.2302318414052325, "rewards/rejected": -1.8125117619832356, "step": 11702 }, { "epoch": 0.6203058330903983, "grad_norm": 55.5, "kl": 1.1586265563964844, "learning_rate": 5e-07, "logits/chosen": -15539894.0, "logits/rejected": -20199906.0, "logps/chosen": -291.61041259765625, "logps/rejected": -289.82342529296875, "loss": 0.3176, "rewards/chosen": 0.04879635572433472, "rewards/margins": 1.6552895903587341, "rewards/rejected": -1.6064932346343994, "step": 11703 }, { "epoch": 0.6203588370922004, "grad_norm": 53.25, "kl": 2.418516159057617, "learning_rate": 5e-07, "logits/chosen": -85630824.0, "logps/chosen": -337.39599609375, "loss": 0.3735, "rewards/chosen": 0.8222826719284058, "step": 11704 }, { "epoch": 0.6204118410940026, "grad_norm": 79.5, "kl": 2.0766143798828125, "learning_rate": 5e-07, "logits/chosen": -7327424.8, "logits/rejected": -29635312.0, "logps/chosen": -574.53349609375, "logps/rejected": -521.73583984375, "loss": 0.2141, "rewards/chosen": 1.1348796844482423, "rewards/margins": 3.533095932006836, "rewards/rejected": -2.3982162475585938, "step": 11705 }, { "epoch": 0.6204648450958047, "grad_norm": 44.0, "kl": 1.4248504638671875, "learning_rate": 5e-07, "logits/chosen": -18909482.0, "logits/rejected": -22974080.0, "logps/chosen": -374.76531982421875, "logps/rejected": -288.63262939453125, "loss": 0.324, "rewards/chosen": 0.015489950776100159, "rewards/margins": 2.0076277405023575, "rewards/rejected": -1.9921377897262573, "step": 11706 }, { "epoch": 0.6205178490976069, "grad_norm": 59.5, "kl": 0.2729225158691406, "learning_rate": 5e-07, "logits/chosen": -55540480.0, "logits/rejected": -22677008.0, "logps/chosen": -430.446875, "logps/rejected": -319.2906901041667, "loss": 0.3456, "rewards/chosen": 0.27651491165161135, "rewards/margins": 1.833080832163493, "rewards/rejected": -1.5565659205118816, "step": 11707 }, { "epoch": 0.620570853099409, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12558732.0, "logits/rejected": -10704269.0, "logps/chosen": -173.99039713541666, "logps/rejected": -222.243896484375, "loss": 0.46, "rewards/chosen": -0.149107426404953, "rewards/margins": 1.2256080210208893, "rewards/rejected": -1.3747154474258423, "step": 11708 }, { "epoch": 0.6206238571012112, "grad_norm": 62.75, "kl": 0.5509834289550781, "learning_rate": 5e-07, "logits/chosen": 59520954.666666664, "logits/rejected": -27067660.8, "logps/chosen": -457.0615234375, "logps/rejected": -311.824951171875, "loss": 0.1675, "rewards/chosen": 1.2333730061848958, "rewards/margins": 3.6029198964436846, "rewards/rejected": -2.369546890258789, "step": 11709 }, { "epoch": 0.6206768611030132, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84713109.33333333, "logits/rejected": -15927818.0, "logps/chosen": -120.90232340494792, "logps/rejected": -142.89215087890625, "loss": 0.364, "rewards/chosen": 0.5001775423685709, "rewards/margins": 2.079330245653788, "rewards/rejected": -1.5791527032852173, "step": 11710 }, { "epoch": 0.6207298651048154, "grad_norm": 36.25, "kl": 3.536463737487793, "learning_rate": 5e-07, "logits/chosen": 5797270.5, "logits/rejected": -49792328.0, "logps/chosen": -28.15732192993164, "logps/rejected": -191.66259765625, "loss": 0.2674, "rewards/chosen": 0.804581344127655, "rewards/margins": 2.7810258269309998, "rewards/rejected": -1.9764444828033447, "step": 11711 }, { "epoch": 0.6207828691066175, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48193722.666666664, "logits/rejected": -8397076.8, "logps/chosen": -342.406005859375, "logps/rejected": -204.6236328125, "loss": 0.2435, "rewards/chosen": -0.0348287026087443, "rewards/margins": 2.583342989285787, "rewards/rejected": -2.6181716918945312, "step": 11712 }, { "epoch": 0.6208358731084197, "grad_norm": 38.0, "kl": 2.5486717224121094, "learning_rate": 5e-07, "logits/chosen": 3236944.0, "logits/rejected": -47229676.8, "logps/chosen": -202.64261881510416, "logps/rejected": -325.33125, "loss": 0.1782, "rewards/chosen": 1.1333545049031575, "rewards/margins": 3.0456759770711264, "rewards/rejected": -1.9123214721679687, "step": 11713 }, { "epoch": 0.6208888771102218, "grad_norm": 44.25, "kl": 0.8221454620361328, "learning_rate": 5e-07, "logits/chosen": -18852829.333333332, "logits/rejected": -40626838.4, "logps/chosen": -199.5601806640625, "logps/rejected": -156.247705078125, "loss": 0.2359, "rewards/chosen": 1.3164749145507812, "rewards/margins": 2.5663166999816895, "rewards/rejected": -1.2498417854309083, "step": 11714 }, { "epoch": 0.620941881112024, "grad_norm": 23.625, "kl": 0.4439277648925781, "learning_rate": 5e-07, "logits/chosen": 25734184.0, "logits/rejected": -47547424.0, "logps/chosen": -912.1679077148438, "logps/rejected": -523.3546549479166, "loss": 0.0937, "rewards/chosen": 2.3704025745391846, "rewards/margins": 5.301714181900024, "rewards/rejected": -2.93131160736084, "step": 11715 }, { "epoch": 0.6209948851138261, "grad_norm": 105.0, "kl": 5.200481414794922, "learning_rate": 5e-07, "logits/chosen": -17612010.666666668, "logits/rejected": -28026856.0, "logps/chosen": -661.5707194010416, "logps/rejected": -349.20269775390625, "loss": 0.3737, "rewards/chosen": 0.8578247229258219, "rewards/margins": 3.2693375746409097, "rewards/rejected": -2.411512851715088, "step": 11716 }, { "epoch": 0.6210478891156282, "grad_norm": 49.0, "kl": 0.3549537658691406, "learning_rate": 5e-07, "logits/chosen": -19384006.4, "logits/rejected": -21381900.0, "logps/chosen": -274.279541015625, "logps/rejected": -238.8568115234375, "loss": 0.3329, "rewards/chosen": 0.30234594345092775, "rewards/margins": 2.6889909426371257, "rewards/rejected": -2.3866449991861978, "step": 11717 }, { "epoch": 0.6211008931174303, "grad_norm": 46.25, "kl": 1.5705909729003906, "learning_rate": 5e-07, "logits/chosen": 3835124.0, "logits/rejected": -31009992.0, "logps/chosen": -17.94973373413086, "logps/rejected": -596.8767496744791, "loss": 0.2454, "rewards/chosen": 0.16194315254688263, "rewards/margins": 3.1002709418535233, "rewards/rejected": -2.9383277893066406, "step": 11718 }, { "epoch": 0.6211538971192325, "grad_norm": 27.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3224864.5, "logits/rejected": -22467298.666666668, "logps/chosen": -48.19904327392578, "logps/rejected": -559.71435546875, "loss": 0.1396, "rewards/chosen": 0.41948071122169495, "rewards/margins": 3.55266864101092, "rewards/rejected": -3.133187929789225, "step": 11719 }, { "epoch": 0.6212069011210346, "grad_norm": 37.5, "kl": 1.9121007919311523, "learning_rate": 5e-07, "logits/chosen": -28691069.333333332, "logits/rejected": -31181478.4, "logps/chosen": -224.55682373046875, "logps/rejected": -290.6456298828125, "loss": 0.2397, "rewards/chosen": 0.8983232180277506, "rewards/margins": 3.24892250696818, "rewards/rejected": -2.3505992889404297, "step": 11720 }, { "epoch": 0.6212599051228368, "grad_norm": 44.5, "kl": 1.2856082916259766, "learning_rate": 5e-07, "logits/chosen": -15898068.0, "logits/rejected": -8461262.0, "logps/chosen": -256.73590087890625, "logps/rejected": -185.3818359375, "loss": 0.3498, "rewards/chosen": 0.3664777874946594, "rewards/margins": 1.627604901790619, "rewards/rejected": -1.2611271142959595, "step": 11721 }, { "epoch": 0.6213129091246389, "grad_norm": 30.625, "kl": 0.7132701873779297, "learning_rate": 5e-07, "logits/chosen": -35748772.0, "logits/rejected": -14963169.0, "logps/chosen": -215.19204711914062, "logps/rejected": -208.51754760742188, "loss": 0.1615, "rewards/chosen": 1.2904112339019775, "rewards/margins": 4.2872443199157715, "rewards/rejected": -2.996833086013794, "step": 11722 }, { "epoch": 0.6213659131264411, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -131385941.33333333, "logits/rejected": -25683136.0, "logps/chosen": -359.290283203125, "logps/rejected": -359.760595703125, "loss": 0.2296, "rewards/chosen": 0.6549123128255209, "rewards/margins": 2.56795171101888, "rewards/rejected": -1.9130393981933593, "step": 11723 }, { "epoch": 0.6214189171282432, "grad_norm": 48.25, "kl": 0.9857845306396484, "learning_rate": 5e-07, "logits/chosen": -16564469.333333334, "logits/rejected": -31290342.0, "logps/chosen": -216.3886922200521, "logps/rejected": -399.3424987792969, "loss": 0.41, "rewards/chosen": 0.09397891163825989, "rewards/margins": 2.268443375825882, "rewards/rejected": -2.174464464187622, "step": 11724 }, { "epoch": 0.6214719211300453, "grad_norm": 43.75, "kl": 2.25616455078125, "learning_rate": 5e-07, "logits/chosen": -4545189.0, "logits/rejected": 95784102.4, "logps/chosen": -247.71268717447916, "logps/rejected": -171.6694580078125, "loss": 0.3189, "rewards/chosen": 0.8467158476511637, "rewards/margins": 2.147028175989787, "rewards/rejected": -1.300312328338623, "step": 11725 }, { "epoch": 0.6215249251318474, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31242208.0, "logits/rejected": -7312489.5, "logps/chosen": -436.39141845703125, "logps/rejected": -517.3574829101562, "loss": 0.2249, "rewards/chosen": 0.854230523109436, "rewards/margins": 3.601488947868347, "rewards/rejected": -2.747258424758911, "step": 11726 }, { "epoch": 0.6215779291336496, "grad_norm": 41.25, "kl": 1.2195024490356445, "learning_rate": 5e-07, "logits/chosen": 3654567.75, "logits/rejected": -11006526.0, "logps/chosen": -154.6666717529297, "logps/rejected": -149.9756622314453, "loss": 0.279, "rewards/chosen": 0.8164685964584351, "rewards/margins": 2.5436594486236572, "rewards/rejected": -1.7271908521652222, "step": 11727 }, { "epoch": 0.6216309331354517, "grad_norm": 80.5, "kl": 5.255889892578125, "learning_rate": 5e-07, "logits/chosen": -43723040.0, "logits/rejected": -8574670.0, "logps/chosen": -806.42978515625, "logps/rejected": -221.30106608072916, "loss": 0.4134, "rewards/chosen": 0.9633245468139648, "rewards/margins": 2.0913550059000654, "rewards/rejected": -1.1280304590861003, "step": 11728 }, { "epoch": 0.6216839371372539, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89063920.0, "logits/rejected": -28594138.666666668, "logps/chosen": -782.1348876953125, "logps/rejected": -355.7675374348958, "loss": 0.1368, "rewards/chosen": 1.5198456048965454, "rewards/margins": 4.433048208554586, "rewards/rejected": -2.9132026036580405, "step": 11729 }, { "epoch": 0.621736941139056, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 981167.3333333334, "logits/rejected": -39345792.0, "logps/chosen": -372.4513346354167, "logps/rejected": -446.433203125, "loss": 0.2213, "rewards/chosen": 0.4251943031946818, "rewards/margins": 2.9078431526819863, "rewards/rejected": -2.4826488494873047, "step": 11730 }, { "epoch": 0.6217899451408582, "grad_norm": 54.0, "kl": 4.014604568481445, "learning_rate": 5e-07, "logits/chosen": -37060128.0, "logits/rejected": -23012286.0, "logps/chosen": -492.950439453125, "logps/rejected": -206.48977661132812, "loss": 0.1922, "rewards/chosen": 1.2438820600509644, "rewards/margins": 3.328189730644226, "rewards/rejected": -2.0843076705932617, "step": 11731 }, { "epoch": 0.6218429491426603, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41520512.0, "logits/rejected": 3099084.75, "logps/chosen": -299.61785888671875, "logps/rejected": -223.0302276611328, "loss": 0.2922, "rewards/chosen": 0.3743419647216797, "rewards/margins": 2.868058919906616, "rewards/rejected": -2.4937169551849365, "step": 11732 }, { "epoch": 0.6218959531444624, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55239880.0, "logits/rejected": -16352591.0, "logps/chosen": -438.7638854980469, "logps/rejected": -575.589599609375, "loss": 0.1672, "rewards/chosen": 0.9581207633018494, "rewards/margins": 4.529970586299896, "rewards/rejected": -3.571849822998047, "step": 11733 }, { "epoch": 0.6219489571462645, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27492812.0, "logits/rejected": -33957293.333333336, "logps/chosen": -154.43182373046875, "logps/rejected": -344.9349772135417, "loss": 0.169, "rewards/chosen": 0.4858818054199219, "rewards/margins": 3.0259809494018555, "rewards/rejected": -2.5400991439819336, "step": 11734 }, { "epoch": 0.6220019611480667, "grad_norm": 55.5, "kl": 0.5910873413085938, "learning_rate": 5e-07, "logits/chosen": -23204872.0, "logits/rejected": -40207914.666666664, "logps/chosen": -401.350390625, "logps/rejected": -341.6268717447917, "loss": 0.2739, "rewards/chosen": 0.6154794692993164, "rewards/margins": 2.927565574645996, "rewards/rejected": -2.3120861053466797, "step": 11735 }, { "epoch": 0.6220549651498688, "grad_norm": 63.0, "kl": 1.4813575744628906, "learning_rate": 5e-07, "logits/chosen": -2180565.3333333335, "logits/rejected": -26362500.0, "logps/chosen": -330.0872802734375, "logps/rejected": -277.50823974609375, "loss": 0.4612, "rewards/chosen": 0.056542461117108665, "rewards/margins": 1.477419560154279, "rewards/rejected": -1.4208770990371704, "step": 11736 }, { "epoch": 0.622107969151671, "grad_norm": 51.0, "kl": 1.5823440551757812, "learning_rate": 5e-07, "logits/chosen": 6596115.0, "logits/rejected": 28390408.0, "logps/chosen": -690.052001953125, "logps/rejected": -337.3620910644531, "loss": 0.2813, "rewards/chosen": 1.2492706775665283, "rewards/margins": 2.6346319913864136, "rewards/rejected": -1.3853613138198853, "step": 11737 }, { "epoch": 0.6221609731534731, "grad_norm": 27.75, "kl": 0.08895492553710938, "learning_rate": 5e-07, "logits/chosen": -22954969.6, "logits/rejected": 5313637.0, "logps/chosen": -65.22996826171875, "logps/rejected": -563.0549723307291, "loss": 0.3678, "rewards/chosen": -0.27276105880737306, "rewards/margins": 2.927642790476481, "rewards/rejected": -3.200403849283854, "step": 11738 }, { "epoch": 0.6222139771552753, "grad_norm": 48.75, "kl": 2.223825454711914, "learning_rate": 5e-07, "logits/chosen": -26971154.666666668, "logits/rejected": -22606916.0, "logps/chosen": -237.2881876627604, "logps/rejected": -343.55438232421875, "loss": 0.4489, "rewards/chosen": 0.0517129252354304, "rewards/margins": 1.9215349505345027, "rewards/rejected": -1.8698220252990723, "step": 11739 }, { "epoch": 0.6222669811570773, "grad_norm": 52.5, "kl": 2.137150764465332, "learning_rate": 5e-07, "logits/chosen": 5610860.8, "logits/rejected": -27263200.0, "logps/chosen": -271.2404296875, "logps/rejected": -516.55029296875, "loss": 0.3003, "rewards/chosen": 0.4301022529602051, "rewards/margins": 3.946584924062093, "rewards/rejected": -3.516482671101888, "step": 11740 }, { "epoch": 0.6223199851588795, "grad_norm": 53.0, "kl": 1.998321533203125, "learning_rate": 5e-07, "logits/chosen": -14936644.0, "logits/rejected": -11458725.0, "logps/chosen": -411.9894714355469, "logps/rejected": -311.1746520996094, "loss": 0.2674, "rewards/chosen": 0.6488014459609985, "rewards/margins": 2.533094048500061, "rewards/rejected": -1.8842926025390625, "step": 11741 }, { "epoch": 0.6223729891606816, "grad_norm": 40.5, "kl": 0.6240615844726562, "learning_rate": 5e-07, "logits/chosen": -43946002.666666664, "logits/rejected": -35110777.6, "logps/chosen": -340.6691487630208, "logps/rejected": -513.6259765625, "loss": 0.191, "rewards/chosen": 0.6285927295684814, "rewards/margins": 3.360688829421997, "rewards/rejected": -2.7320960998535155, "step": 11742 }, { "epoch": 0.6224259931624838, "grad_norm": 38.25, "kl": 1.297445297241211, "learning_rate": 5e-07, "logits/chosen": -22407112.0, "logits/rejected": -34646296.0, "logps/chosen": -180.77444458007812, "logps/rejected": -255.72239685058594, "loss": 0.2549, "rewards/chosen": 0.35899460315704346, "rewards/margins": 2.905488133430481, "rewards/rejected": -2.5464935302734375, "step": 11743 }, { "epoch": 0.6224789971642859, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4512661.0, "logits/rejected": -15309580.57142857, "logps/chosen": -21.746257781982422, "logps/rejected": -317.08510044642856, "loss": 0.1553, "rewards/chosen": -0.029448127374053, "rewards/margins": 2.7346357349306345, "rewards/rejected": -2.7640838623046875, "step": 11744 }, { "epoch": 0.6225320011660881, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21728522.0, "logits/rejected": -26318792.0, "logps/chosen": -293.803955078125, "logps/rejected": -385.0560709635417, "loss": 0.2024, "rewards/chosen": 0.5496718883514404, "rewards/margins": 3.1940810680389404, "rewards/rejected": -2.6444091796875, "step": 11745 }, { "epoch": 0.6225850051678902, "grad_norm": 54.0, "kl": 4.6932525634765625, "learning_rate": 5e-07, "logits/chosen": -29254963.2, "logits/rejected": -24395082.666666668, "logps/chosen": -394.0751953125, "logps/rejected": -162.19047037760416, "loss": 0.4404, "rewards/chosen": -0.11084710359573365, "rewards/margins": 1.5421368797620136, "rewards/rejected": -1.6529839833577473, "step": 11746 }, { "epoch": 0.6226380091696924, "grad_norm": 53.25, "kl": 3.864622116088867, "learning_rate": 5e-07, "logits/chosen": 6310299.0, "logps/chosen": -346.3177490234375, "loss": 0.3431, "rewards/chosen": 1.3307936191558838, "step": 11747 }, { "epoch": 0.6226910131714944, "grad_norm": 45.75, "kl": 2.825763702392578, "learning_rate": 5e-07, "logits/chosen": -49965555.2, "logits/rejected": 78610405.33333333, "logps/chosen": -474.70224609375, "logps/rejected": -242.18595377604166, "loss": 0.2985, "rewards/chosen": 1.4540534973144532, "rewards/margins": 2.476845359802246, "rewards/rejected": -1.022791862487793, "step": 11748 }, { "epoch": 0.6227440171732966, "grad_norm": 73.0, "kl": 0.840484619140625, "learning_rate": 5e-07, "logits/chosen": -14746001.142857144, "logits/rejected": -66969424.0, "logps/chosen": -246.93118722098214, "logps/rejected": -582.82373046875, "loss": 0.3911, "rewards/chosen": 0.29330274036952425, "rewards/margins": 3.4727582590920583, "rewards/rejected": -3.179455518722534, "step": 11749 }, { "epoch": 0.6227970211750987, "grad_norm": 49.5, "kl": 1.2065162658691406, "learning_rate": 5e-07, "logits/chosen": -31608998.0, "logits/rejected": -39382136.0, "logps/chosen": -281.5870361328125, "logps/rejected": -313.1235046386719, "loss": 0.1728, "rewards/chosen": 1.1018825769424438, "rewards/margins": 4.067041993141174, "rewards/rejected": -2.9651594161987305, "step": 11750 }, { "epoch": 0.6228500251769008, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14286916.0, "logits/rejected": -44737211.428571425, "logps/chosen": -164.95623779296875, "logps/rejected": -493.5711146763393, "loss": 0.1556, "rewards/chosen": -0.3292251527309418, "rewards/margins": 2.404890720333372, "rewards/rejected": -2.734115873064314, "step": 11751 }, { "epoch": 0.622903029178703, "grad_norm": 32.75, "kl": 3.5294055938720703, "learning_rate": 5e-07, "logits/chosen": -11274960.8, "logits/rejected": 3682438.6666666665, "logps/chosen": -62.8928955078125, "logps/rejected": -281.38140869140625, "loss": 0.3857, "rewards/chosen": 0.21518616676330565, "rewards/margins": 2.3249628225962318, "rewards/rejected": -2.1097766558329263, "step": 11752 }, { "epoch": 0.6229560331805051, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17327924.0, "logits/rejected": -53530112.0, "logps/chosen": -154.78997802734375, "logps/rejected": -371.0371500651042, "loss": 0.1819, "rewards/chosen": 0.6815494298934937, "rewards/margins": 2.925448457400004, "rewards/rejected": -2.2438990275065103, "step": 11753 }, { "epoch": 0.6230090371823073, "grad_norm": 37.0, "kl": 0.24088287353515625, "learning_rate": 5e-07, "logits/chosen": -19628278.0, "logits/rejected": -36505264.0, "logps/chosen": -272.2660217285156, "logps/rejected": -237.9568328857422, "loss": 0.2872, "rewards/chosen": 0.33333829045295715, "rewards/margins": 2.4737110435962677, "rewards/rejected": -2.1403727531433105, "step": 11754 }, { "epoch": 0.6230620411841093, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62105664.0, "logits/rejected": -23532716.8, "logps/chosen": -787.55419921875, "logps/rejected": -328.371484375, "loss": 0.2223, "rewards/chosen": 1.077109734217326, "rewards/margins": 2.6600518385569254, "rewards/rejected": -1.5829421043395997, "step": 11755 }, { "epoch": 0.6231150451859115, "grad_norm": 62.75, "kl": 2.470998764038086, "learning_rate": 5e-07, "logits/chosen": -93453984.0, "logits/rejected": -15318744.0, "logps/chosen": -436.6161804199219, "logps/rejected": -506.93402099609375, "loss": 0.2643, "rewards/chosen": 0.5941675305366516, "rewards/margins": 3.1132732033729553, "rewards/rejected": -2.5191056728363037, "step": 11756 }, { "epoch": 0.6231680491877136, "grad_norm": 42.5, "kl": 1.17156982421875, "learning_rate": 5e-07, "logits/chosen": -34599712.0, "logits/rejected": -18415470.0, "logps/chosen": -222.84600830078125, "logps/rejected": -261.7370910644531, "loss": 0.2487, "rewards/chosen": 0.5579410791397095, "rewards/margins": 2.957853674888611, "rewards/rejected": -2.3999125957489014, "step": 11757 }, { "epoch": 0.6232210531895158, "grad_norm": 51.75, "kl": 0.5559234619140625, "learning_rate": 5e-07, "logits/chosen": -2785276.0, "logits/rejected": -18352246.0, "logps/chosen": -333.912109375, "logps/rejected": -179.43145751953125, "loss": 0.3657, "rewards/chosen": -0.5280085802078247, "rewards/margins": 1.9775315523147583, "rewards/rejected": -2.505540132522583, "step": 11758 }, { "epoch": 0.6232740571913179, "grad_norm": 56.0, "kl": 2.0608348846435547, "learning_rate": 5e-07, "logits/chosen": -19871904.0, "logits/rejected": -12272365.333333334, "logps/chosen": -931.4130859375, "logps/rejected": -370.9767252604167, "loss": 0.2573, "rewards/chosen": 0.9855364799499512, "rewards/margins": 4.005507691701253, "rewards/rejected": -3.0199712117513022, "step": 11759 }, { "epoch": 0.6233270611931201, "grad_norm": 36.5, "kl": 1.1963081359863281, "learning_rate": 5e-07, "logits/chosen": -22990204.0, "logits/rejected": -3804755.0, "logps/chosen": -191.41758728027344, "logps/rejected": -334.5111083984375, "loss": 0.2456, "rewards/chosen": 1.069391131401062, "rewards/margins": 3.0563913583755493, "rewards/rejected": -1.9870002269744873, "step": 11760 }, { "epoch": 0.6233800651949222, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23303538.666666668, "logits/rejected": -13986358.4, "logps/chosen": -215.31962076822916, "logps/rejected": -199.0561279296875, "loss": 0.1893, "rewards/chosen": 0.7277230421702067, "rewards/margins": 3.537762753168742, "rewards/rejected": -2.8100397109985353, "step": 11761 }, { "epoch": 0.6234330691967244, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -22843712.0, "logps/rejected": -256.6593017578125, "loss": 0.1643, "rewards/rejected": -2.013195276260376, "step": 11762 }, { "epoch": 0.6234860731985264, "grad_norm": 36.0, "kl": 1.8526992797851562, "learning_rate": 5e-07, "logits/chosen": -22760635.2, "logits/rejected": -16393542.666666666, "logps/chosen": -367.18759765625, "logps/rejected": -216.55244954427084, "loss": 0.2486, "rewards/chosen": 1.0073581695556642, "rewards/margins": 3.4435163497924806, "rewards/rejected": -2.4361581802368164, "step": 11763 }, { "epoch": 0.6235390772003286, "grad_norm": 64.5, "kl": 2.0266265869140625, "learning_rate": 5e-07, "logits/chosen": -40341368.0, "logps/chosen": -260.6321716308594, "loss": 0.4386, "rewards/chosen": 0.4775936007499695, "step": 11764 }, { "epoch": 0.6235920812021307, "grad_norm": 50.25, "kl": 3.825429916381836, "learning_rate": 5e-07, "logits/chosen": -43567437.71428572, "logits/rejected": -11703392.0, "logps/chosen": -220.61310686383928, "logps/rejected": -418.46990966796875, "loss": 0.4524, "rewards/chosen": 0.4070068768092564, "rewards/margins": 1.8823120764323642, "rewards/rejected": -1.475305199623108, "step": 11765 }, { "epoch": 0.6236450852039329, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10362850.0, "logits/rejected": -12529377.333333334, "logps/chosen": -188.63735961914062, "logps/rejected": -276.2806803385417, "loss": 0.2021, "rewards/chosen": 0.33115196228027344, "rewards/margins": 2.8413209915161133, "rewards/rejected": -2.51016902923584, "step": 11766 }, { "epoch": 0.623698089205735, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1372027.5, "logits/rejected": -30404912.0, "logps/chosen": -22.4251766204834, "logps/rejected": -446.4118245442708, "loss": 0.1629, "rewards/chosen": 1.4379279613494873, "rewards/margins": 3.5261730353037515, "rewards/rejected": -2.088245073954264, "step": 11767 }, { "epoch": 0.6237510932075372, "grad_norm": 56.75, "kl": 1.4108963012695312, "learning_rate": 5e-07, "logits/chosen": -58325274.666666664, "logits/rejected": 34623176.0, "logps/chosen": -339.9508056640625, "logps/rejected": -559.8727416992188, "loss": 0.346, "rewards/chosen": 0.555074135462443, "rewards/margins": 2.0133062998453775, "rewards/rejected": -1.4582321643829346, "step": 11768 }, { "epoch": 0.6238040972093393, "grad_norm": 30.625, "kl": 1.1487808227539062, "learning_rate": 5e-07, "logits/chosen": -14941983.0, "logits/rejected": -41269996.0, "logps/chosen": -160.29296875, "logps/rejected": -515.7154541015625, "loss": 0.1623, "rewards/chosen": 1.072108268737793, "rewards/margins": 4.38605809211731, "rewards/rejected": -3.3139498233795166, "step": 11769 }, { "epoch": 0.6238571012111415, "grad_norm": 44.5, "kl": 3.6896190643310547, "learning_rate": 5e-07, "logits/chosen": -28001772.8, "logits/rejected": -8898162.0, "logps/chosen": -225.8343017578125, "logps/rejected": -226.87788899739584, "loss": 0.3017, "rewards/chosen": 0.5143420696258545, "rewards/margins": 3.269270658493042, "rewards/rejected": -2.7549285888671875, "step": 11770 }, { "epoch": 0.6239101052129435, "grad_norm": 50.25, "kl": 0.040592193603515625, "learning_rate": 5e-07, "logits/chosen": -36978068.0, "logits/rejected": -19498788.0, "logps/chosen": -133.79241943359375, "logps/rejected": -227.43033854166666, "loss": 0.2342, "rewards/chosen": -0.011756896041333675, "rewards/margins": 2.129039129552742, "rewards/rejected": -2.1407960255940757, "step": 11771 }, { "epoch": 0.6239631092147457, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10807273.0, "logits/rejected": 23777.46875, "logps/chosen": -542.456787109375, "logps/rejected": -174.76911272321428, "loss": 0.2462, "rewards/chosen": 0.5937256217002869, "rewards/margins": 2.1970631820814948, "rewards/rejected": -1.6033375603812081, "step": 11772 }, { "epoch": 0.6240161132165478, "grad_norm": 68.5, "kl": 3.4240646362304688, "learning_rate": 5e-07, "logits/chosen": -33829652.571428575, "logits/rejected": -1503930.75, "logps/chosen": -390.07442801339283, "logps/rejected": -109.95231628417969, "loss": 0.4764, "rewards/chosen": 0.20398988042558944, "rewards/margins": 5.253850357873099, "rewards/rejected": -5.04986047744751, "step": 11773 }, { "epoch": 0.62406911721835, "grad_norm": 59.75, "kl": 2.3130340576171875, "learning_rate": 5e-07, "logits/chosen": -51702189.71428572, "logits/rejected": -556592.375, "logps/chosen": -574.1288364955357, "logps/rejected": -262.72021484375, "loss": 0.3525, "rewards/chosen": 0.8805773598807198, "rewards/margins": 2.3513583285467963, "rewards/rejected": -1.4707809686660767, "step": 11774 }, { "epoch": 0.6241221212201521, "grad_norm": 42.25, "kl": 2.385164260864258, "learning_rate": 5e-07, "logits/chosen": -15735102.4, "logits/rejected": 2949658.0, "logps/chosen": -248.0066162109375, "logps/rejected": -84.40924580891927, "loss": 0.4447, "rewards/chosen": 0.009435653686523438, "rewards/margins": 1.4487067858378093, "rewards/rejected": -1.4392711321512859, "step": 11775 }, { "epoch": 0.6241751252219543, "grad_norm": 56.0, "kl": 0.07725143432617188, "learning_rate": 5e-07, "logits/chosen": -43178825.6, "logits/rejected": -31810837.333333332, "logps/chosen": -444.2498046875, "logps/rejected": -231.1481730143229, "loss": 0.2435, "rewards/chosen": 0.6390892028808594, "rewards/margins": 3.694900449117025, "rewards/rejected": -3.0558112462361655, "step": 11776 }, { "epoch": 0.6242281292237564, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26776412.0, "logits/rejected": -20346072.0, "logps/chosen": -318.8890686035156, "logps/rejected": -307.9358317057292, "loss": 0.135, "rewards/chosen": 1.0669914484024048, "rewards/margins": 3.4646771351496377, "rewards/rejected": -2.397685686747233, "step": 11777 }, { "epoch": 0.6242811332255586, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12550280.0, "logits/rejected": -16519195.2, "logps/chosen": -292.5442301432292, "logps/rejected": -411.54853515625, "loss": 0.2225, "rewards/chosen": 0.06020150581995646, "rewards/margins": 3.3101152936617533, "rewards/rejected": -3.249913787841797, "step": 11778 }, { "epoch": 0.6243341372273606, "grad_norm": 50.25, "kl": 2.14385986328125, "learning_rate": 5e-07, "logits/chosen": -65815283.2, "logits/rejected": -5348908.666666667, "logps/chosen": -283.058203125, "logps/rejected": -128.91459147135416, "loss": 0.3407, "rewards/chosen": 0.20051419734954834, "rewards/margins": 4.070923844973246, "rewards/rejected": -3.8704096476236978, "step": 11779 }, { "epoch": 0.6243871412291628, "grad_norm": 41.25, "kl": 4.929252624511719, "learning_rate": 5e-07, "logits/chosen": -37329088.0, "logits/rejected": -42233248.0, "logps/chosen": -468.1673177083333, "logps/rejected": -499.08685302734375, "loss": 0.3423, "rewards/chosen": 0.8437496821085612, "rewards/margins": 3.3870946566263833, "rewards/rejected": -2.5433449745178223, "step": 11780 }, { "epoch": 0.6244401452309649, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34043676.0, "logits/rejected": -51158392.0, "logps/chosen": -343.0714111328125, "logps/rejected": -450.595947265625, "loss": 0.2669, "rewards/chosen": 0.40100958943367004, "rewards/margins": 3.460596591234207, "rewards/rejected": -3.059587001800537, "step": 11781 }, { "epoch": 0.6244931492327671, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54209648.0, "logits/rejected": -12725073.0, "logps/chosen": -384.4076843261719, "logps/rejected": -148.31661987304688, "loss": 0.2868, "rewards/chosen": 0.3971729278564453, "rewards/margins": 2.3368101119995117, "rewards/rejected": -1.9396371841430664, "step": 11782 }, { "epoch": 0.6245461532345692, "grad_norm": 55.0, "kl": 2.133829116821289, "learning_rate": 5e-07, "logits/chosen": -33054950.85714286, "logits/rejected": 1383145.75, "logps/chosen": -330.595458984375, "logps/rejected": -21.913448333740234, "loss": 0.4685, "rewards/chosen": 0.2256168978554862, "rewards/margins": 1.1507467882973808, "rewards/rejected": -0.9251298904418945, "step": 11783 }, { "epoch": 0.6245991572363714, "grad_norm": 70.5, "kl": 3.3466644287109375, "learning_rate": 5e-07, "logits/chosen": -46200128.0, "logits/rejected": -23885758.0, "logps/chosen": -597.2545166015625, "logps/rejected": -136.64715576171875, "loss": 0.2789, "rewards/chosen": 1.1877789497375488, "rewards/margins": 2.1722391843795776, "rewards/rejected": -0.9844602346420288, "step": 11784 }, { "epoch": 0.6246521612381735, "grad_norm": 50.25, "kl": 1.8723955154418945, "learning_rate": 5e-07, "logits/chosen": -13014013.6, "logits/rejected": -4443786.0, "logps/chosen": -256.6865478515625, "logps/rejected": -424.7596028645833, "loss": 0.3546, "rewards/chosen": 0.375242280960083, "rewards/margins": 1.843442964553833, "rewards/rejected": -1.46820068359375, "step": 11785 }, { "epoch": 0.6247051652399757, "grad_norm": 49.25, "kl": 0.7304134368896484, "learning_rate": 5e-07, "logits/chosen": 5519286.0, "logits/rejected": -26735468.0, "logps/chosen": -245.04489135742188, "logps/rejected": -335.3962097167969, "loss": 0.273, "rewards/chosen": 0.6172406673431396, "rewards/margins": 2.6314291954040527, "rewards/rejected": -2.014188528060913, "step": 11786 }, { "epoch": 0.6247581692417777, "grad_norm": 45.75, "kl": 0.5593404769897461, "learning_rate": 5e-07, "logits/chosen": -12593781.333333334, "logits/rejected": 7463115.2, "logps/chosen": -319.4005940755208, "logps/rejected": -332.5948486328125, "loss": 0.2067, "rewards/chosen": 1.4144633611043294, "rewards/margins": 2.7817245801289876, "rewards/rejected": -1.3672612190246582, "step": 11787 }, { "epoch": 0.6248111732435799, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -114417760.0, "logits/rejected": -10012667.0, "logps/chosen": -273.2186279296875, "logps/rejected": -196.202880859375, "loss": 0.282, "rewards/chosen": 0.2290097326040268, "rewards/margins": 2.233462914824486, "rewards/rejected": -2.004453182220459, "step": 11788 }, { "epoch": 0.624864177245382, "grad_norm": 37.75, "kl": 1.3364982604980469, "learning_rate": 5e-07, "logits/chosen": -24803891.2, "logits/rejected": -47020096.0, "logps/chosen": -217.25556640625, "logps/rejected": -329.59507242838544, "loss": 0.3075, "rewards/chosen": 0.29531474113464357, "rewards/margins": 3.343149773279826, "rewards/rejected": -3.047835032145182, "step": 11789 }, { "epoch": 0.6249171812471842, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41230112.0, "logits/rejected": -7426827.0, "logps/chosen": -320.13531494140625, "logps/rejected": -202.10025024414062, "loss": 0.2006, "rewards/chosen": 1.0063918828964233, "rewards/margins": 3.49016010761261, "rewards/rejected": -2.4837682247161865, "step": 11790 }, { "epoch": 0.6249701852489863, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 55212536.0, "logits/rejected": -56169228.0, "logps/chosen": -370.4530029296875, "logps/rejected": -403.6087951660156, "loss": 0.2547, "rewards/chosen": 0.7077628970146179, "rewards/margins": 2.8276321291923523, "rewards/rejected": -2.1198692321777344, "step": 11791 }, { "epoch": 0.6250231892507885, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44850220.0, "logits/rejected": -879664.0, "logps/chosen": -364.77117919921875, "logps/rejected": -467.02740478515625, "loss": 0.257, "rewards/chosen": 0.6694795489311218, "rewards/margins": 2.6966726183891296, "rewards/rejected": -2.027193069458008, "step": 11792 }, { "epoch": 0.6250761932525906, "grad_norm": 53.25, "kl": 1.3982563018798828, "learning_rate": 5e-07, "logits/chosen": -12890301.6, "logits/rejected": 10864749.333333334, "logps/chosen": -358.2863525390625, "logps/rejected": -292.1711832682292, "loss": 0.2637, "rewards/chosen": 1.077493953704834, "rewards/margins": 2.9482237180074056, "rewards/rejected": -1.8707297643025715, "step": 11793 }, { "epoch": 0.6251291972543928, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54868416.0, "logits/rejected": -16296585.0, "logps/chosen": -190.1243896484375, "logps/rejected": -321.6123962402344, "loss": 0.268, "rewards/chosen": -0.052481845021247864, "rewards/margins": 3.7518585175275803, "rewards/rejected": -3.804340362548828, "step": 11794 }, { "epoch": 0.6251822012561948, "grad_norm": 46.25, "kl": 1.595010757446289, "learning_rate": 5e-07, "logits/chosen": -48725414.4, "logits/rejected": -16953430.666666668, "logps/chosen": -397.6931884765625, "logps/rejected": -233.88883463541666, "loss": 0.268, "rewards/chosen": 0.9823348045349121, "rewards/margins": 3.1395438830057776, "rewards/rejected": -2.1572090784708657, "step": 11795 }, { "epoch": 0.625235205257997, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36946320.0, "logits/rejected": -17841017.333333332, "logps/chosen": -238.18614196777344, "logps/rejected": -375.4956868489583, "loss": 0.2554, "rewards/chosen": -0.27581024169921875, "rewards/margins": 2.1702818870544434, "rewards/rejected": -2.446092128753662, "step": 11796 }, { "epoch": 0.6252882092597991, "grad_norm": 51.0, "kl": 1.0071563720703125, "learning_rate": 5e-07, "logits/chosen": -29326706.666666668, "logits/rejected": 1036204.2, "logps/chosen": -196.21785481770834, "logps/rejected": -237.792138671875, "loss": 0.247, "rewards/chosen": 0.7074347337086996, "rewards/margins": 2.6227869828542074, "rewards/rejected": -1.9153522491455077, "step": 11797 }, { "epoch": 0.6253412132616013, "grad_norm": 59.0, "kl": 0.24227523803710938, "learning_rate": 5e-07, "logits/chosen": -50485664.0, "logits/rejected": -27630652.0, "logps/chosen": -474.5308837890625, "logps/rejected": -633.3455810546875, "loss": 0.3781, "rewards/chosen": 0.14505602916081747, "rewards/margins": 2.78838183482488, "rewards/rejected": -2.6433258056640625, "step": 11798 }, { "epoch": 0.6253942172634034, "grad_norm": 41.25, "kl": 0.6303482055664062, "learning_rate": 5e-07, "logits/chosen": 554727.375, "logits/rejected": -50758893.71428572, "logps/chosen": -15.846336364746094, "logps/rejected": -504.48660714285717, "loss": 0.1679, "rewards/chosen": 0.3112795054912567, "rewards/margins": 2.6997137793472836, "rewards/rejected": -2.388434273856027, "step": 11799 }, { "epoch": 0.6254472212652056, "grad_norm": 48.0, "kl": 0.3867645263671875, "learning_rate": 5e-07, "logits/chosen": -34259596.0, "logits/rejected": -18678844.0, "logps/chosen": -450.4712219238281, "logps/rejected": -287.037109375, "loss": 0.2334, "rewards/chosen": 0.6880332827568054, "rewards/margins": 3.571736752986908, "rewards/rejected": -2.8837034702301025, "step": 11800 }, { "epoch": 0.6255002252670077, "grad_norm": 30.375, "kl": 0.0015125274658203125, "learning_rate": 5e-07, "logits/chosen": -98838.25, "logits/rejected": -8035064.0, "logps/chosen": -99.03651428222656, "logps/rejected": -335.3631591796875, "loss": 0.2812, "rewards/chosen": 0.2126203030347824, "rewards/margins": 2.4864850491285324, "rewards/rejected": -2.27386474609375, "step": 11801 }, { "epoch": 0.6255532292688097, "grad_norm": 45.25, "kl": 2.1780738830566406, "learning_rate": 5e-07, "logits/chosen": -13244139.0, "logits/rejected": -33052486.0, "logps/chosen": -207.50631713867188, "logps/rejected": -409.99066162109375, "loss": 0.309, "rewards/chosen": 0.39439040422439575, "rewards/margins": 2.817010223865509, "rewards/rejected": -2.4226198196411133, "step": 11802 }, { "epoch": 0.6256062332706119, "grad_norm": 42.5, "kl": 0.2897043228149414, "learning_rate": 5e-07, "logits/chosen": -28878313.6, "logits/rejected": -53772357.333333336, "logps/chosen": -289.1311279296875, "logps/rejected": -534.167236328125, "loss": 0.3386, "rewards/chosen": 0.12241637706756592, "rewards/margins": 2.8131248553593955, "rewards/rejected": -2.6907084782918296, "step": 11803 }, { "epoch": 0.625659237272414, "grad_norm": 43.25, "kl": 2.6405019760131836, "learning_rate": 5e-07, "logits/chosen": -13736484.8, "logits/rejected": -27822162.666666668, "logps/chosen": -197.6486083984375, "logps/rejected": -492.1570231119792, "loss": 0.3645, "rewards/chosen": 0.4348305702209473, "rewards/margins": 2.5683522860209145, "rewards/rejected": -2.1335217157999673, "step": 11804 }, { "epoch": 0.6257122412742162, "grad_norm": 39.25, "kl": 0.05947685241699219, "learning_rate": 5e-07, "logits/chosen": -8841614.0, "logits/rejected": -3874748.0, "logps/chosen": -56.784820556640625, "logps/rejected": -411.2547607421875, "loss": 0.2486, "rewards/chosen": 0.3211140036582947, "rewards/margins": 3.717324197292328, "rewards/rejected": -3.396210193634033, "step": 11805 }, { "epoch": 0.6257652452760183, "grad_norm": 57.25, "kl": 5.394725322723389, "learning_rate": 5e-07, "logits/chosen": -14673651.42857143, "logits/rejected": -7621810.5, "logps/chosen": -250.68443080357142, "logps/rejected": -83.12537384033203, "loss": 0.3334, "rewards/chosen": 1.251889910016741, "rewards/margins": 4.484543766294207, "rewards/rejected": -3.232653856277466, "step": 11806 }, { "epoch": 0.6258182492778205, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41025560.0, "logits/rejected": -6701092.666666667, "logps/chosen": -292.038330078125, "logps/rejected": -224.15399169921875, "loss": 0.2571, "rewards/chosen": 0.07336120307445526, "rewards/margins": 1.7789131452639897, "rewards/rejected": -1.7055519421895344, "step": 11807 }, { "epoch": 0.6258712532796226, "grad_norm": 47.25, "kl": 1.078125, "learning_rate": 5e-07, "logits/chosen": -30179870.0, "logits/rejected": -79243120.0, "logps/chosen": -327.18792724609375, "logps/rejected": -277.4753723144531, "loss": 0.3189, "rewards/chosen": 0.09918728470802307, "rewards/margins": 2.237446218729019, "rewards/rejected": -2.138258934020996, "step": 11808 }, { "epoch": 0.6259242572814248, "grad_norm": 45.5, "kl": 0.5175933837890625, "learning_rate": 5e-07, "logits/chosen": -4363461.0, "logits/rejected": -32988032.0, "logps/chosen": -136.62455240885416, "logps/rejected": -275.8767578125, "loss": 0.3812, "rewards/chosen": -0.43827299276987713, "rewards/margins": 0.9421723922093708, "rewards/rejected": -1.380445384979248, "step": 11809 }, { "epoch": 0.6259772612832268, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47010944.0, "logits/rejected": -27160908.0, "logps/chosen": -352.17828369140625, "logps/rejected": -303.03240966796875, "loss": 0.3775, "rewards/chosen": -0.47129976749420166, "rewards/margins": 1.74191415309906, "rewards/rejected": -2.2132139205932617, "step": 11810 }, { "epoch": 0.626030265285029, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39888760.0, "logits/rejected": -26210410.0, "logps/chosen": -263.8243713378906, "logps/rejected": -472.7229919433594, "loss": 0.3357, "rewards/chosen": 0.14130735397338867, "rewards/margins": 2.144115686416626, "rewards/rejected": -2.0028083324432373, "step": 11811 }, { "epoch": 0.6260832692868311, "grad_norm": 43.25, "kl": 0.8982219696044922, "learning_rate": 5e-07, "logits/chosen": -18694478.666666668, "logits/rejected": -26121235.2, "logps/chosen": -283.0685221354167, "logps/rejected": -367.348828125, "loss": 0.2224, "rewards/chosen": 0.5792121092478434, "rewards/margins": 2.7947205702463784, "rewards/rejected": -2.215508460998535, "step": 11812 }, { "epoch": 0.6261362732886333, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50606704.0, "logits/rejected": -34938333.333333336, "logps/chosen": -455.9426574707031, "logps/rejected": -371.3566487630208, "loss": 0.2056, "rewards/chosen": -0.8160400390625, "rewards/margins": 2.857945124308268, "rewards/rejected": -3.673985163370768, "step": 11813 }, { "epoch": 0.6261892772904354, "grad_norm": 67.0, "kl": 0.5813446044921875, "learning_rate": 5e-07, "logits/chosen": -26919193.6, "logits/rejected": -19720573.333333332, "logps/chosen": -394.8721435546875, "logps/rejected": -513.0122884114584, "loss": 0.3847, "rewards/chosen": -0.07673094868659973, "rewards/margins": 2.382297295331955, "rewards/rejected": -2.4590282440185547, "step": 11814 }, { "epoch": 0.6262422812922376, "grad_norm": 24.25, "kl": 0.5342540740966797, "learning_rate": 5e-07, "logits/chosen": 4560302.0, "logits/rejected": -45283477.333333336, "logps/chosen": -164.5834197998047, "logps/rejected": -287.7920735677083, "loss": 0.1441, "rewards/chosen": 0.8803198337554932, "rewards/margins": 4.290735801060995, "rewards/rejected": -3.4104159673055015, "step": 11815 }, { "epoch": 0.6262952852940397, "grad_norm": 42.0, "kl": 2.294178009033203, "learning_rate": 5e-07, "logits/chosen": -62707283.2, "logits/rejected": 15915265.333333334, "logps/chosen": -169.0270751953125, "logps/rejected": -229.64510091145834, "loss": 0.4195, "rewards/chosen": -0.11127946376800538, "rewards/margins": 2.218409244219462, "rewards/rejected": -2.3296887079874673, "step": 11816 }, { "epoch": 0.6263482892958419, "grad_norm": 32.75, "kl": 0.36940765380859375, "learning_rate": 5e-07, "logits/chosen": 8582820.0, "logits/rejected": -16216100.57142857, "logps/chosen": -6.585197448730469, "logps/rejected": -180.59872000558036, "loss": 0.2262, "rewards/chosen": 1.2114170789718628, "rewards/margins": 2.8478355578013828, "rewards/rejected": -1.6364184788295202, "step": 11817 }, { "epoch": 0.6264012932976439, "grad_norm": 60.75, "kl": 1.3324432373046875, "learning_rate": 5e-07, "logits/chosen": -75458246.4, "logits/rejected": -946876.0833333334, "logps/chosen": -549.96982421875, "logps/rejected": -252.74613444010416, "loss": 0.3028, "rewards/chosen": 0.818198299407959, "rewards/margins": 2.577824370066325, "rewards/rejected": -1.759626070658366, "step": 11818 }, { "epoch": 0.6264542972994461, "grad_norm": 35.5, "kl": 1.4879169464111328, "learning_rate": 5e-07, "logits/chosen": -26180225.6, "logits/rejected": -22646248.0, "logps/chosen": -203.96612548828125, "logps/rejected": -286.55938720703125, "loss": 0.3436, "rewards/chosen": 0.23969719409942628, "rewards/margins": 3.4661665519078575, "rewards/rejected": -3.226469357808431, "step": 11819 }, { "epoch": 0.6265073013012482, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21019700.0, "logits/rejected": 2053172.0, "logps/chosen": -320.4998779296875, "logps/rejected": -109.5089340209961, "loss": 0.2686, "rewards/chosen": 0.36627116799354553, "rewards/margins": 2.7929557859897614, "rewards/rejected": -2.426684617996216, "step": 11820 }, { "epoch": 0.6265603053030504, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 588504.9166666666, "logits/rejected": -4439308.4, "logps/chosen": -344.7049153645833, "logps/rejected": -186.160400390625, "loss": 0.2075, "rewards/chosen": 0.7051892280578613, "rewards/margins": 2.8395142555236816, "rewards/rejected": -2.1343250274658203, "step": 11821 }, { "epoch": 0.6266133093048525, "grad_norm": 43.25, "kl": 1.067422866821289, "learning_rate": 5e-07, "logits/chosen": -20540403.2, "logits/rejected": -4227938.0, "logps/chosen": -256.574853515625, "logps/rejected": -104.5263163248698, "loss": 0.3761, "rewards/chosen": 0.16593157052993773, "rewards/margins": 2.4126938462257383, "rewards/rejected": -2.246762275695801, "step": 11822 }, { "epoch": 0.6266663133066547, "grad_norm": 37.5, "kl": 0.21226119995117188, "learning_rate": 5e-07, "logits/chosen": -40786712.0, "logits/rejected": -19794014.0, "logps/chosen": -256.8638610839844, "logps/rejected": -219.1834716796875, "loss": 0.2905, "rewards/chosen": 0.27627336978912354, "rewards/margins": 2.3209460973739624, "rewards/rejected": -2.044672727584839, "step": 11823 }, { "epoch": 0.6267193173084568, "grad_norm": 46.0, "kl": 1.6018123626708984, "learning_rate": 5e-07, "logits/chosen": 463735.875, "logits/rejected": -11594473.0, "logps/chosen": -241.79537963867188, "logps/rejected": -299.9333801269531, "loss": 0.292, "rewards/chosen": 1.0222829580307007, "rewards/margins": 2.69007670879364, "rewards/rejected": -1.6677937507629395, "step": 11824 }, { "epoch": 0.626772321310259, "grad_norm": 38.5, "kl": 2.580556869506836, "learning_rate": 5e-07, "logits/chosen": -10706070.0, "logits/rejected": -8934405.0, "logps/chosen": -163.04318237304688, "logps/rejected": -266.1387023925781, "loss": 0.3234, "rewards/chosen": 0.5381374955177307, "rewards/margins": 2.448994219303131, "rewards/rejected": -1.9108567237854004, "step": 11825 }, { "epoch": 0.626825325312061, "grad_norm": 50.5, "kl": 0.17725372314453125, "learning_rate": 5e-07, "logits/chosen": -19956865.6, "logits/rejected": -12804608.0, "logps/chosen": -422.8279296875, "logps/rejected": -420.3201090494792, "loss": 0.2743, "rewards/chosen": 0.5302566528320313, "rewards/margins": 3.221044921875, "rewards/rejected": -2.6907882690429688, "step": 11826 }, { "epoch": 0.6268783293138632, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63689338.666666664, "logits/rejected": -22059049.6, "logps/chosen": -489.12548828125, "logps/rejected": -290.056103515625, "loss": 0.2441, "rewards/chosen": 0.7757080396016439, "rewards/margins": 2.4423446973164875, "rewards/rejected": -1.6666366577148437, "step": 11827 }, { "epoch": 0.6269313333156653, "grad_norm": 43.0, "kl": 1.9566268920898438, "learning_rate": 5e-07, "logits/chosen": -29454548.0, "logits/rejected": -29910124.0, "logps/chosen": -463.5111083984375, "logps/rejected": -480.1925048828125, "loss": 0.1877, "rewards/chosen": 1.6718246936798096, "rewards/margins": 4.300335168838501, "rewards/rejected": -2.6285104751586914, "step": 11828 }, { "epoch": 0.6269843373174675, "grad_norm": 84.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55718233.6, "logits/rejected": -2297569.1666666665, "logps/chosen": -329.1521240234375, "logps/rejected": -185.72119140625, "loss": 0.4461, "rewards/chosen": -0.1791803002357483, "rewards/margins": 0.7698612252871195, "rewards/rejected": -0.9490415255228678, "step": 11829 }, { "epoch": 0.6270373413192696, "grad_norm": 44.5, "kl": 2.644411087036133, "learning_rate": 5e-07, "logits/chosen": -22970634.666666668, "logits/rejected": -25370594.0, "logps/chosen": -249.72576904296875, "logps/rejected": -155.16015625, "loss": 0.3403, "rewards/chosen": 0.7527976036071777, "rewards/margins": 3.7145962715148926, "rewards/rejected": -2.961798667907715, "step": 11830 }, { "epoch": 0.6270903453210718, "grad_norm": 68.5, "kl": 0.5129470825195312, "learning_rate": 5e-07, "logits/chosen": -71609120.0, "logits/rejected": -21370054.0, "logps/chosen": -697.0828857421875, "logps/rejected": -213.57638549804688, "loss": 0.304, "rewards/chosen": 0.6913681030273438, "rewards/margins": 2.0051456689834595, "rewards/rejected": -1.3137775659561157, "step": 11831 }, { "epoch": 0.6271433493228739, "grad_norm": 44.0, "kl": 0.03365898132324219, "learning_rate": 5e-07, "logits/chosen": -32737794.0, "logits/rejected": -51403736.0, "logps/chosen": -273.6874084472656, "logps/rejected": -428.7307434082031, "loss": 0.2183, "rewards/chosen": 0.5764292478561401, "rewards/margins": 3.605174422264099, "rewards/rejected": -3.028745174407959, "step": 11832 }, { "epoch": 0.627196353324676, "grad_norm": 44.75, "kl": 7.242653846740723, "learning_rate": 5e-07, "logits/chosen": -14991875.2, "logits/rejected": -41149016.0, "logps/chosen": -427.165869140625, "logps/rejected": -425.7677408854167, "loss": 0.2634, "rewards/chosen": 1.556917667388916, "rewards/margins": 4.45734198888143, "rewards/rejected": -2.900424321492513, "step": 11833 }, { "epoch": 0.6272493573264781, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8362857.0, "logits/rejected": -13922698.0, "logps/chosen": -255.02435302734375, "logps/rejected": -274.3392028808594, "loss": 0.3634, "rewards/chosen": -0.06875629723072052, "rewards/margins": 1.7227987498044968, "rewards/rejected": -1.7915550470352173, "step": 11834 }, { "epoch": 0.6273023613282803, "grad_norm": 41.25, "kl": 3.560361862182617, "learning_rate": 5e-07, "logits/chosen": -4685594.666666667, "logits/rejected": -10571201.6, "logps/chosen": -534.6256917317709, "logps/rejected": -258.374609375, "loss": 0.2318, "rewards/chosen": 1.411160151163737, "rewards/margins": 2.8792868296305336, "rewards/rejected": -1.4681266784667968, "step": 11835 }, { "epoch": 0.6273553653300824, "grad_norm": 64.0, "kl": 2.3327102661132812, "learning_rate": 5e-07, "logits/chosen": -5687641.6, "logits/rejected": -28748530.666666668, "logps/chosen": -481.360595703125, "logps/rejected": -247.0697021484375, "loss": 0.2765, "rewards/chosen": 0.9114972114562988, "rewards/margins": 3.277326202392578, "rewards/rejected": -2.3658289909362793, "step": 11836 }, { "epoch": 0.6274083693318846, "grad_norm": 31.0, "kl": 2.008373260498047, "learning_rate": 5e-07, "logits/chosen": -64078368.0, "logits/rejected": -22322924.8, "logps/chosen": -124.1393330891927, "logps/rejected": -333.72900390625, "loss": 0.2319, "rewards/chosen": 0.658876895904541, "rewards/margins": 3.2733736991882325, "rewards/rejected": -2.6144968032836915, "step": 11837 }, { "epoch": 0.6274613733336867, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1642204.0, "logits/rejected": -26898960.0, "logps/chosen": -424.8406066894531, "logps/rejected": -184.06526692708334, "loss": 0.2687, "rewards/chosen": 0.25120699405670166, "rewards/margins": 2.0390208164850874, "rewards/rejected": -1.7878138224283855, "step": 11838 }, { "epoch": 0.6275143773354889, "grad_norm": 49.25, "kl": 1.9648551940917969, "learning_rate": 5e-07, "logits/chosen": -8995642.0, "logits/rejected": -18499336.0, "logps/chosen": -427.7284342447917, "logps/rejected": -246.2329833984375, "loss": 0.1767, "rewards/chosen": 1.594895362854004, "rewards/margins": 4.516138648986816, "rewards/rejected": -2.9212432861328126, "step": 11839 }, { "epoch": 0.627567381337291, "grad_norm": 43.25, "kl": 0.250396728515625, "learning_rate": 5e-07, "logits/chosen": -24185312.0, "logits/rejected": -34655304.0, "logps/chosen": -519.4344482421875, "logps/rejected": -375.2249755859375, "loss": 0.1335, "rewards/chosen": 1.996131181716919, "rewards/margins": 4.497276067733765, "rewards/rejected": -2.5011448860168457, "step": 11840 }, { "epoch": 0.6276203853390931, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12268456.0, "logits/rejected": -29532840.0, "logps/chosen": -155.46299743652344, "logps/rejected": -206.02249145507812, "loss": 0.3324, "rewards/chosen": 0.00064096599817276, "rewards/margins": 1.8209835067391396, "rewards/rejected": -1.8203425407409668, "step": 11841 }, { "epoch": 0.6276733893408952, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8693476.0, "logits/rejected": -23502530.666666668, "logps/chosen": -44.79744338989258, "logps/rejected": -279.178466796875, "loss": 0.2477, "rewards/chosen": 0.2747228741645813, "rewards/margins": 2.333819051583608, "rewards/rejected": -2.059096177419027, "step": 11842 }, { "epoch": 0.6277263933426974, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6047550.666666667, "logits/rejected": -9364858.4, "logps/chosen": -256.49680582682294, "logps/rejected": -266.0706787109375, "loss": 0.2789, "rewards/chosen": 0.5855941772460938, "rewards/margins": 2.012711238861084, "rewards/rejected": -1.4271170616149902, "step": 11843 }, { "epoch": 0.6277793973444995, "grad_norm": 46.5, "kl": 1.0952281951904297, "learning_rate": 5e-07, "logits/chosen": -9454218.666666666, "logits/rejected": -29310422.4, "logps/chosen": -184.5306396484375, "logps/rejected": -283.984521484375, "loss": 0.2937, "rewards/chosen": 0.17716777324676514, "rewards/margins": 1.8174410581588745, "rewards/rejected": -1.6402732849121093, "step": 11844 }, { "epoch": 0.6278324013463017, "grad_norm": 47.0, "kl": 4.088384628295898, "learning_rate": 5e-07, "logits/chosen": -17489698.666666668, "logits/rejected": -36885496.0, "logps/chosen": -206.3859659830729, "logps/rejected": -662.0350952148438, "loss": 0.4054, "rewards/chosen": 0.28863441944122314, "rewards/margins": 4.794498562812805, "rewards/rejected": -4.505864143371582, "step": 11845 }, { "epoch": 0.6278854053481038, "grad_norm": 39.5, "kl": 0.20331573486328125, "learning_rate": 5e-07, "logits/chosen": -48145781.333333336, "logits/rejected": -29094617.6, "logps/chosen": -155.21065266927084, "logps/rejected": -255.5774169921875, "loss": 0.2706, "rewards/chosen": -0.01888154447078705, "rewards/margins": 1.9893009811639786, "rewards/rejected": -2.0081825256347656, "step": 11846 }, { "epoch": 0.627938409349906, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31792688.0, "logits/rejected": -24785325.333333332, "logps/chosen": -93.67682647705078, "logps/rejected": -369.672119140625, "loss": 0.2509, "rewards/chosen": -0.26495200395584106, "rewards/margins": 2.316056748231252, "rewards/rejected": -2.5810087521870932, "step": 11847 }, { "epoch": 0.627991413351708, "grad_norm": 39.25, "kl": 1.751084327697754, "learning_rate": 5e-07, "logits/chosen": 3806687.0, "logits/rejected": -22008469.333333332, "logps/chosen": -56.95254135131836, "logps/rejected": -287.53704833984375, "loss": 0.1893, "rewards/chosen": 1.0836337804794312, "rewards/margins": 2.9120289882024126, "rewards/rejected": -1.8283952077229817, "step": 11848 }, { "epoch": 0.6280444173535102, "grad_norm": 41.5, "kl": 1.206695556640625, "learning_rate": 5e-07, "logits/chosen": -22841380.8, "logits/rejected": 33410976.0, "logps/chosen": -168.30120849609375, "logps/rejected": -318.0804850260417, "loss": 0.239, "rewards/chosen": 0.9410102844238282, "rewards/margins": 3.6103796641031902, "rewards/rejected": -2.669369379679362, "step": 11849 }, { "epoch": 0.6280974213553123, "grad_norm": 32.25, "kl": 3.722738265991211, "learning_rate": 5e-07, "logits/chosen": -9052288.666666666, "logits/rejected": -18838385.6, "logps/chosen": -253.09159342447916, "logps/rejected": -223.291845703125, "loss": 0.1081, "rewards/chosen": 1.915705680847168, "rewards/margins": 4.7096864700317385, "rewards/rejected": -2.7939807891845705, "step": 11850 }, { "epoch": 0.6281504253571145, "grad_norm": 59.25, "kl": 0.7142434120178223, "learning_rate": 5e-07, "logits/chosen": -7119602.666666667, "logits/rejected": -41873027.2, "logps/chosen": -546.5953776041666, "logps/rejected": -406.9513671875, "loss": 0.2084, "rewards/chosen": 0.46432522932688397, "rewards/margins": 3.4702931324640907, "rewards/rejected": -3.005967903137207, "step": 11851 }, { "epoch": 0.6282034293589166, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35035588.0, "logits/rejected": -29790704.0, "logps/chosen": -292.3190002441406, "logps/rejected": -219.35398864746094, "loss": 0.2975, "rewards/chosen": 0.5091183185577393, "rewards/margins": 2.5676982402801514, "rewards/rejected": -2.058579921722412, "step": 11852 }, { "epoch": 0.6282564333607187, "grad_norm": 37.75, "kl": 4.469414710998535, "learning_rate": 5e-07, "logits/chosen": -3497004.5, "logits/rejected": -36896288.0, "logps/chosen": -407.9393310546875, "logps/rejected": -339.231689453125, "loss": 0.3099, "rewards/chosen": 0.7963600754737854, "rewards/margins": 3.8090057969093323, "rewards/rejected": -3.012645721435547, "step": 11853 }, { "epoch": 0.6283094373625209, "grad_norm": 41.5, "kl": 1.6139936447143555, "learning_rate": 5e-07, "logits/chosen": -57437392.0, "logits/rejected": -11048448.0, "logps/chosen": -419.0444641113281, "logps/rejected": -245.0753631591797, "loss": 0.2021, "rewards/chosen": 0.9499936699867249, "rewards/margins": 4.054398357868195, "rewards/rejected": -3.1044046878814697, "step": 11854 }, { "epoch": 0.628362441364323, "grad_norm": 41.75, "kl": 0.9671497344970703, "learning_rate": 5e-07, "logits/chosen": -17013164.8, "logits/rejected": -10424140.0, "logps/chosen": -110.56103515625, "logps/rejected": -196.03873697916666, "loss": 0.3219, "rewards/chosen": 0.36612863540649415, "rewards/margins": 2.9550679842631022, "rewards/rejected": -2.588939348856608, "step": 11855 }, { "epoch": 0.6284154453661251, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31954310.4, "logits/rejected": -24683592.0, "logps/chosen": -228.865380859375, "logps/rejected": -247.2225341796875, "loss": 0.3992, "rewards/chosen": 0.04861176013946533, "rewards/margins": 1.2046339909235637, "rewards/rejected": -1.1560222307840984, "step": 11856 }, { "epoch": 0.6284684493679272, "grad_norm": 51.75, "kl": 4.326930999755859, "learning_rate": 5e-07, "logits/chosen": -54440089.6, "logits/rejected": -130469.66666666667, "logps/chosen": -580.537939453125, "logps/rejected": -186.353515625, "loss": 0.2356, "rewards/chosen": 1.7223871231079102, "rewards/margins": 3.459358978271484, "rewards/rejected": -1.7369718551635742, "step": 11857 }, { "epoch": 0.6285214533697294, "grad_norm": 71.5, "kl": 8.099104881286621, "learning_rate": 5e-07, "logits/chosen": -25109221.333333332, "logits/rejected": -7809106.5, "logps/chosen": -419.7283528645833, "logps/rejected": -245.80441284179688, "loss": 0.3745, "rewards/chosen": 1.4528797467549641, "rewards/margins": 2.63236924012502, "rewards/rejected": -1.1794894933700562, "step": 11858 }, { "epoch": 0.6285744573715315, "grad_norm": 72.0, "kl": 3.0578155517578125, "learning_rate": 5e-07, "logits/chosen": -57425465.6, "logits/rejected": -111948053.33333333, "logps/chosen": -761.263671875, "logps/rejected": -382.3254801432292, "loss": 0.321, "rewards/chosen": 1.1821551322937012, "rewards/margins": 2.958448092142741, "rewards/rejected": -1.7762929598490398, "step": 11859 }, { "epoch": 0.6286274613733337, "grad_norm": 47.5, "kl": 3.1731529235839844, "learning_rate": 5e-07, "logits/chosen": 1550838.4, "logits/rejected": -19056124.0, "logps/chosen": -187.976123046875, "logps/rejected": -349.225341796875, "loss": 0.326, "rewards/chosen": 1.0088748931884766, "rewards/margins": 2.846539815266927, "rewards/rejected": -1.8376649220784504, "step": 11860 }, { "epoch": 0.6286804653751358, "grad_norm": 48.0, "kl": 3.2562074661254883, "learning_rate": 5e-07, "logits/chosen": 6873785.6, "logits/rejected": -52862442.666666664, "logps/chosen": -361.028369140625, "logps/rejected": -741.2698567708334, "loss": 0.285, "rewards/chosen": 0.8364115715026855, "rewards/margins": 6.92088295618693, "rewards/rejected": -6.084471384684245, "step": 11861 }, { "epoch": 0.628733469376938, "grad_norm": 53.25, "kl": 2.955078125, "learning_rate": 5e-07, "logits/chosen": -25049885.333333332, "logits/rejected": -5750685.0, "logps/chosen": -228.98616536458334, "logps/rejected": -94.18263244628906, "loss": 0.4799, "rewards/chosen": 0.33998246987660724, "rewards/margins": 0.8365021546681721, "rewards/rejected": -0.49651968479156494, "step": 11862 }, { "epoch": 0.62878647337874, "grad_norm": 52.5, "kl": 1.4741573333740234, "learning_rate": 5e-07, "logits/chosen": -4230712.666666667, "logits/rejected": -47742236.0, "logps/chosen": -85.53883870442708, "logps/rejected": -333.5152587890625, "loss": 0.4075, "rewards/chosen": 0.5633204778035482, "rewards/margins": 0.9559390445550283, "rewards/rejected": -0.3926185667514801, "step": 11863 }, { "epoch": 0.6288394773805422, "grad_norm": 51.75, "kl": 1.480255126953125, "learning_rate": 5e-07, "logits/chosen": -54959353.6, "logits/rejected": -21657504.0, "logps/chosen": -435.36064453125, "logps/rejected": -210.2847696940104, "loss": 0.3114, "rewards/chosen": 0.692120361328125, "rewards/margins": 2.673082478841146, "rewards/rejected": -1.9809621175130208, "step": 11864 }, { "epoch": 0.6288924813823443, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36449372.0, "logits/rejected": -38081712.0, "logps/chosen": -134.41001892089844, "logps/rejected": -318.6694641113281, "loss": 0.2974, "rewards/chosen": -0.06324291229248047, "rewards/margins": 2.824930191040039, "rewards/rejected": -2.8881731033325195, "step": 11865 }, { "epoch": 0.6289454853841465, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -98759136.0, "logits/rejected": -9753553.333333334, "logps/chosen": -431.2730407714844, "logps/rejected": -255.87886555989584, "loss": 0.2619, "rewards/chosen": 0.09323807060718536, "rewards/margins": 2.154049431284269, "rewards/rejected": -2.0608113606770835, "step": 11866 }, { "epoch": 0.6289984893859486, "grad_norm": 49.75, "kl": 1.00360107421875, "learning_rate": 5e-07, "logits/chosen": -12057323.0, "logits/rejected": -57106936.0, "logps/chosen": -336.765869140625, "logps/rejected": -249.70823669433594, "loss": 0.2778, "rewards/chosen": 1.0352802276611328, "rewards/margins": 2.295767307281494, "rewards/rejected": -1.2604870796203613, "step": 11867 }, { "epoch": 0.6290514933877508, "grad_norm": 41.0, "kl": 2.7482681274414062, "learning_rate": 5e-07, "logits/chosen": -29041507.2, "logits/rejected": 3422766.0, "logps/chosen": -206.916845703125, "logps/rejected": -66.81431579589844, "loss": 0.4605, "rewards/chosen": 0.12426414489746093, "rewards/margins": 1.5516005198160807, "rewards/rejected": -1.4273363749186199, "step": 11868 }, { "epoch": 0.6291044973895529, "grad_norm": 66.5, "kl": 1.248016357421875, "learning_rate": 5e-07, "logits/chosen": -47434544.0, "logits/rejected": -47856362.666666664, "logps/chosen": -493.81669921875, "logps/rejected": -406.677978515625, "loss": 0.3442, "rewards/chosen": 0.6154163837432861, "rewards/margins": 2.3990041573842364, "rewards/rejected": -1.7835877736409504, "step": 11869 }, { "epoch": 0.6291575013913551, "grad_norm": 29.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8440634.666666666, "logits/rejected": -47266809.6, "logps/chosen": -48.12456766764323, "logps/rejected": -339.445947265625, "loss": 0.2341, "rewards/chosen": 0.5751796960830688, "rewards/margins": 2.6854650735855103, "rewards/rejected": -2.1102853775024415, "step": 11870 }, { "epoch": 0.6292105053931571, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70319193.6, "logits/rejected": -8901524.666666666, "logps/chosen": -342.126513671875, "logps/rejected": -325.9014485677083, "loss": 0.3458, "rewards/chosen": 0.058112716674804686, "rewards/margins": 2.1397062301635743, "rewards/rejected": -2.0815935134887695, "step": 11871 }, { "epoch": 0.6292635093949593, "grad_norm": 43.25, "kl": 0.9291458129882812, "learning_rate": 5e-07, "logits/chosen": -80915104.0, "logits/rejected": -6909654.4, "logps/chosen": -301.8048095703125, "logps/rejected": -435.9455078125, "loss": 0.2672, "rewards/chosen": 0.4501230716705322, "rewards/margins": 2.9101908206939697, "rewards/rejected": -2.4600677490234375, "step": 11872 }, { "epoch": 0.6293165133967614, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26526877.333333332, "logits/rejected": -12188424.8, "logps/chosen": -170.62069702148438, "logps/rejected": -256.12568359375, "loss": 0.2774, "rewards/chosen": 0.10468965768814087, "rewards/margins": 1.900848639011383, "rewards/rejected": -1.7961589813232421, "step": 11873 }, { "epoch": 0.6293695173985636, "grad_norm": 67.0, "kl": 3.9249162673950195, "learning_rate": 5e-07, "logits/chosen": -35606160.0, "logits/rejected": -12166430.666666666, "logps/chosen": -420.19404296875, "logps/rejected": -436.3607584635417, "loss": 0.353, "rewards/chosen": 0.7592789649963378, "rewards/margins": 2.7276371637980144, "rewards/rejected": -1.9683581988016765, "step": 11874 }, { "epoch": 0.6294225214003657, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -120971050.66666667, "logits/rejected": -10842705.6, "logps/chosen": -574.4676106770834, "logps/rejected": -194.88486328125, "loss": 0.1784, "rewards/chosen": 0.5767568349838257, "rewards/margins": 3.5429839849472047, "rewards/rejected": -2.966227149963379, "step": 11875 }, { "epoch": 0.6294755254021679, "grad_norm": 43.0, "kl": 0.8759613037109375, "learning_rate": 5e-07, "logits/chosen": -20132196.0, "logits/rejected": -31192885.333333332, "logps/chosen": -171.22244262695312, "logps/rejected": -356.9609375, "loss": 0.1667, "rewards/chosen": 0.6363937258720398, "rewards/margins": 3.3336870074272156, "rewards/rejected": -2.697293281555176, "step": 11876 }, { "epoch": 0.62952852940397, "grad_norm": 40.25, "kl": 2.750237464904785, "learning_rate": 5e-07, "logits/chosen": 4521293.6, "logits/rejected": 3163016.0, "logps/chosen": -42.87879638671875, "logps/rejected": -380.6627604166667, "loss": 0.3399, "rewards/chosen": 0.6671295166015625, "rewards/margins": 2.2586487134297686, "rewards/rejected": -1.5915191968282063, "step": 11877 }, { "epoch": 0.6295815334057722, "grad_norm": 45.0, "kl": 2.1952762603759766, "learning_rate": 5e-07, "logits/chosen": -4309030.0, "logits/rejected": -19622184.0, "logps/chosen": -71.87994384765625, "logps/rejected": -411.73590087890625, "loss": 0.3394, "rewards/chosen": 0.032663777470588684, "rewards/margins": 2.209833577275276, "rewards/rejected": -2.1771697998046875, "step": 11878 }, { "epoch": 0.6296345374075742, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38134269.333333336, "logits/rejected": -6854344.8, "logps/chosen": -444.1825358072917, "logps/rejected": -270.2001953125, "loss": 0.2921, "rewards/chosen": -0.27951393524805707, "rewards/margins": 1.9360792120297747, "rewards/rejected": -2.215593147277832, "step": 11879 }, { "epoch": 0.6296875414093764, "grad_norm": 54.75, "kl": 2.0018444061279297, "learning_rate": 5e-07, "logits/chosen": -35343946.666666664, "logits/rejected": 6391210.0, "logps/chosen": -252.69842529296875, "logps/rejected": -358.45880126953125, "loss": 0.427, "rewards/chosen": 0.20189601182937622, "rewards/margins": 2.502538502216339, "rewards/rejected": -2.300642490386963, "step": 11880 }, { "epoch": 0.6297405454111785, "grad_norm": 51.5, "kl": 0.6110401153564453, "learning_rate": 5e-07, "logits/chosen": 15314363.0, "logits/rejected": -7450485.0, "logps/chosen": -190.61721801757812, "logps/rejected": -190.08343505859375, "loss": 0.3891, "rewards/chosen": -0.22836384177207947, "rewards/margins": 1.3054406940937042, "rewards/rejected": -1.5338045358657837, "step": 11881 }, { "epoch": 0.6297935494129807, "grad_norm": 44.75, "kl": 1.0917892456054688, "learning_rate": 5e-07, "logits/chosen": -45160170.666666664, "logits/rejected": -40864208.0, "logps/chosen": -228.41487630208334, "logps/rejected": -501.61162109375, "loss": 0.2908, "rewards/chosen": 0.1604507565498352, "rewards/margins": 2.3148082852363587, "rewards/rejected": -2.1543575286865235, "step": 11882 }, { "epoch": 0.6298465534147828, "grad_norm": 49.75, "kl": 2.430490493774414, "learning_rate": 5e-07, "logits/chosen": -17577750.0, "logits/rejected": -10646603.0, "logps/chosen": -310.71832275390625, "logps/rejected": -322.0654602050781, "loss": 0.2775, "rewards/chosen": 1.1274149417877197, "rewards/margins": 2.739805817604065, "rewards/rejected": -1.6123908758163452, "step": 11883 }, { "epoch": 0.629899557416585, "grad_norm": 50.75, "kl": 0.8578834533691406, "learning_rate": 5e-07, "logits/chosen": -27162188.0, "logits/rejected": -3780926.75, "logps/chosen": -290.49505615234375, "logps/rejected": -372.9324951171875, "loss": 0.2407, "rewards/chosen": 0.6418880224227905, "rewards/margins": 2.9474319219589233, "rewards/rejected": -2.305543899536133, "step": 11884 }, { "epoch": 0.6299525614183871, "grad_norm": 41.5, "kl": 0.018941879272460938, "learning_rate": 5e-07, "logits/chosen": -114215040.0, "logits/rejected": -3416256.6666666665, "logps/chosen": -215.39093017578125, "logps/rejected": -304.6238199869792, "loss": 0.1351, "rewards/chosen": 1.1074107885360718, "rewards/margins": 4.021644949913025, "rewards/rejected": -2.914234161376953, "step": 11885 }, { "epoch": 0.6300055654201893, "grad_norm": 52.25, "kl": 4.736406326293945, "learning_rate": 5e-07, "logits/chosen": -18725206.0, "logits/rejected": -21544860.0, "logps/chosen": -232.98373413085938, "logps/rejected": -456.20831298828125, "loss": 0.24, "rewards/chosen": 1.5876898765563965, "rewards/margins": 4.47768235206604, "rewards/rejected": -2.8899924755096436, "step": 11886 }, { "epoch": 0.6300585694219913, "grad_norm": 100.5, "kl": 0.4523468017578125, "learning_rate": 5e-07, "logits/chosen": 17828088.0, "logits/rejected": -18369985.6, "logps/chosen": -316.5858561197917, "logps/rejected": -306.3846435546875, "loss": 0.2883, "rewards/chosen": 0.6035675207773844, "rewards/margins": 1.9766944090525307, "rewards/rejected": -1.3731268882751464, "step": 11887 }, { "epoch": 0.6301115734237935, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10010062.666666666, "logits/rejected": -16279544.0, "logps/chosen": -258.3173421223958, "logps/rejected": -377.589697265625, "loss": 0.2133, "rewards/chosen": 0.32184431950251263, "rewards/margins": 2.86047652165095, "rewards/rejected": -2.5386322021484373, "step": 11888 }, { "epoch": 0.6301645774255956, "grad_norm": 47.75, "kl": 1.3340682983398438, "learning_rate": 5e-07, "logits/chosen": -13123509.333333334, "logits/rejected": -3999359.2, "logps/chosen": -168.60514322916666, "logps/rejected": -482.123876953125, "loss": 0.2331, "rewards/chosen": 0.3293718894322713, "rewards/margins": 3.1364498694737755, "rewards/rejected": -2.807077980041504, "step": 11889 }, { "epoch": 0.6302175814273978, "grad_norm": 47.5, "kl": 3.3116846084594727, "learning_rate": 5e-07, "logits/chosen": -3948339.5, "logits/rejected": -61412016.0, "logps/chosen": -244.8121795654297, "logps/rejected": -310.5437927246094, "loss": 0.2582, "rewards/chosen": 0.5754061937332153, "rewards/margins": 3.734948992729187, "rewards/rejected": -3.1595427989959717, "step": 11890 }, { "epoch": 0.6302705854291999, "grad_norm": 48.0, "kl": 5.627686500549316, "learning_rate": 5e-07, "logits/chosen": -42249716.0, "logits/rejected": -29030880.0, "logps/chosen": -235.9101104736328, "logps/rejected": -305.3118896484375, "loss": 0.3271, "rewards/chosen": 1.4583146572113037, "rewards/margins": 3.1600453853607178, "rewards/rejected": -1.701730728149414, "step": 11891 }, { "epoch": 0.6303235894310021, "grad_norm": 57.0, "kl": 0.0627288818359375, "learning_rate": 5e-07, "logits/chosen": -36998860.0, "logits/rejected": -8500065.0, "logps/chosen": -390.45135498046875, "logps/rejected": -313.066162109375, "loss": 0.303, "rewards/chosen": 0.2298072874546051, "rewards/margins": 2.2776474058628082, "rewards/rejected": -2.047840118408203, "step": 11892 }, { "epoch": 0.6303765934328042, "grad_norm": 60.0, "kl": 2.7173995971679688, "learning_rate": 5e-07, "logits/chosen": -31470076.0, "logits/rejected": -13787817.0, "logps/chosen": -788.6488037109375, "logps/rejected": -186.16017150878906, "loss": 0.3108, "rewards/chosen": 1.203221321105957, "rewards/margins": 2.7785801887512207, "rewards/rejected": -1.5753588676452637, "step": 11893 }, { "epoch": 0.6304295974346064, "grad_norm": 29.25, "kl": 1.3773365020751953, "learning_rate": 5e-07, "logits/chosen": -1127110.875, "logits/rejected": -42810816.0, "logps/chosen": -82.17642211914062, "logps/rejected": -496.5882263183594, "loss": 0.2451, "rewards/chosen": 0.7847728729248047, "rewards/margins": 3.746506690979004, "rewards/rejected": -2.961733818054199, "step": 11894 }, { "epoch": 0.6304826014364084, "grad_norm": 45.25, "kl": 1.2289400100708008, "learning_rate": 5e-07, "logits/chosen": -5591662.5, "logits/rejected": -68282800.0, "logps/chosen": -237.1277618408203, "logps/rejected": -612.5054931640625, "loss": 0.2945, "rewards/chosen": 0.13640546798706055, "rewards/margins": 2.9296762943267822, "rewards/rejected": -2.7932708263397217, "step": 11895 }, { "epoch": 0.6305356054382106, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39033397.333333336, "logits/rejected": 2564024.6, "logps/chosen": -413.5293375651042, "logps/rejected": -171.4321533203125, "loss": 0.1902, "rewards/chosen": 0.9502410888671875, "rewards/margins": 3.3663721084594727, "rewards/rejected": -2.416131019592285, "step": 11896 }, { "epoch": 0.6305886094400127, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75302890.66666667, "logits/rejected": -14825812.8, "logps/chosen": -349.6932373046875, "logps/rejected": -224.4064208984375, "loss": 0.2609, "rewards/chosen": 0.26790008942286175, "rewards/margins": 2.0540460626284283, "rewards/rejected": -1.7861459732055665, "step": 11897 }, { "epoch": 0.6306416134418149, "grad_norm": 50.25, "kl": 0.7718353271484375, "learning_rate": 5e-07, "logits/chosen": -76054856.0, "logits/rejected": -51653354.666666664, "logps/chosen": -338.71112060546875, "logps/rejected": -442.4195963541667, "loss": 0.2282, "rewards/chosen": 0.0319976806640625, "rewards/margins": 2.5672364234924316, "rewards/rejected": -2.535238742828369, "step": 11898 }, { "epoch": 0.630694617443617, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44426936.0, "logits/rejected": 7411259.333333333, "logps/chosen": -930.859130859375, "logps/rejected": -428.8202311197917, "loss": 0.2412, "rewards/chosen": 1.8054627180099487, "rewards/margins": 3.90787144502004, "rewards/rejected": -2.1024087270100913, "step": 11899 }, { "epoch": 0.6307476214454192, "grad_norm": 37.75, "kl": 1.5785675048828125, "learning_rate": 5e-07, "logits/chosen": -1496369.6666666667, "logits/rejected": -30241792.0, "logps/chosen": -212.28861490885416, "logps/rejected": -309.0919189453125, "loss": 0.2154, "rewards/chosen": 0.4866897662480672, "rewards/margins": 3.4631721576054892, "rewards/rejected": -2.976482391357422, "step": 11900 }, { "epoch": 0.6308006254472213, "grad_norm": 67.0, "kl": 2.437419891357422, "learning_rate": 5e-07, "logits/chosen": -50957124.0, "logits/rejected": -64820800.0, "logps/chosen": -449.650634765625, "logps/rejected": -594.7671712239584, "loss": 0.1626, "rewards/chosen": 1.500625729560852, "rewards/margins": 3.8229647874832153, "rewards/rejected": -2.3223390579223633, "step": 11901 }, { "epoch": 0.6308536294490235, "grad_norm": 45.0, "kl": 0.6530914306640625, "learning_rate": 5e-07, "logits/chosen": -21547441.333333332, "logits/rejected": -24318297.6, "logps/chosen": -186.6554158528646, "logps/rejected": -356.904541015625, "loss": 0.228, "rewards/chosen": 0.6760756174723307, "rewards/margins": 3.1617819468180337, "rewards/rejected": -2.485706329345703, "step": 11902 }, { "epoch": 0.6309066334508255, "grad_norm": 45.75, "kl": 1.184617042541504, "learning_rate": 5e-07, "logits/chosen": -12967694.666666666, "logits/rejected": 2162795.0, "logps/chosen": -98.68739827473958, "logps/rejected": -659.433837890625, "loss": 0.3883, "rewards/chosen": 0.22522260745366415, "rewards/margins": 2.840472241242727, "rewards/rejected": -2.6152496337890625, "step": 11903 }, { "epoch": 0.6309596374526276, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56005781.333333336, "logits/rejected": -11920685.6, "logps/chosen": -201.0357666015625, "logps/rejected": -103.0899658203125, "loss": 0.2244, "rewards/chosen": 0.21650469303131104, "rewards/margins": 2.86846239566803, "rewards/rejected": -2.651957702636719, "step": 11904 }, { "epoch": 0.6310126414544298, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 617774.5625, "logits/rejected": 172170276.57142857, "logps/chosen": -25.565866470336914, "logps/rejected": -412.93858119419644, "loss": 0.1172, "rewards/chosen": 1.1104856729507446, "rewards/margins": 3.8527173144476756, "rewards/rejected": -2.742231641496931, "step": 11905 }, { "epoch": 0.6310656454562319, "grad_norm": 37.5, "kl": 1.4767341613769531, "learning_rate": 5e-07, "logits/chosen": 6280208.666666667, "logits/rejected": -48944851.2, "logps/chosen": -240.48223876953125, "logps/rejected": -517.72998046875, "loss": 0.1976, "rewards/chosen": 0.45182255903879803, "rewards/margins": 3.430656520525614, "rewards/rejected": -2.9788339614868162, "step": 11906 }, { "epoch": 0.6311186494580341, "grad_norm": 77.0, "kl": 3.664119243621826, "learning_rate": 5e-07, "logits/chosen": -31753280.0, "logits/rejected": -5432098.666666667, "logps/chosen": -426.94775390625, "logps/rejected": -196.1137491861979, "loss": 0.2817, "rewards/chosen": 0.9279043197631835, "rewards/margins": 3.9297121683756506, "rewards/rejected": -3.0018078486124673, "step": 11907 }, { "epoch": 0.6311716534598362, "grad_norm": 51.75, "kl": 4.522564888000488, "learning_rate": 5e-07, "logits/chosen": -11320313.333333334, "logits/rejected": 948944.25, "logps/chosen": -303.4046630859375, "logps/rejected": -122.13372039794922, "loss": 0.2876, "rewards/chosen": 1.201579252878825, "rewards/margins": 2.8523055712382, "rewards/rejected": -1.650726318359375, "step": 11908 }, { "epoch": 0.6312246574616384, "grad_norm": 45.25, "kl": 0.45147705078125, "learning_rate": 5e-07, "logits/chosen": 51092410.666666664, "logits/rejected": -32809468.8, "logps/chosen": -398.3828938802083, "logps/rejected": -341.6182373046875, "loss": 0.1801, "rewards/chosen": 1.024871826171875, "rewards/margins": 3.631427764892578, "rewards/rejected": -2.606555938720703, "step": 11909 }, { "epoch": 0.6312776614634404, "grad_norm": 39.5, "kl": 0.23449325561523438, "learning_rate": 5e-07, "logits/chosen": -71772160.0, "logits/rejected": -24514482.0, "logps/chosen": -285.5009460449219, "logps/rejected": -425.80316162109375, "loss": 0.298, "rewards/chosen": 0.030925743281841278, "rewards/margins": 2.4834783002734184, "rewards/rejected": -2.452552556991577, "step": 11910 }, { "epoch": 0.6313306654652426, "grad_norm": 33.5, "kl": 0.6901664733886719, "learning_rate": 5e-07, "logits/chosen": -8732076.0, "logits/rejected": -10089088.0, "logps/chosen": -118.1087417602539, "logps/rejected": -270.2382507324219, "loss": 0.2349, "rewards/chosen": 0.5628926157951355, "rewards/margins": 3.123573958873749, "rewards/rejected": -2.5606813430786133, "step": 11911 }, { "epoch": 0.6313836694670447, "grad_norm": 43.75, "kl": 3.160459041595459, "learning_rate": 5e-07, "logits/chosen": -19804152.0, "logits/rejected": -33887492.0, "logps/chosen": -253.9029541015625, "logps/rejected": -308.49737548828125, "loss": 0.3751, "rewards/chosen": 0.4618813991546631, "rewards/margins": 3.0350964069366455, "rewards/rejected": -2.5732150077819824, "step": 11912 }, { "epoch": 0.6314366734688469, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35696788.0, "logits/rejected": -49241260.0, "logps/chosen": -262.8460998535156, "logps/rejected": -576.0017700195312, "loss": 0.2617, "rewards/chosen": 0.14874133467674255, "rewards/margins": 3.6835552155971527, "rewards/rejected": -3.53481388092041, "step": 11913 }, { "epoch": 0.631489677470649, "grad_norm": 66.0, "kl": 2.5683250427246094, "learning_rate": 5e-07, "logits/chosen": -37104186.666666664, "logits/rejected": 420846.3125, "logps/chosen": -341.0250651041667, "logps/rejected": -232.70843505859375, "loss": 0.4022, "rewards/chosen": 0.4553350607554118, "rewards/margins": 1.949342171351115, "rewards/rejected": -1.4940071105957031, "step": 11914 }, { "epoch": 0.6315426814724512, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33332104.0, "logits/rejected": -32314588.8, "logps/chosen": -186.20475260416666, "logps/rejected": -332.079052734375, "loss": 0.2724, "rewards/chosen": 0.014070451259613037, "rewards/margins": 1.9990867018699645, "rewards/rejected": -1.9850162506103515, "step": 11915 }, { "epoch": 0.6315956854742533, "grad_norm": 51.25, "kl": 0.4965400695800781, "learning_rate": 5e-07, "logits/chosen": -35730323.2, "logits/rejected": -7656620.0, "logps/chosen": -166.032470703125, "logps/rejected": -218.89925130208334, "loss": 0.2934, "rewards/chosen": 0.6005398750305175, "rewards/margins": 2.5602390289306642, "rewards/rejected": -1.9596991539001465, "step": 11916 }, { "epoch": 0.6316486894760555, "grad_norm": 47.75, "kl": 2.8376731872558594, "learning_rate": 5e-07, "logits/chosen": -6408057.5, "logits/rejected": -15043356.0, "logps/chosen": -273.3556213378906, "logps/rejected": -189.85592651367188, "loss": 0.2585, "rewards/chosen": 1.277223825454712, "rewards/margins": 3.252405047416687, "rewards/rejected": -1.975181221961975, "step": 11917 }, { "epoch": 0.6317016934778575, "grad_norm": 56.75, "kl": 2.239765167236328, "learning_rate": 5e-07, "logits/chosen": -23983940.57142857, "logits/rejected": -40482872.0, "logps/chosen": -365.605712890625, "logps/rejected": -804.5715942382812, "loss": 0.4133, "rewards/chosen": 0.39606830051967074, "rewards/margins": 3.4172169821602956, "rewards/rejected": -3.021148681640625, "step": 11918 }, { "epoch": 0.6317546974796597, "grad_norm": 74.0, "kl": 0.22719573974609375, "learning_rate": 5e-07, "logits/chosen": -52095098.666666664, "logits/rejected": -16176623.0, "logps/chosen": -623.322021484375, "logps/rejected": -211.79513549804688, "loss": 0.4029, "rewards/chosen": -0.013809083650509516, "rewards/margins": 2.516108633329471, "rewards/rejected": -2.5299177169799805, "step": 11919 }, { "epoch": 0.6318077014814618, "grad_norm": 62.5, "kl": 3.7648696899414062, "learning_rate": 5e-07, "logits/chosen": -7772765.333333333, "logits/rejected": -19945538.0, "logps/chosen": -451.3710123697917, "logps/rejected": -250.24240112304688, "loss": 0.33, "rewards/chosen": 0.936529795328776, "rewards/margins": 3.751429716746012, "rewards/rejected": -2.8148999214172363, "step": 11920 }, { "epoch": 0.631860705483264, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61558772.0, "logits/rejected": -19434710.0, "logps/chosen": -362.2353515625, "logps/rejected": -278.7378845214844, "loss": 0.2685, "rewards/chosen": 0.5092629790306091, "rewards/margins": 2.317169725894928, "rewards/rejected": -1.8079067468643188, "step": 11921 }, { "epoch": 0.6319137094850661, "grad_norm": 39.0, "kl": 1.6229372024536133, "learning_rate": 5e-07, "logits/chosen": -20235134.4, "logits/rejected": -10147982.666666666, "logps/chosen": -312.7738037109375, "logps/rejected": -435.2712809244792, "loss": 0.3375, "rewards/chosen": 0.6412902355194092, "rewards/margins": 2.872374741236369, "rewards/rejected": -2.2310845057169595, "step": 11922 }, { "epoch": 0.6319667134868683, "grad_norm": 44.75, "kl": 0.049526214599609375, "learning_rate": 5e-07, "logits/chosen": -35558192.0, "logits/rejected": -40191932.0, "logps/chosen": -305.17840576171875, "logps/rejected": -202.42343139648438, "loss": 0.3011, "rewards/chosen": 0.33265018463134766, "rewards/margins": 2.1071085929870605, "rewards/rejected": -1.774458408355713, "step": 11923 }, { "epoch": 0.6320197174886704, "grad_norm": 34.0, "kl": 2.438312530517578, "learning_rate": 5e-07, "logits/chosen": -13028610.666666666, "logits/rejected": -10708593.6, "logps/chosen": -265.70717366536456, "logps/rejected": -217.3628173828125, "loss": 0.208, "rewards/chosen": 0.48849602540334064, "rewards/margins": 3.3417305072148644, "rewards/rejected": -2.8532344818115236, "step": 11924 }, { "epoch": 0.6320727214904726, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23819800.0, "logits/rejected": -25557680.0, "logps/chosen": -360.6563313802083, "logps/rejected": -293.827587890625, "loss": 0.259, "rewards/chosen": 0.6113576491673788, "rewards/margins": 2.4367990096410117, "rewards/rejected": -1.8254413604736328, "step": 11925 }, { "epoch": 0.6321257254922746, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32883621.333333332, "logits/rejected": -11416668.0, "logps/chosen": -340.97918701171875, "logps/rejected": -179.21649169921875, "loss": 0.4304, "rewards/chosen": 0.003729333480199178, "rewards/margins": 1.5044027467568715, "rewards/rejected": -1.5006734132766724, "step": 11926 }, { "epoch": 0.6321787294940768, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71440768.0, "logits/rejected": -39242752.0, "logps/chosen": -424.80767822265625, "logps/rejected": -353.8006068638393, "loss": 0.1826, "rewards/chosen": -0.11376953125, "rewards/margins": 2.1528000150408064, "rewards/rejected": -2.2665695462908064, "step": 11927 }, { "epoch": 0.6322317334958789, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20787090.0, "logits/rejected": -37506688.0, "logps/chosen": -760.6868896484375, "logps/rejected": -290.16802978515625, "loss": 0.2354, "rewards/chosen": 1.0934499502182007, "rewards/margins": 3.1887396574020386, "rewards/rejected": -2.095289707183838, "step": 11928 }, { "epoch": 0.6322847374976811, "grad_norm": 42.25, "kl": 0.9002704620361328, "learning_rate": 5e-07, "logits/chosen": 9577498.666666666, "logits/rejected": -53782892.8, "logps/chosen": -154.80037434895834, "logps/rejected": -407.7221923828125, "loss": 0.2282, "rewards/chosen": 0.6476062138875326, "rewards/margins": 2.985767110188802, "rewards/rejected": -2.3381608963012694, "step": 11929 }, { "epoch": 0.6323377414994832, "grad_norm": 79.0, "kl": 1.161285400390625, "learning_rate": 5e-07, "logits/chosen": 3539090.75, "logits/rejected": -22747944.0, "logps/chosen": -266.0644836425781, "logps/rejected": -452.0570373535156, "loss": 0.3031, "rewards/chosen": 0.5624679923057556, "rewards/margins": 2.2541932463645935, "rewards/rejected": -1.691725254058838, "step": 11930 }, { "epoch": 0.6323907455012854, "grad_norm": 56.25, "kl": 3.560483932495117, "learning_rate": 5e-07, "logits/chosen": -87480064.0, "logits/rejected": -11347601.333333334, "logps/chosen": -911.67041015625, "logps/rejected": -279.68829345703125, "loss": 0.1907, "rewards/chosen": 1.116857886314392, "rewards/margins": 3.3065295616785684, "rewards/rejected": -2.1896716753641763, "step": 11931 }, { "epoch": 0.6324437495030875, "grad_norm": 51.75, "kl": 0.9730911254882812, "learning_rate": 5e-07, "logits/chosen": 12496388.0, "logits/rejected": -27796434.0, "logps/chosen": -211.94651794433594, "logps/rejected": -266.3763732910156, "loss": 0.2636, "rewards/chosen": 0.4553661346435547, "rewards/margins": 3.179595947265625, "rewards/rejected": -2.7242298126220703, "step": 11932 }, { "epoch": 0.6324967535048897, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34250706.666666664, "logits/rejected": -27355977.6, "logps/chosen": -262.0323486328125, "logps/rejected": -329.45087890625, "loss": 0.2874, "rewards/chosen": -0.06967240571975708, "rewards/margins": 1.8471626162528991, "rewards/rejected": -1.9168350219726562, "step": 11933 }, { "epoch": 0.6325497575066917, "grad_norm": 44.0, "kl": 0.10297393798828125, "learning_rate": 5e-07, "logits/chosen": -37062508.8, "logits/rejected": 10319898.666666666, "logps/chosen": -314.30751953125, "logps/rejected": -236.28753662109375, "loss": 0.3033, "rewards/chosen": 0.9254342079162597, "rewards/margins": 2.136953290303548, "rewards/rejected": -1.2115190823872883, "step": 11934 }, { "epoch": 0.6326027615084939, "grad_norm": 43.0, "kl": 2.1482133865356445, "learning_rate": 5e-07, "logits/chosen": 5944747.5, "logits/rejected": -15915831.0, "logps/chosen": -121.53836059570312, "logps/rejected": -116.26771545410156, "loss": 0.3941, "rewards/chosen": 0.1799192726612091, "rewards/margins": 1.3343338072299957, "rewards/rejected": -1.1544145345687866, "step": 11935 }, { "epoch": 0.632655765510296, "grad_norm": 48.25, "kl": 3.1472434997558594, "learning_rate": 5e-07, "logits/chosen": -26099461.333333332, "logits/rejected": -29705724.0, "logps/chosen": -311.689208984375, "logps/rejected": -676.7167358398438, "loss": 0.3389, "rewards/chosen": 0.6805838743845621, "rewards/margins": 4.056514660517375, "rewards/rejected": -3.3759307861328125, "step": 11936 }, { "epoch": 0.6327087695120982, "grad_norm": 38.0, "kl": 1.4333610534667969, "learning_rate": 5e-07, "logits/chosen": -14106048.0, "logits/rejected": -25406257.6, "logps/chosen": -292.04705810546875, "logps/rejected": -266.597314453125, "loss": 0.1162, "rewards/chosen": 2.018422762552897, "rewards/margins": 4.349739710489908, "rewards/rejected": -2.3313169479370117, "step": 11937 }, { "epoch": 0.6327617735139003, "grad_norm": 74.5, "kl": 4.592097282409668, "learning_rate": 5e-07, "logits/chosen": -33161862.4, "logits/rejected": 232606890.66666666, "logps/chosen": -329.26787109375, "logps/rejected": -319.9503173828125, "loss": 0.3853, "rewards/chosen": 0.6491328716278076, "rewards/margins": 2.3930031299591064, "rewards/rejected": -1.7438702583312988, "step": 11938 }, { "epoch": 0.6328147775157025, "grad_norm": 59.0, "kl": 0.19788360595703125, "learning_rate": 5e-07, "logits/chosen": -79860944.0, "logits/rejected": -42437604.0, "logps/chosen": -387.2673645019531, "logps/rejected": -312.8123474121094, "loss": 0.3472, "rewards/chosen": 0.2465408444404602, "rewards/margins": 1.4704740643501282, "rewards/rejected": -1.223933219909668, "step": 11939 }, { "epoch": 0.6328677815175046, "grad_norm": 53.75, "kl": 1.9394006729125977, "learning_rate": 5e-07, "logits/chosen": -20944155.42857143, "logits/rejected": -40834104.0, "logps/chosen": -246.677490234375, "logps/rejected": -207.6604461669922, "loss": 0.3958, "rewards/chosen": 0.4590752805982317, "rewards/margins": 3.1082254137311662, "rewards/rejected": -2.6491501331329346, "step": 11940 }, { "epoch": 0.6329207855193068, "grad_norm": 56.0, "kl": 1.6475458145141602, "learning_rate": 5e-07, "logits/chosen": -48600160.0, "logits/rejected": -5306304.0, "logps/chosen": -318.98687744140625, "logps/rejected": -292.73297119140625, "loss": 0.3336, "rewards/chosen": 0.6484678983688354, "rewards/margins": 2.9701544046401978, "rewards/rejected": -2.3216865062713623, "step": 11941 }, { "epoch": 0.6329737895211088, "grad_norm": 38.5, "kl": 0.4112424850463867, "learning_rate": 5e-07, "logits/chosen": -19411398.666666668, "logits/rejected": 11218001.0, "logps/chosen": -146.53848266601562, "logps/rejected": -550.7857666015625, "loss": 0.3357, "rewards/chosen": 0.31426974137624103, "rewards/margins": 4.956468780835469, "rewards/rejected": -4.6421990394592285, "step": 11942 }, { "epoch": 0.633026793522911, "grad_norm": 26.5, "kl": 2.3622817993164062, "learning_rate": 5e-07, "logits/chosen": 860751.375, "logits/rejected": -17181309.714285713, "logps/chosen": -23.216630935668945, "logps/rejected": -364.98095703125, "loss": 0.1463, "rewards/chosen": 0.27245673537254333, "rewards/margins": 3.1698566377162933, "rewards/rejected": -2.89739990234375, "step": 11943 }, { "epoch": 0.6330797975247131, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14987770.0, "logits/rejected": -46545376.0, "logps/chosen": -402.0179443359375, "logps/rejected": -585.8865559895834, "loss": 0.1233, "rewards/chosen": 0.8428970575332642, "rewards/margins": 3.9830960830052695, "rewards/rejected": -3.1401990254720054, "step": 11944 }, { "epoch": 0.6331328015265153, "grad_norm": 41.5, "kl": 1.4689521789550781, "learning_rate": 5e-07, "logits/chosen": -39187170.666666664, "logits/rejected": -50112460.8, "logps/chosen": -285.63295491536456, "logps/rejected": -212.96767578125, "loss": 0.2233, "rewards/chosen": 1.3027379512786865, "rewards/margins": 3.2843556880950926, "rewards/rejected": -1.9816177368164063, "step": 11945 }, { "epoch": 0.6331858055283174, "grad_norm": 52.5, "kl": 0.9844207763671875, "learning_rate": 5e-07, "logits/chosen": -10853267.2, "logits/rejected": -7852062.0, "logps/chosen": -262.2720947265625, "logps/rejected": -127.36548868815105, "loss": 0.3468, "rewards/chosen": 0.49358582496643066, "rewards/margins": 1.8740837574005127, "rewards/rejected": -1.380497932434082, "step": 11946 }, { "epoch": 0.6332388095301196, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41587253.333333336, "logits/rejected": -18440806.4, "logps/chosen": -477.0425618489583, "logps/rejected": -285.994775390625, "loss": 0.233, "rewards/chosen": 0.07779844601949056, "rewards/margins": 3.0995139916737875, "rewards/rejected": -3.021715545654297, "step": 11947 }, { "epoch": 0.6332918135319217, "grad_norm": 35.25, "kl": 0.6371517181396484, "learning_rate": 5e-07, "logits/chosen": -2378842.0, "logits/rejected": -45226352.0, "logps/chosen": -161.31715393066406, "logps/rejected": -301.8767395019531, "loss": 0.2946, "rewards/chosen": 0.5663397908210754, "rewards/margins": 2.722152292728424, "rewards/rejected": -2.1558125019073486, "step": 11948 }, { "epoch": 0.6333448175337238, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34855888.0, "logits/rejected": -25346608.0, "logps/chosen": -342.634033203125, "logps/rejected": -274.1656494140625, "loss": 0.2739, "rewards/chosen": 0.8218035697937012, "rewards/margins": 2.7608866691589355, "rewards/rejected": -1.9390830993652344, "step": 11949 }, { "epoch": 0.6333978215355259, "grad_norm": 54.5, "kl": 2.5675430297851562, "learning_rate": 5e-07, "logits/chosen": -23675429.333333332, "logits/rejected": -21685256.0, "logps/chosen": -686.0760091145834, "logps/rejected": -410.09482421875, "loss": 0.1675, "rewards/chosen": 1.3618489901224773, "rewards/margins": 4.645612112681071, "rewards/rejected": -3.283763122558594, "step": 11950 }, { "epoch": 0.6334508255373281, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13263486.666666666, "logits/rejected": -34235900.8, "logps/chosen": -52.4635009765625, "logps/rejected": -298.47265625, "loss": 0.291, "rewards/chosen": 0.30841338634490967, "rewards/margins": 1.8391237020492555, "rewards/rejected": -1.5307103157043458, "step": 11951 }, { "epoch": 0.6335038295391302, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1251685.0, "logits/rejected": -38723061.333333336, "logps/chosen": -233.53095703125, "logps/rejected": -265.53468831380206, "loss": 0.319, "rewards/chosen": 0.41423234939575193, "rewards/margins": 2.542328612009684, "rewards/rejected": -2.128096262613932, "step": 11952 }, { "epoch": 0.6335568335409324, "grad_norm": 46.75, "kl": 1.3279304504394531, "learning_rate": 5e-07, "logits/chosen": -23577838.0, "logits/rejected": -72188424.0, "logps/chosen": -336.19329833984375, "logps/rejected": -229.76556396484375, "loss": 0.2767, "rewards/chosen": 0.5926101803779602, "rewards/margins": 2.3678385615348816, "rewards/rejected": -1.7752283811569214, "step": 11953 }, { "epoch": 0.6336098375427345, "grad_norm": 73.5, "kl": 3.5192718505859375, "learning_rate": 5e-07, "logits/chosen": 10373570.0, "logits/rejected": -26406868.0, "logps/chosen": -878.1991577148438, "logps/rejected": -545.2456665039062, "loss": 0.1984, "rewards/chosen": 1.49612295627594, "rewards/margins": 3.7393046617507935, "rewards/rejected": -2.2431817054748535, "step": 11954 }, { "epoch": 0.6336628415445366, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11749163.0, "logits/rejected": -30620283.42857143, "logps/chosen": -38.044151306152344, "logps/rejected": -251.24904087611608, "loss": 0.2304, "rewards/chosen": 0.4962783753871918, "rewards/margins": 2.5423664578369687, "rewards/rejected": -2.046088082449777, "step": 11955 }, { "epoch": 0.6337158455463388, "grad_norm": 39.0, "kl": 0.8401203155517578, "learning_rate": 5e-07, "logits/chosen": -25564900.0, "logits/rejected": -22626432.0, "logps/chosen": -275.9430847167969, "logps/rejected": -183.26187133789062, "loss": 0.2347, "rewards/chosen": 1.1670879125595093, "rewards/margins": 2.5765496492385864, "rewards/rejected": -1.4094617366790771, "step": 11956 }, { "epoch": 0.6337688495481408, "grad_norm": 49.0, "kl": 0.3244056701660156, "learning_rate": 5e-07, "logits/chosen": -32885290.0, "logits/rejected": 122861736.0, "logps/chosen": -300.7638244628906, "logps/rejected": -652.0795288085938, "loss": 0.2401, "rewards/chosen": 0.3943450152873993, "rewards/margins": 3.983342856168747, "rewards/rejected": -3.5889978408813477, "step": 11957 }, { "epoch": 0.633821853549943, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5067148.0, "logits/rejected": 1189155.4285714286, "logps/chosen": -9.22625732421875, "logps/rejected": -214.38213239397322, "loss": 0.274, "rewards/chosen": 0.6887809634208679, "rewards/margins": 1.8840450985091073, "rewards/rejected": -1.1952641350882394, "step": 11958 }, { "epoch": 0.6338748575517451, "grad_norm": 52.0, "kl": 0.10624313354492188, "learning_rate": 5e-07, "logits/chosen": -58161258.666666664, "logits/rejected": -12803913.6, "logps/chosen": -381.9106852213542, "logps/rejected": -233.78486328125, "loss": 0.2909, "rewards/chosen": 0.1875762939453125, "rewards/margins": 2.1681182861328123, "rewards/rejected": -1.9805419921875, "step": 11959 }, { "epoch": 0.6339278615535473, "grad_norm": 50.5, "kl": 4.955986976623535, "learning_rate": 5e-07, "logits/chosen": -12129832.0, "logits/rejected": -27333008.0, "logps/chosen": -413.29951171875, "logps/rejected": -388.4886067708333, "loss": 0.2818, "rewards/chosen": 1.12499942779541, "rewards/margins": 4.087820434570313, "rewards/rejected": -2.9628210067749023, "step": 11960 }, { "epoch": 0.6339808655553494, "grad_norm": 44.75, "kl": 2.103515625, "learning_rate": 5e-07, "logits/chosen": -27859690.666666668, "logits/rejected": -48048569.6, "logps/chosen": -586.1820882161459, "logps/rejected": -258.909619140625, "loss": 0.1939, "rewards/chosen": 1.4450991948445637, "rewards/margins": 3.0655171712239584, "rewards/rejected": -1.6204179763793944, "step": 11961 }, { "epoch": 0.6340338695571516, "grad_norm": 76.5, "kl": 0.37004852294921875, "learning_rate": 5e-07, "logits/chosen": 22770769.6, "logits/rejected": -6703397.333333333, "logps/chosen": -448.399365234375, "logps/rejected": -537.0255940755209, "loss": 0.3571, "rewards/chosen": -0.09412933588027954, "rewards/margins": 3.5033835450808204, "rewards/rejected": -3.5975128809611, "step": 11962 }, { "epoch": 0.6340868735589537, "grad_norm": 53.75, "kl": 0.2564096450805664, "learning_rate": 5e-07, "logits/chosen": -20232837.333333332, "logits/rejected": -17221134.4, "logps/chosen": -293.95166015625, "logps/rejected": -193.12972412109374, "loss": 0.2699, "rewards/chosen": 0.3901836077372233, "rewards/margins": 2.1615905443827312, "rewards/rejected": -1.7714069366455079, "step": 11963 }, { "epoch": 0.6341398775607558, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31654285.333333332, "logits/rejected": -29651884.8, "logps/chosen": -210.9944864908854, "logps/rejected": -251.463037109375, "loss": 0.2046, "rewards/chosen": 0.24269775549570718, "rewards/margins": 3.587072984377543, "rewards/rejected": -3.344375228881836, "step": 11964 }, { "epoch": 0.6341928815625579, "grad_norm": 39.5, "kl": 0.8816757202148438, "learning_rate": 5e-07, "logits/chosen": -32099754.0, "logits/rejected": -10402896.0, "logps/chosen": -405.90081787109375, "logps/rejected": -336.3988037109375, "loss": 0.3389, "rewards/chosen": 0.9547727108001709, "rewards/margins": 2.010265827178955, "rewards/rejected": -1.0554931163787842, "step": 11965 }, { "epoch": 0.6342458855643601, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57540368.0, "logits/rejected": -21187002.666666668, "logps/chosen": -498.1368103027344, "logps/rejected": -378.6016031901042, "loss": 0.2004, "rewards/chosen": 0.13331452012062073, "rewards/margins": 2.7267084817091622, "rewards/rejected": -2.5933939615885415, "step": 11966 }, { "epoch": 0.6342988895661622, "grad_norm": 43.0, "kl": 1.955052375793457, "learning_rate": 5e-07, "logits/chosen": -28954133.333333332, "logits/rejected": 5181590.4, "logps/chosen": -379.9718831380208, "logps/rejected": -482.852978515625, "loss": 0.2057, "rewards/chosen": 1.7393234570821126, "rewards/margins": 4.640136369069417, "rewards/rejected": -2.9008129119873045, "step": 11967 }, { "epoch": 0.6343518935679644, "grad_norm": 34.0, "kl": 1.013936996459961, "learning_rate": 5e-07, "logits/chosen": 1662476.25, "logits/rejected": -36842672.0, "logps/chosen": -83.98377990722656, "logps/rejected": -296.35528564453125, "loss": 0.2117, "rewards/chosen": 0.8139902949333191, "rewards/margins": 4.326502978801727, "rewards/rejected": -3.512512683868408, "step": 11968 }, { "epoch": 0.6344048975697665, "grad_norm": 48.0, "kl": 0.4944305419921875, "learning_rate": 5e-07, "logits/chosen": -54955680.0, "logits/rejected": 1478909.1666666667, "logps/chosen": -258.298876953125, "logps/rejected": -93.05767822265625, "loss": 0.3199, "rewards/chosen": 0.3213973522186279, "rewards/margins": 2.851535209019979, "rewards/rejected": -2.530137856801351, "step": 11969 }, { "epoch": 0.6344579015715687, "grad_norm": 54.5, "kl": 1.4099774360656738, "learning_rate": 5e-07, "logits/chosen": -20900629.333333332, "logits/rejected": 5313630.0, "logps/chosen": -181.25435384114584, "logps/rejected": -282.819677734375, "loss": 0.3295, "rewards/chosen": 0.32754512627919513, "rewards/margins": 1.8468441565831502, "rewards/rejected": -1.519299030303955, "step": 11970 }, { "epoch": 0.6345109055733708, "grad_norm": 48.25, "kl": 2.308298110961914, "learning_rate": 5e-07, "logits/chosen": -23940744.0, "logits/rejected": 22520630.0, "logps/chosen": -464.0846862792969, "logps/rejected": -398.02032470703125, "loss": 0.2111, "rewards/chosen": 1.5962201356887817, "rewards/margins": 3.46435284614563, "rewards/rejected": -1.8681327104568481, "step": 11971 }, { "epoch": 0.634563909575173, "grad_norm": 40.5, "kl": 0.5519018173217773, "learning_rate": 5e-07, "logits/chosen": -7584783.0, "logits/rejected": -4386951.0, "logps/chosen": -336.41400146484375, "logps/rejected": -138.3118693033854, "loss": 0.14, "rewards/chosen": 2.4931321144104004, "rewards/margins": 4.547367731730143, "rewards/rejected": -2.0542356173197427, "step": 11972 }, { "epoch": 0.634616913576975, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44437896.0, "logits/rejected": -32931612.0, "logps/chosen": -413.55303955078125, "logps/rejected": -299.7727966308594, "loss": 0.1941, "rewards/chosen": 0.8044964075088501, "rewards/margins": 3.566198468208313, "rewards/rejected": -2.761702060699463, "step": 11973 }, { "epoch": 0.6346699175787772, "grad_norm": 62.25, "kl": 0.4788932800292969, "learning_rate": 5e-07, "logits/chosen": -46132940.8, "logits/rejected": -13843357.333333334, "logps/chosen": -347.7776123046875, "logps/rejected": -136.23919677734375, "loss": 0.3498, "rewards/chosen": 0.19610657691955566, "rewards/margins": 2.0857089201609296, "rewards/rejected": -1.8896023432413738, "step": 11974 }, { "epoch": 0.6347229215805793, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12639822.4, "logits/rejected": -40435981.333333336, "logps/chosen": -194.68023681640625, "logps/rejected": -561.3258870442709, "loss": 0.3278, "rewards/chosen": 0.3298588752746582, "rewards/margins": 2.8340668042500816, "rewards/rejected": -2.5042079289754233, "step": 11975 }, { "epoch": 0.6347759255823815, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44239570.666666664, "logits/rejected": -23261601.6, "logps/chosen": -146.78900146484375, "logps/rejected": -287.455224609375, "loss": 0.2451, "rewards/chosen": 0.4210748275121053, "rewards/margins": 2.60112472375234, "rewards/rejected": -2.1800498962402344, "step": 11976 }, { "epoch": 0.6348289295841836, "grad_norm": 41.75, "kl": 1.1353988647460938, "learning_rate": 5e-07, "logits/chosen": -30902328.0, "logits/rejected": -27817852.0, "logps/chosen": -306.59130859375, "logps/rejected": -375.8804931640625, "loss": 0.28, "rewards/chosen": 0.3546920716762543, "rewards/margins": 2.678839534521103, "rewards/rejected": -2.3241474628448486, "step": 11977 }, { "epoch": 0.6348819335859858, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50983980.8, "logits/rejected": -31482656.0, "logps/chosen": -404.4056640625, "logps/rejected": -479.6780192057292, "loss": 0.36, "rewards/chosen": 0.16280487775802613, "rewards/margins": 2.3327982783317567, "rewards/rejected": -2.1699934005737305, "step": 11978 }, { "epoch": 0.6349349375877879, "grad_norm": 54.25, "kl": 1.9551725387573242, "learning_rate": 5e-07, "logits/chosen": -50415744.0, "logits/rejected": -19237294.0, "logps/chosen": -361.7952473958333, "logps/rejected": -287.4454650878906, "loss": 0.4182, "rewards/chosen": 0.05616122484207153, "rewards/margins": 2.305777132511139, "rewards/rejected": -2.2496159076690674, "step": 11979 }, { "epoch": 0.63498794158959, "grad_norm": 38.0, "kl": 3.5156078338623047, "learning_rate": 5e-07, "logits/chosen": -18145533.333333332, "logits/rejected": -22068241.6, "logps/chosen": -672.299072265625, "logps/rejected": -177.9364013671875, "loss": 0.208, "rewards/chosen": 1.5547464688618977, "rewards/margins": 4.423354943593343, "rewards/rejected": -2.8686084747314453, "step": 11980 }, { "epoch": 0.6350409455913921, "grad_norm": 59.5, "kl": 0.012729644775390625, "learning_rate": 5e-07, "logits/chosen": -33185561.6, "logits/rejected": -17232470.666666668, "logps/chosen": -195.463623046875, "logps/rejected": -137.595947265625, "loss": 0.3085, "rewards/chosen": 0.20461618900299072, "rewards/margins": 3.004726688067118, "rewards/rejected": -2.8001104990641275, "step": 11981 }, { "epoch": 0.6350939495931943, "grad_norm": 50.25, "kl": 0.7559814453125, "learning_rate": 5e-07, "logits/chosen": -24016528.0, "logits/rejected": -48073637.333333336, "logps/chosen": -265.570361328125, "logps/rejected": -172.89668782552084, "loss": 0.2941, "rewards/chosen": 0.394521427154541, "rewards/margins": 3.0313021659851076, "rewards/rejected": -2.6367807388305664, "step": 11982 }, { "epoch": 0.6351469535949964, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3171488.6666666665, "logits/rejected": -3028389.8, "logps/chosen": -35.5186513264974, "logps/rejected": -311.135302734375, "loss": 0.2055, "rewards/chosen": 0.6258432865142822, "rewards/margins": 3.2186985492706297, "rewards/rejected": -2.5928552627563475, "step": 11983 }, { "epoch": 0.6351999575967986, "grad_norm": 69.5, "kl": 3.6904029846191406, "learning_rate": 5e-07, "logits/chosen": -55404708.571428575, "logits/rejected": -50320240.0, "logps/chosen": -351.1823032924107, "logps/rejected": -1095.5469970703125, "loss": 0.4023, "rewards/chosen": 0.6404642377580915, "rewards/margins": 3.6542703424181258, "rewards/rejected": -3.013806104660034, "step": 11984 }, { "epoch": 0.6352529615986007, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21765598.666666668, "logits/rejected": -22840326.4, "logps/chosen": -215.37030029296875, "logps/rejected": -264.8533203125, "loss": 0.2465, "rewards/chosen": 0.06813672681649525, "rewards/margins": 3.169433723886808, "rewards/rejected": -3.1012969970703126, "step": 11985 }, { "epoch": 0.6353059656004029, "grad_norm": 43.5, "kl": 1.6355438232421875, "learning_rate": 5e-07, "logits/chosen": 50960856.0, "logits/rejected": -25765269.333333332, "logps/chosen": -355.54071044921875, "logps/rejected": -564.1697591145834, "loss": 0.2068, "rewards/chosen": 0.0419464148581028, "rewards/margins": 3.423830036073923, "rewards/rejected": -3.3818836212158203, "step": 11986 }, { "epoch": 0.635358969602205, "grad_norm": 41.0, "kl": 1.2681198120117188, "learning_rate": 5e-07, "logits/chosen": 2035055.6, "logits/rejected": -4906865.333333333, "logps/chosen": -150.93204345703126, "logps/rejected": -150.75907389322916, "loss": 0.353, "rewards/chosen": 0.3838371753692627, "rewards/margins": 2.2402157306671144, "rewards/rejected": -1.8563785552978516, "step": 11987 }, { "epoch": 0.6354119736040071, "grad_norm": 55.0, "kl": 2.4661788940429688, "learning_rate": 5e-07, "logits/chosen": -14799954.0, "logits/rejected": -21146466.0, "logps/chosen": -337.6273193359375, "logps/rejected": -189.99078369140625, "loss": 0.2724, "rewards/chosen": 0.8723604679107666, "rewards/margins": 2.9104340076446533, "rewards/rejected": -2.0380735397338867, "step": 11988 }, { "epoch": 0.6354649776058092, "grad_norm": 56.25, "kl": 0.6797561645507812, "learning_rate": 5e-07, "logits/chosen": 4607579.2, "logits/rejected": -16354549.333333334, "logps/chosen": -336.935400390625, "logps/rejected": -363.2664388020833, "loss": 0.3388, "rewards/chosen": 0.20422065258026123, "rewards/margins": 2.381868084271749, "rewards/rejected": -2.177647431691488, "step": 11989 }, { "epoch": 0.6355179816076114, "grad_norm": 58.5, "kl": 2.386140823364258, "learning_rate": 5e-07, "logits/chosen": -28874582.4, "logits/rejected": -35702930.666666664, "logps/chosen": -314.7545654296875, "logps/rejected": -483.9679361979167, "loss": 0.4153, "rewards/chosen": -0.12238407135009766, "rewards/margins": 2.6043739318847656, "rewards/rejected": -2.7267580032348633, "step": 11990 }, { "epoch": 0.6355709856094135, "grad_norm": 37.25, "kl": 4.102384567260742, "learning_rate": 5e-07, "logits/chosen": -18363577.6, "logits/rejected": -73541610.66666667, "logps/chosen": -640.445654296875, "logps/rejected": -324.3724365234375, "loss": 0.299, "rewards/chosen": 1.182420539855957, "rewards/margins": 3.2436229070027673, "rewards/rejected": -2.06120236714681, "step": 11991 }, { "epoch": 0.6356239896112157, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52110848.0, "logits/rejected": -10550894.666666666, "logps/chosen": -340.99896240234375, "logps/rejected": -248.53129069010416, "loss": 0.1905, "rewards/chosen": 0.9848266839981079, "rewards/margins": 2.887826879819234, "rewards/rejected": -1.9030001958211262, "step": 11992 }, { "epoch": 0.6356769936130178, "grad_norm": 46.75, "kl": 0.8263044357299805, "learning_rate": 5e-07, "logits/chosen": -9057167.0, "logits/rejected": 6863066.0, "logps/chosen": -159.5782928466797, "logps/rejected": -325.21185302734375, "loss": 0.3033, "rewards/chosen": 0.561272144317627, "rewards/margins": 2.001037836074829, "rewards/rejected": -1.4397656917572021, "step": 11993 }, { "epoch": 0.63572999761482, "grad_norm": 52.25, "kl": 3.5907630920410156, "learning_rate": 5e-07, "logits/chosen": -34717052.0, "logits/rejected": -16413876.0, "logps/chosen": -753.2896728515625, "logps/rejected": -132.4508514404297, "loss": 0.2947, "rewards/chosen": 1.0993683338165283, "rewards/margins": 3.2207677364349365, "rewards/rejected": -2.121399402618408, "step": 11994 }, { "epoch": 0.635783001616622, "grad_norm": 55.25, "kl": 0.9736061096191406, "learning_rate": 5e-07, "logits/chosen": -53788452.0, "logits/rejected": 112959210.66666667, "logps/chosen": -410.13958740234375, "logps/rejected": -277.10654703776044, "loss": 0.2116, "rewards/chosen": 0.39448243379592896, "rewards/margins": 2.4290289680163064, "rewards/rejected": -2.0345465342203775, "step": 11995 }, { "epoch": 0.6358360056184242, "grad_norm": 48.25, "kl": 0.45919036865234375, "learning_rate": 5e-07, "logits/chosen": -40131817.6, "logits/rejected": 12178692.0, "logps/chosen": -179.91990966796874, "logps/rejected": -533.1434733072916, "loss": 0.3244, "rewards/chosen": 0.2241738796234131, "rewards/margins": 3.320641533533732, "rewards/rejected": -3.096467653910319, "step": 11996 }, { "epoch": 0.6358890096202263, "grad_norm": 54.25, "kl": 1.8691864013671875, "learning_rate": 5e-07, "logits/chosen": -57581555.2, "logits/rejected": -16139210.666666666, "logps/chosen": -494.595849609375, "logps/rejected": -329.3118082682292, "loss": 0.2784, "rewards/chosen": 1.032940673828125, "rewards/margins": 2.889182535807292, "rewards/rejected": -1.8562418619791667, "step": 11997 }, { "epoch": 0.6359420136220285, "grad_norm": 40.5, "kl": 0.6285953521728516, "learning_rate": 5e-07, "logits/chosen": -25420544.0, "logits/rejected": -51799702.4, "logps/chosen": -211.34039306640625, "logps/rejected": -353.7719482421875, "loss": 0.2169, "rewards/chosen": 0.6508171161015829, "rewards/margins": 2.831995781262716, "rewards/rejected": -2.181178665161133, "step": 11998 }, { "epoch": 0.6359950176238306, "grad_norm": 70.5, "kl": 2.5405216217041016, "learning_rate": 5e-07, "logits/chosen": -44866438.4, "logits/rejected": -57351264.0, "logps/chosen": -559.69677734375, "logps/rejected": -608.2654622395834, "loss": 0.3255, "rewards/chosen": 0.34163925647735593, "rewards/margins": 4.7487049659093215, "rewards/rejected": -4.407065709431966, "step": 11999 }, { "epoch": 0.6360480216256328, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12026010.666666666, "logits/rejected": -22934052.8, "logps/chosen": -564.3538004557291, "logps/rejected": -495.8087890625, "loss": 0.2196, "rewards/chosen": 0.35447641213734943, "rewards/margins": 3.0173140128453575, "rewards/rejected": -2.662837600708008, "step": 12000 }, { "epoch": 0.6361010256274349, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21386557.333333332, "logits/rejected": -5746510.4, "logps/chosen": -336.73980712890625, "logps/rejected": -164.28182373046874, "loss": 0.1977, "rewards/chosen": 0.8243668874104818, "rewards/margins": 3.2664523442586266, "rewards/rejected": -2.4420854568481447, "step": 12001 }, { "epoch": 0.6361540296292371, "grad_norm": 47.75, "kl": 2.5578346252441406, "learning_rate": 5e-07, "logits/chosen": -5813559.0, "logits/rejected": -24746356.0, "logps/chosen": -205.78883361816406, "logps/rejected": -240.92465209960938, "loss": 0.2683, "rewards/chosen": 1.1863898038864136, "rewards/margins": 2.826800584793091, "rewards/rejected": -1.6404107809066772, "step": 12002 }, { "epoch": 0.6362070336310391, "grad_norm": 64.5, "kl": 0.41968536376953125, "learning_rate": 5e-07, "logits/chosen": -46304736.0, "logits/rejected": -75536576.0, "logps/chosen": -592.2843017578125, "logps/rejected": -436.37615966796875, "loss": 0.3045, "rewards/chosen": 0.74945068359375, "rewards/margins": 3.4602324962615967, "rewards/rejected": -2.7107818126678467, "step": 12003 }, { "epoch": 0.6362600376328412, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40158872.0, "logits/rejected": -11085580.0, "logps/chosen": -396.07568359375, "logps/rejected": -304.3361002604167, "loss": 0.2346, "rewards/chosen": -0.23592987656593323, "rewards/margins": 1.8787976205348969, "rewards/rejected": -2.11472749710083, "step": 12004 }, { "epoch": 0.6363130416346434, "grad_norm": 53.75, "kl": 3.8622193336486816, "learning_rate": 5e-07, "logits/chosen": -16593908.57142857, "logits/rejected": 4165506.0, "logps/chosen": -206.49004255022322, "logps/rejected": -34.21226119995117, "loss": 0.48, "rewards/chosen": 0.2957169328417097, "rewards/margins": 1.9555476222719466, "rewards/rejected": -1.6598306894302368, "step": 12005 }, { "epoch": 0.6363660456364455, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32725650.0, "logits/rejected": -35656820.0, "logps/chosen": -147.818359375, "logps/rejected": -425.2667236328125, "loss": 0.2798, "rewards/chosen": 0.3592543601989746, "rewards/margins": 2.2018157243728638, "rewards/rejected": -1.8425613641738892, "step": 12006 }, { "epoch": 0.6364190496382477, "grad_norm": 51.5, "kl": 3.0695533752441406, "learning_rate": 5e-07, "logits/chosen": -18491733.333333332, "logits/rejected": -95860144.0, "logps/chosen": -323.30576578776044, "logps/rejected": -584.8828125, "loss": 0.3572, "rewards/chosen": 0.6898395220438639, "rewards/margins": 2.802339474360148, "rewards/rejected": -2.112499952316284, "step": 12007 }, { "epoch": 0.6364720536400498, "grad_norm": 67.5, "kl": 4.294073104858398, "learning_rate": 5e-07, "logits/chosen": -17225568.0, "logits/rejected": -62534036.0, "logps/chosen": -492.1144205729167, "logps/rejected": -159.10714721679688, "loss": 0.4053, "rewards/chosen": 0.44695862134297687, "rewards/margins": 4.670672972997029, "rewards/rejected": -4.223714351654053, "step": 12008 }, { "epoch": 0.636525057641852, "grad_norm": 61.75, "kl": 1.4913959503173828, "learning_rate": 5e-07, "logits/chosen": -16187936.0, "logits/rejected": -9704719.0, "logps/chosen": -305.9592692057292, "logps/rejected": -517.4359130859375, "loss": 0.4965, "rewards/chosen": -0.2035321593284607, "rewards/margins": 2.594334900379181, "rewards/rejected": -2.7978670597076416, "step": 12009 }, { "epoch": 0.636578061643654, "grad_norm": 44.0, "kl": 5.90418815612793, "learning_rate": 5e-07, "logits/chosen": 12649188.0, "logits/rejected": -42284088.0, "logps/chosen": -443.96087646484375, "logps/rejected": -318.91015625, "loss": 0.3126, "rewards/chosen": 1.3117289543151855, "rewards/margins": 2.962637424468994, "rewards/rejected": -1.6509084701538086, "step": 12010 }, { "epoch": 0.6366310656454562, "grad_norm": 54.25, "kl": 1.8799362182617188, "learning_rate": 5e-07, "logits/chosen": -10946888.0, "logits/rejected": -14498921.0, "logps/chosen": -329.0709228515625, "logps/rejected": -273.150634765625, "loss": 0.3881, "rewards/chosen": 0.41620715459187824, "rewards/margins": 2.300190528233846, "rewards/rejected": -1.8839833736419678, "step": 12011 }, { "epoch": 0.6366840696472583, "grad_norm": 32.5, "kl": 2.714242935180664, "learning_rate": 5e-07, "logits/chosen": 3614926.0, "logits/rejected": -28592985.6, "logps/chosen": -39.77094014485677, "logps/rejected": -531.6740234375, "loss": 0.2621, "rewards/chosen": 0.20157917340596518, "rewards/margins": 2.3649222215016685, "rewards/rejected": -2.163343048095703, "step": 12012 }, { "epoch": 0.6367370736490605, "grad_norm": 50.75, "kl": 0.05887603759765625, "learning_rate": 5e-07, "logits/chosen": -41846024.0, "logits/rejected": -37890200.0, "logps/chosen": -318.81195068359375, "logps/rejected": -308.6089172363281, "loss": 0.2949, "rewards/chosen": 0.3042606711387634, "rewards/margins": 2.1908718943595886, "rewards/rejected": -1.8866112232208252, "step": 12013 }, { "epoch": 0.6367900776508626, "grad_norm": 76.0, "kl": 3.7126731872558594, "learning_rate": 5e-07, "logits/chosen": -9504552.0, "logits/rejected": -6814052.0, "logps/chosen": -161.38133239746094, "logps/rejected": -356.63140869140625, "loss": 0.3792, "rewards/chosen": 0.584571123123169, "rewards/margins": 2.482205867767334, "rewards/rejected": -1.897634744644165, "step": 12014 }, { "epoch": 0.6368430816526648, "grad_norm": 47.75, "kl": 0.014153480529785156, "learning_rate": 5e-07, "logits/chosen": -23308386.0, "logits/rejected": -72452808.0, "logps/chosen": -151.71664428710938, "logps/rejected": -310.00738525390625, "loss": 0.3348, "rewards/chosen": -0.2669680714607239, "rewards/margins": 2.0804094672203064, "rewards/rejected": -2.3473775386810303, "step": 12015 }, { "epoch": 0.6368960856544669, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64682304.0, "logits/rejected": -8567380.57142857, "logps/chosen": -401.85211181640625, "logps/rejected": -209.06108747209822, "loss": 0.2628, "rewards/chosen": -0.5169433951377869, "rewards/margins": 1.126021785395486, "rewards/rejected": -1.6429651805332728, "step": 12016 }, { "epoch": 0.6369490896562691, "grad_norm": 56.5, "kl": 2.408066749572754, "learning_rate": 5e-07, "logits/chosen": -8030628.0, "logits/rejected": -32185660.0, "logps/chosen": -387.0035705566406, "logps/rejected": -452.3280029296875, "loss": 0.2681, "rewards/chosen": 0.6062635779380798, "rewards/margins": 3.407390058040619, "rewards/rejected": -2.801126480102539, "step": 12017 }, { "epoch": 0.6370020936580711, "grad_norm": 40.75, "kl": 1.7258262634277344, "learning_rate": 5e-07, "logits/chosen": -21066094.0, "logits/rejected": -11331248.0, "logps/chosen": -483.3792419433594, "logps/rejected": -189.4613037109375, "loss": 0.1683, "rewards/chosen": 2.5991456508636475, "rewards/margins": 4.770069519678751, "rewards/rejected": -2.170923868815104, "step": 12018 }, { "epoch": 0.6370550976598733, "grad_norm": 54.0, "kl": 0.29273414611816406, "learning_rate": 5e-07, "logits/chosen": -15056377.6, "logits/rejected": -46629701.333333336, "logps/chosen": -359.2169189453125, "logps/rejected": -369.173828125, "loss": 0.3711, "rewards/chosen": -0.10570313930511474, "rewards/margins": 2.452102875709534, "rewards/rejected": -2.5578060150146484, "step": 12019 }, { "epoch": 0.6371081016616754, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68554816.0, "logits/rejected": -38823272.0, "logps/chosen": -373.76015625, "logps/rejected": -345.6424153645833, "loss": 0.4324, "rewards/chosen": -0.2640014886856079, "rewards/margins": 1.2656835317611694, "rewards/rejected": -1.5296850204467773, "step": 12020 }, { "epoch": 0.6371611056634776, "grad_norm": 52.0, "kl": 1.1230850219726562, "learning_rate": 5e-07, "logits/chosen": -10859283.2, "logits/rejected": -22520824.0, "logps/chosen": -312.4408447265625, "logps/rejected": -195.78470865885416, "loss": 0.2738, "rewards/chosen": 0.658709716796875, "rewards/margins": 3.024795087178548, "rewards/rejected": -2.3660853703816733, "step": 12021 }, { "epoch": 0.6372141096652797, "grad_norm": 68.5, "kl": 3.5514373779296875, "learning_rate": 5e-07, "logits/chosen": -45413267.2, "logits/rejected": -36186565.333333336, "logps/chosen": -525.9119140625, "logps/rejected": -285.7634684244792, "loss": 0.2778, "rewards/chosen": 1.2200879096984862, "rewards/margins": 3.001341406504313, "rewards/rejected": -1.781253496805827, "step": 12022 }, { "epoch": 0.6372671136670819, "grad_norm": 42.0, "kl": 1.6577949523925781, "learning_rate": 5e-07, "logits/chosen": -17779890.666666668, "logits/rejected": -6232135.2, "logps/chosen": -190.8896280924479, "logps/rejected": -193.46612548828125, "loss": 0.3108, "rewards/chosen": 0.06200942893822988, "rewards/margins": 1.7720117559035617, "rewards/rejected": -1.710002326965332, "step": 12023 }, { "epoch": 0.637320117668884, "grad_norm": 35.75, "kl": 1.007741928100586, "learning_rate": 5e-07, "logits/chosen": -34208880.0, "logits/rejected": -11525000.8, "logps/chosen": -262.8896891276042, "logps/rejected": -292.2176513671875, "loss": 0.2283, "rewards/chosen": 0.13816680510838827, "rewards/margins": 3.002254100640615, "rewards/rejected": -2.8640872955322267, "step": 12024 }, { "epoch": 0.6373731216706862, "grad_norm": 41.5, "kl": 1.4054450988769531, "learning_rate": 5e-07, "logits/chosen": -16443204.0, "logits/rejected": -40707331.2, "logps/chosen": -851.3202311197916, "logps/rejected": -352.9569091796875, "loss": 0.1989, "rewards/chosen": 1.8478525479634602, "rewards/margins": 4.30600856145223, "rewards/rejected": -2.4581560134887694, "step": 12025 }, { "epoch": 0.6374261256724882, "grad_norm": 39.5, "kl": 0.9042673110961914, "learning_rate": 5e-07, "logits/chosen": -39056960.0, "logits/rejected": -54394756.0, "logps/chosen": -156.34396362304688, "logps/rejected": -271.32830810546875, "loss": 0.2995, "rewards/chosen": 0.34812021255493164, "rewards/margins": 2.052912950515747, "rewards/rejected": -1.7047927379608154, "step": 12026 }, { "epoch": 0.6374791296742904, "grad_norm": 66.5, "kl": 2.272228240966797, "learning_rate": 5e-07, "logits/chosen": -54087445.333333336, "logits/rejected": 4653380.0, "logps/chosen": -367.0913899739583, "logps/rejected": -385.8764953613281, "loss": 0.3495, "rewards/chosen": 0.6753364404042562, "rewards/margins": 2.3374787171681723, "rewards/rejected": -1.662142276763916, "step": 12027 }, { "epoch": 0.6375321336760925, "grad_norm": 61.25, "kl": 2.414896011352539, "learning_rate": 5e-07, "logits/chosen": -34297074.666666664, "logits/rejected": 4822778.0, "logps/chosen": -332.2755533854167, "logps/rejected": -248.73326110839844, "loss": 0.4561, "rewards/chosen": 0.1266286571820577, "rewards/margins": 1.2170843084653218, "rewards/rejected": -1.0904556512832642, "step": 12028 }, { "epoch": 0.6375851376778947, "grad_norm": 37.75, "kl": 0.8598737716674805, "learning_rate": 5e-07, "logits/chosen": 1168450.6666666667, "logits/rejected": -7707692.0, "logps/chosen": -218.0660400390625, "logps/rejected": -273.945458984375, "loss": 0.2262, "rewards/chosen": 0.6459775368372599, "rewards/margins": 3.1331979195276896, "rewards/rejected": -2.4872203826904298, "step": 12029 }, { "epoch": 0.6376381416796968, "grad_norm": 48.75, "kl": 1.0355224609375, "learning_rate": 5e-07, "logits/chosen": -48029948.8, "logits/rejected": -37698266.666666664, "logps/chosen": -663.64423828125, "logps/rejected": -449.60400390625, "loss": 0.3089, "rewards/chosen": 0.6275171279907227, "rewards/margins": 3.1432943979899086, "rewards/rejected": -2.515777269999186, "step": 12030 }, { "epoch": 0.637691145681499, "grad_norm": 44.5, "kl": 0.8878040313720703, "learning_rate": 5e-07, "logits/chosen": -36735136.0, "logits/rejected": -7589882.4, "logps/chosen": -387.8425699869792, "logps/rejected": -460.077490234375, "loss": 0.2894, "rewards/chosen": 0.37968289852142334, "rewards/margins": 2.887846350669861, "rewards/rejected": -2.5081634521484375, "step": 12031 }, { "epoch": 0.6377441496833011, "grad_norm": 52.25, "kl": 1.294576644897461, "learning_rate": 5e-07, "logits/chosen": -87416856.0, "logits/rejected": -18399230.666666668, "logps/chosen": -155.2428741455078, "logps/rejected": -263.6964518229167, "loss": 0.2652, "rewards/chosen": -0.15828362107276917, "rewards/margins": 1.9603960613409677, "rewards/rejected": -2.118679682413737, "step": 12032 }, { "epoch": 0.6377971536851033, "grad_norm": 43.5, "kl": 2.9430112838745117, "learning_rate": 5e-07, "logits/chosen": -14776302.0, "logits/rejected": -30911004.0, "logps/chosen": -113.89669799804688, "logps/rejected": -290.4945983886719, "loss": 0.3162, "rewards/chosen": 0.42452746629714966, "rewards/margins": 3.060442268848419, "rewards/rejected": -2.6359148025512695, "step": 12033 }, { "epoch": 0.6378501576869053, "grad_norm": 53.0, "kl": 3.5484695434570312, "learning_rate": 5e-07, "logits/chosen": -32711040.0, "logits/rejected": -6556424.0, "logps/chosen": -340.2041713169643, "logps/rejected": -695.8006591796875, "loss": 0.4057, "rewards/chosen": 0.5983525003705706, "rewards/margins": 3.5563603128705705, "rewards/rejected": -2.9580078125, "step": 12034 }, { "epoch": 0.6379031616887075, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41290880.0, "logits/rejected": -26936010.666666668, "logps/chosen": -292.1638488769531, "logps/rejected": -268.11354573567706, "loss": 0.2189, "rewards/chosen": 0.03844985365867615, "rewards/margins": 2.1547715763250985, "rewards/rejected": -2.1163217226664224, "step": 12035 }, { "epoch": 0.6379561656905096, "grad_norm": 62.25, "kl": 1.2968502044677734, "learning_rate": 5e-07, "logits/chosen": -38474514.666666664, "logits/rejected": -11932822.0, "logps/chosen": -327.57281494140625, "logps/rejected": -145.02365112304688, "loss": 0.3836, "rewards/chosen": 0.4555559953053792, "rewards/margins": 1.6219416459401448, "rewards/rejected": -1.1663856506347656, "step": 12036 }, { "epoch": 0.6380091696923118, "grad_norm": 30.375, "kl": 1.4947175979614258, "learning_rate": 5e-07, "logits/chosen": -13334736.0, "logits/rejected": -23299856.0, "logps/chosen": -179.9360809326172, "logps/rejected": -314.5988464355469, "loss": 0.2693, "rewards/chosen": 0.08146189153194427, "rewards/margins": 3.5017347186803818, "rewards/rejected": -3.4202728271484375, "step": 12037 }, { "epoch": 0.6380621736941139, "grad_norm": 47.0, "kl": 0.2713451385498047, "learning_rate": 5e-07, "logits/chosen": -32579824.0, "logits/rejected": -27251232.0, "logps/chosen": -384.97857666015625, "logps/rejected": -242.60910034179688, "loss": 0.2945, "rewards/chosen": 0.7371088266372681, "rewards/margins": 2.3304948806762695, "rewards/rejected": -1.5933860540390015, "step": 12038 }, { "epoch": 0.6381151776959161, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43871616.0, "logits/rejected": -18597860.0, "logps/chosen": -765.73671875, "logps/rejected": -345.7923990885417, "loss": 0.3477, "rewards/chosen": 0.613332986831665, "rewards/margins": 1.969480276107788, "rewards/rejected": -1.356147289276123, "step": 12039 }, { "epoch": 0.6381681816977182, "grad_norm": 67.5, "kl": 5.922329902648926, "learning_rate": 5e-07, "logits/chosen": -20530643.2, "logits/rejected": 5270054.333333333, "logps/chosen": -570.97216796875, "logps/rejected": -238.21012369791666, "loss": 0.3532, "rewards/chosen": 1.2424237251281738, "rewards/margins": 2.8718586285909016, "rewards/rejected": -1.6294349034627278, "step": 12040 }, { "epoch": 0.6382211856995204, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 65779996.0, "logits/rejected": -30073128.0, "logps/chosen": -545.319580078125, "logps/rejected": -475.1781005859375, "loss": 0.2022, "rewards/chosen": 0.25414276123046875, "rewards/margins": 2.5590540568033853, "rewards/rejected": -2.3049112955729165, "step": 12041 }, { "epoch": 0.6382741897013224, "grad_norm": 41.25, "kl": 3.0684337615966797, "learning_rate": 5e-07, "logits/chosen": -19254002.666666668, "logits/rejected": -47297348.0, "logps/chosen": -258.38950602213544, "logps/rejected": -584.677490234375, "loss": 0.2516, "rewards/chosen": 1.088701566060384, "rewards/margins": 5.636527379353841, "rewards/rejected": -4.547825813293457, "step": 12042 }, { "epoch": 0.6383271937031246, "grad_norm": 42.25, "kl": 0.4982147216796875, "learning_rate": 5e-07, "logits/chosen": 594521.5, "logits/rejected": -28080046.0, "logps/chosen": -81.13996124267578, "logps/rejected": -171.67462158203125, "loss": 0.3367, "rewards/chosen": 0.2161722183227539, "rewards/margins": 1.8331613540649414, "rewards/rejected": -1.6169891357421875, "step": 12043 }, { "epoch": 0.6383801977049267, "grad_norm": 54.0, "kl": 1.5472640991210938, "learning_rate": 5e-07, "logits/chosen": -21977153.6, "logits/rejected": -4349942.333333333, "logps/chosen": -288.2894287109375, "logps/rejected": -115.36061604817708, "loss": 0.2717, "rewards/chosen": 0.8557966232299805, "rewards/margins": 3.9101086934407556, "rewards/rejected": -3.054312070210775, "step": 12044 }, { "epoch": 0.6384332017067289, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36569501.333333336, "logits/rejected": -28635020.8, "logps/chosen": -226.93560791015625, "logps/rejected": -438.50244140625, "loss": 0.1745, "rewards/chosen": 0.7932289441426595, "rewards/margins": 3.385914262135824, "rewards/rejected": -2.592685317993164, "step": 12045 }, { "epoch": 0.638486205708531, "grad_norm": 67.5, "kl": 2.838491439819336, "learning_rate": 5e-07, "logits/chosen": 4506407.5, "logits/rejected": -2618753.0, "logps/chosen": -314.4678649902344, "logps/rejected": -414.1549072265625, "loss": 0.2495, "rewards/chosen": 1.1108760833740234, "rewards/margins": 3.886127471923828, "rewards/rejected": -2.7752513885498047, "step": 12046 }, { "epoch": 0.6385392097103332, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4314270.0, "logits/rejected": -24675032.0, "logps/chosen": -119.71234130859375, "logps/rejected": -107.39259338378906, "loss": 0.3507, "rewards/chosen": 0.07228896021842957, "rewards/margins": 1.685785859823227, "rewards/rejected": -1.6134968996047974, "step": 12047 }, { "epoch": 0.6385922137121353, "grad_norm": 51.5, "kl": 2.758668899536133, "learning_rate": 5e-07, "logits/chosen": -25690837.333333332, "logits/rejected": 14427943.0, "logps/chosen": -231.4773152669271, "logps/rejected": -226.43820190429688, "loss": 0.4144, "rewards/chosen": 0.5138729015986124, "rewards/margins": 1.328499952952067, "rewards/rejected": -0.8146270513534546, "step": 12048 }, { "epoch": 0.6386452177139375, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25826996.0, "logits/rejected": -47583984.0, "logps/chosen": -308.8458251953125, "logps/rejected": -355.1264953613281, "loss": 0.33, "rewards/chosen": 0.31254786252975464, "rewards/margins": 2.468951404094696, "rewards/rejected": -2.1564035415649414, "step": 12049 }, { "epoch": 0.6386982217157395, "grad_norm": 52.75, "kl": 1.6402549743652344, "learning_rate": 5e-07, "logits/chosen": -24169066.666666668, "logits/rejected": -682976.4, "logps/chosen": -491.5284016927083, "logps/rejected": -130.71798095703124, "loss": 0.1893, "rewards/chosen": 1.798783302307129, "rewards/margins": 3.2135804176330565, "rewards/rejected": -1.4147971153259278, "step": 12050 }, { "epoch": 0.6387512257175417, "grad_norm": 55.75, "kl": 4.499486923217773, "learning_rate": 5e-07, "logits/chosen": -18032038.666666668, "logits/rejected": 4439024.0, "logps/chosen": -232.1645304361979, "logps/rejected": -267.88531494140625, "loss": 0.4774, "rewards/chosen": 0.36183810234069824, "rewards/margins": 1.5923312902450562, "rewards/rejected": -1.230493187904358, "step": 12051 }, { "epoch": 0.6388042297193438, "grad_norm": 43.5, "kl": 3.6340179443359375, "learning_rate": 5e-07, "logits/chosen": 10267938.0, "logits/rejected": -11599927.0, "logps/chosen": -91.83201090494792, "logps/rejected": -109.59903717041016, "loss": 0.4482, "rewards/chosen": 0.2846009333928426, "rewards/margins": 1.8731728394826253, "rewards/rejected": -1.5885719060897827, "step": 12052 }, { "epoch": 0.638857233721146, "grad_norm": 34.25, "kl": 1.8858528137207031, "learning_rate": 5e-07, "logits/chosen": -526003.5, "logits/rejected": -32953440.0, "logps/chosen": -179.0914306640625, "logps/rejected": -357.2592366536458, "loss": 0.2079, "rewards/chosen": 0.7243145704269409, "rewards/margins": 2.9438052574793496, "rewards/rejected": -2.2194906870524087, "step": 12053 }, { "epoch": 0.6389102377229481, "grad_norm": 52.75, "kl": 0.9976806640625, "learning_rate": 5e-07, "logits/chosen": 7615983.0, "logits/rejected": -43416268.0, "logps/chosen": -299.88714599609375, "logps/rejected": -263.5347595214844, "loss": 0.3787, "rewards/chosen": -0.35147762298583984, "rewards/margins": 1.596706509590149, "rewards/rejected": -1.9481841325759888, "step": 12054 }, { "epoch": 0.6389632417247502, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5814923.5, "logits/rejected": -17704077.333333332, "logps/chosen": -16.686214447021484, "logps/rejected": -571.906982421875, "loss": 0.2293, "rewards/chosen": -0.038356781005859375, "rewards/margins": 2.281912644704183, "rewards/rejected": -2.3202694257100425, "step": 12055 }, { "epoch": 0.6390162457265524, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2742570.5, "logits/rejected": -77933235.2, "logps/chosen": -23.959426879882812, "logps/rejected": -402.9421875, "loss": 0.2114, "rewards/chosen": 0.6487298806508383, "rewards/margins": 2.848481925328573, "rewards/rejected": -2.1997520446777346, "step": 12056 }, { "epoch": 0.6390692497283544, "grad_norm": 47.75, "kl": 1.321615219116211, "learning_rate": 5e-07, "logits/chosen": -31192992.0, "logits/rejected": -17695476.8, "logps/chosen": -398.8638102213542, "logps/rejected": -256.5677978515625, "loss": 0.2529, "rewards/chosen": 0.8741324742635092, "rewards/margins": 2.5835662206013996, "rewards/rejected": -1.7094337463378906, "step": 12057 }, { "epoch": 0.6391222537301566, "grad_norm": 41.75, "kl": 0.5025062561035156, "learning_rate": 5e-07, "logits/chosen": -18109720.0, "logits/rejected": -79919660.8, "logps/chosen": -197.28568522135416, "logps/rejected": -382.8086669921875, "loss": 0.2002, "rewards/chosen": 0.7187845706939697, "rewards/margins": 3.503107118606567, "rewards/rejected": -2.7843225479125975, "step": 12058 }, { "epoch": 0.6391752577319587, "grad_norm": 26.875, "kl": 4.635431289672852, "learning_rate": 5e-07, "logits/chosen": 1032052.0, "logits/rejected": -5208891.0, "logps/chosen": -440.06109619140625, "logps/rejected": -220.38287353515625, "loss": 0.1977, "rewards/chosen": 1.785290002822876, "rewards/margins": 4.156455755233765, "rewards/rejected": -2.3711657524108887, "step": 12059 }, { "epoch": 0.6392282617337609, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20112658.0, "logits/rejected": -15402427.0, "logps/chosen": -374.21173095703125, "logps/rejected": -176.21127319335938, "loss": 0.2582, "rewards/chosen": 0.6325758099555969, "rewards/margins": 2.502557337284088, "rewards/rejected": -1.8699815273284912, "step": 12060 }, { "epoch": 0.639281265735563, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87841816.0, "logits/rejected": -36682298.666666664, "logps/chosen": -420.15069580078125, "logps/rejected": -355.8807779947917, "loss": 0.1819, "rewards/chosen": 0.5565429925918579, "rewards/margins": 2.9558332363764444, "rewards/rejected": -2.3992902437845864, "step": 12061 }, { "epoch": 0.6393342697373652, "grad_norm": 28.25, "kl": 0.7579526901245117, "learning_rate": 5e-07, "logits/chosen": -29578008.0, "logits/rejected": -10562360.8, "logps/chosen": -346.70458984375, "logps/rejected": -302.7685546875, "loss": 0.1718, "rewards/chosen": 1.507321039835612, "rewards/margins": 4.087878481547038, "rewards/rejected": -2.580557441711426, "step": 12062 }, { "epoch": 0.6393872737391673, "grad_norm": 35.75, "kl": 0.44782400131225586, "learning_rate": 5e-07, "logits/chosen": 2518918.1666666665, "logits/rejected": 18736600.0, "logps/chosen": -85.71657307942708, "logps/rejected": -250.84462890625, "loss": 0.2189, "rewards/chosen": 0.7014053662618002, "rewards/margins": 2.871641000111898, "rewards/rejected": -2.1702356338500977, "step": 12063 }, { "epoch": 0.6394402777409695, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20871456.0, "logits/rejected": -40243283.2, "logps/chosen": -234.40779622395834, "logps/rejected": -374.194091796875, "loss": 0.307, "rewards/chosen": -0.434625506401062, "rewards/margins": 2.0259042024612426, "rewards/rejected": -2.4605297088623046, "step": 12064 }, { "epoch": 0.6394932817427715, "grad_norm": 46.25, "kl": 1.8470468521118164, "learning_rate": 5e-07, "logits/chosen": 7284966.0, "logits/rejected": 12351498.0, "logps/chosen": -67.89302062988281, "logps/rejected": -370.2579040527344, "loss": 0.3168, "rewards/chosen": 0.48628756403923035, "rewards/margins": 2.567385643720627, "rewards/rejected": -2.0810980796813965, "step": 12065 }, { "epoch": 0.6395462857445737, "grad_norm": 38.25, "kl": 0.38068580627441406, "learning_rate": 5e-07, "logits/chosen": -12187248.0, "logits/rejected": -14708158.0, "logps/chosen": -554.7999877929688, "logps/rejected": -193.5218505859375, "loss": 0.1881, "rewards/chosen": 0.8881990909576416, "rewards/margins": 4.34747838973999, "rewards/rejected": -3.4592792987823486, "step": 12066 }, { "epoch": 0.6395992897463758, "grad_norm": 45.0, "kl": 1.5844640731811523, "learning_rate": 5e-07, "logits/chosen": -16750046.0, "logits/rejected": 16830892.0, "logps/chosen": -255.94921875, "logps/rejected": -548.4078369140625, "loss": 0.285, "rewards/chosen": 0.7020288705825806, "rewards/margins": 3.0153003931045532, "rewards/rejected": -2.3132715225219727, "step": 12067 }, { "epoch": 0.639652293748178, "grad_norm": 29.5, "kl": 0.3243541717529297, "learning_rate": 5e-07, "logits/chosen": -6820457.333333333, "logits/rejected": -25614726.4, "logps/chosen": -126.6033935546875, "logps/rejected": -246.945751953125, "loss": 0.2458, "rewards/chosen": -0.11903063456217448, "rewards/margins": 2.5726176579793294, "rewards/rejected": -2.6916482925415037, "step": 12068 }, { "epoch": 0.6397052977499801, "grad_norm": 50.75, "kl": 0.3228263854980469, "learning_rate": 5e-07, "logits/chosen": -21376666.666666668, "logits/rejected": -32904233.6, "logps/chosen": -310.7181803385417, "logps/rejected": -303.0092041015625, "loss": 0.29, "rewards/chosen": 0.3717036247253418, "rewards/margins": 2.015891933441162, "rewards/rejected": -1.6441883087158202, "step": 12069 }, { "epoch": 0.6397583017517823, "grad_norm": 39.75, "kl": 0.8275556564331055, "learning_rate": 5e-07, "logits/chosen": -26325480.0, "logits/rejected": -27890136.0, "logps/chosen": -192.5902099609375, "logps/rejected": -244.90403747558594, "loss": 0.232, "rewards/chosen": 0.851747453212738, "rewards/margins": 3.6743656992912292, "rewards/rejected": -2.822618246078491, "step": 12070 }, { "epoch": 0.6398113057535844, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45296243.2, "logits/rejected": -29485312.0, "logps/chosen": -346.059033203125, "logps/rejected": -492.2660319010417, "loss": 0.2475, "rewards/chosen": 0.6798129558563233, "rewards/margins": 4.375424591700236, "rewards/rejected": -3.6956116358439126, "step": 12071 }, { "epoch": 0.6398643097553866, "grad_norm": 43.75, "kl": 0.8109588623046875, "learning_rate": 5e-07, "logits/chosen": -75896720.0, "logits/rejected": -17871358.4, "logps/chosen": -410.9518229166667, "logps/rejected": -166.73607177734374, "loss": 0.1865, "rewards/chosen": 0.6203409830729166, "rewards/margins": 3.222118631998698, "rewards/rejected": -2.6017776489257813, "step": 12072 }, { "epoch": 0.6399173137571886, "grad_norm": 41.0, "kl": 5.408531188964844, "learning_rate": 5e-07, "logits/chosen": 4108250.0, "logits/rejected": -54719091.2, "logps/chosen": -141.94099934895834, "logps/rejected": -459.919140625, "loss": 0.3142, "rewards/chosen": 0.33597683906555176, "rewards/margins": 2.296612596511841, "rewards/rejected": -1.9606357574462892, "step": 12073 }, { "epoch": 0.6399703177589908, "grad_norm": 39.5, "kl": 2.227842330932617, "learning_rate": 5e-07, "logits/chosen": -31341758.0, "logits/rejected": -42468580.0, "logps/chosen": -640.4951782226562, "logps/rejected": -272.965576171875, "loss": 0.2727, "rewards/chosen": 1.0971890687942505, "rewards/margins": 3.385025143623352, "rewards/rejected": -2.2878360748291016, "step": 12074 }, { "epoch": 0.6400233217607929, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64170952.0, "logits/rejected": -29028443.42857143, "logps/chosen": -795.766845703125, "logps/rejected": -389.24863978794644, "loss": 0.129, "rewards/chosen": 2.54034423828125, "rewards/margins": 5.158205849783761, "rewards/rejected": -2.6178616115025113, "step": 12075 }, { "epoch": 0.6400763257625951, "grad_norm": 68.5, "kl": 0.018306732177734375, "learning_rate": 5e-07, "logits/chosen": -21440536.0, "logits/rejected": -20840124.0, "logps/chosen": -321.1892395019531, "logps/rejected": -233.04075622558594, "loss": 0.3467, "rewards/chosen": 0.06022892892360687, "rewards/margins": 1.9870853275060654, "rewards/rejected": -1.9268563985824585, "step": 12076 }, { "epoch": 0.6401293297643972, "grad_norm": 48.5, "kl": 4.85282039642334, "learning_rate": 5e-07, "logits/chosen": -14587889.6, "logits/rejected": -2466513.3333333335, "logps/chosen": -217.158544921875, "logps/rejected": -327.924072265625, "loss": 0.3978, "rewards/chosen": 0.42488656044006345, "rewards/margins": 2.2315762996673585, "rewards/rejected": -1.806689739227295, "step": 12077 }, { "epoch": 0.6401823337661994, "grad_norm": 41.0, "kl": 1.7309188842773438, "learning_rate": 5e-07, "logits/chosen": -9472138.0, "logits/rejected": -22878076.8, "logps/chosen": -175.8546346028646, "logps/rejected": -214.2177978515625, "loss": 0.3046, "rewards/chosen": -0.44238829612731934, "rewards/margins": 1.980041742324829, "rewards/rejected": -2.4224300384521484, "step": 12078 }, { "epoch": 0.6402353377680015, "grad_norm": 41.25, "kl": 2.7047252655029297, "learning_rate": 5e-07, "logits/chosen": -1233635.2, "logits/rejected": -40958597.333333336, "logps/chosen": -187.64674072265626, "logps/rejected": -539.7604573567709, "loss": 0.3024, "rewards/chosen": 0.6315415382385254, "rewards/margins": 5.060502338409424, "rewards/rejected": -4.428960800170898, "step": 12079 }, { "epoch": 0.6402883417698036, "grad_norm": 105.5, "kl": 4.92636775970459, "learning_rate": 5e-07, "logits/chosen": 2506984.8, "logits/rejected": -44923136.0, "logps/chosen": -511.480517578125, "logps/rejected": -290.2547607421875, "loss": 0.3339, "rewards/chosen": 1.0375232696533203, "rewards/margins": 2.775251706441243, "rewards/rejected": -1.737728436787923, "step": 12080 }, { "epoch": 0.6403413457716057, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36175468.0, "logits/rejected": -16963350.85714286, "logps/chosen": -218.04550170898438, "logps/rejected": -412.13535853794644, "loss": 0.2182, "rewards/chosen": 0.33853912353515625, "rewards/margins": 2.915597915649414, "rewards/rejected": -2.577058792114258, "step": 12081 }, { "epoch": 0.6403943497734079, "grad_norm": 45.75, "kl": 0.2522430419921875, "learning_rate": 5e-07, "logits/chosen": 72316.75, "logits/rejected": -16117984.0, "logps/chosen": -85.49466959635417, "logps/rejected": -269.1146240234375, "loss": 0.3956, "rewards/chosen": 0.12977607051531473, "rewards/margins": 1.8483300904432933, "rewards/rejected": -1.7185540199279785, "step": 12082 }, { "epoch": 0.64044735377521, "grad_norm": 38.75, "kl": 0.8707122802734375, "learning_rate": 5e-07, "logits/chosen": -41809252.0, "logits/rejected": -41220656.0, "logps/chosen": -197.89234924316406, "logps/rejected": -230.74923706054688, "loss": 0.292, "rewards/chosen": 0.24821138381958008, "rewards/margins": 2.4835081100463867, "rewards/rejected": -2.2352967262268066, "step": 12083 }, { "epoch": 0.6405003577770122, "grad_norm": 41.25, "kl": 0.668614387512207, "learning_rate": 5e-07, "logits/chosen": -2196403.3333333335, "logits/rejected": -13693768.0, "logps/chosen": -44.4389394124349, "logps/rejected": -236.203369140625, "loss": 0.2484, "rewards/chosen": 0.5940002600351969, "rewards/margins": 2.5450449148813883, "rewards/rejected": -1.9510446548461915, "step": 12084 }, { "epoch": 0.6405533617788143, "grad_norm": 50.25, "kl": 1.2782630920410156, "learning_rate": 5e-07, "logits/chosen": -46713130.666666664, "logits/rejected": -35434772.0, "logps/chosen": -324.3207600911458, "logps/rejected": -219.46408081054688, "loss": 0.3492, "rewards/chosen": 0.46744195620218915, "rewards/margins": 2.8695453802744546, "rewards/rejected": -2.4021034240722656, "step": 12085 }, { "epoch": 0.6406063657806165, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16995348.0, "logits/rejected": -11259584.0, "logps/chosen": -376.1466064453125, "logps/rejected": -418.6864420572917, "loss": 0.222, "rewards/chosen": 1.2718658447265625, "rewards/margins": 2.8623929023742676, "rewards/rejected": -1.590527057647705, "step": 12086 }, { "epoch": 0.6406593697824186, "grad_norm": 39.75, "kl": 0.3797645568847656, "learning_rate": 5e-07, "logits/chosen": -26006220.0, "logits/rejected": -48029264.0, "logps/chosen": -180.6348876953125, "logps/rejected": -534.0557861328125, "loss": 0.3408, "rewards/chosen": 0.0482499897480011, "rewards/margins": 2.373301774263382, "rewards/rejected": -2.325051784515381, "step": 12087 }, { "epoch": 0.6407123737842207, "grad_norm": 28.75, "kl": 1.1852569580078125, "learning_rate": 5e-07, "logits/chosen": 4950687.0, "logits/rejected": -38935140.571428575, "logps/chosen": -17.344696044921875, "logps/rejected": -292.38905552455356, "loss": 0.1531, "rewards/chosen": 0.06768455356359482, "rewards/margins": 2.7241350976484164, "rewards/rejected": -2.6564505440848216, "step": 12088 }, { "epoch": 0.6407653777860228, "grad_norm": 60.0, "kl": 3.283736228942871, "learning_rate": 5e-07, "logits/chosen": 27094140.8, "logits/rejected": 13479802.666666666, "logps/chosen": -265.195654296875, "logps/rejected": -385.6724039713542, "loss": 0.345, "rewards/chosen": 0.5491866111755371, "rewards/margins": 2.3217162132263183, "rewards/rejected": -1.7725296020507812, "step": 12089 }, { "epoch": 0.640818381787825, "grad_norm": 49.75, "kl": 2.5363454818725586, "learning_rate": 5e-07, "logits/chosen": -17797522.0, "logits/rejected": -22656306.0, "logps/chosen": -260.36541748046875, "logps/rejected": -329.61572265625, "loss": 0.3481, "rewards/chosen": 0.4809929132461548, "rewards/margins": 2.2589083909988403, "rewards/rejected": -1.7779154777526855, "step": 12090 }, { "epoch": 0.6408713857896271, "grad_norm": 48.5, "kl": 2.254733085632324, "learning_rate": 5e-07, "logits/chosen": -25619883.42857143, "logits/rejected": -21426692.0, "logps/chosen": -241.95736258370536, "logps/rejected": -1385.621337890625, "loss": 0.4578, "rewards/chosen": 0.18457799298422678, "rewards/margins": 3.6475663696016585, "rewards/rejected": -3.4629883766174316, "step": 12091 }, { "epoch": 0.6409243897914293, "grad_norm": 51.75, "kl": 1.4024124145507812, "learning_rate": 5e-07, "logits/chosen": -1239758.0, "logits/rejected": -15263415.0, "logps/chosen": -661.3958740234375, "logps/rejected": -324.30645751953125, "loss": 0.3097, "rewards/chosen": 0.7244579195976257, "rewards/margins": 3.0624067187309265, "rewards/rejected": -2.337948799133301, "step": 12092 }, { "epoch": 0.6409773937932314, "grad_norm": 53.5, "kl": 0.41460227966308594, "learning_rate": 5e-07, "logits/chosen": -42488966.4, "logits/rejected": -36846245.333333336, "logps/chosen": -265.203564453125, "logps/rejected": -407.8003743489583, "loss": 0.3477, "rewards/chosen": 0.21040682792663573, "rewards/margins": 2.2176506201426185, "rewards/rejected": -2.007243792215983, "step": 12093 }, { "epoch": 0.6410303977950336, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52442788.0, "logits/rejected": -17427384.0, "logps/chosen": -361.7934265136719, "logps/rejected": -367.61602783203125, "loss": 0.236, "rewards/chosen": 0.38957709074020386, "rewards/margins": 3.2701510787010193, "rewards/rejected": -2.8805739879608154, "step": 12094 }, { "epoch": 0.6410834017968357, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -117909162.66666667, "logits/rejected": -2841009.8, "logps/chosen": -257.87860107421875, "logps/rejected": -252.4072509765625, "loss": 0.2599, "rewards/chosen": 0.5659805138905843, "rewards/margins": 2.14764297803243, "rewards/rejected": -1.5816624641418457, "step": 12095 }, { "epoch": 0.6411364057986378, "grad_norm": 43.5, "kl": 0.46864891052246094, "learning_rate": 5e-07, "logits/chosen": -2330124.0, "logits/rejected": -30232611.2, "logps/chosen": -174.4981689453125, "logps/rejected": -220.2240966796875, "loss": 0.2812, "rewards/chosen": 0.0808306336402893, "rewards/margins": 2.1239220261573792, "rewards/rejected": -2.04309139251709, "step": 12096 }, { "epoch": 0.6411894098004399, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -125313392.0, "logits/rejected": -60837869.71428572, "logps/chosen": -89.97700500488281, "logps/rejected": -375.59814453125, "loss": 0.1827, "rewards/chosen": 0.03589172288775444, "rewards/margins": 2.4619519907448972, "rewards/rejected": -2.426060267857143, "step": 12097 }, { "epoch": 0.6412424138022421, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16601234.666666666, "logits/rejected": -24654028.8, "logps/chosen": -359.4340006510417, "logps/rejected": -271.762548828125, "loss": 0.2541, "rewards/chosen": 0.517181913057963, "rewards/margins": 2.3409902016321817, "rewards/rejected": -1.8238082885742188, "step": 12098 }, { "epoch": 0.6412954178040442, "grad_norm": 68.5, "kl": 6.565845489501953, "learning_rate": 5e-07, "logits/chosen": -25165920.0, "logits/rejected": -2403104.0, "logps/chosen": -312.822998046875, "logps/rejected": -127.16104125976562, "loss": 0.4616, "rewards/chosen": 0.8070485932486398, "rewards/margins": 1.6822111351149422, "rewards/rejected": -0.8751625418663025, "step": 12099 }, { "epoch": 0.6413484218058464, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -373155.0, "logits/rejected": -5421584.0, "logps/chosen": -73.64560953776042, "logps/rejected": -312.780517578125, "loss": 0.2046, "rewards/chosen": 0.6194047927856445, "rewards/margins": 3.5113550186157227, "rewards/rejected": -2.891950225830078, "step": 12100 }, { "epoch": 0.6414014258076485, "grad_norm": 59.0, "kl": 1.3411216735839844, "learning_rate": 5e-07, "logits/chosen": -36473024.0, "logits/rejected": 1519029.0, "logps/chosen": -338.69217354910717, "logps/rejected": -22.558443069458008, "loss": 0.4362, "rewards/chosen": 0.4867277145385742, "rewards/margins": 0.4528783783316612, "rewards/rejected": 0.033849336206912994, "step": 12101 }, { "epoch": 0.6414544298094507, "grad_norm": 57.25, "kl": 2.8297042846679688, "learning_rate": 5e-07, "logits/chosen": -41570227.2, "logits/rejected": -15382218.666666666, "logps/chosen": -446.84951171875, "logps/rejected": -204.97981770833334, "loss": 0.3329, "rewards/chosen": 0.6800793647766114, "rewards/margins": 2.558252557118734, "rewards/rejected": -1.8781731923421223, "step": 12102 }, { "epoch": 0.6415074338112527, "grad_norm": 40.5, "kl": 3.3300933837890625, "learning_rate": 5e-07, "logits/chosen": -49223498.666666664, "logits/rejected": -37336760.0, "logps/chosen": -316.21018473307294, "logps/rejected": -253.39431762695312, "loss": 0.3211, "rewards/chosen": 0.9895186424255371, "rewards/margins": 3.2513160705566406, "rewards/rejected": -2.2617974281311035, "step": 12103 }, { "epoch": 0.6415604378130549, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29609744.0, "logits/rejected": -229470.0, "logps/chosen": -339.2422688802083, "logps/rejected": -355.1869140625, "loss": 0.2974, "rewards/chosen": -0.24953154722849527, "rewards/margins": 2.373099716504415, "rewards/rejected": -2.6226312637329103, "step": 12104 }, { "epoch": 0.641613441814857, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -381846.3333333333, "logits/rejected": -17717022.4, "logps/chosen": -212.4780476888021, "logps/rejected": -303.8847900390625, "loss": 0.191, "rewards/chosen": 0.5558401743570963, "rewards/margins": 3.4505608240763346, "rewards/rejected": -2.8947206497192384, "step": 12105 }, { "epoch": 0.6416664458166591, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -793088.0, "logits/rejected": -15255556.0, "logps/chosen": -82.16021728515625, "logps/rejected": -509.7178141276042, "loss": 0.1989, "rewards/chosen": 0.30521661043167114, "rewards/margins": 2.8307987650235495, "rewards/rejected": -2.5255821545918784, "step": 12106 }, { "epoch": 0.6417194498184613, "grad_norm": 44.0, "kl": 5.386308670043945, "learning_rate": 5e-07, "logits/chosen": 1070992.0, "logits/rejected": -34891020.0, "logps/chosen": -192.7912394205729, "logps/rejected": -312.92706298828125, "loss": 0.4379, "rewards/chosen": 0.44844643274943036, "rewards/margins": 2.6243223349253335, "rewards/rejected": -2.1758759021759033, "step": 12107 }, { "epoch": 0.6417724538202634, "grad_norm": 54.5, "kl": 0.12941360473632812, "learning_rate": 5e-07, "logits/chosen": -53051082.666666664, "logits/rejected": -6367118.0, "logps/chosen": -396.0304768880208, "logps/rejected": -192.05706787109375, "loss": 0.2754, "rewards/chosen": 0.7199313640594482, "rewards/margins": 4.269846677780151, "rewards/rejected": -3.549915313720703, "step": 12108 }, { "epoch": 0.6418254578220656, "grad_norm": 43.5, "kl": 1.4017353057861328, "learning_rate": 5e-07, "logits/chosen": -34085740.0, "logits/rejected": -5975092.666666667, "logps/chosen": -665.013427734375, "logps/rejected": -131.97393798828125, "loss": 0.2187, "rewards/chosen": 0.7726806402206421, "rewards/margins": 2.6599243879318237, "rewards/rejected": -1.8872437477111816, "step": 12109 }, { "epoch": 0.6418784618238677, "grad_norm": 64.5, "kl": 3.3879165649414062, "learning_rate": 5e-07, "logits/chosen": -41043752.0, "logits/rejected": -40379880.0, "logps/chosen": -305.71563720703125, "logps/rejected": -412.0067138671875, "loss": 0.3508, "rewards/chosen": 1.033424933751424, "rewards/margins": 2.899704535802205, "rewards/rejected": -1.8662796020507812, "step": 12110 }, { "epoch": 0.6419314658256698, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13211345.333333334, "logits/rejected": -19606114.0, "logps/chosen": -300.99550374348956, "logps/rejected": -280.7490234375, "loss": 0.3796, "rewards/chosen": 0.23775573571523032, "rewards/margins": 2.264086206754049, "rewards/rejected": -2.0263304710388184, "step": 12111 }, { "epoch": 0.6419844698274719, "grad_norm": 50.5, "kl": 0.9712285995483398, "learning_rate": 5e-07, "logits/chosen": -33476701.333333332, "logits/rejected": 832408.5, "logps/chosen": -120.91588338216145, "logps/rejected": -129.7174072265625, "loss": 0.4108, "rewards/chosen": 0.26255613565444946, "rewards/margins": 1.3389076590538025, "rewards/rejected": -1.076351523399353, "step": 12112 }, { "epoch": 0.6420374738292741, "grad_norm": 56.5, "kl": 2.7150449752807617, "learning_rate": 5e-07, "logits/chosen": -35378956.8, "logits/rejected": -49924266.666666664, "logps/chosen": -324.2901123046875, "logps/rejected": -271.3472900390625, "loss": 0.3231, "rewards/chosen": 0.45903635025024414, "rewards/margins": 2.926844278971354, "rewards/rejected": -2.46780792872111, "step": 12113 }, { "epoch": 0.6420904778310762, "grad_norm": 88.5, "kl": 0.2441244125366211, "learning_rate": 5e-07, "logits/chosen": -28027278.0, "logits/rejected": -7352379.0, "logps/chosen": -705.1376953125, "logps/rejected": -137.48809814453125, "loss": 0.3259, "rewards/chosen": 0.5914139151573181, "rewards/margins": 1.5484350323677063, "rewards/rejected": -0.9570211172103882, "step": 12114 }, { "epoch": 0.6421434818328784, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20249710.0, "logits/rejected": -21507508.0, "logps/chosen": -290.0662841796875, "logps/rejected": -318.01483154296875, "loss": 0.222, "rewards/chosen": 0.3471214473247528, "rewards/margins": 2.4821162720521293, "rewards/rejected": -2.1349948247273765, "step": 12115 }, { "epoch": 0.6421964858346805, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11226738.666666666, "logits/rejected": -35986652.8, "logps/chosen": -362.0652669270833, "logps/rejected": -528.859228515625, "loss": 0.1994, "rewards/chosen": 0.5559438069661459, "rewards/margins": 3.2410481770833335, "rewards/rejected": -2.6851043701171875, "step": 12116 }, { "epoch": 0.6422494898364827, "grad_norm": 51.5, "kl": 0.928950309753418, "learning_rate": 5e-07, "logits/chosen": -36438886.4, "logits/rejected": -26805085.333333332, "logps/chosen": -108.95732421875, "logps/rejected": -344.5579427083333, "loss": 0.4111, "rewards/chosen": -0.011517906188964843, "rewards/margins": 1.7021580696105958, "rewards/rejected": -1.7136759757995605, "step": 12117 }, { "epoch": 0.6423024938382847, "grad_norm": 40.5, "kl": 0.5967025756835938, "learning_rate": 5e-07, "logits/chosen": -18439860.0, "logits/rejected": -19936382.0, "logps/chosen": -141.3045196533203, "logps/rejected": -396.418212890625, "loss": 0.2378, "rewards/chosen": 0.8078278303146362, "rewards/margins": 3.52876079082489, "rewards/rejected": -2.720932960510254, "step": 12118 }, { "epoch": 0.6423554978400869, "grad_norm": 39.0, "kl": 0.06648826599121094, "learning_rate": 5e-07, "logits/chosen": -7980816.0, "logits/rejected": -19665052.8, "logps/chosen": -248.21966552734375, "logps/rejected": -274.877685546875, "loss": 0.2081, "rewards/chosen": 1.1055285135904949, "rewards/margins": 3.123774401346843, "rewards/rejected": -2.018245887756348, "step": 12119 }, { "epoch": 0.642408501841889, "grad_norm": 41.75, "kl": 0.7809906005859375, "learning_rate": 5e-07, "logits/chosen": -19688566.0, "logits/rejected": -43666736.0, "logps/chosen": -117.94855499267578, "logps/rejected": -654.3109130859375, "loss": 0.2511, "rewards/chosen": 0.2253609597682953, "rewards/margins": 3.7450219094753265, "rewards/rejected": -3.5196609497070312, "step": 12120 }, { "epoch": 0.6424615058436912, "grad_norm": 45.75, "kl": 0.15313339233398438, "learning_rate": 5e-07, "logits/chosen": -22061637.333333332, "logits/rejected": -26244704.0, "logps/chosen": -287.34926350911456, "logps/rejected": -431.941064453125, "loss": 0.2165, "rewards/chosen": 0.8827443917592367, "rewards/margins": 3.2392497857411704, "rewards/rejected": -2.3565053939819336, "step": 12121 }, { "epoch": 0.6425145098454933, "grad_norm": 52.75, "kl": 1.0015335083007812, "learning_rate": 5e-07, "logits/chosen": -30222454.85714286, "logits/rejected": -4718000.0, "logps/chosen": -297.4180385044643, "logps/rejected": -248.92788696289062, "loss": 0.4984, "rewards/chosen": -0.12991510118756974, "rewards/margins": 2.39877142224993, "rewards/rejected": -2.5286865234375, "step": 12122 }, { "epoch": 0.6425675138472955, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5283259.0, "logits/rejected": -13929802.666666666, "logps/chosen": -241.712158203125, "logps/rejected": -433.6033528645833, "loss": 0.1353, "rewards/chosen": 0.7698357105255127, "rewards/margins": 3.763585329055786, "rewards/rejected": -2.9937496185302734, "step": 12123 }, { "epoch": 0.6426205178490976, "grad_norm": 53.5, "kl": 0.5823974609375, "learning_rate": 5e-07, "logits/chosen": -58053828.0, "logits/rejected": -26536868.0, "logps/chosen": -440.49188232421875, "logps/rejected": -392.4263610839844, "loss": 0.2664, "rewards/chosen": 0.5582367181777954, "rewards/margins": 2.484049677848816, "rewards/rejected": -1.9258129596710205, "step": 12124 }, { "epoch": 0.6426735218508998, "grad_norm": 39.75, "kl": 2.2571325302124023, "learning_rate": 5e-07, "logits/chosen": -20573024.0, "logits/rejected": -28004260.0, "logps/chosen": -611.7393188476562, "logps/rejected": -543.2529296875, "loss": 0.2887, "rewards/chosen": -0.03719198703765869, "rewards/margins": 3.093225836753845, "rewards/rejected": -3.130417823791504, "step": 12125 }, { "epoch": 0.6427265258527018, "grad_norm": 40.5, "kl": 0.10290908813476562, "learning_rate": 5e-07, "logits/chosen": 3536553.75, "logits/rejected": -18531821.333333332, "logps/chosen": -41.09016418457031, "logps/rejected": -288.4160563151042, "loss": 0.2597, "rewards/chosen": -0.37344932556152344, "rewards/margins": 2.1317389806111655, "rewards/rejected": -2.505188306172689, "step": 12126 }, { "epoch": 0.642779529854504, "grad_norm": 58.0, "kl": 2.2707862854003906, "learning_rate": 5e-07, "logits/chosen": -50459504.0, "logits/rejected": -19791513.6, "logps/chosen": -514.1881917317709, "logps/rejected": -265.88740234375, "loss": 0.247, "rewards/chosen": 0.7962689399719238, "rewards/margins": 2.699332332611084, "rewards/rejected": -1.9030633926391602, "step": 12127 }, { "epoch": 0.6428325338563061, "grad_norm": 45.0, "kl": 0.4674224853515625, "learning_rate": 5e-07, "logits/chosen": -37426864.0, "logits/rejected": -27477717.333333332, "logps/chosen": -373.3590393066406, "logps/rejected": -296.8026123046875, "loss": 0.1862, "rewards/chosen": 1.535253882408142, "rewards/margins": 3.582671046257019, "rewards/rejected": -2.047417163848877, "step": 12128 }, { "epoch": 0.6428855378581083, "grad_norm": 49.75, "kl": 0.6221370697021484, "learning_rate": 5e-07, "logits/chosen": -51426956.0, "logits/rejected": -4211890.0, "logps/chosen": -703.7955322265625, "logps/rejected": -143.56533813476562, "loss": 0.2302, "rewards/chosen": 0.21862183511257172, "rewards/margins": 2.170219048857689, "rewards/rejected": -1.9515972137451172, "step": 12129 }, { "epoch": 0.6429385418599104, "grad_norm": 66.0, "kl": 3.7781190872192383, "learning_rate": 5e-07, "logits/chosen": 10884280.0, "logits/rejected": -6128110.0, "logps/chosen": -313.64560546875, "logps/rejected": -64.82298787434895, "loss": 0.4482, "rewards/chosen": 0.4809847354888916, "rewards/margins": 0.6401019871234894, "rewards/rejected": -0.15911725163459778, "step": 12130 }, { "epoch": 0.6429915458617126, "grad_norm": 43.75, "kl": 2.109241485595703, "learning_rate": 5e-07, "logits/chosen": -3002401.0, "logits/rejected": 2211013.0, "logps/chosen": -368.29998779296875, "logps/rejected": -284.6878967285156, "loss": 0.3143, "rewards/chosen": 0.3824606239795685, "rewards/margins": 2.595240145921707, "rewards/rejected": -2.2127795219421387, "step": 12131 }, { "epoch": 0.6430445498635147, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45996024.0, "logits/rejected": -19889668.0, "logps/chosen": -331.5911865234375, "logps/rejected": -307.23431396484375, "loss": 0.2243, "rewards/chosen": 0.5666691064834595, "rewards/margins": 3.509982705116272, "rewards/rejected": -2.9433135986328125, "step": 12132 }, { "epoch": 0.6430975538653169, "grad_norm": 51.0, "kl": 4.50233268737793, "learning_rate": 5e-07, "logits/chosen": -17113150.85714286, "logits/rejected": -5733722.0, "logps/chosen": -298.82456752232144, "logps/rejected": -596.9090576171875, "loss": 0.41, "rewards/chosen": 0.6880495888846261, "rewards/margins": 4.827466283525739, "rewards/rejected": -4.139416694641113, "step": 12133 }, { "epoch": 0.6431505578671189, "grad_norm": 57.25, "kl": 0.20076370239257812, "learning_rate": 5e-07, "logits/chosen": -30730882.666666668, "logits/rejected": 17282848.0, "logps/chosen": -368.1915283203125, "logps/rejected": -329.32684326171875, "loss": 0.3775, "rewards/chosen": 0.48850584030151367, "rewards/margins": 1.4976089000701904, "rewards/rejected": -1.0091030597686768, "step": 12134 }, { "epoch": 0.6432035618689211, "grad_norm": 57.25, "kl": 1.6764278411865234, "learning_rate": 5e-07, "logits/chosen": -54098080.0, "logits/rejected": -19399044.0, "logps/chosen": -577.5735677083334, "logps/rejected": -258.9229431152344, "loss": 0.2642, "rewards/chosen": 1.1485182444254558, "rewards/margins": 3.7613429228464765, "rewards/rejected": -2.6128246784210205, "step": 12135 }, { "epoch": 0.6432565658707232, "grad_norm": 44.25, "kl": 0.2247304916381836, "learning_rate": 5e-07, "logits/chosen": -18529401.333333332, "logits/rejected": -24269564.8, "logps/chosen": -163.97130330403647, "logps/rejected": -259.727978515625, "loss": 0.2444, "rewards/chosen": 1.188894271850586, "rewards/margins": 2.4810836791992186, "rewards/rejected": -1.2921894073486329, "step": 12136 }, { "epoch": 0.6433095698725254, "grad_norm": 60.25, "kl": 0.017375946044921875, "learning_rate": 5e-07, "logits/chosen": -29660532.0, "logits/rejected": -36617384.0, "logps/chosen": -409.898193359375, "logps/rejected": -314.98504638671875, "loss": 0.2725, "rewards/chosen": 0.2937431335449219, "rewards/margins": 3.0764760971069336, "rewards/rejected": -2.7827329635620117, "step": 12137 }, { "epoch": 0.6433625738743275, "grad_norm": 53.25, "kl": 6.1808319091796875, "learning_rate": 5e-07, "logits/chosen": -16359384.0, "logits/rejected": 14780856.0, "logps/chosen": -301.5621337890625, "logps/rejected": -114.04486846923828, "loss": 0.4378, "rewards/chosen": 0.7693167527516683, "rewards/margins": 1.8881483872731528, "rewards/rejected": -1.1188316345214844, "step": 12138 }, { "epoch": 0.6434155778761297, "grad_norm": 51.0, "kl": 1.3257713317871094, "learning_rate": 5e-07, "logits/chosen": -38879360.0, "logits/rejected": -20523344.0, "logps/chosen": -483.97222900390625, "logps/rejected": -209.00628662109375, "loss": 0.2487, "rewards/chosen": 0.6510116457939148, "rewards/margins": 2.876589000225067, "rewards/rejected": -2.2255773544311523, "step": 12139 }, { "epoch": 0.6434685818779318, "grad_norm": 44.0, "kl": 0.09316635131835938, "learning_rate": 5e-07, "logits/chosen": -110421376.0, "logits/rejected": -8183590.4, "logps/chosen": -193.96931966145834, "logps/rejected": -151.13900146484374, "loss": 0.3323, "rewards/chosen": 0.3577672640482585, "rewards/margins": 1.4615187327067058, "rewards/rejected": -1.1037514686584473, "step": 12140 }, { "epoch": 0.643521585879734, "grad_norm": 48.5, "kl": 2.0630502700805664, "learning_rate": 5e-07, "logits/chosen": -15497929.6, "logits/rejected": -35199802.666666664, "logps/chosen": -235.857861328125, "logps/rejected": -472.4391276041667, "loss": 0.3173, "rewards/chosen": 0.4861196517944336, "rewards/margins": 3.7406778971354164, "rewards/rejected": -3.254558245340983, "step": 12141 }, { "epoch": 0.643574589881536, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13264578.0, "logits/rejected": 109383253.33333333, "logps/chosen": -141.2010498046875, "logps/rejected": -384.0576985677083, "loss": 0.2441, "rewards/chosen": -0.5008652210235596, "rewards/margins": 2.24495800336202, "rewards/rejected": -2.7458232243855796, "step": 12142 }, { "epoch": 0.6436275938833382, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42263589.333333336, "logits/rejected": -19581566.4, "logps/chosen": -389.7672932942708, "logps/rejected": -533.608203125, "loss": 0.2011, "rewards/chosen": 0.38967645168304443, "rewards/margins": 3.233280348777771, "rewards/rejected": -2.8436038970947264, "step": 12143 }, { "epoch": 0.6436805978851403, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19676484.0, "logits/rejected": -12787898.666666666, "logps/chosen": -300.7782287597656, "logps/rejected": -412.0775960286458, "loss": 0.2032, "rewards/chosen": 0.7841064929962158, "rewards/margins": 3.3209358056386313, "rewards/rejected": -2.5368293126424155, "step": 12144 }, { "epoch": 0.6437336018869425, "grad_norm": 58.0, "kl": 2.0333938598632812, "learning_rate": 5e-07, "logits/chosen": -37426844.0, "logits/rejected": -103989.5, "logps/chosen": -490.26263427734375, "logps/rejected": -417.7083740234375, "loss": 0.2438, "rewards/chosen": 1.0748885869979858, "rewards/margins": 3.477934956550598, "rewards/rejected": -2.4030463695526123, "step": 12145 }, { "epoch": 0.6437866058887446, "grad_norm": 40.0, "kl": 1.5706100463867188, "learning_rate": 5e-07, "logits/chosen": -21699160.0, "logits/rejected": -17533186.0, "logps/chosen": -272.9755859375, "logps/rejected": -663.1951293945312, "loss": 0.2412, "rewards/chosen": 1.228163480758667, "rewards/margins": 3.7379283905029297, "rewards/rejected": -2.5097649097442627, "step": 12146 }, { "epoch": 0.6438396098905468, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29910152.0, "logits/rejected": -28340516.0, "logps/chosen": -349.56982421875, "logps/rejected": -206.84071350097656, "loss": 0.2883, "rewards/chosen": 0.1848289519548416, "rewards/margins": 2.7208723574876785, "rewards/rejected": -2.536043405532837, "step": 12147 }, { "epoch": 0.6438926138923489, "grad_norm": 32.25, "kl": 1.0949573516845703, "learning_rate": 5e-07, "logits/chosen": -33305138.0, "logits/rejected": -46422724.0, "logps/chosen": -162.4960174560547, "logps/rejected": -164.38108825683594, "loss": 0.2546, "rewards/chosen": 0.42874467372894287, "rewards/margins": 3.037716507911682, "rewards/rejected": -2.6089718341827393, "step": 12148 }, { "epoch": 0.6439456178941511, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39645818.666666664, "logits/rejected": -2059734.0, "logps/chosen": -471.8904622395833, "logps/rejected": -100.09606323242187, "loss": 0.2459, "rewards/chosen": -0.009996542086203894, "rewards/margins": 2.46779505337278, "rewards/rejected": -2.477791595458984, "step": 12149 }, { "epoch": 0.6439986218959531, "grad_norm": 81.0, "kl": 1.8128852844238281, "learning_rate": 5e-07, "logits/chosen": -23339752.0, "logits/rejected": -28330240.0, "logps/chosen": -439.1058349609375, "logps/rejected": -348.78765869140625, "loss": 0.3631, "rewards/chosen": 0.45298771063486737, "rewards/margins": 2.4458977381388345, "rewards/rejected": -1.9929100275039673, "step": 12150 }, { "epoch": 0.6440516258977553, "grad_norm": 53.75, "kl": 0.73828125, "learning_rate": 5e-07, "logits/chosen": -68464632.0, "logits/rejected": -16424757.0, "logps/chosen": -346.0541687011719, "logps/rejected": -144.03489685058594, "loss": 0.3261, "rewards/chosen": 0.05765876919031143, "rewards/margins": 2.0806642547249794, "rewards/rejected": -2.023005485534668, "step": 12151 }, { "epoch": 0.6441046298995574, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2487990.6666666665, "logits/rejected": -22849411.2, "logps/chosen": -131.77698771158853, "logps/rejected": -476.8833984375, "loss": 0.2218, "rewards/chosen": 0.2466044028600057, "rewards/margins": 2.825200327237447, "rewards/rejected": -2.5785959243774412, "step": 12152 }, { "epoch": 0.6441576339013596, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22328896.0, "logits/rejected": -5866786.666666667, "logps/chosen": -337.9354248046875, "logps/rejected": -231.27950032552084, "loss": 0.3408, "rewards/chosen": 0.17937440872192384, "rewards/margins": 2.106329313913981, "rewards/rejected": -1.9269549051920574, "step": 12153 }, { "epoch": 0.6442106379031617, "grad_norm": 50.5, "kl": 2.551362991333008, "learning_rate": 5e-07, "logits/chosen": -31276051.2, "logits/rejected": -17367984.0, "logps/chosen": -301.812353515625, "logps/rejected": -252.85738118489584, "loss": 0.4057, "rewards/chosen": 0.6663167476654053, "rewards/margins": 2.281643470128377, "rewards/rejected": -1.615326722462972, "step": 12154 }, { "epoch": 0.6442636419049639, "grad_norm": 45.75, "kl": 0.7435798645019531, "learning_rate": 5e-07, "logits/chosen": -31564252.8, "logits/rejected": -50526128.0, "logps/chosen": -265.43408203125, "logps/rejected": -520.4147135416666, "loss": 0.2375, "rewards/chosen": 0.8395579338073731, "rewards/margins": 3.685160287221273, "rewards/rejected": -2.8456023534139, "step": 12155 }, { "epoch": 0.644316645906766, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32297956.0, "logits/rejected": -75244160.0, "logps/chosen": -283.6656494140625, "logps/rejected": -387.51171875, "loss": 0.3267, "rewards/chosen": -0.1441909819841385, "rewards/margins": 2.1000178307294846, "rewards/rejected": -2.244208812713623, "step": 12156 }, { "epoch": 0.644369649908568, "grad_norm": 76.5, "kl": 1.85028076171875, "learning_rate": 5e-07, "logits/chosen": -23935602.666666668, "logits/rejected": -13473352.0, "logps/chosen": -366.6992594401042, "logps/rejected": -404.0595703125, "loss": 0.3585, "rewards/chosen": 0.6133671601613363, "rewards/margins": 2.8628438313802085, "rewards/rejected": -2.249476671218872, "step": 12157 }, { "epoch": 0.6444226539103702, "grad_norm": 50.25, "kl": 2.2230186462402344, "learning_rate": 5e-07, "logits/chosen": -22534420.0, "logits/rejected": -23129458.0, "logps/chosen": -194.6962890625, "logps/rejected": -345.70947265625, "loss": 0.2305, "rewards/chosen": 1.2108745574951172, "rewards/margins": 3.363253593444824, "rewards/rejected": -2.152379035949707, "step": 12158 }, { "epoch": 0.6444756579121723, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33477378.666666668, "logits/rejected": -27406515.2, "logps/chosen": -135.6559041341146, "logps/rejected": -329.34072265625, "loss": 0.2677, "rewards/chosen": -0.33369191487630206, "rewards/margins": 2.0574789683024086, "rewards/rejected": -2.391170883178711, "step": 12159 }, { "epoch": 0.6445286619139745, "grad_norm": 62.25, "kl": 2.2925262451171875, "learning_rate": 5e-07, "logits/chosen": -26242400.0, "logps/chosen": -296.85369873046875, "loss": 0.4885, "rewards/chosen": 0.2814505398273468, "step": 12160 }, { "epoch": 0.6445816659157766, "grad_norm": 71.0, "kl": 0.6822052001953125, "learning_rate": 5e-07, "logits/chosen": -8845737.333333334, "logits/rejected": -22359624.0, "logps/chosen": -485.259765625, "logps/rejected": -353.4134521484375, "loss": 0.2129, "rewards/chosen": 0.8077817757924398, "rewards/margins": 3.5321683724721273, "rewards/rejected": -2.7243865966796874, "step": 12161 }, { "epoch": 0.6446346699175788, "grad_norm": 55.25, "kl": 2.985719680786133, "learning_rate": 5e-07, "logits/chosen": -59271104.0, "logits/rejected": 6980179.0, "logps/chosen": -361.6753336588542, "logps/rejected": -460.5435791015625, "loss": 0.3964, "rewards/chosen": 0.40540194511413574, "rewards/margins": 2.916409730911255, "rewards/rejected": -2.511007785797119, "step": 12162 }, { "epoch": 0.6446876739193809, "grad_norm": 48.75, "kl": 4.695940017700195, "learning_rate": 5e-07, "logits/chosen": -31959834.666666668, "logits/rejected": -30808598.0, "logps/chosen": -302.5041910807292, "logps/rejected": -150.278076171875, "loss": 0.3736, "rewards/chosen": 0.8582695325215658, "rewards/margins": 2.4991310437520347, "rewards/rejected": -1.6408615112304688, "step": 12163 }, { "epoch": 0.6447406779211831, "grad_norm": 63.5, "kl": 3.4041643142700195, "learning_rate": 5e-07, "logits/chosen": 5999774.4, "logits/rejected": -12394434.666666666, "logps/chosen": -389.50263671875, "logps/rejected": -126.88264973958333, "loss": 0.3745, "rewards/chosen": 0.7375504493713378, "rewards/margins": 2.5478020032246906, "rewards/rejected": -1.8102515538533528, "step": 12164 }, { "epoch": 0.6447936819229851, "grad_norm": 75.5, "kl": 0.9225425720214844, "learning_rate": 5e-07, "logits/chosen": 99709734.4, "logits/rejected": -18188896.0, "logps/chosen": -456.78115234375, "logps/rejected": -176.886962890625, "loss": 0.405, "rewards/chosen": 0.3284393548965454, "rewards/margins": 1.2177080710728965, "rewards/rejected": -0.889268716176351, "step": 12165 }, { "epoch": 0.6448466859247873, "grad_norm": 29.375, "kl": 0.6435832977294922, "learning_rate": 5e-07, "logits/chosen": -25939490.666666668, "logits/rejected": -39156211.2, "logps/chosen": -110.43359375, "logps/rejected": -486.389599609375, "loss": 0.1968, "rewards/chosen": 0.6078750292460123, "rewards/margins": 3.7214719454447427, "rewards/rejected": -3.1135969161987305, "step": 12166 }, { "epoch": 0.6448996899265894, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3654551.0, "logits/rejected": -50757874.28571428, "logps/chosen": -80.56692504882812, "logps/rejected": -450.5899135044643, "loss": 0.2012, "rewards/chosen": 0.4817848205566406, "rewards/margins": 2.4910054888044084, "rewards/rejected": -2.009220668247768, "step": 12167 }, { "epoch": 0.6449526939283916, "grad_norm": 44.0, "kl": 1.7725791931152344, "learning_rate": 5e-07, "logits/chosen": -49554752.0, "logits/rejected": -32823216.0, "logps/chosen": -209.90009765625, "logps/rejected": -403.5169270833333, "loss": 0.3406, "rewards/chosen": 0.4231771945953369, "rewards/margins": 2.5512279987335207, "rewards/rejected": -2.1280508041381836, "step": 12168 }, { "epoch": 0.6450056979301937, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59636688.0, "logits/rejected": -5403971.0, "logps/chosen": -348.4973551432292, "logps/rejected": -440.32830810546875, "loss": 0.3601, "rewards/chosen": 0.23219301303227743, "rewards/margins": 2.5956573685010276, "rewards/rejected": -2.36346435546875, "step": 12169 }, { "epoch": 0.6450587019319959, "grad_norm": 43.5, "kl": 0.5048313140869141, "learning_rate": 5e-07, "logits/chosen": 84102.75, "logits/rejected": -16705472.0, "logps/chosen": -80.9960708618164, "logps/rejected": -238.52982584635416, "loss": 0.2583, "rewards/chosen": -0.09900417923927307, "rewards/margins": 1.9521814286708832, "rewards/rejected": -2.0511856079101562, "step": 12170 }, { "epoch": 0.645111705933798, "grad_norm": 47.5, "kl": 2.8142919540405273, "learning_rate": 5e-07, "logits/chosen": -26289589.333333332, "logits/rejected": -68046512.0, "logps/chosen": -277.77028401692706, "logps/rejected": -305.77618408203125, "loss": 0.3609, "rewards/chosen": 0.6630793809890747, "rewards/margins": 2.2618649005889893, "rewards/rejected": -1.5987855195999146, "step": 12171 }, { "epoch": 0.6451647099356002, "grad_norm": 38.25, "kl": 0.7000198364257812, "learning_rate": 5e-07, "logits/chosen": -62484436.0, "logits/rejected": -26269634.0, "logps/chosen": -713.77392578125, "logps/rejected": -113.48271179199219, "loss": 0.2951, "rewards/chosen": 0.6562149524688721, "rewards/margins": 3.3438823223114014, "rewards/rejected": -2.6876673698425293, "step": 12172 }, { "epoch": 0.6452177139374022, "grad_norm": 53.0, "kl": 0.9553442001342773, "learning_rate": 5e-07, "logits/chosen": -10747637.333333334, "logits/rejected": 5723986.5, "logps/chosen": -355.6792805989583, "logps/rejected": -38.35001754760742, "loss": 0.4, "rewards/chosen": 0.8460443814595541, "rewards/margins": 1.1457339425881705, "rewards/rejected": -0.29968956112861633, "step": 12173 }, { "epoch": 0.6452707179392044, "grad_norm": 64.0, "kl": 3.719867706298828, "learning_rate": 5e-07, "logits/chosen": -37089536.0, "logits/rejected": -4257774.5, "logps/chosen": -641.91748046875, "logps/rejected": -138.1772918701172, "loss": 0.2196, "rewards/chosen": 1.7471054395039876, "rewards/margins": 3.5592723687489825, "rewards/rejected": -1.8121669292449951, "step": 12174 }, { "epoch": 0.6453237219410065, "grad_norm": 52.0, "kl": 0.8136577606201172, "learning_rate": 5e-07, "logits/chosen": -15622860.8, "logits/rejected": -3772412.0, "logps/chosen": -310.7478759765625, "logps/rejected": -253.35709635416666, "loss": 0.2595, "rewards/chosen": 0.6959038257598877, "rewards/margins": 3.3202116171518963, "rewards/rejected": -2.6243077913920083, "step": 12175 }, { "epoch": 0.6453767259428087, "grad_norm": 54.75, "kl": 0.14534950256347656, "learning_rate": 5e-07, "logits/chosen": -2697345.6666666665, "logits/rejected": -42321542.4, "logps/chosen": -174.5135498046875, "logps/rejected": -531.93369140625, "loss": 0.2481, "rewards/chosen": 0.13432311018308005, "rewards/margins": 3.3987934013207757, "rewards/rejected": -3.2644702911376955, "step": 12176 }, { "epoch": 0.6454297299446108, "grad_norm": 47.75, "kl": 1.4416618347167969, "learning_rate": 5e-07, "logits/chosen": -29147203.2, "logits/rejected": -31351941.333333332, "logps/chosen": -378.4049560546875, "logps/rejected": -267.73069254557294, "loss": 0.2268, "rewards/chosen": 0.9135069847106934, "rewards/margins": 4.599048773447672, "rewards/rejected": -3.685541788736979, "step": 12177 }, { "epoch": 0.645482733946413, "grad_norm": 64.0, "kl": 0.5172367095947266, "learning_rate": 5e-07, "logits/chosen": -32363500.0, "logits/rejected": -36506474.666666664, "logps/chosen": -289.1092529296875, "logps/rejected": -304.21901448567706, "loss": 0.2423, "rewards/chosen": 0.860729992389679, "rewards/margins": 2.274910271167755, "rewards/rejected": -1.4141802787780762, "step": 12178 }, { "epoch": 0.6455357379482151, "grad_norm": 50.75, "kl": 1.3195304870605469, "learning_rate": 5e-07, "logits/chosen": -46082448.0, "logits/rejected": -18103216.0, "logps/chosen": -379.4437255859375, "logps/rejected": -248.98550415039062, "loss": 0.2694, "rewards/chosen": 0.8153702616691589, "rewards/margins": 2.5540931820869446, "rewards/rejected": -1.7387229204177856, "step": 12179 }, { "epoch": 0.6455887419500173, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77294048.0, "logits/rejected": -5901036.571428572, "logps/chosen": -410.17120361328125, "logps/rejected": -356.90053013392856, "loss": 0.2351, "rewards/chosen": 0.2574401795864105, "rewards/margins": 1.7413715847900935, "rewards/rejected": -1.483931405203683, "step": 12180 }, { "epoch": 0.6456417459518193, "grad_norm": 43.0, "kl": 1.809072494506836, "learning_rate": 5e-07, "logits/chosen": -29587074.666666668, "logits/rejected": -39506787.2, "logps/chosen": -543.1798095703125, "logps/rejected": -308.7435302734375, "loss": 0.2147, "rewards/chosen": 1.2610981464385986, "rewards/margins": 3.533385896682739, "rewards/rejected": -2.2722877502441405, "step": 12181 }, { "epoch": 0.6456947499536215, "grad_norm": 55.5, "kl": 1.2368240356445312, "learning_rate": 5e-07, "logits/chosen": -32824928.0, "logits/rejected": -39970828.8, "logps/chosen": -471.2195638020833, "logps/rejected": -418.422705078125, "loss": 0.1964, "rewards/chosen": 0.9420583248138428, "rewards/margins": 3.3908414363861086, "rewards/rejected": -2.448783111572266, "step": 12182 }, { "epoch": 0.6457477539554236, "grad_norm": 57.75, "kl": 2.4011173248291016, "learning_rate": 5e-07, "logits/chosen": -14836204.8, "logits/rejected": -4083888.3333333335, "logps/chosen": -374.883935546875, "logps/rejected": -145.89439900716147, "loss": 0.3236, "rewards/chosen": 0.7622933387756348, "rewards/margins": 2.5898782412211103, "rewards/rejected": -1.8275849024454753, "step": 12183 }, { "epoch": 0.6458007579572258, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13098394.0, "logits/rejected": -4421285.333333333, "logps/chosen": -318.95904541015625, "logps/rejected": -282.6284993489583, "loss": 0.2468, "rewards/chosen": -0.06439361721277237, "rewards/margins": 2.069672646621863, "rewards/rejected": -2.1340662638346353, "step": 12184 }, { "epoch": 0.6458537619590279, "grad_norm": 32.25, "kl": 1.4601669311523438, "learning_rate": 5e-07, "logits/chosen": -66845548.0, "logits/rejected": -30264410.666666668, "logps/chosen": -140.15272521972656, "logps/rejected": -439.8414713541667, "loss": 0.2056, "rewards/chosen": -0.02313385158777237, "rewards/margins": 3.1771873459219933, "rewards/rejected": -3.2003211975097656, "step": 12185 }, { "epoch": 0.6459067659608301, "grad_norm": 38.75, "kl": 0.9895133972167969, "learning_rate": 5e-07, "logits/chosen": -8132584.5, "logits/rejected": -30729960.0, "logps/chosen": -115.28064727783203, "logps/rejected": -387.114501953125, "loss": 0.2764, "rewards/chosen": 0.37439364194869995, "rewards/margins": 2.6795379519462585, "rewards/rejected": -2.3051443099975586, "step": 12186 }, { "epoch": 0.6459597699626322, "grad_norm": 49.75, "kl": 1.0850391387939453, "learning_rate": 5e-07, "logits/chosen": -27138108.0, "logits/rejected": -25632720.0, "logps/chosen": -310.27288818359375, "logps/rejected": -350.0355224609375, "loss": 0.2746, "rewards/chosen": 0.6217159032821655, "rewards/margins": 3.000247359275818, "rewards/rejected": -2.3785314559936523, "step": 12187 }, { "epoch": 0.6460127739644344, "grad_norm": 66.5, "kl": 0.3194293975830078, "learning_rate": 5e-07, "logits/chosen": -35531381.333333336, "logits/rejected": -2447843.5, "logps/chosen": -323.2257486979167, "logps/rejected": -70.43971252441406, "loss": 0.3921, "rewards/chosen": 0.2126384178797404, "rewards/margins": 1.8275354305903118, "rewards/rejected": -1.6148970127105713, "step": 12188 }, { "epoch": 0.6460657779662364, "grad_norm": 44.75, "kl": 0.6854324340820312, "learning_rate": 5e-07, "logits/chosen": -33799000.0, "logits/rejected": -21460408.0, "logps/chosen": -446.8856506347656, "logps/rejected": -331.4688720703125, "loss": 0.2219, "rewards/chosen": 1.282067060470581, "rewards/margins": 3.163543224334717, "rewards/rejected": -1.8814761638641357, "step": 12189 }, { "epoch": 0.6461187819680386, "grad_norm": 65.0, "kl": 4.007728576660156, "learning_rate": 5e-07, "logits/chosen": -38665264.0, "logits/rejected": -15652648.0, "logps/chosen": -873.842529296875, "logps/rejected": -171.0821533203125, "loss": 0.3067, "rewards/chosen": 1.8129853010177612, "rewards/margins": 2.793850302696228, "rewards/rejected": -0.9808650016784668, "step": 12190 }, { "epoch": 0.6461717859698407, "grad_norm": 54.25, "kl": 3.07528018951416, "learning_rate": 5e-07, "logits/chosen": -28869392.0, "logits/rejected": -21138404.0, "logps/chosen": -298.43776448567706, "logps/rejected": -182.16665649414062, "loss": 0.3828, "rewards/chosen": 0.6989485422770182, "rewards/margins": 2.10419229666392, "rewards/rejected": -1.4052437543869019, "step": 12191 }, { "epoch": 0.6462247899716429, "grad_norm": 56.25, "kl": 1.504119873046875, "learning_rate": 5e-07, "logits/chosen": 45753888.0, "logits/rejected": -1323476.1666666667, "logps/chosen": -245.3107421875, "logps/rejected": -117.66644287109375, "loss": 0.3506, "rewards/chosen": 0.4865900993347168, "rewards/margins": 2.30751215616862, "rewards/rejected": -1.820922056833903, "step": 12192 }, { "epoch": 0.646277793973445, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53816644.0, "logits/rejected": -23659580.0, "logps/chosen": -219.7615509033203, "logps/rejected": -258.8846130371094, "loss": 0.3376, "rewards/chosen": -0.2638843357563019, "rewards/margins": 1.8532623946666718, "rewards/rejected": -2.1171467304229736, "step": 12193 }, { "epoch": 0.6463307979752472, "grad_norm": 57.75, "kl": 1.5397815704345703, "learning_rate": 5e-07, "logits/chosen": -53187238.4, "logits/rejected": -77987125.33333333, "logps/chosen": -316.6047119140625, "logps/rejected": -556.9798583984375, "loss": 0.3096, "rewards/chosen": 0.48985748291015624, "rewards/margins": 3.1436428705851234, "rewards/rejected": -2.6537853876749673, "step": 12194 }, { "epoch": 0.6463838019770493, "grad_norm": 34.5, "kl": 1.2492523193359375, "learning_rate": 5e-07, "logits/chosen": -26275394.0, "logits/rejected": -32958932.0, "logps/chosen": -357.6229553222656, "logps/rejected": -246.87161254882812, "loss": 0.1615, "rewards/chosen": 1.4898967742919922, "rewards/margins": 4.037670612335205, "rewards/rejected": -2.547773838043213, "step": 12195 }, { "epoch": 0.6464368059788514, "grad_norm": 51.5, "kl": 2.1420745849609375, "learning_rate": 5e-07, "logits/chosen": -16603368.0, "logits/rejected": -17879688.0, "logps/chosen": -365.087646484375, "logps/rejected": -456.1253967285156, "loss": 0.1964, "rewards/chosen": 1.5157612562179565, "rewards/margins": 3.6279741525650024, "rewards/rejected": -2.112212896347046, "step": 12196 }, { "epoch": 0.6464898099806535, "grad_norm": 31.875, "kl": 2.915034294128418, "learning_rate": 5e-07, "logits/chosen": -16594038.0, "logits/rejected": -51771536.0, "logps/chosen": -559.7825317382812, "logps/rejected": -423.2547302246094, "loss": 0.1947, "rewards/chosen": 1.6688765287399292, "rewards/margins": 4.857270121574402, "rewards/rejected": -3.1883935928344727, "step": 12197 }, { "epoch": 0.6465428139824557, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85289864.0, "logits/rejected": -37915328.0, "logps/chosen": -335.4369812011719, "logps/rejected": -307.7840881347656, "loss": 0.3024, "rewards/chosen": 0.4079345762729645, "rewards/margins": 1.8925407230854034, "rewards/rejected": -1.484606146812439, "step": 12198 }, { "epoch": 0.6465958179842578, "grad_norm": 41.0, "kl": 0.9017333984375, "learning_rate": 5e-07, "logits/chosen": -53427828.0, "logits/rejected": -10553315.333333334, "logps/chosen": -462.8221130371094, "logps/rejected": -121.87057495117188, "loss": 0.1846, "rewards/chosen": 1.3330703973770142, "rewards/margins": 3.597960432370504, "rewards/rejected": -2.2648900349934897, "step": 12199 }, { "epoch": 0.64664882198606, "grad_norm": 50.25, "kl": 1.6615638732910156, "learning_rate": 5e-07, "logits/chosen": -38692680.0, "logits/rejected": -9817608.0, "logps/chosen": -290.02386474609375, "logps/rejected": -247.8335418701172, "loss": 0.2809, "rewards/chosen": 0.5548107028007507, "rewards/margins": 3.0747944712638855, "rewards/rejected": -2.5199837684631348, "step": 12200 }, { "epoch": 0.6467018259878621, "grad_norm": 45.75, "kl": 2.363401412963867, "learning_rate": 5e-07, "logits/chosen": -23673560.0, "logits/rejected": 4242661.0, "logps/chosen": -265.996337890625, "logps/rejected": -331.1859130859375, "loss": 0.2964, "rewards/chosen": 0.5576590895652771, "rewards/margins": 2.672670543193817, "rewards/rejected": -2.11501145362854, "step": 12201 }, { "epoch": 0.6467548299896643, "grad_norm": 50.25, "kl": 0.5300569534301758, "learning_rate": 5e-07, "logits/chosen": -31900004.0, "logits/rejected": -9715384.0, "logps/chosen": -334.074951171875, "logps/rejected": -161.67755126953125, "loss": 0.2506, "rewards/chosen": 0.6651027798652649, "rewards/margins": 2.8443544507026672, "rewards/rejected": -2.1792516708374023, "step": 12202 }, { "epoch": 0.6468078339914664, "grad_norm": 51.0, "kl": 4.290265083312988, "learning_rate": 5e-07, "logits/chosen": -24854665.14285714, "logits/rejected": -90713504.0, "logps/chosen": -607.0751953125, "logps/rejected": -533.0303955078125, "loss": 0.3187, "rewards/chosen": 1.6054390498570033, "rewards/margins": 4.023883921759469, "rewards/rejected": -2.418444871902466, "step": 12203 }, { "epoch": 0.6468608379932685, "grad_norm": 66.5, "kl": 2.2376766204833984, "learning_rate": 5e-07, "logits/chosen": -31605896.0, "logits/rejected": 7129455.5, "logps/chosen": -418.6710205078125, "logps/rejected": -166.02835083007812, "loss": 0.2958, "rewards/chosen": 0.8895150423049927, "rewards/margins": 2.519906759262085, "rewards/rejected": -1.6303917169570923, "step": 12204 }, { "epoch": 0.6469138419950706, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18207504.0, "logits/rejected": -38134387.2, "logps/chosen": -315.6986897786458, "logps/rejected": -251.751416015625, "loss": 0.3052, "rewards/chosen": -0.3137397766113281, "rewards/margins": 2.0387910842895507, "rewards/rejected": -2.352530860900879, "step": 12205 }, { "epoch": 0.6469668459968728, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77784800.0, "logits/rejected": -29693187.2, "logps/chosen": -471.4644368489583, "logps/rejected": -258.01181640625, "loss": 0.2168, "rewards/chosen": 0.6634734869003296, "rewards/margins": 3.3422199010849, "rewards/rejected": -2.6787464141845705, "step": 12206 }, { "epoch": 0.6470198499986749, "grad_norm": 57.5, "kl": 0.5734939575195312, "learning_rate": 5e-07, "logits/chosen": -49511862.85714286, "logits/rejected": -66863824.0, "logps/chosen": -503.3579799107143, "logps/rejected": -649.3206787109375, "loss": 0.2441, "rewards/chosen": 1.0940848759242467, "rewards/margins": 5.988896983010428, "rewards/rejected": -4.894812107086182, "step": 12207 }, { "epoch": 0.647072854000477, "grad_norm": 55.25, "kl": 0.7443695068359375, "learning_rate": 5e-07, "logits/chosen": 1459683.0, "logits/rejected": -8287290.0, "logps/chosen": -233.9940948486328, "logps/rejected": -282.94512939453125, "loss": 0.2032, "rewards/chosen": 0.9778984189033508, "rewards/margins": 3.535384714603424, "rewards/rejected": -2.5574862957000732, "step": 12208 }, { "epoch": 0.6471258580022792, "grad_norm": 46.5, "kl": 1.7400436401367188, "learning_rate": 5e-07, "logits/chosen": -25416420.8, "logits/rejected": -12530186.666666666, "logps/chosen": -217.3917724609375, "logps/rejected": -345.6305338541667, "loss": 0.4124, "rewards/chosen": 0.014787900447845458, "rewards/margins": 1.9155916253725689, "rewards/rejected": -1.9008037249247234, "step": 12209 }, { "epoch": 0.6471788620040813, "grad_norm": 61.25, "kl": 3.678525924682617, "learning_rate": 5e-07, "logits/chosen": -33627846.4, "logits/rejected": -38755122.666666664, "logps/chosen": -373.6519775390625, "logps/rejected": -190.56266276041666, "loss": 0.3898, "rewards/chosen": 0.4711251735687256, "rewards/margins": 2.9106445789337156, "rewards/rejected": -2.4395194053649902, "step": 12210 }, { "epoch": 0.6472318660058834, "grad_norm": 43.75, "kl": 1.44854736328125, "learning_rate": 5e-07, "logits/chosen": -73637.5, "logits/rejected": -25603914.0, "logps/chosen": -66.94876098632812, "logps/rejected": -416.77880859375, "loss": 0.3091, "rewards/chosen": -0.0453634113073349, "rewards/margins": 2.5035174041986465, "rewards/rejected": -2.5488808155059814, "step": 12211 }, { "epoch": 0.6472848700076855, "grad_norm": 58.25, "kl": 4.401664733886719, "learning_rate": 5e-07, "logits/chosen": -23813653.333333332, "logits/rejected": -58176912.0, "logps/chosen": -431.6663411458333, "logps/rejected": -273.25213623046875, "loss": 0.315, "rewards/chosen": 1.1852453549702961, "rewards/margins": 2.943816264470418, "rewards/rejected": -1.758570909500122, "step": 12212 }, { "epoch": 0.6473378740094877, "grad_norm": 45.25, "kl": 4.200399398803711, "learning_rate": 5e-07, "logits/chosen": -11387640.8, "logits/rejected": -12118384.0, "logps/chosen": -146.13719482421874, "logps/rejected": -268.38547770182294, "loss": 0.4188, "rewards/chosen": 0.29489572048187257, "rewards/margins": 2.4558290561040246, "rewards/rejected": -2.160933335622152, "step": 12213 }, { "epoch": 0.6473908780112898, "grad_norm": 54.25, "kl": 4.760156631469727, "learning_rate": 5e-07, "logits/chosen": -22341189.333333332, "logits/rejected": -33490364.0, "logps/chosen": -457.680419921875, "logps/rejected": -398.7290344238281, "loss": 0.3063, "rewards/chosen": 1.3964037895202637, "rewards/margins": 4.253875017166138, "rewards/rejected": -2.857471227645874, "step": 12214 }, { "epoch": 0.647443882013092, "grad_norm": 47.25, "kl": 0.677947998046875, "learning_rate": 5e-07, "logits/chosen": -38344980.0, "logits/rejected": -26815056.0, "logps/chosen": -285.9189758300781, "logps/rejected": -271.2737223307292, "loss": 0.1986, "rewards/chosen": 1.4959274530410767, "rewards/margins": 3.3169622023900347, "rewards/rejected": -1.8210347493489583, "step": 12215 }, { "epoch": 0.6474968860148941, "grad_norm": 36.25, "kl": 1.1750259399414062, "learning_rate": 5e-07, "logits/chosen": -48277573.333333336, "logits/rejected": -38091708.8, "logps/chosen": -74.16553751627605, "logps/rejected": -224.833837890625, "loss": 0.3193, "rewards/chosen": -0.06919123729070027, "rewards/margins": 1.4779053966204325, "rewards/rejected": -1.5470966339111327, "step": 12216 }, { "epoch": 0.6475498900166963, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31392165.333333332, "logits/rejected": -5196020.8, "logps/chosen": -390.0509033203125, "logps/rejected": -334.2541015625, "loss": 0.2423, "rewards/chosen": 0.26632030804951984, "rewards/margins": 3.0195620377858483, "rewards/rejected": -2.7532417297363283, "step": 12217 }, { "epoch": 0.6476028940184984, "grad_norm": 39.25, "kl": 0.6636314392089844, "learning_rate": 5e-07, "logits/chosen": -35289032.0, "logits/rejected": -17271235.2, "logps/chosen": -406.0130615234375, "logps/rejected": -203.274365234375, "loss": 0.2524, "rewards/chosen": 1.1121505896250408, "rewards/margins": 2.744649902979533, "rewards/rejected": -1.6324993133544923, "step": 12218 }, { "epoch": 0.6476558980203005, "grad_norm": 78.0, "kl": 2.255870819091797, "learning_rate": 5e-07, "logits/chosen": -27239724.8, "logits/rejected": -27152458.666666668, "logps/chosen": -252.164013671875, "logps/rejected": -191.04777018229166, "loss": 0.4481, "rewards/chosen": 0.16337890625, "rewards/margins": 1.2199948946634929, "rewards/rejected": -1.056615988413493, "step": 12219 }, { "epoch": 0.6477089020221026, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3088343.5, "logits/rejected": -22201737.14285714, "logps/chosen": -52.78414535522461, "logps/rejected": -199.72739955357142, "loss": 0.2975, "rewards/chosen": 0.12476921081542969, "rewards/margins": 1.2819720676967077, "rewards/rejected": -1.157202856881278, "step": 12220 }, { "epoch": 0.6477619060239048, "grad_norm": 43.75, "kl": 0.6647243499755859, "learning_rate": 5e-07, "logits/chosen": -33317574.4, "logits/rejected": -23717949.333333332, "logps/chosen": -301.126025390625, "logps/rejected": -335.03456624348956, "loss": 0.2883, "rewards/chosen": 0.38822784423828127, "rewards/margins": 4.546458943684896, "rewards/rejected": -4.158231099446614, "step": 12221 }, { "epoch": 0.6478149100257069, "grad_norm": 40.75, "kl": 0.40503692626953125, "learning_rate": 5e-07, "logits/chosen": -25817498.0, "logits/rejected": -26211340.0, "logps/chosen": -143.42608642578125, "logps/rejected": -296.5270690917969, "loss": 0.193, "rewards/chosen": 0.9726564288139343, "rewards/margins": 4.642065703868866, "rewards/rejected": -3.6694092750549316, "step": 12222 }, { "epoch": 0.6478679140275091, "grad_norm": 61.25, "kl": 3.05003023147583, "learning_rate": 5e-07, "logits/chosen": -8761608.0, "logits/rejected": -23510842.666666668, "logps/chosen": -287.42999267578125, "logps/rejected": -224.96441650390625, "loss": 0.3909, "rewards/chosen": 0.8866968154907227, "rewards/margins": 1.936638355255127, "rewards/rejected": -1.0499415397644043, "step": 12223 }, { "epoch": 0.6479209180293112, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60510328.0, "logits/rejected": -12032421.714285715, "logps/chosen": -569.976806640625, "logps/rejected": -443.439697265625, "loss": 0.1929, "rewards/chosen": 1.18511962890625, "rewards/margins": 3.096684047154018, "rewards/rejected": -1.9115644182477678, "step": 12224 }, { "epoch": 0.6479739220311134, "grad_norm": 29.375, "kl": 1.6571435928344727, "learning_rate": 5e-07, "logits/chosen": -5893348.0, "logits/rejected": -56887600.0, "logps/chosen": -290.8504333496094, "logps/rejected": -424.6603698730469, "loss": 0.2149, "rewards/chosen": 1.2754032611846924, "rewards/margins": 3.7749204635620117, "rewards/rejected": -2.4995172023773193, "step": 12225 }, { "epoch": 0.6480269260329155, "grad_norm": 54.0, "kl": 2.1001510620117188, "learning_rate": 5e-07, "logits/chosen": -8990562.4, "logits/rejected": -48152122.666666664, "logps/chosen": -306.339892578125, "logps/rejected": -375.0970865885417, "loss": 0.3543, "rewards/chosen": 0.630849552154541, "rewards/margins": 2.561275513966878, "rewards/rejected": -1.9304259618123372, "step": 12226 }, { "epoch": 0.6480799300347176, "grad_norm": 73.5, "kl": 1.4413633346557617, "learning_rate": 5e-07, "logits/chosen": -51363257.6, "logits/rejected": -18323126.666666668, "logps/chosen": -295.6677978515625, "logps/rejected": -285.28021240234375, "loss": 0.3324, "rewards/chosen": 0.24607160091400146, "rewards/margins": 3.0358487208684286, "rewards/rejected": -2.7897771199544272, "step": 12227 }, { "epoch": 0.6481329340365197, "grad_norm": 36.25, "kl": 0.08629417419433594, "learning_rate": 5e-07, "logits/chosen": -33310468.0, "logits/rejected": -39240792.0, "logps/chosen": -241.3361358642578, "logps/rejected": -252.81233723958334, "loss": 0.1485, "rewards/chosen": 0.42029017210006714, "rewards/margins": 3.26005889972051, "rewards/rejected": -2.839768727620443, "step": 12228 }, { "epoch": 0.6481859380383219, "grad_norm": 62.75, "kl": 0.5025482177734375, "learning_rate": 5e-07, "logits/chosen": -24252397.333333332, "logits/rejected": -42213107.2, "logps/chosen": -358.5922037760417, "logps/rejected": -347.130029296875, "loss": 0.2894, "rewards/chosen": 0.16860926151275635, "rewards/margins": 2.3092655420303343, "rewards/rejected": -2.140656280517578, "step": 12229 }, { "epoch": 0.648238942040124, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48651382.4, "logits/rejected": -103522325.33333333, "logps/chosen": -302.41962890625, "logps/rejected": -935.5665690104166, "loss": 0.2875, "rewards/chosen": 0.2677211046218872, "rewards/margins": 4.30616721312205, "rewards/rejected": -4.038446108500163, "step": 12230 }, { "epoch": 0.6482919460419262, "grad_norm": 43.25, "kl": 1.2182507514953613, "learning_rate": 5e-07, "logits/chosen": -23227040.0, "logits/rejected": -39833548.0, "logps/chosen": -198.12530517578125, "logps/rejected": -277.9512939453125, "loss": 0.3532, "rewards/chosen": 0.42987263202667236, "rewards/margins": 2.7265034914016724, "rewards/rejected": -2.296630859375, "step": 12231 }, { "epoch": 0.6483449500437283, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41104376.0, "logits/rejected": -25598550.0, "logps/chosen": -285.57391357421875, "logps/rejected": -431.0761413574219, "loss": 0.3191, "rewards/chosen": 0.00697346031665802, "rewards/margins": 2.0263318568468094, "rewards/rejected": -2.0193583965301514, "step": 12232 }, { "epoch": 0.6483979540455305, "grad_norm": 45.25, "kl": 1.5872278213500977, "learning_rate": 5e-07, "logits/chosen": -47383235.2, "logits/rejected": -21313890.666666668, "logps/chosen": -206.114990234375, "logps/rejected": -401.4479166666667, "loss": 0.2911, "rewards/chosen": 0.6667423248291016, "rewards/margins": 2.9496933619181314, "rewards/rejected": -2.28295103708903, "step": 12233 }, { "epoch": 0.6484509580473325, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 777350.0, "logits/rejected": 48933498.666666664, "logps/chosen": -427.9105224609375, "logps/rejected": -310.2880859375, "loss": 0.2252, "rewards/chosen": 0.9093902707099915, "rewards/margins": 2.767485598723094, "rewards/rejected": -1.8580953280131023, "step": 12234 }, { "epoch": 0.6485039620491347, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41873813.333333336, "logits/rejected": -38157337.6, "logps/chosen": -559.185791015625, "logps/rejected": -393.253564453125, "loss": 0.2255, "rewards/chosen": -0.10422058900197347, "rewards/margins": 3.3869850079218544, "rewards/rejected": -3.491205596923828, "step": 12235 }, { "epoch": 0.6485569660509368, "grad_norm": 47.5, "kl": 0.55657958984375, "learning_rate": 5e-07, "logits/chosen": -7075833.333333333, "logits/rejected": -35780828.8, "logps/chosen": -1030.0948893229167, "logps/rejected": -354.237060546875, "loss": 0.2486, "rewards/chosen": 0.7437075773874918, "rewards/margins": 3.056907002131144, "rewards/rejected": -2.3131994247436523, "step": 12236 }, { "epoch": 0.648609970052739, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30816060.0, "logits/rejected": -26980834.0, "logps/chosen": -272.1442565917969, "logps/rejected": -295.4794921875, "loss": 0.2172, "rewards/chosen": 1.0038983821868896, "rewards/margins": 4.405335426330566, "rewards/rejected": -3.4014370441436768, "step": 12237 }, { "epoch": 0.6486629740545411, "grad_norm": 37.5, "kl": 1.3685550689697266, "learning_rate": 5e-07, "logits/chosen": -11631321.333333334, "logits/rejected": 2590796.6, "logps/chosen": -206.50675455729166, "logps/rejected": -265.1300537109375, "loss": 0.2354, "rewards/chosen": 0.8074841499328613, "rewards/margins": 3.233270359039307, "rewards/rejected": -2.4257862091064455, "step": 12238 }, { "epoch": 0.6487159780563433, "grad_norm": 47.75, "kl": 1.8760929107666016, "learning_rate": 5e-07, "logits/chosen": -1931462.5, "logits/rejected": -26243581.333333332, "logps/chosen": -32.2241325378418, "logps/rejected": -361.617919921875, "loss": 0.2744, "rewards/chosen": -0.30365344882011414, "rewards/margins": 1.7665592332681022, "rewards/rejected": -2.0702126820882163, "step": 12239 }, { "epoch": 0.6487689820581454, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15592873.6, "logits/rejected": -2513663.6666666665, "logps/chosen": -286.413623046875, "logps/rejected": -234.459716796875, "loss": 0.4015, "rewards/chosen": -0.12992508411407472, "rewards/margins": 1.6540950218836468, "rewards/rejected": -1.7840201059977214, "step": 12240 }, { "epoch": 0.6488219860599476, "grad_norm": 43.75, "kl": 0.5970077514648438, "learning_rate": 5e-07, "logits/chosen": -34534496.0, "logits/rejected": -31570314.666666668, "logps/chosen": -341.31298828125, "logps/rejected": -91.49957275390625, "loss": 0.2617, "rewards/chosen": 0.8738720893859864, "rewards/margins": 3.447908369700114, "rewards/rejected": -2.5740362803141275, "step": 12241 }, { "epoch": 0.6488749900617496, "grad_norm": 23.625, "kl": 2.8096961975097656, "learning_rate": 5e-07, "logits/chosen": 2652663.6666666665, "logits/rejected": -8901051.2, "logps/chosen": -89.00875854492188, "logps/rejected": -219.2738037109375, "loss": 0.1821, "rewards/chosen": 1.0287857850392659, "rewards/margins": 4.540049823125203, "rewards/rejected": -3.5112640380859377, "step": 12242 }, { "epoch": 0.6489279940635518, "grad_norm": 51.0, "kl": 1.9620628356933594, "learning_rate": 5e-07, "logits/chosen": -40998500.0, "logits/rejected": -30031554.0, "logps/chosen": -239.87364196777344, "logps/rejected": -439.19873046875, "loss": 0.3379, "rewards/chosen": 0.40299922227859497, "rewards/margins": 2.4622610211372375, "rewards/rejected": -2.0592617988586426, "step": 12243 }, { "epoch": 0.6489809980653539, "grad_norm": 51.75, "kl": 0.6311817169189453, "learning_rate": 5e-07, "logits/chosen": -17249193.6, "logits/rejected": -783333.0, "logps/chosen": -319.453076171875, "logps/rejected": -135.3204549153646, "loss": 0.2894, "rewards/chosen": 0.7015727996826172, "rewards/margins": 2.6874913215637206, "rewards/rejected": -1.9859185218811035, "step": 12244 }, { "epoch": 0.6490340020671561, "grad_norm": 62.75, "kl": 3.988037109375, "learning_rate": 5e-07, "logits/chosen": -30869654.85714286, "logits/rejected": -10204206.0, "logps/chosen": -308.389892578125, "logps/rejected": -144.70413208007812, "loss": 0.434, "rewards/chosen": 0.6722406659807477, "rewards/margins": 1.4828897629465376, "rewards/rejected": -0.8106490969657898, "step": 12245 }, { "epoch": 0.6490870060689582, "grad_norm": 59.0, "kl": 0.13245010375976562, "learning_rate": 5e-07, "logits/chosen": 50966634.666666664, "logits/rejected": -11476736.8, "logps/chosen": -299.85414632161456, "logps/rejected": -332.543310546875, "loss": 0.312, "rewards/chosen": -0.04315255582332611, "rewards/margins": 2.032008042931557, "rewards/rejected": -2.075160598754883, "step": 12246 }, { "epoch": 0.6491400100707604, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -121417.66666666667, "logits/rejected": -34144534.4, "logps/chosen": -294.1792399088542, "logps/rejected": -149.48446044921874, "loss": 0.2074, "rewards/chosen": 0.3159248431523641, "rewards/margins": 3.312948234875997, "rewards/rejected": -2.9970233917236326, "step": 12247 }, { "epoch": 0.6491930140725625, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13203098.0, "logits/rejected": -32962464.0, "logps/chosen": -176.6407470703125, "logps/rejected": -299.1809895833333, "loss": 0.2777, "rewards/chosen": -0.13707847893238068, "rewards/margins": 1.8702757805585861, "rewards/rejected": -2.007354259490967, "step": 12248 }, { "epoch": 0.6492460180743647, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5940399.333333333, "logits/rejected": -10455793.6, "logps/chosen": -112.86890665690105, "logps/rejected": -205.3411376953125, "loss": 0.2036, "rewards/chosen": 0.7635815938313802, "rewards/margins": 2.886550267537435, "rewards/rejected": -2.1229686737060547, "step": 12249 }, { "epoch": 0.6492990220761667, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 133722176.0, "logits/rejected": -21453481.14285714, "logps/chosen": -203.57357788085938, "logps/rejected": -239.05777413504464, "loss": 0.276, "rewards/chosen": -0.0903778076171875, "rewards/margins": 1.5536509922572546, "rewards/rejected": -1.644028799874442, "step": 12250 }, { "epoch": 0.6493520260779689, "grad_norm": 91.0, "kl": 0.13773727416992188, "learning_rate": 5e-07, "logits/chosen": -2913511.6666666665, "logits/rejected": 47487974.4, "logps/chosen": -167.9315185546875, "logps/rejected": -507.740283203125, "loss": 0.2575, "rewards/chosen": 1.6992638905843098, "rewards/margins": 2.81029733022054, "rewards/rejected": -1.1110334396362305, "step": 12251 }, { "epoch": 0.649405030079771, "grad_norm": 46.0, "kl": 1.10009765625, "learning_rate": 5e-07, "logits/chosen": -10265957.0, "logits/rejected": -30272916.0, "logps/chosen": -478.05401611328125, "logps/rejected": -228.54913330078125, "loss": 0.2895, "rewards/chosen": 0.49515706300735474, "rewards/margins": 2.4677351117134094, "rewards/rejected": -1.9725780487060547, "step": 12252 }, { "epoch": 0.6494580340815732, "grad_norm": 52.25, "kl": 0.5291023254394531, "learning_rate": 5e-07, "logits/chosen": -43663856.0, "logits/rejected": -85787168.0, "logps/chosen": -261.4732971191406, "logps/rejected": -614.4867553710938, "loss": 0.281, "rewards/chosen": 0.06534042954444885, "rewards/margins": 3.21459236741066, "rewards/rejected": -3.149251937866211, "step": 12253 }, { "epoch": 0.6495110380833753, "grad_norm": 60.25, "kl": 2.954500198364258, "learning_rate": 5e-07, "logits/chosen": -45025494.4, "logits/rejected": -30826845.333333332, "logps/chosen": -410.931884765625, "logps/rejected": -291.9225260416667, "loss": 0.3695, "rewards/chosen": 0.5955126762390137, "rewards/margins": 1.9229737599690755, "rewards/rejected": -1.3274610837300618, "step": 12254 }, { "epoch": 0.6495640420851775, "grad_norm": 54.0, "kl": 1.5604629516601562, "learning_rate": 5e-07, "logits/chosen": -49106528.0, "logits/rejected": -24058522.0, "logps/chosen": -510.4278564453125, "logps/rejected": -279.49774169921875, "loss": 0.3622, "rewards/chosen": 0.5017022291819254, "rewards/margins": 3.0189360777537027, "rewards/rejected": -2.5172338485717773, "step": 12255 }, { "epoch": 0.6496170460869796, "grad_norm": 40.75, "kl": 0.5767250061035156, "learning_rate": 5e-07, "logits/chosen": 12488608.0, "logits/rejected": -25750715.42857143, "logps/chosen": -54.8157958984375, "logps/rejected": -368.54495675223217, "loss": 0.2017, "rewards/chosen": 0.5010578036308289, "rewards/margins": 2.765339893954141, "rewards/rejected": -2.264282090323312, "step": 12256 }, { "epoch": 0.6496700500887816, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57523380.0, "logits/rejected": -26200806.0, "logps/chosen": -417.37109375, "logps/rejected": -313.75787353515625, "loss": 0.2465, "rewards/chosen": 0.675727903842926, "rewards/margins": 3.009939730167389, "rewards/rejected": -2.334211826324463, "step": 12257 }, { "epoch": 0.6497230540905838, "grad_norm": 52.5, "kl": 0.17751312255859375, "learning_rate": 5e-07, "logits/chosen": -57636672.0, "logits/rejected": -42041008.0, "logps/chosen": -262.8016357421875, "logps/rejected": -314.4924011230469, "loss": 0.2864, "rewards/chosen": 0.22431260347366333, "rewards/margins": 2.661206066608429, "rewards/rejected": -2.4368934631347656, "step": 12258 }, { "epoch": 0.6497760580923859, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36978517.333333336, "logits/rejected": -22674692.8, "logps/chosen": -386.3512369791667, "logps/rejected": -331.77314453125, "loss": 0.281, "rewards/chosen": 0.6614471276601156, "rewards/margins": 2.3152726968129476, "rewards/rejected": -1.653825569152832, "step": 12259 }, { "epoch": 0.6498290620941881, "grad_norm": 43.25, "kl": 2.901073455810547, "learning_rate": 5e-07, "logits/chosen": -45693237.333333336, "logits/rejected": -23073036.8, "logps/chosen": -626.1326090494791, "logps/rejected": -377.8728515625, "loss": 0.2557, "rewards/chosen": 1.4677988688151042, "rewards/margins": 3.4725805918375654, "rewards/rejected": -2.004781723022461, "step": 12260 }, { "epoch": 0.6498820660959902, "grad_norm": 34.75, "kl": 0.7350502014160156, "learning_rate": 5e-07, "logits/chosen": -13947645.333333334, "logits/rejected": -13869465.6, "logps/chosen": -152.12665812174478, "logps/rejected": -163.50855712890626, "loss": 0.2495, "rewards/chosen": 0.8882260322570801, "rewards/margins": 2.9644192695617675, "rewards/rejected": -2.0761932373046874, "step": 12261 }, { "epoch": 0.6499350700977924, "grad_norm": 47.5, "kl": 1.8281135559082031, "learning_rate": 5e-07, "logits/chosen": -50373174.4, "logits/rejected": -8073956.0, "logps/chosen": -254.23837890625, "logps/rejected": -272.1788330078125, "loss": 0.3478, "rewards/chosen": 0.2900080680847168, "rewards/margins": 2.707301616668701, "rewards/rejected": -2.4172935485839844, "step": 12262 }, { "epoch": 0.6499880740995945, "grad_norm": 49.25, "kl": 0.1528186798095703, "learning_rate": 5e-07, "logits/chosen": -9253476.666666666, "logits/rejected": -13111498.0, "logps/chosen": -229.33538818359375, "logps/rejected": -294.0530700683594, "loss": 0.3242, "rewards/chosen": 0.5957045555114746, "rewards/margins": 2.662487030029297, "rewards/rejected": -2.0667824745178223, "step": 12263 }, { "epoch": 0.6500410781013967, "grad_norm": 40.75, "kl": 0.8630847930908203, "learning_rate": 5e-07, "logits/chosen": -17891760.0, "logits/rejected": -24493637.333333332, "logps/chosen": -139.1680145263672, "logps/rejected": -345.0900472005208, "loss": 0.1882, "rewards/chosen": 0.8190693855285645, "rewards/margins": 3.6815199851989746, "rewards/rejected": -2.86245059967041, "step": 12264 }, { "epoch": 0.6500940821031987, "grad_norm": 39.5, "kl": 4.037660598754883, "learning_rate": 5e-07, "logits/chosen": -9796482.4, "logits/rejected": -63498208.0, "logps/chosen": -274.251416015625, "logps/rejected": -497.0214436848958, "loss": 0.2197, "rewards/chosen": 1.4347508430480957, "rewards/margins": 4.031195290883383, "rewards/rejected": -2.5964444478352866, "step": 12265 }, { "epoch": 0.6501470861050009, "grad_norm": 47.5, "kl": 1.3260345458984375, "learning_rate": 5e-07, "logits/chosen": -51159468.8, "logits/rejected": -31877024.0, "logps/chosen": -309.522119140625, "logps/rejected": -449.6067708333333, "loss": 0.3136, "rewards/chosen": 0.5333490371704102, "rewards/margins": 3.0336524645487466, "rewards/rejected": -2.5003034273783364, "step": 12266 }, { "epoch": 0.650200090106803, "grad_norm": 49.25, "kl": 2.8311233520507812, "learning_rate": 5e-07, "logits/chosen": -17401088.0, "logits/rejected": 45359792.0, "logps/chosen": -256.6671875, "logps/rejected": -171.1308797200521, "loss": 0.4218, "rewards/chosen": 0.1992759108543396, "rewards/margins": 2.4253103931744895, "rewards/rejected": -2.22603448232015, "step": 12267 }, { "epoch": 0.6502530941086052, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9809044.0, "logits/rejected": -54022986.666666664, "logps/chosen": -287.45062255859375, "logps/rejected": -306.5447591145833, "loss": 0.2785, "rewards/chosen": 0.10164108127355576, "rewards/margins": 1.8144617701570194, "rewards/rejected": -1.7128206888834636, "step": 12268 }, { "epoch": 0.6503060981104073, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44153053.333333336, "logits/rejected": -14585668.8, "logps/chosen": -177.376953125, "logps/rejected": -171.55823974609376, "loss": 0.2453, "rewards/chosen": 0.44015733400980633, "rewards/margins": 2.547484223047892, "rewards/rejected": -2.107326889038086, "step": 12269 }, { "epoch": 0.6503591021122095, "grad_norm": 41.0, "kl": 1.978475570678711, "learning_rate": 5e-07, "logits/chosen": -7255005.0, "logits/rejected": -59065568.0, "logps/chosen": -274.2583923339844, "logps/rejected": -508.52197265625, "loss": 0.2724, "rewards/chosen": 0.44253653287887573, "rewards/margins": 2.995175540447235, "rewards/rejected": -2.5526390075683594, "step": 12270 }, { "epoch": 0.6504121061140116, "grad_norm": 54.75, "kl": 1.4694633483886719, "learning_rate": 5e-07, "logits/chosen": -41950532.0, "logits/rejected": -59523320.0, "logps/chosen": -314.4752197265625, "logps/rejected": -521.6065673828125, "loss": 0.2955, "rewards/chosen": 0.5513371229171753, "rewards/margins": 3.1188215017318726, "rewards/rejected": -2.5674843788146973, "step": 12271 }, { "epoch": 0.6504651101158138, "grad_norm": 58.5, "kl": 1.4939804077148438, "learning_rate": 5e-07, "logits/chosen": -65048554.666666664, "logits/rejected": -47840360.0, "logps/chosen": -342.3733723958333, "logps/rejected": -322.80670166015625, "loss": 0.3514, "rewards/chosen": 0.5283731619517008, "rewards/margins": 2.5112620989481607, "rewards/rejected": -1.98288893699646, "step": 12272 }, { "epoch": 0.6505181141176158, "grad_norm": 49.75, "kl": 0.9713916778564453, "learning_rate": 5e-07, "logits/chosen": -11701720.0, "logits/rejected": 3175545.5, "logps/chosen": -233.9432576497396, "logps/rejected": -96.35909271240234, "loss": 0.406, "rewards/chosen": 0.09297557671864827, "rewards/margins": 2.3696875174840293, "rewards/rejected": -2.276711940765381, "step": 12273 }, { "epoch": 0.650571118119418, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31188133.333333332, "logits/rejected": -9361260.0, "logps/chosen": -380.1894938151042, "logps/rejected": -156.34942626953125, "loss": 0.2681, "rewards/chosen": 0.09322611490885417, "rewards/margins": 2.0250040690104165, "rewards/rejected": -1.9317779541015625, "step": 12274 }, { "epoch": 0.6506241221212201, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31234604.0, "logits/rejected": -27905637.333333332, "logps/chosen": -234.6261444091797, "logps/rejected": -419.3985188802083, "loss": 0.1369, "rewards/chosen": 0.6080513000488281, "rewards/margins": 3.8308404286702475, "rewards/rejected": -3.2227891286214194, "step": 12275 }, { "epoch": 0.6506771261230223, "grad_norm": 47.0, "kl": 2.7355756759643555, "learning_rate": 5e-07, "logits/chosen": -13637754.666666666, "logits/rejected": -8400497.6, "logps/chosen": -313.450439453125, "logps/rejected": -353.0851318359375, "loss": 0.1964, "rewards/chosen": 1.232511838277181, "rewards/margins": 3.0976260503133135, "rewards/rejected": -1.8651142120361328, "step": 12276 }, { "epoch": 0.6507301301248244, "grad_norm": 39.0, "kl": 1.2425956726074219, "learning_rate": 5e-07, "logits/chosen": -14581435.2, "logits/rejected": -31592008.0, "logps/chosen": -213.393310546875, "logps/rejected": -283.3169352213542, "loss": 0.2491, "rewards/chosen": 1.1024413108825684, "rewards/margins": 3.2196879386901855, "rewards/rejected": -2.117246627807617, "step": 12277 }, { "epoch": 0.6507831341266266, "grad_norm": 57.0, "kl": 6.499439239501953, "learning_rate": 5e-07, "logits/chosen": -1780009.6, "logits/rejected": -19699937.333333332, "logps/chosen": -823.7365234375, "logps/rejected": -111.5386962890625, "loss": 0.3318, "rewards/chosen": 1.740506362915039, "rewards/margins": 2.8218969027201335, "rewards/rejected": -1.0813905398050945, "step": 12278 }, { "epoch": 0.6508361381284287, "grad_norm": 44.5, "kl": 1.1182880401611328, "learning_rate": 5e-07, "logits/chosen": -23964361.6, "logits/rejected": -27947029.333333332, "logps/chosen": -241.180078125, "logps/rejected": -273.91831461588544, "loss": 0.3026, "rewards/chosen": 0.4150986671447754, "rewards/margins": 3.4194350242614746, "rewards/rejected": -3.004336357116699, "step": 12279 }, { "epoch": 0.6508891421302309, "grad_norm": 48.75, "kl": 1.1785449981689453, "learning_rate": 5e-07, "logits/chosen": -11543582.666666666, "logits/rejected": -9415423.0, "logps/chosen": -232.46319580078125, "logps/rejected": -259.04913330078125, "loss": 0.374, "rewards/chosen": 0.543463945388794, "rewards/margins": 1.8056435585021973, "rewards/rejected": -1.2621796131134033, "step": 12280 }, { "epoch": 0.6509421461320329, "grad_norm": 50.75, "kl": 0.17537879943847656, "learning_rate": 5e-07, "logits/chosen": -29887453.333333332, "logits/rejected": -35083043.2, "logps/chosen": -344.7188313802083, "logps/rejected": -227.180078125, "loss": 0.2865, "rewards/chosen": 0.3631940285364787, "rewards/margins": 1.9205873886744182, "rewards/rejected": -1.5573933601379395, "step": 12281 }, { "epoch": 0.6509951501338351, "grad_norm": 58.0, "kl": 1.2235736846923828, "learning_rate": 5e-07, "logits/chosen": -52274073.6, "logits/rejected": -5626486.0, "logps/chosen": -284.045751953125, "logps/rejected": -164.29366048177084, "loss": 0.3185, "rewards/chosen": 0.3734171390533447, "rewards/margins": 2.299538691838582, "rewards/rejected": -1.9261215527852376, "step": 12282 }, { "epoch": 0.6510481541356372, "grad_norm": 54.25, "kl": 0.9354887008666992, "learning_rate": 5e-07, "logits/chosen": -15786372.0, "logits/rejected": -2290059.0, "logps/chosen": -258.5043131510417, "logps/rejected": -92.8895263671875, "loss": 0.2537, "rewards/chosen": 0.9483351707458496, "rewards/margins": 4.244397163391113, "rewards/rejected": -3.2960619926452637, "step": 12283 }, { "epoch": 0.6511011581374394, "grad_norm": 48.25, "kl": 1.5116653442382812, "learning_rate": 5e-07, "logits/chosen": -49174012.0, "logits/rejected": -15858784.0, "logps/chosen": -391.49725341796875, "logps/rejected": -303.4457702636719, "loss": 0.2062, "rewards/chosen": 1.2813470363616943, "rewards/margins": 3.2859089374542236, "rewards/rejected": -2.0045619010925293, "step": 12284 }, { "epoch": 0.6511541621392415, "grad_norm": 36.25, "kl": 0.39009857177734375, "learning_rate": 5e-07, "logits/chosen": -63324500.0, "logits/rejected": -31949988.0, "logps/chosen": -1036.4039306640625, "logps/rejected": -589.6055908203125, "loss": 0.186, "rewards/chosen": 1.3187841176986694, "rewards/margins": 5.535034537315369, "rewards/rejected": -4.216250419616699, "step": 12285 }, { "epoch": 0.6512071661410437, "grad_norm": 43.5, "kl": 1.978865623474121, "learning_rate": 5e-07, "logits/chosen": -26550124.8, "logits/rejected": 101773962.66666667, "logps/chosen": -274.1337890625, "logps/rejected": -109.03646850585938, "loss": 0.3825, "rewards/chosen": 0.4283622741699219, "rewards/margins": 1.8443049430847167, "rewards/rejected": -1.415942668914795, "step": 12286 }, { "epoch": 0.6512601701428458, "grad_norm": 56.0, "kl": 0.7040119171142578, "learning_rate": 5e-07, "logits/chosen": -21459378.666666668, "logits/rejected": -49391840.0, "logps/chosen": -251.25799560546875, "logps/rejected": -614.5966186523438, "loss": 0.34, "rewards/chosen": 0.29792169729868573, "rewards/margins": 4.4874439636866255, "rewards/rejected": -4.1895222663879395, "step": 12287 }, { "epoch": 0.651313174144648, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30231466.666666668, "logits/rejected": -208780.0, "logps/chosen": -597.6111246744791, "logps/rejected": -70.87503662109376, "loss": 0.3194, "rewards/chosen": -0.16033935546875, "rewards/margins": 1.6714149475097657, "rewards/rejected": -1.8317543029785157, "step": 12288 }, { "epoch": 0.65136617814645, "grad_norm": 55.5, "kl": 1.6822509765625, "learning_rate": 5e-07, "logits/chosen": -934322.3333333334, "logits/rejected": 10430468.0, "logps/chosen": -105.01540120442708, "logps/rejected": -151.4409423828125, "loss": 0.3105, "rewards/chosen": 0.9993139902750651, "rewards/margins": 2.258595339457194, "rewards/rejected": -1.2592813491821289, "step": 12289 }, { "epoch": 0.6514191821482522, "grad_norm": 82.0, "kl": 1.005462646484375, "learning_rate": 5e-07, "logits/chosen": 7565041.6, "logits/rejected": 21139524.0, "logps/chosen": -440.10400390625, "logps/rejected": -159.07664998372397, "loss": 0.3709, "rewards/chosen": 0.5545117855072021, "rewards/margins": 1.5766019821166992, "rewards/rejected": -1.022090196609497, "step": 12290 }, { "epoch": 0.6514721861500543, "grad_norm": 51.75, "kl": 0.8146333694458008, "learning_rate": 5e-07, "logits/chosen": -24622170.666666668, "logits/rejected": 5575663.6, "logps/chosen": -231.59745279947916, "logps/rejected": -346.863037109375, "loss": 0.251, "rewards/chosen": 0.4094378153483073, "rewards/margins": 2.4276823679606117, "rewards/rejected": -2.0182445526123045, "step": 12291 }, { "epoch": 0.6515251901518565, "grad_norm": 31.25, "kl": 2.9882755279541016, "learning_rate": 5e-07, "logits/chosen": -19590705.6, "logits/rejected": -43337933.333333336, "logps/chosen": -212.0455078125, "logps/rejected": -420.89892578125, "loss": 0.3422, "rewards/chosen": 0.5766413688659668, "rewards/margins": 3.1794271151224773, "rewards/rejected": -2.6027857462565103, "step": 12292 }, { "epoch": 0.6515781941536586, "grad_norm": 46.75, "kl": 1.7805099487304688, "learning_rate": 5e-07, "logits/chosen": -3663064.3333333335, "logits/rejected": -11809801.6, "logps/chosen": -648.5069986979166, "logps/rejected": -160.53511962890624, "loss": 0.3247, "rewards/chosen": 1.0820651054382324, "rewards/margins": 2.424695110321045, "rewards/rejected": -1.3426300048828126, "step": 12293 }, { "epoch": 0.6516311981554608, "grad_norm": 56.75, "kl": 1.3086090087890625, "learning_rate": 5e-07, "logits/chosen": -30486252.8, "logits/rejected": -64211168.0, "logps/chosen": -452.380859375, "logps/rejected": -183.7989705403646, "loss": 0.2987, "rewards/chosen": 0.8817634582519531, "rewards/margins": 2.688915093739827, "rewards/rejected": -1.8071516354878743, "step": 12294 }, { "epoch": 0.6516842021572629, "grad_norm": 68.5, "kl": 4.082788467407227, "learning_rate": 5e-07, "logits/chosen": 4495491.2, "logits/rejected": -10921128.666666666, "logps/chosen": -333.214990234375, "logps/rejected": -189.78190104166666, "loss": 0.3458, "rewards/chosen": 0.6287076950073243, "rewards/margins": 3.581532287597656, "rewards/rejected": -2.952824592590332, "step": 12295 }, { "epoch": 0.651737206159065, "grad_norm": 34.25, "kl": 0.4898681640625, "learning_rate": 5e-07, "logits/chosen": -8533051.0, "logits/rejected": -50965552.0, "logps/chosen": -159.0766143798828, "logps/rejected": -474.9029846191406, "loss": 0.2627, "rewards/chosen": 0.5155378580093384, "rewards/margins": 3.4691962003707886, "rewards/rejected": -2.95365834236145, "step": 12296 }, { "epoch": 0.6517902101608671, "grad_norm": 64.0, "kl": 5.52116584777832, "learning_rate": 5e-07, "logits/chosen": -21293376.0, "logits/rejected": -5890282.5, "logps/chosen": -380.2925618489583, "logps/rejected": -251.6140899658203, "loss": 0.4051, "rewards/chosen": 0.7661072413126627, "rewards/margins": 1.853780428568522, "rewards/rejected": -1.0876731872558594, "step": 12297 }, { "epoch": 0.6518432141626693, "grad_norm": 39.0, "kl": 2.345465660095215, "learning_rate": 5e-07, "logits/chosen": -4976949.0, "logits/rejected": -47193164.0, "logps/chosen": -214.59834798177084, "logps/rejected": -392.1915283203125, "loss": 0.3479, "rewards/chosen": 0.6850893497467041, "rewards/margins": 3.5903449058532715, "rewards/rejected": -2.9052555561065674, "step": 12298 }, { "epoch": 0.6518962181644714, "grad_norm": 62.75, "kl": 1.4286041259765625, "learning_rate": 5e-07, "logits/chosen": -71752608.0, "logits/rejected": -46663212.0, "logps/chosen": -613.6563720703125, "logps/rejected": -343.23211669921875, "loss": 0.2351, "rewards/chosen": 1.343424677848816, "rewards/margins": 2.9696625471115112, "rewards/rejected": -1.6262378692626953, "step": 12299 }, { "epoch": 0.6519492221662736, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -118333104.0, "logits/rejected": -20708361.14285714, "logps/chosen": -317.0361022949219, "logps/rejected": -366.9151088169643, "loss": 0.2049, "rewards/chosen": -0.412484735250473, "rewards/margins": 1.8738914004393985, "rewards/rejected": -2.2863761356898715, "step": 12300 }, { "epoch": 0.6520022261680757, "grad_norm": 86.0, "kl": 4.9637908935546875, "learning_rate": 5e-07, "logits/chosen": -78285100.8, "logits/rejected": -2613702.6666666665, "logps/chosen": -495.74677734375, "logps/rejected": -92.49078369140625, "loss": 0.3432, "rewards/chosen": 1.2456487655639648, "rewards/margins": 2.5736722787221273, "rewards/rejected": -1.3280235131581624, "step": 12301 }, { "epoch": 0.6520552301698779, "grad_norm": 43.5, "kl": 2.7932605743408203, "learning_rate": 5e-07, "logits/chosen": -18041694.0, "logits/rejected": -36478600.0, "logps/chosen": -240.8804168701172, "logps/rejected": -550.1201171875, "loss": 0.3092, "rewards/chosen": 0.10232873260974884, "rewards/margins": 4.254582837224007, "rewards/rejected": -4.152254104614258, "step": 12302 }, { "epoch": 0.65210823417168, "grad_norm": 48.25, "kl": 2.120637893676758, "learning_rate": 5e-07, "logits/chosen": -14051230.666666666, "logits/rejected": -19572068.8, "logps/chosen": -118.8122049967448, "logps/rejected": -337.6322998046875, "loss": 0.2183, "rewards/chosen": 1.1218135356903076, "rewards/margins": 3.0002345561981203, "rewards/rejected": -1.8784210205078125, "step": 12303 }, { "epoch": 0.6521612381734821, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12206777.333333334, "logits/rejected": -37918678.4, "logps/chosen": -297.4783121744792, "logps/rejected": -240.6408447265625, "loss": 0.2324, "rewards/chosen": 0.8001554807027181, "rewards/margins": 2.4714815457661947, "rewards/rejected": -1.6713260650634765, "step": 12304 }, { "epoch": 0.6522142421752842, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23087168.0, "logits/rejected": -35140889.6, "logps/chosen": -167.86843872070312, "logps/rejected": -101.36573486328125, "loss": 0.2623, "rewards/chosen": -0.25857847929000854, "rewards/margins": 2.6393898129463196, "rewards/rejected": -2.897968292236328, "step": 12305 }, { "epoch": 0.6522672461770864, "grad_norm": 35.0, "kl": 0.8058872222900391, "learning_rate": 5e-07, "logits/chosen": -3930140.0, "logits/rejected": -40612456.0, "logps/chosen": -215.76815795898438, "logps/rejected": -390.2100524902344, "loss": 0.1678, "rewards/chosen": 1.551924705505371, "rewards/margins": 3.940854549407959, "rewards/rejected": -2.388929843902588, "step": 12306 }, { "epoch": 0.6523202501788885, "grad_norm": 45.25, "kl": 0.45752716064453125, "learning_rate": 5e-07, "logits/chosen": -6946609.333333333, "logits/rejected": -16292534.4, "logps/chosen": -316.44191487630206, "logps/rejected": -280.928466796875, "loss": 0.2956, "rewards/chosen": -0.08157959083716075, "rewards/margins": 2.1376729955275855, "rewards/rejected": -2.219252586364746, "step": 12307 }, { "epoch": 0.6523732541806906, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13977571.0, "logits/rejected": -18712206.0, "logps/chosen": -307.931640625, "logps/rejected": -476.7288818359375, "loss": 0.1927, "rewards/chosen": 0.7313466668128967, "rewards/margins": 4.130302011966705, "rewards/rejected": -3.3989553451538086, "step": 12308 }, { "epoch": 0.6524262581824928, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3002278.5, "logits/rejected": -30042100.0, "logps/chosen": -425.4399108886719, "logps/rejected": -256.43170166015625, "loss": 0.2126, "rewards/chosen": 0.9265166521072388, "rewards/margins": 3.1716328859329224, "rewards/rejected": -2.2451162338256836, "step": 12309 }, { "epoch": 0.6524792621842949, "grad_norm": 53.25, "kl": 0.6552696228027344, "learning_rate": 5e-07, "logits/chosen": -25147768.0, "logits/rejected": -15734590.0, "logps/chosen": -274.7342529296875, "logps/rejected": -220.2479705810547, "loss": 0.3027, "rewards/chosen": 0.33249711990356445, "rewards/margins": 2.4192047119140625, "rewards/rejected": -2.086707592010498, "step": 12310 }, { "epoch": 0.652532266186097, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34034666.666666664, "logits/rejected": -25787235.2, "logps/chosen": -255.58333333333334, "logps/rejected": -262.6170654296875, "loss": 0.2856, "rewards/chosen": 0.6758481661478678, "rewards/margins": 2.7309720675150553, "rewards/rejected": -2.0551239013671876, "step": 12311 }, { "epoch": 0.6525852701878991, "grad_norm": 53.5, "kl": 3.6079578399658203, "learning_rate": 5e-07, "logits/chosen": -52887867.428571425, "logits/rejected": -11157356.0, "logps/chosen": -250.33269391741072, "logps/rejected": -125.76818084716797, "loss": 0.4242, "rewards/chosen": 0.6595638138907296, "rewards/margins": 1.6411335212843758, "rewards/rejected": -0.9815697073936462, "step": 12312 }, { "epoch": 0.6526382741897013, "grad_norm": 41.75, "kl": 0.3760862350463867, "learning_rate": 5e-07, "logits/chosen": 8839895.0, "logits/rejected": -28652500.0, "logps/chosen": -234.37973022460938, "logps/rejected": -174.01800537109375, "loss": 0.3302, "rewards/chosen": 0.739389181137085, "rewards/margins": 1.9868875741958618, "rewards/rejected": -1.2474983930587769, "step": 12313 }, { "epoch": 0.6526912781915034, "grad_norm": 52.75, "kl": 5.068216323852539, "learning_rate": 5e-07, "logits/chosen": -3319934.5714285714, "logits/rejected": -29329456.0, "logps/chosen": -371.8895786830357, "logps/rejected": -284.9475402832031, "loss": 0.4476, "rewards/chosen": 0.5772765023367745, "rewards/margins": 2.291439686502729, "rewards/rejected": -1.7141631841659546, "step": 12314 }, { "epoch": 0.6527442821933056, "grad_norm": 44.5, "kl": 2.5895423889160156, "learning_rate": 5e-07, "logits/chosen": 1007738.9375, "logits/rejected": -21398332.0, "logps/chosen": -309.5116882324219, "logps/rejected": -184.69632975260416, "loss": 0.2221, "rewards/chosen": 1.1477994918823242, "rewards/margins": 3.7008864084879556, "rewards/rejected": -2.5530869166056314, "step": 12315 }, { "epoch": 0.6527972861951077, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34417770.666666664, "logits/rejected": 3729348.0, "logps/chosen": -285.29266357421875, "logps/rejected": -471.656640625, "loss": 0.2134, "rewards/chosen": -0.07098875443140666, "rewards/margins": 3.7097485502560934, "rewards/rejected": -3.7807373046875, "step": 12316 }, { "epoch": 0.6528502901969099, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30967056.0, "logits/rejected": -14774402.0, "logps/chosen": -187.2586669921875, "logps/rejected": -401.08843994140625, "loss": 0.1828, "rewards/chosen": 1.1460540294647217, "rewards/margins": 3.568148374557495, "rewards/rejected": -2.4220943450927734, "step": 12317 }, { "epoch": 0.652903294198712, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12345916.57142857, "logits/rejected": -8792946.0, "logps/chosen": -221.85389927455358, "logps/rejected": -360.55413818359375, "loss": 0.3949, "rewards/chosen": 0.32559783118111746, "rewards/margins": 2.2692868198667253, "rewards/rejected": -1.943688988685608, "step": 12318 }, { "epoch": 0.6529562982005142, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25600720.0, "logits/rejected": -27805108.0, "logps/chosen": -311.5895182291667, "logps/rejected": -620.516357421875, "loss": 0.3366, "rewards/chosen": 0.33837278683980304, "rewards/margins": 3.3287232716878257, "rewards/rejected": -2.9903504848480225, "step": 12319 }, { "epoch": 0.6530093022023162, "grad_norm": 56.0, "kl": 3.923616409301758, "learning_rate": 5e-07, "logits/chosen": -29059880.0, "logits/rejected": -36712796.0, "logps/chosen": -329.5218098958333, "logps/rejected": -421.0633544921875, "loss": 0.4122, "rewards/chosen": 0.44956477483113605, "rewards/margins": 2.6737492879231772, "rewards/rejected": -2.224184513092041, "step": 12320 }, { "epoch": 0.6530623062041184, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66010568.0, "logits/rejected": -22338328.0, "logps/chosen": -504.82928466796875, "logps/rejected": -224.17608642578125, "loss": 0.1536, "rewards/chosen": 0.47587281465530396, "rewards/margins": 3.410712699095408, "rewards/rejected": -2.934839884440104, "step": 12321 }, { "epoch": 0.6531153102059205, "grad_norm": 53.25, "kl": 0.5881862640380859, "learning_rate": 5e-07, "logits/chosen": -26455984.0, "logits/rejected": -3679397.0, "logps/chosen": -209.49810791015625, "logps/rejected": -329.4578857421875, "loss": 0.3265, "rewards/chosen": 0.2827546298503876, "rewards/margins": 2.2696214616298676, "rewards/rejected": -1.98686683177948, "step": 12322 }, { "epoch": 0.6531683142077227, "grad_norm": 44.0, "kl": 0.6485347747802734, "learning_rate": 5e-07, "logits/chosen": -6437854.0, "logits/rejected": -17977344.0, "logps/chosen": -215.67471313476562, "logps/rejected": -399.1982421875, "loss": 0.2513, "rewards/chosen": 0.5189741849899292, "rewards/margins": 2.898971438407898, "rewards/rejected": -2.3799972534179688, "step": 12323 }, { "epoch": 0.6532213182095248, "grad_norm": 81.0, "kl": 1.1638622283935547, "learning_rate": 5e-07, "logits/chosen": -39713016.0, "logits/rejected": -4216138.0, "logps/chosen": -464.7960205078125, "logps/rejected": -134.55526733398438, "loss": 0.3423, "rewards/chosen": 0.4227650761604309, "rewards/margins": 1.702306091785431, "rewards/rejected": -1.279541015625, "step": 12324 }, { "epoch": 0.653274322211327, "grad_norm": 34.25, "kl": 0.04970359802246094, "learning_rate": 5e-07, "logits/chosen": 7005771.5, "logits/rejected": -37377744.0, "logps/chosen": -28.851226806640625, "logps/rejected": -394.1260986328125, "loss": 0.2289, "rewards/chosen": 0.09308995306491852, "rewards/margins": 2.4624394327402115, "rewards/rejected": -2.369349479675293, "step": 12325 }, { "epoch": 0.6533273262131291, "grad_norm": 48.75, "kl": 0.3480415344238281, "learning_rate": 5e-07, "logits/chosen": -20917366.4, "logits/rejected": -41355384.0, "logps/chosen": -366.7291015625, "logps/rejected": -230.26153564453125, "loss": 0.2117, "rewards/chosen": 0.9189728736877442, "rewards/margins": 3.8622321128845214, "rewards/rejected": -2.9432592391967773, "step": 12326 }, { "epoch": 0.6533803302149312, "grad_norm": 68.0, "kl": 1.1574249267578125, "learning_rate": 5e-07, "logits/chosen": -35752368.0, "logits/rejected": 17580966.85714286, "logps/chosen": -186.59487915039062, "logps/rejected": -381.20218331473217, "loss": 0.2206, "rewards/chosen": -0.5864471793174744, "rewards/margins": 1.1798023411205836, "rewards/rejected": -1.766249520438058, "step": 12327 }, { "epoch": 0.6534333342167333, "grad_norm": 83.5, "kl": 1.86700439453125, "learning_rate": 5e-07, "logits/chosen": -5293615.0, "logits/rejected": -12816108.0, "logps/chosen": -111.59477233886719, "logps/rejected": -453.33563232421875, "loss": 0.2981, "rewards/chosen": 0.6763831973075867, "rewards/margins": 2.668783128261566, "rewards/rejected": -1.9923999309539795, "step": 12328 }, { "epoch": 0.6534863382185355, "grad_norm": 62.5, "kl": 0.12634658813476562, "learning_rate": 5e-07, "logits/chosen": -46816864.0, "logits/rejected": -59104848.0, "logps/chosen": -335.1910807291667, "logps/rejected": -335.2351989746094, "loss": 0.3397, "rewards/chosen": 0.3410155375798543, "rewards/margins": 3.075194557507833, "rewards/rejected": -2.7341790199279785, "step": 12329 }, { "epoch": 0.6535393422203376, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43944448.0, "logits/rejected": -70774952.0, "logps/chosen": -268.93145751953125, "logps/rejected": -213.59796142578125, "loss": 0.3119, "rewards/chosen": 0.2123512327671051, "rewards/margins": 2.1729492247104645, "rewards/rejected": -1.9605979919433594, "step": 12330 }, { "epoch": 0.6535923462221398, "grad_norm": 45.0, "kl": 1.1144726276397705, "learning_rate": 5e-07, "logits/chosen": -14010266.666666666, "logits/rejected": -26652803.2, "logps/chosen": -170.08767700195312, "logps/rejected": -325.335986328125, "loss": 0.3288, "rewards/chosen": -0.3381277720133464, "rewards/margins": 1.5775239308675129, "rewards/rejected": -1.9156517028808593, "step": 12331 }, { "epoch": 0.6536453502239419, "grad_norm": 62.5, "kl": 1.2421722412109375, "learning_rate": 5e-07, "logits/chosen": -48935400.0, "logits/rejected": -22462402.0, "logps/chosen": -618.9022216796875, "logps/rejected": -361.5392761230469, "loss": 0.2843, "rewards/chosen": 0.7131233215332031, "rewards/margins": 2.930710554122925, "rewards/rejected": -2.2175872325897217, "step": 12332 }, { "epoch": 0.6536983542257441, "grad_norm": 49.75, "kl": 0.2257080078125, "learning_rate": 5e-07, "logits/chosen": -53662032.0, "logits/rejected": -29666570.666666668, "logps/chosen": -286.126806640625, "logps/rejected": -482.0644938151042, "loss": 0.3311, "rewards/chosen": 0.0483560174703598, "rewards/margins": 3.1850289911031724, "rewards/rejected": -3.1366729736328125, "step": 12333 }, { "epoch": 0.6537513582275462, "grad_norm": 60.75, "kl": 0.27170562744140625, "learning_rate": 5e-07, "logits/chosen": 21097926.4, "logits/rejected": -13222820.0, "logps/chosen": -402.7334228515625, "logps/rejected": -314.02423095703125, "loss": 0.3573, "rewards/chosen": -0.08996429443359374, "rewards/margins": 2.8659247716267906, "rewards/rejected": -2.9558890660603843, "step": 12334 }, { "epoch": 0.6538043622293483, "grad_norm": 60.25, "kl": 0.2522697448730469, "learning_rate": 5e-07, "logits/chosen": -9513174.0, "logits/rejected": 1526304.0, "logps/chosen": -146.82684326171875, "logps/rejected": -245.165283203125, "loss": 0.2497, "rewards/chosen": 0.9604634642601013, "rewards/margins": 2.514103988806407, "rewards/rejected": -1.5536405245463054, "step": 12335 }, { "epoch": 0.6538573662311504, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37106720.0, "logits/rejected": 12510952.0, "logps/chosen": -356.84674072265625, "logps/rejected": -289.73679606119794, "loss": 0.2103, "rewards/chosen": 0.442129522562027, "rewards/margins": 2.648586183786392, "rewards/rejected": -2.2064566612243652, "step": 12336 }, { "epoch": 0.6539103702329526, "grad_norm": 53.5, "kl": 2.7384300231933594, "learning_rate": 5e-07, "logits/chosen": -18104208.0, "logits/rejected": -29468528.0, "logps/chosen": -544.2855224609375, "logps/rejected": -355.9481506347656, "loss": 0.2361, "rewards/chosen": 0.8418252468109131, "rewards/margins": 4.131432294845581, "rewards/rejected": -3.289607048034668, "step": 12337 }, { "epoch": 0.6539633742347547, "grad_norm": 45.0, "kl": 1.8213157653808594, "learning_rate": 5e-07, "logits/chosen": -50869125.333333336, "logits/rejected": -26609697.6, "logps/chosen": -668.4564615885416, "logps/rejected": -429.136962890625, "loss": 0.2067, "rewards/chosen": 0.7903792858123779, "rewards/margins": 3.6321481227874757, "rewards/rejected": -2.8417688369750977, "step": 12338 }, { "epoch": 0.6540163782365569, "grad_norm": 43.5, "kl": 1.4832062721252441, "learning_rate": 5e-07, "logits/chosen": -12509124.0, "logits/rejected": -39590100.0, "logps/chosen": -247.08538818359375, "logps/rejected": -416.6665344238281, "loss": 0.2445, "rewards/chosen": 0.8686061501502991, "rewards/margins": 3.077928602695465, "rewards/rejected": -2.209322452545166, "step": 12339 }, { "epoch": 0.654069382238359, "grad_norm": 64.5, "kl": 0.3255500793457031, "learning_rate": 5e-07, "logits/chosen": -7513841.0, "logits/rejected": -16007408.0, "logps/chosen": -299.84735107421875, "logps/rejected": -313.7825927734375, "loss": 0.3138, "rewards/chosen": 0.25939103960990906, "rewards/margins": 1.9214119613170624, "rewards/rejected": -1.6620209217071533, "step": 12340 }, { "epoch": 0.6541223862401612, "grad_norm": 36.25, "kl": 0.5405025482177734, "learning_rate": 5e-07, "logits/chosen": 1081180.5, "logits/rejected": -30115248.0, "logps/chosen": -40.977699279785156, "logps/rejected": -405.67898995535717, "loss": 0.1736, "rewards/chosen": -0.31961289048194885, "rewards/margins": 2.057978106396539, "rewards/rejected": -2.377590996878488, "step": 12341 }, { "epoch": 0.6541753902419633, "grad_norm": 53.75, "kl": 1.6494998931884766, "learning_rate": 5e-07, "logits/chosen": -17327598.4, "logits/rejected": 8187460.0, "logps/chosen": -164.02589111328126, "logps/rejected": -234.758544921875, "loss": 0.3322, "rewards/chosen": 0.7805947303771973, "rewards/margins": 2.049716059366862, "rewards/rejected": -1.2691213289896648, "step": 12342 }, { "epoch": 0.6542283942437654, "grad_norm": 45.75, "kl": 1.4474143981933594, "learning_rate": 5e-07, "logits/chosen": 6729050.0, "logits/rejected": -26479296.0, "logps/chosen": -152.59701538085938, "logps/rejected": -334.8251647949219, "loss": 0.3002, "rewards/chosen": 0.4402596950531006, "rewards/margins": 2.2472089529037476, "rewards/rejected": -1.806949257850647, "step": 12343 }, { "epoch": 0.6542813982455675, "grad_norm": 90.5, "kl": 4.936614990234375, "learning_rate": 5e-07, "logits/chosen": 7222928.0, "logits/rejected": -69732888.0, "logps/chosen": -640.3849051339286, "logps/rejected": -538.9515380859375, "loss": 0.4007, "rewards/chosen": 0.8731741905212402, "rewards/margins": 3.045464277267456, "rewards/rejected": -2.172290086746216, "step": 12344 }, { "epoch": 0.6543344022473697, "grad_norm": 79.5, "kl": 2.8515586853027344, "learning_rate": 5e-07, "logits/chosen": -28729440.0, "logits/rejected": 3194322.3333333335, "logps/chosen": -603.966943359375, "logps/rejected": -90.85951741536458, "loss": 0.3737, "rewards/chosen": 0.7945283889770508, "rewards/margins": 3.004956785837809, "rewards/rejected": -2.2104283968607583, "step": 12345 }, { "epoch": 0.6543874062491718, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38306138.666666664, "logits/rejected": -17964387.2, "logps/chosen": -201.0787353515625, "logps/rejected": -139.090283203125, "loss": 0.3175, "rewards/chosen": -0.21153024832407633, "rewards/margins": 1.710643442471822, "rewards/rejected": -1.9221736907958984, "step": 12346 }, { "epoch": 0.654440410250974, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3186433.0, "logits/rejected": -51151296.0, "logps/chosen": -43.90555953979492, "logps/rejected": -419.17661830357144, "loss": 0.1885, "rewards/chosen": -0.17300263047218323, "rewards/margins": 2.1198263381208693, "rewards/rejected": -2.2928289685930525, "step": 12347 }, { "epoch": 0.6544934142527761, "grad_norm": 60.5, "kl": 0.6290283203125, "learning_rate": 5e-07, "logits/chosen": -30286180.0, "logits/rejected": -30724840.0, "logps/chosen": -225.86146545410156, "logps/rejected": -648.878173828125, "loss": 0.3257, "rewards/chosen": 0.06658840924501419, "rewards/margins": 2.428231008350849, "rewards/rejected": -2.361642599105835, "step": 12348 }, { "epoch": 0.6545464182545783, "grad_norm": 51.75, "kl": 1.278005599975586, "learning_rate": 5e-07, "logits/chosen": -32090688.0, "logits/rejected": 26888202.666666668, "logps/chosen": -270.1149169921875, "logps/rejected": -325.17934163411456, "loss": 0.2971, "rewards/chosen": 0.5507923126220703, "rewards/margins": 3.03739382425944, "rewards/rejected": -2.4866015116373696, "step": 12349 }, { "epoch": 0.6545994222563803, "grad_norm": 46.25, "kl": 4.481405258178711, "learning_rate": 5e-07, "logits/chosen": -19847104.0, "logits/rejected": 11960808.0, "logps/chosen": -324.53291015625, "logps/rejected": -175.93196614583334, "loss": 0.3616, "rewards/chosen": 0.9971155166625977, "rewards/margins": 1.7417476812998454, "rewards/rejected": -0.7446321646372477, "step": 12350 }, { "epoch": 0.6546524262581825, "grad_norm": 48.25, "kl": 5.371685028076172, "learning_rate": 5e-07, "logits/chosen": -3321143.0, "logps/chosen": -207.04931640625, "loss": 0.455, "rewards/chosen": 0.7589625716209412, "step": 12351 }, { "epoch": 0.6547054302599846, "grad_norm": 42.75, "kl": 3.1102771759033203, "learning_rate": 5e-07, "logits/chosen": -22057548.8, "logits/rejected": -64205226.666666664, "logps/chosen": -258.7204833984375, "logps/rejected": -552.8104654947916, "loss": 0.2859, "rewards/chosen": 0.8326092720031738, "rewards/margins": 3.953076457977295, "rewards/rejected": -3.120467185974121, "step": 12352 }, { "epoch": 0.6547584342617868, "grad_norm": 36.5, "kl": 1.2894554138183594, "learning_rate": 5e-07, "logits/chosen": -28484176.0, "logits/rejected": -4697366.0, "logps/chosen": -169.50341796875, "logps/rejected": -131.8719024658203, "loss": 0.2888, "rewards/chosen": 0.18419188261032104, "rewards/margins": 3.6072716116905212, "rewards/rejected": -3.4230797290802, "step": 12353 }, { "epoch": 0.6548114382635889, "grad_norm": 57.0, "kl": 2.1094465255737305, "learning_rate": 5e-07, "logits/chosen": -15235050.666666666, "logits/rejected": -37283590.4, "logps/chosen": -302.8420003255208, "logps/rejected": -252.9442138671875, "loss": 0.2992, "rewards/chosen": 0.6037310759226481, "rewards/margins": 1.766671005884806, "rewards/rejected": -1.1629399299621581, "step": 12354 }, { "epoch": 0.6548644422653911, "grad_norm": 50.5, "kl": 1.8554229736328125, "learning_rate": 5e-07, "logits/chosen": -37530056.0, "logits/rejected": -3984477.0, "logps/chosen": -310.6178894042969, "logps/rejected": -254.09505208333334, "loss": 0.2124, "rewards/chosen": 1.2533760070800781, "rewards/margins": 2.9979961713155108, "rewards/rejected": -1.7446201642354329, "step": 12355 }, { "epoch": 0.6549174462671932, "grad_norm": 80.5, "kl": 3.0148582458496094, "learning_rate": 5e-07, "logits/chosen": -30973752.0, "logits/rejected": -167961120.0, "logps/chosen": -424.2474772135417, "logps/rejected": -508.8531799316406, "loss": 0.3773, "rewards/chosen": 0.8558077812194824, "rewards/margins": 1.8838427066802979, "rewards/rejected": -1.0280349254608154, "step": 12356 }, { "epoch": 0.6549704502689954, "grad_norm": 72.0, "kl": 1.6090755462646484, "learning_rate": 5e-07, "logits/chosen": -43251254.4, "logits/rejected": -12668685.333333334, "logps/chosen": -513.511181640625, "logps/rejected": -52.33268737792969, "loss": 0.376, "rewards/chosen": 0.4027683734893799, "rewards/margins": 1.879576285680135, "rewards/rejected": -1.4768079121907551, "step": 12357 }, { "epoch": 0.6550234542707974, "grad_norm": 58.25, "kl": 1.105461597442627, "learning_rate": 5e-07, "logits/chosen": -49847460.0, "logits/rejected": -71395520.0, "logps/chosen": -498.0708923339844, "logps/rejected": -214.47096252441406, "loss": 0.3675, "rewards/chosen": 0.1720016449689865, "rewards/margins": 1.9267843216657639, "rewards/rejected": -1.7547826766967773, "step": 12358 }, { "epoch": 0.6550764582725995, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45910794.666666664, "logits/rejected": -6844339.2, "logps/chosen": -369.6464029947917, "logps/rejected": -183.1108154296875, "loss": 0.3309, "rewards/chosen": 0.45580307642618817, "rewards/margins": 1.4643039385477703, "rewards/rejected": -1.008500862121582, "step": 12359 }, { "epoch": 0.6551294622744017, "grad_norm": 45.0, "kl": 3.7883071899414062, "learning_rate": 5e-07, "logits/chosen": -24196180.0, "logits/rejected": -15156472.0, "logps/chosen": -284.2843322753906, "logps/rejected": -113.47168731689453, "loss": 0.2999, "rewards/chosen": 0.8350483775138855, "rewards/margins": 2.9431185126304626, "rewards/rejected": -2.108070135116577, "step": 12360 }, { "epoch": 0.6551824662762038, "grad_norm": 42.75, "kl": 4.268138885498047, "learning_rate": 5e-07, "logits/chosen": -5426141.2, "logits/rejected": 10877932.666666666, "logps/chosen": -491.2060546875, "logps/rejected": -395.2421875, "loss": 0.2631, "rewards/chosen": 1.4854984283447266, "rewards/margins": 4.338591893513998, "rewards/rejected": -2.853093465169271, "step": 12361 }, { "epoch": 0.655235470278006, "grad_norm": 50.5, "kl": 2.8324356079101562, "learning_rate": 5e-07, "logits/chosen": -3785002.6666666665, "logits/rejected": -29061334.4, "logps/chosen": -310.58900960286456, "logps/rejected": -265.74873046875, "loss": 0.3721, "rewards/chosen": 0.16408399740854898, "rewards/margins": 2.088139287630717, "rewards/rejected": -1.924055290222168, "step": 12362 }, { "epoch": 0.6552884742798081, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14745976.0, "logits/rejected": -23435419.2, "logps/chosen": -415.1853434244792, "logps/rejected": -379.3903564453125, "loss": 0.2102, "rewards/chosen": 0.7198172410329183, "rewards/margins": 3.16395214398702, "rewards/rejected": -2.4441349029541017, "step": 12363 }, { "epoch": 0.6553414782816103, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9525944.666666666, "logits/rejected": -13152418.4, "logps/chosen": -231.15706380208334, "logps/rejected": -447.392431640625, "loss": 0.2006, "rewards/chosen": 0.3195200363794963, "rewards/margins": 3.942438737551371, "rewards/rejected": -3.622918701171875, "step": 12364 }, { "epoch": 0.6553944822834123, "grad_norm": 42.75, "kl": 1.0604629516601562, "learning_rate": 5e-07, "logits/chosen": -38234252.0, "logits/rejected": -29936837.333333332, "logps/chosen": -470.4114074707031, "logps/rejected": -401.2753092447917, "loss": 0.1154, "rewards/chosen": 1.3344742059707642, "rewards/margins": 4.502217888832092, "rewards/rejected": -3.167743682861328, "step": 12365 }, { "epoch": 0.6554474862852145, "grad_norm": 43.75, "kl": 3.063192367553711, "learning_rate": 5e-07, "logits/chosen": -12942583.0, "logits/rejected": -32510136.0, "logps/chosen": -203.66110229492188, "logps/rejected": -310.6305847167969, "loss": 0.3261, "rewards/chosen": 0.4693945348262787, "rewards/margins": 3.105141967535019, "rewards/rejected": -2.6357474327087402, "step": 12366 }, { "epoch": 0.6555004902870166, "grad_norm": 54.75, "kl": 1.3333148956298828, "learning_rate": 5e-07, "logits/chosen": -22545908.0, "logits/rejected": -31100192.0, "logps/chosen": -455.39202880859375, "logps/rejected": -304.5438537597656, "loss": 0.391, "rewards/chosen": -0.4356590807437897, "rewards/margins": 1.6840785443782806, "rewards/rejected": -2.1197376251220703, "step": 12367 }, { "epoch": 0.6555534942888188, "grad_norm": 55.75, "kl": 0.4617481231689453, "learning_rate": 5e-07, "logits/chosen": -341399.3333333333, "logits/rejected": 37297689.6, "logps/chosen": -57.13487243652344, "logps/rejected": -166.1089111328125, "loss": 0.3368, "rewards/chosen": 0.1426947315533956, "rewards/margins": 1.6472593029340108, "rewards/rejected": -1.5045645713806153, "step": 12368 }, { "epoch": 0.6556064982906209, "grad_norm": 60.0, "kl": 0.6766242980957031, "learning_rate": 5e-07, "logits/chosen": -48971635.2, "logits/rejected": -28759248.0, "logps/chosen": -478.54541015625, "logps/rejected": -311.1300048828125, "loss": 0.308, "rewards/chosen": 0.5847928047180175, "rewards/margins": 2.292580318450928, "rewards/rejected": -1.7077875137329102, "step": 12369 }, { "epoch": 0.6556595022924231, "grad_norm": 58.5, "kl": 1.4248409271240234, "learning_rate": 5e-07, "logits/chosen": 3764843.75, "logits/rejected": -17559208.0, "logps/chosen": -55.607181549072266, "logps/rejected": -246.59365844726562, "loss": 0.4252, "rewards/chosen": -0.27285873889923096, "rewards/margins": 1.2199112176895142, "rewards/rejected": -1.4927699565887451, "step": 12370 }, { "epoch": 0.6557125062942252, "grad_norm": 38.0, "kl": 0.5381660461425781, "learning_rate": 5e-07, "logits/chosen": -72230144.0, "logits/rejected": -14189971.2, "logps/chosen": -237.84151204427084, "logps/rejected": -248.40595703125, "loss": 0.2838, "rewards/chosen": 0.6532230774561564, "rewards/margins": 2.006544915835063, "rewards/rejected": -1.3533218383789063, "step": 12371 }, { "epoch": 0.6557655102960274, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43939285.333333336, "logits/rejected": -54296281.6, "logps/chosen": -366.9208170572917, "logps/rejected": -280.0941162109375, "loss": 0.1968, "rewards/chosen": 0.6707995732625326, "rewards/margins": 3.8713954289754233, "rewards/rejected": -3.2005958557128906, "step": 12372 }, { "epoch": 0.6558185142978294, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74142832.0, "logits/rejected": -15786172.0, "logps/chosen": -455.17022705078125, "logps/rejected": -213.9304656982422, "loss": 0.296, "rewards/chosen": 0.23155060410499573, "rewards/margins": 2.499598652124405, "rewards/rejected": -2.268048048019409, "step": 12373 }, { "epoch": 0.6558715182996316, "grad_norm": 83.0, "kl": 2.030245780944824, "learning_rate": 5e-07, "logits/chosen": -11070910.666666666, "logits/rejected": 31712540.8, "logps/chosen": -105.1697998046875, "logps/rejected": -580.79462890625, "loss": 0.2625, "rewards/chosen": 0.9336786270141602, "rewards/margins": 3.265769195556641, "rewards/rejected": -2.3320905685424806, "step": 12374 }, { "epoch": 0.6559245223014337, "grad_norm": 35.5, "kl": 2.887049674987793, "learning_rate": 5e-07, "logits/chosen": -79768240.0, "logits/rejected": -10978207.0, "logps/chosen": -362.8709411621094, "logps/rejected": -111.19580841064453, "loss": 0.2266, "rewards/chosen": 1.1835354566574097, "rewards/margins": 4.1960209608078, "rewards/rejected": -3.0124855041503906, "step": 12375 }, { "epoch": 0.6559775263032359, "grad_norm": 60.0, "kl": 1.9475078582763672, "learning_rate": 5e-07, "logits/chosen": -60289776.0, "logits/rejected": -360854.5, "logps/chosen": -527.3001302083334, "logps/rejected": -114.15858459472656, "loss": 0.2659, "rewards/chosen": 0.9809652169545492, "rewards/margins": 4.438766558965047, "rewards/rejected": -3.457801342010498, "step": 12376 }, { "epoch": 0.656030530305038, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83735520.0, "logits/rejected": -17713468.0, "logps/chosen": -592.472412109375, "logps/rejected": -475.2650553385417, "loss": 0.1952, "rewards/chosen": 0.0568084642291069, "rewards/margins": 2.474315000077089, "rewards/rejected": -2.417506535847982, "step": 12377 }, { "epoch": 0.6560835343068402, "grad_norm": 45.0, "kl": 1.2759246826171875, "learning_rate": 5e-07, "logits/chosen": -26087893.333333332, "logits/rejected": -26150208.0, "logps/chosen": -450.0136311848958, "logps/rejected": -197.8134765625, "loss": 0.1812, "rewards/chosen": 1.7372827529907227, "rewards/margins": 3.6889375686645507, "rewards/rejected": -1.951654815673828, "step": 12378 }, { "epoch": 0.6561365383086423, "grad_norm": 57.25, "kl": 2.5224876403808594, "learning_rate": 5e-07, "logits/chosen": -61632757.333333336, "logits/rejected": -72687064.0, "logps/chosen": -329.23215738932294, "logps/rejected": -649.7523803710938, "loss": 0.3766, "rewards/chosen": 0.4305366675059001, "rewards/margins": 2.9072534243265786, "rewards/rejected": -2.4767167568206787, "step": 12379 }, { "epoch": 0.6561895423104445, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30928180.0, "logits/rejected": 972252.75, "logps/chosen": -461.5901184082031, "logps/rejected": -89.38827514648438, "loss": 0.323, "rewards/chosen": 0.13028642535209656, "rewards/margins": 1.9626328647136688, "rewards/rejected": -1.8323464393615723, "step": 12380 }, { "epoch": 0.6562425463122465, "grad_norm": 54.75, "kl": 1.0991687774658203, "learning_rate": 5e-07, "logits/chosen": -15436449.6, "logits/rejected": -22874605.333333332, "logps/chosen": -203.7755859375, "logps/rejected": -314.8939615885417, "loss": 0.3933, "rewards/chosen": -0.142106032371521, "rewards/margins": 2.5687469085057577, "rewards/rejected": -2.710852940877279, "step": 12381 }, { "epoch": 0.6562955503140487, "grad_norm": 39.5, "kl": 0.7611713409423828, "learning_rate": 5e-07, "logits/chosen": -9177504.0, "logits/rejected": -3349825.5, "logps/chosen": -215.80300903320312, "logps/rejected": -201.6522216796875, "loss": 0.2538, "rewards/chosen": 0.3208458125591278, "rewards/margins": 3.429397314786911, "rewards/rejected": -3.108551502227783, "step": 12382 }, { "epoch": 0.6563485543158508, "grad_norm": 36.0, "kl": 4.223564624786377, "learning_rate": 5e-07, "logits/chosen": -17318434.0, "logits/rejected": -17792720.0, "logps/chosen": -160.1836395263672, "logps/rejected": -283.43487548828125, "loss": 0.3218, "rewards/chosen": 0.8729286193847656, "rewards/margins": 2.2931047677993774, "rewards/rejected": -1.4201761484146118, "step": 12383 }, { "epoch": 0.656401558317653, "grad_norm": 44.25, "kl": 3.329347610473633, "learning_rate": 5e-07, "logits/chosen": -53432105.6, "logits/rejected": -15537880.0, "logps/chosen": -455.1396484375, "logps/rejected": -422.5882161458333, "loss": 0.2251, "rewards/chosen": 1.5041784286499023, "rewards/margins": 4.86427853902181, "rewards/rejected": -3.3601001103719077, "step": 12384 }, { "epoch": 0.6564545623194551, "grad_norm": 78.5, "kl": 2.3801345825195312, "learning_rate": 5e-07, "logits/chosen": -70807046.4, "logits/rejected": -32703410.666666668, "logps/chosen": -1098.3544921875, "logps/rejected": -124.01111857096355, "loss": 0.2718, "rewards/chosen": 1.2715625762939453, "rewards/margins": 2.937301953633626, "rewards/rejected": -1.665739377339681, "step": 12385 }, { "epoch": 0.6565075663212573, "grad_norm": 46.75, "kl": 2.1397361755371094, "learning_rate": 5e-07, "logits/chosen": -39635506.666666664, "logits/rejected": -24720990.4, "logps/chosen": -354.10302734375, "logps/rejected": -178.7786376953125, "loss": 0.2666, "rewards/chosen": 1.096579949061076, "rewards/margins": 2.915783135096232, "rewards/rejected": -1.8192031860351563, "step": 12386 }, { "epoch": 0.6565605703230594, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2974134.5, "logits/rejected": -25829540.57142857, "logps/chosen": -79.34477233886719, "logps/rejected": -228.84385463169642, "loss": 0.1874, "rewards/chosen": 1.0794998407363892, "rewards/margins": 3.0395626851490567, "rewards/rejected": -1.9600628444126673, "step": 12387 }, { "epoch": 0.6566135743248616, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22133926.0, "logits/rejected": -71210144.0, "logps/chosen": -370.7872314453125, "logps/rejected": -340.78179931640625, "loss": 0.267, "rewards/chosen": 0.33235856890678406, "rewards/margins": 2.7107095420360565, "rewards/rejected": -2.3783509731292725, "step": 12388 }, { "epoch": 0.6566665783266636, "grad_norm": 41.75, "kl": 2.4979515075683594, "learning_rate": 5e-07, "logits/chosen": -24628056.0, "logits/rejected": -38989816.0, "logps/chosen": -233.09945678710938, "logps/rejected": -377.5301513671875, "loss": 0.3209, "rewards/chosen": 0.3315313458442688, "rewards/margins": 2.4838181138038635, "rewards/rejected": -2.1522867679595947, "step": 12389 }, { "epoch": 0.6567195823284658, "grad_norm": 39.75, "kl": 1.5960636138916016, "learning_rate": 5e-07, "logits/chosen": -57512576.0, "logits/rejected": -3945875.5, "logps/chosen": -141.57025146484375, "logps/rejected": -459.6974182128906, "loss": 0.3849, "rewards/chosen": 0.11640545725822449, "rewards/margins": 3.3250970542430878, "rewards/rejected": -3.2086915969848633, "step": 12390 }, { "epoch": 0.6567725863302679, "grad_norm": 47.25, "kl": 2.568828582763672, "learning_rate": 5e-07, "logits/chosen": -22154630.4, "logits/rejected": 3222392.3333333335, "logps/chosen": -378.670263671875, "logps/rejected": -479.7726236979167, "loss": 0.2807, "rewards/chosen": 0.9232303619384765, "rewards/margins": 3.7703417460123694, "rewards/rejected": -2.847111384073893, "step": 12391 }, { "epoch": 0.6568255903320701, "grad_norm": 46.0, "kl": 2.672220230102539, "learning_rate": 5e-07, "logits/chosen": -39654067.2, "logits/rejected": -44645933.333333336, "logps/chosen": -430.1921875, "logps/rejected": -407.317626953125, "loss": 0.2833, "rewards/chosen": 0.9506105422973633, "rewards/margins": 3.5236584981282553, "rewards/rejected": -2.573047955830892, "step": 12392 }, { "epoch": 0.6568785943338722, "grad_norm": 58.75, "kl": 3.587618827819824, "learning_rate": 5e-07, "logits/chosen": -73572460.8, "logits/rejected": -21817938.666666668, "logps/chosen": -436.18427734375, "logps/rejected": -280.11081949869794, "loss": 0.3526, "rewards/chosen": 1.3264063835144042, "rewards/margins": 1.8084803263346354, "rewards/rejected": -0.48207394282023114, "step": 12393 }, { "epoch": 0.6569315983356744, "grad_norm": 85.0, "kl": 0.7314491271972656, "learning_rate": 5e-07, "logits/chosen": -63808576.0, "logits/rejected": -14027794.666666666, "logps/chosen": -286.70625, "logps/rejected": -331.12646484375, "loss": 0.3958, "rewards/chosen": 0.16158403158187867, "rewards/margins": 1.6437403082847595, "rewards/rejected": -1.4821562767028809, "step": 12394 }, { "epoch": 0.6569846023374765, "grad_norm": 50.0, "kl": 0.2118968963623047, "learning_rate": 5e-07, "logits/chosen": -41723128.0, "logits/rejected": -43844632.0, "logps/chosen": -293.6824951171875, "logps/rejected": -524.2167358398438, "loss": 0.227, "rewards/chosen": 0.6527852416038513, "rewards/margins": 2.9670485854148865, "rewards/rejected": -2.314263343811035, "step": 12395 }, { "epoch": 0.6570376063392787, "grad_norm": 49.0, "kl": 0.7219939231872559, "learning_rate": 5e-07, "logits/chosen": -18866168.0, "logits/rejected": -49535817.6, "logps/chosen": -573.4600423177084, "logps/rejected": -460.37490234375, "loss": 0.1791, "rewards/chosen": 1.6990655263264973, "rewards/margins": 3.8786040623982743, "rewards/rejected": -2.179538536071777, "step": 12396 }, { "epoch": 0.6570906103410807, "grad_norm": 41.25, "kl": 0.4839487075805664, "learning_rate": 5e-07, "logits/chosen": -24617838.0, "logits/rejected": 12881835.0, "logps/chosen": -186.97352600097656, "logps/rejected": -283.2479248046875, "loss": 0.2572, "rewards/chosen": 0.7087955474853516, "rewards/margins": 2.8827900886535645, "rewards/rejected": -2.173994541168213, "step": 12397 }, { "epoch": 0.6571436143428829, "grad_norm": 48.5, "kl": 1.1714954376220703, "learning_rate": 5e-07, "logits/chosen": -28575160.0, "logits/rejected": -182664512.0, "logps/chosen": -249.41935221354166, "logps/rejected": -726.3615112304688, "loss": 0.3682, "rewards/chosen": 0.24679839611053467, "rewards/margins": 4.029246211051941, "rewards/rejected": -3.7824478149414062, "step": 12398 }, { "epoch": 0.657196618344685, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15303058.666666666, "logits/rejected": 2548788.4, "logps/chosen": -288.93723551432294, "logps/rejected": -294.518115234375, "loss": 0.2429, "rewards/chosen": 0.670989990234375, "rewards/margins": 2.477594566345215, "rewards/rejected": -1.8066045761108398, "step": 12399 }, { "epoch": 0.6572496223464872, "grad_norm": 41.25, "kl": 3.750011444091797, "learning_rate": 5e-07, "logits/chosen": -16236409.333333334, "logits/rejected": -11554772.8, "logps/chosen": -131.206298828125, "logps/rejected": -491.0765625, "loss": 0.2639, "rewards/chosen": 0.7331957022349039, "rewards/margins": 3.2048732916514076, "rewards/rejected": -2.4716775894165037, "step": 12400 }, { "epoch": 0.6573026263482893, "grad_norm": 34.75, "kl": 1.6376333236694336, "learning_rate": 5e-07, "logits/chosen": -370261.0625, "logits/rejected": -27111516.0, "logps/chosen": -67.1805191040039, "logps/rejected": -139.38076782226562, "loss": 0.2965, "rewards/chosen": 0.5421571731567383, "rewards/margins": 2.641127586364746, "rewards/rejected": -2.098970413208008, "step": 12401 }, { "epoch": 0.6573556303500915, "grad_norm": 44.75, "kl": 1.264303207397461, "learning_rate": 5e-07, "logits/chosen": -11412706.666666666, "logits/rejected": -9713360.0, "logps/chosen": -221.29461669921875, "logps/rejected": -107.32598876953125, "loss": 0.3814, "rewards/chosen": 0.3545023997624715, "rewards/margins": 2.353476961453756, "rewards/rejected": -1.9989745616912842, "step": 12402 }, { "epoch": 0.6574086343518936, "grad_norm": 47.5, "kl": 3.0975279808044434, "learning_rate": 5e-07, "logits/chosen": -23938147.2, "logits/rejected": -3487617.3333333335, "logps/chosen": -354.3107421875, "logps/rejected": -119.81458536783855, "loss": 0.3497, "rewards/chosen": 1.169424819946289, "rewards/margins": 2.3563565889994305, "rewards/rejected": -1.1869317690531414, "step": 12403 }, { "epoch": 0.6574616383536958, "grad_norm": 52.5, "kl": 0.614288330078125, "learning_rate": 5e-07, "logits/chosen": -1059133.0, "logits/rejected": -9250749.333333334, "logps/chosen": -742.8023681640625, "logps/rejected": -341.9165445963542, "loss": 0.1604, "rewards/chosen": 1.5648193359375, "rewards/margins": 3.3460027376810713, "rewards/rejected": -1.781183401743571, "step": 12404 }, { "epoch": 0.6575146423554978, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46984024.0, "logits/rejected": -14224826.666666666, "logps/chosen": -285.3645935058594, "logps/rejected": -434.8069661458333, "loss": 0.208, "rewards/chosen": 0.18031218647956848, "rewards/margins": 2.523227721452713, "rewards/rejected": -2.3429155349731445, "step": 12405 }, { "epoch": 0.6575676463573, "grad_norm": 54.75, "kl": 0.6479301452636719, "learning_rate": 5e-07, "logits/chosen": -23088598.0, "logits/rejected": -52057284.0, "logps/chosen": -407.3046875, "logps/rejected": -508.6541748046875, "loss": 0.2416, "rewards/chosen": 0.27260035276412964, "rewards/margins": 4.59007853269577, "rewards/rejected": -4.317478179931641, "step": 12406 }, { "epoch": 0.6576206503591021, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7246015.0, "logits/rejected": 10741058.285714285, "logps/chosen": -305.2914733886719, "logps/rejected": -332.11415318080356, "loss": 0.1956, "rewards/chosen": 2.255209445953369, "rewards/margins": 3.931130886077881, "rewards/rejected": -1.6759214401245117, "step": 12407 }, { "epoch": 0.6576736543609043, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -111652088.0, "logits/rejected": -19837061.333333332, "logps/chosen": -415.67376708984375, "logps/rejected": -235.04951985677084, "loss": 0.1991, "rewards/chosen": 0.3456375002861023, "rewards/margins": 2.8421913981437683, "rewards/rejected": -2.496553897857666, "step": 12408 }, { "epoch": 0.6577266583627064, "grad_norm": 50.5, "kl": 0.4465923309326172, "learning_rate": 5e-07, "logits/chosen": -911844.5, "logits/rejected": -4585103.333333333, "logps/chosen": -134.1812744140625, "logps/rejected": -302.7021077473958, "loss": 0.2649, "rewards/chosen": 0.045306771993637085, "rewards/margins": 1.8275595605373383, "rewards/rejected": -1.7822527885437012, "step": 12409 }, { "epoch": 0.6577796623645085, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46991504.0, "logits/rejected": -21810924.8, "logps/chosen": -183.59016927083334, "logps/rejected": -218.0754638671875, "loss": 0.406, "rewards/chosen": -0.3905794620513916, "rewards/margins": 0.88075852394104, "rewards/rejected": -1.2713379859924316, "step": 12410 }, { "epoch": 0.6578326663663107, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24727796.0, "logits/rejected": -29116270.0, "logps/chosen": -413.6619873046875, "logps/rejected": -470.640380859375, "loss": 0.2141, "rewards/chosen": 0.9956142902374268, "rewards/margins": 4.463953256607056, "rewards/rejected": -3.468338966369629, "step": 12411 }, { "epoch": 0.6578856703681127, "grad_norm": 48.5, "kl": 1.9887676239013672, "learning_rate": 5e-07, "logits/chosen": -1733919.2, "logits/rejected": -32843546.666666668, "logps/chosen": -171.8322998046875, "logps/rejected": -164.18417358398438, "loss": 0.3502, "rewards/chosen": 0.6580150127410889, "rewards/margins": 2.454181432723999, "rewards/rejected": -1.7961664199829102, "step": 12412 }, { "epoch": 0.6579386743699149, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73810552.0, "logits/rejected": -13092336.0, "logps/chosen": -230.66793823242188, "logps/rejected": -258.6953125, "loss": 0.3634, "rewards/chosen": -0.0886867567896843, "rewards/margins": 1.55840153247118, "rewards/rejected": -1.6470882892608643, "step": 12413 }, { "epoch": 0.657991678371717, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33412637.333333332, "logits/rejected": 30938022.4, "logps/chosen": -133.61337280273438, "logps/rejected": -372.8452880859375, "loss": 0.266, "rewards/chosen": -0.06616515914599101, "rewards/margins": 2.417088510592779, "rewards/rejected": -2.4832536697387697, "step": 12414 }, { "epoch": 0.6580446823735192, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31243410.0, "logits/rejected": -31300742.0, "logps/chosen": -315.2867431640625, "logps/rejected": -271.25933837890625, "loss": 0.1869, "rewards/chosen": 1.2090675830841064, "rewards/margins": 3.1661088466644287, "rewards/rejected": -1.9570412635803223, "step": 12415 }, { "epoch": 0.6580976863753213, "grad_norm": 57.75, "kl": 1.7194042205810547, "learning_rate": 5e-07, "logits/chosen": -39738528.0, "logits/rejected": -47025664.0, "logps/chosen": -441.29449462890625, "logps/rejected": -334.3793029785156, "loss": 0.3614, "rewards/chosen": -0.07303380221128464, "rewards/margins": 1.3693810775876045, "rewards/rejected": -1.4424148797988892, "step": 12416 }, { "epoch": 0.6581506903771235, "grad_norm": 81.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16495617.0, "logits/rejected": 5004238.0, "logps/chosen": -439.7870788574219, "logps/rejected": -468.1116536458333, "loss": 0.2518, "rewards/chosen": -0.37174683809280396, "rewards/margins": 2.52189431587855, "rewards/rejected": -2.893641153971354, "step": 12417 }, { "epoch": 0.6582036943789256, "grad_norm": 52.5, "kl": 2.3624496459960938, "learning_rate": 5e-07, "logits/chosen": -35013766.4, "logits/rejected": -11211757.333333334, "logps/chosen": -316.4330322265625, "logps/rejected": -585.791259765625, "loss": 0.242, "rewards/chosen": 1.0210591316223145, "rewards/margins": 3.5222636222839356, "rewards/rejected": -2.501204490661621, "step": 12418 }, { "epoch": 0.6582566983807278, "grad_norm": 49.25, "kl": 1.8441448211669922, "learning_rate": 5e-07, "logits/chosen": -28899182.0, "logits/rejected": -28092936.0, "logps/chosen": -155.30569458007812, "logps/rejected": -337.6509094238281, "loss": 0.3347, "rewards/chosen": 0.326907753944397, "rewards/margins": 2.103550672531128, "rewards/rejected": -1.776642918586731, "step": 12419 }, { "epoch": 0.6583097023825298, "grad_norm": 46.25, "kl": 6.388350963592529, "learning_rate": 5e-07, "logits/chosen": -1765137.0, "logits/rejected": -11613524.0, "logps/chosen": -371.565869140625, "logps/rejected": -301.0376383463542, "loss": 0.3802, "rewards/chosen": 0.6650155067443848, "rewards/margins": 4.211354541778564, "rewards/rejected": -3.5463390350341797, "step": 12420 }, { "epoch": 0.658362706384332, "grad_norm": 44.25, "kl": 1.5226411819458008, "learning_rate": 5e-07, "logits/chosen": 9045482.4, "logits/rejected": -23601101.333333332, "logps/chosen": -172.6849609375, "logps/rejected": -444.0872395833333, "loss": 0.2947, "rewards/chosen": 0.45859737396240235, "rewards/margins": 3.262396049499512, "rewards/rejected": -2.8037986755371094, "step": 12421 }, { "epoch": 0.6584157103861341, "grad_norm": 61.25, "kl": 1.0468807220458984, "learning_rate": 5e-07, "logits/chosen": -44376730.666666664, "logits/rejected": -60399376.0, "logps/chosen": -314.08831787109375, "logps/rejected": -393.34002685546875, "loss": 0.3867, "rewards/chosen": 0.3575105667114258, "rewards/margins": 1.8991708755493164, "rewards/rejected": -1.5416603088378906, "step": 12422 }, { "epoch": 0.6584687143879363, "grad_norm": 63.5, "kl": 1.6169366836547852, "learning_rate": 5e-07, "logits/chosen": -22744059.2, "logits/rejected": 12581410.666666666, "logps/chosen": -291.8859130859375, "logps/rejected": -91.3251953125, "loss": 0.4024, "rewards/chosen": 0.6013236999511719, "rewards/margins": 1.1855892499287923, "rewards/rejected": -0.5842655499776205, "step": 12423 }, { "epoch": 0.6585217183897384, "grad_norm": 75.5, "kl": 3.3727493286132812, "learning_rate": 5e-07, "logits/chosen": 104082118.4, "logits/rejected": -21571085.333333332, "logps/chosen": -353.33505859375, "logps/rejected": -323.4482421875, "loss": 0.3384, "rewards/chosen": 0.6226268768310547, "rewards/margins": 3.234289105733236, "rewards/rejected": -2.611662228902181, "step": 12424 }, { "epoch": 0.6585747223915406, "grad_norm": 43.25, "kl": 0.5626564025878906, "learning_rate": 5e-07, "logits/chosen": 235963.1875, "logits/rejected": -20987024.0, "logps/chosen": -221.0164794921875, "logps/rejected": -246.28175354003906, "loss": 0.3244, "rewards/chosen": 0.021562740206718445, "rewards/margins": 2.440337583422661, "rewards/rejected": -2.4187748432159424, "step": 12425 }, { "epoch": 0.6586277263933427, "grad_norm": 41.75, "kl": 6.306119918823242, "learning_rate": 5e-07, "logits/chosen": 2349917.5, "logits/rejected": -15001703.0, "logps/chosen": -248.75982666015625, "logps/rejected": -243.1771697998047, "loss": 0.3903, "rewards/chosen": 0.5803623795509338, "rewards/margins": 2.2789936661720276, "rewards/rejected": -1.6986312866210938, "step": 12426 }, { "epoch": 0.6586807303951449, "grad_norm": 54.0, "kl": 2.3721914291381836, "learning_rate": 5e-07, "logits/chosen": -17817780.0, "logits/rejected": 98376312.0, "logps/chosen": -271.2748209635417, "logps/rejected": -420.52337646484375, "loss": 0.4714, "rewards/chosen": 0.11383811632792155, "rewards/margins": 1.8962973753611247, "rewards/rejected": -1.7824592590332031, "step": 12427 }, { "epoch": 0.6587337343969469, "grad_norm": 60.0, "kl": 3.042491912841797, "learning_rate": 5e-07, "logits/chosen": -4413089.333333333, "logits/rejected": -21502659.2, "logps/chosen": -311.7212727864583, "logps/rejected": -378.3358642578125, "loss": 0.2489, "rewards/chosen": 1.0212941964467366, "rewards/margins": 3.029078086217244, "rewards/rejected": -2.007783889770508, "step": 12428 }, { "epoch": 0.6587867383987491, "grad_norm": 34.5, "kl": 0.9437379837036133, "learning_rate": 5e-07, "logits/chosen": -41820756.0, "logits/rejected": -11816486.0, "logps/chosen": -509.96746826171875, "logps/rejected": -168.5103759765625, "loss": 0.1272, "rewards/chosen": 1.5206408500671387, "rewards/margins": 4.531022548675537, "rewards/rejected": -3.0103816986083984, "step": 12429 }, { "epoch": 0.6588397424005512, "grad_norm": 43.75, "kl": 2.0766983032226562, "learning_rate": 5e-07, "logits/chosen": -22455004.8, "logits/rejected": -54371392.0, "logps/chosen": -230.6766845703125, "logps/rejected": -392.860107421875, "loss": 0.1908, "rewards/chosen": 1.6235965728759765, "rewards/margins": 3.4910587310791015, "rewards/rejected": -1.867462158203125, "step": 12430 }, { "epoch": 0.6588927464023534, "grad_norm": 43.25, "kl": 2.7931671142578125, "learning_rate": 5e-07, "logits/chosen": -13628774.0, "logits/rejected": -7493104.0, "logps/chosen": -846.1176147460938, "logps/rejected": -406.2274576822917, "loss": 0.2291, "rewards/chosen": 2.0945487022399902, "rewards/margins": 3.95866060256958, "rewards/rejected": -1.8641119003295898, "step": 12431 }, { "epoch": 0.6589457504041555, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37471616.0, "logits/rejected": -82748208.0, "logps/chosen": -272.9671875, "logps/rejected": -389.883056640625, "loss": 0.2797, "rewards/chosen": 0.6019505977630615, "rewards/margins": 2.437769365310669, "rewards/rejected": -1.8358187675476074, "step": 12432 }, { "epoch": 0.6589987544059577, "grad_norm": 49.25, "kl": 3.649251937866211, "learning_rate": 5e-07, "logits/chosen": 4752676.0, "logits/rejected": -15129874.0, "logps/chosen": -106.80584716796875, "logps/rejected": -338.62078857421875, "loss": 0.43, "rewards/chosen": 0.45503807067871094, "rewards/margins": 2.1557247638702393, "rewards/rejected": -1.7006866931915283, "step": 12433 }, { "epoch": 0.6590517584077598, "grad_norm": 52.75, "kl": 4.354092597961426, "learning_rate": 5e-07, "logits/chosen": -16256124.0, "logits/rejected": -15288204.8, "logps/chosen": -343.0861409505208, "logps/rejected": -261.5988525390625, "loss": 0.2243, "rewards/chosen": 2.035144011179606, "rewards/margins": 3.6882991472880047, "rewards/rejected": -1.6531551361083985, "step": 12434 }, { "epoch": 0.659104762409562, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25447096.0, "logits/rejected": -68083146.66666667, "logps/chosen": -236.8454345703125, "logps/rejected": -535.0667317708334, "loss": 0.2656, "rewards/chosen": 0.6286905765533447, "rewards/margins": 3.671887254714966, "rewards/rejected": -3.043196678161621, "step": 12435 }, { "epoch": 0.659157766411364, "grad_norm": 59.0, "kl": 0.7596044540405273, "learning_rate": 5e-07, "logits/chosen": -20225770.666666668, "logits/rejected": -24831184.0, "logps/chosen": -186.97408040364584, "logps/rejected": -116.08775329589844, "loss": 0.4224, "rewards/chosen": 0.11238559087117513, "rewards/margins": 2.0306881268819175, "rewards/rejected": -1.9183025360107422, "step": 12436 }, { "epoch": 0.6592107704131662, "grad_norm": 32.75, "kl": 1.8971996307373047, "learning_rate": 5e-07, "logits/chosen": 1862603.75, "logits/rejected": -23232002.666666668, "logps/chosen": -127.51615905761719, "logps/rejected": -494.5373128255208, "loss": 0.2256, "rewards/chosen": -0.5308746695518494, "rewards/margins": 2.3822103142738342, "rewards/rejected": -2.9130849838256836, "step": 12437 }, { "epoch": 0.6592637744149683, "grad_norm": 53.25, "kl": 4.729004859924316, "learning_rate": 5e-07, "logits/chosen": -81795605.33333333, "logits/rejected": -46262428.0, "logps/chosen": -274.5265299479167, "logps/rejected": -164.4568634033203, "loss": 0.4525, "rewards/chosen": 0.47834567228953045, "rewards/margins": 1.3877543012301128, "rewards/rejected": -0.9094086289405823, "step": 12438 }, { "epoch": 0.6593167784167705, "grad_norm": 55.75, "kl": 3.3357486724853516, "learning_rate": 5e-07, "logits/chosen": -52161664.0, "logits/rejected": -21935680.0, "logps/chosen": -470.91142578125, "logps/rejected": -212.95198567708334, "loss": 0.3659, "rewards/chosen": 0.4129605293273926, "rewards/margins": 2.84920326868693, "rewards/rejected": -2.4362427393595376, "step": 12439 }, { "epoch": 0.6593697824185726, "grad_norm": 45.25, "kl": 0.759425163269043, "learning_rate": 5e-07, "logits/chosen": -51654720.0, "logits/rejected": -2570576.75, "logps/chosen": -524.083251953125, "logps/rejected": -155.61276245117188, "loss": 0.3059, "rewards/chosen": 0.4647254943847656, "rewards/margins": 2.620563268661499, "rewards/rejected": -2.1558377742767334, "step": 12440 }, { "epoch": 0.6594227864203748, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37925650.666666664, "logits/rejected": -4928964.8, "logps/chosen": -464.0264485677083, "logps/rejected": -219.960205078125, "loss": 0.2178, "rewards/chosen": 1.4567500750223796, "rewards/margins": 3.061526075998942, "rewards/rejected": -1.6047760009765626, "step": 12441 }, { "epoch": 0.6594757904221769, "grad_norm": 56.25, "kl": 0.043292999267578125, "learning_rate": 5e-07, "logits/chosen": -99608560.0, "logits/rejected": -28030772.0, "logps/chosen": -298.2677307128906, "logps/rejected": -197.92990112304688, "loss": 0.2442, "rewards/chosen": 0.8574192523956299, "rewards/margins": 2.638176679611206, "rewards/rejected": -1.7807574272155762, "step": 12442 }, { "epoch": 0.659528794423979, "grad_norm": 40.0, "kl": 4.927983283996582, "learning_rate": 5e-07, "logits/chosen": -4765984.0, "logits/rejected": -13764022.0, "logps/chosen": -620.6099243164062, "logps/rejected": -144.80630493164062, "loss": 0.228, "rewards/chosen": 1.7727305889129639, "rewards/margins": 4.027071952819824, "rewards/rejected": -2.2543413639068604, "step": 12443 }, { "epoch": 0.6595817984257811, "grad_norm": 48.25, "kl": 0.9807949066162109, "learning_rate": 5e-07, "logits/chosen": -22237641.6, "logits/rejected": -104566336.0, "logps/chosen": -248.9185791015625, "logps/rejected": -356.260498046875, "loss": 0.3718, "rewards/chosen": 0.052614593505859376, "rewards/margins": 1.7573729197184245, "rewards/rejected": -1.7047583262125652, "step": 12444 }, { "epoch": 0.6596348024275833, "grad_norm": 27.375, "kl": 2.851785659790039, "learning_rate": 5e-07, "logits/chosen": -45714920.0, "logits/rejected": -23566956.0, "logps/chosen": -235.44151306152344, "logps/rejected": -386.74462890625, "loss": 0.2117, "rewards/chosen": 0.9081900715827942, "rewards/margins": 3.7903369069099426, "rewards/rejected": -2.8821468353271484, "step": 12445 }, { "epoch": 0.6596878064293854, "grad_norm": 45.25, "kl": 1.6643524169921875, "learning_rate": 5e-07, "logits/chosen": -41238922.666666664, "logits/rejected": -43808774.4, "logps/chosen": -317.27406819661456, "logps/rejected": -412.749267578125, "loss": 0.2284, "rewards/chosen": 1.2559718290964763, "rewards/margins": 3.4541655699412033, "rewards/rejected": -2.1981937408447267, "step": 12446 }, { "epoch": 0.6597408104311876, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9617062.666666666, "logits/rejected": -8007025.6, "logps/chosen": -387.2539876302083, "logps/rejected": -337.696435546875, "loss": 0.2524, "rewards/chosen": 0.6904324690500895, "rewards/margins": 2.367414204279582, "rewards/rejected": -1.6769817352294922, "step": 12447 }, { "epoch": 0.6597938144329897, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32688490.666666668, "logits/rejected": -15919564.8, "logps/chosen": -410.9506429036458, "logps/rejected": -215.897119140625, "loss": 0.2934, "rewards/chosen": 0.07909443974494934, "rewards/margins": 1.8070533096790313, "rewards/rejected": -1.727958869934082, "step": 12448 }, { "epoch": 0.6598468184347919, "grad_norm": 62.75, "kl": 8.33608627319336, "learning_rate": 5e-07, "logits/chosen": -17862702.0, "logps/chosen": -589.2029418945312, "loss": 0.4233, "rewards/chosen": 1.2679234743118286, "step": 12449 }, { "epoch": 0.659899822436594, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4768770.666666667, "logits/rejected": -72460712.0, "logps/chosen": -310.6053059895833, "logps/rejected": -280.731201171875, "loss": 0.2895, "rewards/chosen": 0.6273859341939291, "rewards/margins": 3.538204034169515, "rewards/rejected": -2.910818099975586, "step": 12450 }, { "epoch": 0.6599528264383961, "grad_norm": 62.0, "kl": 4.894354820251465, "learning_rate": 5e-07, "logits/chosen": -31850246.4, "logits/rejected": -8214534.0, "logps/chosen": -328.798388671875, "logps/rejected": -137.64344278971353, "loss": 0.3479, "rewards/chosen": 0.9212147712707519, "rewards/margins": 2.653658898671468, "rewards/rejected": -1.732444127400716, "step": 12451 }, { "epoch": 0.6600058304401982, "grad_norm": 52.5, "kl": 0.23823165893554688, "learning_rate": 5e-07, "logits/chosen": -17520956.0, "logits/rejected": -50032117.333333336, "logps/chosen": -538.8630981445312, "logps/rejected": -253.9677530924479, "loss": 0.2537, "rewards/chosen": 0.0636596754193306, "rewards/margins": 1.7272871409853299, "rewards/rejected": -1.6636274655659993, "step": 12452 }, { "epoch": 0.6600588344420004, "grad_norm": 80.0, "kl": 1.8053793907165527, "learning_rate": 5e-07, "logits/chosen": -25018498.0, "logits/rejected": 20403332.0, "logps/chosen": -587.7208251953125, "logps/rejected": -520.8726806640625, "loss": 0.3384, "rewards/chosen": 0.917952299118042, "rewards/margins": 1.624284267425537, "rewards/rejected": -0.7063319683074951, "step": 12453 }, { "epoch": 0.6601118384438025, "grad_norm": 57.25, "kl": 3.3875818252563477, "learning_rate": 5e-07, "logits/chosen": -46613500.8, "logits/rejected": -24533653.333333332, "logps/chosen": -716.515234375, "logps/rejected": -231.11822509765625, "loss": 0.3247, "rewards/chosen": 0.9153561592102051, "rewards/margins": 2.5453003247578936, "rewards/rejected": -1.6299441655476887, "step": 12454 }, { "epoch": 0.6601648424456047, "grad_norm": 67.0, "kl": 0.7468662261962891, "learning_rate": 5e-07, "logits/chosen": 9882456.0, "logits/rejected": -18755509.714285713, "logps/chosen": -59.63262939453125, "logps/rejected": -239.70757184709822, "loss": 0.2333, "rewards/chosen": -0.23615112900733948, "rewards/margins": 1.4590627338205064, "rewards/rejected": -1.695213862827846, "step": 12455 }, { "epoch": 0.6602178464474068, "grad_norm": 54.75, "kl": 0.6910972595214844, "learning_rate": 5e-07, "logits/chosen": -30852044.0, "logits/rejected": -31119322.0, "logps/chosen": -214.1881561279297, "logps/rejected": -444.1351318359375, "loss": 0.2635, "rewards/chosen": 0.7845794558525085, "rewards/margins": 2.779683291912079, "rewards/rejected": -1.9951038360595703, "step": 12456 }, { "epoch": 0.660270850449209, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51365129.6, "logits/rejected": -12370757.333333334, "logps/chosen": -268.9069091796875, "logps/rejected": -370.9978841145833, "loss": 0.3005, "rewards/chosen": 0.24367339611053468, "rewards/margins": 3.454083832105001, "rewards/rejected": -3.2104104359944663, "step": 12457 }, { "epoch": 0.660323854451011, "grad_norm": 44.0, "kl": 1.5432214736938477, "learning_rate": 5e-07, "logits/chosen": 10456180.0, "logits/rejected": -109724736.0, "logps/chosen": -63.59496053059896, "logps/rejected": -352.66923828125, "loss": 0.2945, "rewards/chosen": 0.0014698604742685954, "rewards/margins": 2.3134056667486824, "rewards/rejected": -2.311935806274414, "step": 12458 }, { "epoch": 0.6603768584528132, "grad_norm": 46.0, "kl": 0.6658649444580078, "learning_rate": 5e-07, "logits/chosen": -25730317.333333332, "logits/rejected": -11629537.6, "logps/chosen": -63.58599853515625, "logps/rejected": -171.49862060546874, "loss": 0.2514, "rewards/chosen": 0.6376446882883707, "rewards/margins": 2.344339100519816, "rewards/rejected": -1.7066944122314454, "step": 12459 }, { "epoch": 0.6604298624546153, "grad_norm": 62.5, "kl": 0.9767446517944336, "learning_rate": 5e-07, "logits/chosen": -23102788.0, "logits/rejected": -25696008.0, "logps/chosen": -87.7701416015625, "logps/rejected": -386.2313537597656, "loss": 0.2704, "rewards/chosen": 0.24725157022476196, "rewards/margins": 3.41979843378067, "rewards/rejected": -3.172546863555908, "step": 12460 }, { "epoch": 0.6604828664564174, "grad_norm": 60.5, "kl": 4.75892448425293, "learning_rate": 5e-07, "logits/chosen": -65388794.666666664, "logits/rejected": -16514082.0, "logps/chosen": -423.0933430989583, "logps/rejected": -745.4301147460938, "loss": 0.377, "rewards/chosen": 0.9196572303771973, "rewards/margins": 8.107384204864502, "rewards/rejected": -7.187726974487305, "step": 12461 }, { "epoch": 0.6605358704582196, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29125420.8, "logits/rejected": -16223405.333333334, "logps/chosen": -462.99541015625, "logps/rejected": -293.7784830729167, "loss": 0.1896, "rewards/chosen": 1.0242602348327636, "rewards/margins": 4.8972091992696125, "rewards/rejected": -3.872948964436849, "step": 12462 }, { "epoch": 0.6605888744600217, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48775392.0, "logits/rejected": -12539420.8, "logps/chosen": -190.2015380859375, "logps/rejected": -273.8632080078125, "loss": 0.2868, "rewards/chosen": 0.16126938660939535, "rewards/margins": 1.9621599276860555, "rewards/rejected": -1.8008905410766602, "step": 12463 }, { "epoch": 0.6606418784618239, "grad_norm": 56.75, "kl": 2.1691741943359375, "learning_rate": 5e-07, "logits/chosen": -29323564.0, "logits/rejected": -35676100.0, "logps/chosen": -369.30108642578125, "logps/rejected": -240.23056030273438, "loss": 0.3542, "rewards/chosen": 0.3470684289932251, "rewards/margins": 2.051500916481018, "rewards/rejected": -1.704432487487793, "step": 12464 }, { "epoch": 0.660694882463626, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12010376.0, "logits/rejected": -11703759.2, "logps/chosen": -479.8214925130208, "logps/rejected": -122.4090576171875, "loss": 0.3513, "rewards/chosen": -0.7503112951914469, "rewards/margins": 1.6210302193959554, "rewards/rejected": -2.371341514587402, "step": 12465 }, { "epoch": 0.6607478864654281, "grad_norm": 42.0, "kl": 1.9543991088867188, "learning_rate": 5e-07, "logits/chosen": -3842602.25, "logits/rejected": -18754918.0, "logps/chosen": -176.79124450683594, "logps/rejected": -374.53271484375, "loss": 0.2538, "rewards/chosen": 0.6219696998596191, "rewards/margins": 4.251288890838623, "rewards/rejected": -3.629319190979004, "step": 12466 }, { "epoch": 0.6608008904672302, "grad_norm": 41.25, "kl": 1.9529972076416016, "learning_rate": 5e-07, "logits/chosen": -5020993.6, "logits/rejected": -33750749.333333336, "logps/chosen": -154.7682373046875, "logps/rejected": -560.8128255208334, "loss": 0.3216, "rewards/chosen": 0.3737076997756958, "rewards/margins": 3.568126924832662, "rewards/rejected": -3.1944192250569663, "step": 12467 }, { "epoch": 0.6608538944690324, "grad_norm": 36.75, "kl": 4.784976959228516, "learning_rate": 5e-07, "logits/chosen": -2975558.0, "logits/rejected": -28793685.333333332, "logps/chosen": -391.025244140625, "logps/rejected": -508.3138834635417, "loss": 0.3294, "rewards/chosen": 0.8727200508117676, "rewards/margins": 3.0594000498453777, "rewards/rejected": -2.18667999903361, "step": 12468 }, { "epoch": 0.6609068984708345, "grad_norm": 32.25, "kl": 0.30344104766845703, "learning_rate": 5e-07, "logits/chosen": 13757965.0, "logits/rejected": -19118274.285714287, "logps/chosen": -29.943286895751953, "logps/rejected": -382.9921875, "loss": 0.1391, "rewards/chosen": 0.7363914847373962, "rewards/margins": 3.409910891737257, "rewards/rejected": -2.6735194069998607, "step": 12469 }, { "epoch": 0.6609599024726367, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3039557.0, "logits/rejected": -42321360.0, "logps/chosen": -296.9727478027344, "logps/rejected": -262.67608642578125, "loss": 0.3231, "rewards/chosen": -0.15139980614185333, "rewards/margins": 2.204807087779045, "rewards/rejected": -2.3562068939208984, "step": 12470 }, { "epoch": 0.6610129064744388, "grad_norm": 38.5, "kl": 1.0034751892089844, "learning_rate": 5e-07, "logits/chosen": -7399909.333333333, "logits/rejected": -27576793.6, "logps/chosen": -157.22403971354166, "logps/rejected": -375.76689453125, "loss": 0.1598, "rewards/chosen": 1.7323678334554036, "rewards/margins": 4.07156499226888, "rewards/rejected": -2.3391971588134766, "step": 12471 }, { "epoch": 0.661065910476241, "grad_norm": 54.0, "kl": 4.115390777587891, "learning_rate": 5e-07, "logits/chosen": -38677450.666666664, "logits/rejected": -44297680.0, "logps/chosen": -230.5579833984375, "logps/rejected": -296.4744567871094, "loss": 0.3418, "rewards/chosen": 0.8859976132710775, "rewards/margins": 4.0232804616292315, "rewards/rejected": -3.1372828483581543, "step": 12472 }, { "epoch": 0.661118914478043, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25780701.333333332, "logits/rejected": -109995916.8, "logps/chosen": -311.7245279947917, "logps/rejected": -294.8205078125, "loss": 0.2718, "rewards/chosen": -0.2958486080169678, "rewards/margins": 2.1108004093170165, "rewards/rejected": -2.4066490173339843, "step": 12473 }, { "epoch": 0.6611719184798452, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42898458.666666664, "logits/rejected": 2990156.0, "logps/chosen": -210.4100341796875, "logps/rejected": -229.4147705078125, "loss": 0.3974, "rewards/chosen": -0.21368918816248575, "rewards/margins": 0.7524353464444479, "rewards/rejected": -0.9661245346069336, "step": 12474 }, { "epoch": 0.6612249224816473, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46851212.0, "logits/rejected": -25583268.0, "logps/chosen": -326.69329833984375, "logps/rejected": -352.6138000488281, "loss": 0.2597, "rewards/chosen": 0.3347633481025696, "rewards/margins": 2.9129944443702698, "rewards/rejected": -2.5782310962677, "step": 12475 }, { "epoch": 0.6612779264834495, "grad_norm": 54.75, "kl": 0.6613731384277344, "learning_rate": 5e-07, "logits/chosen": -27708466.0, "logits/rejected": -54829120.0, "logps/chosen": -511.4598083496094, "logps/rejected": -375.0399475097656, "loss": 0.1934, "rewards/chosen": 0.9808723330497742, "rewards/margins": 3.6543607115745544, "rewards/rejected": -2.6734883785247803, "step": 12476 }, { "epoch": 0.6613309304852516, "grad_norm": 48.25, "kl": 1.2153644561767578, "learning_rate": 5e-07, "logits/chosen": -22680762.666666668, "logits/rejected": -53178572.0, "logps/chosen": -225.90226236979166, "logps/rejected": -464.57244873046875, "loss": 0.3209, "rewards/chosen": 0.5510191520055135, "rewards/margins": 3.6417891581853232, "rewards/rejected": -3.0907700061798096, "step": 12477 }, { "epoch": 0.6613839344870538, "grad_norm": 39.25, "kl": 2.945068359375, "learning_rate": 5e-07, "logits/chosen": 2689055.25, "logits/rejected": -42645740.0, "logps/chosen": -276.2093200683594, "logps/rejected": -347.6482849121094, "loss": 0.2576, "rewards/chosen": 0.5446881055831909, "rewards/margins": 2.9867018461227417, "rewards/rejected": -2.442013740539551, "step": 12478 }, { "epoch": 0.6614369384888559, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50588496.0, "logits/rejected": -39114924.0, "logps/chosen": -253.7759552001953, "logps/rejected": -246.90359497070312, "loss": 0.3156, "rewards/chosen": -0.131519615650177, "rewards/margins": 2.749438464641571, "rewards/rejected": -2.880958080291748, "step": 12479 }, { "epoch": 0.6614899424906581, "grad_norm": 48.5, "kl": 3.397228240966797, "learning_rate": 5e-07, "logits/chosen": 10333930.4, "logits/rejected": -66346202.666666664, "logps/chosen": -192.53865966796874, "logps/rejected": -464.0002848307292, "loss": 0.3262, "rewards/chosen": 0.5585086345672607, "rewards/margins": 3.050604263941447, "rewards/rejected": -2.492095629374186, "step": 12480 }, { "epoch": 0.6615429464924601, "grad_norm": 45.5, "kl": 2.3625402450561523, "learning_rate": 5e-07, "logits/chosen": -26883792.0, "logits/rejected": -17079230.666666668, "logps/chosen": -236.1404541015625, "logps/rejected": -270.50439453125, "loss": 0.3774, "rewards/chosen": -0.06219578981399536, "rewards/margins": 3.4357058087984718, "rewards/rejected": -3.4979015986124673, "step": 12481 }, { "epoch": 0.6615959504942623, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9002214.0, "logits/rejected": -7528312.571428572, "logps/chosen": -393.503173828125, "logps/rejected": -219.39948381696428, "loss": 0.1195, "rewards/chosen": 1.759130835533142, "rewards/margins": 4.0409276996340076, "rewards/rejected": -2.281796864100865, "step": 12482 }, { "epoch": 0.6616489544960644, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45338512.0, "logits/rejected": -9582438.4, "logps/chosen": -359.7835693359375, "logps/rejected": -196.8410400390625, "loss": 0.2602, "rewards/chosen": 0.5663119157155355, "rewards/margins": 2.255850871404012, "rewards/rejected": -1.6895389556884766, "step": 12483 }, { "epoch": 0.6617019584978666, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49747276.8, "logits/rejected": -17970733.333333332, "logps/chosen": -429.66455078125, "logps/rejected": -175.1790568033854, "loss": 0.3433, "rewards/chosen": 0.2418583869934082, "rewards/margins": 2.8806416829427084, "rewards/rejected": -2.6387832959493003, "step": 12484 }, { "epoch": 0.6617549624996687, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83472624.0, "logits/rejected": 6965651.0, "logps/chosen": -393.5966796875, "logps/rejected": -199.6171875, "loss": 0.2909, "rewards/chosen": 0.07749786972999573, "rewards/margins": 2.5397218763828278, "rewards/rejected": -2.462224006652832, "step": 12485 }, { "epoch": 0.6618079665014709, "grad_norm": 55.5, "kl": 0.10750865936279297, "learning_rate": 5e-07, "logits/chosen": -45237876.0, "logits/rejected": -23241224.0, "logps/chosen": -411.3935546875, "logps/rejected": -171.02117919921875, "loss": 0.2889, "rewards/chosen": 0.3051399290561676, "rewards/margins": 2.5278872549533844, "rewards/rejected": -2.222747325897217, "step": 12486 }, { "epoch": 0.661860970503273, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79024280.0, "logits/rejected": -32765574.85714286, "logps/chosen": -340.7100830078125, "logps/rejected": -335.5092075892857, "loss": 0.135, "rewards/chosen": 0.998065173625946, "rewards/margins": 3.726831478731973, "rewards/rejected": -2.728766305106027, "step": 12487 }, { "epoch": 0.6619139745050752, "grad_norm": 88.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 85836568.0, "logits/rejected": -7076142.285714285, "logps/chosen": -401.3127136230469, "logps/rejected": -291.11936732700894, "loss": 0.2444, "rewards/chosen": -0.19332580268383026, "rewards/margins": 1.510529984320913, "rewards/rejected": -1.7038557870047433, "step": 12488 }, { "epoch": 0.6619669785068772, "grad_norm": 52.75, "kl": 1.2680397033691406, "learning_rate": 5e-07, "logits/chosen": 33325218.0, "logits/rejected": -28007912.0, "logps/chosen": -103.91162109375, "logps/rejected": -429.978759765625, "loss": 0.346, "rewards/chosen": -0.013882368803024292, "rewards/margins": 1.9748132526874542, "rewards/rejected": -1.9886956214904785, "step": 12489 }, { "epoch": 0.6620199825086794, "grad_norm": 41.5, "kl": 2.7618770599365234, "learning_rate": 5e-07, "logits/chosen": -18940189.333333332, "logits/rejected": -19076179.2, "logps/chosen": -127.5739034016927, "logps/rejected": -348.1906494140625, "loss": 0.2235, "rewards/chosen": 0.9237092336018881, "rewards/margins": 3.4402615865071615, "rewards/rejected": -2.5165523529052733, "step": 12490 }, { "epoch": 0.6620729865104815, "grad_norm": 43.75, "kl": 2.400674819946289, "learning_rate": 5e-07, "logits/chosen": -37275588.0, "logits/rejected": -48470520.0, "logps/chosen": -355.40234375, "logps/rejected": -595.6880493164062, "loss": 0.2022, "rewards/chosen": 1.1409976482391357, "rewards/margins": 4.749114990234375, "rewards/rejected": -3.6081173419952393, "step": 12491 }, { "epoch": 0.6621259905122837, "grad_norm": 57.75, "kl": 2.6863174438476562, "learning_rate": 5e-07, "logits/chosen": -34211176.0, "logits/rejected": 4489364.0, "logps/chosen": -252.6592814127604, "logps/rejected": -384.0604553222656, "loss": 0.3827, "rewards/chosen": 0.7294975916544596, "rewards/margins": 1.598202625910441, "rewards/rejected": -0.8687050342559814, "step": 12492 }, { "epoch": 0.6621789945140858, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12941418.0, "logits/rejected": -29788618.0, "logps/chosen": -415.66693115234375, "logps/rejected": -222.96145629882812, "loss": 0.1958, "rewards/chosen": 0.5777252316474915, "rewards/margins": 4.508052408695221, "rewards/rejected": -3.9303271770477295, "step": 12493 }, { "epoch": 0.662231998515888, "grad_norm": 35.0, "kl": 1.6176366806030273, "learning_rate": 5e-07, "logits/chosen": -5945670.0, "logits/rejected": -97685484.8, "logps/chosen": -207.82523600260416, "logps/rejected": -367.3383544921875, "loss": 0.2072, "rewards/chosen": 1.2746955553690593, "rewards/margins": 3.03865925470988, "rewards/rejected": -1.7639636993408203, "step": 12494 }, { "epoch": 0.6622850025176901, "grad_norm": 52.5, "kl": 0.9732513427734375, "learning_rate": 5e-07, "logits/chosen": -7544974.0, "logits/rejected": -30947044.0, "logps/chosen": -417.01275634765625, "logps/rejected": -279.6433410644531, "loss": 0.1943, "rewards/chosen": 0.8336925506591797, "rewards/margins": 4.833585739135742, "rewards/rejected": -3.9998931884765625, "step": 12495 }, { "epoch": 0.6623380065194923, "grad_norm": 30.75, "kl": 1.424983024597168, "learning_rate": 5e-07, "logits/chosen": 10518679.0, "logits/rejected": -7948819.5, "logps/chosen": -50.82387161254883, "logps/rejected": -415.37823486328125, "loss": 0.2552, "rewards/chosen": 0.5348165035247803, "rewards/margins": 3.361534357070923, "rewards/rejected": -2.8267178535461426, "step": 12496 }, { "epoch": 0.6623910105212943, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27964090.666666668, "logits/rejected": -15581620.8, "logps/chosen": -294.1597086588542, "logps/rejected": -268.2543212890625, "loss": 0.2717, "rewards/chosen": 0.5578827063242594, "rewards/margins": 2.2080806891123452, "rewards/rejected": -1.650197982788086, "step": 12497 }, { "epoch": 0.6624440145230965, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30878448.0, "logits/rejected": -29840028.8, "logps/chosen": -326.5698649088542, "logps/rejected": -482.55107421875, "loss": 0.2127, "rewards/chosen": 0.2918396194775899, "rewards/margins": 3.561230870087942, "rewards/rejected": -3.2693912506103517, "step": 12498 }, { "epoch": 0.6624970185248986, "grad_norm": 28.25, "kl": 1.3313369750976562, "learning_rate": 5e-07, "logits/chosen": -2406430.3333333335, "logits/rejected": -35258924.8, "logps/chosen": -71.15681457519531, "logps/rejected": -450.46513671875, "loss": 0.1819, "rewards/chosen": 0.6862583955128988, "rewards/margins": 4.11600006421407, "rewards/rejected": -3.4297416687011717, "step": 12499 }, { "epoch": 0.6625500225267008, "grad_norm": 71.5, "kl": 1.4276847839355469, "learning_rate": 5e-07, "logits/chosen": -22916197.333333332, "logits/rejected": -10645546.0, "logps/chosen": -430.0685221354167, "logps/rejected": -143.93267822265625, "loss": 0.3173, "rewards/chosen": 0.8042335510253906, "rewards/margins": 2.353542447090149, "rewards/rejected": -1.5493088960647583, "step": 12500 }, { "epoch": 0.6626030265285029, "grad_norm": 70.0, "kl": 1.0365657806396484, "learning_rate": 5e-07, "logits/chosen": 33605776.0, "logits/rejected": 27645626.0, "logps/chosen": -578.3068237304688, "logps/rejected": -276.76416015625, "loss": 0.2901, "rewards/chosen": 0.8285998702049255, "rewards/margins": 2.2242358326911926, "rewards/rejected": -1.395635962486267, "step": 12501 }, { "epoch": 0.6626560305303051, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49895580.0, "logits/rejected": -21738714.0, "logps/chosen": -184.94090270996094, "logps/rejected": -310.7158203125, "loss": 0.3023, "rewards/chosen": 0.2589627802371979, "rewards/margins": 1.951531320810318, "rewards/rejected": -1.6925685405731201, "step": 12502 }, { "epoch": 0.6627090345321072, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75597072.0, "logits/rejected": -30753877.333333332, "logps/chosen": -483.0251770019531, "logps/rejected": -336.4468587239583, "loss": 0.2224, "rewards/chosen": 0.75872802734375, "rewards/margins": 2.3269673983256025, "rewards/rejected": -1.5682393709818523, "step": 12503 }, { "epoch": 0.6627620385339094, "grad_norm": 44.75, "kl": 2.537931442260742, "learning_rate": 5e-07, "logits/chosen": -30711654.4, "logits/rejected": -24682840.0, "logps/chosen": -429.42177734375, "logps/rejected": -286.71826171875, "loss": 0.3223, "rewards/chosen": 0.5059809684753418, "rewards/margins": 2.4754598935445147, "rewards/rejected": -1.969478925069173, "step": 12504 }, { "epoch": 0.6628150425357114, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65369002.666666664, "logits/rejected": -13614913.6, "logps/chosen": -458.7747395833333, "logps/rejected": -257.91376953125, "loss": 0.3201, "rewards/chosen": -0.01607974370320638, "rewards/margins": 1.3857887903849284, "rewards/rejected": -1.4018685340881347, "step": 12505 }, { "epoch": 0.6628680465375136, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7969076.666666667, "logits/rejected": -4714026.0, "logps/chosen": -276.986083984375, "logps/rejected": -193.91097412109374, "loss": 0.2904, "rewards/chosen": 0.47407039006551105, "rewards/margins": 2.0214675267537436, "rewards/rejected": -1.5473971366882324, "step": 12506 }, { "epoch": 0.6629210505393157, "grad_norm": 59.75, "kl": 2.1982421875, "learning_rate": 5e-07, "logits/chosen": -64590005.333333336, "logits/rejected": -4120627.2, "logps/chosen": -689.93505859375, "logps/rejected": -64.86431884765625, "loss": 0.2607, "rewards/chosen": 1.8260688781738281, "rewards/margins": 2.9627467155456544, "rewards/rejected": -1.1366778373718263, "step": 12507 }, { "epoch": 0.6629740545411179, "grad_norm": 52.5, "kl": 0.7102851867675781, "learning_rate": 5e-07, "logits/chosen": -38821720.0, "logits/rejected": -70977544.0, "logps/chosen": -361.41094970703125, "logps/rejected": -550.4962158203125, "loss": 0.2863, "rewards/chosen": 0.301907479763031, "rewards/margins": 2.8633164763450623, "rewards/rejected": -2.5614089965820312, "step": 12508 }, { "epoch": 0.66302705854292, "grad_norm": 40.75, "kl": 0.4328956604003906, "learning_rate": 5e-07, "logits/chosen": -26201934.0, "logits/rejected": -10493640.666666666, "logps/chosen": -212.01437377929688, "logps/rejected": -270.9600016276042, "loss": 0.2337, "rewards/chosen": -0.14183788001537323, "rewards/margins": 2.2294925401608148, "rewards/rejected": -2.371330420176188, "step": 12509 }, { "epoch": 0.6630800625447222, "grad_norm": 41.75, "kl": 0.548090934753418, "learning_rate": 5e-07, "logits/chosen": -53644116.0, "logits/rejected": -6826949.0, "logps/chosen": -224.49949645996094, "logps/rejected": -182.49118041992188, "loss": 0.2287, "rewards/chosen": 0.642014741897583, "rewards/margins": 3.478769302368164, "rewards/rejected": -2.836754560470581, "step": 12510 }, { "epoch": 0.6631330665465243, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48789052.0, "logits/rejected": -36696349.333333336, "logps/chosen": -317.163818359375, "logps/rejected": -360.6856282552083, "loss": 0.2181, "rewards/chosen": 0.7563705444335938, "rewards/margins": 2.466142336527507, "rewards/rejected": -1.7097717920939128, "step": 12511 }, { "epoch": 0.6631860705483263, "grad_norm": 63.5, "kl": 4.688840866088867, "learning_rate": 5e-07, "logits/chosen": -35139016.0, "logps/chosen": -227.9468231201172, "loss": 0.5071, "rewards/chosen": 0.44119739532470703, "step": 12512 }, { "epoch": 0.6632390745501285, "grad_norm": 54.0, "kl": 3.7991714477539062, "learning_rate": 5e-07, "logits/chosen": -9740080.0, "logits/rejected": -24638821.333333332, "logps/chosen": -325.5353515625, "logps/rejected": -629.5967610677084, "loss": 0.2588, "rewards/chosen": 1.4477620124816895, "rewards/margins": 3.7506704330444336, "rewards/rejected": -2.302908420562744, "step": 12513 }, { "epoch": 0.6632920785519306, "grad_norm": 36.5, "kl": 1.0601863861083984, "learning_rate": 5e-07, "logits/chosen": -9087929.333333334, "logits/rejected": -18724470.4, "logps/chosen": -110.62143961588542, "logps/rejected": -336.428076171875, "loss": 0.2474, "rewards/chosen": -0.04467297593752543, "rewards/margins": 2.671850766738256, "rewards/rejected": -2.7165237426757813, "step": 12514 }, { "epoch": 0.6633450825537328, "grad_norm": 42.0, "kl": 1.2100200653076172, "learning_rate": 5e-07, "logits/chosen": 8063861.0, "logits/rejected": -17895422.0, "logps/chosen": -262.66943359375, "logps/rejected": -376.46337890625, "loss": 0.2754, "rewards/chosen": 0.4111437201499939, "rewards/margins": 3.639445722103119, "rewards/rejected": -3.228302001953125, "step": 12515 }, { "epoch": 0.6633980865555349, "grad_norm": 72.5, "kl": 1.3878631591796875, "learning_rate": 5e-07, "logits/chosen": -84906416.0, "logits/rejected": -9436022.4, "logps/chosen": -436.5006103515625, "logps/rejected": -313.9168212890625, "loss": 0.2083, "rewards/chosen": 0.7048869132995605, "rewards/margins": 2.894269847869873, "rewards/rejected": -2.1893829345703124, "step": 12516 }, { "epoch": 0.6634510905573371, "grad_norm": 60.0, "kl": 0.0758209228515625, "learning_rate": 5e-07, "logits/chosen": -57623738.666666664, "logits/rejected": -7471418.4, "logps/chosen": -412.10498046875, "logps/rejected": -239.4214599609375, "loss": 0.2898, "rewards/chosen": 0.23074849446614584, "rewards/margins": 1.8996642430623372, "rewards/rejected": -1.6689157485961914, "step": 12517 }, { "epoch": 0.6635040945591392, "grad_norm": 42.75, "kl": 2.3107261657714844, "learning_rate": 5e-07, "logits/chosen": -83757045.33333333, "logits/rejected": 6255824.0, "logps/chosen": -368.5419108072917, "logps/rejected": -309.9470947265625, "loss": 0.2527, "rewards/chosen": 0.8335891564687093, "rewards/margins": 2.9787570794423424, "rewards/rejected": -2.145167922973633, "step": 12518 }, { "epoch": 0.6635570985609414, "grad_norm": 40.5, "kl": 2.4788570404052734, "learning_rate": 5e-07, "logits/chosen": -22621713.6, "logits/rejected": -3998716.6666666665, "logps/chosen": -310.879296875, "logps/rejected": -665.0972493489584, "loss": 0.3371, "rewards/chosen": 0.465347146987915, "rewards/margins": 4.7014025211334225, "rewards/rejected": -4.236055374145508, "step": 12519 }, { "epoch": 0.6636101025627434, "grad_norm": 42.75, "kl": 1.5985422134399414, "learning_rate": 5e-07, "logits/chosen": -22409786.0, "logits/rejected": -10318125.0, "logps/chosen": -189.4760284423828, "logps/rejected": -231.8303985595703, "loss": 0.2868, "rewards/chosen": 0.3907836675643921, "rewards/margins": 2.600000262260437, "rewards/rejected": -2.209216594696045, "step": 12520 }, { "epoch": 0.6636631065645456, "grad_norm": 52.5, "kl": 1.8630638122558594, "learning_rate": 5e-07, "logits/chosen": -16297354.0, "logits/rejected": -22315094.0, "logps/chosen": -191.4755096435547, "logps/rejected": -370.55181884765625, "loss": 0.3561, "rewards/chosen": -0.10562703758478165, "rewards/margins": 1.9554321989417076, "rewards/rejected": -2.0610592365264893, "step": 12521 }, { "epoch": 0.6637161105663477, "grad_norm": 38.0, "kl": 0.5582752227783203, "learning_rate": 5e-07, "logits/chosen": -355470.6666666667, "logits/rejected": -15854585.6, "logps/chosen": -215.24444580078125, "logps/rejected": -363.41787109375, "loss": 0.1896, "rewards/chosen": 1.3533596992492676, "rewards/margins": 3.382259654998779, "rewards/rejected": -2.0288999557495115, "step": 12522 }, { "epoch": 0.6637691145681499, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47066176.0, "logits/rejected": -6892713.6, "logps/chosen": -472.9083658854167, "logps/rejected": -299.429296875, "loss": 0.2837, "rewards/chosen": 0.5964111487070719, "rewards/margins": 2.542533508936564, "rewards/rejected": -1.9461223602294921, "step": 12523 }, { "epoch": 0.663822118569952, "grad_norm": 58.25, "kl": 2.2340126037597656, "learning_rate": 5e-07, "logits/chosen": -32819507.2, "logits/rejected": -29994576.0, "logps/chosen": -385.1503173828125, "logps/rejected": -384.4818929036458, "loss": 0.3333, "rewards/chosen": 0.5025863170623779, "rewards/margins": 2.246246925989787, "rewards/rejected": -1.743660608927409, "step": 12524 }, { "epoch": 0.6638751225717542, "grad_norm": 50.25, "kl": 5.025260925292969, "learning_rate": 5e-07, "logits/chosen": -3209762.4, "logits/rejected": -27962762.666666668, "logps/chosen": -257.297705078125, "logps/rejected": -267.5601399739583, "loss": 0.3466, "rewards/chosen": 0.8106624603271484, "rewards/margins": 4.081291389465332, "rewards/rejected": -3.2706289291381836, "step": 12525 }, { "epoch": 0.6639281265735563, "grad_norm": 30.75, "kl": 2.5095973014831543, "learning_rate": 5e-07, "logits/chosen": 9260238.4, "logits/rejected": -6588130.666666667, "logps/chosen": -97.09064331054688, "logps/rejected": -150.85526529947916, "loss": 0.3574, "rewards/chosen": 0.4224992752075195, "rewards/margins": 2.5409844080607096, "rewards/rejected": -2.11848513285319, "step": 12526 }, { "epoch": 0.6639811305753585, "grad_norm": 31.125, "kl": 2.634815216064453, "learning_rate": 5e-07, "logits/chosen": -7793637.5, "logits/rejected": -20550214.0, "logps/chosen": -113.65843200683594, "logps/rejected": -338.27178955078125, "loss": 0.2805, "rewards/chosen": 0.4076353907585144, "rewards/margins": 3.3186082243919373, "rewards/rejected": -2.910972833633423, "step": 12527 }, { "epoch": 0.6640341345771605, "grad_norm": 51.75, "kl": 1.7355461120605469, "learning_rate": 5e-07, "logits/chosen": -36239098.666666664, "logits/rejected": -4355411.5, "logps/chosen": -287.64833577473956, "logps/rejected": -103.32146453857422, "loss": 0.5211, "rewards/chosen": -0.20528576771418253, "rewards/margins": 0.7468652923901876, "rewards/rejected": -0.9521510601043701, "step": 12528 }, { "epoch": 0.6640871385789627, "grad_norm": 46.0, "kl": 2.449970245361328, "learning_rate": 5e-07, "logits/chosen": -45246848.0, "logits/rejected": -19861248.0, "logps/chosen": -342.3982421875, "logps/rejected": -236.3052978515625, "loss": 0.2898, "rewards/chosen": 0.7324635028839112, "rewards/margins": 3.5765790780385336, "rewards/rejected": -2.8441155751546225, "step": 12529 }, { "epoch": 0.6641401425807648, "grad_norm": 58.75, "kl": 2.796630859375, "learning_rate": 5e-07, "logits/chosen": -12939534.666666666, "logits/rejected": 2571322.5, "logps/chosen": -437.4407552083333, "logps/rejected": -132.63905334472656, "loss": 0.3165, "rewards/chosen": 1.0267833073933919, "rewards/margins": 2.6324230035146075, "rewards/rejected": -1.6056396961212158, "step": 12530 }, { "epoch": 0.664193146582567, "grad_norm": 35.5, "kl": 0.3607959747314453, "learning_rate": 5e-07, "logits/chosen": -5224895.666666667, "logits/rejected": -18815062.4, "logps/chosen": -202.8001912434896, "logps/rejected": -285.55205078125, "loss": 0.2831, "rewards/chosen": 0.27237751086552936, "rewards/margins": 2.019532318909963, "rewards/rejected": -1.7471548080444337, "step": 12531 }, { "epoch": 0.6642461505843691, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9605738.0, "logits/rejected": -30808250.0, "logps/chosen": -213.14727783203125, "logps/rejected": -430.1854553222656, "loss": 0.3428, "rewards/chosen": -0.13030605018138885, "rewards/margins": 1.9353012591600418, "rewards/rejected": -2.0656073093414307, "step": 12532 }, { "epoch": 0.6642991545861713, "grad_norm": 44.5, "kl": 2.540738105773926, "learning_rate": 5e-07, "logits/chosen": -13652038.0, "logits/rejected": -20076094.0, "logps/chosen": -295.66650390625, "logps/rejected": -420.8686218261719, "loss": 0.2402, "rewards/chosen": 0.5330772399902344, "rewards/margins": 3.505162000656128, "rewards/rejected": -2.9720847606658936, "step": 12533 }, { "epoch": 0.6643521585879734, "grad_norm": 65.0, "kl": 2.725020408630371, "learning_rate": 5e-07, "logits/chosen": -59140970.666666664, "logits/rejected": -38806960.0, "logps/chosen": -619.6852213541666, "logps/rejected": -152.80625, "loss": 0.2862, "rewards/chosen": 0.9956459204355875, "rewards/margins": 2.6914432684580483, "rewards/rejected": -1.6957973480224608, "step": 12534 }, { "epoch": 0.6644051625897756, "grad_norm": 58.5, "kl": 1.964442253112793, "learning_rate": 5e-07, "logits/chosen": -29715433.14285714, "logits/rejected": 4235258.5, "logps/chosen": -255.8765869140625, "logps/rejected": -46.579978942871094, "loss": 0.4059, "rewards/chosen": 0.48042917251586914, "rewards/margins": 2.321598768234253, "rewards/rejected": -1.8411695957183838, "step": 12535 }, { "epoch": 0.6644581665915776, "grad_norm": 63.25, "kl": 2.7860584259033203, "learning_rate": 5e-07, "logits/chosen": 161504181.33333334, "logits/rejected": 2434352.4, "logps/chosen": -626.9202473958334, "logps/rejected": -80.1932861328125, "loss": 0.2607, "rewards/chosen": 1.6601359049479167, "rewards/margins": 2.626863543192546, "rewards/rejected": -0.9667276382446289, "step": 12536 }, { "epoch": 0.6645111705933798, "grad_norm": 50.0, "kl": 2.2450828552246094, "learning_rate": 5e-07, "logits/chosen": -30449955.2, "logits/rejected": -21722956.0, "logps/chosen": -301.34658203125, "logps/rejected": -318.96559651692706, "loss": 0.3057, "rewards/chosen": 0.5416345596313477, "rewards/margins": 2.6337594985961914, "rewards/rejected": -2.0921249389648438, "step": 12537 }, { "epoch": 0.6645641745951819, "grad_norm": 36.5, "kl": 2.410696029663086, "learning_rate": 5e-07, "logits/chosen": -45969781.333333336, "logits/rejected": -36494707.2, "logps/chosen": -218.19803873697916, "logps/rejected": -464.03310546875, "loss": 0.2207, "rewards/chosen": 0.601405660311381, "rewards/margins": 3.8743365685145057, "rewards/rejected": -3.272930908203125, "step": 12538 }, { "epoch": 0.6646171785969841, "grad_norm": 32.75, "kl": 2.9703798294067383, "learning_rate": 5e-07, "logits/chosen": 5773220.0, "logits/rejected": -82416012.8, "logps/chosen": -22.97008005777995, "logps/rejected": -492.37001953125, "loss": 0.2014, "rewards/chosen": 0.6539666652679443, "rewards/margins": 3.479648733139038, "rewards/rejected": -2.8256820678710937, "step": 12539 }, { "epoch": 0.6646701825987862, "grad_norm": 50.75, "kl": 0.5988311767578125, "learning_rate": 5e-07, "logits/chosen": -16401559.0, "logits/rejected": -14075520.0, "logps/chosen": -297.4188232421875, "logps/rejected": -134.55120849609375, "loss": 0.1901, "rewards/chosen": 1.1689518690109253, "rewards/margins": 3.740002751350403, "rewards/rejected": -2.5710508823394775, "step": 12540 }, { "epoch": 0.6647231866005884, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12924771.2, "logits/rejected": -21787713.333333332, "logps/chosen": -478.24716796875, "logps/rejected": -91.5039571126302, "loss": 0.3552, "rewards/chosen": 0.377764892578125, "rewards/margins": 1.4522768974304199, "rewards/rejected": -1.074512004852295, "step": 12541 }, { "epoch": 0.6647761906023905, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7216388.8, "logits/rejected": -18357100.0, "logps/chosen": -408.0316162109375, "logps/rejected": -424.2947998046875, "loss": 0.3459, "rewards/chosen": -0.15584410429000856, "rewards/margins": 4.013263014952342, "rewards/rejected": -4.169107119242351, "step": 12542 }, { "epoch": 0.6648291946041927, "grad_norm": 38.25, "kl": 1.3779535293579102, "learning_rate": 5e-07, "logits/chosen": -16398963.2, "logits/rejected": -20301632.0, "logps/chosen": -128.5568359375, "logps/rejected": -392.6039225260417, "loss": 0.3435, "rewards/chosen": 0.22688694000244142, "rewards/margins": 3.0172445297241213, "rewards/rejected": -2.7903575897216797, "step": 12543 }, { "epoch": 0.6648821986059947, "grad_norm": 50.25, "kl": 0.3408651351928711, "learning_rate": 5e-07, "logits/chosen": -30307475.2, "logits/rejected": 35733432.0, "logps/chosen": -143.46029052734374, "logps/rejected": -674.2452392578125, "loss": 0.3776, "rewards/chosen": 0.057179379463195804, "rewards/margins": 3.489101974169413, "rewards/rejected": -3.4319225947062173, "step": 12544 }, { "epoch": 0.6649352026077969, "grad_norm": 67.0, "kl": 4.2488861083984375, "learning_rate": 5e-07, "logits/chosen": -45414956.8, "logits/rejected": -12877520.0, "logps/chosen": -679.2091796875, "logps/rejected": -469.8904215494792, "loss": 0.2081, "rewards/chosen": 1.8044061660766602, "rewards/margins": 5.140380541483561, "rewards/rejected": -3.335974375406901, "step": 12545 }, { "epoch": 0.664988206609599, "grad_norm": 45.5, "kl": 2.3538436889648438, "learning_rate": 5e-07, "logits/chosen": 17291203.2, "logits/rejected": -19567557.333333332, "logps/chosen": -148.57628173828124, "logps/rejected": -100.11903889973958, "loss": 0.4249, "rewards/chosen": 0.058882832527160645, "rewards/margins": 1.831104079882304, "rewards/rejected": -1.7722212473551433, "step": 12546 }, { "epoch": 0.6650412106114012, "grad_norm": 48.25, "kl": 2.8944644927978516, "learning_rate": 5e-07, "logits/chosen": -61196752.0, "logits/rejected": -21292294.4, "logps/chosen": -249.061279296875, "logps/rejected": -283.024755859375, "loss": 0.3457, "rewards/chosen": 0.6093390782674154, "rewards/margins": 1.9175963719685871, "rewards/rejected": -1.3082572937011718, "step": 12547 }, { "epoch": 0.6650942146132033, "grad_norm": 49.25, "kl": 2.5159473419189453, "learning_rate": 5e-07, "logits/chosen": -12989936.0, "logits/rejected": 17436540.0, "logps/chosen": -201.0029296875, "logps/rejected": -471.8351135253906, "loss": 0.4046, "rewards/chosen": 0.47032274518694195, "rewards/margins": 3.8311900070735385, "rewards/rejected": -3.3608672618865967, "step": 12548 }, { "epoch": 0.6651472186150055, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20278790.666666668, "logits/rejected": -47848940.8, "logps/chosen": -203.32708740234375, "logps/rejected": -381.491796875, "loss": 0.221, "rewards/chosen": 0.32449615001678467, "rewards/margins": 3.315798258781433, "rewards/rejected": -2.9913021087646485, "step": 12549 }, { "epoch": 0.6652002226168076, "grad_norm": 53.25, "kl": 1.6042251586914062, "learning_rate": 5e-07, "logits/chosen": -36623348.0, "logits/rejected": -32932992.0, "logps/chosen": -372.9639892578125, "logps/rejected": -560.6622314453125, "loss": 0.312, "rewards/chosen": 0.4656982421875, "rewards/margins": 2.6050186157226562, "rewards/rejected": -2.1393203735351562, "step": 12550 }, { "epoch": 0.6652532266186097, "grad_norm": 48.25, "kl": 1.2212295532226562, "learning_rate": 5e-07, "logits/chosen": -57178405.333333336, "logits/rejected": 4397190.0, "logps/chosen": -652.9376627604166, "logps/rejected": -360.5833740234375, "loss": 0.2607, "rewards/chosen": 1.4653887748718262, "rewards/margins": 3.007323169708252, "rewards/rejected": -1.5419343948364257, "step": 12551 }, { "epoch": 0.6653062306204118, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15734922.0, "logits/rejected": -16123877.333333334, "logps/chosen": -524.5931396484375, "logps/rejected": -290.1770426432292, "loss": 0.1617, "rewards/chosen": 0.6813766360282898, "rewards/margins": 3.5198375582695007, "rewards/rejected": -2.838460922241211, "step": 12552 }, { "epoch": 0.665359234622214, "grad_norm": 55.5, "kl": 0.518890380859375, "learning_rate": 5e-07, "logits/chosen": -3699536.75, "logits/rejected": -30953840.0, "logps/chosen": -297.4295654296875, "logps/rejected": -178.2531534830729, "loss": 0.2948, "rewards/chosen": 0.18156738579273224, "rewards/margins": 1.817263161142667, "rewards/rejected": -1.6356957753499348, "step": 12553 }, { "epoch": 0.6654122386240161, "grad_norm": 75.0, "kl": 3.1080780029296875, "learning_rate": 5e-07, "logits/chosen": -15401177.6, "logits/rejected": -19131265.333333332, "logps/chosen": -402.34912109375, "logps/rejected": -175.9697062174479, "loss": 0.4201, "rewards/chosen": 0.5736485481262207, "rewards/margins": 1.4165165583292643, "rewards/rejected": -0.8428680102030436, "step": 12554 }, { "epoch": 0.6654652426258183, "grad_norm": 46.75, "kl": 2.6371002197265625, "learning_rate": 5e-07, "logits/chosen": -94303.6, "logits/rejected": -24383536.0, "logps/chosen": -257.5948974609375, "logps/rejected": -347.7867431640625, "loss": 0.2378, "rewards/chosen": 1.1228534698486328, "rewards/margins": 3.9944208780924475, "rewards/rejected": -2.871567408243815, "step": 12555 }, { "epoch": 0.6655182466276204, "grad_norm": 64.5, "kl": 2.5562820434570312, "learning_rate": 5e-07, "logits/chosen": -31131412.0, "logits/rejected": -6816933.0, "logps/chosen": -904.2952880859375, "logps/rejected": -252.02471923828125, "loss": 0.287, "rewards/chosen": 1.0689598321914673, "rewards/margins": 2.8241941928863525, "rewards/rejected": -1.7552343606948853, "step": 12556 }, { "epoch": 0.6655712506294226, "grad_norm": 85.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59807795.2, "logits/rejected": -5290224.666666667, "logps/chosen": -452.97158203125, "logps/rejected": -157.64818318684897, "loss": 0.3349, "rewards/chosen": 0.6658495903015137, "rewards/margins": 1.6015500863393148, "rewards/rejected": -0.9357004960378011, "step": 12557 }, { "epoch": 0.6656242546312247, "grad_norm": 29.25, "kl": 1.9972877502441406, "learning_rate": 5e-07, "logits/chosen": -15697542.666666666, "logits/rejected": -53395462.4, "logps/chosen": -124.43231201171875, "logps/rejected": -474.7623046875, "loss": 0.2273, "rewards/chosen": 0.34349024295806885, "rewards/margins": 3.4579275369644167, "rewards/rejected": -3.114437294006348, "step": 12558 }, { "epoch": 0.6656772586330268, "grad_norm": 52.5, "kl": 1.1188316345214844, "learning_rate": 5e-07, "logits/chosen": -7168702.857142857, "logits/rejected": 4871188.0, "logps/chosen": -281.1099155970982, "logps/rejected": -71.37339782714844, "loss": 0.4218, "rewards/chosen": 0.25568956988198416, "rewards/margins": 2.806144424847194, "rewards/rejected": -2.55045485496521, "step": 12559 }, { "epoch": 0.6657302626348289, "grad_norm": 39.5, "kl": 1.627197265625, "learning_rate": 5e-07, "logits/chosen": 12900297.333333334, "logits/rejected": -41625014.4, "logps/chosen": -39.17573547363281, "logps/rejected": -285.9951904296875, "loss": 0.2339, "rewards/chosen": 0.9433582623799642, "rewards/margins": 2.9917093594868978, "rewards/rejected": -2.0483510971069334, "step": 12560 }, { "epoch": 0.665783266636631, "grad_norm": 110.0, "kl": 1.5540103912353516, "learning_rate": 5e-07, "logits/chosen": -36737645.333333336, "logits/rejected": -59180294.4, "logps/chosen": -340.32961018880206, "logps/rejected": -298.6004638671875, "loss": 0.2811, "rewards/chosen": 0.8329363663991293, "rewards/margins": 2.5630911668141683, "rewards/rejected": -1.730154800415039, "step": 12561 }, { "epoch": 0.6658362706384332, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5129646.0, "logits/rejected": -46957660.0, "logps/chosen": -332.32928466796875, "logps/rejected": -353.34381103515625, "loss": 0.3878, "rewards/chosen": -0.16007386147975922, "rewards/margins": 1.5623672157526016, "rewards/rejected": -1.7224410772323608, "step": 12562 }, { "epoch": 0.6658892746402353, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46521940.0, "logits/rejected": 1899245.0, "logps/chosen": -330.73895263671875, "logps/rejected": -394.79693603515625, "loss": 0.3745, "rewards/chosen": -0.12404479831457138, "rewards/margins": 2.556493379175663, "rewards/rejected": -2.6805381774902344, "step": 12563 }, { "epoch": 0.6659422786420375, "grad_norm": 44.5, "kl": 4.060547828674316, "learning_rate": 5e-07, "logits/chosen": -8229405.0, "logits/rejected": -13909993.0, "logps/chosen": -403.45208740234375, "logps/rejected": -210.81854248046875, "loss": 0.3379, "rewards/chosen": 1.0568456649780273, "rewards/margins": 2.868386387825012, "rewards/rejected": -1.8115407228469849, "step": 12564 }, { "epoch": 0.6659952826438396, "grad_norm": 42.25, "kl": 2.4854564666748047, "learning_rate": 5e-07, "logits/chosen": -10772952.0, "logits/rejected": -13691085.0, "logps/chosen": -512.2120971679688, "logps/rejected": -325.0221252441406, "loss": 0.2524, "rewards/chosen": 1.0960806608200073, "rewards/margins": 3.548900008201599, "rewards/rejected": -2.452819347381592, "step": 12565 }, { "epoch": 0.6660482866456418, "grad_norm": 48.25, "kl": 2.2227344512939453, "learning_rate": 5e-07, "logits/chosen": 157626.66666666666, "logits/rejected": -12509251.0, "logps/chosen": -512.4657796223959, "logps/rejected": -193.07839965820312, "loss": 0.4385, "rewards/chosen": 0.5451299349466959, "rewards/margins": 1.9209384123484292, "rewards/rejected": -1.3758084774017334, "step": 12566 }, { "epoch": 0.6661012906474438, "grad_norm": 37.25, "kl": 2.6827850341796875, "learning_rate": 5e-07, "logits/chosen": -15224794.666666666, "logits/rejected": -33628284.0, "logps/chosen": -129.540771484375, "logps/rejected": -296.84661865234375, "loss": 0.4165, "rewards/chosen": 0.36613627274831134, "rewards/margins": 2.096005400021871, "rewards/rejected": -1.7298691272735596, "step": 12567 }, { "epoch": 0.666154294649246, "grad_norm": 40.0, "kl": 1.6613349914550781, "learning_rate": 5e-07, "logits/chosen": -24446444.0, "logits/rejected": -47429205.333333336, "logps/chosen": -681.3113403320312, "logps/rejected": -360.3678792317708, "loss": 0.2267, "rewards/chosen": 1.7051337957382202, "rewards/margins": 3.614822268486023, "rewards/rejected": -1.9096884727478027, "step": 12568 }, { "epoch": 0.6662072986510481, "grad_norm": 39.25, "kl": 1.757781982421875, "learning_rate": 5e-07, "logits/chosen": -27013344.0, "logits/rejected": -38813928.0, "logps/chosen": -158.0387725830078, "logps/rejected": -201.78517150878906, "loss": 0.3193, "rewards/chosen": 0.6006821990013123, "rewards/margins": 1.9986082911491394, "rewards/rejected": -1.3979260921478271, "step": 12569 }, { "epoch": 0.6662603026528503, "grad_norm": 51.0, "kl": 3.724163055419922, "learning_rate": 5e-07, "logits/chosen": -25171864.0, "logits/rejected": -30878476.0, "logps/chosen": -365.08258056640625, "logps/rejected": -463.81182861328125, "loss": 0.2779, "rewards/chosen": 1.0506277084350586, "rewards/margins": 3.2290544509887695, "rewards/rejected": -2.178426742553711, "step": 12570 }, { "epoch": 0.6663133066546524, "grad_norm": 66.0, "kl": 2.4542832374572754, "learning_rate": 5e-07, "logits/chosen": -26569412.0, "logits/rejected": -2231777.0, "logps/chosen": -532.1632080078125, "logps/rejected": -137.65103149414062, "loss": 0.2904, "rewards/chosen": 0.85455322265625, "rewards/margins": 2.7434422969818115, "rewards/rejected": -1.8888890743255615, "step": 12571 }, { "epoch": 0.6663663106564546, "grad_norm": 62.75, "kl": 1.8092174530029297, "learning_rate": 5e-07, "logits/chosen": -8196805.0, "logits/rejected": -29693916.0, "logps/chosen": -382.54266357421875, "logps/rejected": -330.6396484375, "loss": 0.3228, "rewards/chosen": 0.33286362886428833, "rewards/margins": 2.166096270084381, "rewards/rejected": -1.8332326412200928, "step": 12572 }, { "epoch": 0.6664193146582567, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37576392.0, "logits/rejected": -40064216.0, "logps/chosen": -257.31427001953125, "logps/rejected": -374.3028869628906, "loss": 0.3053, "rewards/chosen": -0.1747041791677475, "rewards/margins": 3.4033812433481216, "rewards/rejected": -3.578085422515869, "step": 12573 }, { "epoch": 0.6664723186600588, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14153137.0, "logits/rejected": -29496155.42857143, "logps/chosen": -327.8211975097656, "logps/rejected": -301.2388392857143, "loss": 0.214, "rewards/chosen": 2.422201633453369, "rewards/margins": 3.7213796206883023, "rewards/rejected": -1.299177987234933, "step": 12574 }, { "epoch": 0.6665253226618609, "grad_norm": 47.25, "kl": 5.114864349365234, "learning_rate": 5e-07, "logits/chosen": -19401365.333333332, "logits/rejected": -16720558.0, "logps/chosen": -285.8205159505208, "logps/rejected": -495.7777099609375, "loss": 0.3698, "rewards/chosen": 0.8462523619333903, "rewards/margins": 3.3052612940470376, "rewards/rejected": -2.4590089321136475, "step": 12575 }, { "epoch": 0.6665783266636631, "grad_norm": 50.5, "kl": 4.180154323577881, "learning_rate": 5e-07, "logits/chosen": -16965833.6, "logits/rejected": -16018437.333333334, "logps/chosen": -393.07001953125, "logps/rejected": -71.72517903645833, "loss": 0.2863, "rewards/chosen": 1.3392526626586914, "rewards/margins": 3.4104914983113606, "rewards/rejected": -2.0712388356526694, "step": 12576 }, { "epoch": 0.6666313306654652, "grad_norm": 40.75, "kl": 1.4243240356445312, "learning_rate": 5e-07, "logits/chosen": 562289.3333333334, "logits/rejected": -1080715.3, "logps/chosen": -183.8583984375, "logps/rejected": -135.2829345703125, "loss": 0.2039, "rewards/chosen": 0.9858342806498209, "rewards/margins": 2.9511102358500163, "rewards/rejected": -1.9652759552001953, "step": 12577 }, { "epoch": 0.6666843346672674, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53018787.2, "logits/rejected": 141412757.33333334, "logps/chosen": -385.57783203125, "logps/rejected": -212.5744425455729, "loss": 0.2246, "rewards/chosen": 1.1552563667297364, "rewards/margins": 2.938641166687012, "rewards/rejected": -1.7833847999572754, "step": 12578 }, { "epoch": 0.6667373386690695, "grad_norm": 53.0, "kl": 0.43149566650390625, "learning_rate": 5e-07, "logits/chosen": -7427606.666666667, "logits/rejected": -20752305.6, "logps/chosen": -257.9386393229167, "logps/rejected": -388.63564453125, "loss": 0.2342, "rewards/chosen": 0.6577355861663818, "rewards/margins": 3.1467496395111083, "rewards/rejected": -2.4890140533447265, "step": 12579 }, { "epoch": 0.6667903426708717, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82815052.8, "logits/rejected": -72857264.0, "logps/chosen": -380.251611328125, "logps/rejected": -339.7508138020833, "loss": 0.3206, "rewards/chosen": 0.3397489070892334, "rewards/margins": 3.35375329653422, "rewards/rejected": -3.014004389444987, "step": 12580 }, { "epoch": 0.6668433466726738, "grad_norm": 52.0, "kl": 2.264512062072754, "learning_rate": 5e-07, "logits/chosen": -30244035.2, "logits/rejected": -64990122.666666664, "logps/chosen": -455.224267578125, "logps/rejected": -499.0397135416667, "loss": 0.2968, "rewards/chosen": 0.8513504028320312, "rewards/margins": 2.923111375172933, "rewards/rejected": -2.071760972340902, "step": 12581 }, { "epoch": 0.666896350674476, "grad_norm": 49.5, "kl": 2.4038162231445312, "learning_rate": 5e-07, "logits/chosen": -20614169.6, "logits/rejected": -7789114.0, "logps/chosen": -325.59697265625, "logps/rejected": -249.4346923828125, "loss": 0.3015, "rewards/chosen": 0.9544196128845215, "rewards/margins": 2.850971221923828, "rewards/rejected": -1.8965516090393066, "step": 12582 }, { "epoch": 0.666949354676278, "grad_norm": 47.5, "kl": 0.8773384094238281, "learning_rate": 5e-07, "logits/chosen": -10491816.0, "logits/rejected": -29007488.0, "logps/chosen": -149.63818359375, "logps/rejected": -270.0917724609375, "loss": 0.2964, "rewards/chosen": -0.06290562947591145, "rewards/margins": 1.8072411219278972, "rewards/rejected": -1.8701467514038086, "step": 12583 }, { "epoch": 0.6670023586780802, "grad_norm": 53.25, "kl": 1.22052001953125, "learning_rate": 5e-07, "logits/chosen": -27020793.6, "logits/rejected": -62546698.666666664, "logps/chosen": -254.811376953125, "logps/rejected": -603.2759602864584, "loss": 0.3565, "rewards/chosen": 0.2119654893875122, "rewards/margins": 2.797017979621887, "rewards/rejected": -2.585052490234375, "step": 12584 }, { "epoch": 0.6670553626798823, "grad_norm": 47.25, "kl": 1.7070274353027344, "learning_rate": 5e-07, "logits/chosen": -46296332.8, "logits/rejected": -7125061.333333333, "logps/chosen": -221.8268310546875, "logps/rejected": -216.10872395833334, "loss": 0.2823, "rewards/chosen": 0.6170482158660888, "rewards/margins": 2.859463802973429, "rewards/rejected": -2.2424155871073403, "step": 12585 }, { "epoch": 0.6671083666816845, "grad_norm": 42.25, "kl": 1.0386390686035156, "learning_rate": 5e-07, "logits/chosen": -19717876.0, "logits/rejected": -91650384.0, "logps/chosen": -493.29095458984375, "logps/rejected": -705.7365112304688, "loss": 0.2355, "rewards/chosen": 0.45428621768951416, "rewards/margins": 4.485157608985901, "rewards/rejected": -4.030871391296387, "step": 12586 }, { "epoch": 0.6671613706834866, "grad_norm": 54.25, "kl": 1.980534553527832, "learning_rate": 5e-07, "logits/chosen": -10911900.0, "logits/rejected": -2284148.5, "logps/chosen": -193.672119140625, "logps/rejected": -204.12269592285156, "loss": 0.4524, "rewards/chosen": 0.13807723919550577, "rewards/margins": 1.4572179118792217, "rewards/rejected": -1.3191406726837158, "step": 12587 }, { "epoch": 0.6672143746852888, "grad_norm": 42.25, "kl": 2.852720260620117, "learning_rate": 5e-07, "logits/chosen": -4809856.0, "logits/rejected": -4151653.0, "logps/chosen": -221.52789306640625, "logps/rejected": -161.81385803222656, "loss": 0.355, "rewards/chosen": 0.6527032852172852, "rewards/margins": 1.779050588607788, "rewards/rejected": -1.126347303390503, "step": 12588 }, { "epoch": 0.6672673786870909, "grad_norm": 47.75, "kl": 2.3754920959472656, "learning_rate": 5e-07, "logits/chosen": 552238.4, "logits/rejected": -13972276.0, "logps/chosen": -104.687353515625, "logps/rejected": -244.429931640625, "loss": 0.4039, "rewards/chosen": 0.34085688591003416, "rewards/margins": 1.3848202387491861, "rewards/rejected": -1.043963352839152, "step": 12589 }, { "epoch": 0.667320382688893, "grad_norm": 33.0, "kl": 0.5162544250488281, "learning_rate": 5e-07, "logits/chosen": -10503840.0, "logits/rejected": -33146288.0, "logps/chosen": -481.4385986328125, "logps/rejected": -568.2994140625, "loss": 0.1306, "rewards/chosen": 1.2324310938517253, "rewards/margins": 4.578804079691569, "rewards/rejected": -3.3463729858398437, "step": 12590 }, { "epoch": 0.6673733866906951, "grad_norm": 46.5, "kl": 0.3839073181152344, "learning_rate": 5e-07, "logits/chosen": -39247784.0, "logits/rejected": -30571208.0, "logps/chosen": -624.4532470703125, "logps/rejected": -221.8700714111328, "loss": 0.2893, "rewards/chosen": 1.0057919025421143, "rewards/margins": 2.4478650093078613, "rewards/rejected": -1.442073106765747, "step": 12591 }, { "epoch": 0.6674263906924973, "grad_norm": 35.75, "kl": 3.0173072814941406, "learning_rate": 5e-07, "logits/chosen": -18626697.333333332, "logits/rejected": -44408089.6, "logps/chosen": -463.36474609375, "logps/rejected": -395.0174072265625, "loss": 0.2362, "rewards/chosen": 1.034015417098999, "rewards/margins": 3.746948003768921, "rewards/rejected": -2.712932586669922, "step": 12592 }, { "epoch": 0.6674793946942994, "grad_norm": 46.75, "kl": 1.0669002532958984, "learning_rate": 5e-07, "logits/chosen": 10460438.0, "logits/rejected": -1869435.2, "logps/chosen": -168.9866943359375, "logps/rejected": -224.0747314453125, "loss": 0.3168, "rewards/chosen": -0.1794550617535909, "rewards/margins": 1.6761979381243388, "rewards/rejected": -1.8556529998779296, "step": 12593 }, { "epoch": 0.6675323986961016, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12258266.666666666, "logits/rejected": -11442512.0, "logps/chosen": -222.0125732421875, "logps/rejected": -139.8899169921875, "loss": 0.2688, "rewards/chosen": 0.4390970865885417, "rewards/margins": 2.1610514322916665, "rewards/rejected": -1.721954345703125, "step": 12594 }, { "epoch": 0.6675854026979037, "grad_norm": 49.5, "kl": 0.9311647415161133, "learning_rate": 5e-07, "logits/chosen": -20959952.0, "logits/rejected": 6978465.333333333, "logps/chosen": -214.77578125, "logps/rejected": -281.438232421875, "loss": 0.3529, "rewards/chosen": 0.2135483741760254, "rewards/margins": 2.1823705355326335, "rewards/rejected": -1.9688221613566081, "step": 12595 }, { "epoch": 0.6676384066997059, "grad_norm": 53.25, "kl": 0.9843120574951172, "learning_rate": 5e-07, "logits/chosen": -25005370.666666668, "logits/rejected": -13295884.0, "logps/chosen": -306.9595133463542, "logps/rejected": -683.501708984375, "loss": 0.3711, "rewards/chosen": 0.3763182957967122, "rewards/margins": 2.480227549870809, "rewards/rejected": -2.1039092540740967, "step": 12596 }, { "epoch": 0.667691410701508, "grad_norm": 48.5, "kl": 0.3873281478881836, "learning_rate": 5e-07, "logits/chosen": -44646636.8, "logits/rejected": -3759171.3333333335, "logps/chosen": -162.57528076171874, "logps/rejected": -158.39856974283853, "loss": 0.3999, "rewards/chosen": 0.18620182275772096, "rewards/margins": 1.1944105744361877, "rewards/rejected": -1.0082087516784668, "step": 12597 }, { "epoch": 0.6677444147033101, "grad_norm": 57.75, "kl": 1.5601177215576172, "learning_rate": 5e-07, "logits/chosen": -27533498.0, "logits/rejected": -15933261.333333334, "logps/chosen": -435.64404296875, "logps/rejected": -332.8498942057292, "loss": 0.1582, "rewards/chosen": 1.9878387451171875, "rewards/margins": 3.7127377192179365, "rewards/rejected": -1.7248989741007488, "step": 12598 }, { "epoch": 0.6677974187051122, "grad_norm": 46.75, "kl": 2.349994659423828, "learning_rate": 5e-07, "logits/chosen": -44073270.4, "logits/rejected": 7354897.333333333, "logps/chosen": -271.01025390625, "logps/rejected": -402.3026936848958, "loss": 0.3474, "rewards/chosen": 0.4660538673400879, "rewards/margins": 2.942081356048584, "rewards/rejected": -2.476027488708496, "step": 12599 }, { "epoch": 0.6678504227069144, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30942328.0, "logits/rejected": -23518654.0, "logps/chosen": -236.94759114583334, "logps/rejected": -273.2233581542969, "loss": 0.4228, "rewards/chosen": -0.07637533048788707, "rewards/margins": 2.0878900637229285, "rewards/rejected": -2.1642653942108154, "step": 12600 }, { "epoch": 0.6679034267087165, "grad_norm": 54.0, "kl": 0.9430732727050781, "learning_rate": 5e-07, "logits/chosen": -39276528.0, "logits/rejected": -31040873.6, "logps/chosen": -418.2510579427083, "logps/rejected": -409.71494140625, "loss": 0.1854, "rewards/chosen": 0.9764393170674642, "rewards/margins": 3.662104829152425, "rewards/rejected": -2.685665512084961, "step": 12601 }, { "epoch": 0.6679564307105187, "grad_norm": 36.0, "kl": 1.9324874877929688, "learning_rate": 5e-07, "logits/chosen": -34961664.0, "logits/rejected": -29298713.14285714, "logps/chosen": -2025.11865234375, "logps/rejected": -459.87939453125, "loss": 0.112, "rewards/chosen": 3.6663575172424316, "rewards/margins": 6.51150369644165, "rewards/rejected": -2.8451461791992188, "step": 12602 }, { "epoch": 0.6680094347123208, "grad_norm": 41.25, "kl": 0.041011810302734375, "learning_rate": 5e-07, "logits/chosen": -46896566.4, "logits/rejected": -33725373.333333336, "logps/chosen": -215.86953125, "logps/rejected": -522.2002766927084, "loss": 0.2127, "rewards/chosen": 0.8525898933410645, "rewards/margins": 3.869570382436117, "rewards/rejected": -3.0169804890950522, "step": 12603 }, { "epoch": 0.668062438714123, "grad_norm": 47.75, "kl": 2.8850021362304688, "learning_rate": 5e-07, "logits/chosen": 252580.85, "logits/rejected": -37150392.0, "logps/chosen": -229.2622802734375, "logps/rejected": -413.9256998697917, "loss": 0.3101, "rewards/chosen": 0.774654769897461, "rewards/margins": 2.933564567565918, "rewards/rejected": -2.158909797668457, "step": 12604 }, { "epoch": 0.668115442715925, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65595589.333333336, "logits/rejected": -73560896.0, "logps/chosen": -403.2325846354167, "logps/rejected": -265.79560546875, "loss": 0.3129, "rewards/chosen": -0.43598175048828125, "rewards/margins": 1.701325035095215, "rewards/rejected": -2.137306785583496, "step": 12605 }, { "epoch": 0.6681684467177272, "grad_norm": 37.75, "kl": 0.16632080078125, "learning_rate": 5e-07, "logits/chosen": -28503842.666666668, "logits/rejected": -41284624.0, "logps/chosen": -175.060302734375, "logps/rejected": -332.912744140625, "loss": 0.2617, "rewards/chosen": 0.18774986267089844, "rewards/margins": 2.5657379150390627, "rewards/rejected": -2.3779880523681642, "step": 12606 }, { "epoch": 0.6682214507195293, "grad_norm": 49.25, "kl": 1.0333824157714844, "learning_rate": 5e-07, "logits/chosen": -1885404.75, "logits/rejected": -10698722.0, "logps/chosen": -246.5832977294922, "logps/rejected": -316.15625, "loss": 0.2576, "rewards/chosen": 0.8373782634735107, "rewards/margins": 2.8470311164855957, "rewards/rejected": -2.009652853012085, "step": 12607 }, { "epoch": 0.6682744547213315, "grad_norm": 67.0, "kl": 0.9122962951660156, "learning_rate": 5e-07, "logits/chosen": -15090092.8, "logits/rejected": -1797000.6666666667, "logps/chosen": -282.88876953125, "logps/rejected": -145.14645385742188, "loss": 0.3312, "rewards/chosen": 0.6054783821105957, "rewards/margins": 2.1588099161783854, "rewards/rejected": -1.5533315340677898, "step": 12608 }, { "epoch": 0.6683274587231336, "grad_norm": 54.25, "kl": 5.672262668609619, "learning_rate": 5e-07, "logits/chosen": -3785708.0, "logits/rejected": -35978592.0, "logps/chosen": -267.5908203125, "logps/rejected": -318.1481526692708, "loss": 0.2991, "rewards/chosen": 0.9918043136596679, "rewards/margins": 3.9486038208007814, "rewards/rejected": -2.9567995071411133, "step": 12609 }, { "epoch": 0.6683804627249358, "grad_norm": 38.75, "kl": 1.304774284362793, "learning_rate": 5e-07, "logits/chosen": -7386766.666666667, "logits/rejected": -6994380.8, "logps/chosen": -207.69429524739584, "logps/rejected": -183.99373779296874, "loss": 0.2855, "rewards/chosen": -0.1237879494825999, "rewards/margins": 1.957317092021306, "rewards/rejected": -2.081105041503906, "step": 12610 }, { "epoch": 0.6684334667267379, "grad_norm": 47.25, "kl": 0.8061065673828125, "learning_rate": 5e-07, "logits/chosen": -23429432.0, "logits/rejected": -30769164.0, "logps/chosen": -114.84500885009766, "logps/rejected": -537.7213134765625, "loss": 0.2095, "rewards/chosen": 0.9528920650482178, "rewards/margins": 3.2945919036865234, "rewards/rejected": -2.3416998386383057, "step": 12611 }, { "epoch": 0.66848647072854, "grad_norm": 60.75, "kl": 1.242696762084961, "learning_rate": 5e-07, "logits/chosen": -40771174.4, "logits/rejected": -48669045.333333336, "logps/chosen": -378.44443359375, "logps/rejected": -407.4679361979167, "loss": 0.3581, "rewards/chosen": 0.1856438398361206, "rewards/margins": 2.3504697561264036, "rewards/rejected": -2.164825916290283, "step": 12612 }, { "epoch": 0.6685394747303421, "grad_norm": 53.5, "kl": 0.23811721801757812, "learning_rate": 5e-07, "logits/chosen": -30743766.0, "logits/rejected": -18500360.0, "logps/chosen": -337.64813232421875, "logps/rejected": -277.308349609375, "loss": 0.3542, "rewards/chosen": 0.06018955260515213, "rewards/margins": 1.6849835589528084, "rewards/rejected": -1.6247940063476562, "step": 12613 }, { "epoch": 0.6685924787321442, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1998650.75, "logits/rejected": -3118052.6666666665, "logps/chosen": -26.578105926513672, "logps/rejected": -105.01473999023438, "loss": 0.2482, "rewards/chosen": 0.11539563536643982, "rewards/margins": 1.8997406264146168, "rewards/rejected": -1.784344991048177, "step": 12614 }, { "epoch": 0.6686454827339464, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36877964.8, "logits/rejected": -34540344.0, "logps/chosen": -212.5379638671875, "logps/rejected": -481.7669270833333, "loss": 0.2754, "rewards/chosen": 0.3133403778076172, "rewards/margins": 3.833591969807943, "rewards/rejected": -3.5202515920003257, "step": 12615 }, { "epoch": 0.6686984867357485, "grad_norm": 45.5, "kl": 1.4575181007385254, "learning_rate": 5e-07, "logits/chosen": -93878325.33333333, "logits/rejected": -23580806.4, "logps/chosen": -160.18824259440103, "logps/rejected": -369.6744384765625, "loss": 0.2356, "rewards/chosen": 0.552505890528361, "rewards/margins": 3.0260013739267984, "rewards/rejected": -2.4734954833984375, "step": 12616 }, { "epoch": 0.6687514907375507, "grad_norm": 48.0, "kl": 1.0281448364257812, "learning_rate": 5e-07, "logits/chosen": -16920539.2, "logits/rejected": -62020373.333333336, "logps/chosen": -233.8524169921875, "logps/rejected": -493.8267415364583, "loss": 0.3487, "rewards/chosen": 0.2604595899581909, "rewards/margins": 2.2303876956303914, "rewards/rejected": -1.9699281056722004, "step": 12617 }, { "epoch": 0.6688044947393528, "grad_norm": 36.75, "kl": 1.7621383666992188, "learning_rate": 5e-07, "logits/chosen": 5676647.333333333, "logits/rejected": -30794355.2, "logps/chosen": -490.5712076822917, "logps/rejected": -404.991650390625, "loss": 0.1905, "rewards/chosen": 1.168111244837443, "rewards/margins": 3.9329736868540444, "rewards/rejected": -2.7648624420166015, "step": 12618 }, { "epoch": 0.668857498741155, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39118768.0, "logits/rejected": -29844544.0, "logps/chosen": -143.99293009440103, "logps/rejected": -154.2987060546875, "loss": 0.334, "rewards/chosen": 0.008854230244954428, "rewards/margins": 1.545239766438802, "rewards/rejected": -1.5363855361938477, "step": 12619 }, { "epoch": 0.668910502742957, "grad_norm": 52.5, "kl": 0.6788558959960938, "learning_rate": 5e-07, "logits/chosen": -37107782.4, "logits/rejected": -38489328.0, "logps/chosen": -282.893505859375, "logps/rejected": -607.5406494140625, "loss": 0.2674, "rewards/chosen": 0.5219805717468262, "rewards/margins": 3.9667968432108562, "rewards/rejected": -3.44481627146403, "step": 12620 }, { "epoch": 0.6689635067447592, "grad_norm": 51.0, "kl": 0.2099456787109375, "learning_rate": 5e-07, "logits/chosen": -18827686.0, "logits/rejected": 4498552.0, "logps/chosen": -288.8672180175781, "logps/rejected": -305.8027038574219, "loss": 0.2795, "rewards/chosen": 0.5666342377662659, "rewards/margins": 2.633829653263092, "rewards/rejected": -2.067195415496826, "step": 12621 }, { "epoch": 0.6690165107465613, "grad_norm": 40.75, "kl": 3.779116630554199, "learning_rate": 5e-07, "logits/chosen": -50102096.0, "logits/rejected": -39271302.4, "logps/chosen": -319.21112060546875, "logps/rejected": -265.641162109375, "loss": 0.1851, "rewards/chosen": 0.8556433518727621, "rewards/margins": 4.415658076604207, "rewards/rejected": -3.5600147247314453, "step": 12622 }, { "epoch": 0.6690695147483635, "grad_norm": 45.5, "kl": 4.338179588317871, "learning_rate": 5e-07, "logits/chosen": 2820881.0, "logits/rejected": -7634761.0, "logps/chosen": -146.8358612060547, "logps/rejected": -211.84231567382812, "loss": 0.3363, "rewards/chosen": 0.8738961219787598, "rewards/margins": 2.77049458026886, "rewards/rejected": -1.8965984582901, "step": 12623 }, { "epoch": 0.6691225187501656, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12993048.0, "logits/rejected": -8701624.0, "logps/chosen": -354.12005615234375, "logps/rejected": -159.5009562174479, "loss": 0.2807, "rewards/chosen": 0.01904602348804474, "rewards/margins": 1.3609225302934647, "rewards/rejected": -1.34187650680542, "step": 12624 }, { "epoch": 0.6691755227519678, "grad_norm": 53.0, "kl": 1.7622575759887695, "learning_rate": 5e-07, "logits/chosen": -21409572.0, "logits/rejected": -27752440.0, "logps/chosen": -282.4391174316406, "logps/rejected": -573.9970092773438, "loss": 0.2594, "rewards/chosen": 1.0108308792114258, "rewards/margins": 3.4820237159729004, "rewards/rejected": -2.4711928367614746, "step": 12625 }, { "epoch": 0.6692285267537699, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -115860245.33333333, "logits/rejected": -30213785.6, "logps/chosen": -422.9385579427083, "logps/rejected": -420.99404296875, "loss": 0.2803, "rewards/chosen": -0.5054885943730673, "rewards/margins": 2.1179885784784953, "rewards/rejected": -2.6234771728515627, "step": 12626 }, { "epoch": 0.6692815307555721, "grad_norm": 44.75, "kl": 3.0778398513793945, "learning_rate": 5e-07, "logits/chosen": -13376469.333333334, "logits/rejected": -52537436.8, "logps/chosen": -465.029296875, "logps/rejected": -533.048583984375, "loss": 0.2388, "rewards/chosen": 0.6339748700459799, "rewards/margins": 3.7958967526753744, "rewards/rejected": -3.1619218826293944, "step": 12627 }, { "epoch": 0.6693345347573741, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11453526.0, "logits/rejected": -24988510.0, "logps/chosen": -240.7862548828125, "logps/rejected": -257.5495300292969, "loss": 0.2311, "rewards/chosen": 0.41249656677246094, "rewards/margins": 3.3494696617126465, "rewards/rejected": -2.9369730949401855, "step": 12628 }, { "epoch": 0.6693875387591763, "grad_norm": 47.0, "kl": 3.957615375518799, "learning_rate": 5e-07, "logits/chosen": -29749318.4, "logits/rejected": 8552041.333333334, "logps/chosen": -261.880517578125, "logps/rejected": -270.5919189453125, "loss": 0.3558, "rewards/chosen": 0.7357093811035156, "rewards/margins": 4.161521784464518, "rewards/rejected": -3.4258124033610025, "step": 12629 }, { "epoch": 0.6694405427609784, "grad_norm": 44.75, "kl": 1.978128433227539, "learning_rate": 5e-07, "logits/chosen": -24750499.2, "logits/rejected": -40319997.333333336, "logps/chosen": -430.264599609375, "logps/rejected": -528.1275634765625, "loss": 0.2588, "rewards/chosen": 1.1682411193847657, "rewards/margins": 3.5715188344319664, "rewards/rejected": -2.4032777150472007, "step": 12630 }, { "epoch": 0.6694935467627806, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31209330.666666668, "logits/rejected": -13755950.4, "logps/chosen": -400.1561686197917, "logps/rejected": -229.220849609375, "loss": 0.2837, "rewards/chosen": 0.562242348988851, "rewards/margins": 2.240026887257894, "rewards/rejected": -1.677784538269043, "step": 12631 }, { "epoch": 0.6695465507645827, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9968718.666666666, "logits/rejected": -11367591.2, "logps/chosen": -132.16663614908853, "logps/rejected": -455.3974609375, "loss": 0.173, "rewards/chosen": 0.674659808476766, "rewards/margins": 3.8288910706837975, "rewards/rejected": -3.1542312622070314, "step": 12632 }, { "epoch": 0.6695995547663849, "grad_norm": 56.25, "kl": 3.071146011352539, "learning_rate": 5e-07, "logits/chosen": -51526236.8, "logits/rejected": 4184517.6666666665, "logps/chosen": -339.5092041015625, "logps/rejected": -208.29512532552084, "loss": 0.4422, "rewards/chosen": 0.06240234375, "rewards/margins": 1.899799919128418, "rewards/rejected": -1.837397575378418, "step": 12633 }, { "epoch": 0.669652558768187, "grad_norm": 57.5, "kl": 2.418384552001953, "learning_rate": 5e-07, "logits/chosen": -68580245.33333333, "logits/rejected": -20393532.8, "logps/chosen": -292.80747477213544, "logps/rejected": -177.24569091796874, "loss": 0.3079, "rewards/chosen": 0.43208928902943927, "rewards/margins": 2.070116480191549, "rewards/rejected": -1.6380271911621094, "step": 12634 }, { "epoch": 0.6697055627699892, "grad_norm": 50.0, "kl": 1.5477275848388672, "learning_rate": 5e-07, "logits/chosen": -10340888.8, "logits/rejected": -79478149.33333333, "logps/chosen": -283.300244140625, "logps/rejected": -498.74462890625, "loss": 0.2595, "rewards/chosen": 0.768009090423584, "rewards/margins": 2.912285073598226, "rewards/rejected": -2.144275983174642, "step": 12635 }, { "epoch": 0.6697585667717912, "grad_norm": 46.75, "kl": 1.4103689193725586, "learning_rate": 5e-07, "logits/chosen": -11931590.4, "logits/rejected": -10587340.0, "logps/chosen": -274.19453125, "logps/rejected": -199.3278605143229, "loss": 0.215, "rewards/chosen": 1.1498554229736329, "rewards/margins": 3.8206044514973962, "rewards/rejected": -2.670749028523763, "step": 12636 }, { "epoch": 0.6698115707735934, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17664252.0, "logits/rejected": -29472517.333333332, "logps/chosen": -305.8345642089844, "logps/rejected": -623.515869140625, "loss": 0.1668, "rewards/chosen": 0.4519691467285156, "rewards/margins": 3.3166987101236978, "rewards/rejected": -2.864729563395182, "step": 12637 }, { "epoch": 0.6698645747753955, "grad_norm": 45.0, "kl": 5.158360481262207, "learning_rate": 5e-07, "logits/chosen": 28633344.0, "logits/rejected": -7787315.333333333, "logps/chosen": -237.5534912109375, "logps/rejected": -213.14286295572916, "loss": 0.467, "rewards/chosen": 0.027686715126037598, "rewards/margins": 2.525551756223043, "rewards/rejected": -2.4978650410970054, "step": 12638 }, { "epoch": 0.6699175787771977, "grad_norm": 78.5, "kl": 2.777799606323242, "learning_rate": 5e-07, "logits/chosen": -52227218.28571428, "logits/rejected": -10643766.0, "logps/chosen": -341.54234095982144, "logps/rejected": -47.14727783203125, "loss": 0.3977, "rewards/chosen": 0.6300053596496582, "rewards/margins": 2.214419722557068, "rewards/rejected": -1.5844143629074097, "step": 12639 }, { "epoch": 0.6699705827789998, "grad_norm": 34.25, "kl": 2.8619537353515625, "learning_rate": 5e-07, "logits/chosen": -32010276.0, "logits/rejected": -30837541.333333332, "logps/chosen": -2073.31396484375, "logps/rejected": -487.2178548177083, "loss": 0.0864, "rewards/chosen": 3.862689256668091, "rewards/margins": 6.559155702590942, "rewards/rejected": -2.6964664459228516, "step": 12640 }, { "epoch": 0.670023586780802, "grad_norm": 50.75, "kl": 4.414510726928711, "learning_rate": 5e-07, "logits/chosen": -20443050.666666668, "logits/rejected": 1067126.0, "logps/chosen": -305.5122884114583, "logps/rejected": -108.1632308959961, "loss": 0.3451, "rewards/chosen": 0.7955473264058431, "rewards/margins": 5.056783517201741, "rewards/rejected": -4.261236190795898, "step": 12641 }, { "epoch": 0.6700765907826041, "grad_norm": 54.5, "kl": 1.2713203430175781, "learning_rate": 5e-07, "logits/chosen": -56228940.8, "logits/rejected": -18700537.333333332, "logps/chosen": -236.6041748046875, "logps/rejected": -174.5338338216146, "loss": 0.4093, "rewards/chosen": -0.0972936987876892, "rewards/margins": 2.3547218600908915, "rewards/rejected": -2.4520155588785806, "step": 12642 }, { "epoch": 0.6701295947844063, "grad_norm": 57.5, "kl": 1.274033546447754, "learning_rate": 5e-07, "logits/chosen": -25470298.0, "logits/rejected": -2962971.3333333335, "logps/chosen": -281.9107971191406, "logps/rejected": -152.0074666341146, "loss": 0.1723, "rewards/chosen": 1.1684154272079468, "rewards/margins": 3.1945616801579795, "rewards/rejected": -2.0261462529500327, "step": 12643 }, { "epoch": 0.6701825987862083, "grad_norm": 49.25, "kl": 1.2996063232421875, "learning_rate": 5e-07, "logits/chosen": -70241113.6, "logits/rejected": -24690642.666666668, "logps/chosen": -442.077734375, "logps/rejected": -208.98284912109375, "loss": 0.2971, "rewards/chosen": 1.1239862442016602, "rewards/margins": 2.3697928587595625, "rewards/rejected": -1.245806614557902, "step": 12644 }, { "epoch": 0.6702356027880105, "grad_norm": 44.25, "kl": 3.002880096435547, "learning_rate": 5e-07, "logits/chosen": -14273248.0, "logits/rejected": 4371063.5, "logps/chosen": -298.5094299316406, "logps/rejected": -347.3101806640625, "loss": 0.1861, "rewards/chosen": 1.4799689054489136, "rewards/margins": 3.617209553718567, "rewards/rejected": -2.1372406482696533, "step": 12645 }, { "epoch": 0.6702886067898126, "grad_norm": 57.75, "kl": 0.64202880859375, "learning_rate": 5e-07, "logits/chosen": -61740000.0, "logits/rejected": -2114726.1666666665, "logps/chosen": -328.7701416015625, "logps/rejected": -338.0687662760417, "loss": 0.3551, "rewards/chosen": 0.11613737344741822, "rewards/margins": 2.3110461513201392, "rewards/rejected": -2.194908777872721, "step": 12646 }, { "epoch": 0.6703416107916148, "grad_norm": 48.0, "kl": 0.2958946228027344, "learning_rate": 5e-07, "logits/chosen": -29298712.0, "logits/rejected": -44029092.0, "logps/chosen": -232.16311645507812, "logps/rejected": -307.8213806152344, "loss": 0.3616, "rewards/chosen": 0.07313433289527893, "rewards/margins": 1.64981809258461, "rewards/rejected": -1.576683759689331, "step": 12647 }, { "epoch": 0.6703946147934169, "grad_norm": 41.75, "kl": 3.4770374298095703, "learning_rate": 5e-07, "logits/chosen": 10478042.666666666, "logits/rejected": -23328422.0, "logps/chosen": -174.82600911458334, "logps/rejected": -254.4190673828125, "loss": 0.4271, "rewards/chosen": 0.09283572435379028, "rewards/margins": 2.733014404773712, "rewards/rejected": -2.640178680419922, "step": 12648 }, { "epoch": 0.6704476187952191, "grad_norm": 59.75, "kl": 0.542633056640625, "learning_rate": 5e-07, "logits/chosen": -66004442.666666664, "logits/rejected": -21707702.4, "logps/chosen": -318.63116455078125, "logps/rejected": -321.2859375, "loss": 0.2369, "rewards/chosen": 0.8330586751302084, "rewards/margins": 2.5936238606770834, "rewards/rejected": -1.760565185546875, "step": 12649 }, { "epoch": 0.6705006227970212, "grad_norm": 50.5, "kl": 3.3294858932495117, "learning_rate": 5e-07, "logits/chosen": -49264234.666666664, "logits/rejected": 10053647.0, "logps/chosen": -247.03946940104166, "logps/rejected": -117.95533752441406, "loss": 0.4039, "rewards/chosen": 0.5543575684229533, "rewards/margins": 2.4405954281489053, "rewards/rejected": -1.8862378597259521, "step": 12650 }, { "epoch": 0.6705536267988234, "grad_norm": 39.5, "kl": 1.5327730178833008, "learning_rate": 5e-07, "logits/chosen": -12046834.0, "logits/rejected": -23162494.0, "logps/chosen": -130.120849609375, "logps/rejected": -430.7412414550781, "loss": 0.2808, "rewards/chosen": 0.5542811155319214, "rewards/margins": 2.6901808977127075, "rewards/rejected": -2.135899782180786, "step": 12651 }, { "epoch": 0.6706066308006254, "grad_norm": 54.0, "kl": 0.5583724975585938, "learning_rate": 5e-07, "logits/chosen": -16676928.0, "logits/rejected": 504700416.0, "logps/chosen": -190.11801147460938, "logps/rejected": -311.13262939453125, "loss": 0.3014, "rewards/chosen": 0.3375750780105591, "rewards/margins": 2.1845592260360718, "rewards/rejected": -1.8469841480255127, "step": 12652 }, { "epoch": 0.6706596348024276, "grad_norm": 55.0, "kl": 1.8602218627929688, "learning_rate": 5e-07, "logits/chosen": -42089849.6, "logits/rejected": -4821777.333333333, "logps/chosen": -325.1314697265625, "logps/rejected": -224.942138671875, "loss": 0.3663, "rewards/chosen": 0.40579700469970703, "rewards/margins": 1.9054431915283203, "rewards/rejected": -1.4996461868286133, "step": 12653 }, { "epoch": 0.6707126388042297, "grad_norm": 54.0, "kl": 2.160449981689453, "learning_rate": 5e-07, "logits/chosen": -20671685.333333332, "logits/rejected": -7155624.0, "logps/chosen": -343.83935546875, "logps/rejected": -399.7755126953125, "loss": 0.2845, "rewards/chosen": 1.1086976528167725, "rewards/margins": 2.3773454427719116, "rewards/rejected": -1.2686477899551392, "step": 12654 }, { "epoch": 0.6707656428060319, "grad_norm": 32.5, "kl": 2.4692792892456055, "learning_rate": 5e-07, "logits/chosen": -25792600.0, "logits/rejected": -9490095.2, "logps/chosen": -142.6385498046875, "logps/rejected": -68.2760986328125, "loss": 0.3201, "rewards/chosen": 0.3568886915842692, "rewards/margins": 2.546717564264933, "rewards/rejected": -2.189828872680664, "step": 12655 }, { "epoch": 0.670818646807834, "grad_norm": 52.0, "kl": 4.723376274108887, "learning_rate": 5e-07, "logits/chosen": -36150816.0, "logits/rejected": -7356871.0, "logps/chosen": -423.5590413411458, "logps/rejected": -198.3431396484375, "loss": 0.2982, "rewards/chosen": 1.347095012664795, "rewards/margins": 3.532139778137207, "rewards/rejected": -2.185044765472412, "step": 12656 }, { "epoch": 0.6708716508096362, "grad_norm": 47.75, "kl": 1.6002626419067383, "learning_rate": 5e-07, "logits/chosen": -14181342.666666666, "logits/rejected": -25472804.0, "logps/chosen": -217.0550740559896, "logps/rejected": -342.59783935546875, "loss": 0.2802, "rewards/chosen": 0.9495422840118408, "rewards/margins": 3.221637010574341, "rewards/rejected": -2.2720947265625, "step": 12657 }, { "epoch": 0.6709246548114383, "grad_norm": 48.25, "kl": 0.8569507598876953, "learning_rate": 5e-07, "logits/chosen": -50034520.0, "logits/rejected": -28782450.0, "logps/chosen": -275.45904541015625, "logps/rejected": -241.7487335205078, "loss": 0.2659, "rewards/chosen": 0.43776997923851013, "rewards/margins": 2.70935520529747, "rewards/rejected": -2.27158522605896, "step": 12658 }, { "epoch": 0.6709776588132405, "grad_norm": 44.5, "kl": 0.06449127197265625, "learning_rate": 5e-07, "logits/chosen": -17366568.0, "logits/rejected": -44439564.8, "logps/chosen": -368.4433186848958, "logps/rejected": -471.0328125, "loss": 0.1763, "rewards/chosen": 0.9818094571431478, "rewards/margins": 3.630058320363363, "rewards/rejected": -2.648248863220215, "step": 12659 }, { "epoch": 0.6710306628150425, "grad_norm": 54.5, "kl": 0.4717845916748047, "learning_rate": 5e-07, "logits/chosen": -67044704.0, "logits/rejected": -22043756.0, "logps/chosen": -226.1446736653646, "logps/rejected": -364.6483154296875, "loss": 0.3509, "rewards/chosen": 0.5476422707239786, "rewards/margins": 2.2127439181009927, "rewards/rejected": -1.6651016473770142, "step": 12660 }, { "epoch": 0.6710836668168447, "grad_norm": 48.25, "kl": 0.6330881118774414, "learning_rate": 5e-07, "logits/chosen": -46987132.8, "logits/rejected": -31783008.0, "logps/chosen": -231.6286376953125, "logps/rejected": -209.05908203125, "loss": 0.3565, "rewards/chosen": 0.3114482879638672, "rewards/margins": 2.0237667083740236, "rewards/rejected": -1.7123184204101562, "step": 12661 }, { "epoch": 0.6711366708186468, "grad_norm": 46.0, "kl": 0.07341957092285156, "learning_rate": 5e-07, "logits/chosen": -33432080.0, "logits/rejected": -9393370.0, "logps/chosen": -189.43109130859375, "logps/rejected": -284.2611083984375, "loss": 0.2874, "rewards/chosen": -0.0931258350610733, "rewards/margins": 2.9235830157995224, "rewards/rejected": -3.0167088508605957, "step": 12662 }, { "epoch": 0.6711896748204489, "grad_norm": 52.0, "kl": 1.064992904663086, "learning_rate": 5e-07, "logits/chosen": -18802729.6, "logits/rejected": -22898322.666666668, "logps/chosen": -219.13662109375, "logps/rejected": -208.50537109375, "loss": 0.2444, "rewards/chosen": 0.7719961166381836, "rewards/margins": 3.888853963216146, "rewards/rejected": -3.1168578465779624, "step": 12663 }, { "epoch": 0.6712426788222511, "grad_norm": 60.75, "kl": 3.5820560455322266, "learning_rate": 5e-07, "logits/chosen": -4241379.6, "logits/rejected": -18429764.0, "logps/chosen": -165.0416015625, "logps/rejected": -364.2414957682292, "loss": 0.4049, "rewards/chosen": 0.3803586483001709, "rewards/margins": 2.1561440944671633, "rewards/rejected": -1.7757854461669922, "step": 12664 }, { "epoch": 0.6712956828240532, "grad_norm": 55.5, "kl": 4.118927955627441, "learning_rate": 5e-07, "logits/chosen": -6442430.4, "logits/rejected": -2589855.5, "logps/chosen": -641.1359375, "logps/rejected": -212.3720703125, "loss": 0.2991, "rewards/chosen": 1.7603252410888672, "rewards/margins": 2.677012809117635, "rewards/rejected": -0.9166875680287679, "step": 12665 }, { "epoch": 0.6713486868258554, "grad_norm": 50.5, "kl": 1.935995101928711, "learning_rate": 5e-07, "logits/chosen": -15253833.6, "logits/rejected": 12691474.666666666, "logps/chosen": -297.8893310546875, "logps/rejected": -619.583984375, "loss": 0.2938, "rewards/chosen": 0.7266532421112061, "rewards/margins": 4.217910591761271, "rewards/rejected": -3.491257349650065, "step": 12666 }, { "epoch": 0.6714016908276574, "grad_norm": 89.5, "kl": 0.1843242645263672, "learning_rate": 5e-07, "logits/chosen": -28221350.4, "logits/rejected": -57130448.0, "logps/chosen": -302.6717529296875, "logps/rejected": -392.9700113932292, "loss": 0.4564, "rewards/chosen": -0.3329490661621094, "rewards/margins": 1.1507718722025553, "rewards/rejected": -1.4837209383646648, "step": 12667 }, { "epoch": 0.6714546948294596, "grad_norm": 55.0, "kl": 1.1767044067382812, "learning_rate": 5e-07, "logits/chosen": -34760796.8, "logits/rejected": -14323068.0, "logps/chosen": -379.9137939453125, "logps/rejected": -298.60235595703125, "loss": 0.269, "rewards/chosen": 0.7022967338562012, "rewards/margins": 3.9161483446756997, "rewards/rejected": -3.2138516108194985, "step": 12668 }, { "epoch": 0.6715076988312617, "grad_norm": 43.75, "kl": 0.9976272583007812, "learning_rate": 5e-07, "logits/chosen": -26072842.666666668, "logits/rejected": -30068422.4, "logps/chosen": -740.1131184895834, "logps/rejected": -241.30380859375, "loss": 0.2374, "rewards/chosen": 1.329938809076945, "rewards/margins": 2.843718449274699, "rewards/rejected": -1.513779640197754, "step": 12669 }, { "epoch": 0.6715607028330639, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73454096.0, "logits/rejected": -25090978.666666668, "logps/chosen": -244.70492553710938, "logps/rejected": -283.8501383463542, "loss": 0.2096, "rewards/chosen": -0.3239097595214844, "rewards/margins": 2.390476862589518, "rewards/rejected": -2.7143866221110025, "step": 12670 }, { "epoch": 0.671613706834866, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9145692.0, "logits/rejected": -4580388.333333333, "logps/chosen": -365.8072204589844, "logps/rejected": -116.82794189453125, "loss": 0.2811, "rewards/chosen": -0.342550665140152, "rewards/margins": 1.7408845523993173, "rewards/rejected": -2.0834352175394693, "step": 12671 }, { "epoch": 0.6716667108366682, "grad_norm": 76.5, "kl": 0.18939971923828125, "learning_rate": 5e-07, "logits/chosen": 36718637.333333336, "logits/rejected": 2116448.25, "logps/chosen": -364.6864013671875, "logps/rejected": -167.61070251464844, "loss": 0.3325, "rewards/chosen": 0.9051925341288248, "rewards/margins": 1.7717979351679483, "rewards/rejected": -0.8666054010391235, "step": 12672 }, { "epoch": 0.6717197148384703, "grad_norm": 27.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33887176.0, "logits/rejected": -12609714.285714285, "logps/chosen": -129.01475524902344, "logps/rejected": -279.92677525111606, "loss": 0.2026, "rewards/chosen": -0.2563217282295227, "rewards/margins": 1.9816288692610606, "rewards/rejected": -2.2379505974905833, "step": 12673 }, { "epoch": 0.6717727188402725, "grad_norm": 46.5, "kl": 2.762655258178711, "learning_rate": 5e-07, "logits/chosen": -14447305.0, "logits/rejected": -51199248.0, "logps/chosen": -311.67938232421875, "logps/rejected": -276.43609619140625, "loss": 0.309, "rewards/chosen": 0.5145667791366577, "rewards/margins": 3.114073395729065, "rewards/rejected": -2.5995066165924072, "step": 12674 }, { "epoch": 0.6718257228420745, "grad_norm": 55.0, "kl": 0.5974960327148438, "learning_rate": 5e-07, "logits/chosen": -13273834.666666666, "logits/rejected": -43162436.0, "logps/chosen": -529.8109944661459, "logps/rejected": -408.6598815917969, "loss": 0.2644, "rewards/chosen": 1.0541880925496419, "rewards/margins": 4.405395110448201, "rewards/rejected": -3.3512070178985596, "step": 12675 }, { "epoch": 0.6718787268438767, "grad_norm": 41.75, "kl": 0.1541757583618164, "learning_rate": 5e-07, "logits/chosen": -27349891.2, "logits/rejected": -38771920.0, "logps/chosen": -195.9638916015625, "logps/rejected": -416.163818359375, "loss": 0.2502, "rewards/chosen": 0.753538703918457, "rewards/margins": 3.153858757019043, "rewards/rejected": -2.400320053100586, "step": 12676 }, { "epoch": 0.6719317308456788, "grad_norm": 32.5, "kl": 1.4249420166015625, "learning_rate": 5e-07, "logits/chosen": -9240820.0, "logits/rejected": -37265408.0, "logps/chosen": -338.1607360839844, "logps/rejected": -381.1547037760417, "loss": 0.1302, "rewards/chosen": 1.511871099472046, "rewards/margins": 4.328558365503946, "rewards/rejected": -2.816687266031901, "step": 12677 }, { "epoch": 0.671984734847481, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15196369.6, "logits/rejected": -21871464.0, "logps/chosen": -254.740673828125, "logps/rejected": -217.282470703125, "loss": 0.3543, "rewards/chosen": 0.295991039276123, "rewards/margins": 1.8224335670471192, "rewards/rejected": -1.526442527770996, "step": 12678 }, { "epoch": 0.6720377388492831, "grad_norm": 43.5, "kl": 4.759101867675781, "learning_rate": 5e-07, "logits/chosen": 9965102.0, "logits/rejected": -29670294.0, "logps/chosen": -183.53672790527344, "logps/rejected": -328.938232421875, "loss": 0.2675, "rewards/chosen": 1.0353261232376099, "rewards/margins": 2.9961732625961304, "rewards/rejected": -1.9608471393585205, "step": 12679 }, { "epoch": 0.6720907428510853, "grad_norm": 65.5, "kl": 2.0858840942382812, "learning_rate": 5e-07, "logits/chosen": 13423046.0, "logits/rejected": -25410952.0, "logps/chosen": -248.65338134765625, "logps/rejected": -155.40126037597656, "loss": 0.3747, "rewards/chosen": 0.264944463968277, "rewards/margins": 1.7425266802310944, "rewards/rejected": -1.4775822162628174, "step": 12680 }, { "epoch": 0.6721437468528874, "grad_norm": 41.5, "kl": 0.9753818511962891, "learning_rate": 5e-07, "logits/chosen": -15530953.0, "logits/rejected": -48423960.0, "logps/chosen": -176.86822509765625, "logps/rejected": -314.7999572753906, "loss": 0.2899, "rewards/chosen": 0.2896450161933899, "rewards/margins": 2.5115397572517395, "rewards/rejected": -2.2218947410583496, "step": 12681 }, { "epoch": 0.6721967508546896, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10945048.0, "logits/rejected": -36669267.2, "logps/chosen": -198.448486328125, "logps/rejected": -438.267578125, "loss": 0.263, "rewards/chosen": 0.05605672299861908, "rewards/margins": 2.6250640243291854, "rewards/rejected": -2.5690073013305663, "step": 12682 }, { "epoch": 0.6722497548564916, "grad_norm": 41.0, "kl": 1.7220354080200195, "learning_rate": 5e-07, "logits/chosen": 6029652.0, "logits/rejected": -57333030.4, "logps/chosen": -213.1793416341146, "logps/rejected": -198.08072509765626, "loss": 0.2543, "rewards/chosen": 0.6842719713846842, "rewards/margins": 2.235217062632243, "rewards/rejected": -1.5509450912475586, "step": 12683 }, { "epoch": 0.6723027588582938, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42292468.0, "logits/rejected": -1759353.6666666667, "logps/chosen": -311.190185546875, "logps/rejected": -271.5719401041667, "loss": 0.2911, "rewards/chosen": -0.23544006049633026, "rewards/margins": 1.3689901381731033, "rewards/rejected": -1.6044301986694336, "step": 12684 }, { "epoch": 0.6723557628600959, "grad_norm": 43.75, "kl": 0.2855987548828125, "learning_rate": 5e-07, "logits/chosen": -26990349.333333332, "logits/rejected": -9556224.0, "logps/chosen": -437.462158203125, "logps/rejected": -230.066650390625, "loss": 0.277, "rewards/chosen": 0.5296987295150757, "rewards/margins": 2.1561471700668333, "rewards/rejected": -1.6264484405517579, "step": 12685 }, { "epoch": 0.6724087668618981, "grad_norm": 66.5, "kl": 1.1028175354003906, "learning_rate": 5e-07, "logits/chosen": 12045050.0, "logits/rejected": -20363168.0, "logps/chosen": -50.35551834106445, "logps/rejected": -284.8359375, "loss": 0.1806, "rewards/chosen": 0.5088810324668884, "rewards/margins": 2.759246289730072, "rewards/rejected": -2.2503652572631836, "step": 12686 }, { "epoch": 0.6724617708637002, "grad_norm": 36.0, "kl": 3.850142002105713, "learning_rate": 5e-07, "logits/chosen": 5175928.5, "logits/rejected": -67493192.0, "logps/chosen": -216.25845336914062, "logps/rejected": -152.546875, "loss": 0.248, "rewards/chosen": 0.9980906844139099, "rewards/margins": 3.7969016432762146, "rewards/rejected": -2.7988109588623047, "step": 12687 }, { "epoch": 0.6725147748655024, "grad_norm": 49.25, "kl": 3.280841827392578, "learning_rate": 5e-07, "logits/chosen": -39702752.0, "logits/rejected": -55251112.0, "logps/chosen": -240.69566127232142, "logps/rejected": -877.0283203125, "loss": 0.3519, "rewards/chosen": 0.8689592906406948, "rewards/margins": 5.0533221789768765, "rewards/rejected": -4.184362888336182, "step": 12688 }, { "epoch": 0.6725677788673045, "grad_norm": 49.25, "kl": 1.0204505920410156, "learning_rate": 5e-07, "logits/chosen": -46375731.2, "logits/rejected": 5982838.666666667, "logps/chosen": -268.7273681640625, "logps/rejected": -266.49066162109375, "loss": 0.3788, "rewards/chosen": 0.2699321746826172, "rewards/margins": 1.6930529912312826, "rewards/rejected": -1.4231208165486653, "step": 12689 }, { "epoch": 0.6726207828691066, "grad_norm": 45.25, "kl": 0.16593074798583984, "learning_rate": 5e-07, "logits/chosen": -35118312.0, "logits/rejected": -22446948.0, "logps/chosen": -258.1246337890625, "logps/rejected": -348.64404296875, "loss": 0.3253, "rewards/chosen": -0.3011844754219055, "rewards/margins": 2.8556177020072937, "rewards/rejected": -3.156802177429199, "step": 12690 }, { "epoch": 0.6726737868709087, "grad_norm": 46.0, "kl": 0.5428466796875, "learning_rate": 5e-07, "logits/chosen": -49160026.666666664, "logits/rejected": -45620265.6, "logps/chosen": -277.4639892578125, "logps/rejected": -340.18759765625, "loss": 0.259, "rewards/chosen": 0.6256891886393229, "rewards/margins": 2.857774798075358, "rewards/rejected": -2.232085609436035, "step": 12691 }, { "epoch": 0.6727267908727109, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7673681.5, "logits/rejected": -22851764.0, "logps/chosen": -242.92005920410156, "logps/rejected": -280.88018798828125, "loss": 0.3253, "rewards/chosen": -0.06634922325611115, "rewards/margins": 2.4985490292310715, "rewards/rejected": -2.5648982524871826, "step": 12692 }, { "epoch": 0.672779794874513, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64846389.333333336, "logits/rejected": -42751068.8, "logps/chosen": -346.5791015625, "logps/rejected": -377.017626953125, "loss": 0.2507, "rewards/chosen": -0.0003702839215596517, "rewards/margins": 2.573319558302561, "rewards/rejected": -2.573689842224121, "step": 12693 }, { "epoch": 0.6728327988763152, "grad_norm": 58.5, "kl": 2.271129608154297, "learning_rate": 5e-07, "logits/chosen": -41738371.2, "logits/rejected": -22741133.333333332, "logps/chosen": -320.897412109375, "logps/rejected": -207.5266316731771, "loss": 0.3826, "rewards/chosen": 0.4790841579437256, "rewards/margins": 2.406491740544637, "rewards/rejected": -1.9274075826009114, "step": 12694 }, { "epoch": 0.6728858028781173, "grad_norm": 40.5, "kl": 2.1545467376708984, "learning_rate": 5e-07, "logits/chosen": -14517656.0, "logits/rejected": -6359119.5, "logps/chosen": -428.02154541015625, "logps/rejected": -170.554443359375, "loss": 0.2917, "rewards/chosen": 1.240848183631897, "rewards/margins": 2.8125747442245483, "rewards/rejected": -1.5717265605926514, "step": 12695 }, { "epoch": 0.6729388068799195, "grad_norm": 42.5, "kl": 2.232858657836914, "learning_rate": 5e-07, "logits/chosen": 9363078.0, "logits/rejected": -36507336.0, "logps/chosen": -233.23233032226562, "logps/rejected": -353.3974609375, "loss": 0.3612, "rewards/chosen": 0.1424281746149063, "rewards/margins": 2.2404511123895645, "rewards/rejected": -2.098022937774658, "step": 12696 }, { "epoch": 0.6729918108817216, "grad_norm": 44.25, "kl": 1.2156410217285156, "learning_rate": 5e-07, "logits/chosen": -29155238.4, "logits/rejected": -21292997.333333332, "logps/chosen": -250.7185546875, "logps/rejected": -262.5539143880208, "loss": 0.3191, "rewards/chosen": 0.7675302505493165, "rewards/margins": 2.5044984499613445, "rewards/rejected": -1.736968199412028, "step": 12697 }, { "epoch": 0.6730448148835237, "grad_norm": 43.75, "kl": 4.477445602416992, "learning_rate": 5e-07, "logits/chosen": -26960696.0, "logits/rejected": -13571377.333333334, "logps/chosen": -268.67431640625, "logps/rejected": -252.23396809895834, "loss": 0.2301, "rewards/chosen": 1.9007930755615234, "rewards/margins": 3.239392598470052, "rewards/rejected": -1.3385995229085286, "step": 12698 }, { "epoch": 0.6730978188853258, "grad_norm": 43.25, "kl": 0.7185258865356445, "learning_rate": 5e-07, "logits/chosen": -23399229.333333332, "logits/rejected": 11146795.2, "logps/chosen": -358.5181477864583, "logps/rejected": -382.483447265625, "loss": 0.2515, "rewards/chosen": 0.2001466155052185, "rewards/margins": 2.6940663695335387, "rewards/rejected": -2.4939197540283202, "step": 12699 }, { "epoch": 0.673150822887128, "grad_norm": 55.75, "kl": 0.6454639434814453, "learning_rate": 5e-07, "logits/chosen": -32075386.666666668, "logits/rejected": -5632392.5, "logps/chosen": -355.3768717447917, "logps/rejected": -300.4013671875, "loss": 0.2859, "rewards/chosen": 0.89289657274882, "rewards/margins": 2.9134763876597085, "rewards/rejected": -2.0205798149108887, "step": 12700 }, { "epoch": 0.6732038268889301, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34197680.0, "logits/rejected": -14310540.0, "logps/chosen": -360.37640380859375, "logps/rejected": -228.34928385416666, "loss": 0.1641, "rewards/chosen": 1.0371071100234985, "rewards/margins": 3.4103326400121055, "rewards/rejected": -2.373225529988607, "step": 12701 }, { "epoch": 0.6732568308907323, "grad_norm": 35.0, "kl": 1.1415367126464844, "learning_rate": 5e-07, "logits/chosen": -29256310.4, "logits/rejected": -48668618.666666664, "logps/chosen": -184.489013671875, "logps/rejected": -570.9696451822916, "loss": 0.3419, "rewards/chosen": 0.014329035580158234, "rewards/margins": 3.7212941641608874, "rewards/rejected": -3.706965128580729, "step": 12702 }, { "epoch": 0.6733098348925344, "grad_norm": 55.0, "kl": 0.45501708984375, "learning_rate": 5e-07, "logits/chosen": -45392580.0, "logits/rejected": -29869776.0, "logps/chosen": -358.2912292480469, "logps/rejected": -522.404052734375, "loss": 0.1613, "rewards/chosen": 1.1143784523010254, "rewards/margins": 4.879604816436768, "rewards/rejected": -3.765226364135742, "step": 12703 }, { "epoch": 0.6733628388943366, "grad_norm": 46.5, "kl": 1.416813850402832, "learning_rate": 5e-07, "logits/chosen": -36289757.333333336, "logits/rejected": 7046554.4, "logps/chosen": -127.83517456054688, "logps/rejected": -342.0626953125, "loss": 0.2542, "rewards/chosen": 0.5636852582295736, "rewards/margins": 2.6003700574239095, "rewards/rejected": -2.0366847991943358, "step": 12704 }, { "epoch": 0.6734158428961386, "grad_norm": 94.5, "kl": 4.909595489501953, "learning_rate": 5e-07, "logits/chosen": 2381049.3333333335, "logits/rejected": -20230054.0, "logps/chosen": -626.6031494140625, "logps/rejected": -592.554931640625, "loss": 0.3138, "rewards/chosen": 1.0162370204925537, "rewards/margins": 7.189303159713745, "rewards/rejected": -6.173066139221191, "step": 12705 }, { "epoch": 0.6734688468979408, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26089508.0, "logits/rejected": -23719269.333333332, "logps/chosen": -252.19912719726562, "logps/rejected": -238.2852783203125, "loss": 0.2479, "rewards/chosen": -0.07944335788488388, "rewards/margins": 2.256568687657515, "rewards/rejected": -2.336012045542399, "step": 12706 }, { "epoch": 0.6735218508997429, "grad_norm": 58.25, "kl": 3.234930992126465, "learning_rate": 5e-07, "logits/chosen": -36612245.333333336, "logits/rejected": -46852624.0, "logps/chosen": -487.0531412760417, "logps/rejected": -471.6485900878906, "loss": 0.3031, "rewards/chosen": 1.3366626103719075, "rewards/margins": 4.0348161061604815, "rewards/rejected": -2.698153495788574, "step": 12707 }, { "epoch": 0.6735748549015451, "grad_norm": 65.0, "kl": 3.9327392578125, "learning_rate": 5e-07, "logits/chosen": -17924318.666666668, "logits/rejected": -3259590.4, "logps/chosen": -445.6781412760417, "logps/rejected": -542.151708984375, "loss": 0.2385, "rewards/chosen": 1.3468923568725586, "rewards/margins": 3.7529779434204102, "rewards/rejected": -2.4060855865478517, "step": 12708 }, { "epoch": 0.6736278589033472, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25312616.0, "logits/rejected": -70398264.0, "logps/chosen": -501.3702087402344, "logps/rejected": -543.9412841796875, "loss": 0.2725, "rewards/chosen": 0.38314130902290344, "rewards/margins": 2.9474838078022003, "rewards/rejected": -2.564342498779297, "step": 12709 }, { "epoch": 0.6736808629051494, "grad_norm": 52.5, "kl": 1.5347089767456055, "learning_rate": 5e-07, "logits/chosen": 1675871.25, "logits/rejected": -17803334.666666668, "logps/chosen": -68.70365142822266, "logps/rejected": -357.26318359375, "loss": 0.2039, "rewards/chosen": 0.7780479788780212, "rewards/margins": 2.9035284717877707, "rewards/rejected": -2.1254804929097495, "step": 12710 }, { "epoch": 0.6737338669069515, "grad_norm": 49.0, "kl": 6.577617645263672, "learning_rate": 5e-07, "logits/chosen": -22179894.4, "logits/rejected": -43340173.333333336, "logps/chosen": -502.873291015625, "logps/rejected": -477.1746012369792, "loss": 0.3494, "rewards/chosen": 1.2318973541259766, "rewards/margins": 3.1226048469543457, "rewards/rejected": -1.8907074928283691, "step": 12711 }, { "epoch": 0.6737868709087537, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40864453.333333336, "logits/rejected": -26292609.6, "logps/chosen": -391.853271484375, "logps/rejected": -224.015625, "loss": 0.233, "rewards/chosen": 0.880878766377767, "rewards/margins": 2.5147467931111653, "rewards/rejected": -1.6338680267333985, "step": 12712 }, { "epoch": 0.6738398749105557, "grad_norm": 98.0, "kl": 1.2575340270996094, "learning_rate": 5e-07, "logits/chosen": -63016762.666666664, "logits/rejected": -55413452.0, "logps/chosen": -533.6866048177084, "logps/rejected": -349.9954833984375, "loss": 0.3401, "rewards/chosen": 0.5660638411839803, "rewards/margins": 2.5533289511998496, "rewards/rejected": -1.9872651100158691, "step": 12713 }, { "epoch": 0.6738928789123578, "grad_norm": 60.5, "kl": 2.0920486450195312, "learning_rate": 5e-07, "logits/chosen": -73071176.0, "logits/rejected": -13357582.0, "logps/chosen": -645.2767333984375, "logps/rejected": -269.3489990234375, "loss": 0.301, "rewards/chosen": 1.0746750831604004, "rewards/margins": 2.360658049583435, "rewards/rejected": -1.2859829664230347, "step": 12714 }, { "epoch": 0.67394588291416, "grad_norm": 21.375, "kl": 0.679107666015625, "learning_rate": 5e-07, "logits/chosen": 4094893.3333333335, "logits/rejected": -7742584.8, "logps/chosen": -28.879295349121094, "logps/rejected": -383.0093994140625, "loss": 0.1885, "rewards/chosen": 0.7199086348215739, "rewards/margins": 3.796754757563273, "rewards/rejected": -3.076846122741699, "step": 12715 }, { "epoch": 0.6739988869159621, "grad_norm": 42.75, "kl": 2.3639049530029297, "learning_rate": 5e-07, "logits/chosen": -45764492.0, "logits/rejected": -36996844.0, "logps/chosen": -294.9968566894531, "logps/rejected": -389.2994689941406, "loss": 0.2309, "rewards/chosen": 0.8067222833633423, "rewards/margins": 4.473068356513977, "rewards/rejected": -3.6663460731506348, "step": 12716 }, { "epoch": 0.6740518909177643, "grad_norm": 48.0, "kl": 0.4729728698730469, "learning_rate": 5e-07, "logits/chosen": -80352528.0, "logits/rejected": -39490675.2, "logps/chosen": -269.4659830729167, "logps/rejected": -319.694921875, "loss": 0.2205, "rewards/chosen": 0.6131509145100912, "rewards/margins": 2.5288337071736655, "rewards/rejected": -1.9156827926635742, "step": 12717 }, { "epoch": 0.6741048949195664, "grad_norm": 44.25, "kl": 0.021111488342285156, "learning_rate": 5e-07, "logits/chosen": 8385218.0, "logits/rejected": -13998070.4, "logps/chosen": -33.50033060709635, "logps/rejected": -380.3991943359375, "loss": 0.2549, "rewards/chosen": 0.32582231362660724, "rewards/margins": 2.303069551785787, "rewards/rejected": -1.9772472381591797, "step": 12718 }, { "epoch": 0.6741578989213686, "grad_norm": 50.5, "kl": 0.47984790802001953, "learning_rate": 5e-07, "logits/chosen": -419478.5833333333, "logits/rejected": -26280707.2, "logps/chosen": -220.2529500325521, "logps/rejected": -329.8226318359375, "loss": 0.2614, "rewards/chosen": 0.3935991128285726, "rewards/margins": 2.5842660744984944, "rewards/rejected": -2.1906669616699217, "step": 12719 }, { "epoch": 0.6742109029231707, "grad_norm": 62.0, "kl": 0.9922752380371094, "learning_rate": 5e-07, "logits/chosen": -40189480.0, "logits/rejected": 8302415.0, "logps/chosen": -353.886962890625, "logps/rejected": -169.366455078125, "loss": 0.4033, "rewards/chosen": -0.08255864679813385, "rewards/margins": 1.4363927692174911, "rewards/rejected": -1.518951416015625, "step": 12720 }, { "epoch": 0.6742639069249728, "grad_norm": 32.75, "kl": 4.836965560913086, "learning_rate": 5e-07, "logits/chosen": 22327816.0, "logits/rejected": -12648439.0, "logps/chosen": -870.6323852539062, "logps/rejected": -416.19036865234375, "loss": 0.2348, "rewards/chosen": 1.5601460933685303, "rewards/margins": 4.2385094165802, "rewards/rejected": -2.67836332321167, "step": 12721 }, { "epoch": 0.6743169109267749, "grad_norm": 57.75, "kl": 5.157949447631836, "learning_rate": 5e-07, "logits/chosen": -24388268.0, "logits/rejected": -18424816.0, "logps/chosen": -407.1040344238281, "logps/rejected": -159.05490112304688, "loss": 0.3338, "rewards/chosen": 0.5526275038719177, "rewards/margins": 2.286224067211151, "rewards/rejected": -1.7335965633392334, "step": 12722 }, { "epoch": 0.6743699149285771, "grad_norm": 53.5, "kl": 2.1334495544433594, "learning_rate": 5e-07, "logits/chosen": 1861812.0, "logits/rejected": -36529168.0, "logps/chosen": -109.4009017944336, "logps/rejected": -484.6309407552083, "loss": 0.2002, "rewards/chosen": 0.9068302512168884, "rewards/margins": 3.0601394375165305, "rewards/rejected": -2.153309186299642, "step": 12723 }, { "epoch": 0.6744229189303792, "grad_norm": 41.25, "kl": 1.5068445205688477, "learning_rate": 5e-07, "logits/chosen": -27755482.666666668, "logits/rejected": -25022243.2, "logps/chosen": -198.58721923828125, "logps/rejected": -358.213427734375, "loss": 0.2278, "rewards/chosen": 0.6173215707143148, "rewards/margins": 2.872349723180135, "rewards/rejected": -2.2550281524658202, "step": 12724 }, { "epoch": 0.6744759229321814, "grad_norm": 41.75, "kl": 0.3475761413574219, "learning_rate": 5e-07, "logits/chosen": -49359736.0, "logits/rejected": 5953955.5, "logps/chosen": -240.8109588623047, "logps/rejected": -373.7198791503906, "loss": 0.3626, "rewards/chosen": -0.19114407896995544, "rewards/margins": 2.7399195730686188, "rewards/rejected": -2.931063652038574, "step": 12725 }, { "epoch": 0.6745289269339835, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6317828.0, "logits/rejected": -34423228.0, "logps/chosen": -192.9176025390625, "logps/rejected": -346.80340576171875, "loss": 0.2256, "rewards/chosen": 1.2314014434814453, "rewards/margins": 3.4387545585632324, "rewards/rejected": -2.207353115081787, "step": 12726 }, { "epoch": 0.6745819309357857, "grad_norm": 49.25, "kl": 3.5548620223999023, "learning_rate": 5e-07, "logits/chosen": -10906064.8, "logits/rejected": -26309920.0, "logps/chosen": -258.3113037109375, "logps/rejected": -812.8846842447916, "loss": 0.299, "rewards/chosen": 0.6736061573028564, "rewards/margins": 4.888372977574666, "rewards/rejected": -4.21476682027181, "step": 12727 }, { "epoch": 0.6746349349375877, "grad_norm": 49.75, "kl": 2.6405200958251953, "learning_rate": 5e-07, "logits/chosen": -18781762.666666668, "logits/rejected": -66588888.0, "logps/chosen": -177.46891276041666, "logps/rejected": -408.5919189453125, "loss": 0.346, "rewards/chosen": 0.494485338528951, "rewards/margins": 3.7924849589665732, "rewards/rejected": -3.297999620437622, "step": 12728 }, { "epoch": 0.6746879389393899, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7721744.0, "logits/rejected": -14719286.666666666, "logps/chosen": -368.5103759765625, "logps/rejected": -118.49270629882812, "loss": 0.2586, "rewards/chosen": 0.6763821601867676, "rewards/margins": 3.365145460764567, "rewards/rejected": -2.6887633005777993, "step": 12729 }, { "epoch": 0.674740942941192, "grad_norm": 33.25, "kl": 1.2553939819335938, "learning_rate": 5e-07, "logits/chosen": 2246580.2, "logits/rejected": -41366088.0, "logps/chosen": -60.0246337890625, "logps/rejected": -365.1991373697917, "loss": 0.3634, "rewards/chosen": 0.1428015947341919, "rewards/margins": 2.5186591068903605, "rewards/rejected": -2.3758575121561685, "step": 12730 }, { "epoch": 0.6747939469429942, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27440362.0, "logits/rejected": -27874224.0, "logps/chosen": -168.79891967773438, "logps/rejected": -262.0579020182292, "loss": 0.2732, "rewards/chosen": -0.5136076211929321, "rewards/margins": 1.5819367170333862, "rewards/rejected": -2.0955443382263184, "step": 12731 }, { "epoch": 0.6748469509447963, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79732496.0, "logits/rejected": -61812682.666666664, "logps/chosen": -347.62982177734375, "logps/rejected": -211.4310506184896, "loss": 0.297, "rewards/chosen": -0.12000275403261185, "rewards/margins": 2.0198744063576064, "rewards/rejected": -2.1398771603902182, "step": 12732 }, { "epoch": 0.6748999549465985, "grad_norm": 47.75, "kl": 1.0887413024902344, "learning_rate": 5e-07, "logits/chosen": -8213539.2, "logits/rejected": -29331434.666666668, "logps/chosen": -198.29722900390624, "logps/rejected": -482.0425618489583, "loss": 0.2913, "rewards/chosen": 0.50479736328125, "rewards/margins": 3.8569272994995116, "rewards/rejected": -3.3521299362182617, "step": 12733 }, { "epoch": 0.6749529589484006, "grad_norm": 31.625, "kl": 1.8476791381835938, "learning_rate": 5e-07, "logits/chosen": -2212242.4, "logits/rejected": -7394330.0, "logps/chosen": -95.60736694335938, "logps/rejected": -320.9532470703125, "loss": 0.3445, "rewards/chosen": 0.2783250331878662, "rewards/margins": 3.494969956080119, "rewards/rejected": -3.2166449228922525, "step": 12734 }, { "epoch": 0.6750059629502028, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 151059360.0, "logits/rejected": -26861277.333333332, "logps/chosen": -662.3341064453125, "logps/rejected": -496.3389485677083, "loss": 0.1259, "rewards/chosen": 0.6618438363075256, "rewards/margins": 3.8853434522946677, "rewards/rejected": -3.223499615987142, "step": 12735 }, { "epoch": 0.6750589669520048, "grad_norm": 32.75, "kl": 1.1917972564697266, "learning_rate": 5e-07, "logits/chosen": -616987.1875, "logits/rejected": -18651549.714285713, "logps/chosen": -18.717880249023438, "logps/rejected": -373.65530831473217, "loss": 0.1848, "rewards/chosen": 2.0139853954315186, "rewards/margins": 3.984842198235648, "rewards/rejected": -1.9708568028041296, "step": 12736 }, { "epoch": 0.675111970953807, "grad_norm": 37.0, "kl": 2.5390872955322266, "learning_rate": 5e-07, "logits/chosen": -51028704.0, "logits/rejected": -79068792.0, "logps/chosen": -639.4876708984375, "logps/rejected": -207.9818115234375, "loss": 0.1374, "rewards/chosen": 1.9770374298095703, "rewards/margins": 4.328338623046875, "rewards/rejected": -2.3513011932373047, "step": 12737 }, { "epoch": 0.6751649749556091, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21414016.0, "logits/rejected": -24627336.0, "logps/chosen": -177.17027282714844, "logps/rejected": -289.05853271484375, "loss": 0.2809, "rewards/chosen": 0.3726765513420105, "rewards/margins": 2.674106776714325, "rewards/rejected": -2.3014302253723145, "step": 12738 }, { "epoch": 0.6752179789574113, "grad_norm": 72.5, "kl": 1.1709518432617188, "learning_rate": 5e-07, "logits/chosen": -7160392.0, "logits/rejected": -76385712.0, "logps/chosen": -192.20574079241072, "logps/rejected": -830.6355590820312, "loss": 0.4318, "rewards/chosen": 0.1693087135042463, "rewards/margins": 3.644504121371678, "rewards/rejected": -3.4751954078674316, "step": 12739 }, { "epoch": 0.6752709829592134, "grad_norm": 67.0, "kl": 0.4116668701171875, "learning_rate": 5e-07, "logits/chosen": -19758924.0, "logits/rejected": -17424978.0, "logps/chosen": -684.2626342773438, "logps/rejected": -154.06845092773438, "loss": 0.2384, "rewards/chosen": 0.9436066150665283, "rewards/margins": 3.476935625076294, "rewards/rejected": -2.5333290100097656, "step": 12740 }, { "epoch": 0.6753239869610156, "grad_norm": 47.5, "kl": 1.5716381072998047, "learning_rate": 5e-07, "logits/chosen": -13395180.0, "logits/rejected": -29374618.0, "logps/chosen": -326.04150390625, "logps/rejected": -361.4971923828125, "loss": 0.3757, "rewards/chosen": -0.4016157388687134, "rewards/margins": 2.4547804594039917, "rewards/rejected": -2.856396198272705, "step": 12741 }, { "epoch": 0.6753769909628177, "grad_norm": 66.0, "kl": 4.745044708251953, "learning_rate": 5e-07, "logits/chosen": -36322168.0, "logits/rejected": -12767016.0, "logps/chosen": -920.0224609375, "logps/rejected": -433.6116943359375, "loss": 0.1979, "rewards/chosen": 2.0363516807556152, "rewards/margins": 4.286948362986246, "rewards/rejected": -2.2505966822306314, "step": 12742 }, { "epoch": 0.6754299949646199, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -32634044.0, "logps/rejected": -287.67938232421875, "loss": 0.0604, "rewards/rejected": -3.3731446266174316, "step": 12743 }, { "epoch": 0.6754829989664219, "grad_norm": 49.5, "kl": 1.4198150634765625, "learning_rate": 5e-07, "logits/chosen": -5040409.6, "logits/rejected": 1214694.6666666667, "logps/chosen": -180.936376953125, "logps/rejected": -470.5108642578125, "loss": 0.3704, "rewards/chosen": 0.3517314434051514, "rewards/margins": 1.7842367331186932, "rewards/rejected": -1.4325052897135417, "step": 12744 }, { "epoch": 0.6755360029682241, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17471718.666666668, "logits/rejected": -17853054.4, "logps/chosen": -198.70513916015625, "logps/rejected": -318.9855712890625, "loss": 0.1675, "rewards/chosen": 1.0309942563374836, "rewards/margins": 3.781577905019124, "rewards/rejected": -2.7505836486816406, "step": 12745 }, { "epoch": 0.6755890069700262, "grad_norm": 40.0, "kl": 1.041290283203125, "learning_rate": 5e-07, "logits/chosen": 3567484.0, "logits/rejected": -15846158.4, "logps/chosen": -50.97468566894531, "logps/rejected": -196.88623046875, "loss": 0.3084, "rewards/chosen": -0.03517513473828634, "rewards/margins": 1.8983884791533152, "rewards/rejected": -1.9335636138916015, "step": 12746 }, { "epoch": 0.6756420109718284, "grad_norm": 72.5, "kl": 3.293231964111328, "learning_rate": 5e-07, "logits/chosen": -24466651.2, "logits/rejected": -32156992.0, "logps/chosen": -308.07431640625, "logps/rejected": -279.8759765625, "loss": 0.4093, "rewards/chosen": 0.3844449043273926, "rewards/margins": 2.1287233352661135, "rewards/rejected": -1.7442784309387207, "step": 12747 }, { "epoch": 0.6756950149736305, "grad_norm": 47.75, "kl": 0.5722541809082031, "learning_rate": 5e-07, "logits/chosen": -9593272.666666666, "logits/rejected": -20888582.0, "logps/chosen": -159.7319539388021, "logps/rejected": -85.67994689941406, "loss": 0.453, "rewards/chosen": -0.1675038735071818, "rewards/margins": 2.116031606992086, "rewards/rejected": -2.2835354804992676, "step": 12748 }, { "epoch": 0.6757480189754327, "grad_norm": 52.5, "kl": 3.841724395751953, "learning_rate": 5e-07, "logits/chosen": -29548595.2, "logits/rejected": -5883057.333333333, "logps/chosen": -352.5928955078125, "logps/rejected": -156.8418172200521, "loss": 0.3362, "rewards/chosen": 1.2490571975708007, "rewards/margins": 1.7663039604822792, "rewards/rejected": -0.5172467629114786, "step": 12749 }, { "epoch": 0.6758010229772348, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -92134960.0, "logits/rejected": -44132297.14285714, "logps/chosen": -387.19293212890625, "logps/rejected": -415.19883510044644, "loss": 0.1968, "rewards/chosen": -0.444976806640625, "rewards/margins": 2.194611140659877, "rewards/rejected": -2.639587947300502, "step": 12750 }, { "epoch": 0.675854026979037, "grad_norm": 54.75, "kl": 0.4589271545410156, "learning_rate": 5e-07, "logits/chosen": -18674797.333333332, "logits/rejected": -18079854.0, "logps/chosen": -185.11376953125, "logps/rejected": -364.1614990234375, "loss": 0.334, "rewards/chosen": 0.5177169243494669, "rewards/margins": 2.9145667950312295, "rewards/rejected": -2.3968498706817627, "step": 12751 }, { "epoch": 0.675907030980839, "grad_norm": 24.75, "kl": 0.8687095642089844, "learning_rate": 5e-07, "logits/chosen": 7963261.0, "logits/rejected": -14583145.333333334, "logps/chosen": -340.6437072753906, "logps/rejected": -266.3118489583333, "loss": 0.064, "rewards/chosen": 1.9448745250701904, "rewards/margins": 5.436351219813028, "rewards/rejected": -3.4914766947428384, "step": 12752 }, { "epoch": 0.6759600349826412, "grad_norm": 43.25, "kl": 0.6922369003295898, "learning_rate": 5e-07, "logits/chosen": -29357092.0, "logits/rejected": -13653422.0, "logps/chosen": -153.76907348632812, "logps/rejected": -456.6639404296875, "loss": 0.271, "rewards/chosen": 0.6933333873748779, "rewards/margins": 2.6078895330429077, "rewards/rejected": -1.9145561456680298, "step": 12753 }, { "epoch": 0.6760130389844433, "grad_norm": 40.75, "kl": 2.9419775009155273, "learning_rate": 5e-07, "logits/chosen": -11435796.0, "logits/rejected": -39493229.333333336, "logps/chosen": -121.64608154296874, "logps/rejected": -548.542724609375, "loss": 0.3154, "rewards/chosen": 0.4348490238189697, "rewards/margins": 3.8444772561391196, "rewards/rejected": -3.40962823232015, "step": 12754 }, { "epoch": 0.6760660429862455, "grad_norm": 37.5, "kl": 0.6643543243408203, "learning_rate": 5e-07, "logits/chosen": -16337559.0, "logits/rejected": -37924904.0, "logps/chosen": -193.3139190673828, "logps/rejected": -531.4349975585938, "loss": 0.3007, "rewards/chosen": 0.23944544792175293, "rewards/margins": 3.4762256145477295, "rewards/rejected": -3.2367801666259766, "step": 12755 }, { "epoch": 0.6761190469880476, "grad_norm": 37.5, "kl": 0.7025470733642578, "learning_rate": 5e-07, "logits/chosen": -45481548.0, "logits/rejected": -27219384.0, "logps/chosen": -565.2173461914062, "logps/rejected": -283.6158447265625, "loss": 0.2447, "rewards/chosen": 1.2015128135681152, "rewards/margins": 2.9589476585388184, "rewards/rejected": -1.7574348449707031, "step": 12756 }, { "epoch": 0.6761720509898498, "grad_norm": 32.25, "kl": 0.4754371643066406, "learning_rate": 5e-07, "logits/chosen": 3817659.75, "logits/rejected": -28654160.0, "logps/chosen": -107.916748046875, "logps/rejected": -344.9761962890625, "loss": 0.1761, "rewards/chosen": 0.32431086897850037, "rewards/margins": 3.4128989477952323, "rewards/rejected": -3.088588078816732, "step": 12757 }, { "epoch": 0.6762250549916519, "grad_norm": 34.25, "kl": 0.15693092346191406, "learning_rate": 5e-07, "logits/chosen": 3531643.0, "logits/rejected": -5181619.666666667, "logps/chosen": -157.8308868408203, "logps/rejected": -193.98185221354166, "loss": 0.155, "rewards/chosen": 0.8118839263916016, "rewards/margins": 3.202225685119629, "rewards/rejected": -2.3903417587280273, "step": 12758 }, { "epoch": 0.6762780589934541, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33989288.0, "logits/rejected": -23093010.666666668, "logps/chosen": -426.1990966796875, "logps/rejected": -262.0854085286458, "loss": 0.2541, "rewards/chosen": 0.714764416217804, "rewards/margins": 2.540668785572052, "rewards/rejected": -1.825904369354248, "step": 12759 }, { "epoch": 0.6763310629952561, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23866342.0, "logits/rejected": -15213849.142857144, "logps/chosen": -389.6070251464844, "logps/rejected": -147.13821847098214, "loss": 0.2595, "rewards/chosen": 0.30508118867874146, "rewards/margins": 2.340879397732871, "rewards/rejected": -2.0357982090541293, "step": 12760 }, { "epoch": 0.6763840669970583, "grad_norm": 47.75, "kl": 2.7321367263793945, "learning_rate": 5e-07, "logits/chosen": 8980856.0, "logits/rejected": -37713989.333333336, "logps/chosen": -177.77041015625, "logps/rejected": -138.7422078450521, "loss": 0.3807, "rewards/chosen": 0.8135499954223633, "rewards/margins": 3.0890777905782065, "rewards/rejected": -2.2755277951558432, "step": 12761 }, { "epoch": 0.6764370709988604, "grad_norm": 61.25, "kl": 1.2392349243164062, "learning_rate": 5e-07, "logits/chosen": -39059164.8, "logits/rejected": -7743504.666666667, "logps/chosen": -261.202392578125, "logps/rejected": -294.3341064453125, "loss": 0.3636, "rewards/chosen": 0.1057477355003357, "rewards/margins": 1.785486098130544, "rewards/rejected": -1.6797383626302083, "step": 12762 }, { "epoch": 0.6764900750006626, "grad_norm": 57.0, "kl": 1.6755471229553223, "learning_rate": 5e-07, "logits/chosen": -1900559.4, "logits/rejected": -11234980.0, "logps/chosen": -211.840625, "logps/rejected": -315.0356852213542, "loss": 0.3021, "rewards/chosen": 0.7964919090270997, "rewards/margins": 2.4307021776835125, "rewards/rejected": -1.6342102686564128, "step": 12763 }, { "epoch": 0.6765430790024647, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47779685.333333336, "logits/rejected": -38818944.0, "logps/chosen": -709.29541015625, "logps/rejected": -331.98623046875, "loss": 0.2327, "rewards/chosen": 1.1834656397501628, "rewards/margins": 2.5422697703043617, "rewards/rejected": -1.3588041305541991, "step": 12764 }, { "epoch": 0.6765960830042668, "grad_norm": 71.5, "kl": 3.055643081665039, "learning_rate": 5e-07, "logits/chosen": -16353478.4, "logits/rejected": 4220321.333333333, "logps/chosen": -464.924755859375, "logps/rejected": -130.88662719726562, "loss": 0.3251, "rewards/chosen": 0.774806547164917, "rewards/margins": 3.895044533411662, "rewards/rejected": -3.1202379862467446, "step": 12765 }, { "epoch": 0.676649087006069, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4042697.3333333335, "logits/rejected": -55569830.4, "logps/chosen": -218.6623738606771, "logps/rejected": -333.988525390625, "loss": 0.1702, "rewards/chosen": 0.9204192956288656, "rewards/margins": 4.0779582818349205, "rewards/rejected": -3.1575389862060548, "step": 12766 }, { "epoch": 0.676702091007871, "grad_norm": 55.25, "kl": 1.7495708465576172, "learning_rate": 5e-07, "logits/chosen": -17955846.85714286, "logits/rejected": -32959784.0, "logps/chosen": -277.53309849330356, "logps/rejected": -798.708740234375, "loss": 0.4021, "rewards/chosen": 0.46641097749982563, "rewards/margins": 3.8525682517460416, "rewards/rejected": -3.386157274246216, "step": 12767 }, { "epoch": 0.6767550950096732, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6662185.5, "logits/rejected": -25417314.666666668, "logps/chosen": -378.13140869140625, "logps/rejected": -404.9490152994792, "loss": 0.155, "rewards/chosen": 1.162451148033142, "rewards/margins": 3.667486548423767, "rewards/rejected": -2.505035400390625, "step": 12768 }, { "epoch": 0.6768080990114753, "grad_norm": 53.0, "kl": 3.550509452819824, "learning_rate": 5e-07, "logits/chosen": -33812108.0, "logits/rejected": 7455383.0, "logps/chosen": -574.563232421875, "logps/rejected": -185.3441619873047, "loss": 0.3397, "rewards/chosen": 1.1132276058197021, "rewards/margins": 2.1154576539993286, "rewards/rejected": -1.0022300481796265, "step": 12769 }, { "epoch": 0.6768611030132775, "grad_norm": 48.5, "kl": 0.4829540252685547, "learning_rate": 5e-07, "logits/chosen": -1251859.8, "logits/rejected": -5553871.0, "logps/chosen": -134.5088134765625, "logps/rejected": -494.6734619140625, "loss": 0.3606, "rewards/chosen": 0.17332637310028076, "rewards/margins": 2.096693714459737, "rewards/rejected": -1.9233673413594563, "step": 12770 }, { "epoch": 0.6769141070150796, "grad_norm": 50.0, "kl": 2.795424461364746, "learning_rate": 5e-07, "logits/chosen": -30743628.8, "logits/rejected": -85396949.33333333, "logps/chosen": -262.589599609375, "logps/rejected": -379.8080240885417, "loss": 0.2435, "rewards/chosen": 0.847740364074707, "rewards/margins": 4.158315849304199, "rewards/rejected": -3.310575485229492, "step": 12771 }, { "epoch": 0.6769671110168818, "grad_norm": 47.75, "kl": 1.4162673950195312, "learning_rate": 5e-07, "logits/chosen": -21887757.333333332, "logits/rejected": -139969.515625, "logps/chosen": -204.6937052408854, "logps/rejected": -179.6755828857422, "loss": 0.4543, "rewards/chosen": 0.1511868933836619, "rewards/margins": 1.2571647862593334, "rewards/rejected": -1.1059778928756714, "step": 12772 }, { "epoch": 0.6770201150186839, "grad_norm": 49.5, "kl": 3.1000804901123047, "learning_rate": 5e-07, "logits/chosen": -7161126.4, "logits/rejected": -33433984.0, "logps/chosen": -394.558447265625, "logps/rejected": -354.8150227864583, "loss": 0.3006, "rewards/chosen": 0.8321086883544921, "rewards/margins": 3.102815500895182, "rewards/rejected": -2.27070681254069, "step": 12773 }, { "epoch": 0.6770731190204861, "grad_norm": 58.75, "kl": 1.4785089492797852, "learning_rate": 5e-07, "logits/chosen": -11314141.333333334, "logits/rejected": -26385616.0, "logps/chosen": -359.3817952473958, "logps/rejected": -412.737255859375, "loss": 0.2285, "rewards/chosen": 0.15634294350941977, "rewards/margins": 3.24317155679067, "rewards/rejected": -3.08682861328125, "step": 12774 }, { "epoch": 0.6771261230222881, "grad_norm": 35.75, "kl": 0.5238265991210938, "learning_rate": 5e-07, "logits/chosen": -12971297.0, "logits/rejected": -39356603.428571425, "logps/chosen": -63.33812713623047, "logps/rejected": -342.5423060825893, "loss": 0.1141, "rewards/chosen": 1.7935523986816406, "rewards/margins": 4.99207387651716, "rewards/rejected": -3.198521477835519, "step": 12775 }, { "epoch": 0.6771791270240903, "grad_norm": 46.0, "kl": 1.9517126083374023, "learning_rate": 5e-07, "logits/chosen": -35348828.8, "logits/rejected": -20543330.666666668, "logps/chosen": -290.14443359375, "logps/rejected": -523.7703043619791, "loss": 0.3304, "rewards/chosen": 0.49497098922729493, "rewards/margins": 2.827931118011475, "rewards/rejected": -2.3329601287841797, "step": 12776 }, { "epoch": 0.6772321310258924, "grad_norm": 46.25, "kl": 0.321441650390625, "learning_rate": 5e-07, "logits/chosen": -24812128.0, "logits/rejected": -26764661.333333332, "logps/chosen": -221.467333984375, "logps/rejected": -343.5897623697917, "loss": 0.3728, "rewards/chosen": 0.4013315200805664, "rewards/margins": 1.6492127736409503, "rewards/rejected": -1.247881253560384, "step": 12777 }, { "epoch": 0.6772851350276946, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42704760.0, "logits/rejected": -17970992.0, "logps/chosen": -341.0738525390625, "logps/rejected": -213.77593122209822, "loss": 0.2129, "rewards/chosen": -0.12467346340417862, "rewards/margins": 1.7550434097647667, "rewards/rejected": -1.8797168731689453, "step": 12778 }, { "epoch": 0.6773381390294967, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7435680.0, "logits/rejected": -22626738.285714287, "logps/chosen": -56.94189453125, "logps/rejected": -239.10466657366072, "loss": 0.2198, "rewards/chosen": -0.2748645842075348, "rewards/margins": 1.5574842393398285, "rewards/rejected": -1.8323488235473633, "step": 12779 }, { "epoch": 0.6773911430312989, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10877800.0, "logits/rejected": -6272297.0, "logps/chosen": -495.93585205078125, "logps/rejected": -212.81851196289062, "loss": 0.2737, "rewards/chosen": 0.3294408917427063, "rewards/margins": 2.346438944339752, "rewards/rejected": -2.016998052597046, "step": 12780 }, { "epoch": 0.677444147033101, "grad_norm": 65.0, "kl": 0.4145317077636719, "learning_rate": 5e-07, "logits/chosen": -60243754.666666664, "logits/rejected": -25046710.0, "logps/chosen": -370.8021647135417, "logps/rejected": -266.15277099609375, "loss": 0.318, "rewards/chosen": 0.7085011800130209, "rewards/margins": 2.7294073899586997, "rewards/rejected": -2.0209062099456787, "step": 12781 }, { "epoch": 0.6774971510349032, "grad_norm": 46.25, "kl": 0.06909942626953125, "learning_rate": 5e-07, "logits/chosen": -80559976.0, "logits/rejected": -42538360.0, "logps/chosen": -312.7156982421875, "logps/rejected": -555.3336791992188, "loss": 0.2132, "rewards/chosen": 0.5676418542861938, "rewards/margins": 3.7306867837905884, "rewards/rejected": -3.1630449295043945, "step": 12782 }, { "epoch": 0.6775501550367052, "grad_norm": 62.25, "kl": 4.306665420532227, "learning_rate": 5e-07, "logits/chosen": -32815808.0, "logits/rejected": -7171706.0, "logps/chosen": -338.1551920572917, "logps/rejected": -308.26434326171875, "loss": 0.3615, "rewards/chosen": 0.9826135635375977, "rewards/margins": 2.5917338132858276, "rewards/rejected": -1.60912024974823, "step": 12783 }, { "epoch": 0.6776031590385074, "grad_norm": 53.0, "kl": 0.10108470916748047, "learning_rate": 5e-07, "logits/chosen": -19326800.0, "logits/rejected": -12849129.0, "logps/chosen": -243.72402954101562, "logps/rejected": -379.5957946777344, "loss": 0.2993, "rewards/chosen": 0.5121680498123169, "rewards/margins": 2.291512131690979, "rewards/rejected": -1.779344081878662, "step": 12784 }, { "epoch": 0.6776561630403095, "grad_norm": 33.0, "kl": 0.9247932434082031, "learning_rate": 5e-07, "logits/chosen": 3637596.75, "logits/rejected": -11521654.857142856, "logps/chosen": -35.07804870605469, "logps/rejected": -387.85947963169644, "loss": 0.194, "rewards/chosen": -0.35407811403274536, "rewards/margins": 2.2917346017701283, "rewards/rejected": -2.6458127158028737, "step": 12785 }, { "epoch": 0.6777091670421117, "grad_norm": 55.25, "kl": 1.963524341583252, "learning_rate": 5e-07, "logits/chosen": -30822578.0, "logits/rejected": -22182464.0, "logps/chosen": -396.8544006347656, "logps/rejected": -82.535400390625, "loss": 0.3427, "rewards/chosen": 0.5715526938438416, "rewards/margins": 1.4484484791755676, "rewards/rejected": -0.8768957853317261, "step": 12786 }, { "epoch": 0.6777621710439138, "grad_norm": 46.5, "kl": 0.07470130920410156, "learning_rate": 5e-07, "logits/chosen": -27986204.0, "logits/rejected": -37029840.0, "logps/chosen": -282.46856689453125, "logps/rejected": -328.4040222167969, "loss": 0.2267, "rewards/chosen": 0.7891079783439636, "rewards/margins": 2.9071208834648132, "rewards/rejected": -2.1180129051208496, "step": 12787 }, { "epoch": 0.677815175045716, "grad_norm": 43.0, "kl": 2.9817543029785156, "learning_rate": 5e-07, "logits/chosen": -46654700.0, "logits/rejected": -65965736.0, "logps/chosen": -302.45147705078125, "logps/rejected": -442.7611999511719, "loss": 0.4, "rewards/chosen": -0.4373663067817688, "rewards/margins": 2.6865782141685486, "rewards/rejected": -3.1239445209503174, "step": 12788 }, { "epoch": 0.6778681790475181, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30285816.0, "logits/rejected": -34217952.0, "logps/chosen": -312.84149169921875, "logps/rejected": -468.6233825683594, "loss": 0.2019, "rewards/chosen": 0.8894329071044922, "rewards/margins": 5.050776481628418, "rewards/rejected": -4.161343574523926, "step": 12789 }, { "epoch": 0.6779211830493203, "grad_norm": 42.0, "kl": 3.42864990234375, "learning_rate": 5e-07, "logits/chosen": -5342196.333333333, "logits/rejected": 637106.8, "logps/chosen": -677.5320638020834, "logps/rejected": -176.2132080078125, "loss": 0.1474, "rewards/chosen": 1.4280613263448079, "rewards/margins": 4.478948052724203, "rewards/rejected": -3.0508867263793946, "step": 12790 }, { "epoch": 0.6779741870511223, "grad_norm": 48.0, "kl": 1.062591552734375, "learning_rate": 5e-07, "logits/chosen": -11839920.8, "logits/rejected": 2680490.5, "logps/chosen": -208.1731201171875, "logps/rejected": -83.9113057454427, "loss": 0.3876, "rewards/chosen": 0.5745237350463868, "rewards/margins": 1.211451021830241, "rewards/rejected": -0.6369272867838541, "step": 12791 }, { "epoch": 0.6780271910529245, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17915464.0, "logits/rejected": -29214576.0, "logps/chosen": -256.9488525390625, "logps/rejected": -358.630615234375, "loss": 0.1699, "rewards/chosen": 0.7833271622657776, "rewards/margins": 3.1003440817197165, "rewards/rejected": -2.317016919453939, "step": 12792 }, { "epoch": 0.6780801950547266, "grad_norm": 47.75, "kl": 1.348475456237793, "learning_rate": 5e-07, "logits/chosen": -10182988.0, "logits/rejected": -43836883.2, "logps/chosen": -138.62948608398438, "logps/rejected": -381.659130859375, "loss": 0.3182, "rewards/chosen": -0.03676885366439819, "rewards/margins": 1.7405758500099182, "rewards/rejected": -1.7773447036743164, "step": 12793 }, { "epoch": 0.6781331990565288, "grad_norm": 68.0, "kl": 0.9994735717773438, "learning_rate": 5e-07, "logits/chosen": -27844662.4, "logits/rejected": -22316672.0, "logps/chosen": -221.4585693359375, "logps/rejected": -228.18098958333334, "loss": 0.4511, "rewards/chosen": -0.19511371850967407, "rewards/margins": 1.342978258927663, "rewards/rejected": -1.5380919774373372, "step": 12794 }, { "epoch": 0.6781862030583309, "grad_norm": 38.25, "kl": 2.6425094604492188, "learning_rate": 5e-07, "logits/chosen": -14651911.0, "logits/rejected": -42382476.0, "logps/chosen": -334.4151306152344, "logps/rejected": -383.0331115722656, "loss": 0.2204, "rewards/chosen": 1.0013033151626587, "rewards/margins": 4.58710253238678, "rewards/rejected": -3.585799217224121, "step": 12795 }, { "epoch": 0.6782392070601331, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3795260.6666666665, "logits/rejected": -21461913.6, "logps/chosen": -488.0224202473958, "logps/rejected": -268.4308349609375, "loss": 0.1382, "rewards/chosen": 1.7101054191589355, "rewards/margins": 4.0894349098205565, "rewards/rejected": -2.379329490661621, "step": 12796 }, { "epoch": 0.6782922110619352, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -120832213.33333333, "logits/rejected": -1328844.6, "logps/chosen": -377.3048095703125, "logps/rejected": -101.54320678710937, "loss": 0.2024, "rewards/chosen": 0.18887736399968466, "rewards/margins": 3.44960009654363, "rewards/rejected": -3.2607227325439454, "step": 12797 }, { "epoch": 0.6783452150637374, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10268307.0, "logits/rejected": -19473542.0, "logps/chosen": -435.4071960449219, "logps/rejected": -379.2464904785156, "loss": 0.2246, "rewards/chosen": 0.9481891393661499, "rewards/margins": 3.5906981229782104, "rewards/rejected": -2.6425089836120605, "step": 12798 }, { "epoch": 0.6783982190655394, "grad_norm": 52.75, "kl": 2.248403549194336, "learning_rate": 5e-07, "logits/chosen": -22965764.8, "logits/rejected": -17515008.0, "logps/chosen": -206.196826171875, "logps/rejected": -121.42819213867188, "loss": 0.3881, "rewards/chosen": 0.3180074214935303, "rewards/margins": 1.8024870077768962, "rewards/rejected": -1.484479586283366, "step": 12799 }, { "epoch": 0.6784512230673416, "grad_norm": 25.125, "kl": 2.4219894409179688, "learning_rate": 5e-07, "logits/chosen": 1812226.0, "logits/rejected": -37949171.2, "logps/chosen": -121.23751831054688, "logps/rejected": -337.0518798828125, "loss": 0.15, "rewards/chosen": 1.3054447174072266, "rewards/margins": 4.026387405395508, "rewards/rejected": -2.7209426879882814, "step": 12800 }, { "epoch": 0.6785042270691437, "grad_norm": 44.0, "kl": 1.7457151412963867, "learning_rate": 5e-07, "logits/chosen": -9220948.8, "logits/rejected": -53050213.333333336, "logps/chosen": -301.9998046875, "logps/rejected": -341.4201253255208, "loss": 0.2693, "rewards/chosen": 1.0907978057861327, "rewards/margins": 3.0594831148783364, "rewards/rejected": -1.9686853090922039, "step": 12801 }, { "epoch": 0.6785572310709459, "grad_norm": 57.75, "kl": 1.694502830505371, "learning_rate": 5e-07, "logits/chosen": -29488888.0, "logits/rejected": -9203473.0, "logps/chosen": -341.879638671875, "logps/rejected": -531.8603515625, "loss": 0.3575, "rewards/chosen": 0.5303703943888346, "rewards/margins": 2.404887596766154, "rewards/rejected": -1.8745172023773193, "step": 12802 }, { "epoch": 0.678610235072748, "grad_norm": 40.0, "kl": 0.5750808715820312, "learning_rate": 5e-07, "logits/chosen": -37262292.0, "logits/rejected": -24622240.0, "logps/chosen": -613.5986328125, "logps/rejected": -372.66542271205356, "loss": 0.1419, "rewards/chosen": 3.300152540206909, "rewards/margins": 5.181948968342372, "rewards/rejected": -1.881796428135463, "step": 12803 }, { "epoch": 0.6786632390745502, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27410642.666666668, "logits/rejected": -2964748.8, "logps/chosen": -383.82666015625, "logps/rejected": -228.6370361328125, "loss": 0.2742, "rewards/chosen": 0.11970723668734233, "rewards/margins": 2.103793273369471, "rewards/rejected": -1.984086036682129, "step": 12804 }, { "epoch": 0.6787162430763523, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16124872.0, "logits/rejected": 2368028.8333333335, "logps/chosen": -320.2691955566406, "logps/rejected": -239.18107096354166, "loss": 0.1696, "rewards/chosen": 0.6675354242324829, "rewards/margins": 3.56967826684316, "rewards/rejected": -2.9021428426106772, "step": 12805 }, { "epoch": 0.6787692470781544, "grad_norm": 71.0, "kl": 1.5042543411254883, "learning_rate": 5e-07, "logits/chosen": -29341872.0, "logits/rejected": -2383069.0, "logps/chosen": -191.9559122721354, "logps/rejected": -191.4381591796875, "loss": 0.3497, "rewards/chosen": -0.07293446858723958, "rewards/margins": 1.7590359369913737, "rewards/rejected": -1.8319704055786132, "step": 12806 }, { "epoch": 0.6788222510799565, "grad_norm": 40.0, "kl": 1.0913276672363281, "learning_rate": 5e-07, "logits/chosen": -31189282.0, "logits/rejected": -50813688.0, "logps/chosen": -189.84747314453125, "logps/rejected": -390.2248840332031, "loss": 0.2986, "rewards/chosen": 0.3603436350822449, "rewards/margins": 3.169838845729828, "rewards/rejected": -2.809495210647583, "step": 12807 }, { "epoch": 0.6788752550817587, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -120416824.0, "logits/rejected": -22086854.85714286, "logps/chosen": -449.79681396484375, "logps/rejected": -174.27143205915178, "loss": 0.1729, "rewards/chosen": 0.3542419373989105, "rewards/margins": 2.3208394944667816, "rewards/rejected": -1.966597557067871, "step": 12808 }, { "epoch": 0.6789282590835608, "grad_norm": 63.75, "kl": 0.39981651306152344, "learning_rate": 5e-07, "logits/chosen": -27040569.6, "logits/rejected": -36729162.666666664, "logps/chosen": -153.12099609375, "logps/rejected": -550.4770100911459, "loss": 0.442, "rewards/chosen": -0.10202083587646485, "rewards/margins": 1.6500678698221842, "rewards/rejected": -1.7520887056986492, "step": 12809 }, { "epoch": 0.678981263085363, "grad_norm": 33.25, "kl": 2.35357666015625, "learning_rate": 5e-07, "logits/chosen": -22350780.0, "logits/rejected": -25149068.8, "logps/chosen": -524.5873616536459, "logps/rejected": -268.615380859375, "loss": 0.2493, "rewards/chosen": 1.416136900583903, "rewards/margins": 3.330398146311442, "rewards/rejected": -1.914261245727539, "step": 12810 }, { "epoch": 0.6790342670871651, "grad_norm": 39.0, "kl": 0.28501033782958984, "learning_rate": 5e-07, "logits/chosen": -33609648.0, "logits/rejected": -29502784.0, "logps/chosen": -239.2079874674479, "logps/rejected": -277.893798828125, "loss": 0.2684, "rewards/chosen": 0.3959919611612956, "rewards/margins": 2.1772801081339517, "rewards/rejected": -1.7812881469726562, "step": 12811 }, { "epoch": 0.6790872710889673, "grad_norm": 62.75, "kl": 0.16895484924316406, "learning_rate": 5e-07, "logits/chosen": -13507579.0, "logits/rejected": -5921228.0, "logps/chosen": -162.46713256835938, "logps/rejected": -298.73488362630206, "loss": 0.3007, "rewards/chosen": -0.4537836015224457, "rewards/margins": 1.0940906902154286, "rewards/rejected": -1.5478742917378743, "step": 12812 }, { "epoch": 0.6791402750907694, "grad_norm": 45.25, "kl": 0.6820888519287109, "learning_rate": 5e-07, "logits/chosen": -4915401.333333333, "logits/rejected": -6773016.0, "logps/chosen": -229.193115234375, "logps/rejected": -250.5273193359375, "loss": 0.2801, "rewards/chosen": 0.22537243366241455, "rewards/margins": 2.1907822847366334, "rewards/rejected": -1.9654098510742188, "step": 12813 }, { "epoch": 0.6791932790925715, "grad_norm": 44.5, "kl": 2.1377248764038086, "learning_rate": 5e-07, "logits/chosen": -35315008.0, "logits/rejected": -27921034.0, "logps/chosen": -323.6915588378906, "logps/rejected": -301.4918518066406, "loss": 0.2307, "rewards/chosen": 0.9887478351593018, "rewards/margins": 2.8402366638183594, "rewards/rejected": -1.8514888286590576, "step": 12814 }, { "epoch": 0.6792462830943736, "grad_norm": 26.375, "kl": 1.9314050674438477, "learning_rate": 5e-07, "logits/chosen": -16591937.6, "logits/rejected": 10853783.333333334, "logps/chosen": -91.44408569335937, "logps/rejected": -109.17572021484375, "loss": 0.2786, "rewards/chosen": 0.9366941452026367, "rewards/margins": 3.2783339818318686, "rewards/rejected": -2.341639836629232, "step": 12815 }, { "epoch": 0.6792992870961757, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53318064.0, "logits/rejected": -39137042.28571428, "logps/chosen": -451.59246826171875, "logps/rejected": -402.33701869419644, "loss": 0.1797, "rewards/chosen": 0.008374023251235485, "rewards/margins": 2.7866918834458505, "rewards/rejected": -2.778317860194615, "step": 12816 }, { "epoch": 0.6793522910979779, "grad_norm": 54.5, "kl": 4.5981903076171875, "learning_rate": 5e-07, "logits/chosen": -12749830.666666666, "logits/rejected": -8608199.0, "logps/chosen": -421.6107584635417, "logps/rejected": -243.2369384765625, "loss": 0.3451, "rewards/chosen": 0.6947950522104899, "rewards/margins": 3.1098164717356362, "rewards/rejected": -2.4150214195251465, "step": 12817 }, { "epoch": 0.67940529509978, "grad_norm": 51.5, "kl": 1.4702987670898438, "learning_rate": 5e-07, "logits/chosen": -19546538.0, "logits/rejected": -45682620.0, "logps/chosen": -251.65414428710938, "logps/rejected": -217.03086853027344, "loss": 0.2606, "rewards/chosen": 0.6290024518966675, "rewards/margins": 2.5921852588653564, "rewards/rejected": -1.963182806968689, "step": 12818 }, { "epoch": 0.6794582991015822, "grad_norm": 56.0, "kl": 0.8625469207763672, "learning_rate": 5e-07, "logits/chosen": -23643381.333333332, "logits/rejected": 4102926.8, "logps/chosen": -151.30549112955728, "logps/rejected": -172.00718994140624, "loss": 0.2331, "rewards/chosen": 0.536526600519816, "rewards/margins": 2.960869518915812, "rewards/rejected": -2.424342918395996, "step": 12819 }, { "epoch": 0.6795113031033843, "grad_norm": 57.75, "kl": 2.350356101989746, "learning_rate": 5e-07, "logits/chosen": -28903754.666666668, "logits/rejected": -25507140.0, "logps/chosen": -316.74863688151044, "logps/rejected": -211.40155029296875, "loss": 0.5037, "rewards/chosen": -0.1258927583694458, "rewards/margins": 1.3687827587127686, "rewards/rejected": -1.4946755170822144, "step": 12820 }, { "epoch": 0.6795643071051864, "grad_norm": 53.5, "kl": 1.2544622421264648, "learning_rate": 5e-07, "logits/chosen": -24884240.0, "logits/rejected": -37014170.666666664, "logps/chosen": -434.746484375, "logps/rejected": -261.68609619140625, "loss": 0.2771, "rewards/chosen": 0.8280890464782715, "rewards/margins": 3.0710878054300945, "rewards/rejected": -2.2429987589518228, "step": 12821 }, { "epoch": 0.6796173111069885, "grad_norm": 53.0, "kl": 1.4265785217285156, "learning_rate": 5e-07, "logits/chosen": -26064964.0, "logits/rejected": -24531672.0, "logps/chosen": -439.5075378417969, "logps/rejected": -355.2615051269531, "loss": 0.2544, "rewards/chosen": 0.33990994095802307, "rewards/margins": 3.645144611597061, "rewards/rejected": -3.305234670639038, "step": 12822 }, { "epoch": 0.6796703151087907, "grad_norm": 44.75, "kl": 0.4047279357910156, "learning_rate": 5e-07, "logits/chosen": -4534140.0, "logits/rejected": -29520072.0, "logps/chosen": -196.03904724121094, "logps/rejected": -343.84698486328125, "loss": 0.28, "rewards/chosen": 0.18953672051429749, "rewards/margins": 3.0430075228214264, "rewards/rejected": -2.853470802307129, "step": 12823 }, { "epoch": 0.6797233191105928, "grad_norm": 52.0, "kl": 2.287985324859619, "learning_rate": 5e-07, "logits/chosen": -4403849.333333333, "logits/rejected": -53544444.0, "logps/chosen": -113.60593668619792, "logps/rejected": -689.6800537109375, "loss": 0.3023, "rewards/chosen": 0.7983795007069906, "rewards/margins": 3.5609923203786216, "rewards/rejected": -2.762612819671631, "step": 12824 }, { "epoch": 0.679776323112395, "grad_norm": 50.0, "kl": 2.953866958618164, "learning_rate": 5e-07, "logits/chosen": -23531942.4, "logits/rejected": -38833866.666666664, "logps/chosen": -243.562109375, "logps/rejected": -352.532958984375, "loss": 0.3116, "rewards/chosen": 0.8226770401000977, "rewards/margins": 2.9984258651733398, "rewards/rejected": -2.175748825073242, "step": 12825 }, { "epoch": 0.6798293271141971, "grad_norm": 53.5, "kl": 1.6299705505371094, "learning_rate": 5e-07, "logits/chosen": -25577954.666666668, "logits/rejected": -45814552.0, "logps/chosen": -265.0996907552083, "logps/rejected": -913.671142578125, "loss": 0.3093, "rewards/chosen": 0.6736404895782471, "rewards/margins": 5.296552896499634, "rewards/rejected": -4.622912406921387, "step": 12826 }, { "epoch": 0.6798823311159993, "grad_norm": 38.0, "kl": 4.1723432540893555, "learning_rate": 5e-07, "logits/chosen": -14256915.2, "logits/rejected": -25729013.333333332, "logps/chosen": -165.30626220703124, "logps/rejected": -675.8299967447916, "loss": 0.2667, "rewards/chosen": 0.9231106758117675, "rewards/margins": 5.265153598785401, "rewards/rejected": -4.342042922973633, "step": 12827 }, { "epoch": 0.6799353351178014, "grad_norm": 31.125, "kl": 2.5463619232177734, "learning_rate": 5e-07, "logits/chosen": -8662668.666666666, "logits/rejected": 1536071.8, "logps/chosen": -242.11431884765625, "logps/rejected": -295.48349609375, "loss": 0.132, "rewards/chosen": 1.73674472173055, "rewards/margins": 4.43552172978719, "rewards/rejected": -2.6987770080566404, "step": 12828 }, { "epoch": 0.6799883391196035, "grad_norm": 38.5, "kl": 3.4412012100219727, "learning_rate": 5e-07, "logits/chosen": 3068128.0, "logits/rejected": -2300987.0, "logps/chosen": -99.82773844401042, "logps/rejected": -100.49373168945313, "loss": 0.2843, "rewards/chosen": 0.7673874696095785, "rewards/margins": 1.9785057862599693, "rewards/rejected": -1.2111183166503907, "step": 12829 }, { "epoch": 0.6800413431214056, "grad_norm": 75.0, "kl": 3.404791831970215, "learning_rate": 5e-07, "logits/chosen": -22159100.0, "logits/rejected": -2780097.5, "logps/chosen": -595.975830078125, "logps/rejected": -490.66839599609375, "loss": 0.2809, "rewards/chosen": 1.3530830144882202, "rewards/margins": 2.844537138938904, "rewards/rejected": -1.4914541244506836, "step": 12830 }, { "epoch": 0.6800943471232078, "grad_norm": 44.5, "kl": 1.1692981719970703, "learning_rate": 5e-07, "logits/chosen": -15878302.0, "logits/rejected": -45004764.0, "logps/chosen": -170.1029052734375, "logps/rejected": -414.0189514160156, "loss": 0.2378, "rewards/chosen": 0.8278573155403137, "rewards/margins": 3.0218647122383118, "rewards/rejected": -2.194007396697998, "step": 12831 }, { "epoch": 0.6801473511250099, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22458716.0, "logits/rejected": -10305499.0, "logps/chosen": -456.3223876953125, "logps/rejected": -158.85501098632812, "loss": 0.2912, "rewards/chosen": 0.8462806940078735, "rewards/margins": 2.00112521648407, "rewards/rejected": -1.1548445224761963, "step": 12832 }, { "epoch": 0.6802003551268121, "grad_norm": 45.75, "kl": 0.9067764282226562, "learning_rate": 5e-07, "logits/chosen": -26476980.0, "logits/rejected": -11904641.0, "logps/chosen": -418.5721435546875, "logps/rejected": -169.87484741210938, "loss": 0.2462, "rewards/chosen": 0.447296142578125, "rewards/margins": 3.417581081390381, "rewards/rejected": -2.970284938812256, "step": 12833 }, { "epoch": 0.6802533591286142, "grad_norm": 60.0, "kl": 2.962646484375, "learning_rate": 5e-07, "logits/chosen": -16933264.0, "logits/rejected": -17816480.0, "logps/chosen": -281.03131103515625, "logps/rejected": -224.24896240234375, "loss": 0.2533, "rewards/chosen": 0.69683837890625, "rewards/margins": 2.727378527323405, "rewards/rejected": -2.030540148417155, "step": 12834 }, { "epoch": 0.6803063631304164, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48646163.2, "logits/rejected": -8949433.333333334, "logps/chosen": -416.473974609375, "logps/rejected": -107.78757731119792, "loss": 0.2937, "rewards/chosen": 0.6436926364898682, "rewards/margins": 2.9950822035471596, "rewards/rejected": -2.3513895670572915, "step": 12835 }, { "epoch": 0.6803593671322185, "grad_norm": 80.5, "kl": 0.2372589111328125, "learning_rate": 5e-07, "logits/chosen": -24782266.666666668, "logits/rejected": -11236684.8, "logps/chosen": -594.7589111328125, "logps/rejected": -123.7216064453125, "loss": 0.2471, "rewards/chosen": 0.518311063448588, "rewards/margins": 2.48135351339976, "rewards/rejected": -1.9630424499511718, "step": 12836 }, { "epoch": 0.6804123711340206, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26683242.666666668, "logits/rejected": -25400380.8, "logps/chosen": -268.8802897135417, "logps/rejected": -371.5755859375, "loss": 0.2338, "rewards/chosen": 0.3197791973749797, "rewards/margins": 2.9254642407099403, "rewards/rejected": -2.6056850433349608, "step": 12837 }, { "epoch": 0.6804653751358227, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2869645.6666666665, "logits/rejected": 7941348.8, "logps/chosen": -143.67045084635416, "logps/rejected": -328.6613037109375, "loss": 0.2578, "rewards/chosen": 0.4633597135543823, "rewards/margins": 2.3213267087936402, "rewards/rejected": -1.857966995239258, "step": 12838 }, { "epoch": 0.6805183791376249, "grad_norm": 40.5, "kl": 1.7196025848388672, "learning_rate": 5e-07, "logits/chosen": -39697193.6, "logits/rejected": -68478368.0, "logps/chosen": -114.9413330078125, "logps/rejected": -407.3872884114583, "loss": 0.3703, "rewards/chosen": 0.15847086906433105, "rewards/margins": 2.5277254581451416, "rewards/rejected": -2.3692545890808105, "step": 12839 }, { "epoch": 0.680571383139427, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25701237.333333332, "logits/rejected": -36587398.4, "logps/chosen": -719.2445475260416, "logps/rejected": -335.853173828125, "loss": 0.1896, "rewards/chosen": 0.8754669825236002, "rewards/margins": 3.5054728190104165, "rewards/rejected": -2.6300058364868164, "step": 12840 }, { "epoch": 0.6806243871412292, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32163824.0, "logits/rejected": -93503701.33333333, "logps/chosen": -354.60068359375, "logps/rejected": -537.4636637369791, "loss": 0.2539, "rewards/chosen": 0.5683959484100342, "rewards/margins": 3.5841573238372804, "rewards/rejected": -3.015761375427246, "step": 12841 }, { "epoch": 0.6806773911430313, "grad_norm": 42.0, "kl": 3.1714820861816406, "learning_rate": 5e-07, "logits/chosen": -36590832.0, "logits/rejected": -6910551.0, "logps/chosen": -499.42364501953125, "logps/rejected": -259.052490234375, "loss": 0.3187, "rewards/chosen": 0.7268503308296204, "rewards/margins": 3.0778756737709045, "rewards/rejected": -2.351025342941284, "step": 12842 }, { "epoch": 0.6807303951448335, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70358090.66666667, "logits/rejected": 5749902.4, "logps/chosen": -535.2945149739584, "logps/rejected": -234.62509765625, "loss": 0.2097, "rewards/chosen": 0.5080881913503011, "rewards/margins": 3.548768695195516, "rewards/rejected": -3.040680503845215, "step": 12843 }, { "epoch": 0.6807833991466355, "grad_norm": 48.75, "kl": 2.6613950729370117, "learning_rate": 5e-07, "logits/chosen": 18718788.0, "logits/rejected": -14687592.0, "logps/chosen": -381.1700439453125, "logps/rejected": -227.94740295410156, "loss": 0.2146, "rewards/chosen": 1.428286870320638, "rewards/margins": 4.08135708173116, "rewards/rejected": -2.6530702114105225, "step": 12844 }, { "epoch": 0.6808364031484377, "grad_norm": 46.5, "kl": 1.3380165100097656, "learning_rate": 5e-07, "logits/chosen": -4978502.5, "logits/rejected": -34468592.0, "logps/chosen": -147.54762268066406, "logps/rejected": -220.9899139404297, "loss": 0.2835, "rewards/chosen": 0.522712767124176, "rewards/margins": 2.371259391307831, "rewards/rejected": -1.8485466241836548, "step": 12845 }, { "epoch": 0.6808894071502398, "grad_norm": 35.75, "kl": 0.7214951515197754, "learning_rate": 5e-07, "logits/chosen": -22767404.0, "logits/rejected": -49782716.0, "logps/chosen": -125.48440551757812, "logps/rejected": -666.1690673828125, "loss": 0.2328, "rewards/chosen": 0.7048123478889465, "rewards/margins": 5.089095413684845, "rewards/rejected": -4.384283065795898, "step": 12846 }, { "epoch": 0.680942411152042, "grad_norm": 45.75, "kl": 0.4566969871520996, "learning_rate": 5e-07, "logits/chosen": -80393113.6, "logits/rejected": -8092047.333333333, "logps/chosen": -189.7218017578125, "logps/rejected": -290.85988362630206, "loss": 0.3211, "rewards/chosen": 0.2411522388458252, "rewards/margins": 2.7342276096343996, "rewards/rejected": -2.493075370788574, "step": 12847 }, { "epoch": 0.6809954151538441, "grad_norm": 72.0, "kl": 3.6740036010742188, "learning_rate": 5e-07, "logits/chosen": -37631228.8, "logits/rejected": -36006677.333333336, "logps/chosen": -474.852734375, "logps/rejected": -505.5446370442708, "loss": 0.2625, "rewards/chosen": 1.2783979415893554, "rewards/margins": 3.8100669225056967, "rewards/rejected": -2.5316689809163413, "step": 12848 }, { "epoch": 0.6810484191556463, "grad_norm": 49.0, "kl": 1.1319751739501953, "learning_rate": 5e-07, "logits/chosen": -3477525.3333333335, "logits/rejected": -28887651.2, "logps/chosen": -172.4000447591146, "logps/rejected": -399.06669921875, "loss": 0.2998, "rewards/chosen": 0.23665491739908853, "rewards/margins": 1.9206915537516276, "rewards/rejected": -1.684036636352539, "step": 12849 }, { "epoch": 0.6811014231574484, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24331397.333333332, "logits/rejected": -114115993.6, "logps/chosen": -213.66446940104166, "logps/rejected": -618.5431640625, "loss": 0.2163, "rewards/chosen": -0.10190429290135701, "rewards/margins": 3.762105564276377, "rewards/rejected": -3.8640098571777344, "step": 12850 }, { "epoch": 0.6811544271592506, "grad_norm": 50.25, "kl": 1.5022544860839844, "learning_rate": 5e-07, "logits/chosen": -48679576.0, "logits/rejected": -38349120.0, "logps/chosen": -376.8567810058594, "logps/rejected": -412.7410888671875, "loss": 0.2533, "rewards/chosen": 0.7761039733886719, "rewards/margins": 3.13425612449646, "rewards/rejected": -2.358152151107788, "step": 12851 }, { "epoch": 0.6812074311610526, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45098416.0, "logits/rejected": -13345586.285714285, "logps/chosen": -375.26434326171875, "logps/rejected": -433.2608119419643, "loss": 0.1391, "rewards/chosen": 0.33598634600639343, "rewards/margins": 3.693423479795456, "rewards/rejected": -3.3574371337890625, "step": 12852 }, { "epoch": 0.6812604351628548, "grad_norm": 52.0, "kl": 0.7586383819580078, "learning_rate": 5e-07, "logits/chosen": -47644196.0, "logits/rejected": -16914876.0, "logps/chosen": -373.6762390136719, "logps/rejected": -304.1945495605469, "loss": 0.3181, "rewards/chosen": 0.17913571000099182, "rewards/margins": 2.5958527624607086, "rewards/rejected": -2.416717052459717, "step": 12853 }, { "epoch": 0.6813134391646569, "grad_norm": 40.0, "kl": 0.010753631591796875, "learning_rate": 5e-07, "logits/chosen": -27078320.0, "logits/rejected": -44754400.0, "logps/chosen": -338.9582112630208, "logps/rejected": -466.88876953125, "loss": 0.1861, "rewards/chosen": 0.47911326090494794, "rewards/margins": 3.48139902750651, "rewards/rejected": -3.0022857666015623, "step": 12854 }, { "epoch": 0.6813664431664591, "grad_norm": 47.5, "kl": 0.24468231201171875, "learning_rate": 5e-07, "logits/chosen": -21415328.0, "logits/rejected": -9643287.333333334, "logps/chosen": -348.0341064453125, "logps/rejected": -136.99808756510416, "loss": 0.251, "rewards/chosen": 0.7661765098571778, "rewards/margins": 3.5302171389261883, "rewards/rejected": -2.7640406290690103, "step": 12855 }, { "epoch": 0.6814194471682612, "grad_norm": 49.5, "kl": 2.1961631774902344, "learning_rate": 5e-07, "logits/chosen": -19831174.4, "logits/rejected": -27327210.666666668, "logps/chosen": -386.7298828125, "logps/rejected": -139.6780802408854, "loss": 0.2558, "rewards/chosen": 1.1162689208984375, "rewards/margins": 3.145533688863119, "rewards/rejected": -2.029264767964681, "step": 12856 }, { "epoch": 0.6814724511700634, "grad_norm": 50.5, "kl": 2.4390478134155273, "learning_rate": 5e-07, "logits/chosen": -19694564.0, "logits/rejected": -7076322.5, "logps/chosen": -218.81791178385416, "logps/rejected": -278.49609375, "loss": 0.3574, "rewards/chosen": 0.6802291870117188, "rewards/margins": 2.3039214611053467, "rewards/rejected": -1.623692274093628, "step": 12857 }, { "epoch": 0.6815254551718655, "grad_norm": 44.75, "kl": 0.5216033458709717, "learning_rate": 5e-07, "logits/chosen": -12604480.0, "logits/rejected": -10216150.0, "logps/chosen": -204.81552124023438, "logps/rejected": -303.9417724609375, "loss": 0.3273, "rewards/chosen": 0.33012425899505615, "rewards/margins": 1.8162152767181396, "rewards/rejected": -1.4860910177230835, "step": 12858 }, { "epoch": 0.6815784591736677, "grad_norm": 45.5, "kl": 0.6309490203857422, "learning_rate": 5e-07, "logits/chosen": -66129962.666666664, "logits/rejected": -1001179.2, "logps/chosen": -275.8451334635417, "logps/rejected": -526.90888671875, "loss": 0.23, "rewards/chosen": 0.4785037438074748, "rewards/margins": 3.4434127251307167, "rewards/rejected": -2.964908981323242, "step": 12859 }, { "epoch": 0.6816314631754697, "grad_norm": 30.75, "kl": 2.8374404907226562, "learning_rate": 5e-07, "logits/chosen": -15282269.333333334, "logits/rejected": -58361420.8, "logps/chosen": -375.5747884114583, "logps/rejected": -281.61826171875, "loss": 0.1185, "rewards/chosen": 1.8649487495422363, "rewards/margins": 4.685521602630615, "rewards/rejected": -2.820572853088379, "step": 12860 }, { "epoch": 0.6816844671772719, "grad_norm": 53.75, "kl": 4.202570915222168, "learning_rate": 5e-07, "logits/chosen": -41506868.0, "logits/rejected": -86065432.0, "logps/chosen": -292.40594482421875, "logps/rejected": -333.43560791015625, "loss": 0.3466, "rewards/chosen": 0.46320098638534546, "rewards/margins": 1.7576132416725159, "rewards/rejected": -1.2944122552871704, "step": 12861 }, { "epoch": 0.681737471179074, "grad_norm": 40.5, "kl": 0.29639530181884766, "learning_rate": 5e-07, "logits/chosen": -46807068.0, "logits/rejected": -44190048.0, "logps/chosen": -252.70034790039062, "logps/rejected": -465.1419677734375, "loss": 0.3073, "rewards/chosen": 0.002235129475593567, "rewards/margins": 2.5270664244890213, "rewards/rejected": -2.5248312950134277, "step": 12862 }, { "epoch": 0.6817904751808762, "grad_norm": 37.0, "kl": 0.5501441955566406, "learning_rate": 5e-07, "logits/chosen": -20145056.0, "logits/rejected": 1107356.0, "logps/chosen": -308.0791320800781, "logps/rejected": -251.62933349609375, "loss": 0.2249, "rewards/chosen": 1.573673963546753, "rewards/margins": 3.170311689376831, "rewards/rejected": -1.5966377258300781, "step": 12863 }, { "epoch": 0.6818434791826783, "grad_norm": 59.0, "kl": 1.2220325469970703, "learning_rate": 5e-07, "logits/chosen": -61076890.666666664, "logits/rejected": -2791881.6, "logps/chosen": -1000.003173828125, "logps/rejected": -168.96495361328124, "loss": 0.2618, "rewards/chosen": 1.4673757553100586, "rewards/margins": 2.446013069152832, "rewards/rejected": -0.9786373138427734, "step": 12864 }, { "epoch": 0.6818964831844804, "grad_norm": 38.5, "kl": 1.3749008178710938, "learning_rate": 5e-07, "logits/chosen": -62218912.0, "logits/rejected": -16116190.4, "logps/chosen": -373.1373697916667, "logps/rejected": -305.713720703125, "loss": 0.2649, "rewards/chosen": 1.0154698689778645, "rewards/margins": 2.5681225140889485, "rewards/rejected": -1.552652645111084, "step": 12865 }, { "epoch": 0.6819494871862826, "grad_norm": 65.0, "kl": 2.379974126815796, "learning_rate": 5e-07, "logits/chosen": -15350000.0, "logits/rejected": -52700117.333333336, "logps/chosen": -251.93876953125, "logps/rejected": -384.2577311197917, "loss": 0.3561, "rewards/chosen": 0.46199607849121094, "rewards/margins": 2.369025230407715, "rewards/rejected": -1.907029151916504, "step": 12866 }, { "epoch": 0.6820024911880846, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14875628.8, "logits/rejected": -20798133.333333332, "logps/chosen": -299.3945556640625, "logps/rejected": -287.9637858072917, "loss": 0.317, "rewards/chosen": 0.1832864761352539, "rewards/margins": 2.663674163818359, "rewards/rejected": -2.4803876876831055, "step": 12867 }, { "epoch": 0.6820554951898868, "grad_norm": 47.0, "kl": 1.310516357421875, "learning_rate": 5e-07, "logits/chosen": -42849444.0, "logits/rejected": -42296389.333333336, "logps/chosen": -672.0239868164062, "logps/rejected": -376.0007731119792, "loss": 0.1839, "rewards/chosen": 1.4666550159454346, "rewards/margins": 3.649176836013794, "rewards/rejected": -2.1825218200683594, "step": 12868 }, { "epoch": 0.6821084991916889, "grad_norm": 52.25, "kl": 4.630239486694336, "learning_rate": 5e-07, "logits/chosen": -20359588.0, "logits/rejected": -28129124.0, "logps/chosen": -315.77301025390625, "logps/rejected": -181.14285278320312, "loss": 0.3537, "rewards/chosen": 0.8689860701560974, "rewards/margins": 2.1242485642433167, "rewards/rejected": -1.2552624940872192, "step": 12869 }, { "epoch": 0.6821615031934911, "grad_norm": 37.0, "kl": 1.3115167617797852, "learning_rate": 5e-07, "logits/chosen": -1516385.8, "logits/rejected": 32227760.0, "logps/chosen": -189.27138671875, "logps/rejected": -461.8052571614583, "loss": 0.337, "rewards/chosen": 0.3721742630004883, "rewards/margins": 3.1604347229003906, "rewards/rejected": -2.7882604598999023, "step": 12870 }, { "epoch": 0.6822145071952932, "grad_norm": 50.0, "kl": 1.3495368957519531, "learning_rate": 5e-07, "logits/chosen": -45139433.6, "logits/rejected": -38595746.666666664, "logps/chosen": -324.078076171875, "logps/rejected": -301.6155192057292, "loss": 0.375, "rewards/chosen": 0.08764435052871704, "rewards/margins": 2.187571934858958, "rewards/rejected": -2.0999275843302407, "step": 12871 }, { "epoch": 0.6822675111970954, "grad_norm": 49.75, "kl": 1.592367172241211, "learning_rate": 5e-07, "logits/chosen": -13818433.6, "logits/rejected": -26508045.333333332, "logps/chosen": -209.999365234375, "logps/rejected": -257.94000244140625, "loss": 0.262, "rewards/chosen": 0.6534234523773194, "rewards/margins": 4.7015509446462, "rewards/rejected": -4.04812749226888, "step": 12872 }, { "epoch": 0.6823205151988975, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5580063.666666667, "logits/rejected": -26062844.8, "logps/chosen": -189.244140625, "logps/rejected": -260.8044189453125, "loss": 0.2872, "rewards/chosen": 0.2539968093236287, "rewards/margins": 2.039715345700582, "rewards/rejected": -1.7857185363769532, "step": 12873 }, { "epoch": 0.6823735192006997, "grad_norm": 42.0, "kl": 0.12352466583251953, "learning_rate": 5e-07, "logits/chosen": -58067178.666666664, "logits/rejected": -14976036.8, "logps/chosen": -312.6891682942708, "logps/rejected": -345.3212158203125, "loss": 0.2538, "rewards/chosen": 0.24036969741185507, "rewards/margins": 2.81636457045873, "rewards/rejected": -2.575994873046875, "step": 12874 }, { "epoch": 0.6824265232025017, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 74722944.0, "logits/rejected": -60287257.6, "logps/chosen": -367.5777587890625, "logps/rejected": -579.347802734375, "loss": 0.1868, "rewards/chosen": 0.531727115313212, "rewards/margins": 3.0952366749445597, "rewards/rejected": -2.5635095596313477, "step": 12875 }, { "epoch": 0.6824795272043039, "grad_norm": 47.25, "kl": 0.25184059143066406, "learning_rate": 5e-07, "logits/chosen": -49335776.0, "logits/rejected": -27160009.6, "logps/chosen": -157.00325520833334, "logps/rejected": -425.520458984375, "loss": 0.303, "rewards/chosen": -0.07080944379170735, "rewards/margins": 1.6846715132395427, "rewards/rejected": -1.75548095703125, "step": 12876 }, { "epoch": 0.682532531206106, "grad_norm": 50.75, "kl": 4.548915863037109, "learning_rate": 5e-07, "logits/chosen": -10739177.6, "logits/rejected": -8516824.0, "logps/chosen": -242.418310546875, "logps/rejected": -100.32204182942708, "loss": 0.3363, "rewards/chosen": 0.7874575614929199, "rewards/margins": 2.304497178395589, "rewards/rejected": -1.5170396169026692, "step": 12877 }, { "epoch": 0.6825855352079082, "grad_norm": 41.75, "kl": 1.507110595703125, "learning_rate": 5e-07, "logits/chosen": -77614072.0, "logits/rejected": -33337597.333333332, "logps/chosen": -526.3563232421875, "logps/rejected": -336.9944254557292, "loss": 0.1828, "rewards/chosen": 0.5837463140487671, "rewards/margins": 4.20252009232839, "rewards/rejected": -3.6187737782796225, "step": 12878 }, { "epoch": 0.6826385392097103, "grad_norm": 75.0, "kl": 3.5149917602539062, "learning_rate": 5e-07, "logits/chosen": -41048658.666666664, "logits/rejected": -53812572.0, "logps/chosen": -462.1529134114583, "logps/rejected": -794.6400756835938, "loss": 0.3907, "rewards/chosen": 0.6490137577056885, "rewards/margins": 2.533428430557251, "rewards/rejected": -1.8844146728515625, "step": 12879 }, { "epoch": 0.6826915432115125, "grad_norm": 46.75, "kl": 0.9019050598144531, "learning_rate": 5e-07, "logits/chosen": 18388686.666666668, "logits/rejected": -27956336.0, "logps/chosen": -410.6526285807292, "logps/rejected": -161.094384765625, "loss": 0.2234, "rewards/chosen": 0.7885915438334147, "rewards/margins": 2.7939506212870278, "rewards/rejected": -2.005359077453613, "step": 12880 }, { "epoch": 0.6827445472133146, "grad_norm": 49.5, "kl": 2.548595428466797, "learning_rate": 5e-07, "logits/chosen": -22419611.2, "logits/rejected": -61443328.0, "logps/chosen": -190.034912109375, "logps/rejected": -512.5553792317709, "loss": 0.3566, "rewards/chosen": 0.4291866302490234, "rewards/margins": 3.0509193420410154, "rewards/rejected": -2.621732711791992, "step": 12881 }, { "epoch": 0.6827975512151168, "grad_norm": 43.0, "kl": 1.4641914367675781, "learning_rate": 5e-07, "logits/chosen": -3681853.5, "logits/rejected": 944156.0, "logps/chosen": -135.1648712158203, "logps/rejected": -540.5720825195312, "loss": 0.2579, "rewards/chosen": 0.4898080825805664, "rewards/margins": 2.8069376945495605, "rewards/rejected": -2.317129611968994, "step": 12882 }, { "epoch": 0.6828505552169188, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 27235936.0, "logits/rejected": -20755618.666666668, "logps/chosen": -601.232421875, "logps/rejected": -220.40152994791666, "loss": 0.2308, "rewards/chosen": 0.3100738525390625, "rewards/margins": 2.014878749847412, "rewards/rejected": -1.7048048973083496, "step": 12883 }, { "epoch": 0.682903559218721, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -30082384.0, "logps/rejected": -295.524658203125, "loss": 0.1142, "rewards/rejected": -2.43192458152771, "step": 12884 }, { "epoch": 0.6829565632205231, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50119594.666666664, "logits/rejected": -39532166.4, "logps/chosen": -515.7982177734375, "logps/rejected": -189.15, "loss": 0.193, "rewards/chosen": 1.014503002166748, "rewards/margins": 3.5167143821716307, "rewards/rejected": -2.5022113800048826, "step": 12885 }, { "epoch": 0.6830095672223253, "grad_norm": 45.25, "kl": 2.719696044921875, "learning_rate": 5e-07, "logits/chosen": -6998868.0, "logits/rejected": -12184277.333333334, "logps/chosen": -192.54013671875, "logps/rejected": -142.2802530924479, "loss": 0.3397, "rewards/chosen": 0.7127423286437988, "rewards/margins": 2.5898493131001787, "rewards/rejected": -1.8771069844563801, "step": 12886 }, { "epoch": 0.6830625712241274, "grad_norm": 53.5, "kl": 2.0435657501220703, "learning_rate": 5e-07, "logits/chosen": -11178538.666666666, "logits/rejected": -17327598.0, "logps/chosen": -452.8093668619792, "logps/rejected": -537.5905151367188, "loss": 0.2909, "rewards/chosen": 0.9470675786336263, "rewards/margins": 4.383436997731526, "rewards/rejected": -3.4363694190979004, "step": 12887 }, { "epoch": 0.6831155752259296, "grad_norm": 57.75, "kl": 1.240081787109375, "learning_rate": 5e-07, "logits/chosen": -15676624.0, "logits/rejected": 6660023.333333333, "logps/chosen": -331.335107421875, "logps/rejected": -240.31685384114584, "loss": 0.3138, "rewards/chosen": 0.30956521034240725, "rewards/margins": 3.2836023807525634, "rewards/rejected": -2.9740371704101562, "step": 12888 }, { "epoch": 0.6831685792277317, "grad_norm": 36.75, "kl": 2.3988285064697266, "learning_rate": 5e-07, "logits/chosen": -3062595.0, "logits/rejected": -25578681.6, "logps/chosen": -185.53350830078125, "logps/rejected": -277.3632568359375, "loss": 0.2664, "rewards/chosen": -0.12037430206934611, "rewards/margins": 2.6084018667538964, "rewards/rejected": -2.7287761688232424, "step": 12889 }, { "epoch": 0.6832215832295339, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47218976.0, "logits/rejected": 1458223.25, "logps/chosen": -301.60491943359375, "logps/rejected": -279.75604248046875, "loss": 0.3945, "rewards/chosen": -0.3120977580547333, "rewards/margins": 1.4043913185596466, "rewards/rejected": -1.7164890766143799, "step": 12890 }, { "epoch": 0.6832745872313359, "grad_norm": 78.5, "kl": 0.4411277770996094, "learning_rate": 5e-07, "logits/chosen": -34189352.0, "logits/rejected": -12853950.4, "logps/chosen": -391.5321858723958, "logps/rejected": -276.03251953125, "loss": 0.3462, "rewards/chosen": -0.02625123659769694, "rewards/margins": 1.4167804559071857, "rewards/rejected": -1.4430316925048827, "step": 12891 }, { "epoch": 0.6833275912331381, "grad_norm": 57.0, "kl": 5.726909637451172, "learning_rate": 5e-07, "logits/chosen": -36429736.0, "logits/rejected": -201514528.0, "logps/chosen": -306.988037109375, "logps/rejected": -521.5640258789062, "loss": 0.4089, "rewards/chosen": 0.4151938358942668, "rewards/margins": 2.9816611210505166, "rewards/rejected": -2.56646728515625, "step": 12892 }, { "epoch": 0.6833805952349402, "grad_norm": 45.25, "kl": 3.3651561737060547, "learning_rate": 5e-07, "logits/chosen": -50319992.0, "logits/rejected": -46763936.0, "logps/chosen": -685.154052734375, "logps/rejected": -349.8883056640625, "loss": 0.1903, "rewards/chosen": 1.3704776763916016, "rewards/margins": 4.604315280914307, "rewards/rejected": -3.233837604522705, "step": 12893 }, { "epoch": 0.6834335992367424, "grad_norm": 34.25, "kl": 3.563220977783203, "learning_rate": 5e-07, "logits/chosen": -25718874.666666668, "logits/rejected": -31072998.4, "logps/chosen": -297.8600260416667, "logps/rejected": -411.805859375, "loss": 0.2731, "rewards/chosen": 0.5037497282028198, "rewards/margins": 3.1351771116256715, "rewards/rejected": -2.6314273834228517, "step": 12894 }, { "epoch": 0.6834866032385445, "grad_norm": 64.5, "kl": 2.527242660522461, "learning_rate": 5e-07, "logits/chosen": 5562621.0, "logits/rejected": -15098393.6, "logps/chosen": -178.5631103515625, "logps/rejected": -238.3697265625, "loss": 0.31, "rewards/chosen": 0.992082675298055, "rewards/margins": 2.520226271947225, "rewards/rejected": -1.5281435966491699, "step": 12895 }, { "epoch": 0.6835396072403467, "grad_norm": 52.75, "kl": 4.181413650512695, "learning_rate": 5e-07, "logits/chosen": -32583104.0, "logits/rejected": -33036336.0, "logps/chosen": -244.25264485677084, "logps/rejected": -341.5049133300781, "loss": 0.4402, "rewards/chosen": 0.4380367199579875, "rewards/margins": 2.417693773905436, "rewards/rejected": -1.9796570539474487, "step": 12896 }, { "epoch": 0.6835926112421488, "grad_norm": 35.5, "kl": 2.2644290924072266, "learning_rate": 5e-07, "logits/chosen": 1459435.0, "logits/rejected": -11957824.0, "logps/chosen": -62.46622848510742, "logps/rejected": -372.4999186197917, "loss": 0.278, "rewards/chosen": -0.05784282088279724, "rewards/margins": 1.9317449629306793, "rewards/rejected": -1.9895877838134766, "step": 12897 }, { "epoch": 0.683645615243951, "grad_norm": 40.0, "kl": 0.7970733642578125, "learning_rate": 5e-07, "logits/chosen": -30564298.0, "logits/rejected": -18538744.0, "logps/chosen": -71.97561645507812, "logps/rejected": -271.85137939453125, "loss": 0.2894, "rewards/chosen": 0.5144184231758118, "rewards/margins": 2.5662737488746643, "rewards/rejected": -2.0518553256988525, "step": 12898 }, { "epoch": 0.683698619245753, "grad_norm": 45.25, "kl": 1.6999740600585938, "learning_rate": 5e-07, "logits/chosen": 1024724.25, "logits/rejected": 1955932.0, "logps/chosen": -204.7012939453125, "logps/rejected": -222.230810546875, "loss": 0.2966, "rewards/chosen": 0.5085695584615072, "rewards/margins": 2.0778072675069175, "rewards/rejected": -1.5692377090454102, "step": 12899 }, { "epoch": 0.6837516232475552, "grad_norm": 53.0, "kl": 2.1472997665405273, "learning_rate": 5e-07, "logits/chosen": -33444530.666666668, "logits/rejected": -49120644.0, "logps/chosen": -296.84039306640625, "logps/rejected": -380.3805847167969, "loss": 0.4291, "rewards/chosen": 0.14937671025594076, "rewards/margins": 2.2185479005177817, "rewards/rejected": -2.069171190261841, "step": 12900 }, { "epoch": 0.6838046272493573, "grad_norm": 60.25, "kl": 2.73660945892334, "learning_rate": 5e-07, "logits/chosen": -60642368.0, "logits/rejected": 19630668.0, "logps/chosen": -343.0246887207031, "logps/rejected": -287.55120849609375, "loss": 0.3755, "rewards/chosen": 0.4571648836135864, "rewards/margins": 1.7689483165740967, "rewards/rejected": -1.3117834329605103, "step": 12901 }, { "epoch": 0.6838576312511595, "grad_norm": 52.5, "kl": 0.8181638717651367, "learning_rate": 5e-07, "logits/chosen": -33474304.0, "logits/rejected": -23501608.0, "logps/chosen": -399.6864501953125, "logps/rejected": -239.2790323893229, "loss": 0.3559, "rewards/chosen": 0.5712882041931152, "rewards/margins": 1.641320006052653, "rewards/rejected": -1.0700318018595378, "step": 12902 }, { "epoch": 0.6839106352529616, "grad_norm": 34.5, "kl": 0.653228759765625, "learning_rate": 5e-07, "logits/chosen": -25782170.666666668, "logits/rejected": -22264934.4, "logps/chosen": -215.59611002604166, "logps/rejected": -250.18642578125, "loss": 0.2033, "rewards/chosen": 0.7291313012441, "rewards/margins": 3.07810648282369, "rewards/rejected": -2.34897518157959, "step": 12903 }, { "epoch": 0.6839636392547638, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6037407.5, "logits/rejected": -40854149.333333336, "logps/chosen": -326.6092529296875, "logps/rejected": -337.1441650390625, "loss": 0.1813, "rewards/chosen": 0.5864368677139282, "rewards/margins": 2.798375884691874, "rewards/rejected": -2.211939016977946, "step": 12904 }, { "epoch": 0.6840166432565659, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23146100.0, "logits/rejected": -61868928.0, "logps/chosen": -161.2274627685547, "logps/rejected": -435.60504150390625, "loss": 0.3291, "rewards/chosen": 0.11378850787878036, "rewards/margins": 2.4414557442069054, "rewards/rejected": -2.327667236328125, "step": 12905 }, { "epoch": 0.684069647258368, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46411738.666666664, "logits/rejected": -8011808.8, "logps/chosen": -409.1451822916667, "logps/rejected": -334.0636474609375, "loss": 0.2699, "rewards/chosen": 0.15701166788736978, "rewards/margins": 2.178594462076823, "rewards/rejected": -2.0215827941894533, "step": 12906 }, { "epoch": 0.6841226512601701, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62875381.333333336, "logits/rejected": -20143854.4, "logps/chosen": -298.80029296875, "logps/rejected": -217.1949462890625, "loss": 0.2453, "rewards/chosen": 0.44116469224294025, "rewards/margins": 2.351367290814718, "rewards/rejected": -1.9102025985717774, "step": 12907 }, { "epoch": 0.6841756552619723, "grad_norm": 56.75, "kl": 2.4952468872070312, "learning_rate": 5e-07, "logits/chosen": -30869378.666666668, "logits/rejected": 15105900.8, "logps/chosen": -360.59423828125, "logps/rejected": -179.35242919921876, "loss": 0.3509, "rewards/chosen": 0.12667696674664816, "rewards/margins": 1.9886245985825857, "rewards/rejected": -1.8619476318359376, "step": 12908 }, { "epoch": 0.6842286592637744, "grad_norm": 89.0, "kl": 2.5832338333129883, "learning_rate": 5e-07, "logits/chosen": -23651172.0, "logits/rejected": -32920906.0, "logps/chosen": -224.73721313476562, "logps/rejected": -291.37335205078125, "loss": 0.4079, "rewards/chosen": -0.12729011476039886, "rewards/margins": 0.9815327078104019, "rewards/rejected": -1.1088228225708008, "step": 12909 }, { "epoch": 0.6842816632655766, "grad_norm": 59.5, "kl": 3.337810516357422, "learning_rate": 5e-07, "logits/chosen": -37970184.0, "logits/rejected": -16680236.0, "logps/chosen": -535.2550659179688, "logps/rejected": -479.4873046875, "loss": 0.1762, "rewards/chosen": 1.942322015762329, "rewards/margins": 5.045342683792114, "rewards/rejected": -3.103020668029785, "step": 12910 }, { "epoch": 0.6843346672673787, "grad_norm": 59.0, "kl": 4.184940814971924, "learning_rate": 5e-07, "logits/chosen": -39652969.14285714, "logits/rejected": -9003267.0, "logps/chosen": -428.0176478794643, "logps/rejected": -142.59031677246094, "loss": 0.3728, "rewards/chosen": 1.1340257780892509, "rewards/margins": 1.4445070581776756, "rewards/rejected": -0.3104812800884247, "step": 12911 }, { "epoch": 0.6843876712691809, "grad_norm": 37.0, "kl": 1.157613754272461, "learning_rate": 5e-07, "logits/chosen": 2564515.3333333335, "logits/rejected": -13395626.4, "logps/chosen": -175.29618326822916, "logps/rejected": -369.485546875, "loss": 0.2034, "rewards/chosen": 0.6626805067062378, "rewards/margins": 3.3153326749801635, "rewards/rejected": -2.6526521682739257, "step": 12912 }, { "epoch": 0.684440675270983, "grad_norm": 39.0, "kl": 0.20349693298339844, "learning_rate": 5e-07, "logits/chosen": -47650884.0, "logits/rejected": -19154866.666666668, "logps/chosen": -286.04278564453125, "logps/rejected": -228.00545247395834, "loss": 0.2613, "rewards/chosen": -0.6477619409561157, "rewards/margins": 1.557146747907003, "rewards/rejected": -2.2049086888631186, "step": 12913 }, { "epoch": 0.6844936792727851, "grad_norm": 49.0, "kl": 2.6665687561035156, "learning_rate": 5e-07, "logits/chosen": -53912272.0, "logits/rejected": -16857512.0, "logps/chosen": -316.63092041015625, "logps/rejected": -244.18594360351562, "loss": 0.3546, "rewards/chosen": 0.14765459299087524, "rewards/margins": 2.2413483262062073, "rewards/rejected": -2.093693733215332, "step": 12914 }, { "epoch": 0.6845466832745872, "grad_norm": 45.25, "kl": 3.3264293670654297, "learning_rate": 5e-07, "logits/chosen": -2278859.0, "logits/rejected": -17163838.0, "logps/chosen": -516.7007446289062, "logps/rejected": -285.42840576171875, "loss": 0.2259, "rewards/chosen": 1.120207667350769, "rewards/margins": 3.4403949975967407, "rewards/rejected": -2.3201873302459717, "step": 12915 }, { "epoch": 0.6845996872763893, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45521720.0, "logits/rejected": -32131332.0, "logps/chosen": -505.5623474121094, "logps/rejected": -414.07928466796875, "loss": 0.276, "rewards/chosen": 0.2926879823207855, "rewards/margins": 3.2310172021389008, "rewards/rejected": -2.9383292198181152, "step": 12916 }, { "epoch": 0.6846526912781915, "grad_norm": 55.75, "kl": 1.3872737884521484, "learning_rate": 5e-07, "logits/chosen": -16180580.8, "logits/rejected": -17407360.0, "logps/chosen": -260.4573974609375, "logps/rejected": -199.2618408203125, "loss": 0.3699, "rewards/chosen": 0.3609994649887085, "rewards/margins": 2.1761128505071, "rewards/rejected": -1.8151133855183919, "step": 12917 }, { "epoch": 0.6847056952799936, "grad_norm": 36.25, "kl": 2.7071590423583984, "learning_rate": 5e-07, "logits/chosen": -26574578.666666668, "logits/rejected": -49413948.8, "logps/chosen": -314.74176025390625, "logps/rejected": -349.8919189453125, "loss": 0.1889, "rewards/chosen": 1.32449738184611, "rewards/margins": 4.005535093943278, "rewards/rejected": -2.681037712097168, "step": 12918 }, { "epoch": 0.6847586992817958, "grad_norm": 28.375, "kl": 1.8018827438354492, "learning_rate": 5e-07, "logits/chosen": -25532682.666666668, "logits/rejected": -21315558.4, "logps/chosen": -122.04368082682292, "logps/rejected": -259.8749267578125, "loss": 0.2635, "rewards/chosen": -0.08494579792022705, "rewards/margins": 3.162968325614929, "rewards/rejected": -3.247914123535156, "step": 12919 }, { "epoch": 0.6848117032835979, "grad_norm": 56.5, "kl": 2.735067367553711, "learning_rate": 5e-07, "logits/chosen": -31333413.333333332, "logits/rejected": -28772530.0, "logps/chosen": -175.77775065104166, "logps/rejected": -367.83502197265625, "loss": 0.4416, "rewards/chosen": 0.2854490478833516, "rewards/margins": 1.8329603870709736, "rewards/rejected": -1.547511339187622, "step": 12920 }, { "epoch": 0.6848647072854, "grad_norm": 32.25, "kl": 5.459342002868652, "learning_rate": 5e-07, "logits/chosen": -24974330.0, "logits/rejected": -28199552.0, "logps/chosen": -352.2454528808594, "logps/rejected": -497.29656982421875, "loss": 0.3581, "rewards/chosen": 0.32385072112083435, "rewards/margins": 4.030278533697128, "rewards/rejected": -3.706427812576294, "step": 12921 }, { "epoch": 0.6849177112872021, "grad_norm": 59.75, "kl": 1.687993049621582, "learning_rate": 5e-07, "logits/chosen": -65917427.2, "logits/rejected": -27110552.0, "logps/chosen": -388.8919677734375, "logps/rejected": -357.6719970703125, "loss": 0.3795, "rewards/chosen": 0.32845683097839357, "rewards/margins": 2.0025983969370524, "rewards/rejected": -1.674141565958659, "step": 12922 }, { "epoch": 0.6849707152890043, "grad_norm": 67.0, "kl": 3.2866878509521484, "learning_rate": 5e-07, "logits/chosen": 10561487.2, "logits/rejected": 73784677.33333333, "logps/chosen": -469.816796875, "logps/rejected": -229.13326009114584, "loss": 0.3428, "rewards/chosen": 1.0983336448669434, "rewards/margins": 2.5763244311014812, "rewards/rejected": -1.4779907862345378, "step": 12923 }, { "epoch": 0.6850237192908064, "grad_norm": 46.25, "kl": 3.058910369873047, "learning_rate": 5e-07, "logits/chosen": -18235590.85714286, "logits/rejected": -112137920.0, "logps/chosen": -217.16659109933036, "logps/rejected": -270.50384521484375, "loss": 0.5012, "rewards/chosen": 0.14926900182451522, "rewards/margins": 1.7254164389201574, "rewards/rejected": -1.576147437095642, "step": 12924 }, { "epoch": 0.6850767232926086, "grad_norm": 56.0, "kl": 0.6279740333557129, "learning_rate": 5e-07, "logits/chosen": -45222288.0, "logits/rejected": -31389430.0, "logps/chosen": -509.7209167480469, "logps/rejected": -236.6540985107422, "loss": 0.326, "rewards/chosen": 0.3367347717285156, "rewards/margins": 1.6675591468811035, "rewards/rejected": -1.330824375152588, "step": 12925 }, { "epoch": 0.6851297272944107, "grad_norm": 49.75, "kl": 2.3507776260375977, "learning_rate": 5e-07, "logits/chosen": -97744.75, "logits/rejected": 217364.4375, "logps/chosen": -147.22459411621094, "logps/rejected": -84.16835021972656, "loss": 0.3482, "rewards/chosen": 0.39533886313438416, "rewards/margins": 2.3191647231578827, "rewards/rejected": -1.9238258600234985, "step": 12926 }, { "epoch": 0.6851827312962129, "grad_norm": 41.0, "kl": 0.4940757751464844, "learning_rate": 5e-07, "logits/chosen": -20328064.0, "logits/rejected": -3601844.6666666665, "logps/chosen": -184.83734130859375, "logps/rejected": -125.26561482747395, "loss": 0.2603, "rewards/chosen": 0.3777424693107605, "rewards/margins": 2.1966247757275896, "rewards/rejected": -1.8188823064168294, "step": 12927 }, { "epoch": 0.685235735298015, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4341773.5, "logits/rejected": -38447.333333333336, "logps/chosen": -96.4166488647461, "logps/rejected": -424.1787923177083, "loss": 0.2481, "rewards/chosen": -0.20681554079055786, "rewards/margins": 1.6740767359733582, "rewards/rejected": -1.880892276763916, "step": 12928 }, { "epoch": 0.6852887392998172, "grad_norm": 50.75, "kl": 4.792764663696289, "learning_rate": 5e-07, "logits/chosen": -2977508.0, "logits/rejected": -17949882.0, "logps/chosen": -186.69829450334822, "logps/rejected": -171.29385375976562, "loss": 0.4407, "rewards/chosen": 0.6641007832118443, "rewards/margins": 2.128578322274344, "rewards/rejected": -1.4644775390625, "step": 12929 }, { "epoch": 0.6853417433016192, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63611144.0, "logits/rejected": -39526522.666666664, "logps/chosen": -361.20098876953125, "logps/rejected": -417.10546875, "loss": 0.1737, "rewards/chosen": 0.8353180289268494, "rewards/margins": 3.9844180146853128, "rewards/rejected": -3.1490999857584634, "step": 12930 }, { "epoch": 0.6853947473034214, "grad_norm": 56.25, "kl": 4.566840171813965, "learning_rate": 5e-07, "logits/chosen": -17702514.666666668, "logits/rejected": -97090024.0, "logps/chosen": -344.2349853515625, "logps/rejected": -802.4769287109375, "loss": 0.2938, "rewards/chosen": 1.121049404144287, "rewards/margins": 4.5544114112854, "rewards/rejected": -3.4333620071411133, "step": 12931 }, { "epoch": 0.6854477513052235, "grad_norm": 39.25, "kl": 2.263517379760742, "learning_rate": 5e-07, "logits/chosen": -5009718.333333333, "logits/rejected": -21413068.8, "logps/chosen": -186.16357421875, "logps/rejected": -243.478955078125, "loss": 0.2009, "rewards/chosen": 1.3438378969828289, "rewards/margins": 3.7229598681132003, "rewards/rejected": -2.3791219711303713, "step": 12932 }, { "epoch": 0.6855007553070257, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32072246.4, "logits/rejected": -52324368.0, "logps/chosen": -343.903955078125, "logps/rejected": -277.0464274088542, "loss": 0.1887, "rewards/chosen": 1.1200932502746581, "rewards/margins": 4.055211925506592, "rewards/rejected": -2.9351186752319336, "step": 12933 }, { "epoch": 0.6855537593088278, "grad_norm": 53.0, "kl": 3.106472969055176, "learning_rate": 5e-07, "logits/chosen": -33761158.4, "logits/rejected": -22606930.666666668, "logps/chosen": -712.907861328125, "logps/rejected": -227.0596923828125, "loss": 0.2827, "rewards/chosen": 1.233572006225586, "rewards/margins": 3.770420710245768, "rewards/rejected": -2.536848704020182, "step": 12934 }, { "epoch": 0.68560676331063, "grad_norm": 46.25, "kl": 1.098297119140625, "learning_rate": 5e-07, "logits/chosen": -25406642.666666668, "logits/rejected": -25745561.6, "logps/chosen": -352.4779052734375, "logps/rejected": -187.188525390625, "loss": 0.2061, "rewards/chosen": 1.0355051358540852, "rewards/margins": 3.106428368886312, "rewards/rejected": -2.0709232330322265, "step": 12935 }, { "epoch": 0.6856597673124321, "grad_norm": 61.5, "kl": 5.175304412841797, "learning_rate": 5e-07, "logits/chosen": -6852924.0, "logits/rejected": 2896283.75, "logps/chosen": -474.1582728794643, "logps/rejected": -727.5804443359375, "loss": 0.3751, "rewards/chosen": 0.9979406084333148, "rewards/margins": 3.3535071100507463, "rewards/rejected": -2.3555665016174316, "step": 12936 }, { "epoch": 0.6857127713142342, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -70998008.0, "logits/rejected": -64138107.428571425, "logps/chosen": -202.47634887695312, "logps/rejected": -521.81982421875, "loss": 0.1699, "rewards/chosen": -0.06139678880572319, "rewards/margins": 2.5537414013275077, "rewards/rejected": -2.615138190133231, "step": 12937 }, { "epoch": 0.6857657753160363, "grad_norm": 25.875, "kl": 0.04447364807128906, "learning_rate": 5e-07, "logits/chosen": 2960318.0, "logits/rejected": -23997893.333333332, "logps/chosen": -24.24655532836914, "logps/rejected": -497.6705322265625, "loss": 0.1912, "rewards/chosen": 0.5111948847770691, "rewards/margins": 3.6728363235791526, "rewards/rejected": -3.1616414388020835, "step": 12938 }, { "epoch": 0.6858187793178385, "grad_norm": 90.0, "kl": 0.2963237762451172, "learning_rate": 5e-07, "logits/chosen": -76529104.0, "logits/rejected": -99365824.0, "logps/chosen": -294.134521484375, "logps/rejected": -350.04217529296875, "loss": 0.32, "rewards/chosen": 0.2175096571445465, "rewards/margins": 1.9173586666584015, "rewards/rejected": -1.699849009513855, "step": 12939 }, { "epoch": 0.6858717833196406, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -127064464.0, "logits/rejected": -6119421.333333333, "logps/chosen": -443.89935302734375, "logps/rejected": -432.9051920572917, "loss": 0.1813, "rewards/chosen": 0.29755401611328125, "rewards/margins": 2.9623870849609375, "rewards/rejected": -2.6648330688476562, "step": 12940 }, { "epoch": 0.6859247873214428, "grad_norm": 38.5, "kl": 6.023039817810059, "learning_rate": 5e-07, "logits/chosen": -8548331.2, "logits/rejected": 13441040.0, "logps/chosen": -160.3699951171875, "logps/rejected": -372.4978841145833, "loss": 0.2986, "rewards/chosen": 0.9130619049072266, "rewards/margins": 3.4394182840983074, "rewards/rejected": -2.5263563791910806, "step": 12941 }, { "epoch": 0.6859777913232449, "grad_norm": 42.25, "kl": 0.22570037841796875, "learning_rate": 5e-07, "logits/chosen": -22934370.666666668, "logits/rejected": -7697455.0, "logps/chosen": -345.1003011067708, "logps/rejected": -258.09722900390625, "loss": 0.305, "rewards/chosen": 0.8358365694681803, "rewards/margins": 2.743912855784098, "rewards/rejected": -1.908076286315918, "step": 12942 }, { "epoch": 0.6860307953250471, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23270734.0, "logits/rejected": -43765712.0, "logps/chosen": -504.44671630859375, "logps/rejected": -375.0352783203125, "loss": 0.191, "rewards/chosen": 0.9818435907363892, "rewards/margins": 4.557513117790222, "rewards/rejected": -3.575669527053833, "step": 12943 }, { "epoch": 0.6860837993268492, "grad_norm": 52.5, "kl": 2.4533233642578125, "learning_rate": 5e-07, "logits/chosen": -17691545.6, "logits/rejected": -23456725.333333332, "logps/chosen": -262.280810546875, "logps/rejected": -226.07503255208334, "loss": 0.2331, "rewards/chosen": 1.026520347595215, "rewards/margins": 3.7558917363484703, "rewards/rejected": -2.7293713887532554, "step": 12944 }, { "epoch": 0.6861368033286513, "grad_norm": 53.5, "kl": 0.30055999755859375, "learning_rate": 5e-07, "logits/chosen": -68083432.0, "logits/rejected": -17788260.0, "logps/chosen": -349.1842346191406, "logps/rejected": -353.6822814941406, "loss": 0.2339, "rewards/chosen": 0.6107944250106812, "rewards/margins": 3.3978551626205444, "rewards/rejected": -2.7870607376098633, "step": 12945 }, { "epoch": 0.6861898073304534, "grad_norm": 74.5, "kl": 0.210479736328125, "learning_rate": 5e-07, "logits/chosen": -26573568.0, "logits/rejected": 1850048.5, "logps/chosen": -370.189404296875, "logps/rejected": -80.57962036132812, "loss": 0.3955, "rewards/chosen": -0.12879486083984376, "rewards/margins": 2.551096725463867, "rewards/rejected": -2.679891586303711, "step": 12946 }, { "epoch": 0.6862428113322556, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38832410.666666664, "logits/rejected": 34468102.4, "logps/chosen": -591.1726888020834, "logps/rejected": -208.0148681640625, "loss": 0.2411, "rewards/chosen": 0.6236195961634318, "rewards/margins": 2.423540631930033, "rewards/rejected": -1.7999210357666016, "step": 12947 }, { "epoch": 0.6862958153340577, "grad_norm": 40.0, "kl": 0.11705780029296875, "learning_rate": 5e-07, "logits/chosen": -21382992.0, "logits/rejected": 5139248.8, "logps/chosen": -206.3580525716146, "logps/rejected": -417.13193359375, "loss": 0.2111, "rewards/chosen": 0.4665926694869995, "rewards/margins": 2.8746300458908083, "rewards/rejected": -2.4080373764038088, "step": 12948 }, { "epoch": 0.6863488193358599, "grad_norm": 64.5, "kl": 2.038320541381836, "learning_rate": 5e-07, "logits/chosen": -16039748.8, "logits/rejected": -57769626.666666664, "logps/chosen": -268.0728759765625, "logps/rejected": -240.15450032552084, "loss": 0.3484, "rewards/chosen": 0.6537199497222901, "rewards/margins": 2.0433138688405355, "rewards/rejected": -1.3895939191182454, "step": 12949 }, { "epoch": 0.686401823337662, "grad_norm": 40.0, "kl": 0.15317726135253906, "learning_rate": 5e-07, "logits/chosen": -4839299.0, "logits/rejected": -36491845.333333336, "logps/chosen": -143.19187927246094, "logps/rejected": -394.4337565104167, "loss": 0.1873, "rewards/chosen": -0.3382234573364258, "rewards/margins": 2.9037097295125327, "rewards/rejected": -3.2419331868489585, "step": 12950 }, { "epoch": 0.6864548273394642, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43288648.0, "logits/rejected": -121877192.0, "logps/chosen": -357.4884440104167, "logps/rejected": -454.6976623535156, "loss": 0.4319, "rewards/chosen": 0.04370346665382385, "rewards/margins": 1.1702355444431305, "rewards/rejected": -1.1265320777893066, "step": 12951 }, { "epoch": 0.6865078313412663, "grad_norm": 49.0, "kl": 1.2843284606933594, "learning_rate": 5e-07, "logits/chosen": -46669525.333333336, "logits/rejected": 3929384.4, "logps/chosen": -603.7036946614584, "logps/rejected": -233.443017578125, "loss": 0.1949, "rewards/chosen": 1.9728205998738606, "rewards/margins": 4.025510152180989, "rewards/rejected": -2.052689552307129, "step": 12952 }, { "epoch": 0.6865608353430684, "grad_norm": 54.25, "kl": 0.16217517852783203, "learning_rate": 5e-07, "logits/chosen": -35685296.0, "logits/rejected": -30132211.2, "logps/chosen": -356.2620442708333, "logps/rejected": -261.4388671875, "loss": 0.2422, "rewards/chosen": 1.196876049041748, "rewards/margins": 3.1271510124206543, "rewards/rejected": -1.9302749633789062, "step": 12953 }, { "epoch": 0.6866138393448705, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66790584.0, "logits/rejected": -61516688.0, "logps/chosen": -321.87664794921875, "logps/rejected": -242.7469685872396, "loss": 0.2173, "rewards/chosen": 0.8004531860351562, "rewards/margins": 2.9711058934529624, "rewards/rejected": -2.170652707417806, "step": 12954 }, { "epoch": 0.6866668433466727, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23054752.0, "logits/rejected": -73382960.0, "logps/chosen": -352.7809244791667, "logps/rejected": -471.1485900878906, "loss": 0.4043, "rewards/chosen": -0.005572378635406494, "rewards/margins": 2.3394989371299744, "rewards/rejected": -2.345071315765381, "step": 12955 }, { "epoch": 0.6867198473484748, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94615136.0, "logits/rejected": -12978892.0, "logps/chosen": -394.220703125, "logps/rejected": -110.22564697265625, "loss": 0.2145, "rewards/chosen": 0.6278213858604431, "rewards/margins": 2.769714136918386, "rewards/rejected": -2.141892751057943, "step": 12956 }, { "epoch": 0.686772851350277, "grad_norm": 57.0, "kl": 2.6266441345214844, "learning_rate": 5e-07, "logits/chosen": -41868341.333333336, "logits/rejected": -33444488.0, "logps/chosen": -409.0946451822917, "logps/rejected": -475.4451904296875, "loss": 0.33, "rewards/chosen": 0.7238538265228271, "rewards/margins": 3.563915967941284, "rewards/rejected": -2.840062141418457, "step": 12957 }, { "epoch": 0.6868258553520791, "grad_norm": 63.5, "kl": 1.41510009765625, "learning_rate": 5e-07, "logits/chosen": -12032288.0, "logits/rejected": 518789.0, "logps/chosen": -496.25201416015625, "logps/rejected": -284.1417541503906, "loss": 0.2781, "rewards/chosen": 1.227257251739502, "rewards/margins": 2.1285685300827026, "rewards/rejected": -0.9013112783432007, "step": 12958 }, { "epoch": 0.6868788593538813, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5547942.0, "logits/rejected": 4113377.25, "logps/chosen": -37.85499572753906, "logps/rejected": -350.1341247558594, "loss": 0.2677, "rewards/chosen": 0.5036673545837402, "rewards/margins": 3.15075945854187, "rewards/rejected": -2.64709210395813, "step": 12959 }, { "epoch": 0.6869318633556833, "grad_norm": 41.75, "kl": 6.614863395690918, "learning_rate": 5e-07, "logits/chosen": -18067440.0, "logits/rejected": -30126040.0, "logps/chosen": -241.61965942382812, "logps/rejected": -264.44305419921875, "loss": 0.3223, "rewards/chosen": 0.9510654211044312, "rewards/margins": 3.4141019582748413, "rewards/rejected": -2.46303653717041, "step": 12960 }, { "epoch": 0.6869848673574855, "grad_norm": 62.0, "kl": 3.899280548095703, "learning_rate": 5e-07, "logits/chosen": -37891924.0, "logits/rejected": -34596768.0, "logps/chosen": -386.5108947753906, "logps/rejected": -260.16473388671875, "loss": 0.3505, "rewards/chosen": 0.8111820816993713, "rewards/margins": 2.788254201412201, "rewards/rejected": -1.9770721197128296, "step": 12961 }, { "epoch": 0.6870378713592876, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23072642.666666668, "logits/rejected": 16582000.0, "logps/chosen": -133.71484375, "logps/rejected": -302.160791015625, "loss": 0.2257, "rewards/chosen": 0.2678612271944682, "rewards/margins": 3.035043958822886, "rewards/rejected": -2.767182731628418, "step": 12962 }, { "epoch": 0.6870908753610898, "grad_norm": 39.25, "kl": 2.2535791397094727, "learning_rate": 5e-07, "logits/chosen": 8763350.4, "logits/rejected": -26628650.666666668, "logps/chosen": -149.23023681640626, "logps/rejected": -162.95808919270834, "loss": 0.4172, "rewards/chosen": -0.279648494720459, "rewards/margins": 1.84065211613973, "rewards/rejected": -2.120300610860189, "step": 12963 }, { "epoch": 0.6871438793628919, "grad_norm": 30.875, "kl": 0.37715911865234375, "learning_rate": 5e-07, "logits/chosen": -1865315.8333333333, "logits/rejected": -304014.8, "logps/chosen": -247.0223592122396, "logps/rejected": -161.7473388671875, "loss": 0.1659, "rewards/chosen": 1.1072533925374348, "rewards/margins": 3.8295867284139, "rewards/rejected": -2.7223333358764648, "step": 12964 }, { "epoch": 0.6871968833646941, "grad_norm": 45.0, "kl": 0.3803424835205078, "learning_rate": 5e-07, "logits/chosen": -28556702.0, "logits/rejected": -38302596.0, "logps/chosen": -102.01416778564453, "logps/rejected": -546.5999145507812, "loss": 0.2949, "rewards/chosen": 0.1976596862077713, "rewards/margins": 2.605298474431038, "rewards/rejected": -2.4076387882232666, "step": 12965 }, { "epoch": 0.6872498873664962, "grad_norm": 87.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32767688.0, "logits/rejected": -4932814.0, "logps/chosen": -229.78651428222656, "logps/rejected": -117.08329264322917, "loss": 0.3489, "rewards/chosen": -0.6904277801513672, "rewards/margins": 0.5898806254069011, "rewards/rejected": -1.2803084055582683, "step": 12966 }, { "epoch": 0.6873028913682983, "grad_norm": 44.0, "kl": 0.3388252258300781, "learning_rate": 5e-07, "logits/chosen": -21068918.0, "logits/rejected": -29290582.0, "logps/chosen": -431.56878662109375, "logps/rejected": -160.32740783691406, "loss": 0.1909, "rewards/chosen": 0.8826957941055298, "rewards/margins": 4.110374569892883, "rewards/rejected": -3.2276787757873535, "step": 12967 }, { "epoch": 0.6873558953701004, "grad_norm": 54.25, "kl": 1.5965194702148438, "learning_rate": 5e-07, "logits/chosen": -26674104.0, "logits/rejected": -26169652.0, "logps/chosen": -448.8568522135417, "logps/rejected": -458.81805419921875, "loss": 0.2746, "rewards/chosen": 1.1206684112548828, "rewards/margins": 3.6911916732788086, "rewards/rejected": -2.570523262023926, "step": 12968 }, { "epoch": 0.6874088993719025, "grad_norm": 63.75, "kl": 0.644801139831543, "learning_rate": 5e-07, "logits/chosen": -13458595.0, "logits/rejected": -3369832.75, "logps/chosen": -137.23660278320312, "logps/rejected": -498.93280029296875, "loss": 0.2627, "rewards/chosen": 0.5924625396728516, "rewards/margins": 2.8088369369506836, "rewards/rejected": -2.216374397277832, "step": 12969 }, { "epoch": 0.6874619033737047, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46015386.666666664, "logits/rejected": -17717248.0, "logps/chosen": -491.7762858072917, "logps/rejected": -430.82578125, "loss": 0.2161, "rewards/chosen": 0.08519796530405681, "rewards/margins": 2.956589452425639, "rewards/rejected": -2.871391487121582, "step": 12970 }, { "epoch": 0.6875149073755068, "grad_norm": 40.25, "kl": 1.5862293243408203, "learning_rate": 5e-07, "logits/chosen": 2668037.2, "logits/rejected": 33440730.666666668, "logps/chosen": -175.08341064453126, "logps/rejected": -456.0899251302083, "loss": 0.301, "rewards/chosen": 0.40696134567260744, "rewards/margins": 3.933586343129476, "rewards/rejected": -3.5266249974568686, "step": 12971 }, { "epoch": 0.687567911377309, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27419580.0, "logits/rejected": -14634626.0, "logps/chosen": -538.3748168945312, "logps/rejected": -303.7744140625, "loss": 0.3713, "rewards/chosen": 0.09615707397460938, "rewards/margins": 1.4764313697814941, "rewards/rejected": -1.3802742958068848, "step": 12972 }, { "epoch": 0.6876209153791111, "grad_norm": 42.75, "kl": 2.558795928955078, "learning_rate": 5e-07, "logits/chosen": -61490901.333333336, "logits/rejected": -8225388.0, "logps/chosen": -414.731201171875, "logps/rejected": -128.22213134765624, "loss": 0.2892, "rewards/chosen": -0.08704617619514465, "rewards/margins": 1.8429121673107147, "rewards/rejected": -1.9299583435058594, "step": 12973 }, { "epoch": 0.6876739193809133, "grad_norm": 45.0, "kl": 0.6932926177978516, "learning_rate": 5e-07, "logits/chosen": -16480694.0, "logits/rejected": -11538840.0, "logps/chosen": -152.96310424804688, "logps/rejected": -259.8584289550781, "loss": 0.3153, "rewards/chosen": 0.3782959580421448, "rewards/margins": 2.8413079380989075, "rewards/rejected": -2.4630119800567627, "step": 12974 }, { "epoch": 0.6877269233827153, "grad_norm": 47.0, "kl": 6.559324264526367, "learning_rate": 5e-07, "logits/chosen": -11283256.0, "logits/rejected": -54864088.0, "logps/chosen": -435.1058872767857, "logps/rejected": -442.4147644042969, "loss": 0.4708, "rewards/chosen": 0.43995652879987446, "rewards/margins": 1.940093857901437, "rewards/rejected": -1.5001373291015625, "step": 12975 }, { "epoch": 0.6877799273845175, "grad_norm": 38.75, "kl": 0.3510627746582031, "learning_rate": 5e-07, "logits/chosen": -10378828.8, "logits/rejected": -29902352.0, "logps/chosen": -236.429736328125, "logps/rejected": -432.7652994791667, "loss": 0.2031, "rewards/chosen": 0.9098199844360352, "rewards/margins": 4.9279881159464525, "rewards/rejected": -4.018168131510417, "step": 12976 }, { "epoch": 0.6878329313863196, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12848365.0, "logits/rejected": 409553.3333333333, "logps/chosen": -105.88046264648438, "logps/rejected": -327.6918538411458, "loss": 0.313, "rewards/chosen": -0.17348863184452057, "rewards/margins": 1.4235917578140895, "rewards/rejected": -1.59708038965861, "step": 12977 }, { "epoch": 0.6878859353881218, "grad_norm": 38.0, "kl": 2.475433349609375, "learning_rate": 5e-07, "logits/chosen": -36327788.8, "logits/rejected": -8449571.333333334, "logps/chosen": -229.533251953125, "logps/rejected": -248.98771158854166, "loss": 0.28, "rewards/chosen": 0.7481911659240723, "rewards/margins": 3.6338516553243005, "rewards/rejected": -2.885660489400228, "step": 12978 }, { "epoch": 0.6879389393899239, "grad_norm": 32.25, "kl": 1.7596149444580078, "learning_rate": 5e-07, "logits/chosen": 3969405.6, "logits/rejected": -49242272.0, "logps/chosen": -656.20078125, "logps/rejected": -524.4396158854166, "loss": 0.2257, "rewards/chosen": 1.4414046287536622, "rewards/margins": 4.509769090016683, "rewards/rejected": -3.068364461263021, "step": 12979 }, { "epoch": 0.6879919433917261, "grad_norm": 48.0, "kl": 0.6608848571777344, "learning_rate": 5e-07, "logits/chosen": -15068513.333333334, "logits/rejected": 14794691.2, "logps/chosen": -225.6669921875, "logps/rejected": -448.80146484375, "loss": 0.2915, "rewards/chosen": 0.019739786783854168, "rewards/margins": 2.3717280069986977, "rewards/rejected": -2.3519882202148437, "step": 12980 }, { "epoch": 0.6880449473935282, "grad_norm": 40.5, "kl": 0.6593780517578125, "learning_rate": 5e-07, "logits/chosen": -22780301.333333332, "logits/rejected": -15302384.0, "logps/chosen": -182.549072265625, "logps/rejected": -334.260205078125, "loss": 0.246, "rewards/chosen": 0.49099870522816974, "rewards/margins": 2.909483774503072, "rewards/rejected": -2.4184850692749023, "step": 12981 }, { "epoch": 0.6880979513953304, "grad_norm": 47.25, "kl": 1.325758934020996, "learning_rate": 5e-07, "logits/chosen": -35992668.0, "logits/rejected": -30052156.0, "logps/chosen": -443.3059387207031, "logps/rejected": -345.3426513671875, "loss": 0.223, "rewards/chosen": 0.5562665462493896, "rewards/margins": 3.785559892654419, "rewards/rejected": -3.2292933464050293, "step": 12982 }, { "epoch": 0.6881509553971324, "grad_norm": 28.5, "kl": 0.9584484100341797, "learning_rate": 5e-07, "logits/chosen": -7256866.0, "logits/rejected": -19960128.0, "logps/chosen": -770.5784505208334, "logps/rejected": -461.425244140625, "loss": 0.1082, "rewards/chosen": 2.252507050832113, "rewards/margins": 4.975148804982503, "rewards/rejected": -2.7226417541503904, "step": 12983 }, { "epoch": 0.6882039593989346, "grad_norm": 59.5, "kl": 3.5986480712890625, "learning_rate": 5e-07, "logits/chosen": -70062560.0, "logits/rejected": -29155897.6, "logps/chosen": -991.1981608072916, "logps/rejected": -181.361083984375, "loss": 0.0983, "rewards/chosen": 2.985978126525879, "rewards/margins": 5.682711791992188, "rewards/rejected": -2.6967336654663088, "step": 12984 }, { "epoch": 0.6882569634007367, "grad_norm": 47.0, "kl": 0.9316329956054688, "learning_rate": 5e-07, "logits/chosen": -13666337.333333334, "logits/rejected": -59835848.0, "logps/chosen": -267.7289632161458, "logps/rejected": -316.9809265136719, "loss": 0.3996, "rewards/chosen": 0.2230912446975708, "rewards/margins": 2.102929949760437, "rewards/rejected": -1.8798387050628662, "step": 12985 }, { "epoch": 0.6883099674025389, "grad_norm": 29.5, "kl": 1.4247074127197266, "learning_rate": 5e-07, "logits/chosen": -2449056.25, "logits/rejected": -26298628.0, "logps/chosen": -120.65876007080078, "logps/rejected": -316.2894287109375, "loss": 0.2807, "rewards/chosen": 0.2942160964012146, "rewards/margins": 2.7831584811210632, "rewards/rejected": -2.4889423847198486, "step": 12986 }, { "epoch": 0.688362971404341, "grad_norm": 59.75, "kl": 0.37105846405029297, "learning_rate": 5e-07, "logits/chosen": -19198432.0, "logits/rejected": -29192166.0, "logps/chosen": -253.2037353515625, "logps/rejected": -257.90887451171875, "loss": 0.4115, "rewards/chosen": 0.01469281812508901, "rewards/margins": 2.0076989283164344, "rewards/rejected": -1.9930061101913452, "step": 12987 }, { "epoch": 0.6884159754061432, "grad_norm": 47.5, "kl": 0.6861438751220703, "learning_rate": 5e-07, "logits/chosen": -4187628.6666666665, "logits/rejected": -6675829.2, "logps/chosen": -107.08944702148438, "logps/rejected": -165.31279296875, "loss": 0.2958, "rewards/chosen": 1.0970985094706218, "rewards/margins": 2.2149219195048016, "rewards/rejected": -1.1178234100341797, "step": 12988 }, { "epoch": 0.6884689794079453, "grad_norm": 65.5, "kl": 0.4649848937988281, "learning_rate": 5e-07, "logits/chosen": 30929334.0, "logits/rejected": -719812.6666666666, "logps/chosen": -358.8727722167969, "logps/rejected": -248.0325927734375, "loss": 0.2739, "rewards/chosen": -0.16883468627929688, "rewards/margins": 1.7093116442362468, "rewards/rejected": -1.8781463305155437, "step": 12989 }, { "epoch": 0.6885219834097475, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66415152.0, "logits/rejected": -42226244.571428575, "logps/chosen": -640.8847045898438, "logps/rejected": -348.99295479910717, "loss": 0.1378, "rewards/chosen": 0.957598865032196, "rewards/margins": 3.5737604839461192, "rewards/rejected": -2.616161618913923, "step": 12990 }, { "epoch": 0.6885749874115495, "grad_norm": 23.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5871639.0, "logits/rejected": -19414893.714285713, "logps/chosen": -17.147371292114258, "logps/rejected": -209.18289620535714, "loss": 0.1465, "rewards/chosen": 0.7017269134521484, "rewards/margins": 3.4101638793945312, "rewards/rejected": -2.708436965942383, "step": 12991 }, { "epoch": 0.6886279914133517, "grad_norm": 75.0, "kl": 1.4026908874511719, "learning_rate": 5e-07, "logits/chosen": 3486402.0, "logits/rejected": -19642284.0, "logps/chosen": -347.360595703125, "logps/rejected": -283.47845458984375, "loss": 0.3439, "rewards/chosen": 0.5732795000076294, "rewards/margins": 1.4979295134544373, "rewards/rejected": -0.9246500134468079, "step": 12992 }, { "epoch": 0.6886809954151538, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19693728.0, "logits/rejected": -30848461.333333332, "logps/chosen": -478.777587890625, "logps/rejected": -311.9842936197917, "loss": 0.1204, "rewards/chosen": 1.0201339721679688, "rewards/margins": 4.139652887980143, "rewards/rejected": -3.1195189158121743, "step": 12993 }, { "epoch": 0.688733999416956, "grad_norm": 53.25, "kl": 3.683013916015625, "learning_rate": 5e-07, "logits/chosen": -16207525.0, "logits/rejected": -8943279.0, "logps/chosen": -403.608642578125, "logps/rejected": -366.43878173828125, "loss": 0.278, "rewards/chosen": 0.8340887427330017, "rewards/margins": 2.9377588629722595, "rewards/rejected": -2.103670120239258, "step": 12994 }, { "epoch": 0.6887870034187581, "grad_norm": 60.75, "kl": 0.11014556884765625, "learning_rate": 5e-07, "logits/chosen": -33347446.4, "logits/rejected": 4617934.666666667, "logps/chosen": -529.0984375, "logps/rejected": -150.9876708984375, "loss": 0.3181, "rewards/chosen": 1.1261329650878906, "rewards/margins": 1.7589290142059326, "rewards/rejected": -0.632796049118042, "step": 12995 }, { "epoch": 0.6888400074205603, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27730990.0, "logits/rejected": -44697192.0, "logps/chosen": -187.80491638183594, "logps/rejected": -408.4049072265625, "loss": 0.3374, "rewards/chosen": -0.13704219460487366, "rewards/margins": 1.8952969014644623, "rewards/rejected": -2.032339096069336, "step": 12996 }, { "epoch": 0.6888930114223624, "grad_norm": 46.75, "kl": 0.8115558624267578, "learning_rate": 5e-07, "logits/chosen": -44620416.0, "logits/rejected": -19612216.0, "logps/chosen": -271.70635986328125, "logps/rejected": -259.7154541015625, "loss": 0.2698, "rewards/chosen": 0.08515186607837677, "rewards/margins": 4.491896823048592, "rewards/rejected": -4.406744956970215, "step": 12997 }, { "epoch": 0.6889460154241646, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74234416.0, "logits/rejected": -50377064.0, "logps/chosen": -852.4710083007812, "logps/rejected": -477.0228576660156, "loss": 0.1468, "rewards/chosen": 1.3178402185440063, "rewards/margins": 4.284838557243347, "rewards/rejected": -2.966998338699341, "step": 12998 }, { "epoch": 0.6889990194259666, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 161039797.33333334, "logits/rejected": -21330956.8, "logps/chosen": -393.1389973958333, "logps/rejected": -344.980322265625, "loss": 0.1644, "rewards/chosen": 0.6323636770248413, "rewards/margins": 3.8996381521224976, "rewards/rejected": -3.2672744750976563, "step": 12999 }, { "epoch": 0.6890520234277688, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10343696.0, "logits/rejected": -5755095.2, "logps/chosen": -239.79536946614584, "logps/rejected": -194.493408203125, "loss": 0.1909, "rewards/chosen": 0.8063163757324219, "rewards/margins": 3.6099292755126955, "rewards/rejected": -2.8036128997802736, "step": 13000 }, { "epoch": 0.6891050274295709, "grad_norm": 39.75, "kl": 0.4963417053222656, "learning_rate": 5e-07, "logits/chosen": -11181493.0, "logits/rejected": -22513168.0, "logps/chosen": -193.6258544921875, "logps/rejected": -265.32135009765625, "loss": 0.3253, "rewards/chosen": 0.16920509934425354, "rewards/margins": 2.163316994905472, "rewards/rejected": -1.9941118955612183, "step": 13001 }, { "epoch": 0.6891580314313731, "grad_norm": 39.0, "kl": 0.3924846649169922, "learning_rate": 5e-07, "logits/chosen": -9920044.8, "logits/rejected": -13823698.666666666, "logps/chosen": -203.4521240234375, "logps/rejected": -76.71340433756511, "loss": 0.2184, "rewards/chosen": 1.2446781158447267, "rewards/margins": 3.6070152282714845, "rewards/rejected": -2.362337112426758, "step": 13002 }, { "epoch": 0.6892110354331752, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18503052.0, "logits/rejected": -13061681.0, "logps/chosen": -156.73497009277344, "logps/rejected": -231.55575561523438, "loss": 0.2959, "rewards/chosen": 0.03523407131433487, "rewards/margins": 2.315265752375126, "rewards/rejected": -2.280031681060791, "step": 13003 }, { "epoch": 0.6892640394349774, "grad_norm": 38.25, "kl": 0.17060089111328125, "learning_rate": 5e-07, "logits/chosen": -16197656.0, "logits/rejected": -12570309.0, "logps/chosen": -273.365966796875, "logps/rejected": -599.98046875, "loss": 0.1843, "rewards/chosen": 1.045472264289856, "rewards/margins": 5.283949494361877, "rewards/rejected": -4.2384772300720215, "step": 13004 }, { "epoch": 0.6893170434367795, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16795236.0, "logits/rejected": -37394124.8, "logps/chosen": -486.9838460286458, "logps/rejected": -497.678271484375, "loss": 0.301, "rewards/chosen": 0.70928955078125, "rewards/margins": 1.8259567260742187, "rewards/rejected": -1.1166671752929687, "step": 13005 }, { "epoch": 0.6893700474385817, "grad_norm": 41.75, "kl": 1.2456092834472656, "learning_rate": 5e-07, "logits/chosen": 737651.2, "logits/rejected": -14718864.0, "logps/chosen": -183.69952392578125, "logps/rejected": -329.7593587239583, "loss": 0.1658, "rewards/chosen": 1.4746023178100587, "rewards/margins": 4.716737683614095, "rewards/rejected": -3.2421353658040366, "step": 13006 }, { "epoch": 0.6894230514403837, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9099626.0, "logits/rejected": -28299772.0, "logps/chosen": -124.17767333984375, "logps/rejected": -461.553955078125, "loss": 0.2584, "rewards/chosen": 0.17183934152126312, "rewards/margins": 3.4530056565999985, "rewards/rejected": -3.2811663150787354, "step": 13007 }, { "epoch": 0.6894760554421859, "grad_norm": 43.25, "kl": 5.268619537353516, "learning_rate": 5e-07, "logits/chosen": -543546.3333333334, "logits/rejected": 51762848.0, "logps/chosen": -180.47894287109375, "logps/rejected": -401.5495361328125, "loss": 0.2408, "rewards/chosen": 0.7495336532592773, "rewards/margins": 3.2107475280761717, "rewards/rejected": -2.4612138748168944, "step": 13008 }, { "epoch": 0.689529059443988, "grad_norm": 38.25, "kl": 2.039196014404297, "learning_rate": 5e-07, "logits/chosen": 3037644.5, "logits/rejected": -34091932.0, "logps/chosen": -142.37185668945312, "logps/rejected": -474.3904724121094, "loss": 0.2614, "rewards/chosen": 0.5686831474304199, "rewards/margins": 3.9443492889404297, "rewards/rejected": -3.3756661415100098, "step": 13009 }, { "epoch": 0.6895820634457902, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1814203.625, "logits/rejected": -44260882.28571428, "logps/chosen": -275.38177490234375, "logps/rejected": -311.85201590401783, "loss": 0.1315, "rewards/chosen": 1.0097473859786987, "rewards/margins": 3.28626811504364, "rewards/rejected": -2.2765207290649414, "step": 13010 }, { "epoch": 0.6896350674475923, "grad_norm": 56.5, "kl": 1.5563163757324219, "learning_rate": 5e-07, "logits/chosen": -12301684.8, "logits/rejected": -27249960.0, "logps/chosen": -92.2726318359375, "logps/rejected": -433.4359130859375, "loss": 0.2842, "rewards/chosen": 0.7345060348510742, "rewards/margins": 3.1742745081583656, "rewards/rejected": -2.4397684733072915, "step": 13011 }, { "epoch": 0.6896880714493945, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38117094.4, "logits/rejected": -50188992.0, "logps/chosen": -410.03916015625, "logps/rejected": -526.7235107421875, "loss": 0.2276, "rewards/chosen": 0.8604576110839843, "rewards/margins": 3.7280729929606116, "rewards/rejected": -2.8676153818766275, "step": 13012 }, { "epoch": 0.6897410754511966, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36939776.0, "logits/rejected": -5126674.0, "logps/chosen": -186.17788696289062, "logps/rejected": -311.718017578125, "loss": 0.3463, "rewards/chosen": -0.10949211567640305, "rewards/margins": 1.6907226517796516, "rewards/rejected": -1.8002147674560547, "step": 13013 }, { "epoch": 0.6897940794529988, "grad_norm": 46.0, "kl": 2.560138702392578, "learning_rate": 5e-07, "logits/chosen": -29022892.8, "logits/rejected": -1817719.6666666667, "logps/chosen": -280.1857421875, "logps/rejected": -548.9903157552084, "loss": 0.3197, "rewards/chosen": 0.8358230590820312, "rewards/margins": 3.372917811075846, "rewards/rejected": -2.537094751993815, "step": 13014 }, { "epoch": 0.6898470834548008, "grad_norm": 61.0, "kl": 1.3924064636230469, "learning_rate": 5e-07, "logits/chosen": -33159385.6, "logits/rejected": -24235450.666666668, "logps/chosen": -221.964453125, "logps/rejected": -320.24749755859375, "loss": 0.3852, "rewards/chosen": -0.00977703332901001, "rewards/margins": 2.711914416154226, "rewards/rejected": -2.721691449483236, "step": 13015 }, { "epoch": 0.689900087456603, "grad_norm": 44.75, "kl": 1.6980361938476562, "learning_rate": 5e-07, "logits/chosen": -16533121.333333334, "logits/rejected": -11404814.0, "logps/chosen": -175.10009765625, "logps/rejected": -96.17427062988281, "loss": 0.3616, "rewards/chosen": 0.791219154993693, "rewards/margins": 3.173696438471476, "rewards/rejected": -2.382477283477783, "step": 13016 }, { "epoch": 0.6899530914584051, "grad_norm": 39.25, "kl": 4.508411407470703, "learning_rate": 5e-07, "logits/chosen": -49293616.0, "logits/rejected": -12837283.2, "logps/chosen": -282.983642578125, "logps/rejected": -232.932470703125, "loss": 0.2241, "rewards/chosen": 1.0021380583445232, "rewards/margins": 3.256111828486125, "rewards/rejected": -2.2539737701416014, "step": 13017 }, { "epoch": 0.6900060954602072, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38172867.2, "logits/rejected": -32184816.0, "logps/chosen": -465.297265625, "logps/rejected": -351.1688639322917, "loss": 0.2867, "rewards/chosen": 0.44727387428283694, "rewards/margins": 2.9392650763193764, "rewards/rejected": -2.4919912020365396, "step": 13018 }, { "epoch": 0.6900590994620094, "grad_norm": 32.0, "kl": 1.4661121368408203, "learning_rate": 5e-07, "logits/chosen": -1225094.0, "logits/rejected": -13937640.0, "logps/chosen": -142.2294464111328, "logps/rejected": -192.89552307128906, "loss": 0.2678, "rewards/chosen": 0.5347647666931152, "rewards/margins": 3.7058868408203125, "rewards/rejected": -3.1711220741271973, "step": 13019 }, { "epoch": 0.6901121034638115, "grad_norm": 60.25, "kl": 3.440117835998535, "learning_rate": 5e-07, "logits/chosen": -51419990.85714286, "logits/rejected": -45766664.0, "logps/chosen": -293.02650669642856, "logps/rejected": -690.67919921875, "loss": 0.3728, "rewards/chosen": 0.7963989121573312, "rewards/margins": 4.16973260470799, "rewards/rejected": -3.373333692550659, "step": 13020 }, { "epoch": 0.6901651074656137, "grad_norm": 37.5, "kl": 2.76312255859375, "learning_rate": 5e-07, "logits/chosen": -35098184.0, "logits/rejected": -12307625.0, "logps/chosen": -269.794189453125, "logps/rejected": -318.4186096191406, "loss": 0.2356, "rewards/chosen": 0.6177462339401245, "rewards/margins": 3.5035191774368286, "rewards/rejected": -2.885772943496704, "step": 13021 }, { "epoch": 0.6902181114674157, "grad_norm": 55.75, "kl": 2.880340576171875, "learning_rate": 5e-07, "logits/chosen": -11069700.0, "logits/rejected": -17963200.0, "logps/chosen": -937.3644409179688, "logps/rejected": -442.61199951171875, "loss": 0.1168, "rewards/chosen": 2.5873355865478516, "rewards/margins": 6.1366002559661865, "rewards/rejected": -3.549264669418335, "step": 13022 }, { "epoch": 0.6902711154692179, "grad_norm": 43.25, "kl": 0.8463535308837891, "learning_rate": 5e-07, "logits/chosen": 319135.5, "logits/rejected": -28456874.0, "logps/chosen": -97.21995544433594, "logps/rejected": -141.12103271484375, "loss": 0.3879, "rewards/chosen": 0.38315141201019287, "rewards/margins": 1.1995347142219543, "rewards/rejected": -0.8163833022117615, "step": 13023 }, { "epoch": 0.69032411947102, "grad_norm": 44.0, "kl": 0.46985435485839844, "learning_rate": 5e-07, "logits/chosen": 2337637.0, "logits/rejected": -24472864.0, "logps/chosen": -32.6002197265625, "logps/rejected": -128.6627197265625, "loss": 0.2327, "rewards/chosen": 1.1715339422225952, "rewards/margins": 2.739926014627729, "rewards/rejected": -1.568392072405134, "step": 13024 }, { "epoch": 0.6903771234728222, "grad_norm": 38.75, "kl": 0.7940025329589844, "learning_rate": 5e-07, "logits/chosen": -28046936.0, "logits/rejected": -10350205.0, "logps/chosen": -283.49456787109375, "logps/rejected": -78.41142272949219, "loss": 0.237, "rewards/chosen": 0.7826151847839355, "rewards/margins": 3.5097479820251465, "rewards/rejected": -2.727132797241211, "step": 13025 }, { "epoch": 0.6904301274746243, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2411464.0, "logits/rejected": -14420460.8, "logps/chosen": -357.8562825520833, "logps/rejected": -433.32763671875, "loss": 0.2948, "rewards/chosen": -0.6262908776601156, "rewards/margins": 2.507508865992228, "rewards/rejected": -3.133799743652344, "step": 13026 }, { "epoch": 0.6904831314764265, "grad_norm": 52.0, "kl": 1.275115966796875, "learning_rate": 5e-07, "logits/chosen": -2534488.8, "logits/rejected": -17295358.666666668, "logps/chosen": -108.80377197265625, "logps/rejected": -254.98921712239584, "loss": 0.3218, "rewards/chosen": 0.6625356197357177, "rewards/margins": 1.9347673575083415, "rewards/rejected": -1.2722317377726238, "step": 13027 }, { "epoch": 0.6905361354782286, "grad_norm": 46.75, "kl": 1.1350898742675781, "learning_rate": 5e-07, "logits/chosen": -16677158.4, "logits/rejected": -16704978.666666666, "logps/chosen": -267.1670166015625, "logps/rejected": -242.876220703125, "loss": 0.2772, "rewards/chosen": 0.5278836727142334, "rewards/margins": 3.5046159267425536, "rewards/rejected": -2.9767322540283203, "step": 13028 }, { "epoch": 0.6905891394800308, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1446093.6, "logits/rejected": -37148701.333333336, "logps/chosen": -333.2295654296875, "logps/rejected": -305.1462809244792, "loss": 0.2525, "rewards/chosen": 0.8136654853820801, "rewards/margins": 3.4261930147806803, "rewards/rejected": -2.6125275293986, "step": 13029 }, { "epoch": 0.6906421434818328, "grad_norm": 26.125, "kl": 2.7508010864257812, "learning_rate": 5e-07, "logits/chosen": 5583479.5, "logits/rejected": -34164064.0, "logps/chosen": -69.18312072753906, "logps/rejected": -446.0548095703125, "loss": 0.3456, "rewards/chosen": 0.047256842255592346, "rewards/margins": 2.954928770661354, "rewards/rejected": -2.9076719284057617, "step": 13030 }, { "epoch": 0.690695147483635, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24035824.0, "logits/rejected": -41281408.0, "logps/chosen": -401.18719482421875, "logps/rejected": -309.75784737723217, "loss": 0.1558, "rewards/chosen": 1.169061303138733, "rewards/margins": 3.5212099041257585, "rewards/rejected": -2.3521486009870256, "step": 13031 }, { "epoch": 0.6907481514854371, "grad_norm": 38.0, "kl": 1.0338973999023438, "learning_rate": 5e-07, "logits/chosen": -29516482.0, "logits/rejected": -59507752.0, "logps/chosen": -496.0054931640625, "logps/rejected": -591.74951171875, "loss": 0.1696, "rewards/chosen": 1.7115421295166016, "rewards/margins": 4.785589694976807, "rewards/rejected": -3.074047565460205, "step": 13032 }, { "epoch": 0.6908011554872393, "grad_norm": 33.75, "kl": 1.3819847106933594, "learning_rate": 5e-07, "logits/chosen": -10632500.0, "logits/rejected": -25160332.0, "logps/chosen": -157.41712951660156, "logps/rejected": -416.0688171386719, "loss": 0.2414, "rewards/chosen": 0.6080697774887085, "rewards/margins": 3.9106885194778442, "rewards/rejected": -3.3026187419891357, "step": 13033 }, { "epoch": 0.6908541594890414, "grad_norm": 46.5, "kl": 3.1090011596679688, "learning_rate": 5e-07, "logits/chosen": 2355378.5, "logits/rejected": -35202448.0, "logps/chosen": -152.51651000976562, "logps/rejected": -610.870849609375, "loss": 0.3536, "rewards/chosen": 0.5268716017405192, "rewards/margins": 3.846115986506144, "rewards/rejected": -3.319244384765625, "step": 13034 }, { "epoch": 0.6909071634908436, "grad_norm": 62.75, "kl": 0.8867959976196289, "learning_rate": 5e-07, "logits/chosen": 147278570.66666666, "logits/rejected": -29297382.4, "logps/chosen": -265.94651285807294, "logps/rejected": -235.7067138671875, "loss": 0.3187, "rewards/chosen": 0.18408342202504477, "rewards/margins": 1.510755403836568, "rewards/rejected": -1.3266719818115233, "step": 13035 }, { "epoch": 0.6909601674926457, "grad_norm": 63.25, "kl": 1.4457931518554688, "learning_rate": 5e-07, "logits/chosen": -81049424.0, "logits/rejected": -6510363.0, "logps/chosen": -386.1568196614583, "logps/rejected": -313.6315002441406, "loss": 0.4137, "rewards/chosen": 0.05042987068494161, "rewards/margins": 2.8859539528687796, "rewards/rejected": -2.835524082183838, "step": 13036 }, { "epoch": 0.6910131714944479, "grad_norm": 38.75, "kl": 0.4086723327636719, "learning_rate": 5e-07, "logits/chosen": -23717385.6, "logits/rejected": -16035717.333333334, "logps/chosen": -177.9239501953125, "logps/rejected": -138.0749715169271, "loss": 0.2531, "rewards/chosen": 0.6501753330230713, "rewards/margins": 3.7425358295440674, "rewards/rejected": -3.092360496520996, "step": 13037 }, { "epoch": 0.6910661754962499, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7299159.0, "logits/rejected": -10512226.0, "logps/chosen": -65.88609313964844, "logps/rejected": -230.19864908854166, "loss": 0.2705, "rewards/chosen": 0.4418591856956482, "rewards/margins": 2.0827149748802185, "rewards/rejected": -1.6408557891845703, "step": 13038 }, { "epoch": 0.6911191794980521, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26505568.0, "logits/rejected": -40015449.6, "logps/chosen": -656.81494140625, "logps/rejected": -292.4702880859375, "loss": 0.2239, "rewards/chosen": 0.4859934647878011, "rewards/margins": 3.1778075059254967, "rewards/rejected": -2.6918140411376954, "step": 13039 }, { "epoch": 0.6911721834998542, "grad_norm": 47.25, "kl": 0.3273773193359375, "learning_rate": 5e-07, "logits/chosen": -42160844.0, "logits/rejected": -37807192.0, "logps/chosen": -347.5120544433594, "logps/rejected": -298.6679992675781, "loss": 0.2375, "rewards/chosen": 0.4547865092754364, "rewards/margins": 3.389200657606125, "rewards/rejected": -2.9344141483306885, "step": 13040 }, { "epoch": 0.6912251875016564, "grad_norm": 71.5, "kl": 1.3423738479614258, "learning_rate": 5e-07, "logits/chosen": -21794321.6, "logits/rejected": -36194469.333333336, "logps/chosen": -315.195654296875, "logps/rejected": -386.7430826822917, "loss": 0.1953, "rewards/chosen": 1.156387710571289, "rewards/margins": 4.772833569844564, "rewards/rejected": -3.616445859273275, "step": 13041 }, { "epoch": 0.6912781915034585, "grad_norm": 64.0, "kl": 0.5549373626708984, "learning_rate": 5e-07, "logits/chosen": -17033344.0, "logits/rejected": -25160930.666666668, "logps/chosen": -84.5741195678711, "logps/rejected": -220.2369384765625, "loss": 0.3204, "rewards/chosen": -0.11720466613769531, "rewards/margins": 1.0577536424001057, "rewards/rejected": -1.174958308537801, "step": 13042 }, { "epoch": 0.6913311955052607, "grad_norm": 47.25, "kl": 0.008197784423828125, "learning_rate": 5e-07, "logits/chosen": -43274160.0, "logits/rejected": -27034101.333333332, "logps/chosen": -588.5885009765625, "logps/rejected": -281.5978597005208, "loss": 0.1952, "rewards/chosen": 0.747973620891571, "rewards/margins": 2.96576581398646, "rewards/rejected": -2.217792193094889, "step": 13043 }, { "epoch": 0.6913841995070628, "grad_norm": 42.75, "kl": 5.050058364868164, "learning_rate": 5e-07, "logits/chosen": -22356065.333333332, "logits/rejected": -31366086.4, "logps/chosen": -947.69921875, "logps/rejected": -327.566796875, "loss": 0.1868, "rewards/chosen": 2.923918088277181, "rewards/margins": 4.997071584065756, "rewards/rejected": -2.073153495788574, "step": 13044 }, { "epoch": 0.691437203508865, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33078556.8, "logits/rejected": -32347664.0, "logps/chosen": -320.140283203125, "logps/rejected": -308.3221028645833, "loss": 0.2608, "rewards/chosen": 0.660367774963379, "rewards/margins": 3.421721267700195, "rewards/rejected": -2.7613534927368164, "step": 13045 }, { "epoch": 0.691490207510667, "grad_norm": 59.25, "kl": 1.353515625, "learning_rate": 5e-07, "logits/chosen": -71274448.0, "logits/rejected": -8097908.666666667, "logps/chosen": -533.026611328125, "logps/rejected": -169.23296101888022, "loss": 0.2226, "rewards/chosen": 0.4587341547012329, "rewards/margins": 2.163342595100403, "rewards/rejected": -1.70460844039917, "step": 13046 }, { "epoch": 0.6915432115124692, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6015524.666666667, "logits/rejected": -12990580.8, "logps/chosen": -142.3409423828125, "logps/rejected": -263.9141845703125, "loss": 0.1959, "rewards/chosen": 0.5050210952758789, "rewards/margins": 3.8610418319702147, "rewards/rejected": -3.3560207366943358, "step": 13047 }, { "epoch": 0.6915962155142713, "grad_norm": 60.0, "kl": 0.179962158203125, "learning_rate": 5e-07, "logits/chosen": -6724009.0, "logits/rejected": -39592992.0, "logps/chosen": -238.86764526367188, "logps/rejected": -280.8609619140625, "loss": 0.375, "rewards/chosen": -0.24040381610393524, "rewards/margins": 1.5806436091661453, "rewards/rejected": -1.8210474252700806, "step": 13048 }, { "epoch": 0.6916492195160735, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18535267.2, "logits/rejected": -22227761.333333332, "logps/chosen": -239.210595703125, "logps/rejected": -186.758544921875, "loss": 0.4083, "rewards/chosen": 0.024933630228042604, "rewards/margins": 1.0613159875075022, "rewards/rejected": -1.0363823572794597, "step": 13049 }, { "epoch": 0.6917022235178756, "grad_norm": 84.5, "kl": 1.4381637573242188, "learning_rate": 5e-07, "logits/chosen": 58999242.666666664, "logits/rejected": -7354554.4, "logps/chosen": -631.203857421875, "logps/rejected": -191.1203857421875, "loss": 0.2751, "rewards/chosen": 0.7757105827331543, "rewards/margins": 3.0606436729431152, "rewards/rejected": -2.284933090209961, "step": 13050 }, { "epoch": 0.6917552275196778, "grad_norm": 41.25, "kl": 3.06011962890625, "learning_rate": 5e-07, "logits/chosen": -13062828.8, "logits/rejected": -11541012.0, "logps/chosen": -187.6379638671875, "logps/rejected": -284.76320393880206, "loss": 0.3215, "rewards/chosen": 0.8664276123046875, "rewards/margins": 2.9775357564290363, "rewards/rejected": -2.111108144124349, "step": 13051 }, { "epoch": 0.6918082315214799, "grad_norm": 42.75, "kl": 1.487884521484375, "learning_rate": 5e-07, "logits/chosen": -30032226.666666668, "logits/rejected": -8110177.6, "logps/chosen": -132.08610026041666, "logps/rejected": -83.21790161132813, "loss": 0.369, "rewards/chosen": 0.011508564154307047, "rewards/margins": 1.671456531683604, "rewards/rejected": -1.659947967529297, "step": 13052 }, { "epoch": 0.691861235523282, "grad_norm": 49.25, "kl": 0.26605796813964844, "learning_rate": 5e-07, "logits/chosen": -74936912.0, "logits/rejected": -12323585.6, "logps/chosen": -283.39040120442706, "logps/rejected": -363.362451171875, "loss": 0.1975, "rewards/chosen": 0.35881094137827557, "rewards/margins": 3.205823461214701, "rewards/rejected": -2.847012519836426, "step": 13053 }, { "epoch": 0.6919142395250841, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31076461.333333332, "logits/rejected": -25772878.4, "logps/chosen": -273.7060546875, "logps/rejected": -530.34912109375, "loss": 0.1795, "rewards/chosen": 0.5076970259348551, "rewards/margins": 4.45013477007548, "rewards/rejected": -3.942437744140625, "step": 13054 }, { "epoch": 0.6919672435268863, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19231126.0, "logits/rejected": -21396080.0, "logps/chosen": -293.41473388671875, "logps/rejected": -298.05792236328125, "loss": 0.2016, "rewards/chosen": 1.0231825113296509, "rewards/margins": 4.880358099937439, "rewards/rejected": -3.857175588607788, "step": 13055 }, { "epoch": 0.6920202475286884, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17225270.0, "logits/rejected": -22441004.0, "logps/chosen": -213.62030029296875, "logps/rejected": -393.69732666015625, "loss": 0.3301, "rewards/chosen": 0.27752429246902466, "rewards/margins": 1.9679574370384216, "rewards/rejected": -1.690433144569397, "step": 13056 }, { "epoch": 0.6920732515304906, "grad_norm": 51.5, "kl": 0.14497756958007812, "learning_rate": 5e-07, "logits/chosen": -48747413.333333336, "logits/rejected": -16546716.0, "logps/chosen": -328.7777913411458, "logps/rejected": -117.51426696777344, "loss": 0.3821, "rewards/chosen": 0.19055293003718057, "rewards/margins": 2.456446866194407, "rewards/rejected": -2.2658939361572266, "step": 13057 }, { "epoch": 0.6921262555322927, "grad_norm": 44.5, "kl": 0.12082290649414062, "learning_rate": 5e-07, "logits/chosen": 3987338.0, "logits/rejected": -26259760.0, "logps/chosen": -299.427001953125, "logps/rejected": -265.496484375, "loss": 0.2625, "rewards/chosen": 0.14570719997088113, "rewards/margins": 2.0695391396681466, "rewards/rejected": -1.9238319396972656, "step": 13058 }, { "epoch": 0.6921792595340949, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35523686.4, "logits/rejected": -27025325.333333332, "logps/chosen": -346.294921875, "logps/rejected": -269.1179606119792, "loss": 0.1941, "rewards/chosen": 1.3770052909851074, "rewards/margins": 3.5909150759379065, "rewards/rejected": -2.2139097849527993, "step": 13059 }, { "epoch": 0.692232263535897, "grad_norm": 53.0, "kl": 0.9852428436279297, "learning_rate": 5e-07, "logits/chosen": 5192955.2, "logits/rejected": -25772821.333333332, "logps/chosen": -290.815185546875, "logps/rejected": -295.1590576171875, "loss": 0.3226, "rewards/chosen": 0.41859002113342286, "rewards/margins": 2.8810808976491296, "rewards/rejected": -2.4624908765157065, "step": 13060 }, { "epoch": 0.6922852675376991, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20675833.6, "logits/rejected": -23543186.666666668, "logps/chosen": -551.952490234375, "logps/rejected": -333.50921630859375, "loss": 0.2443, "rewards/chosen": 0.9308467864990234, "rewards/margins": 3.579974110921224, "rewards/rejected": -2.6491273244222007, "step": 13061 }, { "epoch": 0.6923382715395012, "grad_norm": 44.25, "kl": 2.771904945373535, "learning_rate": 5e-07, "logits/chosen": 10653392.0, "logits/rejected": -24656590.4, "logps/chosen": -229.90824381510416, "logps/rejected": -364.3452392578125, "loss": 0.3336, "rewards/chosen": 0.40536022186279297, "rewards/margins": 1.916493034362793, "rewards/rejected": -1.5111328125, "step": 13062 }, { "epoch": 0.6923912755413034, "grad_norm": 74.5, "kl": 1.209059715270996, "learning_rate": 5e-07, "logits/chosen": -4428630.5, "logits/rejected": -28384412.0, "logps/chosen": -335.7881164550781, "logps/rejected": -239.211181640625, "loss": 0.3522, "rewards/chosen": 0.4325687289237976, "rewards/margins": 1.776608645915985, "rewards/rejected": -1.3440399169921875, "step": 13063 }, { "epoch": 0.6924442795431055, "grad_norm": 47.5, "kl": 0.014097213745117188, "learning_rate": 5e-07, "logits/chosen": 5484195.0, "logits/rejected": -22031784.0, "logps/chosen": -291.150634765625, "logps/rejected": -588.8842163085938, "loss": 0.2558, "rewards/chosen": 0.10097866505384445, "rewards/margins": 3.9085510298609734, "rewards/rejected": -3.807572364807129, "step": 13064 }, { "epoch": 0.6924972835449077, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65115088.0, "logits/rejected": -13123310.666666666, "logps/chosen": -444.6346130371094, "logps/rejected": -234.85701497395834, "loss": 0.2212, "rewards/chosen": 0.8589889407157898, "rewards/margins": 2.739862620830536, "rewards/rejected": -1.880873680114746, "step": 13065 }, { "epoch": 0.6925502875467098, "grad_norm": 51.75, "kl": 0.47144508361816406, "learning_rate": 5e-07, "logits/chosen": -2006270.0, "logits/rejected": -51678048.0, "logps/chosen": -429.07354736328125, "logps/rejected": -257.6243082682292, "loss": 0.2377, "rewards/chosen": 0.505401611328125, "rewards/margins": 2.1654547055562334, "rewards/rejected": -1.6600530942281086, "step": 13066 }, { "epoch": 0.692603291548512, "grad_norm": 48.0, "kl": 0.5332794189453125, "learning_rate": 5e-07, "logits/chosen": 13944265.333333334, "logits/rejected": -6192726.0, "logps/chosen": -187.44207763671875, "logps/rejected": -167.5325469970703, "loss": 0.3134, "rewards/chosen": 0.5760363340377808, "rewards/margins": 2.6616405248641968, "rewards/rejected": -2.085604190826416, "step": 13067 }, { "epoch": 0.692656295550314, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5900000.0, "logits/rejected": -29742761.6, "logps/chosen": -408.0071614583333, "logps/rejected": -293.243212890625, "loss": 0.1375, "rewards/chosen": 1.2832048734029133, "rewards/margins": 3.9537395795186363, "rewards/rejected": -2.6705347061157227, "step": 13068 }, { "epoch": 0.6927092995521161, "grad_norm": 45.25, "kl": 1.5515108108520508, "learning_rate": 5e-07, "logits/chosen": -23896314.0, "logits/rejected": -7051775.0, "logps/chosen": -277.8797607421875, "logps/rejected": -236.93746948242188, "loss": 0.2617, "rewards/chosen": 0.4897497296333313, "rewards/margins": 2.8974536061286926, "rewards/rejected": -2.4077038764953613, "step": 13069 }, { "epoch": 0.6927623035539183, "grad_norm": 55.5, "kl": 4.035933494567871, "learning_rate": 5e-07, "logits/chosen": 40337737.14285714, "logits/rejected": 3685752.5, "logps/chosen": -580.3933803013393, "logps/rejected": -57.85463333129883, "loss": 0.3534, "rewards/chosen": 1.074244771684919, "rewards/margins": 1.3064587456839425, "rewards/rejected": -0.23221397399902344, "step": 13070 }, { "epoch": 0.6928153075557204, "grad_norm": 51.75, "kl": 3.851593017578125, "learning_rate": 5e-07, "logits/chosen": -22425410.0, "logits/rejected": -6919750.0, "logps/chosen": -324.6489562988281, "logps/rejected": -246.47793579101562, "loss": 0.3161, "rewards/chosen": 1.1369835138320923, "rewards/margins": 2.4853519201278687, "rewards/rejected": -1.3483684062957764, "step": 13071 }, { "epoch": 0.6928683115575226, "grad_norm": 41.5, "kl": 1.4563102722167969, "learning_rate": 5e-07, "logits/chosen": 26081168.0, "logits/rejected": -21840824.0, "logps/chosen": -158.19507853190103, "logps/rejected": -246.3858154296875, "loss": 0.2034, "rewards/chosen": 0.17152607440948486, "rewards/margins": 4.157517743110657, "rewards/rejected": -3.985991668701172, "step": 13072 }, { "epoch": 0.6929213155593247, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54995932.0, "logits/rejected": -23525328.0, "logps/chosen": -276.4531555175781, "logps/rejected": -624.3218994140625, "loss": 0.3195, "rewards/chosen": 0.04846153408288956, "rewards/margins": 2.167739011347294, "rewards/rejected": -2.1192774772644043, "step": 13073 }, { "epoch": 0.6929743195611269, "grad_norm": 44.0, "kl": 2.198688507080078, "learning_rate": 5e-07, "logits/chosen": -13978105.6, "logits/rejected": 31924040.0, "logps/chosen": -292.4158935546875, "logps/rejected": -291.7592366536458, "loss": 0.2801, "rewards/chosen": 0.9588166236877441, "rewards/margins": 3.079214668273926, "rewards/rejected": -2.1203980445861816, "step": 13074 }, { "epoch": 0.693027323562929, "grad_norm": 41.25, "kl": 1.5385475158691406, "learning_rate": 5e-07, "logits/rejected": -22603644.0, "logps/rejected": -269.2790222167969, "loss": 0.1594, "rewards/rejected": -2.0350189208984375, "step": 13075 }, { "epoch": 0.6930803275647311, "grad_norm": 48.0, "kl": 5.578369140625, "learning_rate": 5e-07, "logits/chosen": -34406708.571428575, "logits/rejected": 4887507.0, "logps/chosen": -247.80130440848214, "logps/rejected": -46.782081604003906, "loss": 0.489, "rewards/chosen": 0.5186958653586251, "rewards/margins": 1.77434960433415, "rewards/rejected": -1.255653738975525, "step": 13076 }, { "epoch": 0.6931333315665332, "grad_norm": 37.5, "kl": 0.8600273132324219, "learning_rate": 5e-07, "logits/chosen": -16847452.0, "logits/rejected": -14378200.0, "logps/chosen": -359.7374267578125, "logps/rejected": -482.783447265625, "loss": 0.1928, "rewards/chosen": 1.0343514680862427, "rewards/margins": 4.001906037330627, "rewards/rejected": -2.9675545692443848, "step": 13077 }, { "epoch": 0.6931863355683354, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40336842.666666664, "logits/rejected": -22040214.4, "logps/chosen": -793.492431640625, "logps/rejected": -329.2689453125, "loss": 0.1486, "rewards/chosen": 1.3578310012817383, "rewards/margins": 4.974910926818848, "rewards/rejected": -3.6170799255371096, "step": 13078 }, { "epoch": 0.6932393395701375, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27486053.333333332, "logits/rejected": -11572472.0, "logps/chosen": -272.4542236328125, "logps/rejected": -267.362353515625, "loss": 0.2528, "rewards/chosen": 0.3671768506368001, "rewards/margins": 2.3211735089619956, "rewards/rejected": -1.9539966583251953, "step": 13079 }, { "epoch": 0.6932923435719397, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35135888.0, "logits/rejected": -8219949.5, "logps/chosen": -400.6766764322917, "logps/rejected": -240.69781494140625, "loss": 0.3208, "rewards/chosen": 0.4999916156133016, "rewards/margins": 2.7407600482304892, "rewards/rejected": -2.2407684326171875, "step": 13080 }, { "epoch": 0.6933453475737418, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7122224.0, "logits/rejected": -11535881.333333334, "logps/chosen": -255.55523681640625, "logps/rejected": -179.652099609375, "loss": 0.2688, "rewards/chosen": 0.4073086082935333, "rewards/margins": 2.1879929204781847, "rewards/rejected": -1.7806843121846516, "step": 13081 }, { "epoch": 0.693398351575544, "grad_norm": 50.0, "kl": 0.4285621643066406, "learning_rate": 5e-07, "logits/chosen": -56500032.0, "logits/rejected": -15157992.0, "logps/chosen": -342.48309326171875, "logps/rejected": -285.82415771484375, "loss": 0.1945, "rewards/chosen": 0.2524246275424957, "rewards/margins": 2.5619916021823883, "rewards/rejected": -2.3095669746398926, "step": 13082 }, { "epoch": 0.693451355577346, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16332342.0, "logits/rejected": -13994833.333333334, "logps/chosen": -453.63641357421875, "logps/rejected": -229.9812215169271, "loss": 0.0777, "rewards/chosen": 2.1470112800598145, "rewards/margins": 5.336255868275961, "rewards/rejected": -3.189244588216146, "step": 13083 }, { "epoch": 0.6935043595791482, "grad_norm": 54.25, "kl": 1.4327774047851562, "learning_rate": 5e-07, "logits/chosen": -31787804.0, "logits/rejected": -27302530.0, "logps/chosen": -497.3720397949219, "logps/rejected": -280.1069030761719, "loss": 0.2932, "rewards/chosen": 0.8278259634971619, "rewards/margins": 2.845430314540863, "rewards/rejected": -2.017604351043701, "step": 13084 }, { "epoch": 0.6935573635809503, "grad_norm": 70.0, "kl": 0.9278564453125, "learning_rate": 5e-07, "logits/chosen": -32689852.0, "logits/rejected": 35186420.0, "logps/chosen": -392.53948974609375, "logps/rejected": -199.38577270507812, "loss": 0.3231, "rewards/chosen": 0.1384359449148178, "rewards/margins": 2.0922177881002426, "rewards/rejected": -1.9537818431854248, "step": 13085 }, { "epoch": 0.6936103675827525, "grad_norm": 45.0, "kl": 0.5223846435546875, "learning_rate": 5e-07, "logits/chosen": -91744556.8, "logits/rejected": 1325068.6666666667, "logps/chosen": -221.68740234375, "logps/rejected": -150.46246337890625, "loss": 0.3936, "rewards/chosen": -0.20377540588378906, "rewards/margins": 2.7300310134887695, "rewards/rejected": -2.9338064193725586, "step": 13086 }, { "epoch": 0.6936633715845546, "grad_norm": 50.5, "kl": 4.011951446533203, "learning_rate": 5e-07, "logits/chosen": 164304.0, "logits/rejected": -19305706.666666668, "logps/chosen": -285.2431884765625, "logps/rejected": -194.85980224609375, "loss": 0.2926, "rewards/chosen": 0.8996847152709961, "rewards/margins": 4.876385498046875, "rewards/rejected": -3.976700782775879, "step": 13087 }, { "epoch": 0.6937163755863568, "grad_norm": 47.5, "kl": 1.1773567199707031, "learning_rate": 5e-07, "logits/chosen": -38005157.333333336, "logits/rejected": -17672022.0, "logps/chosen": -470.9440104166667, "logps/rejected": -218.52505493164062, "loss": 0.3519, "rewards/chosen": 0.5419000387191772, "rewards/margins": 3.102061152458191, "rewards/rejected": -2.5601611137390137, "step": 13088 }, { "epoch": 0.6937693795881589, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82688824.0, "logits/rejected": -28905701.333333332, "logps/chosen": -486.59674072265625, "logps/rejected": -463.1319580078125, "loss": 0.2569, "rewards/chosen": -0.15037384629249573, "rewards/margins": 1.9919823904832206, "rewards/rejected": -2.1423562367757163, "step": 13089 }, { "epoch": 0.6938223835899611, "grad_norm": 58.5, "kl": 1.0988960266113281, "learning_rate": 5e-07, "logits/chosen": -41342240.0, "logits/rejected": 1614167.0, "logps/chosen": -616.7787679036459, "logps/rejected": -118.4624252319336, "loss": 0.3155, "rewards/chosen": 0.6202809810638428, "rewards/margins": 2.7762577533721924, "rewards/rejected": -2.1559767723083496, "step": 13090 }, { "epoch": 0.6938753875917631, "grad_norm": 44.5, "kl": 0.8166465759277344, "learning_rate": 5e-07, "logits/chosen": -17163380.0, "logits/rejected": -34080288.0, "logps/chosen": -185.36476135253906, "logps/rejected": -451.6949462890625, "loss": 0.2585, "rewards/chosen": 0.4566710591316223, "rewards/margins": 2.776032269001007, "rewards/rejected": -2.3193612098693848, "step": 13091 }, { "epoch": 0.6939283915935653, "grad_norm": 33.75, "kl": 0.6604738235473633, "learning_rate": 5e-07, "logits/chosen": -8938578.666666666, "logits/rejected": -20352550.4, "logps/chosen": -120.1781514485677, "logps/rejected": -193.38812255859375, "loss": 0.252, "rewards/chosen": 0.42441431681315106, "rewards/margins": 2.390423075358073, "rewards/rejected": -1.966008758544922, "step": 13092 }, { "epoch": 0.6939813955953674, "grad_norm": 49.25, "kl": 0.5664243698120117, "learning_rate": 5e-07, "logits/chosen": -4012786.0, "logits/rejected": -25318754.666666668, "logps/chosen": -211.11114501953125, "logps/rejected": -276.1433919270833, "loss": 0.1579, "rewards/chosen": 1.4910283088684082, "rewards/margins": 3.821659723917643, "rewards/rejected": -2.330631415049235, "step": 13093 }, { "epoch": 0.6940343995971696, "grad_norm": 44.75, "kl": 0.3858652114868164, "learning_rate": 5e-07, "logits/chosen": -22486987.2, "logits/rejected": -36606653.333333336, "logps/chosen": -350.0339111328125, "logps/rejected": -284.8310953776042, "loss": 0.2864, "rewards/chosen": 1.131557083129883, "rewards/margins": 2.6079588890075684, "rewards/rejected": -1.4764018058776855, "step": 13094 }, { "epoch": 0.6940874035989717, "grad_norm": 46.0, "kl": 0.24267959594726562, "learning_rate": 5e-07, "logits/chosen": 404125.6, "logits/rejected": -26431456.0, "logps/chosen": -217.5673583984375, "logps/rejected": -234.42718505859375, "loss": 0.3325, "rewards/chosen": 0.267003607749939, "rewards/margins": 2.2984710454940798, "rewards/rejected": -2.0314674377441406, "step": 13095 }, { "epoch": 0.6941404076007739, "grad_norm": 43.75, "kl": 1.9006576538085938, "learning_rate": 5e-07, "logits/chosen": -13176667.0, "logits/rejected": -17835126.0, "logps/chosen": -195.40769958496094, "logps/rejected": -493.18438720703125, "loss": 0.3094, "rewards/chosen": 0.367160439491272, "rewards/margins": 2.5973705053329468, "rewards/rejected": -2.230210065841675, "step": 13096 }, { "epoch": 0.694193411602576, "grad_norm": 49.5, "kl": 1.5202770233154297, "learning_rate": 5e-07, "logits/chosen": -23027998.4, "logits/rejected": -31504421.333333332, "logps/chosen": -247.1075927734375, "logps/rejected": -323.3973388671875, "loss": 0.3479, "rewards/chosen": 0.374579644203186, "rewards/margins": 2.262129203478495, "rewards/rejected": -1.8875495592753093, "step": 13097 }, { "epoch": 0.6942464156043782, "grad_norm": 41.5, "kl": 0.6341934204101562, "learning_rate": 5e-07, "logits/chosen": -39808870.4, "logits/rejected": -15732640.0, "logps/chosen": -311.3384765625, "logps/rejected": -262.84326171875, "loss": 0.214, "rewards/chosen": 1.0469188690185547, "rewards/margins": 3.7875003814697266, "rewards/rejected": -2.740581512451172, "step": 13098 }, { "epoch": 0.6942994196061802, "grad_norm": 50.25, "kl": 1.2814006805419922, "learning_rate": 5e-07, "logits/chosen": -784361.2, "logits/rejected": -17468946.666666668, "logps/chosen": -220.299365234375, "logps/rejected": -470.6992594401042, "loss": 0.36, "rewards/chosen": 0.08189078569412231, "rewards/margins": 2.021463596820831, "rewards/rejected": -1.939572811126709, "step": 13099 }, { "epoch": 0.6943524236079824, "grad_norm": 48.5, "kl": 0.5018033981323242, "learning_rate": 5e-07, "logits/chosen": -41128022.4, "logits/rejected": -33085285.333333332, "logps/chosen": -266.1198486328125, "logps/rejected": -227.9727783203125, "loss": 0.3065, "rewards/chosen": 0.400614070892334, "rewards/margins": 2.686066913604736, "rewards/rejected": -2.2854528427124023, "step": 13100 }, { "epoch": 0.6944054276097845, "grad_norm": 37.0, "kl": 1.325521469116211, "learning_rate": 5e-07, "logits/chosen": -37758040.0, "logits/rejected": -41232643.2, "logps/chosen": -183.28485107421875, "logps/rejected": -404.919091796875, "loss": 0.2522, "rewards/chosen": 0.06904291113217671, "rewards/margins": 2.673330249389013, "rewards/rejected": -2.604287338256836, "step": 13101 }, { "epoch": 0.6944584316115867, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35289379.2, "logits/rejected": -48462522.666666664, "logps/chosen": -434.57421875, "logps/rejected": -529.8209635416666, "loss": 0.3116, "rewards/chosen": 0.369291877746582, "rewards/margins": 2.4087242444356285, "rewards/rejected": -2.0394323666890464, "step": 13102 }, { "epoch": 0.6945114356133888, "grad_norm": 36.75, "kl": 1.3696565628051758, "learning_rate": 5e-07, "logits/chosen": -3084211.0, "logits/rejected": -28044600.0, "logps/chosen": -190.76806640625, "logps/rejected": -301.234130859375, "loss": 0.2448, "rewards/chosen": 0.7026697397232056, "rewards/margins": 3.1319397687911987, "rewards/rejected": -2.429270029067993, "step": 13103 }, { "epoch": 0.694564439615191, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18077989.333333332, "logits/rejected": 1627270.125, "logps/chosen": -184.9705810546875, "logps/rejected": -146.25704956054688, "loss": 0.3961, "rewards/chosen": 0.023100728789965313, "rewards/margins": 2.167422170440356, "rewards/rejected": -2.1443214416503906, "step": 13104 }, { "epoch": 0.6946174436169931, "grad_norm": 45.25, "kl": 2.334409713745117, "learning_rate": 5e-07, "logits/chosen": -2130683.6666666665, "logits/rejected": -34258124.8, "logps/chosen": -187.82391357421875, "logps/rejected": -341.511767578125, "loss": 0.2185, "rewards/chosen": 1.2869024276733398, "rewards/margins": 3.447945976257324, "rewards/rejected": -2.1610435485839843, "step": 13105 }, { "epoch": 0.6946704476187953, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59691763.2, "logits/rejected": -32422749.333333332, "logps/chosen": -536.2607421875, "logps/rejected": -296.670166015625, "loss": 0.274, "rewards/chosen": 1.0467576026916503, "rewards/margins": 2.9501295725504555, "rewards/rejected": -1.9033719698588054, "step": 13106 }, { "epoch": 0.6947234516205973, "grad_norm": 24.875, "kl": 5.095878601074219, "learning_rate": 5e-07, "logits/chosen": -5605344.0, "logits/rejected": -26650448.0, "logps/chosen": -203.0971221923828, "logps/rejected": -459.3743082682292, "loss": 0.2277, "rewards/chosen": 1.3155194520950317, "rewards/margins": 4.142173568407694, "rewards/rejected": -2.8266541163126626, "step": 13107 }, { "epoch": 0.6947764556223995, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55877196.0, "logits/rejected": 193352.5, "logps/chosen": -434.04290771484375, "logps/rejected": -356.1233825683594, "loss": 0.2462, "rewards/chosen": 0.6175335645675659, "rewards/margins": 2.7645756006240845, "rewards/rejected": -2.1470420360565186, "step": 13108 }, { "epoch": 0.6948294596242016, "grad_norm": 59.25, "kl": 2.7703514099121094, "learning_rate": 5e-07, "logits/chosen": -36880277.333333336, "logits/rejected": -4158858.4, "logps/chosen": -297.46783447265625, "logps/rejected": -421.399560546875, "loss": 0.2922, "rewards/chosen": 0.5702432791392008, "rewards/margins": 2.380595032374064, "rewards/rejected": -1.8103517532348632, "step": 13109 }, { "epoch": 0.6948824636260038, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58685792.0, "logits/rejected": -86621043.2, "logps/chosen": -408.40625, "logps/rejected": -467.85244140625, "loss": 0.216, "rewards/chosen": 0.3426981767018636, "rewards/margins": 3.080269511540731, "rewards/rejected": -2.7375713348388673, "step": 13110 }, { "epoch": 0.6949354676278059, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73930304.0, "logits/rejected": -9982419.2, "logps/chosen": -577.2024739583334, "logps/rejected": -183.430859375, "loss": 0.1546, "rewards/chosen": 1.0133035977681477, "rewards/margins": 4.115405495961507, "rewards/rejected": -3.1021018981933595, "step": 13111 }, { "epoch": 0.6949884716296081, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2313280.5, "logits/rejected": -13750172.0, "logps/chosen": -165.5040740966797, "logps/rejected": -309.95794677734375, "loss": 0.2227, "rewards/chosen": 0.5255181789398193, "rewards/margins": 3.4429280757904053, "rewards/rejected": -2.917409896850586, "step": 13112 }, { "epoch": 0.6950414756314102, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41039716.0, "logits/rejected": -70510144.0, "logps/chosen": -317.18280029296875, "logps/rejected": -412.71636962890625, "loss": 0.2484, "rewards/chosen": 0.38148003816604614, "rewards/margins": 2.9302156567573547, "rewards/rejected": -2.5487356185913086, "step": 13113 }, { "epoch": 0.6950944796332124, "grad_norm": 49.25, "kl": 0.4943714141845703, "learning_rate": 5e-07, "logits/chosen": -21609294.0, "logits/rejected": -18418834.0, "logps/chosen": -252.3815460205078, "logps/rejected": -504.9841003417969, "loss": 0.2928, "rewards/chosen": 0.9207015037536621, "rewards/margins": 3.297463893890381, "rewards/rejected": -2.3767623901367188, "step": 13114 }, { "epoch": 0.6951474836350144, "grad_norm": 42.75, "kl": 3.6178970336914062, "learning_rate": 5e-07, "logits/chosen": -469988.5, "logits/rejected": -31140182.0, "logps/chosen": -251.41497802734375, "logps/rejected": -491.18499755859375, "loss": 0.162, "rewards/chosen": 1.8493659496307373, "rewards/margins": 4.806748628616333, "rewards/rejected": -2.9573826789855957, "step": 13115 }, { "epoch": 0.6952004876368166, "grad_norm": 40.5, "kl": 1.0539731979370117, "learning_rate": 5e-07, "logits/chosen": -27814130.666666668, "logits/rejected": -33853174.4, "logps/chosen": -366.3660074869792, "logps/rejected": -355.515673828125, "loss": 0.2879, "rewards/chosen": 0.21623714764912924, "rewards/margins": 2.3103480180104574, "rewards/rejected": -2.094110870361328, "step": 13116 }, { "epoch": 0.6952534916386187, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86636748.8, "logits/rejected": -22632440.0, "logps/chosen": -386.72763671875, "logps/rejected": -455.5780843098958, "loss": 0.2994, "rewards/chosen": 0.21993529796600342, "rewards/margins": 3.3993595838546753, "rewards/rejected": -3.179424285888672, "step": 13117 }, { "epoch": 0.6953064956404208, "grad_norm": 37.0, "kl": 1.7746562957763672, "learning_rate": 5e-07, "logits/chosen": -17485770.0, "logits/rejected": -68866600.0, "logps/chosen": -74.57368469238281, "logps/rejected": -374.30621337890625, "loss": 0.3143, "rewards/chosen": 0.11032190918922424, "rewards/margins": 2.7617997229099274, "rewards/rejected": -2.651477813720703, "step": 13118 }, { "epoch": 0.695359499642223, "grad_norm": 42.0, "kl": 1.7808647155761719, "learning_rate": 5e-07, "logits/chosen": 9132753.333333334, "logits/rejected": -20914724.8, "logps/chosen": -56.35047912597656, "logps/rejected": -316.376953125, "loss": 0.2788, "rewards/chosen": 0.398901104927063, "rewards/margins": 2.8924782991409304, "rewards/rejected": -2.4935771942138674, "step": 13119 }, { "epoch": 0.6954125036440251, "grad_norm": 52.75, "kl": 1.379157543182373, "learning_rate": 5e-07, "logits/chosen": -65010090.666666664, "logits/rejected": -22128008.0, "logps/chosen": -194.28399658203125, "logps/rejected": -307.2689453125, "loss": 0.2597, "rewards/chosen": 0.4868532021840413, "rewards/margins": 2.764830764134725, "rewards/rejected": -2.2779775619506837, "step": 13120 }, { "epoch": 0.6954655076458273, "grad_norm": 53.75, "kl": 1.9884376525878906, "learning_rate": 5e-07, "logits/chosen": -41178573.71428572, "logits/rejected": -7991199.0, "logps/chosen": -362.08175223214283, "logps/rejected": -192.61502075195312, "loss": 0.2985, "rewards/chosen": 0.9583336285182408, "rewards/margins": 3.3642662933894565, "rewards/rejected": -2.405932664871216, "step": 13121 }, { "epoch": 0.6955185116476293, "grad_norm": 46.75, "kl": 0.7212142944335938, "learning_rate": 5e-07, "logits/chosen": -5466114.666666667, "logits/rejected": -12425760.0, "logps/chosen": -265.63519287109375, "logps/rejected": -197.43714599609376, "loss": 0.2837, "rewards/chosen": 0.7152629693349203, "rewards/margins": 2.096662410100301, "rewards/rejected": -1.381399440765381, "step": 13122 }, { "epoch": 0.6955715156494315, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10842045.333333334, "logits/rejected": -36163859.2, "logps/chosen": -589.7166341145834, "logps/rejected": -157.97103271484374, "loss": 0.2058, "rewards/chosen": 1.0438766479492188, "rewards/margins": 2.9725601196289064, "rewards/rejected": -1.9286834716796875, "step": 13123 }, { "epoch": 0.6956245196512336, "grad_norm": 40.5, "kl": 2.1845006942749023, "learning_rate": 5e-07, "logits/chosen": -12946655.0, "logits/rejected": -47369376.0, "logps/chosen": -265.1603698730469, "logps/rejected": -495.4304504394531, "loss": 0.2683, "rewards/chosen": 0.4289131760597229, "rewards/margins": 3.9303913712501526, "rewards/rejected": -3.5014781951904297, "step": 13124 }, { "epoch": 0.6956775236530358, "grad_norm": 35.75, "kl": 1.9927139282226562, "learning_rate": 5e-07, "logits/chosen": -13330037.333333334, "logits/rejected": -40488678.4, "logps/chosen": -232.12715657552084, "logps/rejected": -284.193603515625, "loss": 0.2001, "rewards/chosen": 1.178018569946289, "rewards/margins": 3.923172378540039, "rewards/rejected": -2.74515380859375, "step": 13125 }, { "epoch": 0.6957305276548379, "grad_norm": 43.5, "kl": 3.944683074951172, "learning_rate": 5e-07, "logits/chosen": -34417237.333333336, "logits/rejected": -15697168.0, "logps/chosen": -470.5387776692708, "logps/rejected": -238.6311279296875, "loss": 0.3093, "rewards/chosen": 0.3819007873535156, "rewards/margins": 2.07589054107666, "rewards/rejected": -1.6939897537231445, "step": 13126 }, { "epoch": 0.6957835316566401, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24852540.0, "logits/rejected": -16410181.333333334, "logps/chosen": -181.4559326171875, "logps/rejected": -179.4975382486979, "loss": 0.2657, "rewards/chosen": -0.09587936103343964, "rewards/margins": 1.691629762450854, "rewards/rejected": -1.7875091234842937, "step": 13127 }, { "epoch": 0.6958365356584422, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22661346.0, "logits/rejected": -31664692.0, "logps/chosen": -307.59588623046875, "logps/rejected": -387.39410400390625, "loss": 0.2096, "rewards/chosen": 0.6813980340957642, "rewards/margins": 3.8818360567092896, "rewards/rejected": -3.2004380226135254, "step": 13128 }, { "epoch": 0.6958895396602444, "grad_norm": 40.25, "kl": 1.4436416625976562, "learning_rate": 5e-07, "logits/chosen": -49702176.0, "logits/rejected": -14835933.333333334, "logps/chosen": -434.1658630371094, "logps/rejected": -326.7255045572917, "loss": 0.1988, "rewards/chosen": 0.02258606255054474, "rewards/margins": 2.2711140662431717, "rewards/rejected": -2.248528003692627, "step": 13129 }, { "epoch": 0.6959425436620464, "grad_norm": 64.5, "kl": 1.451080322265625, "learning_rate": 5e-07, "logits/chosen": -31226213.333333332, "logits/rejected": -64073248.0, "logps/chosen": -502.6899007161458, "logps/rejected": -573.658447265625, "loss": 0.3844, "rewards/chosen": 0.21861610809961954, "rewards/margins": 3.5262456933657327, "rewards/rejected": -3.3076295852661133, "step": 13130 }, { "epoch": 0.6959955476638486, "grad_norm": 41.0, "kl": 1.756606101989746, "learning_rate": 5e-07, "logits/chosen": -29182246.0, "logits/rejected": -59489240.0, "logps/chosen": -198.19000244140625, "logps/rejected": -182.57884216308594, "loss": 0.2713, "rewards/chosen": 0.7774669528007507, "rewards/margins": 2.4703293442726135, "rewards/rejected": -1.6928623914718628, "step": 13131 }, { "epoch": 0.6960485516656507, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1472697.0, "logits/rejected": -19525749.333333332, "logps/chosen": -256.02593994140625, "logps/rejected": -284.6783854166667, "loss": 0.1385, "rewards/chosen": 1.0744056701660156, "rewards/margins": 3.6758705774943032, "rewards/rejected": -2.6014649073282876, "step": 13132 }, { "epoch": 0.6961015556674529, "grad_norm": 55.0, "kl": 0.36826324462890625, "learning_rate": 5e-07, "logits/chosen": -34304931.2, "logits/rejected": -23287344.0, "logps/chosen": -209.142578125, "logps/rejected": -418.3056233723958, "loss": 0.2905, "rewards/chosen": 0.6155054569244385, "rewards/margins": 2.4732660452524824, "rewards/rejected": -1.8577605883280437, "step": 13133 }, { "epoch": 0.696154559669255, "grad_norm": 42.5, "kl": 0.8697910308837891, "learning_rate": 5e-07, "logits/chosen": -23611028.0, "logits/rejected": 4438036.0, "logps/chosen": -146.6491241455078, "logps/rejected": -289.177001953125, "loss": 0.3073, "rewards/chosen": 0.16264617443084717, "rewards/margins": 2.590082049369812, "rewards/rejected": -2.427435874938965, "step": 13134 }, { "epoch": 0.6962075636710572, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37969449.6, "logits/rejected": -31618944.0, "logps/chosen": -303.499560546875, "logps/rejected": -355.282470703125, "loss": 0.3914, "rewards/chosen": -0.32428803443908694, "rewards/margins": 2.6880760351816813, "rewards/rejected": -3.012364069620768, "step": 13135 }, { "epoch": 0.6962605676728593, "grad_norm": 53.25, "kl": 2.297941207885742, "learning_rate": 5e-07, "logits/chosen": -34070482.666666664, "logits/rejected": 26588352.0, "logps/chosen": -395.1549886067708, "logps/rejected": -95.70254516601562, "loss": 0.3886, "rewards/chosen": 0.7242085138956705, "rewards/margins": 1.994427959124247, "rewards/rejected": -1.2702194452285767, "step": 13136 }, { "epoch": 0.6963135716746615, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33354501.333333332, "logits/rejected": -15211395.2, "logps/chosen": -294.8310546875, "logps/rejected": -302.7669921875, "loss": 0.2411, "rewards/chosen": 0.546193520228068, "rewards/margins": 2.6067110220591228, "rewards/rejected": -2.060517501831055, "step": 13137 }, { "epoch": 0.6963665756764635, "grad_norm": 66.0, "kl": 2.930826187133789, "learning_rate": 5e-07, "logits/chosen": -34425938.28571428, "logits/rejected": 9584388.0, "logps/chosen": -208.60838099888392, "logps/rejected": -109.44022369384766, "loss": 0.4103, "rewards/chosen": 0.6646928787231445, "rewards/margins": 1.386540412902832, "rewards/rejected": -0.7218475341796875, "step": 13138 }, { "epoch": 0.6964195796782657, "grad_norm": 42.5, "kl": 0.33733558654785156, "learning_rate": 5e-07, "logits/chosen": -35101704.0, "logits/rejected": -30459056.0, "logps/chosen": -281.00006103515625, "logps/rejected": -249.1672159830729, "loss": 0.1915, "rewards/chosen": 0.16351088881492615, "rewards/margins": 2.655414193868637, "rewards/rejected": -2.491903305053711, "step": 13139 }, { "epoch": 0.6964725836800678, "grad_norm": 44.75, "kl": 2.6030406951904297, "learning_rate": 5e-07, "logits/chosen": -24071462.0, "logits/rejected": 2345464.0, "logps/chosen": -441.21038818359375, "logps/rejected": -386.5353088378906, "loss": 0.2736, "rewards/chosen": 0.588010311126709, "rewards/margins": 3.490567207336426, "rewards/rejected": -2.902556896209717, "step": 13140 }, { "epoch": 0.69652558768187, "grad_norm": 55.25, "kl": 1.17041015625, "learning_rate": 5e-07, "logits/chosen": -39614156.8, "logits/rejected": -18298661.333333332, "logps/chosen": -245.0511474609375, "logps/rejected": -196.30108642578125, "loss": 0.3215, "rewards/chosen": 0.6613231182098389, "rewards/margins": 1.8672328472137452, "rewards/rejected": -1.2059097290039062, "step": 13141 }, { "epoch": 0.6965785916836721, "grad_norm": 33.5, "kl": 1.5336151123046875, "learning_rate": 5e-07, "logits/chosen": -10907420.0, "logits/rejected": 63251488.0, "logps/chosen": -92.90394592285156, "logps/rejected": -285.96278889973956, "loss": 0.2313, "rewards/chosen": 0.10824127495288849, "rewards/margins": 2.36717322965463, "rewards/rejected": -2.2589319547017417, "step": 13142 }, { "epoch": 0.6966315956854743, "grad_norm": 55.25, "kl": 0.7509593963623047, "learning_rate": 5e-07, "logits/chosen": -11661086.4, "logits/rejected": -36382704.0, "logps/chosen": -269.6239501953125, "logps/rejected": -485.8058675130208, "loss": 0.2603, "rewards/chosen": 0.9593277931213379, "rewards/margins": 3.2126440366109215, "rewards/rejected": -2.2533162434895835, "step": 13143 }, { "epoch": 0.6966845996872764, "grad_norm": 64.0, "kl": 0.2531166076660156, "learning_rate": 5e-07, "logits/chosen": -36638256.0, "logits/rejected": -7808898.5, "logps/chosen": -415.2693786621094, "logps/rejected": -239.0264434814453, "loss": 0.3331, "rewards/chosen": 0.013092011213302612, "rewards/margins": 1.8593775928020477, "rewards/rejected": -1.8462855815887451, "step": 13144 }, { "epoch": 0.6967376036890786, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19080784.0, "logits/rejected": -16078309.333333334, "logps/chosen": -322.0171813964844, "logps/rejected": -185.20572916666666, "loss": 0.1818, "rewards/chosen": 0.5007789731025696, "rewards/margins": 2.933711508909861, "rewards/rejected": -2.4329325358072915, "step": 13145 }, { "epoch": 0.6967906076908806, "grad_norm": 51.5, "kl": 2.7413368225097656, "learning_rate": 5e-07, "logits/chosen": -43767401.14285714, "logits/rejected": 9071311.0, "logps/chosen": -250.14400809151786, "logps/rejected": -127.04338073730469, "loss": 0.4709, "rewards/chosen": 0.503328970500401, "rewards/margins": 0.5775561173047338, "rewards/rejected": -0.07422714680433273, "step": 13146 }, { "epoch": 0.6968436116926828, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21998734.0, "logits/rejected": -16084085.333333334, "logps/chosen": -352.9471130371094, "logps/rejected": -297.8486735026042, "loss": 0.1819, "rewards/chosen": 0.5106781125068665, "rewards/margins": 3.0499537587165833, "rewards/rejected": -2.539275646209717, "step": 13147 }, { "epoch": 0.6968966156944849, "grad_norm": 54.25, "kl": 2.4848785400390625, "learning_rate": 5e-07, "logits/chosen": -31190304.0, "logits/rejected": -15440460.0, "logps/chosen": -372.0079345703125, "logps/rejected": -126.84832763671875, "loss": 0.411, "rewards/chosen": -0.17482775449752808, "rewards/margins": 3.119305113951365, "rewards/rejected": -3.294132868448893, "step": 13148 }, { "epoch": 0.6969496196962871, "grad_norm": 53.75, "kl": 2.4861412048339844, "learning_rate": 5e-07, "logits/chosen": -48572032.0, "logits/rejected": -33208595.2, "logps/chosen": -398.8238932291667, "logps/rejected": -247.233935546875, "loss": 0.2937, "rewards/chosen": 0.5886799891789755, "rewards/margins": 2.598290356000265, "rewards/rejected": -2.0096103668212892, "step": 13149 }, { "epoch": 0.6970026236980892, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25533088.0, "logits/rejected": -8696670.0, "logps/chosen": -300.89263916015625, "logps/rejected": -375.6078796386719, "loss": 0.2589, "rewards/chosen": 0.3867258131504059, "rewards/margins": 3.2580982744693756, "rewards/rejected": -2.8713724613189697, "step": 13150 }, { "epoch": 0.6970556276998914, "grad_norm": 41.75, "kl": 0.9685859680175781, "learning_rate": 5e-07, "logits/chosen": 3797879.0, "logits/rejected": -32609965.714285713, "logps/chosen": -94.65249633789062, "logps/rejected": -425.4152134486607, "loss": 0.1718, "rewards/chosen": -0.23724137246608734, "rewards/margins": 2.233824148774147, "rewards/rejected": -2.4710655212402344, "step": 13151 }, { "epoch": 0.6971086317016935, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41743138.666666664, "logits/rejected": -20765840.0, "logps/chosen": -267.1997477213542, "logps/rejected": -248.50534057617188, "loss": 0.2969, "rewards/chosen": 0.7322298685709635, "rewards/margins": 2.6533769766489663, "rewards/rejected": -1.921147108078003, "step": 13152 }, { "epoch": 0.6971616357034957, "grad_norm": 44.5, "kl": 1.6416254043579102, "learning_rate": 5e-07, "logits/chosen": -15837139.2, "logits/rejected": -8502786.666666666, "logps/chosen": -167.83282470703125, "logps/rejected": -173.99979654947916, "loss": 0.3964, "rewards/chosen": 0.10809080600738526, "rewards/margins": 1.2484073718388875, "rewards/rejected": -1.1403165658315022, "step": 13153 }, { "epoch": 0.6972146397052977, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20738201.6, "logits/rejected": -43939344.0, "logps/chosen": -309.1754638671875, "logps/rejected": -270.533447265625, "loss": 0.3029, "rewards/chosen": 0.3326206922531128, "rewards/margins": 2.4944515943527223, "rewards/rejected": -2.1618309020996094, "step": 13154 }, { "epoch": 0.6972676437070999, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37229088.0, "logits/rejected": -98901813.33333333, "logps/chosen": -246.137841796875, "logps/rejected": -153.34967041015625, "loss": 0.4206, "rewards/chosen": -0.12698303461074828, "rewards/margins": 1.2780109763145446, "rewards/rejected": -1.404994010925293, "step": 13155 }, { "epoch": 0.697320647708902, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12177617.6, "logits/rejected": -61238112.0, "logps/chosen": -217.77216796875, "logps/rejected": -414.0716959635417, "loss": 0.3654, "rewards/chosen": -0.11898071765899658, "rewards/margins": 2.191816973686218, "rewards/rejected": -2.310797691345215, "step": 13156 }, { "epoch": 0.6973736517107042, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11885252.0, "logits/rejected": -41481691.428571425, "logps/chosen": -15.303400039672852, "logps/rejected": -219.40122767857142, "loss": 0.1131, "rewards/chosen": 1.8306249380111694, "rewards/margins": 4.292960252080645, "rewards/rejected": -2.4623353140694753, "step": 13157 }, { "epoch": 0.6974266557125063, "grad_norm": 55.75, "kl": 0.14460182189941406, "learning_rate": 5e-07, "logits/chosen": -48174344.0, "logits/rejected": -19440552.0, "logps/chosen": -444.80072021484375, "logps/rejected": -253.626708984375, "loss": 0.2159, "rewards/chosen": 0.162811279296875, "rewards/margins": 2.533435662587484, "rewards/rejected": -2.370624383290609, "step": 13158 }, { "epoch": 0.6974796597143085, "grad_norm": 55.0, "kl": 3.1502761840820312, "learning_rate": 5e-07, "logits/chosen": -42406745.6, "logits/rejected": -2728001.6666666665, "logps/chosen": -511.50625, "logps/rejected": -163.3665771484375, "loss": 0.1894, "rewards/chosen": 1.65716552734375, "rewards/margins": 4.796513875325521, "rewards/rejected": -3.139348347981771, "step": 13159 }, { "epoch": 0.6975326637161106, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19188018.666666668, "logits/rejected": -1266382.4, "logps/chosen": -309.51068115234375, "logps/rejected": -164.38577880859376, "loss": 0.2139, "rewards/chosen": 0.7864761352539062, "rewards/margins": 2.7739870071411135, "rewards/rejected": -1.987510871887207, "step": 13160 }, { "epoch": 0.6975856677179127, "grad_norm": 52.25, "kl": 0.3249702453613281, "learning_rate": 5e-07, "logits/chosen": -71753616.0, "logits/rejected": -47757241.6, "logps/chosen": -489.5550130208333, "logps/rejected": -350.3462890625, "loss": 0.2008, "rewards/chosen": 0.4133412837982178, "rewards/margins": 3.09995436668396, "rewards/rejected": -2.686613082885742, "step": 13161 }, { "epoch": 0.6976386717197148, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21009208.0, "logits/rejected": -24194444.8, "logps/chosen": -477.2713216145833, "logps/rejected": -251.301318359375, "loss": 0.2286, "rewards/chosen": 0.5669657786687216, "rewards/margins": 2.620588787396749, "rewards/rejected": -2.053623008728027, "step": 13162 }, { "epoch": 0.697691675721517, "grad_norm": 40.0, "kl": 1.1121110916137695, "learning_rate": 5e-07, "logits/chosen": -17912229.333333332, "logits/rejected": -6723202.0, "logps/chosen": -151.1831258138021, "logps/rejected": -252.62196350097656, "loss": 0.2838, "rewards/chosen": 0.83902374903361, "rewards/margins": 2.451732556025187, "rewards/rejected": -1.6127088069915771, "step": 13163 }, { "epoch": 0.6977446797233191, "grad_norm": 48.75, "kl": 6.502311706542969, "learning_rate": 5e-07, "logits/chosen": -62807464.0, "logits/rejected": -53535060.0, "logps/chosen": -919.172119140625, "logps/rejected": -370.61138916015625, "loss": 0.1983, "rewards/chosen": 1.734297275543213, "rewards/margins": 4.029469013214111, "rewards/rejected": -2.2951717376708984, "step": 13164 }, { "epoch": 0.6977976837251213, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -7823506.0, "logps/rejected": -226.0215606689453, "loss": 0.1434, "rewards/rejected": -2.732696771621704, "step": 13165 }, { "epoch": 0.6978506877269234, "grad_norm": 65.0, "kl": 0.9607677459716797, "learning_rate": 5e-07, "logits/chosen": -10930196.0, "logits/rejected": -34908984.0, "logps/chosen": -147.1620330810547, "logps/rejected": -376.4122314453125, "loss": 0.3621, "rewards/chosen": -0.21572944521903992, "rewards/margins": 1.616801530122757, "rewards/rejected": -1.8325309753417969, "step": 13166 }, { "epoch": 0.6979036917287256, "grad_norm": 50.0, "kl": 0.6577415466308594, "learning_rate": 5e-07, "logits/chosen": 23511660.0, "logits/rejected": 70867096.0, "logps/chosen": -247.81021118164062, "logps/rejected": -532.920166015625, "loss": 0.3488, "rewards/chosen": -0.22730770707130432, "rewards/margins": 2.9796405732631683, "rewards/rejected": -3.2069482803344727, "step": 13167 }, { "epoch": 0.6979566957305277, "grad_norm": 50.25, "kl": 1.3145408630371094, "learning_rate": 5e-07, "logits/chosen": -45666960.0, "logits/rejected": -71935568.0, "logps/chosen": -578.040771484375, "logps/rejected": -838.3857421875, "loss": 0.2147, "rewards/chosen": 0.47137656807899475, "rewards/margins": 3.435382753610611, "rewards/rejected": -2.964006185531616, "step": 13168 }, { "epoch": 0.6980096997323297, "grad_norm": 49.0, "kl": 1.6048583984375, "learning_rate": 5e-07, "logits/chosen": -77698442.66666667, "logits/rejected": 3057806.0, "logps/chosen": -243.27392578125, "logps/rejected": -167.18121337890625, "loss": 0.311, "rewards/chosen": 0.794681708017985, "rewards/margins": 2.20814839998881, "rewards/rejected": -1.4134666919708252, "step": 13169 }, { "epoch": 0.6980627037341319, "grad_norm": 46.5, "kl": 0.7648544311523438, "learning_rate": 5e-07, "logits/chosen": -27124248.0, "logits/rejected": 49468900.0, "logps/chosen": -345.3668212890625, "logps/rejected": -345.1494445800781, "loss": 0.338, "rewards/chosen": 0.8269103169441223, "rewards/margins": 2.069318115711212, "rewards/rejected": -1.2424077987670898, "step": 13170 }, { "epoch": 0.698115707735934, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24916708.0, "logits/rejected": -25675952.0, "logps/chosen": -323.9512023925781, "logps/rejected": -287.6622314453125, "loss": 0.2064, "rewards/chosen": 0.8994286060333252, "rewards/margins": 4.220797300338745, "rewards/rejected": -3.32136869430542, "step": 13171 }, { "epoch": 0.6981687117377362, "grad_norm": 68.5, "kl": 4.330284118652344, "learning_rate": 5e-07, "logits/chosen": -47443676.8, "logits/rejected": -38558544.0, "logps/chosen": -486.56796875, "logps/rejected": -324.74355061848956, "loss": 0.3282, "rewards/chosen": 0.7753180503845215, "rewards/margins": 3.297308031717936, "rewards/rejected": -2.5219899813334146, "step": 13172 }, { "epoch": 0.6982217157395383, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38065568.0, "logits/rejected": -16396437.333333334, "logps/chosen": -377.927392578125, "logps/rejected": -459.6986490885417, "loss": 0.2823, "rewards/chosen": 0.598492431640625, "rewards/margins": 3.5044261296590173, "rewards/rejected": -2.905933698018392, "step": 13173 }, { "epoch": 0.6982747197413405, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1795322.6666666667, "logits/rejected": -61891852.8, "logps/chosen": -244.4442138671875, "logps/rejected": -235.57255859375, "loss": 0.3169, "rewards/chosen": 0.37414395809173584, "rewards/margins": 1.5085123777389526, "rewards/rejected": -1.1343684196472168, "step": 13174 }, { "epoch": 0.6983277237431426, "grad_norm": 45.0, "kl": 0.69635009765625, "learning_rate": 5e-07, "logits/chosen": -27954560.0, "logits/rejected": -76655952.0, "logps/chosen": -362.738720703125, "logps/rejected": -101.47196451822917, "loss": 0.3516, "rewards/chosen": 0.28056395053863525, "rewards/margins": 2.1269413232803345, "rewards/rejected": -1.8463773727416992, "step": 13175 }, { "epoch": 0.6983807277449448, "grad_norm": 46.75, "kl": 2.4672622680664062, "learning_rate": 5e-07, "logits/chosen": -26500968.0, "logits/rejected": -18175666.0, "logps/chosen": -325.26165771484375, "logps/rejected": -198.99334716796875, "loss": 0.2993, "rewards/chosen": 0.6758190393447876, "rewards/margins": 2.29931902885437, "rewards/rejected": -1.6234999895095825, "step": 13176 }, { "epoch": 0.6984337317467468, "grad_norm": 31.0, "kl": 0.8450927734375, "learning_rate": 5e-07, "logits/chosen": -18162206.0, "logits/rejected": -28106300.0, "logps/chosen": -109.38160705566406, "logps/rejected": -120.95297241210938, "loss": 0.4023, "rewards/chosen": -0.1502881944179535, "rewards/margins": 1.4200041592121124, "rewards/rejected": -1.570292353630066, "step": 13177 }, { "epoch": 0.698486735748549, "grad_norm": 39.5, "kl": 5.284828186035156, "learning_rate": 5e-07, "logits/chosen": -24855050.666666668, "logits/rejected": -20353712.0, "logps/chosen": -360.0735677083333, "logps/rejected": -530.77880859375, "loss": 0.2271, "rewards/chosen": 0.3967554569244385, "rewards/margins": 5.255090570449829, "rewards/rejected": -4.85833511352539, "step": 13178 }, { "epoch": 0.6985397397503511, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 802382.3333333334, "logits/rejected": -8600158.4, "logps/chosen": -99.29344685872395, "logps/rejected": -170.37872314453125, "loss": 0.2541, "rewards/chosen": 0.6077117522557577, "rewards/margins": 2.4343398650487265, "rewards/rejected": -1.8266281127929687, "step": 13179 }, { "epoch": 0.6985927437521533, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40448004.0, "logits/rejected": -26476370.666666668, "logps/chosen": -253.573974609375, "logps/rejected": -231.16080729166666, "loss": 0.1705, "rewards/chosen": 0.23064881563186646, "rewards/margins": 3.4188682039578757, "rewards/rejected": -3.1882193883260093, "step": 13180 }, { "epoch": 0.6986457477539554, "grad_norm": 45.0, "kl": 1.2183036804199219, "learning_rate": 5e-07, "logits/chosen": -11145585.6, "logits/rejected": -11830884.0, "logps/chosen": -188.22669677734376, "logps/rejected": -310.0242513020833, "loss": 0.3451, "rewards/chosen": 0.16570926904678346, "rewards/margins": 2.43117341597875, "rewards/rejected": -2.2654641469319663, "step": 13181 }, { "epoch": 0.6986987517557576, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15334915.0, "logits/rejected": -48566568.0, "logps/chosen": -375.7650146484375, "logps/rejected": -551.4485473632812, "loss": 0.1722, "rewards/chosen": 1.0199098587036133, "rewards/margins": 4.090427398681641, "rewards/rejected": -3.0705175399780273, "step": 13182 }, { "epoch": 0.6987517557575597, "grad_norm": 53.5, "kl": 0.6587104797363281, "learning_rate": 5e-07, "logits/chosen": -24189081.6, "logits/rejected": -36136053.333333336, "logps/chosen": -267.934765625, "logps/rejected": -295.89040120442706, "loss": 0.432, "rewards/chosen": -0.22685136795043945, "rewards/margins": 1.390573279062907, "rewards/rejected": -1.6174246470133464, "step": 13183 }, { "epoch": 0.6988047597593618, "grad_norm": 51.75, "kl": 1.8045120239257812, "learning_rate": 5e-07, "logits/chosen": 4481792.857142857, "logits/rejected": -57801560.0, "logps/chosen": -170.26398577008928, "logps/rejected": -131.62615966796875, "loss": 0.4597, "rewards/chosen": 0.21455161912100656, "rewards/margins": 1.9302864585603987, "rewards/rejected": -1.715734839439392, "step": 13184 }, { "epoch": 0.6988577637611639, "grad_norm": 32.0, "kl": 3.20819091796875, "learning_rate": 5e-07, "logits/chosen": 978321.6, "logits/rejected": -75445514.66666667, "logps/chosen": -190.87794189453126, "logps/rejected": -637.7942708333334, "loss": 0.2962, "rewards/chosen": 0.6601542949676513, "rewards/margins": 3.4017638683319094, "rewards/rejected": -2.741609573364258, "step": 13185 }, { "epoch": 0.6989107677629661, "grad_norm": 47.5, "kl": 1.4188880920410156, "learning_rate": 5e-07, "logits/chosen": 32072160.0, "logits/rejected": -13421345.6, "logps/chosen": -299.4266764322917, "logps/rejected": -130.7621337890625, "loss": 0.2756, "rewards/chosen": 0.5979146162668864, "rewards/margins": 1.7171083609263103, "rewards/rejected": -1.1191937446594238, "step": 13186 }, { "epoch": 0.6989637717647682, "grad_norm": 41.0, "kl": 1.6566123962402344, "learning_rate": 5e-07, "logits/chosen": -28764966.4, "logits/rejected": -41160410.666666664, "logps/chosen": -414.437890625, "logps/rejected": -359.4012044270833, "loss": 0.2194, "rewards/chosen": 1.3434413909912108, "rewards/margins": 5.0317532857259115, "rewards/rejected": -3.6883118947347007, "step": 13187 }, { "epoch": 0.6990167757665704, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31321053.333333332, "logits/rejected": -42589696.0, "logps/chosen": -205.2955322265625, "logps/rejected": -588.117822265625, "loss": 0.2461, "rewards/chosen": 0.17035281658172607, "rewards/margins": 2.8882412672042848, "rewards/rejected": -2.7178884506225587, "step": 13188 }, { "epoch": 0.6990697797683725, "grad_norm": 39.0, "kl": 0.27487945556640625, "learning_rate": 5e-07, "logits/chosen": -25299312.0, "logits/rejected": -51021593.6, "logps/chosen": -207.87418619791666, "logps/rejected": -240.9516845703125, "loss": 0.2103, "rewards/chosen": 0.5579453706741333, "rewards/margins": 3.1712897539138796, "rewards/rejected": -2.6133443832397463, "step": 13189 }, { "epoch": 0.6991227837701747, "grad_norm": 31.625, "kl": 2.2866859436035156, "learning_rate": 5e-07, "logits/chosen": -43237776.0, "logits/rejected": -4545637.5, "logps/chosen": -714.1851806640625, "logps/rejected": -182.04933166503906, "loss": 0.2863, "rewards/chosen": 1.1557432413101196, "rewards/margins": 3.4969063997268677, "rewards/rejected": -2.341163158416748, "step": 13190 }, { "epoch": 0.6991757877719768, "grad_norm": 79.0, "kl": 2.6781234741210938, "learning_rate": 5e-07, "logits/chosen": -12582920.0, "logits/rejected": -24870536.0, "logps/chosen": -378.293896484375, "logps/rejected": -331.9178059895833, "loss": 0.3065, "rewards/chosen": 0.4261288642883301, "rewards/margins": 2.9151903788248696, "rewards/rejected": -2.4890615145365396, "step": 13191 }, { "epoch": 0.699228791773779, "grad_norm": 53.0, "kl": 2.6526403427124023, "learning_rate": 5e-07, "logits/chosen": -81002410.66666667, "logits/rejected": -38627801.6, "logps/chosen": -639.7786458333334, "logps/rejected": -278.563525390625, "loss": 0.264, "rewards/chosen": 0.528156558672587, "rewards/margins": 3.1960404316584268, "rewards/rejected": -2.66788387298584, "step": 13192 }, { "epoch": 0.699281795775581, "grad_norm": 51.25, "kl": 0.20234394073486328, "learning_rate": 5e-07, "logits/chosen": 13711711.0, "logits/rejected": -14221449.333333334, "logps/chosen": -148.16012573242188, "logps/rejected": -133.758544921875, "loss": 0.2889, "rewards/chosen": -0.6248397827148438, "rewards/margins": 1.4459409713745117, "rewards/rejected": -2.0707807540893555, "step": 13193 }, { "epoch": 0.6993347997773832, "grad_norm": 44.25, "kl": 0.3062705993652344, "learning_rate": 5e-07, "logits/chosen": -34221890.666666664, "logits/rejected": -57137606.4, "logps/chosen": -307.6026611328125, "logps/rejected": -448.63486328125, "loss": 0.2067, "rewards/chosen": 0.4976557493209839, "rewards/margins": 3.341405749320984, "rewards/rejected": -2.84375, "step": 13194 }, { "epoch": 0.6993878037791853, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37511912.0, "logits/rejected": -14989293.714285715, "logps/chosen": -325.89825439453125, "logps/rejected": -207.24124581473214, "loss": 0.1814, "rewards/chosen": 0.5775116086006165, "rewards/margins": 2.7165262613977705, "rewards/rejected": -2.139014652797154, "step": 13195 }, { "epoch": 0.6994408077809875, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39181732.0, "logits/rejected": -13992384.0, "logps/chosen": -236.17698669433594, "logps/rejected": -221.8099365234375, "loss": 0.228, "rewards/chosen": -0.22616520524024963, "rewards/margins": 2.1076551377773285, "rewards/rejected": -2.333820343017578, "step": 13196 }, { "epoch": 0.6994938117827896, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 594734.0, "logits/rejected": -36779500.0, "logps/chosen": -198.86602783203125, "logps/rejected": -380.1622619628906, "loss": 0.33, "rewards/chosen": 0.6327399412790934, "rewards/margins": 2.815044323603312, "rewards/rejected": -2.1823043823242188, "step": 13197 }, { "epoch": 0.6995468157845918, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26729452.0, "logits/rejected": -18781464.0, "logps/chosen": -266.75921630859375, "logps/rejected": -181.7919464111328, "loss": 0.3193, "rewards/chosen": 0.26654431223869324, "rewards/margins": 1.9004014432430267, "rewards/rejected": -1.6338571310043335, "step": 13198 }, { "epoch": 0.6995998197863939, "grad_norm": 70.0, "kl": 2.4459190368652344, "learning_rate": 5e-07, "logits/chosen": 42299236.571428575, "logits/rejected": 9060693.0, "logps/chosen": -411.54268973214283, "logps/rejected": -330.0641174316406, "loss": 0.4242, "rewards/chosen": 0.45527325357709614, "rewards/margins": 2.9693418570927212, "rewards/rejected": -2.514068603515625, "step": 13199 }, { "epoch": 0.699652823788196, "grad_norm": 69.0, "kl": 2.2244205474853516, "learning_rate": 5e-07, "logits/chosen": -16729273.6, "logits/rejected": -12571445.333333334, "logps/chosen": -108.174169921875, "logps/rejected": -298.89821370442706, "loss": 0.3418, "rewards/chosen": 0.26973743438720704, "rewards/margins": 2.371008427937826, "rewards/rejected": -2.1012709935506186, "step": 13200 }, { "epoch": 0.6997058277899981, "grad_norm": 24.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4970104.0, "logits/rejected": -43007995.428571425, "logps/chosen": -13.86790657043457, "logps/rejected": -520.5032784598214, "loss": 0.1431, "rewards/chosen": -0.03862934187054634, "rewards/margins": 3.0384716843920097, "rewards/rejected": -3.077101026262556, "step": 13201 }, { "epoch": 0.6997588317918003, "grad_norm": 55.0, "kl": 1.680130958557129, "learning_rate": 5e-07, "logits/chosen": -13648444.0, "logits/rejected": -397440.5, "logps/chosen": -235.66458129882812, "logps/rejected": -87.78306579589844, "loss": 0.2781, "rewards/chosen": 1.861479640007019, "rewards/margins": 2.5271241068840027, "rewards/rejected": -0.6656444668769836, "step": 13202 }, { "epoch": 0.6998118357936024, "grad_norm": 41.5, "kl": 7.639934539794922, "learning_rate": 5e-07, "logits/chosen": -17950844.8, "logits/rejected": -39736600.0, "logps/chosen": -682.69794921875, "logps/rejected": -409.8201090494792, "loss": 0.2625, "rewards/chosen": 1.4852253913879394, "rewards/margins": 4.6059189160664875, "rewards/rejected": -3.1206935246785483, "step": 13203 }, { "epoch": 0.6998648397954046, "grad_norm": 57.25, "kl": 1.8770408630371094, "learning_rate": 5e-07, "logits/chosen": -39818355.2, "logits/rejected": -39997189.333333336, "logps/chosen": -259.581689453125, "logps/rejected": -334.50966389973956, "loss": 0.3394, "rewards/chosen": 0.24424660205841064, "rewards/margins": 3.410435160001119, "rewards/rejected": -3.1661885579427085, "step": 13204 }, { "epoch": 0.6999178437972067, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49974457.6, "logits/rejected": -17855653.333333332, "logps/chosen": -368.87080078125, "logps/rejected": -357.624267578125, "loss": 0.2878, "rewards/chosen": 0.5562329292297363, "rewards/margins": 4.113575776418051, "rewards/rejected": -3.557342847188314, "step": 13205 }, { "epoch": 0.6999708477990089, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32933844.0, "logits/rejected": -12011284.0, "logps/chosen": -346.2639465332031, "logps/rejected": -270.9838460286458, "loss": 0.2603, "rewards/chosen": -0.06152801960706711, "rewards/margins": 1.8420520101984341, "rewards/rejected": -1.9035800298055012, "step": 13206 }, { "epoch": 0.700023851800811, "grad_norm": 39.25, "kl": 3.0529518127441406, "learning_rate": 5e-07, "logits/chosen": -50072150.4, "logits/rejected": -21612838.666666668, "logps/chosen": -171.22369384765625, "logps/rejected": -602.1808675130209, "loss": 0.3977, "rewards/chosen": -0.06647535562515258, "rewards/margins": 3.7791826287905375, "rewards/rejected": -3.84565798441569, "step": 13207 }, { "epoch": 0.7000768558026131, "grad_norm": 37.5, "kl": 0.23193740844726562, "learning_rate": 5e-07, "logits/chosen": -18297488.0, "logits/rejected": -36768165.333333336, "logps/chosen": -492.5513671875, "logps/rejected": -645.77001953125, "loss": 0.2425, "rewards/chosen": 0.9577778816223145, "rewards/margins": 5.763837718963623, "rewards/rejected": -4.806059837341309, "step": 13208 }, { "epoch": 0.7001298598044152, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33477893.333333332, "logits/rejected": -8312327.2, "logps/chosen": -322.5295003255208, "logps/rejected": -100.40535888671874, "loss": 0.285, "rewards/chosen": 0.2646471858024597, "rewards/margins": 2.307768905162811, "rewards/rejected": -2.0431217193603515, "step": 13209 }, { "epoch": 0.7001828638062174, "grad_norm": 64.0, "kl": 1.4610671997070312, "learning_rate": 5e-07, "logits/chosen": -16310896.0, "logits/rejected": -1680060.0, "logps/chosen": -513.254638671875, "logps/rejected": -218.45367431640625, "loss": 0.3832, "rewards/chosen": 0.20517733097076415, "rewards/margins": 1.8187620083491007, "rewards/rejected": -1.6135846773783367, "step": 13210 }, { "epoch": 0.7002358678080195, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3480359.0, "logits/rejected": 3640762.3333333335, "logps/chosen": -64.69219207763672, "logps/rejected": -172.0396728515625, "loss": 0.2565, "rewards/chosen": -0.40357181429862976, "rewards/margins": 1.9406416714191437, "rewards/rejected": -2.3442134857177734, "step": 13211 }, { "epoch": 0.7002888718098217, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8591746.0, "logits/rejected": -26648426.666666668, "logps/chosen": -345.4312744140625, "logps/rejected": -278.64687093098956, "loss": 0.1532, "rewards/chosen": 1.551387071609497, "rewards/margins": 3.729731639226278, "rewards/rejected": -2.1783445676167807, "step": 13212 }, { "epoch": 0.7003418758116238, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43169542.4, "logits/rejected": -20373557.333333332, "logps/chosen": -298.2909912109375, "logps/rejected": -323.8296305338542, "loss": 0.3123, "rewards/chosen": 0.34271013736724854, "rewards/margins": 3.3119515975316367, "rewards/rejected": -2.969241460164388, "step": 13213 }, { "epoch": 0.700394879813426, "grad_norm": 36.25, "kl": 6.0417070388793945, "learning_rate": 5e-07, "logits/chosen": -599995.75, "logits/rejected": -19026972.0, "logps/chosen": -62.76832071940104, "logps/rejected": -246.2543182373047, "loss": 0.3985, "rewards/chosen": 0.5532519817352295, "rewards/margins": 3.2778451442718506, "rewards/rejected": -2.724593162536621, "step": 13214 }, { "epoch": 0.700447883815228, "grad_norm": 58.0, "kl": 1.9623565673828125, "learning_rate": 5e-07, "logits/chosen": -26515178.666666668, "logits/rejected": 55945.0, "logps/chosen": -365.9144694010417, "logps/rejected": -308.947705078125, "loss": 0.2459, "rewards/chosen": 0.49948068459828693, "rewards/margins": 2.527699335416158, "rewards/rejected": -2.028218650817871, "step": 13215 }, { "epoch": 0.7005008878170302, "grad_norm": 28.75, "kl": 0.117706298828125, "learning_rate": 5e-07, "logits/chosen": -27892564.0, "logits/rejected": -12797936.0, "logps/chosen": -251.58285522460938, "logps/rejected": -343.5096435546875, "loss": 0.1359, "rewards/chosen": 0.8498789072036743, "rewards/margins": 3.7942529916763306, "rewards/rejected": -2.9443740844726562, "step": 13216 }, { "epoch": 0.7005538918188323, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12921142.0, "logits/rejected": -13816833.0, "logps/chosen": -308.854248046875, "logps/rejected": -237.26309204101562, "loss": 0.3076, "rewards/chosen": 0.30683937668800354, "rewards/margins": 2.2905243933200836, "rewards/rejected": -1.98368501663208, "step": 13217 }, { "epoch": 0.7006068958206345, "grad_norm": 49.0, "kl": 1.4711360931396484, "learning_rate": 5e-07, "logits/chosen": -25301244.8, "logits/rejected": -35468405.333333336, "logps/chosen": -475.527734375, "logps/rejected": -640.7103678385416, "loss": 0.3049, "rewards/chosen": 0.5682863712310791, "rewards/margins": 3.38505056699117, "rewards/rejected": -2.8167641957600913, "step": 13218 }, { "epoch": 0.7006598998224366, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32864324.0, "logits/rejected": -20162918.0, "logps/chosen": -364.36309814453125, "logps/rejected": -398.0335388183594, "loss": 0.1273, "rewards/chosen": 1.4507309198379517, "rewards/margins": 5.520875573158264, "rewards/rejected": -4.0701446533203125, "step": 13219 }, { "epoch": 0.7007129038242387, "grad_norm": 47.0, "kl": 1.1565990447998047, "learning_rate": 5e-07, "logits/chosen": -30483682.666666668, "logits/rejected": -63930072.0, "logps/chosen": -347.6765543619792, "logps/rejected": -472.00390625, "loss": 0.3001, "rewards/chosen": 0.6666316986083984, "rewards/margins": 4.0022361278533936, "rewards/rejected": -3.335604429244995, "step": 13220 }, { "epoch": 0.7007659078260409, "grad_norm": 52.5, "kl": 10.913875579833984, "learning_rate": 5e-07, "logits/chosen": -29574045.333333332, "logits/rejected": -23563676.0, "logps/chosen": -444.1638997395833, "logps/rejected": -109.53929901123047, "loss": 0.3963, "rewards/chosen": 1.0857517719268799, "rewards/margins": 1.7099533677101135, "rewards/rejected": -0.6242015957832336, "step": 13221 }, { "epoch": 0.700818911827843, "grad_norm": 46.25, "kl": 3.1964187622070312, "learning_rate": 5e-07, "logits/chosen": -7339093.0, "logits/rejected": -10960507.0, "logps/chosen": -259.9136962890625, "logps/rejected": -171.95359802246094, "loss": 0.3195, "rewards/chosen": 0.35760945081710815, "rewards/margins": 2.5326642394065857, "rewards/rejected": -2.1750547885894775, "step": 13222 }, { "epoch": 0.7008719158296451, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2875500.0, "logits/rejected": -29032203.42857143, "logps/chosen": -20.08788299560547, "logps/rejected": -397.9025181361607, "loss": 0.1494, "rewards/chosen": 1.787200927734375, "rewards/margins": 4.175332205636161, "rewards/rejected": -2.3881312779017856, "step": 13223 }, { "epoch": 0.7009249198314472, "grad_norm": 45.0, "kl": 2.0961227416992188, "learning_rate": 5e-07, "logits/chosen": -23812008.0, "logits/rejected": -66762085.333333336, "logps/chosen": -283.4246826171875, "logps/rejected": -475.9268798828125, "loss": 0.2585, "rewards/chosen": 1.107851505279541, "rewards/margins": 4.260121186574301, "rewards/rejected": -3.1522696812947593, "step": 13224 }, { "epoch": 0.7009779238332494, "grad_norm": 44.0, "kl": 1.5038127899169922, "learning_rate": 5e-07, "logits/chosen": -31736424.0, "logits/rejected": -13818844.0, "logps/chosen": -488.8282063802083, "logps/rejected": -464.22552490234375, "loss": 0.3004, "rewards/chosen": 0.9644261995951334, "rewards/margins": 4.228785435358684, "rewards/rejected": -3.26435923576355, "step": 13225 }, { "epoch": 0.7010309278350515, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11744316.0, "logits/rejected": -45533732.571428575, "logps/chosen": -418.79486083984375, "logps/rejected": -433.8946010044643, "loss": 0.1762, "rewards/chosen": 1.4890350103378296, "rewards/margins": 3.614205036844526, "rewards/rejected": -2.1251700265066966, "step": 13226 }, { "epoch": 0.7010839318368537, "grad_norm": 46.0, "kl": 1.1845722198486328, "learning_rate": 5e-07, "logits/chosen": -23065565.333333332, "logits/rejected": -41053606.4, "logps/chosen": -271.128173828125, "logps/rejected": -352.069677734375, "loss": 0.2455, "rewards/chosen": 0.675830602645874, "rewards/margins": 3.143827199935913, "rewards/rejected": -2.467996597290039, "step": 13227 }, { "epoch": 0.7011369358386558, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5867550.5, "logits/rejected": -33649878.85714286, "logps/chosen": -414.8902587890625, "logps/rejected": -283.42196219308033, "loss": 0.1665, "rewards/chosen": -0.300637811422348, "rewards/margins": 2.085130806480135, "rewards/rejected": -2.385768617902483, "step": 13228 }, { "epoch": 0.701189939840458, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11577339.0, "logits/rejected": -12483494.666666666, "logps/chosen": -401.9584045410156, "logps/rejected": -266.03936767578125, "loss": 0.1602, "rewards/chosen": 0.1928108036518097, "rewards/margins": 3.491897056500117, "rewards/rejected": -3.299086252848307, "step": 13229 }, { "epoch": 0.70124294384226, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -179697376.0, "logits/rejected": -28481675.42857143, "logps/chosen": -533.3680419921875, "logps/rejected": -444.28055245535717, "loss": 0.114, "rewards/chosen": 0.013439941219985485, "rewards/margins": 3.1454028536432554, "rewards/rejected": -3.13196291242327, "step": 13230 }, { "epoch": 0.7012959478440622, "grad_norm": 58.25, "kl": 0.7613029479980469, "learning_rate": 5e-07, "logits/chosen": 2850565.75, "logits/rejected": -42782848.0, "logps/chosen": -395.32440185546875, "logps/rejected": -368.8196716308594, "loss": 0.3087, "rewards/chosen": 0.08019523322582245, "rewards/margins": 2.231568619608879, "rewards/rejected": -2.1513733863830566, "step": 13231 }, { "epoch": 0.7013489518458643, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29103253.333333332, "logits/rejected": -31262584.0, "logps/chosen": -243.0677693684896, "logps/rejected": -264.71722412109375, "loss": 0.3388, "rewards/chosen": 0.2875955104827881, "rewards/margins": 3.8009462356567383, "rewards/rejected": -3.51335072517395, "step": 13232 }, { "epoch": 0.7014019558476665, "grad_norm": 44.75, "kl": 1.034423828125, "learning_rate": 5e-07, "logits/chosen": 1582094.25, "logits/rejected": -7635757.0, "logps/chosen": -474.30633544921875, "logps/rejected": -204.87599182128906, "loss": 0.2479, "rewards/chosen": 0.9772979021072388, "rewards/margins": 2.650938630104065, "rewards/rejected": -1.6736407279968262, "step": 13233 }, { "epoch": 0.7014549598494686, "grad_norm": 41.75, "kl": 0.9295730590820312, "learning_rate": 5e-07, "logits/chosen": -38572100.0, "logits/rejected": 200937456.0, "logps/chosen": -281.1327209472656, "logps/rejected": -178.00747680664062, "loss": 0.2752, "rewards/chosen": 0.039923325181007385, "rewards/margins": 3.434460774064064, "rewards/rejected": -3.3945374488830566, "step": 13234 }, { "epoch": 0.7015079638512708, "grad_norm": 33.75, "kl": 6.684309005737305, "learning_rate": 5e-07, "logits/chosen": -14432126.666666666, "logits/rejected": -34379372.0, "logps/chosen": -247.3409220377604, "logps/rejected": -472.3580322265625, "loss": 0.3612, "rewards/chosen": 0.9413742224375407, "rewards/margins": 3.5620316664377847, "rewards/rejected": -2.620657444000244, "step": 13235 }, { "epoch": 0.7015609678530729, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54221768.0, "logps/chosen": -306.8404846191406, "loss": 0.4605, "rewards/chosen": 0.2566632032394409, "step": 13236 }, { "epoch": 0.7016139718548751, "grad_norm": 56.5, "kl": 0.7325859069824219, "learning_rate": 5e-07, "logits/chosen": -76919280.0, "logits/rejected": -3301269.0, "logps/chosen": -318.9027099609375, "logps/rejected": -323.5640462239583, "loss": 0.2501, "rewards/chosen": 0.061074838042259216, "rewards/margins": 2.044440532724063, "rewards/rejected": -1.9833656946818035, "step": 13237 }, { "epoch": 0.7016669758566771, "grad_norm": 49.25, "kl": 0.6443519592285156, "learning_rate": 5e-07, "logits/chosen": -5177794.0, "logits/rejected": -59531000.0, "logps/chosen": -742.1372680664062, "logps/rejected": -289.760498046875, "loss": 0.3222, "rewards/chosen": 0.4130552411079407, "rewards/margins": 2.320113956928253, "rewards/rejected": -1.9070587158203125, "step": 13238 }, { "epoch": 0.7017199798584793, "grad_norm": 57.75, "kl": 3.460103988647461, "learning_rate": 5e-07, "logits/chosen": 20547868.0, "logits/rejected": -21844488.0, "logps/chosen": -277.11212158203125, "logps/rejected": -361.4020690917969, "loss": 0.309, "rewards/chosen": 0.7836002508799235, "rewards/margins": 2.192615191141764, "rewards/rejected": -1.4090149402618408, "step": 13239 }, { "epoch": 0.7017729838602814, "grad_norm": 56.75, "kl": 3.3852243423461914, "learning_rate": 5e-07, "logits/chosen": -21418416.0, "logits/rejected": -19744084.0, "logps/chosen": -210.89222717285156, "logps/rejected": -243.73187255859375, "loss": 0.3681, "rewards/chosen": 0.3506205081939697, "rewards/margins": 1.334061622619629, "rewards/rejected": -0.9834411144256592, "step": 13240 }, { "epoch": 0.7018259878620836, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14657884.0, "logits/rejected": -11895884.0, "logps/chosen": -386.53564453125, "logps/rejected": -231.0916290283203, "loss": 0.199, "rewards/chosen": 0.8668273687362671, "rewards/margins": 3.951586365699768, "rewards/rejected": -3.084758996963501, "step": 13241 }, { "epoch": 0.7018789918638857, "grad_norm": 46.0, "kl": 5.776313781738281, "learning_rate": 5e-07, "logits/chosen": -20250224.0, "logits/rejected": -30836858.0, "logps/chosen": -178.09405517578125, "logps/rejected": -263.2073974609375, "loss": 0.4287, "rewards/chosen": 0.4540393352508545, "rewards/margins": 2.2968517541885376, "rewards/rejected": -1.842812418937683, "step": 13242 }, { "epoch": 0.7019319958656879, "grad_norm": 60.25, "kl": 2.455050468444824, "learning_rate": 5e-07, "logits/chosen": -34950368.0, "logits/rejected": -6938921.6, "logps/chosen": -250.91194661458334, "logps/rejected": -295.3274658203125, "loss": 0.2648, "rewards/chosen": 1.1182864507039387, "rewards/margins": 2.7059083302815754, "rewards/rejected": -1.5876218795776367, "step": 13243 }, { "epoch": 0.70198499986749, "grad_norm": 41.25, "kl": 1.2412443161010742, "learning_rate": 5e-07, "logits/chosen": -55357364.0, "logits/rejected": -446401.6875, "logps/chosen": -137.51788330078125, "logps/rejected": -115.4139404296875, "loss": 0.3051, "rewards/chosen": 0.32234710454940796, "rewards/margins": 2.1705140471458435, "rewards/rejected": -1.8481669425964355, "step": 13244 }, { "epoch": 0.7020380038692922, "grad_norm": 59.5, "kl": 1.6814956665039062, "learning_rate": 5e-07, "logits/chosen": -30045155.2, "logits/rejected": -45411546.666666664, "logps/chosen": -275.101708984375, "logps/rejected": -586.7250162760416, "loss": 0.3532, "rewards/chosen": 0.17518287897109985, "rewards/margins": 3.776236116886139, "rewards/rejected": -3.601053237915039, "step": 13245 }, { "epoch": 0.7020910078710942, "grad_norm": 37.5, "kl": 0.3077430725097656, "learning_rate": 5e-07, "logits/chosen": -4121824.0, "logits/rejected": -18818261.333333332, "logps/chosen": -183.56552734375, "logps/rejected": -469.1015625, "loss": 0.2575, "rewards/chosen": 0.6618619441986084, "rewards/margins": 4.04120020866394, "rewards/rejected": -3.379338264465332, "step": 13246 }, { "epoch": 0.7021440118728964, "grad_norm": 59.25, "kl": 0.42021942138671875, "learning_rate": 5e-07, "logits/chosen": -67103624.0, "logits/rejected": -21075346.285714287, "logps/chosen": -552.34033203125, "logps/rejected": -312.0931919642857, "loss": 0.2026, "rewards/chosen": 1.3336060047149658, "rewards/margins": 3.2751750605446954, "rewards/rejected": -1.9415690558297294, "step": 13247 }, { "epoch": 0.7021970158746985, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20764601.333333332, "logits/rejected": -10301190.4, "logps/chosen": -493.65185546875, "logps/rejected": -186.40743408203124, "loss": 0.2262, "rewards/chosen": 0.9870905876159668, "rewards/margins": 2.7117199897766113, "rewards/rejected": -1.7246294021606445, "step": 13248 }, { "epoch": 0.7022500198765007, "grad_norm": 40.25, "kl": 0.8541526794433594, "learning_rate": 5e-07, "logits/chosen": -34980362.666666664, "logits/rejected": -64227321.6, "logps/chosen": -168.97064208984375, "logps/rejected": -298.688623046875, "loss": 0.2749, "rewards/chosen": 0.3269987901051839, "rewards/margins": 1.9623534043629964, "rewards/rejected": -1.6353546142578126, "step": 13249 }, { "epoch": 0.7023030238783028, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64106084.0, "logits/rejected": -9981846.0, "logps/chosen": -231.7814483642578, "logps/rejected": -280.84983317057294, "loss": 0.2033, "rewards/chosen": -0.1158042922616005, "rewards/margins": 2.4945518796642623, "rewards/rejected": -2.610356171925863, "step": 13250 }, { "epoch": 0.702356027880105, "grad_norm": 67.0, "kl": 2.9184818267822266, "learning_rate": 5e-07, "logits/chosen": -32712979.2, "logits/rejected": -23612418.666666668, "logps/chosen": -165.2343017578125, "logps/rejected": -362.197265625, "loss": 0.4142, "rewards/chosen": 0.26917502880096433, "rewards/margins": 1.911812122662862, "rewards/rejected": -1.6426370938618977, "step": 13251 }, { "epoch": 0.7024090318819071, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7776138.5, "logits/rejected": -35404256.0, "logps/chosen": -355.586181640625, "logps/rejected": -224.18416341145834, "loss": 0.1613, "rewards/chosen": 1.5521515607833862, "rewards/margins": 3.944580594698588, "rewards/rejected": -2.3924290339152017, "step": 13252 }, { "epoch": 0.7024620358837093, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83307448.0, "logits/rejected": -36528260.0, "logps/chosen": -343.11419677734375, "logps/rejected": -342.8973083496094, "loss": 0.3268, "rewards/chosen": -0.24213677644729614, "rewards/margins": 2.1576690077781677, "rewards/rejected": -2.399805784225464, "step": 13253 }, { "epoch": 0.7025150398855113, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15095393.6, "logits/rejected": -33785112.0, "logps/chosen": -672.291064453125, "logps/rejected": -284.037841796875, "loss": 0.1696, "rewards/chosen": 1.179153823852539, "rewards/margins": 5.233937327067057, "rewards/rejected": -4.0547835032145185, "step": 13254 }, { "epoch": 0.7025680438873135, "grad_norm": 49.75, "kl": 4.034809112548828, "learning_rate": 5e-07, "logits/chosen": -30482300.8, "logits/rejected": -13890765.333333334, "logps/chosen": -140.57529296875, "logps/rejected": -194.14599609375, "loss": 0.379, "rewards/chosen": 0.2727493762969971, "rewards/margins": 3.8787464618682863, "rewards/rejected": -3.605997085571289, "step": 13255 }, { "epoch": 0.7026210478891156, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23201532.0, "logits/rejected": -26904245.333333332, "logps/chosen": -255.26983642578125, "logps/rejected": -243.469970703125, "loss": 0.2703, "rewards/chosen": 0.13295325636863708, "rewards/margins": 1.7948421736558278, "rewards/rejected": -1.6618889172871907, "step": 13256 }, { "epoch": 0.7026740518909178, "grad_norm": 46.5, "kl": 2.643711805343628, "learning_rate": 5e-07, "logits/chosen": -81856006.4, "logits/rejected": -48373962.666666664, "logps/chosen": -479.675439453125, "logps/rejected": -449.5846761067708, "loss": 0.2567, "rewards/chosen": 0.8407345771789551, "rewards/margins": 4.912197844187419, "rewards/rejected": -4.071463267008464, "step": 13257 }, { "epoch": 0.7027270558927199, "grad_norm": 40.75, "kl": 3.4634342193603516, "learning_rate": 5e-07, "logits/chosen": -4022102.0, "logits/rejected": -1100635.6666666667, "logps/chosen": -212.1310546875, "logps/rejected": -42.408182779947914, "loss": 0.4, "rewards/chosen": 0.7537096977233887, "rewards/margins": 1.6310330073038737, "rewards/rejected": -0.877323309580485, "step": 13258 }, { "epoch": 0.7027800598945221, "grad_norm": 40.25, "kl": 0.6977968215942383, "learning_rate": 5e-07, "logits/chosen": -24302733.333333332, "logits/rejected": -15122393.6, "logps/chosen": -228.488525390625, "logps/rejected": -130.05478515625, "loss": 0.3128, "rewards/chosen": 0.6263658205668131, "rewards/margins": 1.772263209025065, "rewards/rejected": -1.145897388458252, "step": 13259 }, { "epoch": 0.7028330638963242, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22034549.333333332, "logits/rejected": -36506649.6, "logps/chosen": -296.36098225911456, "logps/rejected": -240.62314453125, "loss": 0.2119, "rewards/chosen": 0.27908605337142944, "rewards/margins": 3.020557534694672, "rewards/rejected": -2.7414714813232424, "step": 13260 }, { "epoch": 0.7028860678981264, "grad_norm": 37.75, "kl": 0.05360126495361328, "learning_rate": 5e-07, "logits/chosen": 10412780.0, "logits/rejected": -38564228.0, "logps/chosen": -101.87870788574219, "logps/rejected": -399.98406982421875, "loss": 0.2775, "rewards/chosen": 0.06734543293714523, "rewards/margins": 2.847937159240246, "rewards/rejected": -2.7805917263031006, "step": 13261 }, { "epoch": 0.7029390718999284, "grad_norm": 36.5, "kl": 0.027469635009765625, "learning_rate": 5e-07, "logits/chosen": -12389724.0, "logits/rejected": -15210163.2, "logps/chosen": -172.35986328125, "logps/rejected": -285.312890625, "loss": 0.2766, "rewards/chosen": 0.15179729461669922, "rewards/margins": 2.5322656631469727, "rewards/rejected": -2.3804683685302734, "step": 13262 }, { "epoch": 0.7029920759017306, "grad_norm": 44.75, "kl": 1.0052433013916016, "learning_rate": 5e-07, "logits/chosen": -20460378.0, "logits/rejected": -32797154.666666668, "logps/chosen": -383.82147216796875, "logps/rejected": -727.44140625, "loss": 0.2358, "rewards/chosen": 1.193434238433838, "rewards/margins": 4.739534219106039, "rewards/rejected": -3.5460999806722007, "step": 13263 }, { "epoch": 0.7030450799035327, "grad_norm": 44.75, "kl": 0.22251510620117188, "learning_rate": 5e-07, "logits/chosen": -25667434.666666668, "logits/rejected": -17295320.0, "logps/chosen": -307.73101806640625, "logps/rejected": -498.966162109375, "loss": 0.2008, "rewards/chosen": 0.8349931240081787, "rewards/margins": 3.6956973552703856, "rewards/rejected": -2.860704231262207, "step": 13264 }, { "epoch": 0.7030980839053349, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7853789.5, "logits/rejected": -29016928.0, "logps/chosen": -251.294189453125, "logps/rejected": -332.1947326660156, "loss": 0.1915, "rewards/chosen": 1.1078187227249146, "rewards/margins": 4.635108113288879, "rewards/rejected": -3.527289390563965, "step": 13265 }, { "epoch": 0.703151087907137, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -102135637.33333333, "logits/rejected": -14440374.4, "logps/chosen": -279.274169921875, "logps/rejected": -234.1464111328125, "loss": 0.2307, "rewards/chosen": 0.5666677157084147, "rewards/margins": 3.4876859347025553, "rewards/rejected": -2.9210182189941407, "step": 13266 }, { "epoch": 0.7032040919089392, "grad_norm": 49.25, "kl": 0.5730743408203125, "learning_rate": 5e-07, "logits/chosen": -44410856.0, "logits/rejected": -12706616.0, "logps/chosen": -301.8075256347656, "logps/rejected": -226.38932291666666, "loss": 0.2777, "rewards/chosen": 0.30821454524993896, "rewards/margins": 1.802153468132019, "rewards/rejected": -1.49393892288208, "step": 13267 }, { "epoch": 0.7032570959107413, "grad_norm": 50.5, "kl": 3.183871269226074, "learning_rate": 5e-07, "logits/chosen": -36092229.333333336, "logits/rejected": -3305782.5, "logps/chosen": -216.82108561197916, "logps/rejected": -308.4058532714844, "loss": 0.3901, "rewards/chosen": 0.5257135629653931, "rewards/margins": 2.1114091873168945, "rewards/rejected": -1.5856956243515015, "step": 13268 }, { "epoch": 0.7033100999125435, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48578362.666666664, "logits/rejected": -61883161.6, "logps/chosen": -284.54189046223956, "logps/rejected": -432.036572265625, "loss": 0.2619, "rewards/chosen": 0.2636282642682393, "rewards/margins": 2.514670821030935, "rewards/rejected": -2.2510425567626955, "step": 13269 }, { "epoch": 0.7033631039143455, "grad_norm": 52.25, "kl": 0.9391002655029297, "learning_rate": 5e-07, "logits/chosen": -34252499.2, "logits/rejected": -30715957.333333332, "logps/chosen": -259.700537109375, "logps/rejected": -240.51806640625, "loss": 0.301, "rewards/chosen": 0.7204157829284668, "rewards/margins": 2.0320027987162272, "rewards/rejected": -1.3115870157877605, "step": 13270 }, { "epoch": 0.7034161079161476, "grad_norm": 55.0, "kl": 1.7787456512451172, "learning_rate": 5e-07, "logits/chosen": -34563056.0, "logits/rejected": -26016728.0, "logps/chosen": -280.8365234375, "logps/rejected": -396.19580078125, "loss": 0.3624, "rewards/chosen": 0.45302906036376955, "rewards/margins": 2.500576877593994, "rewards/rejected": -2.0475478172302246, "step": 13271 }, { "epoch": 0.7034691119179498, "grad_norm": 59.5, "kl": 1.129990577697754, "learning_rate": 5e-07, "logits/chosen": -27752025.6, "logits/rejected": -34560560.0, "logps/chosen": -408.1884765625, "logps/rejected": -279.3481038411458, "loss": 0.3774, "rewards/chosen": 0.159685218334198, "rewards/margins": 2.0256637771924337, "rewards/rejected": -1.8659785588582356, "step": 13272 }, { "epoch": 0.7035221159197519, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20806316.8, "logits/rejected": -40147816.0, "logps/chosen": -328.90986328125, "logps/rejected": -188.61429850260416, "loss": 0.2751, "rewards/chosen": 0.8103379249572754, "rewards/margins": 2.2687361081441244, "rewards/rejected": -1.4583981831868489, "step": 13273 }, { "epoch": 0.7035751199215541, "grad_norm": 40.75, "kl": 2.0652618408203125, "learning_rate": 5e-07, "logits/chosen": -56059013.333333336, "logits/rejected": -36441833.6, "logps/chosen": -333.79736328125, "logps/rejected": -637.88798828125, "loss": 0.1535, "rewards/chosen": 1.1190985838572185, "rewards/margins": 3.808531681696574, "rewards/rejected": -2.6894330978393555, "step": 13274 }, { "epoch": 0.7036281239233562, "grad_norm": 46.25, "kl": 0.3114776611328125, "learning_rate": 5e-07, "logits/chosen": -23640809.6, "logits/rejected": -19752745.333333332, "logps/chosen": -470.5275390625, "logps/rejected": -183.9324747721354, "loss": 0.2743, "rewards/chosen": 0.9797099113464356, "rewards/margins": 2.8845805803934734, "rewards/rejected": -1.9048706690470378, "step": 13275 }, { "epoch": 0.7036811279251584, "grad_norm": 51.75, "kl": 2.0330963134765625, "learning_rate": 5e-07, "logits/chosen": -37008824.0, "logits/rejected": -9631988.0, "logps/chosen": -198.2135467529297, "logps/rejected": -160.2686004638672, "loss": 0.3136, "rewards/chosen": 0.33283692598342896, "rewards/margins": 2.6734853386878967, "rewards/rejected": -2.3406484127044678, "step": 13276 }, { "epoch": 0.7037341319269604, "grad_norm": 59.5, "kl": 0.8315649032592773, "learning_rate": 5e-07, "logits/chosen": -55317920.0, "logits/rejected": -23792726.0, "logps/chosen": -629.3787841796875, "logps/rejected": -299.5218811035156, "loss": 0.2007, "rewards/chosen": 1.3161331415176392, "rewards/margins": 3.5784064531326294, "rewards/rejected": -2.2622733116149902, "step": 13277 }, { "epoch": 0.7037871359287626, "grad_norm": 47.0, "kl": 3.0640010833740234, "learning_rate": 5e-07, "logits/chosen": -53404992.0, "logits/rejected": -13535989.333333334, "logps/chosen": -534.031298828125, "logps/rejected": -203.048828125, "loss": 0.3108, "rewards/chosen": 1.1960501670837402, "rewards/margins": 2.408499638239543, "rewards/rejected": -1.2124494711558025, "step": 13278 }, { "epoch": 0.7038401399305647, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45928448.0, "logits/rejected": -25143760.0, "logps/chosen": -353.3671875, "logps/rejected": -516.414599609375, "loss": 0.2102, "rewards/chosen": 0.6080414056777954, "rewards/margins": 3.3207653284072878, "rewards/rejected": -2.7127239227294924, "step": 13279 }, { "epoch": 0.7038931439323669, "grad_norm": 32.5, "kl": 3.1198511123657227, "learning_rate": 5e-07, "logits/chosen": 2961467.2, "logits/rejected": -10520105.333333334, "logps/chosen": -201.48663330078125, "logps/rejected": -191.6219685872396, "loss": 0.2708, "rewards/chosen": 1.1333604812622071, "rewards/margins": 2.9805004755655924, "rewards/rejected": -1.8471399943033855, "step": 13280 }, { "epoch": 0.703946147934169, "grad_norm": 55.25, "kl": 1.391256332397461, "learning_rate": 5e-07, "logits/chosen": -1334426.6666666667, "logits/rejected": -54991668.0, "logps/chosen": -366.6907552083333, "logps/rejected": -434.85235595703125, "loss": 0.3629, "rewards/chosen": 0.5005133152008057, "rewards/margins": 2.3849337100982666, "rewards/rejected": -1.884420394897461, "step": 13281 }, { "epoch": 0.7039991519359712, "grad_norm": 52.0, "kl": 0.12782764434814453, "learning_rate": 5e-07, "logits/chosen": -9992681.333333334, "logits/rejected": -32951554.0, "logps/chosen": -149.40971883138022, "logps/rejected": -335.5982360839844, "loss": 0.4277, "rewards/chosen": -0.05722307165463766, "rewards/margins": 2.2790303329626718, "rewards/rejected": -2.3362534046173096, "step": 13282 }, { "epoch": 0.7040521559377733, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37667892.0, "logits/rejected": -15221969.0, "logps/chosen": -235.4755401611328, "logps/rejected": -251.69680786132812, "loss": 0.2651, "rewards/chosen": 0.5924402475357056, "rewards/margins": 2.659061074256897, "rewards/rejected": -2.0666208267211914, "step": 13283 }, { "epoch": 0.7041051599395755, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17381988.0, "logits/rejected": -22661932.8, "logps/chosen": -216.72576904296875, "logps/rejected": -123.569189453125, "loss": 0.2586, "rewards/chosen": 0.06602758169174194, "rewards/margins": 2.197911965847015, "rewards/rejected": -2.1318843841552733, "step": 13284 }, { "epoch": 0.7041581639413775, "grad_norm": 40.0, "kl": 4.487512588500977, "learning_rate": 5e-07, "logits/chosen": -11517937.6, "logits/rejected": 52907445.333333336, "logps/chosen": -249.842236328125, "logps/rejected": -643.53662109375, "loss": 0.3492, "rewards/chosen": 0.6128568649291992, "rewards/margins": 3.1274118423461914, "rewards/rejected": -2.514554977416992, "step": 13285 }, { "epoch": 0.7042111679431797, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12898130.666666666, "logits/rejected": -19715536.0, "logps/chosen": -111.4569091796875, "logps/rejected": -154.89410400390625, "loss": 0.334, "rewards/chosen": -0.4023948113123576, "rewards/margins": 1.2447941382726033, "rewards/rejected": -1.647188949584961, "step": 13286 }, { "epoch": 0.7042641719449818, "grad_norm": 54.25, "kl": 2.562997817993164, "learning_rate": 5e-07, "logits/chosen": -42780460.0, "logits/rejected": -4934058.0, "logps/chosen": -433.964111328125, "logps/rejected": -167.84982299804688, "loss": 0.2568, "rewards/chosen": 1.666823148727417, "rewards/margins": 2.544505476951599, "rewards/rejected": -0.8776823282241821, "step": 13287 }, { "epoch": 0.704317175946784, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44836920.0, "logits/rejected": -21372925.333333332, "logps/chosen": -442.28167724609375, "logps/rejected": -243.3890177408854, "loss": 0.1996, "rewards/chosen": -0.5055923461914062, "rewards/margins": 2.5682948430379233, "rewards/rejected": -3.0738871892293296, "step": 13288 }, { "epoch": 0.7043701799485861, "grad_norm": 32.5, "kl": 0.0888824462890625, "learning_rate": 5e-07, "logits/chosen": 482661.15625, "logits/rejected": -4012584.0, "logps/chosen": -76.31519317626953, "logps/rejected": -254.79824829101562, "loss": 0.2621, "rewards/chosen": 0.4772828221321106, "rewards/margins": 2.669213831424713, "rewards/rejected": -2.1919310092926025, "step": 13289 }, { "epoch": 0.7044231839503883, "grad_norm": 42.25, "kl": 0.31629180908203125, "learning_rate": 5e-07, "logits/chosen": -3357909.75, "logits/rejected": -24985820.0, "logps/chosen": -88.98255920410156, "logps/rejected": -461.9830322265625, "loss": 0.2752, "rewards/chosen": 0.32176801562309265, "rewards/margins": 3.147085875272751, "rewards/rejected": -2.825317859649658, "step": 13290 }, { "epoch": 0.7044761879521904, "grad_norm": 56.75, "kl": 0.683563232421875, "learning_rate": 5e-07, "logits/chosen": -33621610.666666664, "logits/rejected": -32212752.0, "logps/chosen": -200.06884765625, "logps/rejected": -362.86083984375, "loss": 0.4024, "rewards/chosen": 0.02825586994489034, "rewards/margins": 2.2434515257676444, "rewards/rejected": -2.215195655822754, "step": 13291 }, { "epoch": 0.7045291919539926, "grad_norm": 42.25, "kl": 0.4691305160522461, "learning_rate": 5e-07, "logits/chosen": 49601029.333333336, "logits/rejected": -1048497.2, "logps/chosen": -214.9581298828125, "logps/rejected": -113.42391357421874, "loss": 0.3111, "rewards/chosen": 0.20470402638117471, "rewards/margins": 1.6911670009295146, "rewards/rejected": -1.4864629745483398, "step": 13292 }, { "epoch": 0.7045821959557946, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13546009.6, "logits/rejected": -31553720.0, "logps/chosen": -208.549462890625, "logps/rejected": -457.7303873697917, "loss": 0.2807, "rewards/chosen": 0.595853042602539, "rewards/margins": 2.975984287261963, "rewards/rejected": -2.380131244659424, "step": 13293 }, { "epoch": 0.7046351999575968, "grad_norm": 39.5, "kl": 0.1851482391357422, "learning_rate": 5e-07, "logits/chosen": -6095425.333333333, "logits/rejected": -34602284.8, "logps/chosen": -86.45189412434895, "logps/rejected": -400.792578125, "loss": 0.2878, "rewards/chosen": -0.2631114919980367, "rewards/margins": 2.514198879400889, "rewards/rejected": -2.777310371398926, "step": 13294 }, { "epoch": 0.7046882039593989, "grad_norm": 67.5, "kl": 0.2589874267578125, "learning_rate": 5e-07, "logits/chosen": -27028880.0, "logits/rejected": -65190864.0, "logps/chosen": -464.2406412760417, "logps/rejected": -854.2784423828125, "loss": 0.2844, "rewards/chosen": 0.671985944112142, "rewards/margins": 5.0093458493550616, "rewards/rejected": -4.33735990524292, "step": 13295 }, { "epoch": 0.7047412079612011, "grad_norm": 39.5, "kl": 2.592487335205078, "learning_rate": 5e-07, "logits/chosen": -17292852.8, "logits/rejected": 12107185.333333334, "logps/chosen": -312.9261474609375, "logps/rejected": -541.5596110026041, "loss": 0.317, "rewards/chosen": 0.6740289688110351, "rewards/margins": 4.533870951334635, "rewards/rejected": -3.8598419825236, "step": 13296 }, { "epoch": 0.7047942119630032, "grad_norm": 65.0, "kl": 7.048855781555176, "learning_rate": 5e-07, "logits/chosen": -4282994.5, "logps/chosen": -358.39666748046875, "loss": 0.435, "rewards/chosen": 1.0629290342330933, "step": 13297 }, { "epoch": 0.7048472159648054, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43611284.0, "logits/rejected": -49121188.0, "logps/chosen": -271.75823974609375, "logps/rejected": -219.98806762695312, "loss": 0.312, "rewards/chosen": -0.0566963292658329, "rewards/margins": 2.1800602339208126, "rewards/rejected": -2.2367565631866455, "step": 13298 }, { "epoch": 0.7049002199666075, "grad_norm": 39.5, "kl": 4.162644386291504, "learning_rate": 5e-07, "logits/chosen": 580193.5, "logits/rejected": 521624.625, "logps/chosen": -158.20700073242188, "logps/rejected": -75.52680969238281, "loss": 0.3872, "rewards/chosen": 0.5288455883661906, "rewards/margins": 1.4781572620073953, "rewards/rejected": -0.9493116736412048, "step": 13299 }, { "epoch": 0.7049532239684096, "grad_norm": 61.75, "kl": 1.6102914810180664, "learning_rate": 5e-07, "logits/chosen": -10027765.6, "logits/rejected": 19389589.333333332, "logps/chosen": -127.50738525390625, "logps/rejected": -163.4629923502604, "loss": 0.4498, "rewards/chosen": 0.0017266809940338136, "rewards/margins": 0.4356965760389964, "rewards/rejected": -0.4339698950449626, "step": 13300 }, { "epoch": 0.7050062279702117, "grad_norm": 49.5, "kl": 0.4345054626464844, "learning_rate": 5e-07, "logits/chosen": -17122654.4, "logits/rejected": -49706378.666666664, "logps/chosen": -279.194677734375, "logps/rejected": -569.7857666015625, "loss": 0.2856, "rewards/chosen": 0.29804065227508547, "rewards/margins": 3.7641578753789267, "rewards/rejected": -3.4661172231038413, "step": 13301 }, { "epoch": 0.7050592319720139, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19352840.0, "logits/rejected": -12810066.4, "logps/chosen": -412.0320638020833, "logps/rejected": -335.853662109375, "loss": 0.3158, "rewards/chosen": -0.3277604579925537, "rewards/margins": 1.8723026752471923, "rewards/rejected": -2.200063133239746, "step": 13302 }, { "epoch": 0.705112235973816, "grad_norm": 41.0, "kl": 0.1798858642578125, "learning_rate": 5e-07, "logits/chosen": -55016458.666666664, "logits/rejected": -29520297.6, "logps/chosen": -281.0596516927083, "logps/rejected": -320.2589599609375, "loss": 0.2142, "rewards/chosen": 0.2501780390739441, "rewards/margins": 2.9205891489982605, "rewards/rejected": -2.6704111099243164, "step": 13303 }, { "epoch": 0.7051652399756182, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47108876.0, "logits/rejected": -33232218.666666668, "logps/chosen": -454.057373046875, "logps/rejected": -408.9573160807292, "loss": 0.1853, "rewards/chosen": 0.3316177427768707, "rewards/margins": 3.0734536548455558, "rewards/rejected": -2.741835912068685, "step": 13304 }, { "epoch": 0.7052182439774203, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13719635.0, "logits/rejected": -14199864.0, "logps/chosen": -261.4700012207031, "logps/rejected": -272.4661458333333, "loss": 0.1996, "rewards/chosen": 1.0474779605865479, "rewards/margins": 3.4815356731414795, "rewards/rejected": -2.4340577125549316, "step": 13305 }, { "epoch": 0.7052712479792225, "grad_norm": 47.25, "kl": 6.923223495483398, "learning_rate": 5e-07, "logits/chosen": -2541589.6666666665, "logits/rejected": -33404819.2, "logps/chosen": -191.87101236979166, "logps/rejected": -380.7435302734375, "loss": 0.2946, "rewards/chosen": 0.40622663497924805, "rewards/margins": 2.7203660011291504, "rewards/rejected": -2.3141393661499023, "step": 13306 }, { "epoch": 0.7053242519810246, "grad_norm": 46.5, "kl": 0.1962432861328125, "learning_rate": 5e-07, "logits/chosen": -24458692.0, "logits/rejected": -800409.0, "logps/chosen": -167.49220275878906, "logps/rejected": -253.0379638671875, "loss": 0.2807, "rewards/chosen": -0.16049803793430328, "rewards/margins": 1.4839268773794174, "rewards/rejected": -1.6444249153137207, "step": 13307 }, { "epoch": 0.7053772559828267, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -107699760.0, "logits/rejected": -10437228.0, "logps/chosen": -352.78546142578125, "logps/rejected": -492.5994873046875, "loss": 0.167, "rewards/chosen": 0.11505279690027237, "rewards/margins": 3.127918817102909, "rewards/rejected": -3.0128660202026367, "step": 13308 }, { "epoch": 0.7054302599846288, "grad_norm": 58.25, "kl": 1.3405609130859375, "learning_rate": 5e-07, "logits/chosen": -61383461.333333336, "logits/rejected": -32436598.4, "logps/chosen": -674.0573323567709, "logps/rejected": -239.594189453125, "loss": 0.2168, "rewards/chosen": 1.430511474609375, "rewards/margins": 3.471786308288574, "rewards/rejected": -2.041274833679199, "step": 13309 }, { "epoch": 0.705483263986431, "grad_norm": 43.25, "kl": 4.799594879150391, "learning_rate": 5e-07, "logits/chosen": -23712597.333333332, "logits/rejected": 49977987.2, "logps/chosen": -689.473876953125, "logps/rejected": -151.1692626953125, "loss": 0.2047, "rewards/chosen": 1.7499828338623047, "rewards/margins": 3.079419803619385, "rewards/rejected": -1.32943696975708, "step": 13310 }, { "epoch": 0.7055362679882331, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27711075.2, "logits/rejected": -3985346.6666666665, "logps/chosen": -248.2890380859375, "logps/rejected": -130.178466796875, "loss": 0.3371, "rewards/chosen": -0.04575684666633606, "rewards/margins": 3.3023371398448944, "rewards/rejected": -3.3480939865112305, "step": 13311 }, { "epoch": 0.7055892719900353, "grad_norm": 56.25, "kl": 1.8361015319824219, "learning_rate": 5e-07, "logits/chosen": -15666545.6, "logits/rejected": -13051106.666666666, "logps/chosen": -680.771435546875, "logps/rejected": -134.52006022135416, "loss": 0.2933, "rewards/chosen": 1.0162469863891601, "rewards/margins": 2.4304360071818034, "rewards/rejected": -1.4141890207926433, "step": 13312 }, { "epoch": 0.7056422759918374, "grad_norm": 39.5, "kl": 2.7839813232421875, "learning_rate": 5e-07, "logits/chosen": 7344092.0, "logits/rejected": -8059192.0, "logps/chosen": -52.040018717447914, "logps/rejected": -142.53582763671875, "loss": 0.2762, "rewards/chosen": 0.23416163523991904, "rewards/margins": 2.4624537189801536, "rewards/rejected": -2.2282920837402345, "step": 13313 }, { "epoch": 0.7056952799936396, "grad_norm": 45.0, "kl": 0.1411581039428711, "learning_rate": 5e-07, "logits/chosen": -27977372.0, "logits/rejected": -16473427.0, "logps/chosen": -203.73175048828125, "logps/rejected": -382.284912109375, "loss": 0.2866, "rewards/chosen": 0.3332114815711975, "rewards/margins": 3.1388054490089417, "rewards/rejected": -2.805593967437744, "step": 13314 }, { "epoch": 0.7057482839954416, "grad_norm": 47.5, "kl": 3.4907379150390625, "learning_rate": 5e-07, "logits/chosen": -30699082.666666668, "logits/rejected": -65170904.0, "logps/chosen": -426.1410725911458, "logps/rejected": -497.88629150390625, "loss": 0.2941, "rewards/chosen": 1.198637326558431, "rewards/margins": 2.3903116782506304, "rewards/rejected": -1.1916743516921997, "step": 13315 }, { "epoch": 0.7058012879972438, "grad_norm": 54.25, "kl": 4.469423294067383, "learning_rate": 5e-07, "logits/chosen": 1809267.625, "logits/rejected": -27722678.0, "logps/chosen": -205.85220336914062, "logps/rejected": -301.9980773925781, "loss": 0.2473, "rewards/chosen": 0.9398437142372131, "rewards/margins": 3.574728548526764, "rewards/rejected": -2.634884834289551, "step": 13316 }, { "epoch": 0.7058542919990459, "grad_norm": 59.25, "kl": 0.758885383605957, "learning_rate": 5e-07, "logits/chosen": -64264533.333333336, "logits/rejected": -6448606.5, "logps/chosen": -348.3974609375, "logps/rejected": -568.2550048828125, "loss": 0.4359, "rewards/chosen": 0.1772429347038269, "rewards/margins": 1.4044424891471863, "rewards/rejected": -1.2271995544433594, "step": 13317 }, { "epoch": 0.7059072960008481, "grad_norm": 48.0, "kl": 6.704074859619141, "learning_rate": 5e-07, "logits/chosen": -944415.375, "logits/rejected": -9871849.0, "logps/chosen": -402.103271484375, "logps/rejected": -338.5553283691406, "loss": 0.2144, "rewards/chosen": 2.5172600746154785, "rewards/margins": 3.6230908632278442, "rewards/rejected": -1.1058307886123657, "step": 13318 }, { "epoch": 0.7059603000026502, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37658812.8, "logits/rejected": -35078546.666666664, "logps/chosen": -352.24384765625, "logps/rejected": -440.0474853515625, "loss": 0.2514, "rewards/chosen": 0.47554688453674315, "rewards/margins": 4.293035554885864, "rewards/rejected": -3.817488670349121, "step": 13319 }, { "epoch": 0.7060133040044524, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62722644.0, "logits/rejected": -17724298.666666668, "logps/chosen": -441.0074157714844, "logps/rejected": -310.65704345703125, "loss": 0.1838, "rewards/chosen": 0.22034913301467896, "rewards/margins": 3.0372214118639627, "rewards/rejected": -2.8168722788492837, "step": 13320 }, { "epoch": 0.7060663080062545, "grad_norm": 45.25, "kl": 4.326833724975586, "learning_rate": 5e-07, "logits/chosen": -11664926.4, "logits/rejected": -96652.16666666667, "logps/chosen": -461.19130859375, "logps/rejected": -104.6553955078125, "loss": 0.3322, "rewards/chosen": 1.0737997055053712, "rewards/margins": 2.3209596633911134, "rewards/rejected": -1.2471599578857422, "step": 13321 }, { "epoch": 0.7061193120080566, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13452652.8, "logits/rejected": -8018156.666666667, "logps/chosen": -263.8428466796875, "logps/rejected": -131.99947102864584, "loss": 0.324, "rewards/chosen": 0.4876309871673584, "rewards/margins": 2.3678690433502196, "rewards/rejected": -1.8802380561828613, "step": 13322 }, { "epoch": 0.7061723160098587, "grad_norm": 43.25, "kl": 1.1398353576660156, "learning_rate": 5e-07, "logits/chosen": -37901872.0, "logits/rejected": 1724790.0, "logps/chosen": -257.90228271484375, "logps/rejected": -175.51870727539062, "loss": 0.2689, "rewards/chosen": 0.4354607164859772, "rewards/margins": 2.627422720193863, "rewards/rejected": -2.1919620037078857, "step": 13323 }, { "epoch": 0.7062253200116608, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11068908.0, "logits/rejected": -25531219.2, "logps/chosen": -240.23734537760416, "logps/rejected": -311.46650390625, "loss": 0.2257, "rewards/chosen": 0.43998106320699054, "rewards/margins": 3.3167541344960534, "rewards/rejected": -2.8767730712890627, "step": 13324 }, { "epoch": 0.706278324013463, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7162044.8, "logits/rejected": -54049754.666666664, "logps/chosen": -349.65595703125, "logps/rejected": -470.83984375, "loss": 0.1979, "rewards/chosen": 1.2668472290039063, "rewards/margins": 3.4519649505615235, "rewards/rejected": -2.185117721557617, "step": 13325 }, { "epoch": 0.7063313280152651, "grad_norm": 42.5, "kl": 0.9456405639648438, "learning_rate": 5e-07, "logits/chosen": -40975837.333333336, "logits/rejected": 5793546.4, "logps/chosen": -1191.891845703125, "logps/rejected": -621.673681640625, "loss": 0.1756, "rewards/chosen": 1.4134206771850586, "rewards/margins": 3.6296384811401365, "rewards/rejected": -2.216217803955078, "step": 13326 }, { "epoch": 0.7063843320170673, "grad_norm": 44.5, "kl": 3.9452877044677734, "learning_rate": 5e-07, "logits/chosen": 22970553.6, "logits/rejected": 8757052.666666666, "logps/chosen": -137.97247314453125, "logps/rejected": -199.12518310546875, "loss": 0.3646, "rewards/chosen": 0.7195607185363769, "rewards/margins": 2.1163928667704264, "rewards/rejected": -1.3968321482340496, "step": 13327 }, { "epoch": 0.7064373360188694, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23842020.0, "logits/rejected": -4333378.0, "logps/chosen": -226.70242309570312, "logps/rejected": -232.2025146484375, "loss": 0.2524, "rewards/chosen": 0.44402652978897095, "rewards/margins": 2.7135010361671448, "rewards/rejected": -2.269474506378174, "step": 13328 }, { "epoch": 0.7064903400206716, "grad_norm": 64.5, "kl": 0.33606815338134766, "learning_rate": 5e-07, "logits/chosen": -37628120.0, "logits/rejected": -25281078.0, "logps/chosen": -237.28292846679688, "logps/rejected": -553.6902465820312, "loss": 0.356, "rewards/chosen": -0.20845089852809906, "rewards/margins": 1.8928994089365005, "rewards/rejected": -2.1013503074645996, "step": 13329 }, { "epoch": 0.7065433440224737, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43114148.0, "logits/rejected": -24377653.333333332, "logps/chosen": -673.37646484375, "logps/rejected": -313.4858805338542, "loss": 0.2134, "rewards/chosen": 0.7488555908203125, "rewards/margins": 2.773040294647217, "rewards/rejected": -2.0241847038269043, "step": 13330 }, { "epoch": 0.7065963480242758, "grad_norm": 66.5, "kl": 1.5306463241577148, "learning_rate": 5e-07, "logits/chosen": -62749704.0, "logits/rejected": -23271732.0, "logps/chosen": -225.008544921875, "logps/rejected": -315.1199035644531, "loss": 0.3565, "rewards/chosen": 0.34073716402053833, "rewards/margins": 2.602912724018097, "rewards/rejected": -2.2621755599975586, "step": 13331 }, { "epoch": 0.7066493520260779, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26082288.0, "logits/rejected": -17881794.0, "logps/chosen": -265.84853108723956, "logps/rejected": -536.8753662109375, "loss": 0.3155, "rewards/chosen": 0.5175652503967285, "rewards/margins": 3.2778282165527344, "rewards/rejected": -2.760262966156006, "step": 13332 }, { "epoch": 0.7067023560278801, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26493860.0, "logits/rejected": -27997740.0, "logps/chosen": -279.5423278808594, "logps/rejected": -448.7631530761719, "loss": 0.3022, "rewards/chosen": 0.17901533842086792, "rewards/margins": 3.113506257534027, "rewards/rejected": -2.934490919113159, "step": 13333 }, { "epoch": 0.7067553600296822, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27752450.0, "logits/rejected": -13170028.0, "logps/chosen": -421.8296203613281, "logps/rejected": -328.22275797526044, "loss": 0.1178, "rewards/chosen": 2.4899699687957764, "rewards/margins": 4.918672323226929, "rewards/rejected": -2.4287023544311523, "step": 13334 }, { "epoch": 0.7068083640314844, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55172885.333333336, "logits/rejected": -6948113.0, "logps/chosen": -347.478515625, "logps/rejected": -180.57342529296875, "loss": 0.3041, "rewards/chosen": 0.46952323118845624, "rewards/margins": 3.9256170193354287, "rewards/rejected": -3.4560937881469727, "step": 13335 }, { "epoch": 0.7068613680332865, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25928714.0, "logits/rejected": -4238727.0, "logps/chosen": -272.57977294921875, "logps/rejected": -103.88673400878906, "loss": 0.2541, "rewards/chosen": 0.3956264555454254, "rewards/margins": 3.2283858358860016, "rewards/rejected": -2.832759380340576, "step": 13336 }, { "epoch": 0.7069143720350887, "grad_norm": 56.0, "kl": 1.7874259948730469, "learning_rate": 5e-07, "logits/chosen": -7319120.8, "logits/rejected": -46437973.333333336, "logps/chosen": -630.232763671875, "logps/rejected": -447.118896484375, "loss": 0.253, "rewards/chosen": 1.0229844093322753, "rewards/margins": 4.772647762298584, "rewards/rejected": -3.7496633529663086, "step": 13337 }, { "epoch": 0.7069673760368907, "grad_norm": 34.0, "kl": 1.8851203918457031, "learning_rate": 5e-07, "logits/chosen": 439943584.0, "logits/rejected": -21719966.0, "logps/chosen": -179.82681274414062, "logps/rejected": -246.35003662109375, "loss": 0.3505, "rewards/chosen": -0.015567585825920105, "rewards/margins": 1.9874240905046463, "rewards/rejected": -2.0029916763305664, "step": 13338 }, { "epoch": 0.7070203800386929, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51390924.8, "logits/rejected": -28833685.333333332, "logps/chosen": -585.547216796875, "logps/rejected": -339.5239664713542, "loss": 0.2299, "rewards/chosen": 0.9130084991455079, "rewards/margins": 3.936347770690918, "rewards/rejected": -3.02333927154541, "step": 13339 }, { "epoch": 0.707073384040495, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91810672.0, "logits/rejected": -28031369.14285714, "logps/chosen": -631.7799072265625, "logps/rejected": -362.125732421875, "loss": 0.1998, "rewards/chosen": 0.15639649331569672, "rewards/margins": 2.6850053604160036, "rewards/rejected": -2.528608867100307, "step": 13340 }, { "epoch": 0.7071263880422972, "grad_norm": 27.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2555385.3333333335, "logits/rejected": -20817089.6, "logps/chosen": -116.92239379882812, "logps/rejected": -271.7625, "loss": 0.2185, "rewards/chosen": 0.43350549538930255, "rewards/margins": 2.9401602347691855, "rewards/rejected": -2.506654739379883, "step": 13341 }, { "epoch": 0.7071793920440993, "grad_norm": 38.5, "kl": 4.5630035400390625, "learning_rate": 5e-07, "logits/chosen": -25858603.2, "logits/rejected": -47198368.0, "logps/chosen": -222.0669189453125, "logps/rejected": -595.7179768880209, "loss": 0.352, "rewards/chosen": 0.4974953651428223, "rewards/margins": 4.0023069699605305, "rewards/rejected": -3.5048116048177085, "step": 13342 }, { "epoch": 0.7072323960459015, "grad_norm": 60.5, "kl": 5.51247501373291, "learning_rate": 5e-07, "logits/chosen": -69389157.33333333, "logits/rejected": 2050116.875, "logps/chosen": -648.8956705729166, "logps/rejected": -46.66093444824219, "loss": 0.3802, "rewards/chosen": 1.4322627385457356, "rewards/margins": 1.7724330325921376, "rewards/rejected": -0.340170294046402, "step": 13343 }, { "epoch": 0.7072854000477036, "grad_norm": 51.5, "kl": 0.8606882095336914, "learning_rate": 5e-07, "logits/chosen": -12911597.333333334, "logits/rejected": -18721560.0, "logps/chosen": -221.32975260416666, "logps/rejected": -117.14651489257812, "loss": 0.3344, "rewards/chosen": 0.5757878224054972, "rewards/margins": 2.4059266249338784, "rewards/rejected": -1.8301388025283813, "step": 13344 }, { "epoch": 0.7073384040495058, "grad_norm": 28.75, "kl": 2.4470882415771484, "learning_rate": 5e-07, "logits/chosen": -16142222.666666666, "logits/rejected": -40267708.8, "logps/chosen": -172.5814412434896, "logps/rejected": -168.40548095703124, "loss": 0.1947, "rewards/chosen": 0.5772678454717001, "rewards/margins": 3.4642391284306844, "rewards/rejected": -2.886971282958984, "step": 13345 }, { "epoch": 0.7073914080513078, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16967176.0, "logits/rejected": -25595149.333333332, "logps/chosen": -343.48065185546875, "logps/rejected": -458.6499430338542, "loss": 0.1634, "rewards/chosen": 2.1606826782226562, "rewards/margins": 4.273869514465332, "rewards/rejected": -2.113186836242676, "step": 13346 }, { "epoch": 0.70744441205311, "grad_norm": 47.75, "kl": 0.16244125366210938, "learning_rate": 5e-07, "logits/chosen": -20115040.0, "logits/rejected": -17780093.333333332, "logps/chosen": -338.7257995605469, "logps/rejected": -218.60430908203125, "loss": 0.2881, "rewards/chosen": 0.0033279359340667725, "rewards/margins": 1.5195712347825368, "rewards/rejected": -1.51624329884847, "step": 13347 }, { "epoch": 0.7074974160549121, "grad_norm": 58.5, "kl": 5.712409496307373, "learning_rate": 5e-07, "logits/chosen": -37635116.0, "logits/rejected": -32952244.0, "logps/chosen": -581.5348510742188, "logps/rejected": -403.0829772949219, "loss": 0.247, "rewards/chosen": 1.5864304304122925, "rewards/margins": 3.720887541770935, "rewards/rejected": -2.1344571113586426, "step": 13348 }, { "epoch": 0.7075504200567143, "grad_norm": 40.75, "kl": 0.7532739639282227, "learning_rate": 5e-07, "logits/chosen": -18048604.0, "logits/rejected": -26434773.333333332, "logps/chosen": -265.72662353515625, "logps/rejected": -300.5307210286458, "loss": 0.2254, "rewards/chosen": 0.46846503019332886, "rewards/margins": 2.503111700216929, "rewards/rejected": -2.0346466700236, "step": 13349 }, { "epoch": 0.7076034240585164, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 26371338.0, "logits/rejected": -42831611.428571425, "logps/chosen": -487.49481201171875, "logps/rejected": -452.5575474330357, "loss": 0.2173, "rewards/chosen": -0.0762786865234375, "rewards/margins": 2.157346044267927, "rewards/rejected": -2.2336247307913646, "step": 13350 }, { "epoch": 0.7076564280603186, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18849218.666666668, "logits/rejected": -25285339.2, "logps/chosen": -221.57621256510416, "logps/rejected": -431.84599609375, "loss": 0.288, "rewards/chosen": -0.18907827138900757, "rewards/margins": 2.0371874451637266, "rewards/rejected": -2.226265716552734, "step": 13351 }, { "epoch": 0.7077094320621207, "grad_norm": 106.0, "kl": 0.34244537353515625, "learning_rate": 5e-07, "logits/chosen": -114600160.0, "logits/rejected": 4758888.8, "logps/chosen": -326.7646077473958, "logps/rejected": -297.094189453125, "loss": 0.2547, "rewards/chosen": -0.02156168222427368, "rewards/margins": 2.551577126979828, "rewards/rejected": -2.5731388092041017, "step": 13352 }, { "epoch": 0.7077624360639229, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31512786.0, "logits/rejected": -48659434.666666664, "logps/chosen": -315.1730651855469, "logps/rejected": -297.1495768229167, "loss": 0.3051, "rewards/chosen": -0.03629148006439209, "rewards/margins": 1.4009500741958618, "rewards/rejected": -1.437241554260254, "step": 13353 }, { "epoch": 0.7078154400657249, "grad_norm": 95.5, "kl": 1.5117568969726562, "learning_rate": 5e-07, "logits/chosen": -6479464.0, "logits/rejected": -29216982.0, "logps/chosen": -273.8928527832031, "logps/rejected": -311.49664306640625, "loss": 0.2059, "rewards/chosen": 1.289476752281189, "rewards/margins": 2.8819302320480347, "rewards/rejected": -1.5924534797668457, "step": 13354 }, { "epoch": 0.7078684440675271, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27852488.0, "logits/rejected": -34668442.666666664, "logps/chosen": -351.1900939941406, "logps/rejected": -316.06781005859375, "loss": 0.2322, "rewards/chosen": 0.01665191352367401, "rewards/margins": 2.0589745491743088, "rewards/rejected": -2.0423226356506348, "step": 13355 }, { "epoch": 0.7079214480693292, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33141910.0, "logits/rejected": -3651169.75, "logps/chosen": -177.1409912109375, "logps/rejected": -189.29141235351562, "loss": 0.3118, "rewards/chosen": 0.35915690660476685, "rewards/margins": 1.7614447474479675, "rewards/rejected": -1.4022878408432007, "step": 13356 }, { "epoch": 0.7079744520711314, "grad_norm": 47.5, "kl": 0.4921855926513672, "learning_rate": 5e-07, "logits/chosen": -28303488.0, "logits/rejected": -21241260.8, "logps/chosen": -513.0592041015625, "logps/rejected": -245.773291015625, "loss": 0.2489, "rewards/chosen": 0.9370148181915283, "rewards/margins": 2.422210454940796, "rewards/rejected": -1.4851956367492676, "step": 13357 }, { "epoch": 0.7080274560729335, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47553988.0, "logits/rejected": -2115289.5, "logps/chosen": -309.0018310546875, "logps/rejected": -249.61138916015625, "loss": 0.2697, "rewards/chosen": 0.5186787843704224, "rewards/margins": 2.6637531518936157, "rewards/rejected": -2.1450743675231934, "step": 13358 }, { "epoch": 0.7080804600747357, "grad_norm": 56.0, "kl": 1.0319843292236328, "learning_rate": 5e-07, "logits/chosen": -41715507.2, "logits/rejected": -33374032.0, "logps/chosen": -497.408935546875, "logps/rejected": -372.0675048828125, "loss": 0.3283, "rewards/chosen": 0.5969589233398438, "rewards/margins": 2.9289297739664715, "rewards/rejected": -2.3319708506266275, "step": 13359 }, { "epoch": 0.7081334640765378, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35492576.0, "logits/rejected": -30906840.0, "logps/chosen": -258.6806640625, "logps/rejected": -296.4496765136719, "loss": 0.2693, "rewards/chosen": 0.33169496059417725, "rewards/margins": 2.4269198179244995, "rewards/rejected": -2.0952248573303223, "step": 13360 }, { "epoch": 0.70818646807834, "grad_norm": 39.0, "kl": 4.454721450805664, "learning_rate": 5e-07, "logits/chosen": 3136417.6666666665, "logits/rejected": -14129371.0, "logps/chosen": -146.8916015625, "logps/rejected": -212.85362243652344, "loss": 0.2972, "rewards/chosen": 1.2228830655415852, "rewards/margins": 3.460607608159383, "rewards/rejected": -2.237724542617798, "step": 13361 }, { "epoch": 0.708239472080142, "grad_norm": 97.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16824571.2, "logits/rejected": -27151320.0, "logps/chosen": -172.814404296875, "logps/rejected": -755.4436848958334, "loss": 0.3026, "rewards/chosen": 0.4487248420715332, "rewards/margins": 2.819942569732666, "rewards/rejected": -2.371217727661133, "step": 13362 }, { "epoch": 0.7082924760819442, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12792092.0, "logits/rejected": -38186340.0, "logps/chosen": -208.3094482421875, "logps/rejected": -383.067138671875, "loss": 0.3094, "rewards/chosen": -0.1728445589542389, "rewards/margins": 2.6340366303920746, "rewards/rejected": -2.8068811893463135, "step": 13363 }, { "epoch": 0.7083454800837463, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30183282.0, "logits/rejected": -30746238.0, "logps/chosen": -176.78604125976562, "logps/rejected": -164.1067657470703, "loss": 0.3338, "rewards/chosen": -0.09680042415857315, "rewards/margins": 1.7918285354971886, "rewards/rejected": -1.8886289596557617, "step": 13364 }, { "epoch": 0.7083984840855485, "grad_norm": 53.5, "kl": 0.9268608093261719, "learning_rate": 5e-07, "logits/chosen": 2843609.0, "logits/rejected": -35741800.0, "logps/chosen": -89.03387451171875, "logps/rejected": -934.5980224609375, "loss": 0.2994, "rewards/chosen": 0.7234445412953695, "rewards/margins": 4.2699046929677325, "rewards/rejected": -3.5464601516723633, "step": 13365 }, { "epoch": 0.7084514880873506, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53673480.0, "logits/rejected": -26695072.0, "logps/chosen": -426.7174072265625, "logps/rejected": -308.245361328125, "loss": 0.1768, "rewards/chosen": 1.2049665451049805, "rewards/margins": 3.9334728717803955, "rewards/rejected": -2.728506326675415, "step": 13366 }, { "epoch": 0.7085044920891528, "grad_norm": 54.0, "kl": 0.408355712890625, "learning_rate": 5e-07, "logits/chosen": -21243860.0, "logits/rejected": -5120777.5, "logps/chosen": -367.01605224609375, "logps/rejected": -178.08001708984375, "loss": 0.3015, "rewards/chosen": 0.532196044921875, "rewards/margins": 2.083095073699951, "rewards/rejected": -1.5508990287780762, "step": 13367 }, { "epoch": 0.7085574960909549, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44409500.0, "logits/rejected": -35314800.0, "logps/chosen": -235.5492401123047, "logps/rejected": -379.8524169921875, "loss": 0.2293, "rewards/chosen": 0.3752436935901642, "rewards/margins": 3.70756533741951, "rewards/rejected": -3.3323216438293457, "step": 13368 }, { "epoch": 0.7086105000927571, "grad_norm": 44.0, "kl": 3.119504928588867, "learning_rate": 5e-07, "logits/chosen": -32721992.0, "logits/rejected": -19572876.0, "logps/chosen": -331.5052083333333, "logps/rejected": -527.013427734375, "loss": 0.2799, "rewards/chosen": 0.9632476170857748, "rewards/margins": 4.3305238087972, "rewards/rejected": -3.367276191711426, "step": 13369 }, { "epoch": 0.7086635040945591, "grad_norm": 67.5, "kl": 3.918703079223633, "learning_rate": 5e-07, "logits/chosen": 848507.8333333334, "logits/rejected": -22661707.2, "logps/chosen": -204.6898193359375, "logps/rejected": -631.38046875, "loss": 0.2988, "rewards/chosen": 0.49590039253234863, "rewards/margins": 4.358732652664185, "rewards/rejected": -3.862832260131836, "step": 13370 }, { "epoch": 0.7087165080963613, "grad_norm": 51.0, "kl": 5.461801528930664, "learning_rate": 5e-07, "logits/chosen": -50711654.4, "logits/rejected": -42158341.333333336, "logps/chosen": -640.23564453125, "logps/rejected": -418.5892333984375, "loss": 0.2859, "rewards/chosen": 1.6339712142944336, "rewards/margins": 3.977689743041992, "rewards/rejected": -2.3437185287475586, "step": 13371 }, { "epoch": 0.7087695120981634, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81051128.0, "logits/rejected": -9493422.857142856, "logps/chosen": -402.6978759765625, "logps/rejected": -253.66329520089286, "loss": 0.1302, "rewards/chosen": 0.02977295033633709, "rewards/margins": 2.692572785541415, "rewards/rejected": -2.662799835205078, "step": 13372 }, { "epoch": 0.7088225160999655, "grad_norm": 38.25, "kl": 1.373992919921875, "learning_rate": 5e-07, "logits/chosen": -3578005.3333333335, "logits/rejected": -43267222.4, "logps/chosen": -105.05619303385417, "logps/rejected": -307.552978515625, "loss": 0.2468, "rewards/chosen": 0.6131237347920736, "rewards/margins": 2.892769845326742, "rewards/rejected": -2.279646110534668, "step": 13373 }, { "epoch": 0.7088755201017677, "grad_norm": 65.5, "kl": 0.34400177001953125, "learning_rate": 5e-07, "logits/chosen": -22192028.0, "logits/rejected": -13611667.0, "logps/chosen": -338.2013244628906, "logps/rejected": -285.28936767578125, "loss": 0.3418, "rewards/chosen": 0.01526818610727787, "rewards/margins": 2.1853862795978785, "rewards/rejected": -2.1701180934906006, "step": 13374 }, { "epoch": 0.7089285241035698, "grad_norm": 53.25, "kl": 0.3373451232910156, "learning_rate": 5e-07, "logits/chosen": -31350755.2, "logits/rejected": -25528869.333333332, "logps/chosen": -149.727880859375, "logps/rejected": -280.3314615885417, "loss": 0.2672, "rewards/chosen": 0.5948848247528076, "rewards/margins": 3.1448166688283283, "rewards/rejected": -2.549931844075521, "step": 13375 }, { "epoch": 0.708981528105372, "grad_norm": 36.25, "kl": 1.6671056747436523, "learning_rate": 5e-07, "logits/chosen": 4823092.0, "logits/rejected": -256916.15, "logps/chosen": -75.94358825683594, "logps/rejected": -86.97403564453126, "loss": 0.2704, "rewards/chosen": 0.553326408068339, "rewards/margins": 2.726610747973124, "rewards/rejected": -2.173284339904785, "step": 13376 }, { "epoch": 0.709034532107174, "grad_norm": 54.75, "kl": 0.8804950714111328, "learning_rate": 5e-07, "logits/chosen": -23882825.6, "logits/rejected": -23732893.333333332, "logps/chosen": -329.2554443359375, "logps/rejected": -403.672607421875, "loss": 0.3742, "rewards/chosen": 0.0450935423374176, "rewards/margins": 2.4079744080702463, "rewards/rejected": -2.3628808657328286, "step": 13377 }, { "epoch": 0.7090875361089762, "grad_norm": 50.75, "kl": 5.020603179931641, "learning_rate": 5e-07, "logits/chosen": -68613752.0, "logits/rejected": -40419276.0, "logps/chosen": -318.3876647949219, "logps/rejected": -137.21485900878906, "loss": 0.3823, "rewards/chosen": 0.2754971385002136, "rewards/margins": 1.5688270926475525, "rewards/rejected": -1.2933299541473389, "step": 13378 }, { "epoch": 0.7091405401107783, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9791242.666666666, "logits/rejected": -23797862.4, "logps/chosen": -270.014892578125, "logps/rejected": -407.0898193359375, "loss": 0.1371, "rewards/chosen": 1.0933650334676106, "rewards/margins": 4.584982617696126, "rewards/rejected": -3.4916175842285155, "step": 13379 }, { "epoch": 0.7091935441125805, "grad_norm": 51.75, "kl": 0.7732620239257812, "learning_rate": 5e-07, "logits/chosen": -79021024.0, "logits/rejected": -59122556.0, "logps/chosen": -356.6540832519531, "logps/rejected": -532.5486450195312, "loss": 0.3004, "rewards/chosen": 0.33847159147262573, "rewards/margins": 3.3644554018974304, "rewards/rejected": -3.0259838104248047, "step": 13380 }, { "epoch": 0.7092465481143826, "grad_norm": 40.5, "kl": 4.364113807678223, "learning_rate": 5e-07, "logits/chosen": -27353290.666666668, "logits/rejected": -23469754.0, "logps/chosen": -220.65191650390625, "logps/rejected": -443.063720703125, "loss": 0.39, "rewards/chosen": 0.6053950786590576, "rewards/margins": 3.076139450073242, "rewards/rejected": -2.4707443714141846, "step": 13381 }, { "epoch": 0.7092995521161848, "grad_norm": 21.625, "kl": 3.258101463317871, "learning_rate": 5e-07, "logits/chosen": 955000.2, "logits/rejected": -31618210.666666668, "logps/chosen": -33.16841125488281, "logps/rejected": -342.1992594401042, "loss": 0.3009, "rewards/chosen": 0.5088505744934082, "rewards/margins": 3.304321765899658, "rewards/rejected": -2.79547119140625, "step": 13382 }, { "epoch": 0.7093525561179869, "grad_norm": 50.75, "kl": 1.3846511840820312, "learning_rate": 5e-07, "logits/chosen": -22306278.4, "logits/rejected": -15334068.0, "logps/chosen": -251.65673828125, "logps/rejected": -316.7393798828125, "loss": 0.3379, "rewards/chosen": 0.12390141487121582, "rewards/margins": 2.7454599221547444, "rewards/rejected": -2.621558507283529, "step": 13383 }, { "epoch": 0.7094055601197891, "grad_norm": 56.5, "kl": 3.4343395233154297, "learning_rate": 5e-07, "logits/chosen": -11887097.333333334, "logits/rejected": -29793660.0, "logps/chosen": -336.0741373697917, "logps/rejected": -240.74600219726562, "loss": 0.3224, "rewards/chosen": 0.783972422281901, "rewards/margins": 3.256201903025309, "rewards/rejected": -2.472229480743408, "step": 13384 }, { "epoch": 0.7094585641215911, "grad_norm": 36.5, "kl": 1.804281234741211, "learning_rate": 5e-07, "logits/chosen": -17508454.666666668, "logits/rejected": -11777113.6, "logps/chosen": -274.9099934895833, "logps/rejected": -181.06484375, "loss": 0.2507, "rewards/chosen": 0.1838284730911255, "rewards/margins": 3.338019108772278, "rewards/rejected": -3.1541906356811524, "step": 13385 }, { "epoch": 0.7095115681233933, "grad_norm": 44.75, "kl": 1.1973838806152344, "learning_rate": 5e-07, "logits/chosen": -59609108.0, "logits/rejected": -17017476.0, "logps/chosen": -333.13525390625, "logps/rejected": -248.1014404296875, "loss": 0.2666, "rewards/chosen": 0.7032028436660767, "rewards/margins": 2.3836885690689087, "rewards/rejected": -1.680485725402832, "step": 13386 }, { "epoch": 0.7095645721251954, "grad_norm": 82.0, "kl": 0.2635612487792969, "learning_rate": 5e-07, "logits/chosen": -1099268.2857142857, "logits/rejected": -35581084.0, "logps/chosen": -380.82345145089283, "logps/rejected": -419.84271240234375, "loss": 0.3302, "rewards/chosen": 0.7187198230198452, "rewards/margins": 2.658856613295419, "rewards/rejected": -1.9401367902755737, "step": 13387 }, { "epoch": 0.7096175761269976, "grad_norm": 50.5, "kl": 1.5474843978881836, "learning_rate": 5e-07, "logits/chosen": -24873304.0, "logits/rejected": -46241258.666666664, "logps/chosen": -257.608642578125, "logps/rejected": -261.6920166015625, "loss": 0.4399, "rewards/chosen": -0.22501533031463622, "rewards/margins": 0.9712556918462116, "rewards/rejected": -1.196271022160848, "step": 13388 }, { "epoch": 0.7096705801287997, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5458902.0, "logits/rejected": 373870.0, "logps/chosen": -400.6499837239583, "logps/rejected": -474.5340576171875, "loss": 0.41, "rewards/chosen": 0.02558222661415736, "rewards/margins": 1.7814020241300266, "rewards/rejected": -1.7558197975158691, "step": 13389 }, { "epoch": 0.7097235841306019, "grad_norm": 36.5, "kl": 1.8932991027832031, "learning_rate": 5e-07, "logits/chosen": 1338924.1666666667, "logits/rejected": -17363507.2, "logps/chosen": -229.25736490885416, "logps/rejected": -118.12109375, "loss": 0.1842, "rewards/chosen": 1.4688021341959636, "rewards/margins": 3.700030008951823, "rewards/rejected": -2.2312278747558594, "step": 13390 }, { "epoch": 0.709776588132404, "grad_norm": 63.0, "kl": 3.1609983444213867, "learning_rate": 5e-07, "logits/chosen": -10292117.0, "logits/rejected": -5039514.0, "logps/chosen": -167.92495727539062, "logps/rejected": -235.83267211914062, "loss": 0.3413, "rewards/chosen": 0.2692866027355194, "rewards/margins": 1.7350248992443085, "rewards/rejected": -1.465738296508789, "step": 13391 }, { "epoch": 0.7098295921342062, "grad_norm": 39.0, "kl": 0.1533203125, "learning_rate": 5e-07, "logits/chosen": -10300052.0, "logits/rejected": -5031239.0, "logps/chosen": -236.916748046875, "logps/rejected": -347.71917724609375, "loss": 0.2592, "rewards/chosen": 0.599312961101532, "rewards/margins": 2.8540918231010437, "rewards/rejected": -2.2547788619995117, "step": 13392 }, { "epoch": 0.7098825961360082, "grad_norm": 54.25, "kl": 1.5345096588134766, "learning_rate": 5e-07, "logits/chosen": -49063814.4, "logits/rejected": 12960549.333333334, "logps/chosen": -405.83876953125, "logps/rejected": -439.330078125, "loss": 0.2736, "rewards/chosen": 0.8874859809875488, "rewards/margins": 3.1250807444254556, "rewards/rejected": -2.2375947634379068, "step": 13393 }, { "epoch": 0.7099356001378104, "grad_norm": 58.0, "kl": 0.7957801818847656, "learning_rate": 5e-07, "logits/chosen": -41596307.2, "logits/rejected": -70339434.66666667, "logps/chosen": -199.73212890625, "logps/rejected": -218.0224609375, "loss": 0.359, "rewards/chosen": 0.24056806564331054, "rewards/margins": 2.0038042704264325, "rewards/rejected": -1.7632362047831218, "step": 13394 }, { "epoch": 0.7099886041396125, "grad_norm": 54.5, "kl": 0.8758411407470703, "learning_rate": 5e-07, "logits/chosen": -34245282.666666664, "logits/rejected": -34339480.0, "logps/chosen": -299.0983072916667, "logps/rejected": -178.33033752441406, "loss": 0.2883, "rewards/chosen": 0.7875626087188721, "rewards/margins": 3.5058367252349854, "rewards/rejected": -2.7182741165161133, "step": 13395 }, { "epoch": 0.7100416081414147, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43559616.0, "logits/rejected": -27781014.4, "logps/chosen": -406.3548177083333, "logps/rejected": -213.26923828125, "loss": 0.2594, "rewards/chosen": 0.433502197265625, "rewards/margins": 2.138457679748535, "rewards/rejected": -1.70495548248291, "step": 13396 }, { "epoch": 0.7100946121432168, "grad_norm": 59.25, "kl": 3.221280097961426, "learning_rate": 5e-07, "logits/chosen": -58542011.428571425, "logits/rejected": -65038944.0, "logps/chosen": -237.42586844308036, "logps/rejected": -570.6890869140625, "loss": 0.4111, "rewards/chosen": 0.5802815301077706, "rewards/margins": 2.1611897093909125, "rewards/rejected": -1.580908179283142, "step": 13397 }, { "epoch": 0.710147616145019, "grad_norm": 32.25, "kl": 2.7508468627929688, "learning_rate": 5e-07, "logits/chosen": -2992401.0, "logits/rejected": -80579285.33333333, "logps/chosen": -168.73671875, "logps/rejected": -299.28896077473956, "loss": 0.3516, "rewards/chosen": 0.09221124649047852, "rewards/margins": 3.1997132301330566, "rewards/rejected": -3.107501983642578, "step": 13398 }, { "epoch": 0.7102006201468211, "grad_norm": 44.5, "kl": 0.8996248245239258, "learning_rate": 5e-07, "logits/chosen": -23131028.0, "logits/rejected": -11743820.0, "logps/chosen": -149.562744140625, "logps/rejected": -247.14234924316406, "loss": 0.3206, "rewards/chosen": 0.13695582747459412, "rewards/margins": 2.9355379045009613, "rewards/rejected": -2.798582077026367, "step": 13399 }, { "epoch": 0.7102536241486233, "grad_norm": 54.75, "kl": 0.602508544921875, "learning_rate": 5e-07, "logits/chosen": -52351187.2, "logits/rejected": 18251768.0, "logps/chosen": -455.2458984375, "logps/rejected": -500.1793619791667, "loss": 0.3638, "rewards/chosen": 0.31922245025634766, "rewards/margins": 1.50800625483195, "rewards/rejected": -1.1887838045756023, "step": 13400 }, { "epoch": 0.7103066281504253, "grad_norm": 58.5, "kl": 1.0158195495605469, "learning_rate": 5e-07, "logits/chosen": 3095110.6, "logits/rejected": -12062725.333333334, "logps/chosen": -89.780224609375, "logps/rejected": -291.7744547526042, "loss": 0.3803, "rewards/chosen": 0.47293548583984374, "rewards/margins": 1.8910986264546712, "rewards/rejected": -1.4181631406148274, "step": 13401 }, { "epoch": 0.7103596321522275, "grad_norm": 45.25, "kl": 2.254391670227051, "learning_rate": 5e-07, "logits/chosen": -14324320.0, "logits/rejected": 2635708.75, "logps/chosen": -225.89900716145834, "logps/rejected": -92.01458740234375, "loss": 0.2921, "rewards/chosen": 0.9105680783589681, "rewards/margins": 3.5563047726949057, "rewards/rejected": -2.6457366943359375, "step": 13402 }, { "epoch": 0.7104126361540296, "grad_norm": 39.75, "kl": 1.2363319396972656, "learning_rate": 5e-07, "logits/chosen": -14184418.666666666, "logits/rejected": -28553827.2, "logps/chosen": -389.492919921875, "logps/rejected": -272.727490234375, "loss": 0.2374, "rewards/chosen": 0.17640956242879233, "rewards/margins": 3.3963550885518394, "rewards/rejected": -3.219945526123047, "step": 13403 }, { "epoch": 0.7104656401558318, "grad_norm": 47.0, "kl": 1.209303855895996, "learning_rate": 5e-07, "logits/chosen": -11076386.666666666, "logits/rejected": -21644744.0, "logps/chosen": -371.056884765625, "logps/rejected": -186.349853515625, "loss": 0.2397, "rewards/chosen": 0.7434285481770834, "rewards/margins": 2.6140740712483725, "rewards/rejected": -1.870645523071289, "step": 13404 }, { "epoch": 0.7105186441576339, "grad_norm": 54.75, "kl": 1.8999595642089844, "learning_rate": 5e-07, "logits/chosen": -57326112.0, "logits/rejected": -13242660.0, "logps/chosen": -234.3148193359375, "logps/rejected": -428.82562255859375, "loss": 0.3921, "rewards/chosen": 0.34258341789245605, "rewards/margins": 2.8602898120880127, "rewards/rejected": -2.5177063941955566, "step": 13405 }, { "epoch": 0.7105716481594361, "grad_norm": 49.0, "kl": 3.3925857543945312, "learning_rate": 5e-07, "logits/chosen": -25928374.0, "logits/rejected": 1871654.0, "logps/chosen": -872.8914794921875, "logps/rejected": -362.2255554199219, "loss": 0.1978, "rewards/chosen": 1.8720812797546387, "rewards/margins": 4.105384588241577, "rewards/rejected": -2.2333033084869385, "step": 13406 }, { "epoch": 0.7106246521612382, "grad_norm": 40.75, "kl": 3.695737838745117, "learning_rate": 5e-07, "logits/chosen": -30232952.0, "logits/rejected": -27150532.0, "logps/chosen": -404.9093424479167, "logps/rejected": -154.74697875976562, "loss": 0.3568, "rewards/chosen": 0.8223709265391032, "rewards/margins": 2.284430424372355, "rewards/rejected": -1.462059497833252, "step": 13407 }, { "epoch": 0.7106776561630403, "grad_norm": 34.25, "kl": 0.9012165069580078, "learning_rate": 5e-07, "logits/chosen": -45486480.0, "logits/rejected": -17401491.2, "logps/chosen": -500.9460856119792, "logps/rejected": -152.648828125, "loss": 0.2249, "rewards/chosen": 0.9274881680806478, "rewards/margins": 3.6557772000630697, "rewards/rejected": -2.728289031982422, "step": 13408 }, { "epoch": 0.7107306601648424, "grad_norm": 116.0, "kl": 0.46685028076171875, "learning_rate": 5e-07, "logits/chosen": -42597370.666666664, "logits/rejected": -22910340.0, "logps/chosen": -212.8226318359375, "logps/rejected": -225.65115356445312, "loss": 0.3818, "rewards/chosen": 0.19186743100484213, "rewards/margins": 3.1031883557637534, "rewards/rejected": -2.911320924758911, "step": 13409 }, { "epoch": 0.7107836641666446, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35669445.333333336, "logits/rejected": -6023846.5, "logps/chosen": -321.1607259114583, "logps/rejected": -140.83587646484375, "loss": 0.3664, "rewards/chosen": 0.3158903320630391, "rewards/margins": 2.7391813000043235, "rewards/rejected": -2.423290967941284, "step": 13410 }, { "epoch": 0.7108366681684467, "grad_norm": 39.75, "kl": 3.24456787109375, "learning_rate": 5e-07, "logits/chosen": -3430008.0, "logits/rejected": -49116506.666666664, "logps/chosen": -182.5833740234375, "logps/rejected": -394.7677001953125, "loss": 0.3053, "rewards/chosen": 0.6694289684295655, "rewards/margins": 2.2606841564178466, "rewards/rejected": -1.5912551879882812, "step": 13411 }, { "epoch": 0.7108896721702489, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6112524.666666667, "logits/rejected": -24711584.0, "logps/chosen": -481.7770589192708, "logps/rejected": -450.715576171875, "loss": 0.2768, "rewards/chosen": 0.632446805636088, "rewards/margins": 2.397255460421244, "rewards/rejected": -1.7648086547851562, "step": 13412 }, { "epoch": 0.710942676172051, "grad_norm": 67.0, "kl": 0.26296234130859375, "learning_rate": 5e-07, "logits/chosen": 9868853.333333334, "logits/rejected": -9179292.0, "logps/chosen": -390.8898111979167, "logps/rejected": -241.284033203125, "loss": 0.2124, "rewards/chosen": 0.7438562711079916, "rewards/margins": 3.1450037320454918, "rewards/rejected": -2.4011474609375, "step": 13413 }, { "epoch": 0.7109956801738532, "grad_norm": 48.0, "kl": 2.407449722290039, "learning_rate": 5e-07, "logits/chosen": 6100310.8, "logits/rejected": -19490185.333333332, "logps/chosen": -152.7987060546875, "logps/rejected": -311.0738932291667, "loss": 0.2516, "rewards/chosen": 0.9913564682006836, "rewards/margins": 3.8659743626912437, "rewards/rejected": -2.87461789449056, "step": 13414 }, { "epoch": 0.7110486841756553, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3718113.25, "logits/rejected": -16066058.0, "logps/chosen": -139.2600860595703, "logps/rejected": -334.0189208984375, "loss": 0.3256, "rewards/chosen": -0.08992132544517517, "rewards/margins": 2.977922350168228, "rewards/rejected": -3.0678436756134033, "step": 13415 }, { "epoch": 0.7111016881774574, "grad_norm": 40.5, "kl": 0.45372581481933594, "learning_rate": 5e-07, "logits/chosen": -83127296.0, "logits/rejected": -20929206.666666668, "logps/chosen": -627.2433471679688, "logps/rejected": -263.63954671223956, "loss": 0.1845, "rewards/chosen": 1.6044052839279175, "rewards/margins": 3.509496569633484, "rewards/rejected": -1.9050912857055664, "step": 13416 }, { "epoch": 0.7111546921792595, "grad_norm": 35.75, "kl": 1.9142112731933594, "learning_rate": 5e-07, "logits/chosen": -14480259.2, "logits/rejected": -26736869.333333332, "logps/chosen": -182.4734375, "logps/rejected": -322.7659505208333, "loss": 0.2774, "rewards/chosen": 0.9660833358764649, "rewards/margins": 2.313987064361572, "rewards/rejected": -1.3479037284851074, "step": 13417 }, { "epoch": 0.7112076961810617, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10674668.0, "logits/rejected": -13481398.4, "logps/chosen": -115.87839762369792, "logps/rejected": -296.05009765625, "loss": 0.2176, "rewards/chosen": 0.69158403078715, "rewards/margins": 2.5014848868052164, "rewards/rejected": -1.8099008560180665, "step": 13418 }, { "epoch": 0.7112607001828638, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -448210.0, "logits/rejected": -15109188.8, "logps/chosen": -294.15004475911456, "logps/rejected": -227.60439453125, "loss": 0.314, "rewards/chosen": -0.3801085154215495, "rewards/margins": 1.889986737569173, "rewards/rejected": -2.2700952529907226, "step": 13419 }, { "epoch": 0.711313704184666, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13726054.0, "logits/rejected": -15994368.0, "logps/chosen": -383.19158935546875, "logps/rejected": -200.31773376464844, "loss": 0.4262, "rewards/chosen": -0.4130697548389435, "rewards/margins": 0.8586218059062958, "rewards/rejected": -1.2716915607452393, "step": 13420 }, { "epoch": 0.7113667081864681, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32933328.0, "logits/rejected": -21017872.0, "logps/chosen": -174.34136962890625, "logps/rejected": -152.5166748046875, "loss": 0.2134, "rewards/chosen": 0.29021302858988446, "rewards/margins": 3.196536652247111, "rewards/rejected": -2.9063236236572267, "step": 13421 }, { "epoch": 0.7114197121882702, "grad_norm": 46.75, "kl": 0.31909751892089844, "learning_rate": 5e-07, "logits/chosen": -18134540.0, "logits/rejected": -50082252.0, "logps/chosen": -202.42381286621094, "logps/rejected": -380.06890869140625, "loss": 0.3099, "rewards/chosen": -0.33420029282569885, "rewards/margins": 3.0390873849391937, "rewards/rejected": -3.3732876777648926, "step": 13422 }, { "epoch": 0.7114727161900724, "grad_norm": 43.75, "kl": 0.1647930145263672, "learning_rate": 5e-07, "logits/chosen": -71671320.0, "logits/rejected": -61311664.0, "logps/chosen": -457.9442138671875, "logps/rejected": -231.6815185546875, "loss": 0.2097, "rewards/chosen": 1.4359054565429688, "rewards/margins": 3.6327826182047525, "rewards/rejected": -2.1968771616617837, "step": 13423 }, { "epoch": 0.7115257201918744, "grad_norm": 58.25, "kl": 0.2067546844482422, "learning_rate": 5e-07, "logits/chosen": -44469296.0, "logits/rejected": -20686870.0, "logps/chosen": -347.164794921875, "logps/rejected": -274.31982421875, "loss": 0.297, "rewards/chosen": 0.710414469242096, "rewards/margins": 1.9168632626533508, "rewards/rejected": -1.2064487934112549, "step": 13424 }, { "epoch": 0.7115787241936766, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16749632.0, "logits/rejected": -72329866.66666667, "logps/chosen": -137.69586181640625, "logps/rejected": -262.4573974609375, "loss": 0.3164, "rewards/chosen": 0.4958635807037354, "rewards/margins": 2.3395910104115805, "rewards/rejected": -1.843727429707845, "step": 13425 }, { "epoch": 0.7116317281954787, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8637674.0, "logits/rejected": -61859680.0, "logps/chosen": -383.85992431640625, "logps/rejected": -282.03265380859375, "loss": 0.2407, "rewards/chosen": 0.8247486352920532, "rewards/margins": 2.812118172645569, "rewards/rejected": -1.9873695373535156, "step": 13426 }, { "epoch": 0.7116847321972809, "grad_norm": 68.0, "kl": 4.442939758300781, "learning_rate": 5e-07, "logits/chosen": -49631360.0, "logits/rejected": -41995920.0, "logps/chosen": -324.76389567057294, "logps/rejected": -460.41015625, "loss": 0.3941, "rewards/chosen": 0.27690692742665607, "rewards/margins": 2.4049191077550254, "rewards/rejected": -2.128012180328369, "step": 13427 }, { "epoch": 0.711737736199083, "grad_norm": 37.0, "kl": 0.4329109191894531, "learning_rate": 5e-07, "logits/chosen": -16658072.0, "logits/rejected": -28557336.0, "logps/chosen": -260.6191101074219, "logps/rejected": -250.31500244140625, "loss": 0.2433, "rewards/chosen": 0.5341898798942566, "rewards/margins": 3.2353896498680115, "rewards/rejected": -2.701199769973755, "step": 13428 }, { "epoch": 0.7117907402008852, "grad_norm": 42.25, "kl": 1.6117267608642578, "learning_rate": 5e-07, "logits/chosen": -13403977.333333334, "logits/rejected": -36535016.0, "logps/chosen": -344.0781656901042, "logps/rejected": -420.1871032714844, "loss": 0.335, "rewards/chosen": 0.49560463428497314, "rewards/margins": 3.990731120109558, "rewards/rejected": -3.495126485824585, "step": 13429 }, { "epoch": 0.7118437442026873, "grad_norm": 63.75, "kl": 1.0292510986328125, "learning_rate": 5e-07, "logits/chosen": -47359641.6, "logits/rejected": -51253194.666666664, "logps/chosen": -452.88310546875, "logps/rejected": -475.0352376302083, "loss": 0.3058, "rewards/chosen": 0.4995568752288818, "rewards/margins": 2.8856220404307047, "rewards/rejected": -2.3860651652018228, "step": 13430 }, { "epoch": 0.7118967482044894, "grad_norm": 45.0, "kl": 0.8741798400878906, "learning_rate": 5e-07, "logits/chosen": -25687350.0, "logits/rejected": -19482269.333333332, "logps/chosen": -225.91375732421875, "logps/rejected": -245.4325154622396, "loss": 0.173, "rewards/chosen": 2.017101287841797, "rewards/margins": 3.515289942423503, "rewards/rejected": -1.4981886545817058, "step": 13431 }, { "epoch": 0.7119497522062915, "grad_norm": 43.25, "kl": 1.2012519836425781, "learning_rate": 5e-07, "logits/chosen": -31661206.0, "logits/rejected": -20233349.333333332, "logps/chosen": -341.9169006347656, "logps/rejected": -387.1726888020833, "loss": 0.1636, "rewards/chosen": 1.6355133056640625, "rewards/margins": 4.107487042744955, "rewards/rejected": -2.471973737080892, "step": 13432 }, { "epoch": 0.7120027562080937, "grad_norm": 45.25, "kl": 0.7793445587158203, "learning_rate": 5e-07, "logits/chosen": -8220658.0, "logits/rejected": -36016572.8, "logps/chosen": -276.4639078776042, "logps/rejected": -447.659619140625, "loss": 0.194, "rewards/chosen": 1.2450255552927654, "rewards/margins": 3.6487651030222574, "rewards/rejected": -2.4037395477294923, "step": 13433 }, { "epoch": 0.7120557602098958, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57407712.0, "logits/rejected": -26665252.8, "logps/chosen": -595.7566731770834, "logps/rejected": -584.00986328125, "loss": 0.1903, "rewards/chosen": 0.48833008607228595, "rewards/margins": 3.3503269275029504, "rewards/rejected": -2.8619968414306642, "step": 13434 }, { "epoch": 0.712108764211698, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21933760.0, "logits/rejected": -36422716.0, "logps/chosen": -362.5205993652344, "logps/rejected": -313.9969177246094, "loss": 0.2969, "rewards/chosen": 0.11014223098754883, "rewards/margins": 2.642839193344116, "rewards/rejected": -2.5326969623565674, "step": 13435 }, { "epoch": 0.7121617682135001, "grad_norm": 33.25, "kl": 0.19774246215820312, "learning_rate": 5e-07, "logits/chosen": -880516.0, "logits/rejected": -33909875.2, "logps/chosen": -137.45648193359375, "logps/rejected": -481.41640625, "loss": 0.2044, "rewards/chosen": 0.3787118196487427, "rewards/margins": 3.883906674385071, "rewards/rejected": -3.5051948547363283, "step": 13436 }, { "epoch": 0.7122147722153023, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50804218.666666664, "logits/rejected": -17818696.0, "logps/chosen": -504.7957356770833, "logps/rejected": -584.4012451171875, "loss": 0.3435, "rewards/chosen": 0.37199342250823975, "rewards/margins": 3.537842631340027, "rewards/rejected": -3.165849208831787, "step": 13437 }, { "epoch": 0.7122677762171044, "grad_norm": 51.5, "kl": 3.9281158447265625, "learning_rate": 5e-07, "logits/chosen": -37609949.333333336, "logits/rejected": -35142512.0, "logps/chosen": -332.2236735026042, "logps/rejected": -223.8645782470703, "loss": 0.454, "rewards/chosen": 0.43548814455668133, "rewards/margins": 1.7721214691797893, "rewards/rejected": -1.336633324623108, "step": 13438 }, { "epoch": 0.7123207802189065, "grad_norm": 67.0, "kl": 2.582317352294922, "learning_rate": 5e-07, "logits/chosen": -24707545.14285714, "logits/rejected": -59374628.0, "logps/chosen": -451.4047154017857, "logps/rejected": -771.75048828125, "loss": 0.3117, "rewards/chosen": 1.0399648802621024, "rewards/margins": 6.825566973005023, "rewards/rejected": -5.78560209274292, "step": 13439 }, { "epoch": 0.7123737842207086, "grad_norm": 56.5, "kl": 0.12005615234375, "learning_rate": 5e-07, "logits/chosen": -22086579.2, "logits/rejected": -35386338.666666664, "logps/chosen": -566.6283203125, "logps/rejected": -304.76904296875, "loss": 0.2097, "rewards/chosen": 1.170955276489258, "rewards/margins": 3.1874946912129722, "rewards/rejected": -2.0165394147237143, "step": 13440 }, { "epoch": 0.7124267882225108, "grad_norm": 49.75, "kl": 0.6095848083496094, "learning_rate": 5e-07, "logits/chosen": -3457311.3333333335, "logits/rejected": -20835310.0, "logps/chosen": -315.1146647135417, "logps/rejected": -206.71771240234375, "loss": 0.258, "rewards/chosen": 0.8536491394042969, "rewards/margins": 4.459332466125488, "rewards/rejected": -3.6056833267211914, "step": 13441 }, { "epoch": 0.7124797922243129, "grad_norm": 51.25, "kl": 2.1328506469726562, "learning_rate": 5e-07, "logits/chosen": -14519793.6, "logits/rejected": -49389194.666666664, "logps/chosen": -680.79375, "logps/rejected": -248.33394368489584, "loss": 0.2506, "rewards/chosen": 1.2372780799865724, "rewards/margins": 3.589889939626058, "rewards/rejected": -2.352611859639486, "step": 13442 }, { "epoch": 0.7125327962261151, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18834754.0, "logits/rejected": -30889488.0, "logps/chosen": -399.963134765625, "logps/rejected": -290.8221435546875, "loss": 0.2782, "rewards/chosen": 0.5206201672554016, "rewards/margins": 2.302913248538971, "rewards/rejected": -1.7822930812835693, "step": 13443 }, { "epoch": 0.7125858002279172, "grad_norm": 33.25, "kl": 1.143660545349121, "learning_rate": 5e-07, "logits/chosen": -13373466.666666666, "logits/rejected": -20150545.6, "logps/chosen": -523.6186930338541, "logps/rejected": -317.880029296875, "loss": 0.1638, "rewards/chosen": 1.4695439338684082, "rewards/margins": 3.9680827140808104, "rewards/rejected": -2.498538780212402, "step": 13444 }, { "epoch": 0.7126388042297194, "grad_norm": 56.5, "kl": 0.3521881103515625, "learning_rate": 5e-07, "logits/chosen": 230434.890625, "logits/rejected": -15235556.0, "logps/chosen": -311.42779541015625, "logps/rejected": -387.78826904296875, "loss": 0.223, "rewards/chosen": 1.030500888824463, "rewards/margins": 3.4979920387268066, "rewards/rejected": -2.4674911499023438, "step": 13445 }, { "epoch": 0.7126918082315215, "grad_norm": 29.375, "kl": 5.330034255981445, "learning_rate": 5e-07, "logits/chosen": -788834.6666666666, "logits/rejected": -12324898.4, "logps/chosen": -360.7591145833333, "logps/rejected": -390.2226806640625, "loss": 0.19, "rewards/chosen": 1.1771968205769856, "rewards/margins": 3.8026604970296223, "rewards/rejected": -2.625463676452637, "step": 13446 }, { "epoch": 0.7127448122333236, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4503393.333333333, "logits/rejected": -16098139.2, "logps/chosen": -612.0506998697916, "logps/rejected": -221.34716796875, "loss": 0.2516, "rewards/chosen": 0.6304829915364584, "rewards/margins": 2.628930409749349, "rewards/rejected": -1.9984474182128906, "step": 13447 }, { "epoch": 0.7127978162351257, "grad_norm": 59.0, "kl": 6.665966987609863, "learning_rate": 5e-07, "logits/chosen": -17656571.42857143, "logits/rejected": -44772744.0, "logps/chosen": -414.08447265625, "logps/rejected": -596.4337768554688, "loss": 0.4222, "rewards/chosen": 0.7879893439156669, "rewards/margins": 4.160096372876849, "rewards/rejected": -3.3721070289611816, "step": 13448 }, { "epoch": 0.7128508202369279, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11648225.0, "logits/rejected": -16842145.333333332, "logps/chosen": -275.6568603515625, "logps/rejected": -266.56553141276044, "loss": 0.1836, "rewards/chosen": 0.8924762606620789, "rewards/margins": 3.3421794772148132, "rewards/rejected": -2.4497032165527344, "step": 13449 }, { "epoch": 0.71290382423873, "grad_norm": 84.0, "kl": 0.9542083740234375, "learning_rate": 5e-07, "logits/chosen": 61107285.333333336, "logits/rejected": -32323868.8, "logps/chosen": -182.88846842447916, "logps/rejected": -307.822314453125, "loss": 0.2868, "rewards/chosen": 0.33659199873606366, "rewards/margins": 2.003697482744853, "rewards/rejected": -1.667105484008789, "step": 13450 }, { "epoch": 0.7129568282405322, "grad_norm": 56.0, "kl": 1.7201919555664062, "learning_rate": 5e-07, "logits/chosen": -17077596.0, "logits/rejected": -22558750.0, "logps/chosen": -177.88075256347656, "logps/rejected": -400.4693298339844, "loss": 0.3549, "rewards/chosen": 0.27295559644699097, "rewards/margins": 2.120743453502655, "rewards/rejected": -1.847787857055664, "step": 13451 }, { "epoch": 0.7130098322423343, "grad_norm": 53.25, "kl": 3.157777786254883, "learning_rate": 5e-07, "logits/chosen": -50280680.0, "logits/rejected": 15987355.0, "logps/chosen": -295.41046142578125, "logps/rejected": -265.9954833984375, "loss": 0.2486, "rewards/chosen": 0.8299223184585571, "rewards/margins": 3.923174500465393, "rewards/rejected": -3.093252182006836, "step": 13452 }, { "epoch": 0.7130628362441365, "grad_norm": 43.5, "kl": 3.3997039794921875, "learning_rate": 5e-07, "logits/chosen": -9064662.666666666, "logits/rejected": -25738480.0, "logps/chosen": -185.14664713541666, "logps/rejected": -256.6227722167969, "loss": 0.348, "rewards/chosen": 0.627243439356486, "rewards/margins": 2.334489027659098, "rewards/rejected": -1.7072455883026123, "step": 13453 }, { "epoch": 0.7131158402459385, "grad_norm": 38.75, "kl": 1.7744808197021484, "learning_rate": 5e-07, "logits/chosen": -35712880.0, "logits/rejected": -8713234.0, "logps/chosen": -224.98583984375, "logps/rejected": -156.73388671875, "loss": 0.2948, "rewards/chosen": 0.19694724678993225, "rewards/margins": 2.8926078379154205, "rewards/rejected": -2.6956605911254883, "step": 13454 }, { "epoch": 0.7131688442477407, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40139736.0, "logits/rejected": -34795288.0, "logps/chosen": -350.67755126953125, "logps/rejected": -557.7027587890625, "loss": 0.1464, "rewards/chosen": 1.365917682647705, "rewards/margins": 4.73301887512207, "rewards/rejected": -3.3671011924743652, "step": 13455 }, { "epoch": 0.7132218482495428, "grad_norm": 47.0, "kl": 0.4572944641113281, "learning_rate": 5e-07, "logits/chosen": -61661909.333333336, "logits/rejected": -19923302.4, "logps/chosen": -281.3651123046875, "logps/rejected": -263.69052734375, "loss": 0.2208, "rewards/chosen": 0.607373317082723, "rewards/margins": 2.821917422612508, "rewards/rejected": -2.214544105529785, "step": 13456 }, { "epoch": 0.713274852251345, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17021856.0, "logits/rejected": -11018718.0, "logps/chosen": -191.35763549804688, "logps/rejected": -293.9889221191406, "loss": 0.34, "rewards/chosen": -0.3355565369129181, "rewards/margins": 2.094140499830246, "rewards/rejected": -2.429697036743164, "step": 13457 }, { "epoch": 0.7133278562531471, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16981278.0, "logits/rejected": -56930796.0, "logps/chosen": -302.4977722167969, "logps/rejected": -462.252197265625, "loss": 0.2784, "rewards/chosen": 0.39925339818000793, "rewards/margins": 3.192233592271805, "rewards/rejected": -2.792980194091797, "step": 13458 }, { "epoch": 0.7133808602549493, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58504956.0, "logits/rejected": -18016788.0, "logps/chosen": -386.1559143066406, "logps/rejected": -452.2437744140625, "loss": 0.1465, "rewards/chosen": 0.7581497430801392, "rewards/margins": 3.8729648192723594, "rewards/rejected": -3.11481507619222, "step": 13459 }, { "epoch": 0.7134338642567514, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29987458.0, "logits/rejected": -32417182.0, "logps/chosen": -368.00506591796875, "logps/rejected": -275.7630920410156, "loss": 0.301, "rewards/chosen": 0.31908491253852844, "rewards/margins": 2.3913677036762238, "rewards/rejected": -2.0722827911376953, "step": 13460 }, { "epoch": 0.7134868682585536, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47643040.0, "logits/rejected": -36100128.0, "logps/chosen": -221.04385375976562, "logps/rejected": -387.7822265625, "loss": 0.2835, "rewards/chosen": -0.14269523322582245, "rewards/margins": 3.894232466816902, "rewards/rejected": -4.036927700042725, "step": 13461 }, { "epoch": 0.7135398722603556, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8352935.333333333, "logits/rejected": -10739268.0, "logps/chosen": -328.7469889322917, "logps/rejected": -202.2393035888672, "loss": 0.3469, "rewards/chosen": 0.45629199345906574, "rewards/margins": 2.1999362309773765, "rewards/rejected": -1.7436442375183105, "step": 13462 }, { "epoch": 0.7135928762621578, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22476554.0, "logits/rejected": -23411888.0, "logps/chosen": -150.7520751953125, "logps/rejected": -292.2037353515625, "loss": 0.2842, "rewards/chosen": 0.1785227358341217, "rewards/margins": 2.4335664808750153, "rewards/rejected": -2.2550437450408936, "step": 13463 }, { "epoch": 0.7136458802639599, "grad_norm": 70.5, "kl": 0.4905738830566406, "learning_rate": 5e-07, "logits/chosen": 53484118.4, "logits/rejected": -15483885.333333334, "logps/chosen": -359.08955078125, "logps/rejected": -274.33642578125, "loss": 0.2491, "rewards/chosen": 0.908664321899414, "rewards/margins": 2.9426847457885743, "rewards/rejected": -2.03402042388916, "step": 13464 }, { "epoch": 0.7136988842657621, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21528992.0, "logits/rejected": -13120194.285714285, "logps/chosen": -44.82098388671875, "logps/rejected": -178.36941964285714, "loss": 0.08, "rewards/chosen": 1.4197098016738892, "rewards/margins": 4.251853312764849, "rewards/rejected": -2.8321435110909596, "step": 13465 }, { "epoch": 0.7137518882675642, "grad_norm": 73.5, "kl": 3.89947509765625, "learning_rate": 5e-07, "logits/chosen": -37727913.14285714, "logits/rejected": -8992854.0, "logps/chosen": -411.77207728794644, "logps/rejected": -151.19949340820312, "loss": 0.2295, "rewards/chosen": 1.6213254928588867, "rewards/margins": 5.864952087402344, "rewards/rejected": -4.243626594543457, "step": 13466 }, { "epoch": 0.7138048922693664, "grad_norm": 51.75, "kl": 3.537787437438965, "learning_rate": 5e-07, "logits/chosen": 11603040.0, "logits/rejected": 1930953.0, "logps/chosen": -208.25956217447916, "logps/rejected": -228.94423828125, "loss": 0.3549, "rewards/chosen": 0.3541335662206014, "rewards/margins": 1.5319833358128865, "rewards/rejected": -1.1778497695922852, "step": 13467 }, { "epoch": 0.7138578962711685, "grad_norm": 61.5, "kl": 1.7985610961914062, "learning_rate": 5e-07, "logits/chosen": -30473509.333333332, "logits/rejected": -61780856.0, "logps/chosen": -347.6660563151042, "logps/rejected": -458.4804382324219, "loss": 0.3623, "rewards/chosen": 0.37901751200358075, "rewards/margins": 3.679492155710856, "rewards/rejected": -3.3004746437072754, "step": 13468 }, { "epoch": 0.7139109002729707, "grad_norm": 39.75, "kl": 0.06350326538085938, "learning_rate": 5e-07, "logits/chosen": -29806516.0, "logits/rejected": -24682870.85714286, "logps/chosen": -599.0703735351562, "logps/rejected": -137.41055733816964, "loss": 0.243, "rewards/chosen": 0.34895631670951843, "rewards/margins": 1.8397207983902522, "rewards/rejected": -1.4907644816807337, "step": 13469 }, { "epoch": 0.7139639042747727, "grad_norm": 73.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4211728.0, "logits/rejected": -29337648.0, "logps/chosen": -393.1512451171875, "logps/rejected": -430.971435546875, "loss": 0.3203, "rewards/chosen": 0.20010955333709718, "rewards/margins": 2.539891791343689, "rewards/rejected": -2.339782238006592, "step": 13470 }, { "epoch": 0.7140169082765749, "grad_norm": 52.75, "kl": 0.20237255096435547, "learning_rate": 5e-07, "logits/chosen": -57306692.0, "logits/rejected": -33184276.0, "logps/chosen": -422.2032165527344, "logps/rejected": -353.427490234375, "loss": 0.3, "rewards/chosen": 0.007157608866691589, "rewards/margins": 2.5021214932203293, "rewards/rejected": -2.4949638843536377, "step": 13471 }, { "epoch": 0.714069912278377, "grad_norm": 48.0, "kl": 0.24163055419921875, "learning_rate": 5e-07, "logits/chosen": -31865184.0, "logits/rejected": -22881669.333333332, "logps/chosen": -392.24052734375, "logps/rejected": -330.35992431640625, "loss": 0.2171, "rewards/chosen": 0.9192514419555664, "rewards/margins": 4.163369178771973, "rewards/rejected": -3.2441177368164062, "step": 13472 }, { "epoch": 0.7141229162801791, "grad_norm": 44.25, "kl": 0.3651542663574219, "learning_rate": 5e-07, "logits/chosen": 12634277.0, "logits/rejected": -14961184.0, "logps/chosen": -92.89153289794922, "logps/rejected": -176.76678466796875, "loss": 0.2482, "rewards/chosen": 0.36694738268852234, "rewards/margins": 2.21738263964653, "rewards/rejected": -1.8504352569580078, "step": 13473 }, { "epoch": 0.7141759202819813, "grad_norm": 55.0, "kl": 0.0776529312133789, "learning_rate": 5e-07, "logits/chosen": -37750469.333333336, "logits/rejected": -4022896.75, "logps/chosen": -406.9619954427083, "logps/rejected": -272.9599304199219, "loss": 0.2813, "rewards/chosen": 1.1749107042948406, "rewards/margins": 2.7869359652201338, "rewards/rejected": -1.612025260925293, "step": 13474 }, { "epoch": 0.7142289242837834, "grad_norm": 34.5, "kl": 0.3955421447753906, "learning_rate": 5e-07, "logits/chosen": -9059008.0, "logits/rejected": -26654957.333333332, "logps/chosen": -193.41063232421874, "logps/rejected": -340.6089274088542, "loss": 0.291, "rewards/chosen": 0.6839824676513672, "rewards/margins": 2.687887763977051, "rewards/rejected": -2.0039052963256836, "step": 13475 }, { "epoch": 0.7142819282855856, "grad_norm": 48.75, "kl": 0.3317909240722656, "learning_rate": 5e-07, "logits/chosen": -39067781.333333336, "logits/rejected": -1190761.4, "logps/chosen": -254.3800048828125, "logps/rejected": -362.5130615234375, "loss": 0.287, "rewards/chosen": 0.6700773239135742, "rewards/margins": 2.04897518157959, "rewards/rejected": -1.3788978576660156, "step": 13476 }, { "epoch": 0.7143349322873876, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -126317128.0, "logits/rejected": -11263072.0, "logps/chosen": -499.14849853515625, "logps/rejected": -440.2626953125, "loss": 0.1624, "rewards/chosen": 0.8105309009552002, "rewards/margins": 3.551340023676554, "rewards/rejected": -2.740809122721354, "step": 13477 }, { "epoch": 0.7143879362891898, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36981168.0, "logits/rejected": -20133813.333333332, "logps/chosen": -320.23773193359375, "logps/rejected": -327.0683186848958, "loss": 0.1991, "rewards/chosen": 0.41653746366500854, "rewards/margins": 3.0320425232251487, "rewards/rejected": -2.61550505956014, "step": 13478 }, { "epoch": 0.7144409402909919, "grad_norm": 43.75, "kl": 3.0184097290039062, "learning_rate": 5e-07, "logits/chosen": -25476150.4, "logits/rejected": -8970282.666666666, "logps/chosen": -349.3835693359375, "logps/rejected": -408.1246744791667, "loss": 0.3068, "rewards/chosen": 0.4690548896789551, "rewards/margins": 3.8070491472880046, "rewards/rejected": -3.3379942576090493, "step": 13479 }, { "epoch": 0.7144939442927941, "grad_norm": 43.5, "kl": 1.5780868530273438, "learning_rate": 5e-07, "logits/chosen": -33976595.2, "logits/rejected": -14742169.333333334, "logps/chosen": -419.29599609375, "logps/rejected": -425.7666829427083, "loss": 0.2184, "rewards/chosen": 1.227115535736084, "rewards/margins": 3.37707732518514, "rewards/rejected": -2.149961789449056, "step": 13480 }, { "epoch": 0.7145469482945962, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8176688.666666667, "logits/rejected": -19103086.4, "logps/chosen": -23.169764200846355, "logps/rejected": -352.7858642578125, "loss": 0.27, "rewards/chosen": 0.18869986136754355, "rewards/margins": 2.0257527788480125, "rewards/rejected": -1.8370529174804688, "step": 13481 }, { "epoch": 0.7145999522963984, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63870096.0, "logits/rejected": -21925608.0, "logps/chosen": -377.3652038574219, "logps/rejected": -249.12261962890625, "loss": 0.2516, "rewards/chosen": 0.7571380734443665, "rewards/margins": 2.9904211163520813, "rewards/rejected": -2.233283042907715, "step": 13482 }, { "epoch": 0.7146529562982005, "grad_norm": 59.75, "kl": 3.5198822021484375, "learning_rate": 5e-07, "logits/chosen": -77107398.4, "logits/rejected": -39086168.0, "logps/chosen": -473.671875, "logps/rejected": -641.81640625, "loss": 0.3033, "rewards/chosen": 0.9207037925720215, "rewards/margins": 3.2472612698872885, "rewards/rejected": -2.326557477315267, "step": 13483 }, { "epoch": 0.7147059603000027, "grad_norm": 49.0, "kl": 0.15398216247558594, "learning_rate": 5e-07, "logits/chosen": -26628328.0, "logits/rejected": 3777513.75, "logps/chosen": -113.14483642578125, "logps/rejected": -351.24395751953125, "loss": 0.2875, "rewards/chosen": 0.26237910985946655, "rewards/margins": 2.9212576746940613, "rewards/rejected": -2.6588785648345947, "step": 13484 }, { "epoch": 0.7147589643018047, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69696064.0, "logits/rejected": -6983440.0, "logps/chosen": -528.9829711914062, "logps/rejected": -215.60627092633928, "loss": 0.1997, "rewards/chosen": 0.14872436225414276, "rewards/margins": 2.3732834649937495, "rewards/rejected": -2.2245591027396068, "step": 13485 }, { "epoch": 0.7148119683036069, "grad_norm": 58.75, "kl": 1.884378433227539, "learning_rate": 5e-07, "logits/chosen": -5791005.333333333, "logits/rejected": -103661.5, "logps/chosen": -284.821044921875, "logps/rejected": -96.26144409179688, "loss": 0.2828, "rewards/chosen": 0.8576554457346598, "rewards/margins": 2.3658150831858316, "rewards/rejected": -1.5081596374511719, "step": 13486 }, { "epoch": 0.714864972305409, "grad_norm": 45.5, "kl": 1.2655420303344727, "learning_rate": 5e-07, "logits/chosen": -10198848.666666666, "logits/rejected": 12334674.0, "logps/chosen": -146.36517333984375, "logps/rejected": -283.53057861328125, "loss": 0.3695, "rewards/chosen": 0.3809823195139567, "rewards/margins": 2.4076735178629556, "rewards/rejected": -2.026691198348999, "step": 13487 }, { "epoch": 0.7149179763072112, "grad_norm": 56.0, "kl": 0.9074859619140625, "learning_rate": 5e-07, "logits/chosen": 79782576.0, "logits/rejected": -24325098.0, "logps/chosen": -316.6839294433594, "logps/rejected": -351.15802001953125, "loss": 0.2827, "rewards/chosen": 0.1247243881225586, "rewards/margins": 2.696439504623413, "rewards/rejected": -2.5717151165008545, "step": 13488 }, { "epoch": 0.7149709803090133, "grad_norm": 47.75, "kl": 0.9292840957641602, "learning_rate": 5e-07, "logits/chosen": -16356675.42857143, "logits/rejected": 11834117.0, "logps/chosen": -177.919189453125, "logps/rejected": -399.71051025390625, "loss": 0.4649, "rewards/chosen": 0.013709264142172677, "rewards/margins": 1.872873740536826, "rewards/rejected": -1.8591644763946533, "step": 13489 }, { "epoch": 0.7150239843108155, "grad_norm": 44.75, "kl": 0.9537887573242188, "learning_rate": 5e-07, "logits/chosen": -25755402.666666668, "logits/rejected": -13192626.4, "logps/chosen": -317.0419921875, "logps/rejected": -129.40723876953126, "loss": 0.2455, "rewards/chosen": 0.2483129103978475, "rewards/margins": 2.2030847152074178, "rewards/rejected": -1.9547718048095704, "step": 13490 }, { "epoch": 0.7150769883126176, "grad_norm": 147.0, "kl": 0.8189187049865723, "learning_rate": 5e-07, "logits/chosen": -19854472.0, "logits/rejected": -50194016.0, "logps/chosen": -290.170166015625, "logps/rejected": -592.4407958984375, "loss": 0.3497, "rewards/chosen": 0.26454681158065796, "rewards/margins": 5.854453980922699, "rewards/rejected": -5.589907169342041, "step": 13491 }, { "epoch": 0.7151299923144198, "grad_norm": 56.25, "kl": 2.3810386657714844, "learning_rate": 5e-07, "logits/chosen": -17677564.8, "logits/rejected": -39059765.333333336, "logps/chosen": -189.43233642578124, "logps/rejected": -298.84474690755206, "loss": 0.3667, "rewards/chosen": 0.4683248043060303, "rewards/margins": 2.1370736598968505, "rewards/rejected": -1.6687488555908203, "step": 13492 }, { "epoch": 0.7151829963162218, "grad_norm": 63.75, "kl": 0.28890037536621094, "learning_rate": 5e-07, "logits/chosen": 4925258.333333333, "logits/rejected": -51772020.0, "logps/chosen": -391.8182779947917, "logps/rejected": -430.61578369140625, "loss": 0.2658, "rewards/chosen": 0.8418013254801432, "rewards/margins": 2.88234011332194, "rewards/rejected": -2.040538787841797, "step": 13493 }, { "epoch": 0.715236000318024, "grad_norm": 44.0, "kl": 2.672787666320801, "learning_rate": 5e-07, "logits/chosen": -29464708.57142857, "logits/rejected": -23043988.0, "logps/chosen": -490.24637276785717, "logps/rejected": -205.82955932617188, "loss": 0.3316, "rewards/chosen": 1.3047962188720703, "rewards/margins": 2.537753701210022, "rewards/rejected": -1.2329574823379517, "step": 13494 }, { "epoch": 0.7152890043198261, "grad_norm": 76.0, "kl": 1.1426315307617188, "learning_rate": 5e-07, "logits/chosen": 4824368.8, "logits/rejected": -25575077.333333332, "logps/chosen": -702.388916015625, "logps/rejected": -213.87274169921875, "loss": 0.3729, "rewards/chosen": 0.42815365791320803, "rewards/margins": 1.529153331120809, "rewards/rejected": -1.1009996732076008, "step": 13495 }, { "epoch": 0.7153420083216283, "grad_norm": 57.75, "kl": 2.432769775390625, "learning_rate": 5e-07, "logits/chosen": -2976681.25, "logits/rejected": 59392464.0, "logps/chosen": -209.81886291503906, "logps/rejected": -312.3108215332031, "loss": 0.2394, "rewards/chosen": 1.0111418962478638, "rewards/margins": 2.7369216680526733, "rewards/rejected": -1.7257797718048096, "step": 13496 }, { "epoch": 0.7153950123234304, "grad_norm": 52.75, "kl": 3.0794239044189453, "learning_rate": 5e-07, "logits/chosen": -12840415.2, "logits/rejected": -11769793.333333334, "logps/chosen": -135.31868896484374, "logps/rejected": -307.98358154296875, "loss": 0.4659, "rewards/chosen": -0.21091811656951903, "rewards/margins": 2.1505586226781213, "rewards/rejected": -2.36147673924764, "step": 13497 }, { "epoch": 0.7154480163252326, "grad_norm": 56.25, "kl": 2.2626190185546875, "learning_rate": 5e-07, "logits/chosen": -25425294.0, "logits/rejected": -25421078.0, "logps/chosen": -345.2837219238281, "logps/rejected": -406.9569396972656, "loss": 0.2742, "rewards/chosen": 0.7397590279579163, "rewards/margins": 3.3558185696601868, "rewards/rejected": -2.6160595417022705, "step": 13498 }, { "epoch": 0.7155010203270347, "grad_norm": 55.75, "kl": 1.4239788055419922, "learning_rate": 5e-07, "logits/chosen": -27282547.2, "logits/rejected": 4497516.333333333, "logps/chosen": -369.5314208984375, "logps/rejected": -68.04364522298177, "loss": 0.3987, "rewards/chosen": 0.18040771484375, "rewards/margins": 2.3877431869506838, "rewards/rejected": -2.2073354721069336, "step": 13499 }, { "epoch": 0.7155540243288369, "grad_norm": 64.5, "kl": 4.131580352783203, "learning_rate": 5e-07, "logits/chosen": -14284456.0, "logits/rejected": -32180940.8, "logps/chosen": -438.2572428385417, "logps/rejected": -365.165380859375, "loss": 0.1181, "rewards/chosen": 1.949385643005371, "rewards/margins": 5.016183662414551, "rewards/rejected": -3.0667980194091795, "step": 13500 }, { "epoch": 0.7156070283306389, "grad_norm": 53.5, "kl": 3.237088203430176, "learning_rate": 5e-07, "logits/chosen": -25643876.57142857, "logits/rejected": -2245864.0, "logps/chosen": -270.90872628348217, "logps/rejected": -98.14364624023438, "loss": 0.3973, "rewards/chosen": 0.712160587310791, "rewards/margins": 1.8777711391448975, "rewards/rejected": -1.1656105518341064, "step": 13501 }, { "epoch": 0.7156600323324411, "grad_norm": 29.125, "kl": 0.8756542205810547, "learning_rate": 5e-07, "logits/chosen": -26931722.666666668, "logits/rejected": -25230094.4, "logps/chosen": -275.9833984375, "logps/rejected": -172.3833740234375, "loss": 0.1792, "rewards/chosen": 1.9733339945475261, "rewards/margins": 3.4095312754313154, "rewards/rejected": -1.436197280883789, "step": 13502 }, { "epoch": 0.7157130363342432, "grad_norm": 56.0, "kl": 3.0942535400390625, "learning_rate": 5e-07, "logits/chosen": -2890579.3333333335, "logits/rejected": -37505600.0, "logps/chosen": -443.0493977864583, "logps/rejected": -528.78662109375, "loss": 0.2675, "rewards/chosen": 1.129031737645467, "rewards/margins": 3.7271779378255205, "rewards/rejected": -2.5981462001800537, "step": 13503 }, { "epoch": 0.7157660403360454, "grad_norm": 74.5, "kl": 7.284528732299805, "learning_rate": 5e-07, "logits/chosen": -34209952.0, "logits/rejected": -54103112.0, "logps/chosen": -823.02587890625, "logps/rejected": -515.5877685546875, "loss": 0.2254, "rewards/chosen": 2.0020341873168945, "rewards/margins": 5.065975666046143, "rewards/rejected": -3.063941478729248, "step": 13504 }, { "epoch": 0.7158190443378475, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4840599.2, "logits/rejected": -29238194.666666668, "logps/chosen": -204.236474609375, "logps/rejected": -401.8910725911458, "loss": 0.2808, "rewards/chosen": 0.43954076766967776, "rewards/margins": 3.118506654103597, "rewards/rejected": -2.6789658864339194, "step": 13505 }, { "epoch": 0.7158720483396497, "grad_norm": 56.5, "kl": 6.630117416381836, "learning_rate": 5e-07, "logits/chosen": -43518061.71428572, "logits/rejected": -5323543.0, "logps/chosen": -354.8217075892857, "logps/rejected": -278.99554443359375, "loss": 0.4578, "rewards/chosen": 0.7702857426234654, "rewards/margins": 1.8080817631312778, "rewards/rejected": -1.0377960205078125, "step": 13506 }, { "epoch": 0.7159250523414518, "grad_norm": 44.5, "kl": 0.8197708129882812, "learning_rate": 5e-07, "logits/chosen": -36397048.0, "logits/rejected": -69682616.0, "logps/chosen": -414.9892883300781, "logps/rejected": -527.2958984375, "loss": 0.2297, "rewards/chosen": 0.634055495262146, "rewards/margins": 3.457767367362976, "rewards/rejected": -2.82371187210083, "step": 13507 }, { "epoch": 0.715978056343254, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27362448.0, "logits/rejected": -20240382.666666668, "logps/chosen": -279.69189453125, "logps/rejected": -340.45119222005206, "loss": 0.3573, "rewards/chosen": -0.012829577922821045, "rewards/margins": 2.168759548664093, "rewards/rejected": -2.181589126586914, "step": 13508 }, { "epoch": 0.716031060345056, "grad_norm": 49.0, "kl": 2.8089962005615234, "learning_rate": 5e-07, "logits/chosen": -2291733.0, "logits/rejected": -35162845.333333336, "logps/chosen": -111.26259765625, "logps/rejected": -164.47474161783853, "loss": 0.3496, "rewards/chosen": 0.6196579933166504, "rewards/margins": 1.840773344039917, "rewards/rejected": -1.2211153507232666, "step": 13509 }, { "epoch": 0.7160840643468582, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29553556.0, "logits/rejected": -17679720.0, "logps/chosen": -386.9154052734375, "logps/rejected": -329.0555114746094, "loss": 0.2943, "rewards/chosen": 0.23541298508644104, "rewards/margins": 2.2568813860416412, "rewards/rejected": -2.0214684009552, "step": 13510 }, { "epoch": 0.7161370683486603, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14001940.0, "logits/rejected": -8247054.0, "logps/chosen": -370.5545349121094, "logps/rejected": -231.4210408528646, "loss": 0.1783, "rewards/chosen": 1.0002753734588623, "rewards/margins": 2.9939337571461992, "rewards/rejected": -1.9936583836873372, "step": 13511 }, { "epoch": 0.7161900723504625, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46149380.0, "logits/rejected": 3513381.3333333335, "logps/chosen": -434.40631103515625, "logps/rejected": -257.55194091796875, "loss": 0.2257, "rewards/chosen": 0.18222197890281677, "rewards/margins": 2.2508383691310883, "rewards/rejected": -2.0686163902282715, "step": 13512 }, { "epoch": 0.7162430763522646, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85472664.0, "logits/rejected": -20420148.0, "logps/chosen": -540.485107421875, "logps/rejected": -500.5279134114583, "loss": 0.2086, "rewards/chosen": 0.5410065054893494, "rewards/margins": 3.1528982520103455, "rewards/rejected": -2.611891746520996, "step": 13513 }, { "epoch": 0.7162960803540668, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33098506.666666668, "logits/rejected": -44066640.0, "logps/chosen": -279.3075764973958, "logps/rejected": -454.49765625, "loss": 0.2177, "rewards/chosen": 0.3381078243255615, "rewards/margins": 2.6549743175506593, "rewards/rejected": -2.3168664932250977, "step": 13514 }, { "epoch": 0.7163490843558689, "grad_norm": 36.25, "kl": 3.3663597106933594, "learning_rate": 5e-07, "logits/chosen": 7636484.0, "logits/rejected": -22596964.0, "logps/chosen": -33.70293045043945, "logps/rejected": -295.7211608886719, "loss": 0.3227, "rewards/chosen": 0.38010871410369873, "rewards/margins": 3.3974095582962036, "rewards/rejected": -3.017300844192505, "step": 13515 }, { "epoch": 0.716402088357671, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11096079.0, "logits/rejected": -30765348.0, "logps/chosen": -396.52587890625, "logps/rejected": -450.9969482421875, "loss": 0.1567, "rewards/chosen": 1.2761276960372925, "rewards/margins": 4.184891581535339, "rewards/rejected": -2.908763885498047, "step": 13516 }, { "epoch": 0.7164550923594731, "grad_norm": 44.0, "kl": 0.8495597839355469, "learning_rate": 5e-07, "logits/chosen": 5398083.333333333, "logits/rejected": -14070868.8, "logps/chosen": -232.35272216796875, "logps/rejected": -184.14771728515626, "loss": 0.21, "rewards/chosen": 0.7080585161844889, "rewards/margins": 2.914648405710856, "rewards/rejected": -2.2065898895263674, "step": 13517 }, { "epoch": 0.7165080963612753, "grad_norm": 37.0, "kl": 0.6053085327148438, "learning_rate": 5e-07, "logits/chosen": -47775968.0, "logits/rejected": -44499884.8, "logps/chosen": -316.68507893880206, "logps/rejected": -317.25732421875, "loss": 0.2429, "rewards/chosen": 0.8950598239898682, "rewards/margins": 2.6905558109283447, "rewards/rejected": -1.7954959869384766, "step": 13518 }, { "epoch": 0.7165611003630774, "grad_norm": 52.25, "kl": 0.097991943359375, "learning_rate": 5e-07, "logits/chosen": -29626066.0, "logits/rejected": -4508101.0, "logps/chosen": -273.1096496582031, "logps/rejected": -416.87335205078125, "loss": 0.3046, "rewards/chosen": 0.4178522229194641, "rewards/margins": 2.3919478058815002, "rewards/rejected": -1.9740955829620361, "step": 13519 }, { "epoch": 0.7166141043648796, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12220246.0, "logits/rejected": -31333564.0, "logps/chosen": -290.1978759765625, "logps/rejected": -478.04583740234375, "loss": 0.248, "rewards/chosen": 0.5929282307624817, "rewards/margins": 3.2486546635627747, "rewards/rejected": -2.655726432800293, "step": 13520 }, { "epoch": 0.7166671083666817, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73415664.0, "logits/rejected": -30162725.333333332, "logps/chosen": -329.59832763671875, "logps/rejected": -306.6330973307292, "loss": 0.1975, "rewards/chosen": 0.4495483338832855, "rewards/margins": 2.701054185628891, "rewards/rejected": -2.2515058517456055, "step": 13521 }, { "epoch": 0.7167201123684839, "grad_norm": 40.25, "kl": 0.01380157470703125, "learning_rate": 5e-07, "logits/chosen": -115192874.66666667, "logits/rejected": -63368467.2, "logps/chosen": -424.2941080729167, "logps/rejected": -421.8853515625, "loss": 0.19, "rewards/chosen": 0.7968404293060303, "rewards/margins": 4.06112494468689, "rewards/rejected": -3.2642845153808593, "step": 13522 }, { "epoch": 0.716773116370286, "grad_norm": 56.25, "kl": 3.1316604614257812, "learning_rate": 5e-07, "logits/chosen": -30829818.666666668, "logits/rejected": -21773723.2, "logps/chosen": -897.0755208333334, "logps/rejected": -623.01435546875, "loss": 0.1262, "rewards/chosen": 2.4618000984191895, "rewards/margins": 5.261034297943115, "rewards/rejected": -2.7992341995239256, "step": 13523 }, { "epoch": 0.716826120372088, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41380784.0, "logits/rejected": -19571870.0, "logps/chosen": -437.41131591796875, "logps/rejected": -314.75, "loss": 0.2516, "rewards/chosen": 0.5277687311172485, "rewards/margins": 2.4947460889816284, "rewards/rejected": -1.9669773578643799, "step": 13524 }, { "epoch": 0.7168791243738902, "grad_norm": 60.25, "kl": 0.6729259490966797, "learning_rate": 5e-07, "logits/chosen": -47971532.8, "logits/rejected": -36561146.666666664, "logps/chosen": -618.368896484375, "logps/rejected": -329.00311279296875, "loss": 0.3127, "rewards/chosen": 0.6328572273254395, "rewards/margins": 2.9044164657592773, "rewards/rejected": -2.271559238433838, "step": 13525 }, { "epoch": 0.7169321283756923, "grad_norm": 30.875, "kl": 3.289557456970215, "learning_rate": 5e-07, "logits/chosen": 5234556.333333333, "logits/rejected": -45115430.4, "logps/chosen": -36.052782694498696, "logps/rejected": -411.5373046875, "loss": 0.1884, "rewards/chosen": 0.9581401348114014, "rewards/margins": 3.437174367904663, "rewards/rejected": -2.4790342330932615, "step": 13526 }, { "epoch": 0.7169851323774945, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 40991914.666666664, "logits/rejected": 21855326.4, "logps/chosen": -275.1207275390625, "logps/rejected": -280.170947265625, "loss": 0.2353, "rewards/chosen": 0.6526662111282349, "rewards/margins": 2.654196286201477, "rewards/rejected": -2.001530075073242, "step": 13527 }, { "epoch": 0.7170381363792966, "grad_norm": 50.0, "kl": 0.8733329772949219, "learning_rate": 5e-07, "logits/chosen": -25514360.0, "logits/rejected": -13666044.0, "logps/chosen": -291.12595621744794, "logps/rejected": -84.45823669433594, "loss": 0.2599, "rewards/chosen": 0.839255174001058, "rewards/margins": 3.422600587209066, "rewards/rejected": -2.583345413208008, "step": 13528 }, { "epoch": 0.7170911403810988, "grad_norm": 70.5, "kl": 0.27490234375, "learning_rate": 5e-07, "logits/chosen": -48870496.0, "logits/rejected": -86166696.0, "logps/chosen": -340.3875732421875, "logps/rejected": -241.60850524902344, "loss": 0.4268, "rewards/chosen": 0.052704617381095886, "rewards/margins": 0.6559637039899826, "rewards/rejected": -0.6032590866088867, "step": 13529 }, { "epoch": 0.7171441443829009, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36452565.333333336, "logits/rejected": -24170840.0, "logps/chosen": -224.9559326171875, "logps/rejected": -388.8355712890625, "loss": 0.2461, "rewards/chosen": 0.5070828596750895, "rewards/margins": 2.6000909010569253, "rewards/rejected": -2.093008041381836, "step": 13530 }, { "epoch": 0.717197148384703, "grad_norm": 59.5, "kl": 5.387350082397461, "learning_rate": 5e-07, "logits/chosen": -24580002.285714287, "logits/rejected": -72196344.0, "logps/chosen": -531.6867327008929, "logps/rejected": -660.2920532226562, "loss": 0.4083, "rewards/chosen": 1.0544110025678362, "rewards/margins": 4.770817347935267, "rewards/rejected": -3.7164063453674316, "step": 13531 }, { "epoch": 0.7172501523865051, "grad_norm": 48.25, "kl": 2.2317628860473633, "learning_rate": 5e-07, "logits/chosen": -23752081.6, "logits/rejected": -14532762.666666666, "logps/chosen": -310.931396484375, "logps/rejected": -91.05431111653645, "loss": 0.2926, "rewards/chosen": 0.5851043701171875, "rewards/margins": 3.4698312123616537, "rewards/rejected": -2.8847268422444663, "step": 13532 }, { "epoch": 0.7173031563883073, "grad_norm": 25.125, "kl": 3.01513671875, "learning_rate": 5e-07, "logits/chosen": -13607392.0, "logits/rejected": -23211590.4, "logps/chosen": -810.017333984375, "logps/rejected": -614.775390625, "loss": 0.2046, "rewards/chosen": 1.007381836573283, "rewards/margins": 4.500046173731486, "rewards/rejected": -3.492664337158203, "step": 13533 }, { "epoch": 0.7173561603901094, "grad_norm": 31.625, "kl": 2.457448959350586, "learning_rate": 5e-07, "logits/chosen": 2030336.25, "logits/rejected": -21053580.0, "logps/chosen": -248.05201721191406, "logps/rejected": -264.64707438151044, "loss": 0.1531, "rewards/chosen": 1.2314401865005493, "rewards/margins": 3.566649079322815, "rewards/rejected": -2.3352088928222656, "step": 13534 }, { "epoch": 0.7174091643919116, "grad_norm": 58.25, "kl": 2.047271728515625, "learning_rate": 5e-07, "logits/chosen": -49461046.4, "logits/rejected": -16160182.666666666, "logps/chosen": -389.5348876953125, "logps/rejected": -311.8477783203125, "loss": 0.4087, "rewards/chosen": 0.13321166038513182, "rewards/margins": 1.7272161324818929, "rewards/rejected": -1.594004472096761, "step": 13535 }, { "epoch": 0.7174621683937137, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91822784.0, "logits/rejected": 19574.625, "logps/chosen": -380.4833984375, "logps/rejected": -169.22634887695312, "loss": 0.3445, "rewards/chosen": -0.10559645295143127, "rewards/margins": 2.248034805059433, "rewards/rejected": -2.3536312580108643, "step": 13536 }, { "epoch": 0.7175151723955159, "grad_norm": 49.75, "kl": 2.4637603759765625, "learning_rate": 5e-07, "logits/chosen": -39457468.8, "logits/rejected": -48852373.333333336, "logps/chosen": -342.0154541015625, "logps/rejected": -308.7211507161458, "loss": 0.3175, "rewards/chosen": 0.4078849792480469, "rewards/margins": 2.932390276590983, "rewards/rejected": -2.524505297342936, "step": 13537 }, { "epoch": 0.717568176397318, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3829417.3333333335, "logits/rejected": -32420646.4, "logps/chosen": -125.3978983561198, "logps/rejected": -567.80908203125, "loss": 0.2, "rewards/chosen": 0.48332715034484863, "rewards/margins": 3.635391283035278, "rewards/rejected": -3.1520641326904295, "step": 13538 }, { "epoch": 0.7176211803991202, "grad_norm": 105.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36222580.0, "logits/rejected": -14085728.0, "logps/chosen": -407.647216796875, "logps/rejected": -234.6851806640625, "loss": 0.2715, "rewards/chosen": 0.5192108154296875, "rewards/margins": 2.5490617752075195, "rewards/rejected": -2.029850959777832, "step": 13539 }, { "epoch": 0.7176741844009222, "grad_norm": 38.0, "kl": 2.009553909301758, "learning_rate": 5e-07, "logits/chosen": -6418771.6, "logits/rejected": -90359349.33333333, "logps/chosen": -100.450390625, "logps/rejected": -520.2787272135416, "loss": 0.2587, "rewards/chosen": 0.6451942920684814, "rewards/margins": 3.4636127948760986, "rewards/rejected": -2.818418502807617, "step": 13540 }, { "epoch": 0.7177271884027244, "grad_norm": 58.0, "kl": 5.257072448730469, "learning_rate": 5e-07, "logits/chosen": -22603513.14285714, "logits/rejected": -17635656.0, "logps/chosen": -340.154541015625, "logps/rejected": -83.03707885742188, "loss": 0.5149, "rewards/chosen": 0.5190063885280064, "rewards/margins": 2.1003487280436923, "rewards/rejected": -1.581342339515686, "step": 13541 }, { "epoch": 0.7177801924045265, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4634984.0, "logits/rejected": -11885694.666666666, "logps/chosen": -328.58538818359375, "logps/rejected": -293.333984375, "loss": 0.1539, "rewards/chosen": 0.6452213525772095, "rewards/margins": 3.6803661584854126, "rewards/rejected": -3.035144805908203, "step": 13542 }, { "epoch": 0.7178331964063287, "grad_norm": 37.75, "kl": 1.2044601440429688, "learning_rate": 5e-07, "logits/chosen": -15838195.2, "logits/rejected": -26850605.333333332, "logps/chosen": -317.1209228515625, "logps/rejected": -411.1551920572917, "loss": 0.2589, "rewards/chosen": 1.1987789154052735, "rewards/margins": 3.024384784698486, "rewards/rejected": -1.825605869293213, "step": 13543 }, { "epoch": 0.7178862004081308, "grad_norm": 47.25, "kl": 0.3578376770019531, "learning_rate": 5e-07, "logits/chosen": 9920669.333333334, "logits/rejected": -47954131.2, "logps/chosen": -319.4634195963542, "logps/rejected": -292.6759033203125, "loss": 0.1789, "rewards/chosen": 0.6495102246602377, "rewards/margins": 4.355877717336019, "rewards/rejected": -3.7063674926757812, "step": 13544 }, { "epoch": 0.717939204409933, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7282694.666666667, "logits/rejected": -38921936.0, "logps/chosen": -141.7276814778646, "logps/rejected": -324.4173095703125, "loss": 0.2758, "rewards/chosen": -0.272688627243042, "rewards/margins": 2.16146035194397, "rewards/rejected": -2.434148979187012, "step": 13545 }, { "epoch": 0.7179922084117351, "grad_norm": 57.75, "kl": 0.8158769607543945, "learning_rate": 5e-07, "logits/chosen": -33566104.0, "logits/rejected": -16297794.0, "logps/chosen": -660.718994140625, "logps/rejected": -260.4697570800781, "loss": 0.2499, "rewards/chosen": 1.0510157346725464, "rewards/margins": 2.9243918657302856, "rewards/rejected": -1.8733761310577393, "step": 13546 }, { "epoch": 0.7180452124135372, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46401616.0, "logits/rejected": -49817860.0, "logps/chosen": -342.52532958984375, "logps/rejected": -361.1167297363281, "loss": 0.3127, "rewards/chosen": 0.19831734895706177, "rewards/margins": 1.984143078327179, "rewards/rejected": -1.7858257293701172, "step": 13547 }, { "epoch": 0.7180982164153393, "grad_norm": 79.0, "kl": 5.331172943115234, "learning_rate": 5e-07, "logits/chosen": -43463250.666666664, "logits/rejected": -12393947.0, "logps/chosen": -437.6966959635417, "logps/rejected": -201.0804901123047, "loss": 0.3532, "rewards/chosen": 0.9306907653808594, "rewards/margins": 3.8045549392700195, "rewards/rejected": -2.87386417388916, "step": 13548 }, { "epoch": 0.7181512204171415, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20095766.4, "logits/rejected": -32878906.666666668, "logps/chosen": -217.71435546875, "logps/rejected": -181.50944010416666, "loss": 0.3774, "rewards/chosen": -0.1562873125076294, "rewards/margins": 2.3069334427515664, "rewards/rejected": -2.463220755259196, "step": 13549 }, { "epoch": 0.7182042244189436, "grad_norm": 29.375, "kl": 0.9477272033691406, "learning_rate": 5e-07, "logits/chosen": -21602834.666666668, "logits/rejected": -25045664.0, "logps/chosen": -172.73234049479166, "logps/rejected": -586.644873046875, "loss": 0.2294, "rewards/chosen": 0.4253389835357666, "rewards/margins": 3.559758996963501, "rewards/rejected": -3.1344200134277345, "step": 13550 }, { "epoch": 0.7182572284207458, "grad_norm": 49.5, "kl": 0.20340347290039062, "learning_rate": 5e-07, "logits/chosen": -70248970.66666667, "logits/rejected": -23001270.4, "logps/chosen": -677.2867431640625, "logps/rejected": -275.56240234375, "loss": 0.1762, "rewards/chosen": 0.8825499216715494, "rewards/margins": 3.394990603129069, "rewards/rejected": -2.5124406814575195, "step": 13551 }, { "epoch": 0.7183102324225479, "grad_norm": 52.5, "kl": 4.681924819946289, "learning_rate": 5e-07, "logits/chosen": -37376405.333333336, "logits/rejected": -19499134.0, "logps/chosen": -249.38883463541666, "logps/rejected": -415.13958740234375, "loss": 0.3837, "rewards/chosen": 0.9814175764719645, "rewards/margins": 2.0176793734232583, "rewards/rejected": -1.036261796951294, "step": 13552 }, { "epoch": 0.7183632364243501, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59319173.333333336, "logits/rejected": -1240414.6, "logps/chosen": -310.338134765625, "logps/rejected": -206.4380126953125, "loss": 0.2454, "rewards/chosen": 1.0820236206054688, "rewards/margins": 2.3824817657470705, "rewards/rejected": -1.3004581451416015, "step": 13553 }, { "epoch": 0.7184162404261522, "grad_norm": 51.25, "kl": 2.907419204711914, "learning_rate": 5e-07, "logits/chosen": -34274329.6, "logits/rejected": -41556666.666666664, "logps/chosen": -308.7697998046875, "logps/rejected": -379.3500569661458, "loss": 0.2159, "rewards/chosen": 1.186401653289795, "rewards/margins": 4.657696374257406, "rewards/rejected": -3.471294720967611, "step": 13554 }, { "epoch": 0.7184692444279543, "grad_norm": 60.25, "kl": 4.443839073181152, "learning_rate": 5e-07, "logits/chosen": -34487664.0, "logits/rejected": -27388166.0, "logps/chosen": -309.387939453125, "logps/rejected": -182.01641845703125, "loss": 0.331, "rewards/chosen": 0.9150168100992838, "rewards/margins": 1.9838812748591104, "rewards/rejected": -1.0688644647598267, "step": 13555 }, { "epoch": 0.7185222484297564, "grad_norm": 38.5, "kl": 3.925691604614258, "learning_rate": 5e-07, "logits/chosen": -20512786.0, "logits/rejected": -64273584.0, "logps/chosen": -246.40269470214844, "logps/rejected": -427.49560546875, "loss": 0.2092, "rewards/chosen": 1.4399051666259766, "rewards/margins": 3.5135467052459717, "rewards/rejected": -2.073641538619995, "step": 13556 }, { "epoch": 0.7185752524315586, "grad_norm": 56.75, "kl": 3.7066192626953125, "learning_rate": 5e-07, "logits/chosen": -35404573.333333336, "logits/rejected": -70528864.0, "logps/chosen": -247.67972819010416, "logps/rejected": -491.1930236816406, "loss": 0.3287, "rewards/chosen": 1.0413991610209148, "rewards/margins": 3.0147420565287275, "rewards/rejected": -1.9733428955078125, "step": 13557 }, { "epoch": 0.7186282564333607, "grad_norm": 49.0, "kl": 6.620780944824219, "learning_rate": 5e-07, "logits/chosen": -10033889.333333334, "logits/rejected": -25439004.0, "logps/chosen": -126.38558959960938, "logps/rejected": -256.01983642578125, "loss": 0.4972, "rewards/chosen": 0.1628477374712626, "rewards/margins": 1.237515429655711, "rewards/rejected": -1.0746676921844482, "step": 13558 }, { "epoch": 0.7186812604351629, "grad_norm": 45.0, "kl": 1.2681655883789062, "learning_rate": 5e-07, "logits/chosen": 10475140.0, "logits/rejected": -1469456.0, "logps/chosen": -488.47003173828125, "logps/rejected": -283.5172119140625, "loss": 0.172, "rewards/chosen": 3.303741455078125, "rewards/margins": 5.297791889735631, "rewards/rejected": -1.9940504346575056, "step": 13559 }, { "epoch": 0.718734264436965, "grad_norm": 55.75, "kl": 1.3375492095947266, "learning_rate": 5e-07, "logits/chosen": -20905141.333333332, "logits/rejected": -18290168.0, "logps/chosen": -242.00386555989584, "logps/rejected": -316.0879821777344, "loss": 0.3086, "rewards/chosen": 0.5730610688527426, "rewards/margins": 3.8424235184987388, "rewards/rejected": -3.269362449645996, "step": 13560 }, { "epoch": 0.7187872684387672, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -108594848.0, "logits/rejected": -32546468.0, "logps/chosen": -277.8034362792969, "logps/rejected": -460.5562438964844, "loss": 0.3133, "rewards/chosen": 0.34592172503471375, "rewards/margins": 2.0170209109783173, "rewards/rejected": -1.6710991859436035, "step": 13561 }, { "epoch": 0.7188402724405693, "grad_norm": 64.0, "kl": 0.4142723083496094, "learning_rate": 5e-07, "logits/chosen": -92619808.0, "logits/rejected": -23436717.333333332, "logps/chosen": -519.10234375, "logps/rejected": -355.1064453125, "loss": 0.3257, "rewards/chosen": 0.2436511516571045, "rewards/margins": 2.376107613245646, "rewards/rejected": -2.1324564615885415, "step": 13562 }, { "epoch": 0.7188932764423714, "grad_norm": 45.75, "kl": 0.5651912689208984, "learning_rate": 5e-07, "logits/chosen": -53656469.333333336, "logits/rejected": -14449142.4, "logps/chosen": -226.90901692708334, "logps/rejected": -221.9289794921875, "loss": 0.2901, "rewards/chosen": 0.3165765206019084, "rewards/margins": 1.7256811539332073, "rewards/rejected": -1.4091046333312989, "step": 13563 }, { "epoch": 0.7189462804441735, "grad_norm": 57.0, "kl": 1.2533502578735352, "learning_rate": 5e-07, "logits/chosen": -31048489.6, "logits/rejected": -19070368.0, "logps/chosen": -270.765625, "logps/rejected": -363.8170166015625, "loss": 0.2993, "rewards/chosen": 0.3410785675048828, "rewards/margins": 4.050617027282715, "rewards/rejected": -3.709538459777832, "step": 13564 }, { "epoch": 0.7189992844459757, "grad_norm": 41.75, "kl": 0.6245536804199219, "learning_rate": 5e-07, "logits/chosen": -555840.2, "logits/rejected": 147830336.0, "logps/chosen": -180.93568115234376, "logps/rejected": -215.08746337890625, "loss": 0.2874, "rewards/chosen": 0.761754035949707, "rewards/margins": 2.7122472127278643, "rewards/rejected": -1.9504931767781575, "step": 13565 }, { "epoch": 0.7190522884477778, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52763600.0, "logits/rejected": -192589.84375, "logps/chosen": -331.5548400878906, "logps/rejected": -72.92005157470703, "loss": 0.3633, "rewards/chosen": 0.37101230025291443, "rewards/margins": 1.5388512313365936, "rewards/rejected": -1.1678389310836792, "step": 13566 }, { "epoch": 0.71910529244958, "grad_norm": 59.5, "kl": 0.34108734130859375, "learning_rate": 5e-07, "logits/chosen": -57400176.0, "logits/rejected": -44987896.0, "logps/chosen": -349.2950032552083, "logps/rejected": -387.6719055175781, "loss": 0.2979, "rewards/chosen": 0.7239125569661459, "rewards/margins": 2.130177100499471, "rewards/rejected": -1.4062645435333252, "step": 13567 }, { "epoch": 0.7191582964513821, "grad_norm": 62.5, "kl": 5.794218063354492, "learning_rate": 5e-07, "logits/chosen": -18073120.0, "logits/rejected": 1629026.3333333333, "logps/chosen": -330.65712890625, "logps/rejected": -169.07591756184897, "loss": 0.3036, "rewards/chosen": 1.8547924041748047, "rewards/margins": 2.7041906992594402, "rewards/rejected": -0.8493982950846354, "step": 13568 }, { "epoch": 0.7192113004531843, "grad_norm": 39.25, "kl": 0.3109569549560547, "learning_rate": 5e-07, "logits/chosen": -4925320.5, "logits/rejected": -26954280.0, "logps/chosen": -150.05990600585938, "logps/rejected": -601.9671630859375, "loss": 0.2845, "rewards/chosen": 0.0677870661020279, "rewards/margins": 4.499887362122536, "rewards/rejected": -4.432100296020508, "step": 13569 }, { "epoch": 0.7192643044549863, "grad_norm": 48.5, "kl": 0.8000564575195312, "learning_rate": 5e-07, "logits/chosen": -22540844.8, "logits/rejected": -27030677.333333332, "logps/chosen": -242.4145751953125, "logps/rejected": -437.0677083333333, "loss": 0.3285, "rewards/chosen": 0.16138341426849365, "rewards/margins": 3.2427006800969442, "rewards/rejected": -3.0813172658284507, "step": 13570 }, { "epoch": 0.7193173084567885, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3457867.25, "logits/rejected": 9347515.42857143, "logps/chosen": -348.5670166015625, "logps/rejected": -195.05936104910714, "loss": 0.2144, "rewards/chosen": -0.169708251953125, "rewards/margins": 1.641578129359654, "rewards/rejected": -1.811286381312779, "step": 13571 }, { "epoch": 0.7193703124585906, "grad_norm": 59.75, "kl": 0.6991195678710938, "learning_rate": 5e-07, "logits/chosen": -56454660.0, "logits/rejected": 17530800.0, "logps/chosen": -264.8974609375, "logps/rejected": -247.1317138671875, "loss": 0.3, "rewards/chosen": 0.05512465536594391, "rewards/margins": 1.6196140199899673, "rewards/rejected": -1.5644893646240234, "step": 13572 }, { "epoch": 0.7194233164603928, "grad_norm": 43.5, "kl": 4.800537109375, "learning_rate": 5e-07, "logits/chosen": -26600181.333333332, "logits/rejected": -23601172.0, "logps/chosen": -229.6324462890625, "logps/rejected": -546.7327270507812, "loss": 0.2839, "rewards/chosen": 1.1092225710550945, "rewards/margins": 5.2793504397074384, "rewards/rejected": -4.170127868652344, "step": 13573 }, { "epoch": 0.7194763204621949, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44058144.0, "logits/rejected": -10623329.333333334, "logps/chosen": -706.7452392578125, "logps/rejected": -269.58917236328125, "loss": 0.2063, "rewards/chosen": 1.6453309059143066, "rewards/margins": 3.1672682762145996, "rewards/rejected": -1.521937370300293, "step": 13574 }, { "epoch": 0.719529324463997, "grad_norm": 39.25, "kl": 3.315340042114258, "learning_rate": 5e-07, "logits/chosen": -31376397.333333332, "logits/rejected": -57473785.6, "logps/chosen": -225.4501953125, "logps/rejected": -338.07412109375, "loss": 0.2076, "rewards/chosen": 0.6508140563964844, "rewards/margins": 4.225166702270508, "rewards/rejected": -3.5743526458740233, "step": 13575 }, { "epoch": 0.7195823284657992, "grad_norm": 41.5, "kl": 7.275844573974609, "learning_rate": 5e-07, "logits/chosen": -6562524.0, "logits/rejected": 5396938.333333333, "logps/chosen": -358.04150390625, "logps/rejected": -200.91727701822916, "loss": 0.3884, "rewards/chosen": 0.8860137939453125, "rewards/margins": 2.6368509610493978, "rewards/rejected": -1.7508371671040852, "step": 13576 }, { "epoch": 0.7196353324676013, "grad_norm": 62.75, "kl": 0.5164670944213867, "learning_rate": 5e-07, "logits/chosen": 2292446.0, "logits/rejected": -11279952.0, "logps/chosen": -27.85540771484375, "logps/rejected": -262.0358479817708, "loss": 0.2398, "rewards/chosen": 0.20033103227615356, "rewards/margins": 1.9176285862922668, "rewards/rejected": -1.7172975540161133, "step": 13577 }, { "epoch": 0.7196883364694034, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39142176.0, "logits/rejected": -113243.33333333333, "logps/chosen": -282.9390625, "logps/rejected": -409.4219156901042, "loss": 0.3281, "rewards/chosen": 0.4958003044128418, "rewards/margins": 1.9869136492411297, "rewards/rejected": -1.4911133448282878, "step": 13578 }, { "epoch": 0.7197413404712055, "grad_norm": 48.5, "kl": 1.8456916809082031, "learning_rate": 5e-07, "logits/chosen": -52954566.4, "logits/rejected": -22732906.666666668, "logps/chosen": -293.009130859375, "logps/rejected": -382.215087890625, "loss": 0.3001, "rewards/chosen": 0.7154834270477295, "rewards/margins": 2.220334768295288, "rewards/rejected": -1.5048513412475586, "step": 13579 }, { "epoch": 0.7197943444730077, "grad_norm": 39.0, "kl": 3.299734115600586, "learning_rate": 5e-07, "logits/chosen": -29590848.0, "logits/rejected": -26452144.0, "logps/chosen": -714.92392578125, "logps/rejected": -314.60642496744794, "loss": 0.1745, "rewards/chosen": 1.716717529296875, "rewards/margins": 6.393322118123372, "rewards/rejected": -4.676604588826497, "step": 13580 }, { "epoch": 0.7198473484748098, "grad_norm": 80.0, "kl": 0.6436481475830078, "learning_rate": 5e-07, "logits/chosen": 15392152.0, "logits/rejected": -36336080.0, "logps/chosen": -404.91046142578125, "logps/rejected": -294.0469665527344, "loss": 0.2578, "rewards/chosen": 0.4734794795513153, "rewards/margins": 2.932399958372116, "rewards/rejected": -2.458920478820801, "step": 13581 }, { "epoch": 0.719900352476612, "grad_norm": 59.25, "kl": 1.9271621704101562, "learning_rate": 5e-07, "logits/chosen": -15298822.4, "logits/rejected": -27619613.333333332, "logps/chosen": -180.161962890625, "logps/rejected": -321.4062093098958, "loss": 0.4622, "rewards/chosen": -0.0865465521812439, "rewards/margins": 1.6659156759579976, "rewards/rejected": -1.7524622281392415, "step": 13582 }, { "epoch": 0.7199533564784141, "grad_norm": 22.0, "kl": 1.1340303421020508, "learning_rate": 5e-07, "logits/chosen": 6384628.0, "logits/rejected": -52352086.85714286, "logps/chosen": -4.262350082397461, "logps/rejected": -320.8068150111607, "loss": 0.1251, "rewards/chosen": 0.3433164656162262, "rewards/margins": 3.313091801745551, "rewards/rejected": -2.9697753361293246, "step": 13583 }, { "epoch": 0.7200063604802163, "grad_norm": 50.0, "kl": 2.3576860427856445, "learning_rate": 5e-07, "logits/chosen": -44518025.6, "logits/rejected": -10104846.0, "logps/chosen": -314.15166015625, "logps/rejected": -211.5484822591146, "loss": 0.3392, "rewards/chosen": 0.17590656280517578, "rewards/margins": 3.5844660441080727, "rewards/rejected": -3.408559481302897, "step": 13584 }, { "epoch": 0.7200593644820183, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30763790.0, "logits/rejected": -45116096.0, "logps/chosen": -157.57437133789062, "logps/rejected": -459.723876953125, "loss": 0.1578, "rewards/chosen": 0.5294368863105774, "rewards/margins": 3.2714374661445618, "rewards/rejected": -2.7420005798339844, "step": 13585 }, { "epoch": 0.7201123684838205, "grad_norm": 54.0, "kl": 0.701690673828125, "learning_rate": 5e-07, "logits/chosen": -33483884.8, "logits/rejected": -12120321.333333334, "logps/chosen": -261.866943359375, "logps/rejected": -405.4424641927083, "loss": 0.2456, "rewards/chosen": 0.951579475402832, "rewards/margins": 3.525274340311686, "rewards/rejected": -2.573694864908854, "step": 13586 }, { "epoch": 0.7201653724856226, "grad_norm": 42.5, "kl": 1.126882553100586, "learning_rate": 5e-07, "logits/chosen": -33767555.2, "logits/rejected": -36527290.666666664, "logps/chosen": -194.7827392578125, "logps/rejected": -481.1557210286458, "loss": 0.3276, "rewards/chosen": 0.11508811712265014, "rewards/margins": 3.102696708838145, "rewards/rejected": -2.9876085917154946, "step": 13587 }, { "epoch": 0.7202183764874248, "grad_norm": 47.75, "kl": 1.9313240051269531, "learning_rate": 5e-07, "logits/chosen": -19477764.57142857, "logits/rejected": -59128132.0, "logps/chosen": -546.4851422991071, "logps/rejected": -182.86856079101562, "loss": 0.3248, "rewards/chosen": 1.0640442030770438, "rewards/margins": 2.696118201528277, "rewards/rejected": -1.632073998451233, "step": 13588 }, { "epoch": 0.7202713804892269, "grad_norm": 37.0, "kl": 0.3768310546875, "learning_rate": 5e-07, "logits/chosen": -63258698.666666664, "logits/rejected": -8818486.4, "logps/chosen": -334.44561767578125, "logps/rejected": -197.973388671875, "loss": 0.2061, "rewards/chosen": 0.19182840983072916, "rewards/margins": 3.4063170115152994, "rewards/rejected": -3.2144886016845704, "step": 13589 }, { "epoch": 0.7203243844910291, "grad_norm": 54.25, "kl": 2.5982398986816406, "learning_rate": 5e-07, "logits/chosen": -643520.4, "logits/rejected": -131594773.33333333, "logps/chosen": -268.3572998046875, "logps/rejected": -304.3410237630208, "loss": 0.3303, "rewards/chosen": 0.44562592506408694, "rewards/margins": 2.0841472784678143, "rewards/rejected": -1.6385213534037273, "step": 13590 }, { "epoch": 0.7203773884928312, "grad_norm": 40.75, "kl": 2.6727447509765625, "learning_rate": 5e-07, "logits/chosen": -20331422.4, "logits/rejected": -6927036.666666667, "logps/chosen": -190.7932861328125, "logps/rejected": -97.73026529947917, "loss": 0.3492, "rewards/chosen": 0.7920930862426758, "rewards/margins": 2.28493226369222, "rewards/rejected": -1.4928391774495442, "step": 13591 }, { "epoch": 0.7204303924946334, "grad_norm": 52.5, "kl": 1.4942269325256348, "learning_rate": 5e-07, "logits/chosen": -27035014.4, "logits/rejected": -8914597.333333334, "logps/chosen": -249.145068359375, "logps/rejected": -92.57217407226562, "loss": 0.4084, "rewards/chosen": 0.2973365306854248, "rewards/margins": 1.2778998533884685, "rewards/rejected": -0.9805633227030436, "step": 13592 }, { "epoch": 0.7204833964964354, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38594428.0, "logits/rejected": 3833558.0, "logps/chosen": -414.64263916015625, "logps/rejected": -298.40981038411456, "loss": 0.1965, "rewards/chosen": 0.02672654390335083, "rewards/margins": 2.8341450492540994, "rewards/rejected": -2.8074185053507485, "step": 13593 }, { "epoch": 0.7205364004982376, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23995064.0, "logits/rejected": -5604491.5, "logps/chosen": -210.57826232910156, "logps/rejected": -141.292236328125, "loss": 0.21, "rewards/chosen": 1.0323594808578491, "rewards/margins": 3.1771310567855835, "rewards/rejected": -2.1447715759277344, "step": 13594 }, { "epoch": 0.7205894045000397, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1648759.6666666667, "logits/rejected": -28314601.6, "logps/chosen": -67.35726420084636, "logps/rejected": -340.426513671875, "loss": 0.3386, "rewards/chosen": 0.23276889324188232, "rewards/margins": 1.2552963972091675, "rewards/rejected": -1.0225275039672852, "step": 13595 }, { "epoch": 0.7206424085018419, "grad_norm": 51.5, "kl": 0.37412452697753906, "learning_rate": 5e-07, "logits/chosen": -54443525.333333336, "logits/rejected": -14279900.8, "logps/chosen": -213.1754353841146, "logps/rejected": -242.444287109375, "loss": 0.1844, "rewards/chosen": 1.0413773854573567, "rewards/margins": 3.6610768636067705, "rewards/rejected": -2.619699478149414, "step": 13596 }, { "epoch": 0.720695412503644, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24425492.0, "logits/rejected": -5878639.0, "logps/chosen": -323.1997375488281, "logps/rejected": -326.28106689453125, "loss": 0.3125, "rewards/chosen": 0.1473417431116104, "rewards/margins": 2.6219391971826553, "rewards/rejected": -2.474597454071045, "step": 13597 }, { "epoch": 0.7207484165054462, "grad_norm": 52.25, "kl": 2.071533203125, "learning_rate": 5e-07, "logits/chosen": -32828915.2, "logits/rejected": -26325301.333333332, "logps/chosen": -485.667578125, "logps/rejected": -258.0734456380208, "loss": 0.3127, "rewards/chosen": 0.847943115234375, "rewards/margins": 1.8366953214009603, "rewards/rejected": -0.9887522061665853, "step": 13598 }, { "epoch": 0.7208014205072483, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62992396.0, "logits/rejected": -19917852.0, "logps/chosen": -345.53350830078125, "logps/rejected": -216.1302490234375, "loss": 0.1993, "rewards/chosen": 0.423544317483902, "rewards/margins": 2.9011568129062653, "rewards/rejected": -2.4776124954223633, "step": 13599 }, { "epoch": 0.7208544245090505, "grad_norm": 52.25, "kl": 2.4436416625976562, "learning_rate": 5e-07, "logits/chosen": -47038624.0, "logits/rejected": -108083040.0, "logps/chosen": -369.9278971354167, "logps/rejected": -370.1497802734375, "loss": 0.3344, "rewards/chosen": 0.6806852022806803, "rewards/margins": 4.378317991892497, "rewards/rejected": -3.6976327896118164, "step": 13600 }, { "epoch": 0.7209074285108525, "grad_norm": 64.0, "kl": 1.3840293884277344, "learning_rate": 5e-07, "logits/chosen": 87177296.0, "logits/rejected": -4407261.6, "logps/chosen": -496.7666015625, "logps/rejected": -148.7022705078125, "loss": 0.3031, "rewards/chosen": 0.5348714192708334, "rewards/margins": 1.7513532956441242, "rewards/rejected": -1.216481876373291, "step": 13601 }, { "epoch": 0.7209604325126547, "grad_norm": 44.75, "kl": 2.729612350463867, "learning_rate": 5e-07, "logits/chosen": -168794.0, "logits/rejected": 22978948.0, "logps/chosen": -67.76641845703125, "logps/rejected": -133.46937561035156, "loss": 0.3529, "rewards/chosen": 0.44505593180656433, "rewards/margins": 1.8351885974407196, "rewards/rejected": -1.3901326656341553, "step": 13602 }, { "epoch": 0.7210134365144568, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43305234.666666664, "logits/rejected": -33980288.0, "logps/chosen": -852.0445963541666, "logps/rejected": -245.7072265625, "loss": 0.2272, "rewards/chosen": 0.8061645825703939, "rewards/margins": 3.897595818837484, "rewards/rejected": -3.09143123626709, "step": 13603 }, { "epoch": 0.721066440516259, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30698124.0, "logits/rejected": 18425028.0, "logps/chosen": -383.3897399902344, "logps/rejected": -347.835693359375, "loss": 0.2668, "rewards/chosen": 0.5416756272315979, "rewards/margins": 2.5817518830299377, "rewards/rejected": -2.04007625579834, "step": 13604 }, { "epoch": 0.7211194445180611, "grad_norm": 61.75, "kl": 2.5853633880615234, "learning_rate": 5e-07, "logits/chosen": -53895749.333333336, "logits/rejected": -22625779.2, "logps/chosen": -510.4890950520833, "logps/rejected": -115.31224365234375, "loss": 0.2797, "rewards/chosen": 1.015845775604248, "rewards/margins": 1.8528817176818848, "rewards/rejected": -0.8370359420776368, "step": 13605 }, { "epoch": 0.7211724485198633, "grad_norm": 40.25, "kl": 1.2984848022460938, "learning_rate": 5e-07, "logits/chosen": -50933592.0, "logits/rejected": -14935013.0, "logps/chosen": -356.9118347167969, "logps/rejected": -199.13970947265625, "loss": 0.1759, "rewards/chosen": 0.992701530456543, "rewards/margins": 4.362827777862549, "rewards/rejected": -3.370126247406006, "step": 13606 }, { "epoch": 0.7212254525216654, "grad_norm": 57.25, "kl": 0.3419342041015625, "learning_rate": 5e-07, "logits/chosen": -16950450.666666668, "logits/rejected": -11081649.0, "logps/chosen": -352.7559407552083, "logps/rejected": -196.59918212890625, "loss": 0.3386, "rewards/chosen": 0.48674901326497394, "rewards/margins": 2.1781285603841147, "rewards/rejected": -1.6913795471191406, "step": 13607 }, { "epoch": 0.7212784565234676, "grad_norm": 35.25, "kl": 1.4821586608886719, "learning_rate": 5e-07, "logits/chosen": -19611946.0, "logits/rejected": -20616636.0, "logps/chosen": -349.79571533203125, "logps/rejected": -199.04238891601562, "loss": 0.1969, "rewards/chosen": 1.3283826112747192, "rewards/margins": 3.050523281097412, "rewards/rejected": -1.7221406698226929, "step": 13608 }, { "epoch": 0.7213314605252696, "grad_norm": 71.0, "kl": 2.038848876953125, "learning_rate": 5e-07, "logits/chosen": -37082925.71428572, "logits/rejected": -82998944.0, "logps/chosen": -215.90475027901786, "logps/rejected": -722.9668579101562, "loss": 0.4291, "rewards/chosen": 0.27629566192626953, "rewards/margins": 5.079847812652588, "rewards/rejected": -4.803552150726318, "step": 13609 }, { "epoch": 0.7213844645270718, "grad_norm": 30.75, "kl": 4.268882751464844, "learning_rate": 5e-07, "logits/chosen": -36362648.0, "logits/rejected": -31561628.0, "logps/chosen": -196.39913940429688, "logps/rejected": -409.5794982910156, "loss": 0.241, "rewards/chosen": 0.9394649267196655, "rewards/margins": 3.3008493185043335, "rewards/rejected": -2.361384391784668, "step": 13610 }, { "epoch": 0.7214374685288739, "grad_norm": 50.75, "kl": 3.3259010314941406, "learning_rate": 5e-07, "logits/chosen": -20921913.6, "logits/rejected": -43462797.333333336, "logps/chosen": -455.473828125, "logps/rejected": -291.60475667317706, "loss": 0.3338, "rewards/chosen": 0.5904276371002197, "rewards/margins": 3.092794974644979, "rewards/rejected": -2.5023673375447593, "step": 13611 }, { "epoch": 0.7214904725306761, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31746086.0, "logits/rejected": -12738553.333333334, "logps/chosen": -94.19379425048828, "logps/rejected": -266.4545084635417, "loss": 0.1612, "rewards/chosen": 0.3369125425815582, "rewards/margins": 3.165770282347997, "rewards/rejected": -2.828857739766439, "step": 13612 }, { "epoch": 0.7215434765324782, "grad_norm": 49.75, "kl": 1.032012939453125, "learning_rate": 5e-07, "logits/chosen": 5153781.333333333, "logits/rejected": 2117873.6, "logps/chosen": -181.0983683268229, "logps/rejected": -269.6641357421875, "loss": 0.2636, "rewards/chosen": 0.3292485475540161, "rewards/margins": 2.1854405641555785, "rewards/rejected": -1.8561920166015624, "step": 13613 }, { "epoch": 0.7215964805342804, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 30240834.0, "logits/rejected": -8195152.5, "logps/chosen": -346.633056640625, "logps/rejected": -301.0759582519531, "loss": 0.282, "rewards/chosen": 0.1936483085155487, "rewards/margins": 2.4953069388866425, "rewards/rejected": -2.3016586303710938, "step": 13614 }, { "epoch": 0.7216494845360825, "grad_norm": 44.5, "kl": 2.1031856536865234, "learning_rate": 5e-07, "logits/chosen": -20612356.0, "logits/rejected": -6742412.0, "logps/chosen": -165.49287923177084, "logps/rejected": -180.4393310546875, "loss": 0.4347, "rewards/chosen": 0.06295343240102132, "rewards/margins": 2.3945855696996055, "rewards/rejected": -2.331632137298584, "step": 13615 }, { "epoch": 0.7217024885378847, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68041050.66666667, "logits/rejected": -21000777.6, "logps/chosen": -323.4354248046875, "logps/rejected": -300.99931640625, "loss": 0.292, "rewards/chosen": -0.6096430619557699, "rewards/margins": 1.948410145441691, "rewards/rejected": -2.558053207397461, "step": 13616 }, { "epoch": 0.7217554925396867, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28375738.666666668, "logits/rejected": -6260153.6, "logps/chosen": -256.46950276692706, "logps/rejected": -249.2762451171875, "loss": 0.1927, "rewards/chosen": 0.4344179630279541, "rewards/margins": 3.66473183631897, "rewards/rejected": -3.2303138732910157, "step": 13617 }, { "epoch": 0.7218084965414889, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8548333.0, "logits/rejected": -18867818.0, "logps/chosen": -167.16415405273438, "logps/rejected": -313.2996826171875, "loss": 0.3117, "rewards/chosen": -0.08762226998806, "rewards/margins": 2.369827166199684, "rewards/rejected": -2.457449436187744, "step": 13618 }, { "epoch": 0.721861500543291, "grad_norm": 36.75, "kl": 3.6711788177490234, "learning_rate": 5e-07, "logits/chosen": -21669622.666666668, "logits/rejected": -7345981.0, "logps/chosen": -378.8855794270833, "logps/rejected": -116.69566345214844, "loss": 0.3058, "rewards/chosen": 1.4941730499267578, "rewards/margins": 3.6937203407287598, "rewards/rejected": -2.199547290802002, "step": 13619 }, { "epoch": 0.7219145045450932, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26830988.8, "logits/rejected": -7471031.333333333, "logps/chosen": -156.2780029296875, "logps/rejected": -147.83416748046875, "loss": 0.3706, "rewards/chosen": 0.2065936803817749, "rewards/margins": 1.6822023471196492, "rewards/rejected": -1.4756086667378743, "step": 13620 }, { "epoch": 0.7219675085468953, "grad_norm": 60.75, "kl": 0.7965307235717773, "learning_rate": 5e-07, "logits/chosen": 61090572.8, "logits/rejected": -25465917.333333332, "logps/chosen": -354.9935302734375, "logps/rejected": -543.7317708333334, "loss": 0.2914, "rewards/chosen": 0.6568546295166016, "rewards/margins": 3.876980463663737, "rewards/rejected": -3.2201258341471353, "step": 13621 }, { "epoch": 0.7220205125486975, "grad_norm": 50.5, "kl": 0.9564247131347656, "learning_rate": 5e-07, "logits/chosen": -47752376.0, "logits/rejected": -52441872.0, "logps/chosen": -252.85614013671875, "logps/rejected": -597.6084594726562, "loss": 0.2904, "rewards/chosen": 0.4018573760986328, "rewards/margins": 2.653998374938965, "rewards/rejected": -2.252140998840332, "step": 13622 }, { "epoch": 0.7220735165504996, "grad_norm": 46.0, "kl": 1.9487323760986328, "learning_rate": 5e-07, "logits/chosen": -12006699.2, "logits/rejected": -57444469.333333336, "logps/chosen": -241.2387939453125, "logps/rejected": -370.8248697916667, "loss": 0.3214, "rewards/chosen": 0.5897880077362061, "rewards/margins": 2.2866845607757567, "rewards/rejected": -1.6968965530395508, "step": 13623 }, { "epoch": 0.7221265205523018, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9683798.666666666, "logits/rejected": -45558540.8, "logps/chosen": -275.3779296875, "logps/rejected": -192.6066162109375, "loss": 0.2168, "rewards/chosen": 0.838655153910319, "rewards/margins": 3.1718105951944984, "rewards/rejected": -2.3331554412841795, "step": 13624 }, { "epoch": 0.7221795245541038, "grad_norm": 46.75, "kl": 1.291717529296875, "learning_rate": 5e-07, "logits/chosen": -70006387.2, "logits/rejected": -26285693.333333332, "logps/chosen": -424.52080078125, "logps/rejected": -110.25606282552083, "loss": 0.3226, "rewards/chosen": 0.3380597114562988, "rewards/margins": 2.302584934234619, "rewards/rejected": -1.9645252227783203, "step": 13625 }, { "epoch": 0.7222325285559059, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16955682.666666668, "logits/rejected": -15095068.8, "logps/chosen": -353.98486328125, "logps/rejected": -277.72490234375, "loss": 0.207, "rewards/chosen": 0.4055023193359375, "rewards/margins": 2.8082212448120116, "rewards/rejected": -2.402718925476074, "step": 13626 }, { "epoch": 0.7222855325577081, "grad_norm": 50.0, "kl": 1.9550762176513672, "learning_rate": 5e-07, "logits/chosen": -8041077.0, "logits/rejected": -38514272.0, "logps/chosen": -131.4620361328125, "logps/rejected": -271.1312561035156, "loss": 0.3547, "rewards/chosen": 0.4262336790561676, "rewards/margins": 1.6892708837985992, "rewards/rejected": -1.2630372047424316, "step": 13627 }, { "epoch": 0.7223385365595102, "grad_norm": 42.75, "kl": 0.6088047027587891, "learning_rate": 5e-07, "logits/chosen": -36790690.666666664, "logits/rejected": -19383996.8, "logps/chosen": -134.72630818684897, "logps/rejected": -164.17216796875, "loss": 0.2915, "rewards/chosen": 0.2995407780011495, "rewards/margins": 1.9071205814679464, "rewards/rejected": -1.607579803466797, "step": 13628 }, { "epoch": 0.7223915405613124, "grad_norm": 44.5, "kl": 1.7406501770019531, "learning_rate": 5e-07, "logits/chosen": -64752714.666666664, "logits/rejected": -7863528.0, "logps/chosen": -275.75807698567706, "logps/rejected": -201.54566650390626, "loss": 0.2804, "rewards/chosen": 0.3217923641204834, "rewards/margins": 1.9657124996185302, "rewards/rejected": -1.6439201354980468, "step": 13629 }, { "epoch": 0.7224445445631145, "grad_norm": 35.0, "kl": 1.0895109176635742, "learning_rate": 5e-07, "logits/chosen": 2890758.25, "logits/rejected": -2064268.1666666667, "logps/chosen": -142.7872314453125, "logps/rejected": -96.45337931315105, "loss": 0.2816, "rewards/chosen": -0.317025750875473, "rewards/margins": 1.6466526091098785, "rewards/rejected": -1.9636783599853516, "step": 13630 }, { "epoch": 0.7224975485649167, "grad_norm": 55.75, "kl": 4.934902191162109, "learning_rate": 5e-07, "logits/chosen": -30546674.666666668, "logits/rejected": 40119924.0, "logps/chosen": -463.8173014322917, "logps/rejected": -556.673095703125, "loss": 0.2912, "rewards/chosen": 1.2599779764811199, "rewards/margins": 4.92409626642863, "rewards/rejected": -3.6641182899475098, "step": 13631 }, { "epoch": 0.7225505525667187, "grad_norm": 39.5, "kl": 0.6267871856689453, "learning_rate": 5e-07, "logits/chosen": 2673771.25, "logits/rejected": -17457876.0, "logps/chosen": -174.00059509277344, "logps/rejected": -149.083984375, "loss": 0.2595, "rewards/chosen": 1.0724338293075562, "rewards/margins": 2.779841423034668, "rewards/rejected": -1.7074075937271118, "step": 13632 }, { "epoch": 0.7226035565685209, "grad_norm": 90.5, "kl": 9.24843978881836, "learning_rate": 5e-07, "logits/chosen": -43281562.666666664, "logits/rejected": -19227394.0, "logps/chosen": -393.5217692057292, "logps/rejected": -232.77818298339844, "loss": 0.3763, "rewards/chosen": 1.2919035752614338, "rewards/margins": 2.4060576756795244, "rewards/rejected": -1.1141541004180908, "step": 13633 }, { "epoch": 0.722656560570323, "grad_norm": 43.25, "kl": 2.7661571502685547, "learning_rate": 5e-07, "logits/chosen": -32011988.0, "logits/rejected": -16653908.0, "logps/chosen": -248.65188598632812, "logps/rejected": -411.91961669921875, "loss": 0.2748, "rewards/chosen": 0.5937498807907104, "rewards/margins": 2.548457980155945, "rewards/rejected": -1.9547080993652344, "step": 13634 }, { "epoch": 0.7227095645721252, "grad_norm": 46.5, "kl": 3.6478796005249023, "learning_rate": 5e-07, "logits/chosen": -5243192.0, "logits/rejected": -33163384.0, "logps/chosen": -663.9520263671875, "logps/rejected": -280.3938903808594, "loss": 0.2753, "rewards/chosen": 1.3437864780426025, "rewards/margins": 2.68555748462677, "rewards/rejected": -1.3417710065841675, "step": 13635 }, { "epoch": 0.7227625685739273, "grad_norm": 47.75, "kl": 3.31695556640625, "learning_rate": 5e-07, "logits/chosen": -32020499.2, "logits/rejected": -52844090.666666664, "logps/chosen": -235.612841796875, "logps/rejected": -219.853759765625, "loss": 0.2719, "rewards/chosen": 1.1280376434326171, "rewards/margins": 5.002742640177408, "rewards/rejected": -3.8747049967447915, "step": 13636 }, { "epoch": 0.7228155725757295, "grad_norm": 43.5, "kl": 0.2698326110839844, "learning_rate": 5e-07, "logits/chosen": -32286800.0, "logits/rejected": -14535801.6, "logps/chosen": -153.70479329427084, "logps/rejected": -208.217529296875, "loss": 0.218, "rewards/chosen": 0.5368900299072266, "rewards/margins": 2.986970138549805, "rewards/rejected": -2.4500801086425783, "step": 13637 }, { "epoch": 0.7228685765775316, "grad_norm": 33.0, "kl": 1.451009750366211, "learning_rate": 5e-07, "logits/chosen": -8662342.666666666, "logits/rejected": -13838780.8, "logps/chosen": -285.5035807291667, "logps/rejected": -165.7357666015625, "loss": 0.1696, "rewards/chosen": 0.9959115982055664, "rewards/margins": 3.6652950286865233, "rewards/rejected": -2.669383430480957, "step": 13638 }, { "epoch": 0.7229215805793338, "grad_norm": 41.0, "kl": 5.049773216247559, "learning_rate": 5e-07, "logits/chosen": 4918972.0, "logits/rejected": 106334325.33333333, "logps/chosen": -150.977392578125, "logps/rejected": -406.9540201822917, "loss": 0.2975, "rewards/chosen": 1.14553804397583, "rewards/margins": 3.3904972394307453, "rewards/rejected": -2.2449591954549155, "step": 13639 }, { "epoch": 0.7229745845811358, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54867476.0, "logits/rejected": -28974494.0, "logps/chosen": -462.0678405761719, "logps/rejected": -419.5128479003906, "loss": 0.2611, "rewards/chosen": 0.12311667203903198, "rewards/margins": 3.103160560131073, "rewards/rejected": -2.980043888092041, "step": 13640 }, { "epoch": 0.723027588582938, "grad_norm": 44.5, "kl": 5.856452941894531, "learning_rate": 5e-07, "logits/chosen": -36207088.0, "logits/rejected": -20293488.0, "logps/chosen": -346.1095703125, "logps/rejected": -286.6106770833333, "loss": 0.3923, "rewards/chosen": 0.5074667453765869, "rewards/margins": 3.2915660063425696, "rewards/rejected": -2.784099260965983, "step": 13641 }, { "epoch": 0.7230805925847401, "grad_norm": 54.75, "kl": 2.1655044555664062, "learning_rate": 5e-07, "logits/chosen": -12521021.714285715, "logits/rejected": -18692220.0, "logps/chosen": -332.97914341517856, "logps/rejected": -334.35333251953125, "loss": 0.3077, "rewards/chosen": 0.9919656344822475, "rewards/margins": 3.5061868258884976, "rewards/rejected": -2.51422119140625, "step": 13642 }, { "epoch": 0.7231335965865423, "grad_norm": 35.75, "kl": 1.7634124755859375, "learning_rate": 5e-07, "logits/chosen": 20192124.8, "logits/rejected": -14188321.333333334, "logps/chosen": -48.86998291015625, "logps/rejected": -220.78365071614584, "loss": 0.3385, "rewards/chosen": 0.3107896327972412, "rewards/margins": 3.383876371383667, "rewards/rejected": -3.073086738586426, "step": 13643 }, { "epoch": 0.7231866005883444, "grad_norm": 44.25, "kl": 0.8899383544921875, "learning_rate": 5e-07, "logits/chosen": 19679186.666666668, "logits/rejected": -26081177.6, "logps/chosen": -735.4075520833334, "logps/rejected": -316.188427734375, "loss": 0.1686, "rewards/chosen": 1.5201258659362793, "rewards/margins": 3.76098051071167, "rewards/rejected": -2.2408546447753905, "step": 13644 }, { "epoch": 0.7232396045901466, "grad_norm": 37.5, "kl": 0.6198720932006836, "learning_rate": 5e-07, "logits/chosen": 8407803.333333334, "logits/rejected": -96667744.0, "logps/chosen": -179.39326985677084, "logps/rejected": -469.8890075683594, "loss": 0.2346, "rewards/chosen": 1.0720941225687664, "rewards/margins": 3.5656947294871015, "rewards/rejected": -2.493600606918335, "step": 13645 }, { "epoch": 0.7232926085919487, "grad_norm": 28.5, "kl": 1.4517288208007812, "learning_rate": 5e-07, "logits/chosen": -18280376.0, "logits/rejected": -26146437.333333332, "logps/chosen": -545.517333984375, "logps/rejected": -192.15340169270834, "loss": 0.1468, "rewards/chosen": 2.4348111152648926, "rewards/margins": 4.997577667236328, "rewards/rejected": -2.5627665519714355, "step": 13646 }, { "epoch": 0.7233456125937509, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13453061.333333334, "logits/rejected": -18894809.6, "logps/chosen": -197.02298990885416, "logps/rejected": -217.7863525390625, "loss": 0.2706, "rewards/chosen": 0.6067555745442709, "rewards/margins": 2.2064088185628257, "rewards/rejected": -1.5996532440185547, "step": 13647 }, { "epoch": 0.7233986165955529, "grad_norm": 46.75, "kl": 0.35713958740234375, "learning_rate": 5e-07, "logits/chosen": -31401251.2, "logits/rejected": -50545589.333333336, "logps/chosen": -296.0189208984375, "logps/rejected": -356.6188151041667, "loss": 0.2552, "rewards/chosen": 0.6482080459594727, "rewards/margins": 3.9269810358683266, "rewards/rejected": -3.278772989908854, "step": 13648 }, { "epoch": 0.7234516205973551, "grad_norm": 53.0, "kl": 2.679594039916992, "learning_rate": 5e-07, "logits/chosen": -38852668.0, "logits/rejected": -19884277.333333332, "logps/chosen": -545.2255249023438, "logps/rejected": -299.1290690104167, "loss": 0.2037, "rewards/chosen": 0.8141286969184875, "rewards/margins": 3.3693979382514954, "rewards/rejected": -2.555269241333008, "step": 13649 }, { "epoch": 0.7235046245991572, "grad_norm": 71.5, "kl": 2.028411865234375, "learning_rate": 5e-07, "logits/chosen": -2231049.2, "logits/rejected": -10278103.333333334, "logps/chosen": -393.7413330078125, "logps/rejected": -433.3072102864583, "loss": 0.3234, "rewards/chosen": 0.5749678134918212, "rewards/margins": 3.397604099909464, "rewards/rejected": -2.822636286417643, "step": 13650 }, { "epoch": 0.7235576286009594, "grad_norm": 40.0, "kl": 0.8653488159179688, "learning_rate": 5e-07, "logits/chosen": -18506082.0, "logits/rejected": -51864368.0, "logps/chosen": -235.57122802734375, "logps/rejected": -549.1737060546875, "loss": 0.2985, "rewards/chosen": 0.2667575478553772, "rewards/margins": 3.274421989917755, "rewards/rejected": -3.007664442062378, "step": 13651 }, { "epoch": 0.7236106326027615, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 86165120.0, "logits/rejected": -18957643.2, "logps/chosen": -485.309814453125, "logps/rejected": -238.716357421875, "loss": 0.2946, "rewards/chosen": 0.3782292604446411, "rewards/margins": 1.844606137275696, "rewards/rejected": -1.4663768768310548, "step": 13652 }, { "epoch": 0.7236636366045637, "grad_norm": 109.5, "kl": 2.3448054790496826, "learning_rate": 5e-07, "logits/chosen": -51751788.8, "logits/rejected": -23885938.666666668, "logps/chosen": -217.488232421875, "logps/rejected": -331.2597249348958, "loss": 0.323, "rewards/chosen": 0.5313761234283447, "rewards/margins": 2.397231499354045, "rewards/rejected": -1.8658553759257, "step": 13653 }, { "epoch": 0.7237166406063658, "grad_norm": 42.75, "kl": 1.318511962890625, "learning_rate": 5e-07, "logits/chosen": -55703744.0, "logits/rejected": -51511888.0, "logps/chosen": -164.24559020996094, "logps/rejected": -565.235107421875, "loss": 0.2622, "rewards/chosen": 0.1432277262210846, "rewards/margins": 4.484646946191788, "rewards/rejected": -4.341419219970703, "step": 13654 }, { "epoch": 0.723769644608168, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32076364.0, "logits/rejected": -71696440.0, "logps/chosen": -266.17205810546875, "logps/rejected": -200.22689819335938, "loss": 0.2936, "rewards/chosen": 0.3819461762905121, "rewards/margins": 2.2039569318294525, "rewards/rejected": -1.8220107555389404, "step": 13655 }, { "epoch": 0.72382264860997, "grad_norm": 42.75, "kl": 0.3732180595397949, "learning_rate": 5e-07, "logits/chosen": -8012054.666666667, "logits/rejected": -29205507.2, "logps/chosen": -129.25511678059897, "logps/rejected": -448.40791015625, "loss": 0.2195, "rewards/chosen": 0.5340547561645508, "rewards/margins": 3.2726974487304688, "rewards/rejected": -2.738642692565918, "step": 13656 }, { "epoch": 0.7238756526117722, "grad_norm": 37.5, "kl": 0.1354084014892578, "learning_rate": 5e-07, "logits/chosen": -37773333.333333336, "logits/rejected": -23300680.0, "logps/chosen": -91.29050699869792, "logps/rejected": -261.1357421875, "loss": 0.2084, "rewards/chosen": 0.41750959555308026, "rewards/margins": 3.3440673271814982, "rewards/rejected": -2.926557731628418, "step": 13657 }, { "epoch": 0.7239286566135743, "grad_norm": 59.5, "kl": 0.33632850646972656, "learning_rate": 5e-07, "logits/chosen": -27478765.333333332, "logits/rejected": -22360956.8, "logps/chosen": -155.24659220377603, "logps/rejected": -222.58291015625, "loss": 0.2155, "rewards/chosen": 0.7267377376556396, "rewards/margins": 3.3324323177337645, "rewards/rejected": -2.605694580078125, "step": 13658 }, { "epoch": 0.7239816606153765, "grad_norm": 51.0, "kl": 0.470245361328125, "learning_rate": 5e-07, "logits/chosen": -16642220.8, "logits/rejected": -18469253.333333332, "logps/chosen": -736.953369140625, "logps/rejected": -583.9449055989584, "loss": 0.2145, "rewards/chosen": 1.4458056449890138, "rewards/margins": 3.839855925242106, "rewards/rejected": -2.3940502802530923, "step": 13659 }, { "epoch": 0.7240346646171786, "grad_norm": 57.0, "kl": 1.430978775024414, "learning_rate": 5e-07, "logits/chosen": -72414832.0, "logits/rejected": -44503812.0, "logps/chosen": -876.537353515625, "logps/rejected": -390.76312255859375, "loss": 0.2742, "rewards/chosen": 0.792462944984436, "rewards/margins": 2.5016456842422485, "rewards/rejected": -1.7091827392578125, "step": 13660 }, { "epoch": 0.7240876686189808, "grad_norm": 43.25, "kl": 5.518054008483887, "learning_rate": 5e-07, "logits/chosen": -9113982.666666666, "logits/rejected": -7811979.0, "logps/chosen": -168.20756022135416, "logps/rejected": -199.75729370117188, "loss": 0.3852, "rewards/chosen": 0.5262551705042521, "rewards/margins": 2.3491334120432534, "rewards/rejected": -1.8228782415390015, "step": 13661 }, { "epoch": 0.7241406726207829, "grad_norm": 48.0, "kl": 0.6467781066894531, "learning_rate": 5e-07, "logits/chosen": -14927146.0, "logits/rejected": -9455935.333333334, "logps/chosen": -267.1844482421875, "logps/rejected": -241.8158162434896, "loss": 0.2366, "rewards/chosen": 0.08843308687210083, "rewards/margins": 2.6366969545682273, "rewards/rejected": -2.5482638676961265, "step": 13662 }, { "epoch": 0.724193676622585, "grad_norm": 37.5, "kl": 0.5268335342407227, "learning_rate": 5e-07, "logits/chosen": -2921048.6666666665, "logits/rejected": -38308332.0, "logps/chosen": -224.31941731770834, "logps/rejected": -394.31195068359375, "loss": 0.3307, "rewards/chosen": 0.40450119972229004, "rewards/margins": 3.6076505184173584, "rewards/rejected": -3.2031493186950684, "step": 13663 }, { "epoch": 0.7242466806243871, "grad_norm": 74.0, "kl": 5.962802886962891, "learning_rate": 5e-07, "logits/chosen": -50573668.571428575, "logits/rejected": -50509784.0, "logps/chosen": -547.8590611049107, "logps/rejected": -216.76327514648438, "loss": 0.4671, "rewards/chosen": 0.6008171354021344, "rewards/margins": 1.489968172141484, "rewards/rejected": -0.8891510367393494, "step": 13664 }, { "epoch": 0.7242996846261893, "grad_norm": 45.0, "kl": 1.5531730651855469, "learning_rate": 5e-07, "logits/chosen": -42013288.0, "logits/rejected": -65386617.6, "logps/chosen": -365.3372395833333, "logps/rejected": -508.903173828125, "loss": 0.2174, "rewards/chosen": 0.25241154432296753, "rewards/margins": 3.007399833202362, "rewards/rejected": -2.7549882888793946, "step": 13665 }, { "epoch": 0.7243526886279914, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22874306.0, "logits/rejected": -35740845.71428572, "logps/chosen": -91.33200073242188, "logps/rejected": -242.41092354910714, "loss": 0.2179, "rewards/chosen": 0.6941863894462585, "rewards/margins": 2.665545914854322, "rewards/rejected": -1.9713595254080636, "step": 13666 }, { "epoch": 0.7244056926297936, "grad_norm": 50.75, "kl": 1.4372615814208984, "learning_rate": 5e-07, "logits/chosen": -36031628.8, "logits/rejected": -4747470.666666667, "logps/chosen": -287.7074462890625, "logps/rejected": -226.44657389322916, "loss": 0.3445, "rewards/chosen": 0.6325023651123047, "rewards/margins": 3.0403100331624353, "rewards/rejected": -2.4078076680501304, "step": 13667 }, { "epoch": 0.7244586966315957, "grad_norm": 61.25, "kl": 4.094392776489258, "learning_rate": 5e-07, "logits/chosen": -28124892.0, "logits/rejected": -9772003.0, "logps/chosen": -495.6878662109375, "logps/rejected": -507.6180114746094, "loss": 0.2743, "rewards/chosen": 0.3662973642349243, "rewards/margins": 3.4293144941329956, "rewards/rejected": -3.0630171298980713, "step": 13668 }, { "epoch": 0.7245117006333979, "grad_norm": 41.25, "kl": 4.785828590393066, "learning_rate": 5e-07, "logits/chosen": 294639.8, "logits/rejected": -12488301.333333334, "logps/chosen": -155.283740234375, "logps/rejected": -391.2128092447917, "loss": 0.3548, "rewards/chosen": 0.5260346412658692, "rewards/margins": 2.6705182075500487, "rewards/rejected": -2.1444835662841797, "step": 13669 }, { "epoch": 0.7245647046352, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9510344.0, "logits/rejected": -14994886.4, "logps/chosen": -297.53444417317706, "logps/rejected": -400.278515625, "loss": 0.2394, "rewards/chosen": 0.7630128065745035, "rewards/margins": 3.134048000971476, "rewards/rejected": -2.3710351943969727, "step": 13670 }, { "epoch": 0.7246177086370021, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45183468.8, "logits/rejected": -40756754.666666664, "logps/chosen": -406.1025390625, "logps/rejected": -386.4971516927083, "loss": 0.2221, "rewards/chosen": 0.8105947494506835, "rewards/margins": 4.088651847839356, "rewards/rejected": -3.278057098388672, "step": 13671 }, { "epoch": 0.7246707126388042, "grad_norm": 52.75, "kl": 0.5306205749511719, "learning_rate": 5e-07, "logits/chosen": -5134299.0, "logits/rejected": -757097.9166666666, "logps/chosen": -239.54373168945312, "logps/rejected": -114.661865234375, "loss": 0.2883, "rewards/chosen": 0.7233715653419495, "rewards/margins": 2.1824713349342346, "rewards/rejected": -1.4590997695922852, "step": 13672 }, { "epoch": 0.7247237166406064, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24777387.2, "logits/rejected": 84503205.33333333, "logps/chosen": -518.5361328125, "logps/rejected": -164.95914713541666, "loss": 0.2101, "rewards/chosen": 1.343461799621582, "rewards/margins": 3.6141283988952635, "rewards/rejected": -2.2706665992736816, "step": 13673 }, { "epoch": 0.7247767206424085, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -83541941.33333333, "logits/rejected": -41069427.2, "logps/chosen": -478.4801025390625, "logps/rejected": -445.59658203125, "loss": 0.1794, "rewards/chosen": 0.9328415393829346, "rewards/margins": 3.5488514423370363, "rewards/rejected": -2.6160099029541017, "step": 13674 }, { "epoch": 0.7248297246442107, "grad_norm": 53.5, "kl": 4.245075225830078, "learning_rate": 5e-07, "logits/chosen": -34198704.0, "logits/rejected": -30404508.0, "logps/chosen": -365.4583740234375, "logps/rejected": -190.389404296875, "loss": 0.2749, "rewards/chosen": 0.9991769790649414, "rewards/margins": 3.6631174087524414, "rewards/rejected": -2.6639404296875, "step": 13675 }, { "epoch": 0.7248827286460128, "grad_norm": 30.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31242924.0, "logits/rejected": -18660652.0, "logps/chosen": -131.94363403320312, "logps/rejected": -212.7860565185547, "loss": 0.1529, "rewards/chosen": 1.2476242780685425, "rewards/margins": 4.393863558769226, "rewards/rejected": -3.1462392807006836, "step": 13676 }, { "epoch": 0.7249357326478149, "grad_norm": 62.25, "kl": 1.3017539978027344, "learning_rate": 5e-07, "logits/chosen": -63533248.0, "logits/rejected": -20247198.0, "logps/chosen": -419.8069661458333, "logps/rejected": -295.8092956542969, "loss": 0.3736, "rewards/chosen": 0.19155985116958618, "rewards/margins": 2.799758017063141, "rewards/rejected": -2.6081981658935547, "step": 13677 }, { "epoch": 0.724988736649617, "grad_norm": 49.5, "kl": 1.467782974243164, "learning_rate": 5e-07, "logits/chosen": -40522412.8, "logits/rejected": -29305584.0, "logps/chosen": -251.590576171875, "logps/rejected": -225.353271484375, "loss": 0.2858, "rewards/chosen": 0.6833473682403565, "rewards/margins": 2.1936748663584393, "rewards/rejected": -1.5103274981180828, "step": 13678 }, { "epoch": 0.7250417406514191, "grad_norm": 48.5, "kl": 2.960702896118164, "learning_rate": 5e-07, "logits/chosen": -15844062.666666666, "logits/rejected": -40008108.0, "logps/chosen": -146.97317504882812, "logps/rejected": -201.4112548828125, "loss": 0.4329, "rewards/chosen": 0.3749956687291463, "rewards/margins": 1.7464152177174885, "rewards/rejected": -1.3714195489883423, "step": 13679 }, { "epoch": 0.7250947446532213, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43162560.0, "logits/rejected": -28720264.0, "logps/chosen": -387.52532958984375, "logps/rejected": -329.1187744140625, "loss": 0.3345, "rewards/chosen": -0.19137802720069885, "rewards/margins": 2.159409373998642, "rewards/rejected": -2.350787401199341, "step": 13680 }, { "epoch": 0.7251477486550234, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44723424.0, "logits/rejected": -20797773.333333332, "logps/chosen": -294.587646484375, "logps/rejected": -393.5793050130208, "loss": 0.28, "rewards/chosen": 0.39681596755981446, "rewards/margins": 3.924623203277588, "rewards/rejected": -3.5278072357177734, "step": 13681 }, { "epoch": 0.7252007526568256, "grad_norm": 47.0, "kl": 1.2563018798828125, "learning_rate": 5e-07, "logits/chosen": -17794778.666666668, "logits/rejected": -120102936.0, "logps/chosen": -285.98004150390625, "logps/rejected": -425.3358459472656, "loss": 0.3139, "rewards/chosen": 0.8301545778910319, "rewards/margins": 3.679764906565348, "rewards/rejected": -2.8496103286743164, "step": 13682 }, { "epoch": 0.7252537566586277, "grad_norm": 38.75, "kl": 1.1503753662109375, "learning_rate": 5e-07, "logits/chosen": -39856936.0, "logits/rejected": 13754192.0, "logps/chosen": -612.2108764648438, "logps/rejected": -285.2467346191406, "loss": 0.1555, "rewards/chosen": 1.5017013549804688, "rewards/margins": 4.72904109954834, "rewards/rejected": -3.227339744567871, "step": 13683 }, { "epoch": 0.7253067606604299, "grad_norm": 49.75, "kl": 1.9348030090332031, "learning_rate": 5e-07, "logits/chosen": -25327244.0, "logits/rejected": -39410024.0, "logps/chosen": -226.473388671875, "logps/rejected": -128.11192321777344, "loss": 0.3414, "rewards/chosen": 0.5005006790161133, "rewards/margins": 2.4563212394714355, "rewards/rejected": -1.9558205604553223, "step": 13684 }, { "epoch": 0.725359764662232, "grad_norm": 45.0, "kl": 5.236589431762695, "learning_rate": 5e-07, "logits/chosen": -49113749.333333336, "logits/rejected": -11814795.0, "logps/chosen": -504.0133463541667, "logps/rejected": -426.4955749511719, "loss": 0.3186, "rewards/chosen": 1.108455975850423, "rewards/margins": 4.1057530244191485, "rewards/rejected": -2.9972970485687256, "step": 13685 }, { "epoch": 0.7254127686640341, "grad_norm": 55.25, "kl": 0.06487655639648438, "learning_rate": 5e-07, "logits/chosen": -43029128.0, "logits/rejected": -2324883.75, "logps/chosen": -572.3432006835938, "logps/rejected": -141.216064453125, "loss": 0.3062, "rewards/chosen": 0.7383418679237366, "rewards/margins": 2.3492977023124695, "rewards/rejected": -1.610955834388733, "step": 13686 }, { "epoch": 0.7254657726658362, "grad_norm": 58.0, "kl": 4.6065673828125, "learning_rate": 5e-07, "logits/chosen": 8276560.0, "logits/rejected": -18822526.0, "logps/chosen": -277.31996663411456, "logps/rejected": -130.49624633789062, "loss": 0.3416, "rewards/chosen": 0.8712739944458008, "rewards/margins": 1.686060607433319, "rewards/rejected": -0.8147866129875183, "step": 13687 }, { "epoch": 0.7255187766676384, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25639544.0, "logits/rejected": -9617258.666666666, "logps/chosen": -230.1144775390625, "logps/rejected": -180.88260904947916, "loss": 0.3508, "rewards/chosen": 0.37944109439849855, "rewards/margins": 1.6515475034713745, "rewards/rejected": -1.272106409072876, "step": 13688 }, { "epoch": 0.7255717806694405, "grad_norm": 48.0, "kl": 3.956890106201172, "learning_rate": 5e-07, "logits/chosen": -32459672.0, "logits/rejected": -17634730.0, "logps/chosen": -1155.5262451171875, "logps/rejected": -452.7636413574219, "loss": 0.2223, "rewards/chosen": 1.0846668481826782, "rewards/margins": 3.8401790857315063, "rewards/rejected": -2.755512237548828, "step": 13689 }, { "epoch": 0.7256247846712427, "grad_norm": 46.5, "kl": 0.753997802734375, "learning_rate": 5e-07, "logits/chosen": -45157544.0, "logits/rejected": -17905258.0, "logps/chosen": -380.8177185058594, "logps/rejected": -285.53033447265625, "loss": 0.1673, "rewards/chosen": 1.1757128238677979, "rewards/margins": 4.162637948989868, "rewards/rejected": -2.9869251251220703, "step": 13690 }, { "epoch": 0.7256777886730448, "grad_norm": 36.25, "kl": 0.38320159912109375, "learning_rate": 5e-07, "logits/chosen": 7031541.0, "logits/rejected": -55390249.14285714, "logps/chosen": -38.67173767089844, "logps/rejected": -390.29213169642856, "loss": 0.1491, "rewards/chosen": 0.6999801993370056, "rewards/margins": 3.2492611833981107, "rewards/rejected": -2.549280984061105, "step": 13691 }, { "epoch": 0.725730792674847, "grad_norm": 57.25, "kl": 0.36683130264282227, "learning_rate": 5e-07, "logits/chosen": -22929672.0, "logits/rejected": -35660836.0, "logps/chosen": -413.4276123046875, "logps/rejected": -325.7164001464844, "loss": 0.2156, "rewards/chosen": 1.1965748071670532, "rewards/margins": 3.2859259843826294, "rewards/rejected": -2.089351177215576, "step": 13692 }, { "epoch": 0.725783796676649, "grad_norm": 51.75, "kl": 0.0833282470703125, "learning_rate": 5e-07, "logits/chosen": -6295974.0, "logits/rejected": -25815352.0, "logps/chosen": -347.2474670410156, "logps/rejected": -310.84307861328125, "loss": 0.294, "rewards/chosen": 0.20357438921928406, "rewards/margins": 2.954633444547653, "rewards/rejected": -2.751059055328369, "step": 13693 }, { "epoch": 0.7258368006784512, "grad_norm": 39.75, "kl": 1.632516860961914, "learning_rate": 5e-07, "logits/chosen": -37207104.0, "logits/rejected": -24826768.0, "logps/chosen": -437.686279296875, "logps/rejected": -230.849560546875, "loss": 0.2853, "rewards/chosen": 0.402674396832784, "rewards/margins": 2.389741619427999, "rewards/rejected": -1.9870672225952148, "step": 13694 }, { "epoch": 0.7258898046802533, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48791452.0, "logits/rejected": -25229390.0, "logps/chosen": -298.291748046875, "logps/rejected": -338.49310302734375, "loss": 0.3144, "rewards/chosen": -0.0021675098687410355, "rewards/margins": 2.808855725452304, "rewards/rejected": -2.811023235321045, "step": 13695 }, { "epoch": 0.7259428086820555, "grad_norm": 30.375, "kl": 0.37784290313720703, "learning_rate": 5e-07, "logits/chosen": 9100442.0, "logits/rejected": -2743822.0, "logps/chosen": -32.05226135253906, "logps/rejected": -98.43855285644531, "loss": 0.296, "rewards/chosen": 0.2443741261959076, "rewards/margins": 2.751299947500229, "rewards/rejected": -2.5069258213043213, "step": 13696 }, { "epoch": 0.7259958126838576, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14666456.0, "logits/rejected": -26042624.0, "logps/chosen": -374.38134765625, "logps/rejected": -476.515625, "loss": 0.1061, "rewards/chosen": 2.0701568126678467, "rewards/margins": 4.8074634075164795, "rewards/rejected": -2.737306594848633, "step": 13697 }, { "epoch": 0.7260488166856598, "grad_norm": 38.0, "kl": 0.7789058685302734, "learning_rate": 5e-07, "logits/chosen": -36579680.0, "logits/rejected": 57572876.8, "logps/chosen": -175.2956339518229, "logps/rejected": -298.979541015625, "loss": 0.2209, "rewards/chosen": 0.420948584874471, "rewards/margins": 2.699287589391073, "rewards/rejected": -2.2783390045166017, "step": 13698 }, { "epoch": 0.7261018206874619, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29046944.0, "logits/rejected": -11360352.8, "logps/chosen": -271.2556966145833, "logps/rejected": -304.77529296875, "loss": 0.254, "rewards/chosen": -0.03094889720280965, "rewards/margins": 2.7852425853411353, "rewards/rejected": -2.816191482543945, "step": 13699 }, { "epoch": 0.7261548246892641, "grad_norm": 39.0, "kl": 0.7478485107421875, "learning_rate": 5e-07, "logits/chosen": -16985264.0, "logits/rejected": -26922504.0, "logps/chosen": -185.12619018554688, "logps/rejected": -274.52081298828125, "loss": 0.2417, "rewards/chosen": 0.5276352167129517, "rewards/margins": 3.6262682676315308, "rewards/rejected": -3.098633050918579, "step": 13700 }, { "epoch": 0.7262078286910661, "grad_norm": 45.25, "kl": 4.869297027587891, "learning_rate": 5e-07, "logits/chosen": -26322096.0, "logits/rejected": -28904308.0, "logps/chosen": -215.6765340169271, "logps/rejected": -573.8294067382812, "loss": 0.3794, "rewards/chosen": 0.8772745132446289, "rewards/margins": 3.3874521255493164, "rewards/rejected": -2.5101776123046875, "step": 13701 }, { "epoch": 0.7262608326928683, "grad_norm": 28.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15573790.0, "logits/rejected": -40181275.428571425, "logps/chosen": -66.77815246582031, "logps/rejected": -329.11593191964283, "loss": 0.1461, "rewards/chosen": -0.02540740929543972, "rewards/margins": 2.8325784959431206, "rewards/rejected": -2.8579859052385603, "step": 13702 }, { "epoch": 0.7263138366946704, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36966056.0, "logits/rejected": -20332866.0, "logps/chosen": -409.40777587890625, "logps/rejected": -244.72622680664062, "loss": 0.1813, "rewards/chosen": 1.0047391653060913, "rewards/margins": 3.912556529045105, "rewards/rejected": -2.9078173637390137, "step": 13703 }, { "epoch": 0.7263668406964726, "grad_norm": 59.0, "kl": 0.468475341796875, "learning_rate": 5e-07, "logits/chosen": -74292656.0, "logits/rejected": -35707336.0, "logps/chosen": -365.47119140625, "logps/rejected": -635.5531005859375, "loss": 0.3331, "rewards/chosen": -0.34505194425582886, "rewards/margins": 2.5250821709632874, "rewards/rejected": -2.870134115219116, "step": 13704 }, { "epoch": 0.7264198446982747, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6684717.0, "logits/rejected": -47994093.71428572, "logps/chosen": -45.25091552734375, "logps/rejected": -240.58858816964286, "loss": 0.2554, "rewards/chosen": -0.3984886109828949, "rewards/margins": 1.5746598712035589, "rewards/rejected": -1.9731484821864538, "step": 13705 }, { "epoch": 0.7264728487000769, "grad_norm": 41.5, "kl": 0.8615617752075195, "learning_rate": 5e-07, "logits/chosen": -16550898.0, "logits/rejected": -24082166.0, "logps/chosen": -245.17205810546875, "logps/rejected": -380.84051513671875, "loss": 0.2013, "rewards/chosen": 1.0130863189697266, "rewards/margins": 3.3813090324401855, "rewards/rejected": -2.368222713470459, "step": 13706 }, { "epoch": 0.726525852701879, "grad_norm": 59.0, "kl": 6.13755989074707, "learning_rate": 5e-07, "logits/chosen": -24557947.42857143, "logits/rejected": -6117604.0, "logps/chosen": -295.44119698660717, "logps/rejected": -147.70816040039062, "loss": 0.4408, "rewards/chosen": 0.6900625910077777, "rewards/margins": 5.099813052586147, "rewards/rejected": -4.409750461578369, "step": 13707 }, { "epoch": 0.7265788567036812, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19014120.0, "logits/rejected": -19965000.0, "logps/chosen": -202.362548828125, "logps/rejected": -399.2008972167969, "loss": 0.2902, "rewards/chosen": -0.05755210667848587, "rewards/margins": 3.4077782556414604, "rewards/rejected": -3.4653303623199463, "step": 13708 }, { "epoch": 0.7266318607054832, "grad_norm": 48.0, "kl": 1.1078357696533203, "learning_rate": 5e-07, "logits/chosen": -34915165.333333336, "logits/rejected": -26376130.0, "logps/chosen": -468.770263671875, "logps/rejected": -159.4237060546875, "loss": 0.2764, "rewards/chosen": 0.9130562941233317, "rewards/margins": 4.353473583857219, "rewards/rejected": -3.4404172897338867, "step": 13709 }, { "epoch": 0.7266848647072854, "grad_norm": 50.25, "kl": 2.3314266204833984, "learning_rate": 5e-07, "logits/chosen": -11146874.666666666, "logits/rejected": -12359516.0, "logps/chosen": -275.7295328776042, "logps/rejected": -241.4881591796875, "loss": 0.2366, "rewards/chosen": 1.223828395207723, "rewards/margins": 3.3477824052174885, "rewards/rejected": -2.123954010009766, "step": 13710 }, { "epoch": 0.7267378687090875, "grad_norm": 54.5, "kl": 0.7955894470214844, "learning_rate": 5e-07, "logits/chosen": -55851056.0, "logits/rejected": -4290474.0, "logps/chosen": -385.6230875651042, "logps/rejected": -287.70111083984375, "loss": 0.3635, "rewards/chosen": 0.5692039330800375, "rewards/margins": 2.3038726647694907, "rewards/rejected": -1.7346687316894531, "step": 13711 }, { "epoch": 0.7267908727108897, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2848205.0, "logits/rejected": -9978352.0, "logps/chosen": -96.45791625976562, "logps/rejected": -250.1610107421875, "loss": 0.3409, "rewards/chosen": -0.44101446866989136, "rewards/margins": 0.9466010530789692, "rewards/rejected": -1.3876155217488606, "step": 13712 }, { "epoch": 0.7268438767126918, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30504426.666666668, "logits/rejected": -31447616.0, "logps/chosen": -338.7468668619792, "logps/rejected": -254.948876953125, "loss": 0.2733, "rewards/chosen": 0.22388710578282675, "rewards/margins": 1.9970554788907369, "rewards/rejected": -1.7731683731079102, "step": 13713 }, { "epoch": 0.726896880714494, "grad_norm": 45.75, "kl": 2.816911220550537, "learning_rate": 5e-07, "logits/chosen": -39165592.0, "logits/rejected": 39089717.333333336, "logps/chosen": -226.5452423095703, "logps/rejected": -408.3699137369792, "loss": 0.2404, "rewards/chosen": 0.5115347504615784, "rewards/margins": 2.5497131943702698, "rewards/rejected": -2.0381784439086914, "step": 13714 }, { "epoch": 0.7269498847162961, "grad_norm": 55.5, "kl": 0.6342086791992188, "learning_rate": 5e-07, "logits/chosen": -44694064.0, "logits/rejected": -13802350.0, "logps/chosen": -503.58343505859375, "logps/rejected": -155.90689086914062, "loss": 0.3316, "rewards/chosen": 0.047275349497795105, "rewards/margins": 1.8021004647016525, "rewards/rejected": -1.7548251152038574, "step": 13715 }, { "epoch": 0.7270028887180983, "grad_norm": 55.0, "kl": 1.458984375, "learning_rate": 5e-07, "logits/chosen": -34470740.0, "logits/rejected": -26022896.0, "logps/chosen": -454.19659423828125, "logps/rejected": -219.29159545898438, "loss": 0.2261, "rewards/chosen": 0.9482598304748535, "rewards/margins": 2.8359732627868652, "rewards/rejected": -1.8877134323120117, "step": 13716 }, { "epoch": 0.7270558927199003, "grad_norm": 46.25, "kl": 3.1738357543945312, "learning_rate": 5e-07, "logits/chosen": -12282892.0, "logits/rejected": -19955266.666666668, "logps/chosen": -217.5651123046875, "logps/rejected": -155.66374715169272, "loss": 0.3072, "rewards/chosen": 0.8752716064453125, "rewards/margins": 3.747075843811035, "rewards/rejected": -2.8718042373657227, "step": 13717 }, { "epoch": 0.7271088967217025, "grad_norm": 50.75, "kl": 4.87419319152832, "learning_rate": 5e-07, "logits/chosen": -33544788.0, "logits/rejected": -33216796.0, "logps/chosen": -612.5177001953125, "logps/rejected": -369.11907958984375, "loss": 0.2195, "rewards/chosen": 1.4088146686553955, "rewards/margins": 3.5944111347198486, "rewards/rejected": -2.185596466064453, "step": 13718 }, { "epoch": 0.7271619007235046, "grad_norm": 49.75, "kl": 0.9538955688476562, "learning_rate": 5e-07, "logits/chosen": -12093117.333333334, "logits/rejected": -7871982.4, "logps/chosen": -142.6315714518229, "logps/rejected": -121.68970947265625, "loss": 0.2655, "rewards/chosen": 1.6973015467325847, "rewards/margins": 2.8647692362467447, "rewards/rejected": -1.1674676895141602, "step": 13719 }, { "epoch": 0.7272149047253068, "grad_norm": 37.5, "kl": 2.90496826171875, "learning_rate": 5e-07, "logits/chosen": -15839736.0, "logits/rejected": -3095493.5, "logps/chosen": -266.1496887207031, "logps/rejected": -191.217041015625, "loss": 0.2736, "rewards/chosen": 0.8083860278129578, "rewards/margins": 2.785632073879242, "rewards/rejected": -1.9772460460662842, "step": 13720 }, { "epoch": 0.7272679087271089, "grad_norm": 45.25, "kl": 0.8934488296508789, "learning_rate": 5e-07, "logits/chosen": -23235858.0, "logits/rejected": -14794185.0, "logps/chosen": -496.0947265625, "logps/rejected": -319.9342041015625, "loss": 0.2038, "rewards/chosen": 1.3482739925384521, "rewards/margins": 3.373936414718628, "rewards/rejected": -2.025662422180176, "step": 13721 }, { "epoch": 0.7273209127289111, "grad_norm": 50.5, "kl": 1.7623109817504883, "learning_rate": 5e-07, "logits/chosen": -31323709.333333332, "logits/rejected": 1516922.0, "logps/chosen": -438.85009765625, "logps/rejected": -70.58129119873047, "loss": 0.4126, "rewards/chosen": 0.5969489812850952, "rewards/margins": 1.4578549265861511, "rewards/rejected": -0.8609059453010559, "step": 13722 }, { "epoch": 0.7273739167307132, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13371048.0, "logits/rejected": -6668574.666666667, "logps/chosen": -191.1069091796875, "logps/rejected": -240.9790242513021, "loss": 0.4174, "rewards/chosen": -0.19251450300216674, "rewards/margins": 1.256568507353465, "rewards/rejected": -1.4490830103556316, "step": 13723 }, { "epoch": 0.7274269207325154, "grad_norm": 82.0, "kl": 4.491271018981934, "learning_rate": 5e-07, "logits/chosen": -8971022.666666666, "logits/rejected": 997266.0, "logps/chosen": -488.0223388671875, "logps/rejected": -338.35113525390625, "loss": 0.3574, "rewards/chosen": 1.2711520195007324, "rewards/margins": 3.753340482711792, "rewards/rejected": -2.4821884632110596, "step": 13724 }, { "epoch": 0.7274799247343174, "grad_norm": 33.25, "kl": 0.5332927703857422, "learning_rate": 5e-07, "logits/chosen": -26781852.0, "logits/rejected": 4580737.333333333, "logps/chosen": -103.21924591064453, "logps/rejected": -132.89042154947916, "loss": 0.1948, "rewards/chosen": 0.8331204652786255, "rewards/margins": 3.3452572425206504, "rewards/rejected": -2.512136777242025, "step": 13725 }, { "epoch": 0.7275329287361195, "grad_norm": 57.25, "kl": 0.24412918090820312, "learning_rate": 5e-07, "logits/chosen": -29988422.4, "logits/rejected": -101603573.33333333, "logps/chosen": -354.0620361328125, "logps/rejected": -325.8352864583333, "loss": 0.3209, "rewards/chosen": 0.2705985069274902, "rewards/margins": 2.711846129099528, "rewards/rejected": -2.4412476221720376, "step": 13726 }, { "epoch": 0.7275859327379217, "grad_norm": 52.25, "kl": 2.64501953125, "learning_rate": 5e-07, "logits/chosen": -43682552.0, "logits/rejected": -14041585.0, "logps/chosen": -334.4324137369792, "logps/rejected": -333.7388000488281, "loss": 0.3927, "rewards/chosen": 0.25876198212305707, "rewards/margins": 2.983688692251841, "rewards/rejected": -2.724926710128784, "step": 13727 }, { "epoch": 0.7276389367397238, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24286438.0, "logits/rejected": -28227397.333333332, "logps/chosen": -227.82449340820312, "logps/rejected": -450.7448323567708, "loss": 0.1894, "rewards/chosen": 0.01757049560546875, "rewards/margins": 2.5992301305135093, "rewards/rejected": -2.5816596349080405, "step": 13728 }, { "epoch": 0.727691940741526, "grad_norm": 41.0, "kl": 2.054607391357422, "learning_rate": 5e-07, "logits/chosen": -33161264.0, "logits/rejected": -19534158.666666668, "logps/chosen": -176.1931884765625, "logps/rejected": -389.4938151041667, "loss": 0.3358, "rewards/chosen": 0.3744891881942749, "rewards/margins": 3.005953828493754, "rewards/rejected": -2.631464640299479, "step": 13729 }, { "epoch": 0.7277449447433281, "grad_norm": 50.75, "kl": 0.26219749450683594, "learning_rate": 5e-07, "logits/chosen": -9104468.0, "logits/rejected": -4487343.5, "logps/chosen": -381.9069519042969, "logps/rejected": -171.84808349609375, "loss": 0.196, "rewards/chosen": 0.8711336851119995, "rewards/margins": 4.055373787879944, "rewards/rejected": -3.1842401027679443, "step": 13730 }, { "epoch": 0.7277979487451303, "grad_norm": 55.75, "kl": 4.7889404296875, "learning_rate": 5e-07, "logits/chosen": -29654509.333333332, "logits/rejected": -49898840.0, "logps/chosen": -357.1304524739583, "logps/rejected": -272.201171875, "loss": 0.439, "rewards/chosen": 0.5522872606913248, "rewards/margins": 2.027984539667765, "rewards/rejected": -1.4756972789764404, "step": 13731 }, { "epoch": 0.7278509527469323, "grad_norm": 32.0, "kl": 1.2376680374145508, "learning_rate": 5e-07, "logits/chosen": -6228072.0, "logits/rejected": 6184100.666666667, "logps/chosen": -114.83013916015625, "logps/rejected": -158.3175048828125, "loss": 0.3506, "rewards/chosen": -0.0037456125020980833, "rewards/margins": 3.117999497056007, "rewards/rejected": -3.1217451095581055, "step": 13732 }, { "epoch": 0.7279039567487345, "grad_norm": 51.75, "kl": 2.4835281372070312, "learning_rate": 5e-07, "logits/chosen": -16348596.8, "logits/rejected": -3278307.3333333335, "logps/chosen": -271.977099609375, "logps/rejected": -87.44644165039062, "loss": 0.3256, "rewards/chosen": 0.5981749534606934, "rewards/margins": 3.122261905670166, "rewards/rejected": -2.5240869522094727, "step": 13733 }, { "epoch": 0.7279569607505366, "grad_norm": 46.25, "kl": 1.630056381225586, "learning_rate": 5e-07, "logits/chosen": 10264803.333333334, "logits/rejected": 4127929.5, "logps/chosen": -138.02078247070312, "logps/rejected": -74.65460968017578, "loss": 0.4012, "rewards/chosen": 0.46704359849294025, "rewards/margins": 1.4924264748891194, "rewards/rejected": -1.0253828763961792, "step": 13734 }, { "epoch": 0.7280099647523388, "grad_norm": 37.0, "kl": 3.1886062622070312, "learning_rate": 5e-07, "logits/chosen": -38780624.0, "logits/rejected": -39982901.333333336, "logps/chosen": -237.4798583984375, "logps/rejected": -554.6392415364584, "loss": 0.3028, "rewards/chosen": 0.4414694309234619, "rewards/margins": 4.623921950658162, "rewards/rejected": -4.1824525197347, "step": 13735 }, { "epoch": 0.7280629687541409, "grad_norm": 28.125, "kl": 2.3933563232421875, "learning_rate": 5e-07, "logits/chosen": -32212022.0, "logits/rejected": -24989116.0, "logps/chosen": -190.75875854492188, "logps/rejected": -112.50041198730469, "loss": 0.2743, "rewards/chosen": 0.9980125427246094, "rewards/margins": 2.463592290878296, "rewards/rejected": -1.4655797481536865, "step": 13736 }, { "epoch": 0.7281159727559431, "grad_norm": 47.25, "kl": 4.7483367919921875, "learning_rate": 5e-07, "logits/chosen": -27692804.0, "logits/rejected": -21171054.0, "logps/chosen": -321.05072021484375, "logps/rejected": -318.687255859375, "loss": 0.3479, "rewards/chosen": 0.6763612627983093, "rewards/margins": 3.0413569808006287, "rewards/rejected": -2.3649957180023193, "step": 13737 }, { "epoch": 0.7281689767577452, "grad_norm": 51.75, "kl": 0.3834381103515625, "learning_rate": 5e-07, "logits/chosen": -19515324.8, "logits/rejected": -60004869.333333336, "logps/chosen": -326.329541015625, "logps/rejected": -655.0044352213541, "loss": 0.2401, "rewards/chosen": 0.6283985614776612, "rewards/margins": 4.252608283360799, "rewards/rejected": -3.624209721883138, "step": 13738 }, { "epoch": 0.7282219807595474, "grad_norm": 42.5, "kl": 4.036548614501953, "learning_rate": 5e-07, "logits/chosen": -15017017.6, "logits/rejected": -8703490.0, "logps/chosen": -111.14288330078125, "logps/rejected": -296.3284098307292, "loss": 0.3784, "rewards/chosen": 0.7068292140960694, "rewards/margins": 2.846094592412313, "rewards/rejected": -2.1392653783162436, "step": 13739 }, { "epoch": 0.7282749847613494, "grad_norm": 59.5, "kl": 1.1538124084472656, "learning_rate": 5e-07, "logits/chosen": -28796233.6, "logits/rejected": 203529.33333333334, "logps/chosen": -163.0778564453125, "logps/rejected": -443.0170084635417, "loss": 0.3347, "rewards/chosen": 0.2231837749481201, "rewards/margins": 2.430623992284139, "rewards/rejected": -2.207440217336019, "step": 13740 }, { "epoch": 0.7283279887631516, "grad_norm": 65.5, "kl": 1.1374835968017578, "learning_rate": 5e-07, "logits/chosen": -15819002.0, "logits/rejected": -23557778.0, "logps/chosen": -354.7406921386719, "logps/rejected": -301.1325378417969, "loss": 0.3108, "rewards/chosen": 0.3904927968978882, "rewards/margins": 2.0008078813552856, "rewards/rejected": -1.6103150844573975, "step": 13741 }, { "epoch": 0.7283809927649537, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31795754.666666668, "logits/rejected": -30386224.0, "logps/chosen": -276.2731119791667, "logps/rejected": -381.9189697265625, "loss": 0.2263, "rewards/chosen": 0.397449254989624, "rewards/margins": 2.6102559566497803, "rewards/rejected": -2.2128067016601562, "step": 13742 }, { "epoch": 0.7284339967667559, "grad_norm": 44.75, "kl": 0.7216339111328125, "learning_rate": 5e-07, "logits/chosen": -66046180.0, "logits/rejected": -28920402.0, "logps/chosen": -451.68035888671875, "logps/rejected": -382.3832702636719, "loss": 0.2962, "rewards/chosen": 0.3036079406738281, "rewards/margins": 3.2542617321014404, "rewards/rejected": -2.9506537914276123, "step": 13743 }, { "epoch": 0.728487000768558, "grad_norm": 42.5, "kl": 3.38983154296875, "learning_rate": 5e-07, "logits/chosen": 4261728.0, "logits/rejected": -55668464.0, "logps/chosen": -170.27178955078125, "logps/rejected": -252.08984375, "loss": 0.3359, "rewards/chosen": 0.6335104465484619, "rewards/margins": 2.984840567906698, "rewards/rejected": -2.351330121358236, "step": 13744 }, { "epoch": 0.7285400047703602, "grad_norm": 40.75, "kl": 0.2838592529296875, "learning_rate": 5e-07, "logits/chosen": -43013760.0, "logits/rejected": -57215628.0, "logps/chosen": -311.51800537109375, "logps/rejected": -238.64480590820312, "loss": 0.2085, "rewards/chosen": 1.1582659482955933, "rewards/margins": 3.721897006034851, "rewards/rejected": -2.563631057739258, "step": 13745 }, { "epoch": 0.7285930087721623, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30698933.333333332, "logits/rejected": -16011584.0, "logps/chosen": -293.0408121744792, "logps/rejected": -554.438525390625, "loss": 0.194, "rewards/chosen": 0.33932483196258545, "rewards/margins": 3.5723463773727415, "rewards/rejected": -3.233021545410156, "step": 13746 }, { "epoch": 0.7286460127739645, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57953052.0, "logits/rejected": -44593820.0, "logps/chosen": -395.9475402832031, "logps/rejected": -442.76251220703125, "loss": 0.2768, "rewards/chosen": 0.2514396905899048, "rewards/margins": 2.723002552986145, "rewards/rejected": -2.4715628623962402, "step": 13747 }, { "epoch": 0.7286990167757665, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17523470.0, "logits/rejected": -60332548.0, "logps/chosen": -302.009765625, "logps/rejected": -336.664306640625, "loss": 0.2808, "rewards/chosen": 0.03502463549375534, "rewards/margins": 2.997438423335552, "rewards/rejected": -2.962413787841797, "step": 13748 }, { "epoch": 0.7287520207775687, "grad_norm": 40.25, "kl": 1.453857421875, "learning_rate": 5e-07, "logits/chosen": -71542240.0, "logits/rejected": -18848092.0, "logps/chosen": -431.4005126953125, "logps/rejected": -209.5474090576172, "loss": 0.1513, "rewards/chosen": 1.803895115852356, "rewards/margins": 4.422118782997131, "rewards/rejected": -2.6182236671447754, "step": 13749 }, { "epoch": 0.7288050247793708, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12063858.0, "logits/rejected": -74000280.0, "logps/chosen": -134.72576904296875, "logps/rejected": -436.3985595703125, "loss": 0.3657, "rewards/chosen": -0.44799476861953735, "rewards/margins": 1.9594632983207703, "rewards/rejected": -2.4074580669403076, "step": 13750 }, { "epoch": 0.728858028781173, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 108219664.0, "logits/rejected": -32402875.42857143, "logps/chosen": -968.4002685546875, "logps/rejected": -424.0606166294643, "loss": 0.1063, "rewards/chosen": 0.09812011569738388, "rewards/margins": 3.9638694748282433, "rewards/rejected": -3.8657493591308594, "step": 13751 }, { "epoch": 0.7289110327829751, "grad_norm": 52.75, "kl": 0.00881195068359375, "learning_rate": 5e-07, "logits/chosen": 7184868.5, "logits/rejected": -9734704.666666666, "logps/chosen": -56.07582473754883, "logps/rejected": -459.5801595052083, "loss": 0.1814, "rewards/chosen": 0.8724162578582764, "rewards/margins": 2.862062692642212, "rewards/rejected": -1.9896464347839355, "step": 13752 }, { "epoch": 0.7289640367847773, "grad_norm": 46.75, "kl": 1.8988866806030273, "learning_rate": 5e-07, "logits/chosen": -39372808.0, "logits/rejected": -35312876.0, "logps/chosen": -298.0801696777344, "logps/rejected": -317.2982177734375, "loss": 0.3426, "rewards/chosen": 0.4433881938457489, "rewards/margins": 2.3371895253658295, "rewards/rejected": -1.8938013315200806, "step": 13753 }, { "epoch": 0.7290170407865794, "grad_norm": 38.5, "kl": 1.3569564819335938, "learning_rate": 5e-07, "logits/chosen": 1453006.4, "logits/rejected": -28673162.666666668, "logps/chosen": -221.205615234375, "logps/rejected": -387.055419921875, "loss": 0.2571, "rewards/chosen": 0.6821863651275635, "rewards/margins": 3.1544903914133706, "rewards/rejected": -2.472304026285807, "step": 13754 }, { "epoch": 0.7290700447883816, "grad_norm": 45.0, "kl": 1.274688720703125, "learning_rate": 5e-07, "logits/chosen": -22927760.0, "logits/rejected": -1785557.6, "logps/chosen": -337.33986409505206, "logps/rejected": -462.4013671875, "loss": 0.2743, "rewards/chosen": 0.21960222721099854, "rewards/margins": 2.469576287269592, "rewards/rejected": -2.2499740600585936, "step": 13755 }, { "epoch": 0.7291230487901836, "grad_norm": 55.75, "kl": 0.03040027618408203, "learning_rate": 5e-07, "logits/chosen": -72395008.0, "logits/rejected": -20559510.0, "logps/chosen": -228.081787109375, "logps/rejected": -280.82989501953125, "loss": 0.3393, "rewards/chosen": 0.17161142826080322, "rewards/margins": 1.6357307434082031, "rewards/rejected": -1.4641193151474, "step": 13756 }, { "epoch": 0.7291760527919858, "grad_norm": 34.75, "kl": 0.18091583251953125, "learning_rate": 5e-07, "logits/chosen": 5842492.666666667, "logits/rejected": -18831804.8, "logps/chosen": -21.246287027994793, "logps/rejected": -216.4335693359375, "loss": 0.2406, "rewards/chosen": 1.0899806022644043, "rewards/margins": 2.7039402961730956, "rewards/rejected": -1.6139596939086913, "step": 13757 }, { "epoch": 0.7292290567937879, "grad_norm": 30.25, "kl": 4.205540657043457, "learning_rate": 5e-07, "logits/chosen": -3126727.5, "logits/rejected": -63584328.0, "logps/chosen": -99.98411560058594, "logps/rejected": -365.1946716308594, "loss": 0.3353, "rewards/chosen": 0.43928009271621704, "rewards/margins": 2.97885662317276, "rewards/rejected": -2.539576530456543, "step": 13758 }, { "epoch": 0.7292820607955901, "grad_norm": 52.25, "kl": 0.9535598754882812, "learning_rate": 5e-07, "logits/chosen": -11650630.0, "logits/rejected": -12828590.0, "logps/chosen": -460.8675537109375, "logps/rejected": -293.549072265625, "loss": 0.2657, "rewards/chosen": 0.7390937805175781, "rewards/margins": 2.498543381690979, "rewards/rejected": -1.7594496011734009, "step": 13759 }, { "epoch": 0.7293350647973922, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36198216.0, "logits/rejected": -53067872.0, "logps/chosen": -514.9490966796875, "logps/rejected": -402.22481863839283, "loss": 0.1231, "rewards/chosen": 0.843914806842804, "rewards/margins": 3.3969271097864424, "rewards/rejected": -2.5530123029436385, "step": 13760 }, { "epoch": 0.7293880687991944, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43517568.0, "logits/rejected": -39088326.4, "logps/chosen": -556.187255859375, "logps/rejected": -480.38837890625, "loss": 0.1859, "rewards/chosen": 1.0756040414174397, "rewards/margins": 3.7809789498647053, "rewards/rejected": -2.705374908447266, "step": 13761 }, { "epoch": 0.7294410728009965, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 248053600.0, "logits/rejected": 169517712.0, "logps/chosen": -620.1005859375, "logps/rejected": -337.6672668457031, "loss": 0.2416, "rewards/chosen": 0.662901759147644, "rewards/margins": 3.4330450296401978, "rewards/rejected": -2.7701432704925537, "step": 13762 }, { "epoch": 0.7294940768027987, "grad_norm": 47.5, "kl": 0.5141830444335938, "learning_rate": 5e-07, "logits/chosen": -53621264.0, "logits/rejected": -23148516.8, "logps/chosen": -291.09731038411456, "logps/rejected": -457.584765625, "loss": 0.2009, "rewards/chosen": 0.8478196461995443, "rewards/margins": 3.3418435414632164, "rewards/rejected": -2.494023895263672, "step": 13763 }, { "epoch": 0.7295470808046007, "grad_norm": 28.75, "kl": 0.9893722534179688, "learning_rate": 5e-07, "logits/chosen": 7068153.333333333, "logits/rejected": -27507049.6, "logps/chosen": -458.84423828125, "logps/rejected": -280.4228271484375, "loss": 0.1199, "rewards/chosen": 1.5810073216756184, "rewards/margins": 4.625767453511556, "rewards/rejected": -3.0447601318359374, "step": 13764 }, { "epoch": 0.7296000848064029, "grad_norm": 29.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31957060.0, "logits/rejected": -20609840.0, "logps/chosen": -458.57513427734375, "logps/rejected": -117.35174560546875, "loss": 0.1754, "rewards/chosen": 1.1379899978637695, "rewards/margins": 4.669877529144287, "rewards/rejected": -3.5318875312805176, "step": 13765 }, { "epoch": 0.729653088808205, "grad_norm": 57.75, "kl": 0.39063453674316406, "learning_rate": 5e-07, "logits/chosen": -13561044.0, "logits/rejected": -43018323.2, "logps/chosen": -194.87939453125, "logps/rejected": -270.185693359375, "loss": 0.3081, "rewards/chosen": 0.04394645492235819, "rewards/margins": 1.7093744258085888, "rewards/rejected": -1.6654279708862305, "step": 13766 }, { "epoch": 0.7297060928100072, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3226489.6666666665, "logits/rejected": -29564960.0, "logps/chosen": -146.00912475585938, "logps/rejected": -339.0815185546875, "loss": 0.2539, "rewards/chosen": 0.2139981190363566, "rewards/margins": 2.474245540301005, "rewards/rejected": -2.2602474212646486, "step": 13767 }, { "epoch": 0.7297590968118093, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30424772.0, "logits/rejected": -26130421.333333332, "logps/chosen": -175.86849975585938, "logps/rejected": -310.85658772786456, "loss": 0.3089, "rewards/chosen": -0.5107153058052063, "rewards/margins": 1.6179695328076682, "rewards/rejected": -2.1286848386128745, "step": 13768 }, { "epoch": 0.7298121008136115, "grad_norm": 49.75, "kl": 2.0131187438964844, "learning_rate": 5e-07, "logits/chosen": -56776243.2, "logits/rejected": -6605127.333333333, "logps/chosen": -656.4017578125, "logps/rejected": -220.64290364583334, "loss": 0.2806, "rewards/chosen": 0.977590274810791, "rewards/margins": 2.6141589800516765, "rewards/rejected": -1.6365687052408855, "step": 13769 }, { "epoch": 0.7298651048154136, "grad_norm": 61.25, "kl": 0.7719993591308594, "learning_rate": 5e-07, "logits/chosen": -6215083.0, "logits/rejected": -16151942.666666666, "logps/chosen": -166.58755493164062, "logps/rejected": -369.6805826822917, "loss": 0.2839, "rewards/chosen": 0.4570637047290802, "rewards/margins": 2.1593569417794543, "rewards/rejected": -1.7022932370503743, "step": 13770 }, { "epoch": 0.7299181088172157, "grad_norm": 50.75, "kl": 0.6149673461914062, "learning_rate": 5e-07, "logits/chosen": -69234757.33333333, "logits/rejected": -12346493.6, "logps/chosen": -378.4510091145833, "logps/rejected": -319.8556640625, "loss": 0.2549, "rewards/chosen": 0.2928311824798584, "rewards/margins": 2.127572774887085, "rewards/rejected": -1.8347415924072266, "step": 13771 }, { "epoch": 0.7299711128190178, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56901672.0, "logits/rejected": -6518002.666666667, "logps/chosen": -299.3672790527344, "logps/rejected": -320.8199462890625, "loss": 0.271, "rewards/chosen": -0.19650231301784515, "rewards/margins": 1.8390948524077735, "rewards/rejected": -2.0355971654256186, "step": 13772 }, { "epoch": 0.73002411682082, "grad_norm": 49.0, "kl": 1.2801599502563477, "learning_rate": 5e-07, "logits/chosen": -10148189.6, "logits/rejected": -41924010.666666664, "logps/chosen": -183.5606201171875, "logps/rejected": -483.0436197916667, "loss": 0.3634, "rewards/chosen": 0.3671561241149902, "rewards/margins": 1.9917431831359864, "rewards/rejected": -1.624587059020996, "step": 13773 }, { "epoch": 0.7300771208226221, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71362757.33333333, "logits/rejected": -51452134.4, "logps/chosen": -275.0635172526042, "logps/rejected": -517.548486328125, "loss": 0.2349, "rewards/chosen": 0.032192230224609375, "rewards/margins": 2.735930633544922, "rewards/rejected": -2.7037384033203127, "step": 13774 }, { "epoch": 0.7301301248244243, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31961816.0, "logits/rejected": -24764777.6, "logps/chosen": -244.236572265625, "logps/rejected": -449.781298828125, "loss": 0.2089, "rewards/chosen": 0.4514533281326294, "rewards/margins": 2.965092396736145, "rewards/rejected": -2.5136390686035157, "step": 13775 }, { "epoch": 0.7301831288262264, "grad_norm": 70.0, "kl": 2.0583648681640625, "learning_rate": 5e-07, "logits/chosen": -51006117.333333336, "logits/rejected": -3631563.2, "logps/chosen": -507.4682210286458, "logps/rejected": -391.03662109375, "loss": 0.2069, "rewards/chosen": 1.7104899088541667, "rewards/margins": 3.1487520853678386, "rewards/rejected": -1.4382621765136718, "step": 13776 }, { "epoch": 0.7302361328280285, "grad_norm": 65.0, "kl": 0.4439554214477539, "learning_rate": 5e-07, "logits/chosen": -72877802.66666667, "logits/rejected": -10549542.4, "logps/chosen": -552.4010823567709, "logps/rejected": -246.6199951171875, "loss": 0.2695, "rewards/chosen": 0.5107330481211344, "rewards/margins": 2.146632782618205, "rewards/rejected": -1.6358997344970703, "step": 13777 }, { "epoch": 0.7302891368298307, "grad_norm": 46.0, "kl": 2.2277183532714844, "learning_rate": 5e-07, "logits/chosen": -8431599.333333334, "logits/rejected": -43971158.4, "logps/chosen": -196.24580891927084, "logps/rejected": -215.3864990234375, "loss": 0.2447, "rewards/chosen": 0.9667150179545084, "rewards/margins": 2.953314367930094, "rewards/rejected": -1.9865993499755858, "step": 13778 }, { "epoch": 0.7303421408316327, "grad_norm": 42.5, "kl": 0.1587677001953125, "learning_rate": 5e-07, "logits/chosen": 7612053.333333333, "logits/rejected": -61016998.4, "logps/chosen": -203.7189737955729, "logps/rejected": -504.3443359375, "loss": 0.1635, "rewards/chosen": 1.5175002415974934, "rewards/margins": 4.566270764668783, "rewards/rejected": -3.048770523071289, "step": 13779 }, { "epoch": 0.7303951448334349, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28733764.0, "logits/rejected": -62951332.571428575, "logps/chosen": -220.6129150390625, "logps/rejected": -339.31368582589283, "loss": 0.1871, "rewards/chosen": -0.6152405142784119, "rewards/margins": 2.0686175397464206, "rewards/rejected": -2.6838580540248325, "step": 13780 }, { "epoch": 0.730448148835237, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -138389072.0, "logits/rejected": -10396637.714285715, "logps/chosen": -417.9749755859375, "logps/rejected": -340.09130859375, "loss": 0.2945, "rewards/chosen": -0.5915161371231079, "rewards/margins": 0.6698767287390572, "rewards/rejected": -1.2613928658621651, "step": 13781 }, { "epoch": 0.7305011528370392, "grad_norm": 41.75, "kl": 0.6901931762695312, "learning_rate": 5e-07, "logits/chosen": -368051.3333333333, "logits/rejected": -22919990.4, "logps/chosen": -122.36903889973958, "logps/rejected": -352.7251953125, "loss": 0.2434, "rewards/chosen": 0.2528059085210164, "rewards/margins": 2.653839119275411, "rewards/rejected": -2.4010332107543944, "step": 13782 }, { "epoch": 0.7305541568388413, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40090150.4, "logits/rejected": -92433984.0, "logps/chosen": -229.555859375, "logps/rejected": -425.0557454427083, "loss": 0.2833, "rewards/chosen": 0.5723261356353759, "rewards/margins": 3.255282195409139, "rewards/rejected": -2.682956059773763, "step": 13783 }, { "epoch": 0.7306071608406435, "grad_norm": 62.5, "kl": 2.8585052490234375, "learning_rate": 5e-07, "logits/chosen": -21043629.714285713, "logits/rejected": -54692624.0, "logps/chosen": -351.99417550223217, "logps/rejected": -204.99658203125, "loss": 0.2759, "rewards/chosen": 1.237457547869001, "rewards/margins": 3.7641101224081854, "rewards/rejected": -2.5266525745391846, "step": 13784 }, { "epoch": 0.7306601648424456, "grad_norm": 48.25, "kl": 1.2768707275390625, "learning_rate": 5e-07, "logits/chosen": -10303547.0, "logits/rejected": -19363574.0, "logps/chosen": -184.9208984375, "logps/rejected": -337.9412841796875, "loss": 0.3069, "rewards/chosen": 0.26645490527153015, "rewards/margins": 2.0234446227550507, "rewards/rejected": -1.7569897174835205, "step": 13785 }, { "epoch": 0.7307131688442478, "grad_norm": 96.0, "kl": 4.6058349609375, "learning_rate": 5e-07, "logits/chosen": -30729629.333333332, "logits/rejected": -8321499.5, "logps/chosen": -684.0260416666666, "logps/rejected": -530.9512329101562, "loss": 0.2398, "rewards/chosen": 1.4276485443115234, "rewards/margins": 7.2528977394104, "rewards/rejected": -5.825249195098877, "step": 13786 }, { "epoch": 0.7307661728460498, "grad_norm": 40.75, "kl": 0.6760292053222656, "learning_rate": 5e-07, "logits/chosen": -19964290.666666668, "logits/rejected": -11107268.0, "logps/chosen": -401.3649088541667, "logps/rejected": -268.16025390625, "loss": 0.2111, "rewards/chosen": 0.9069621562957764, "rewards/margins": 2.9474956035614013, "rewards/rejected": -2.040533447265625, "step": 13787 }, { "epoch": 0.730819176847852, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40968032.0, "logits/rejected": -38398924.0, "logps/chosen": -238.22555541992188, "logps/rejected": -289.1867980957031, "loss": 0.3407, "rewards/chosen": 0.2162860929965973, "rewards/margins": 1.634214073419571, "rewards/rejected": -1.4179279804229736, "step": 13788 }, { "epoch": 0.7308721808496541, "grad_norm": 45.75, "kl": 0.8907489776611328, "learning_rate": 5e-07, "logits/chosen": -34520870.4, "logits/rejected": -15718884.0, "logps/chosen": -329.0033935546875, "logps/rejected": -695.439453125, "loss": 0.2933, "rewards/chosen": 0.327623176574707, "rewards/margins": 3.8219856897989906, "rewards/rejected": -3.4943625132242837, "step": 13789 }, { "epoch": 0.7309251848514563, "grad_norm": 42.75, "kl": 3.014946937561035, "learning_rate": 5e-07, "logits/chosen": -14092433.6, "logits/rejected": 1627207.5, "logps/chosen": -212.8173583984375, "logps/rejected": -153.961669921875, "loss": 0.4067, "rewards/chosen": 0.28719115257263184, "rewards/margins": 1.7176212469736736, "rewards/rejected": -1.4304300944010417, "step": 13790 }, { "epoch": 0.7309781888532584, "grad_norm": 37.75, "kl": 0.08628273010253906, "learning_rate": 5e-07, "logits/chosen": -11526068.0, "logits/rejected": -34220076.0, "logps/chosen": -269.74530029296875, "logps/rejected": -389.13568115234375, "loss": 0.2399, "rewards/chosen": 0.5878057479858398, "rewards/margins": 3.016422748565674, "rewards/rejected": -2.428617000579834, "step": 13791 }, { "epoch": 0.7310311928550606, "grad_norm": 39.5, "kl": 2.4411888122558594, "learning_rate": 5e-07, "logits/chosen": -4510057.0, "logits/rejected": -53491560.0, "logps/chosen": -227.96841430664062, "logps/rejected": -622.4161987304688, "loss": 0.2106, "rewards/chosen": 1.5801551342010498, "rewards/margins": 4.796100616455078, "rewards/rejected": -3.2159454822540283, "step": 13792 }, { "epoch": 0.7310841968568627, "grad_norm": 64.5, "kl": 6.130624771118164, "learning_rate": 5e-07, "logits/chosen": -25100888.0, "logits/rejected": -77481216.0, "logps/chosen": -210.36995442708334, "logps/rejected": -180.20571899414062, "loss": 0.4626, "rewards/chosen": 0.6132744550704956, "rewards/margins": 1.9654535055160522, "rewards/rejected": -1.3521790504455566, "step": 13793 }, { "epoch": 0.7311372008586648, "grad_norm": 57.75, "kl": 5.244014739990234, "learning_rate": 5e-07, "logits/chosen": -32751238.4, "logits/rejected": -31789450.666666668, "logps/chosen": -422.375537109375, "logps/rejected": -348.5692138671875, "loss": 0.3619, "rewards/chosen": 0.7258109092712403, "rewards/margins": 2.978730742136637, "rewards/rejected": -2.252919832865397, "step": 13794 }, { "epoch": 0.7311902048604669, "grad_norm": 69.5, "kl": 2.8744659423828125, "learning_rate": 5e-07, "logits/chosen": -24621796.57142857, "logits/rejected": -75355168.0, "logps/chosen": -576.8019670758929, "logps/rejected": -296.67901611328125, "loss": 0.4055, "rewards/chosen": 0.7060601370675224, "rewards/margins": 1.6805962579590934, "rewards/rejected": -0.974536120891571, "step": 13795 }, { "epoch": 0.7312432088622691, "grad_norm": 57.75, "kl": 7.265442848205566, "learning_rate": 5e-07, "logits/chosen": -10142066.0, "logps/chosen": -650.9085693359375, "loss": 0.5008, "rewards/chosen": 0.9023569226264954, "step": 13796 }, { "epoch": 0.7312962128640712, "grad_norm": 62.0, "kl": 3.004119873046875, "learning_rate": 5e-07, "logits/chosen": -10771330.4, "logits/rejected": -9044828.666666666, "logps/chosen": -995.99501953125, "logps/rejected": -228.16536458333334, "loss": 0.2499, "rewards/chosen": 1.5135534286499024, "rewards/margins": 3.7421825408935545, "rewards/rejected": -2.2286291122436523, "step": 13797 }, { "epoch": 0.7313492168658734, "grad_norm": 41.75, "kl": 0.7934656143188477, "learning_rate": 5e-07, "logits/chosen": -9583645.0, "logits/rejected": -28538214.0, "logps/chosen": -309.2853088378906, "logps/rejected": -158.3180694580078, "loss": 0.2488, "rewards/chosen": 0.7632992267608643, "rewards/margins": 3.327165126800537, "rewards/rejected": -2.563865900039673, "step": 13798 }, { "epoch": 0.7314022208676755, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39305000.0, "logits/rejected": -88636928.0, "logps/chosen": -120.33287048339844, "logps/rejected": -396.4193929036458, "loss": 0.1841, "rewards/chosen": 0.010303877294063568, "rewards/margins": 2.7531195307771363, "rewards/rejected": -2.7428156534830728, "step": 13799 }, { "epoch": 0.7314552248694777, "grad_norm": 46.75, "kl": 4.499048233032227, "learning_rate": 5e-07, "logits/chosen": -25508224.0, "logits/rejected": -7929725.333333333, "logps/chosen": -367.3194580078125, "logps/rejected": -417.1842447916667, "loss": 0.3115, "rewards/chosen": 1.084821891784668, "rewards/margins": 3.6439628918965656, "rewards/rejected": -2.559141000111898, "step": 13800 }, { "epoch": 0.7315082288712798, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13002139.2, "logits/rejected": -34623578.666666664, "logps/chosen": -273.40341796875, "logps/rejected": -424.259521484375, "loss": 0.3322, "rewards/chosen": 0.32241029739379884, "rewards/margins": 2.1920992851257326, "rewards/rejected": -1.8696889877319336, "step": 13801 }, { "epoch": 0.731561232873082, "grad_norm": 59.25, "kl": 1.2187767028808594, "learning_rate": 5e-07, "logits/chosen": -33791641.6, "logits/rejected": 20112428.0, "logps/chosen": -355.8624267578125, "logps/rejected": -251.87115478515625, "loss": 0.3027, "rewards/chosen": 0.35394988059997556, "rewards/margins": 2.888511037826538, "rewards/rejected": -2.5345611572265625, "step": 13802 }, { "epoch": 0.731614236874884, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -113921561.6, "logits/rejected": -14763750.666666666, "logps/chosen": -229.54794921875, "logps/rejected": -170.6893310546875, "loss": 0.3425, "rewards/chosen": 0.4577265739440918, "rewards/margins": 2.2275334358215333, "rewards/rejected": -1.7698068618774414, "step": 13803 }, { "epoch": 0.7316672408766862, "grad_norm": 48.0, "kl": 3.28826904296875, "learning_rate": 5e-07, "logits/chosen": -33263608.0, "logits/rejected": -12974472.0, "logps/chosen": -240.4286651611328, "logps/rejected": -249.66525268554688, "loss": 0.3, "rewards/chosen": 0.4516352117061615, "rewards/margins": 2.906568855047226, "rewards/rejected": -2.4549336433410645, "step": 13804 }, { "epoch": 0.7317202448784883, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42236266.666666664, "logits/rejected": -56017228.8, "logps/chosen": -328.18503824869794, "logps/rejected": -418.02939453125, "loss": 0.1648, "rewards/chosen": 1.2138783931732178, "rewards/margins": 3.4931777477264405, "rewards/rejected": -2.2792993545532227, "step": 13805 }, { "epoch": 0.7317732488802905, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34938536.0, "logits/rejected": -19962862.0, "logps/chosen": -688.6031494140625, "logps/rejected": -422.67034912109375, "loss": 0.162, "rewards/chosen": 0.9673759937286377, "rewards/margins": 5.275935888290405, "rewards/rejected": -4.308559894561768, "step": 13806 }, { "epoch": 0.7318262528820926, "grad_norm": 35.5, "kl": 2.1342649459838867, "learning_rate": 5e-07, "logits/chosen": -9785510.4, "logits/rejected": 5455574.666666667, "logps/chosen": -128.56353759765625, "logps/rejected": -97.92628987630208, "loss": 0.3606, "rewards/chosen": 0.359992527961731, "rewards/margins": 2.343916916847229, "rewards/rejected": -1.983924388885498, "step": 13807 }, { "epoch": 0.7318792568838948, "grad_norm": 39.0, "kl": 2.7987289428710938, "learning_rate": 5e-07, "logits/chosen": -52668992.0, "logits/rejected": 27961324.0, "logps/chosen": -479.61932373046875, "logps/rejected": -435.10162353515625, "loss": 0.289, "rewards/chosen": 0.7635257840156555, "rewards/margins": 3.5344391465187073, "rewards/rejected": -2.7709133625030518, "step": 13808 }, { "epoch": 0.7319322608856969, "grad_norm": 53.5, "kl": 0.6905670166015625, "learning_rate": 5e-07, "logits/chosen": -45985638.4, "logits/rejected": -32983552.0, "logps/chosen": -266.5434326171875, "logps/rejected": -414.2669270833333, "loss": 0.3354, "rewards/chosen": 0.18919533491134644, "rewards/margins": 2.2467444936434426, "rewards/rejected": -2.057549158732096, "step": 13809 }, { "epoch": 0.731985264887499, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48933416.0, "logits/rejected": 15276637.0, "logps/chosen": -227.68795776367188, "logps/rejected": -331.5654296875, "loss": 0.2445, "rewards/chosen": 0.5840057134628296, "rewards/margins": 3.41517436504364, "rewards/rejected": -2.8311686515808105, "step": 13810 }, { "epoch": 0.7320382688893011, "grad_norm": 43.0, "kl": 1.2256202697753906, "learning_rate": 5e-07, "logits/chosen": -7910696.666666667, "logits/rejected": -10898400.0, "logps/chosen": -300.74151611328125, "logps/rejected": -211.25546875, "loss": 0.1058, "rewards/chosen": 1.6284535725911458, "rewards/margins": 5.1885022481282554, "rewards/rejected": -3.5600486755371095, "step": 13811 }, { "epoch": 0.7320912728911033, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29478514.666666668, "logits/rejected": -4190768.8, "logps/chosen": -162.23311360677084, "logps/rejected": -237.0438232421875, "loss": 0.292, "rewards/chosen": -0.005082706610361735, "rewards/margins": 2.281551166375478, "rewards/rejected": -2.2866338729858398, "step": 13812 }, { "epoch": 0.7321442768929054, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30622546.666666668, "logits/rejected": -28480185.6, "logps/chosen": -505.8135986328125, "logps/rejected": -151.5578125, "loss": 0.1698, "rewards/chosen": 0.7409769694010416, "rewards/margins": 4.81533915201823, "rewards/rejected": -4.074362182617188, "step": 13813 }, { "epoch": 0.7321972808947076, "grad_norm": 74.0, "kl": 3.1228179931640625, "learning_rate": 5e-07, "logits/chosen": -31912820.0, "logits/rejected": -13642446.0, "logps/chosen": -405.6313171386719, "logps/rejected": -198.22352600097656, "loss": 0.3207, "rewards/chosen": 0.26542624831199646, "rewards/margins": 3.3953047692775726, "rewards/rejected": -3.129878520965576, "step": 13814 }, { "epoch": 0.7322502848965097, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7319831.333333333, "logits/rejected": -4662765.2, "logps/chosen": -204.8232218424479, "logps/rejected": -283.28798828125, "loss": 0.2173, "rewards/chosen": 0.7930754025777181, "rewards/margins": 3.539127572377523, "rewards/rejected": -2.7460521697998046, "step": 13815 }, { "epoch": 0.7323032888983119, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66085337.6, "logits/rejected": -17570573.333333332, "logps/chosen": -559.481103515625, "logps/rejected": -156.9190877278646, "loss": 0.3694, "rewards/chosen": 0.17531050443649293, "rewards/margins": 2.669781736532847, "rewards/rejected": -2.494471232096354, "step": 13816 }, { "epoch": 0.732356292900114, "grad_norm": 45.5, "kl": 2.198375701904297, "learning_rate": 5e-07, "logits/chosen": -56979308.0, "logits/rejected": -29629288.0, "logps/chosen": -377.71441650390625, "logps/rejected": -345.2339172363281, "loss": 0.3443, "rewards/chosen": 0.3010851740837097, "rewards/margins": 2.063434064388275, "rewards/rejected": -1.7623488903045654, "step": 13817 }, { "epoch": 0.7324092969019161, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26706394.666666668, "logits/rejected": -43335552.0, "logps/chosen": -264.17041015625, "logps/rejected": -361.0331298828125, "loss": 0.2653, "rewards/chosen": -0.05188249051570892, "rewards/margins": 1.9543602615594864, "rewards/rejected": -2.0062427520751953, "step": 13818 }, { "epoch": 0.7324623009037182, "grad_norm": 47.75, "kl": 0.5864028930664062, "learning_rate": 5e-07, "logits/chosen": -45446848.0, "logits/rejected": -13198956.0, "logps/chosen": -341.98427734375, "logps/rejected": -139.87954711914062, "loss": 0.2456, "rewards/chosen": 0.7995015144348144, "rewards/margins": 3.2619668006896974, "rewards/rejected": -2.462465286254883, "step": 13819 }, { "epoch": 0.7325153049055204, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43260112.0, "logits/rejected": -47370489.6, "logps/chosen": -412.594482421875, "logps/rejected": -695.12822265625, "loss": 0.1749, "rewards/chosen": 0.5624033610026041, "rewards/margins": 4.58732172648112, "rewards/rejected": -4.0249183654785154, "step": 13820 }, { "epoch": 0.7325683089073225, "grad_norm": 46.0, "kl": 2.10443115234375, "learning_rate": 5e-07, "logits/chosen": -14415988.8, "logits/rejected": -10463545.333333334, "logps/chosen": -251.4925048828125, "logps/rejected": -65.44142659505208, "loss": 0.3669, "rewards/chosen": 0.6443074703216553, "rewards/margins": 1.8727789084116617, "rewards/rejected": -1.2284714380900066, "step": 13821 }, { "epoch": 0.7326213129091247, "grad_norm": 53.5, "kl": 1.6209526062011719, "learning_rate": 5e-07, "logits/chosen": -44864272.0, "logits/rejected": 1265588.75, "logps/chosen": -391.9111022949219, "logps/rejected": -142.25997924804688, "loss": 0.2916, "rewards/chosen": 0.6194517016410828, "rewards/margins": 2.708988845348358, "rewards/rejected": -2.0895371437072754, "step": 13822 }, { "epoch": 0.7326743169109268, "grad_norm": 59.0, "kl": 1.7689094543457031, "learning_rate": 5e-07, "logits/chosen": -29604744.0, "logits/rejected": -15634592.0, "logps/chosen": -210.54806518554688, "logps/rejected": -276.711181640625, "loss": 0.2329, "rewards/chosen": 0.5689159035682678, "rewards/margins": 2.79115492105484, "rewards/rejected": -2.2222390174865723, "step": 13823 }, { "epoch": 0.732727320912729, "grad_norm": 47.0, "kl": 3.8945751190185547, "learning_rate": 5e-07, "logits/chosen": 578614.0, "logits/rejected": -35979168.0, "logps/chosen": -129.22962951660156, "logps/rejected": -259.6704406738281, "loss": 0.2919, "rewards/chosen": 0.7869035601615906, "rewards/margins": 2.6163517832756042, "rewards/rejected": -1.8294482231140137, "step": 13824 }, { "epoch": 0.732780324914531, "grad_norm": 49.0, "kl": 0.22345733642578125, "learning_rate": 5e-07, "logits/chosen": -36914484.0, "logits/rejected": 255517.0, "logps/chosen": -269.7549133300781, "logps/rejected": -118.85169982910156, "loss": 0.3422, "rewards/chosen": 0.4663267731666565, "rewards/margins": 2.038805902004242, "rewards/rejected": -1.5724791288375854, "step": 13825 }, { "epoch": 0.7328333289163332, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5572441.5, "logits/rejected": 1006780.8333333334, "logps/chosen": -64.81939697265625, "logps/rejected": -299.69163004557294, "loss": 0.1616, "rewards/chosen": 0.31251201033592224, "rewards/margins": 3.036615937948227, "rewards/rejected": -2.7241039276123047, "step": 13826 }, { "epoch": 0.7328863329181353, "grad_norm": 65.0, "kl": 1.5093269348144531, "learning_rate": 5e-07, "logits/chosen": -8196027.0, "logits/rejected": -20564932.0, "logps/chosen": -411.4336242675781, "logps/rejected": -334.84271240234375, "loss": 0.2607, "rewards/chosen": 0.7435859441757202, "rewards/margins": 2.7670847177505493, "rewards/rejected": -2.023498773574829, "step": 13827 }, { "epoch": 0.7329393369199374, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39121648.0, "logits/rejected": -14383970.666666666, "logps/chosen": -280.17669677734375, "logps/rejected": -245.2138468424479, "loss": 0.1687, "rewards/chosen": 1.9100639820098877, "rewards/margins": 3.683605432510376, "rewards/rejected": -1.7735414505004883, "step": 13828 }, { "epoch": 0.7329923409217396, "grad_norm": 40.75, "kl": 2.125843048095703, "learning_rate": 5e-07, "logits/chosen": -9663282.0, "logits/rejected": -65557460.0, "logps/chosen": -194.8297882080078, "logps/rejected": -313.737060546875, "loss": 0.254, "rewards/chosen": 0.8936907052993774, "rewards/margins": 3.059402585029602, "rewards/rejected": -2.1657118797302246, "step": 13829 }, { "epoch": 0.7330453449235417, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39005952.0, "logits/rejected": -4787019.2, "logps/chosen": -240.44376627604166, "logps/rejected": -138.13111572265626, "loss": 0.2122, "rewards/chosen": 0.8465878963470459, "rewards/margins": 2.8454827785491945, "rewards/rejected": -1.9988948822021484, "step": 13830 }, { "epoch": 0.7330983489253439, "grad_norm": 29.5, "kl": 0.24083805084228516, "learning_rate": 5e-07, "logits/chosen": 8175613.333333333, "logits/rejected": -32636505.6, "logps/chosen": -11.20874277750651, "logps/rejected": -375.659375, "loss": 0.1722, "rewards/chosen": 0.8209223747253418, "rewards/margins": 3.4550814628601074, "rewards/rejected": -2.6341590881347656, "step": 13831 }, { "epoch": 0.733151352927146, "grad_norm": 107.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45700420.0, "logits/rejected": -37225344.0, "logps/chosen": -219.16635131835938, "logps/rejected": -327.70751953125, "loss": 0.3084, "rewards/chosen": 0.285677969455719, "rewards/margins": 2.3285840153694153, "rewards/rejected": -2.0429060459136963, "step": 13832 }, { "epoch": 0.7332043569289481, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35217141.333333336, "logits/rejected": -7750743.2, "logps/chosen": -232.34781901041666, "logps/rejected": -208.7348388671875, "loss": 0.2576, "rewards/chosen": 0.17813867330551147, "rewards/margins": 2.900547730922699, "rewards/rejected": -2.7224090576171873, "step": 13833 }, { "epoch": 0.7332573609307502, "grad_norm": 60.5, "kl": 2.6037559509277344, "learning_rate": 5e-07, "logits/chosen": -110820592.0, "logits/rejected": -25562486.0, "logps/chosen": -309.03656005859375, "logps/rejected": -120.70272827148438, "loss": 0.2718, "rewards/chosen": 1.03536856174469, "rewards/margins": 2.1116950511932373, "rewards/rejected": -1.0763264894485474, "step": 13834 }, { "epoch": 0.7333103649325524, "grad_norm": 47.75, "kl": 1.5645751953125, "learning_rate": 5e-07, "logits/chosen": -47073045.333333336, "logits/rejected": -273207.45, "logps/chosen": -283.7948811848958, "logps/rejected": -126.4398193359375, "loss": 0.2471, "rewards/chosen": 0.7112993399302164, "rewards/margins": 3.0083746115366616, "rewards/rejected": -2.2970752716064453, "step": 13835 }, { "epoch": 0.7333633689343545, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -194303.45, "logits/rejected": 260699.0, "logps/chosen": -124.5968994140625, "logps/rejected": -303.2978515625, "loss": 0.2831, "rewards/chosen": 0.44662842750549314, "rewards/margins": 3.159623956680298, "rewards/rejected": -2.7129955291748047, "step": 13836 }, { "epoch": 0.7334163729361567, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8181193.0, "logits/rejected": -12419426.0, "logps/chosen": -164.6187744140625, "logps/rejected": -155.3367462158203, "loss": 0.264, "rewards/chosen": 0.3445731997489929, "rewards/margins": 2.659235179424286, "rewards/rejected": -2.314661979675293, "step": 13837 }, { "epoch": 0.7334693769379588, "grad_norm": 35.25, "kl": 0.5181770324707031, "learning_rate": 5e-07, "logits/chosen": -16807844.0, "logits/rejected": -41176816.0, "logps/chosen": -936.72216796875, "logps/rejected": -303.6349609375, "loss": 0.1363, "rewards/chosen": 2.002033551534017, "rewards/margins": 4.6949105580647785, "rewards/rejected": -2.692877006530762, "step": 13838 }, { "epoch": 0.733522380939761, "grad_norm": 52.5, "kl": 1.7457962036132812, "learning_rate": 5e-07, "logits/chosen": -28691766.4, "logits/rejected": -29798872.0, "logps/chosen": -358.7313720703125, "logps/rejected": -468.0768636067708, "loss": 0.2317, "rewards/chosen": 1.3151803970336915, "rewards/margins": 3.8068252563476563, "rewards/rejected": -2.491644859313965, "step": 13839 }, { "epoch": 0.733575384941563, "grad_norm": 46.5, "kl": 0.2799034118652344, "learning_rate": 5e-07, "logits/chosen": -27545113.6, "logits/rejected": -47427269.333333336, "logps/chosen": -249.1531005859375, "logps/rejected": -229.9598185221354, "loss": 0.3856, "rewards/chosen": -0.10830535888671874, "rewards/margins": 1.391390101114909, "rewards/rejected": -1.4996954600016277, "step": 13840 }, { "epoch": 0.7336283889433652, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26575338.666666668, "logits/rejected": -31193763.2, "logps/chosen": -321.3988037109375, "logps/rejected": -330.4108154296875, "loss": 0.1779, "rewards/chosen": 0.7747163772583008, "rewards/margins": 4.0100603103637695, "rewards/rejected": -3.2353439331054688, "step": 13841 }, { "epoch": 0.7336813929451673, "grad_norm": 35.25, "kl": 3.2359399795532227, "learning_rate": 5e-07, "logits/chosen": -13258536.0, "logits/rejected": -47520052.0, "logps/chosen": -182.17581176757812, "logps/rejected": -443.50128173828125, "loss": 0.2542, "rewards/chosen": 0.6963128447532654, "rewards/margins": 3.3420602679252625, "rewards/rejected": -2.645747423171997, "step": 13842 }, { "epoch": 0.7337343969469695, "grad_norm": 46.5, "kl": 0.3972892761230469, "learning_rate": 5e-07, "logits/chosen": -54782656.0, "logits/rejected": -24545410.0, "logps/chosen": -312.27862548828125, "logps/rejected": -327.15081787109375, "loss": 0.3158, "rewards/chosen": -0.03464268520474434, "rewards/margins": 2.4562554471194744, "rewards/rejected": -2.4908981323242188, "step": 13843 }, { "epoch": 0.7337874009487716, "grad_norm": 47.5, "kl": 3.314861297607422, "learning_rate": 5e-07, "logits/chosen": -21642016.0, "logits/rejected": -106049184.0, "logps/chosen": -259.89601643880206, "logps/rejected": -217.2681884765625, "loss": 0.4003, "rewards/chosen": 0.5982877016067505, "rewards/margins": 2.5274159908294678, "rewards/rejected": -1.9291282892227173, "step": 13844 }, { "epoch": 0.7338404049505738, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -89637504.0, "logits/rejected": -37722832.0, "logps/chosen": -427.02374267578125, "logps/rejected": -371.0039876302083, "loss": 0.2495, "rewards/chosen": 0.1403457671403885, "rewards/margins": 2.1922986060380936, "rewards/rejected": -2.051952838897705, "step": 13845 }, { "epoch": 0.7338934089523759, "grad_norm": 28.125, "kl": 1.2075862884521484, "learning_rate": 5e-07, "logits/chosen": -7382813.333333333, "logits/rejected": -24766899.2, "logps/chosen": -121.81034342447917, "logps/rejected": -261.5366943359375, "loss": 0.1806, "rewards/chosen": 0.943274974822998, "rewards/margins": 3.615672206878662, "rewards/rejected": -2.672397232055664, "step": 13846 }, { "epoch": 0.7339464129541781, "grad_norm": 37.5, "kl": 8.008109092712402, "learning_rate": 5e-07, "logits/chosen": -8576386.4, "logits/rejected": -24640762.666666668, "logps/chosen": -104.02877197265624, "logps/rejected": -306.0926920572917, "loss": 0.4541, "rewards/chosen": 0.2883659839630127, "rewards/margins": 2.4665673414866127, "rewards/rejected": -2.1782013575236, "step": 13847 }, { "epoch": 0.7339994169559801, "grad_norm": 60.5, "kl": 1.0042762756347656, "learning_rate": 5e-07, "logits/chosen": -7207825.6, "logits/rejected": -12496362.666666666, "logps/chosen": -262.578564453125, "logps/rejected": -97.45182291666667, "loss": 0.3711, "rewards/chosen": 0.31177961826324463, "rewards/margins": 1.4696648518244426, "rewards/rejected": -1.157885233561198, "step": 13848 }, { "epoch": 0.7340524209577823, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35784896.0, "logits/rejected": -33947005.333333336, "logps/chosen": -781.4882202148438, "logps/rejected": -360.9792073567708, "loss": 0.1723, "rewards/chosen": 1.6859002113342285, "rewards/margins": 3.994351863861084, "rewards/rejected": -2.3084516525268555, "step": 13849 }, { "epoch": 0.7341054249595844, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33253746.666666668, "logits/rejected": -18012003.2, "logps/chosen": -436.2161051432292, "logps/rejected": -339.48134765625, "loss": 0.1873, "rewards/chosen": 0.6864456335703532, "rewards/margins": 3.9581955115000405, "rewards/rejected": -3.2717498779296874, "step": 13850 }, { "epoch": 0.7341584289613866, "grad_norm": 47.0, "kl": 3.1984634399414062, "learning_rate": 5e-07, "logits/chosen": -41994200.0, "logits/rejected": -10564570.0, "logps/chosen": -449.9688720703125, "logps/rejected": -262.6871337890625, "loss": 0.285, "rewards/chosen": 0.429373174905777, "rewards/margins": 2.201876312494278, "rewards/rejected": -1.772503137588501, "step": 13851 }, { "epoch": 0.7342114329631887, "grad_norm": 38.0, "kl": 2.1332473754882812, "learning_rate": 5e-07, "logits/chosen": -35897401.6, "logits/rejected": -20771909.333333332, "logps/chosen": -463.10654296875, "logps/rejected": -237.2690633138021, "loss": 0.2249, "rewards/chosen": 1.6316232681274414, "rewards/margins": 4.015746275583902, "rewards/rejected": -2.3841230074564614, "step": 13852 }, { "epoch": 0.7342644369649909, "grad_norm": 55.75, "kl": 0.775299072265625, "learning_rate": 5e-07, "logits/chosen": 3329229.3333333335, "logits/rejected": 46979651.2, "logps/chosen": -415.8400065104167, "logps/rejected": -314.69501953125, "loss": 0.2636, "rewards/chosen": 0.7611002922058105, "rewards/margins": 2.6122864723205566, "rewards/rejected": -1.851186180114746, "step": 13853 }, { "epoch": 0.734317440966793, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6061462.0, "logits/rejected": -33641640.0, "logps/chosen": -202.25067138671875, "logps/rejected": -332.2613830566406, "loss": 0.267, "rewards/chosen": 0.28416261076927185, "rewards/margins": 2.626492828130722, "rewards/rejected": -2.34233021736145, "step": 13854 }, { "epoch": 0.7343704449685952, "grad_norm": 48.75, "kl": 3.084257125854492, "learning_rate": 5e-07, "logits/chosen": -40070452.0, "logits/rejected": -25336802.0, "logps/chosen": -927.3594970703125, "logps/rejected": -229.58853149414062, "loss": 0.2446, "rewards/chosen": 1.5152573585510254, "rewards/margins": 2.922994613647461, "rewards/rejected": -1.4077372550964355, "step": 13855 }, { "epoch": 0.7344234489703972, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -341545.0, "logits/rejected": -8461792.0, "logps/chosen": -122.22615814208984, "logps/rejected": -278.6038818359375, "loss": 0.175, "rewards/chosen": 0.7257903814315796, "rewards/margins": 3.309047738711039, "rewards/rejected": -2.5832573572794595, "step": 13856 }, { "epoch": 0.7344764529721994, "grad_norm": 64.5, "kl": 0.9543018341064453, "learning_rate": 5e-07, "logits/chosen": -28685093.333333332, "logits/rejected": -62880520.0, "logps/chosen": -363.309326171875, "logps/rejected": -330.7202453613281, "loss": 0.3818, "rewards/chosen": 0.30836741129557294, "rewards/margins": 1.9572214285532634, "rewards/rejected": -1.6488540172576904, "step": 13857 }, { "epoch": 0.7345294569740015, "grad_norm": 60.25, "kl": 4.6959733963012695, "learning_rate": 5e-07, "logits/chosen": -20648185.333333332, "logits/rejected": -6971848.0, "logps/chosen": -306.9501953125, "logps/rejected": -118.43570709228516, "loss": 0.3404, "rewards/chosen": 1.160512129465739, "rewards/margins": 1.9462852875391643, "rewards/rejected": -0.7857731580734253, "step": 13858 }, { "epoch": 0.7345824609758037, "grad_norm": 54.0, "kl": 2.6883468627929688, "learning_rate": 5e-07, "logits/chosen": -32120192.0, "logits/rejected": -2758421.75, "logps/chosen": -472.87139892578125, "logps/rejected": -181.25216674804688, "loss": 0.2644, "rewards/chosen": 0.5545585751533508, "rewards/margins": 2.7955127358436584, "rewards/rejected": -2.2409541606903076, "step": 13859 }, { "epoch": 0.7346354649776058, "grad_norm": 64.0, "kl": 4.357702255249023, "learning_rate": 5e-07, "logits/chosen": -27605229.333333332, "logits/rejected": 47045192.0, "logps/chosen": -272.8861897786458, "logps/rejected": -436.5259704589844, "loss": 0.3567, "rewards/chosen": 0.9404458999633789, "rewards/margins": 2.6504664421081543, "rewards/rejected": -1.7100205421447754, "step": 13860 }, { "epoch": 0.734688468979408, "grad_norm": 36.25, "kl": 2.807464599609375, "learning_rate": 5e-07, "logits/chosen": -98729192.0, "logits/rejected": -36123312.0, "logps/chosen": -263.7004089355469, "logps/rejected": -352.70257568359375, "loss": 0.3286, "rewards/chosen": 0.08768616616725922, "rewards/margins": 3.3723383992910385, "rewards/rejected": -3.2846522331237793, "step": 13861 }, { "epoch": 0.7347414729812101, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55349384.0, "logits/rejected": -2218832.5, "logps/chosen": -349.1689453125, "logps/rejected": -126.80476379394531, "loss": 0.2563, "rewards/chosen": 0.6249359250068665, "rewards/margins": 3.546234905719757, "rewards/rejected": -2.9212989807128906, "step": 13862 }, { "epoch": 0.7347944769830123, "grad_norm": 48.5, "kl": 1.8893775939941406, "learning_rate": 5e-07, "logits/chosen": -6668904.0, "logits/rejected": -9956032.0, "logps/chosen": -420.0028483072917, "logps/rejected": -183.9736328125, "loss": 0.1926, "rewards/chosen": 0.5222241878509521, "rewards/margins": 4.018213415145874, "rewards/rejected": -3.495989227294922, "step": 13863 }, { "epoch": 0.7348474809848143, "grad_norm": 33.75, "kl": 0.3018455505371094, "learning_rate": 5e-07, "logits/chosen": -55527408.0, "logits/rejected": -12935312.0, "logps/chosen": -280.1390380859375, "logps/rejected": -124.33958740234375, "loss": 0.1408, "rewards/chosen": 1.2282268206278484, "rewards/margins": 4.395362631479899, "rewards/rejected": -3.167135810852051, "step": 13864 }, { "epoch": 0.7349004849866165, "grad_norm": 30.75, "kl": 0.8472318649291992, "learning_rate": 5e-07, "logits/chosen": -22782965.333333332, "logits/rejected": -21460212.8, "logps/chosen": -536.31298828125, "logps/rejected": -274.49072265625, "loss": 0.1773, "rewards/chosen": 1.0649386246999104, "rewards/margins": 3.956471522649129, "rewards/rejected": -2.8915328979492188, "step": 13865 }, { "epoch": 0.7349534889884186, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25973104.0, "logits/rejected": -22007969.6, "logps/chosen": -221.81201171875, "logps/rejected": -238.111279296875, "loss": 0.3593, "rewards/chosen": -0.5738592147827148, "rewards/margins": 0.9767431259155273, "rewards/rejected": -1.550602340698242, "step": 13866 }, { "epoch": 0.7350064929902208, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3295486.5, "logits/rejected": -48862136.0, "logps/chosen": -175.0709686279297, "logps/rejected": -337.97369384765625, "loss": 0.2092, "rewards/chosen": 0.63544762134552, "rewards/margins": 4.748921990394592, "rewards/rejected": -4.113474369049072, "step": 13867 }, { "epoch": 0.7350594969920229, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2709904.5, "logits/rejected": -25684381.333333332, "logps/chosen": -39.78740692138672, "logps/rejected": -469.5028483072917, "loss": 0.2035, "rewards/chosen": -0.016116715967655182, "rewards/margins": 3.0941165909171104, "rewards/rejected": -3.1102333068847656, "step": 13868 }, { "epoch": 0.7351125009938251, "grad_norm": 54.5, "kl": 0.9080476760864258, "learning_rate": 5e-07, "logits/chosen": 33614306.666666664, "logits/rejected": -62604992.0, "logps/chosen": -180.18758138020834, "logps/rejected": -294.137646484375, "loss": 0.3187, "rewards/chosen": -0.17175596952438354, "rewards/margins": 2.1389656186103823, "rewards/rejected": -2.310721588134766, "step": 13869 }, { "epoch": 0.7351655049956272, "grad_norm": 79.0, "kl": 0.5748977661132812, "learning_rate": 5e-07, "logits/chosen": -11698889.333333334, "logits/rejected": 4629306.5, "logps/chosen": -469.50830078125, "logps/rejected": -70.32434844970703, "loss": 0.4298, "rewards/chosen": 0.42262542247772217, "rewards/margins": 0.6352618038654327, "rewards/rejected": -0.21263638138771057, "step": 13870 }, { "epoch": 0.7352185089974294, "grad_norm": 70.5, "kl": 2.3469276428222656, "learning_rate": 5e-07, "logits/chosen": -18064556.8, "logits/rejected": -64482160.0, "logps/chosen": -516.96474609375, "logps/rejected": -164.80514526367188, "loss": 0.3461, "rewards/chosen": 0.53880615234375, "rewards/margins": 2.919430605570475, "rewards/rejected": -2.380624453226725, "step": 13871 }, { "epoch": 0.7352715129992314, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6828815.2, "logits/rejected": -25597720.0, "logps/chosen": -369.8128173828125, "logps/rejected": -328.6697591145833, "loss": 0.2444, "rewards/chosen": 0.9190324783325196, "rewards/margins": 3.655422910054525, "rewards/rejected": -2.7363904317220054, "step": 13872 }, { "epoch": 0.7353245170010336, "grad_norm": 52.75, "kl": 2.384127616882324, "learning_rate": 5e-07, "logits/chosen": -4638190.8, "logits/rejected": -22557805.333333332, "logps/chosen": -241.72099609375, "logps/rejected": -208.2648722330729, "loss": 0.3359, "rewards/chosen": 0.5307438850402832, "rewards/margins": 1.7951266447703045, "rewards/rejected": -1.2643827597300212, "step": 13873 }, { "epoch": 0.7353775210028357, "grad_norm": 46.25, "kl": 1.0811805725097656, "learning_rate": 5e-07, "logits/chosen": -20205584.0, "logits/rejected": -15608845.333333334, "logps/chosen": -557.9609985351562, "logps/rejected": -226.53125, "loss": 0.1646, "rewards/chosen": 1.2385163307189941, "rewards/margins": 3.186240990956624, "rewards/rejected": -1.9477246602376301, "step": 13874 }, { "epoch": 0.7354305250046379, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76864416.0, "logits/rejected": -16426238.0, "logps/chosen": -442.9613037109375, "logps/rejected": -249.5978240966797, "loss": 0.2368, "rewards/chosen": 0.7497814893722534, "rewards/margins": 4.103744149208069, "rewards/rejected": -3.3539626598358154, "step": 13875 }, { "epoch": 0.73548352900644, "grad_norm": 28.25, "kl": 0.46118927001953125, "learning_rate": 5e-07, "logits/chosen": -838446.5, "logits/rejected": -31883221.333333332, "logps/chosen": -108.41153717041016, "logps/rejected": -342.1309407552083, "loss": 0.1468, "rewards/chosen": 0.13904142379760742, "rewards/margins": 3.8369997342427573, "rewards/rejected": -3.69795831044515, "step": 13876 }, { "epoch": 0.7355365330082422, "grad_norm": 39.5, "kl": 0.4175567626953125, "learning_rate": 5e-07, "logits/chosen": -36465116.0, "logits/rejected": -11691795.0, "logps/chosen": -271.1742248535156, "logps/rejected": -258.6274719238281, "loss": 0.2583, "rewards/chosen": 0.4075315594673157, "rewards/margins": 4.309938728809357, "rewards/rejected": -3.902407169342041, "step": 13877 }, { "epoch": 0.7355895370100443, "grad_norm": 55.0, "kl": 2.4502086639404297, "learning_rate": 5e-07, "logits/chosen": -223935.375, "logits/rejected": -5014208.0, "logps/chosen": -268.20880126953125, "logps/rejected": -247.818359375, "loss": 0.3793, "rewards/chosen": 0.5635451674461365, "rewards/margins": 1.7828447222709656, "rewards/rejected": -1.219299554824829, "step": 13878 }, { "epoch": 0.7356425410118463, "grad_norm": 35.25, "kl": 3.7280807495117188, "learning_rate": 5e-07, "logits/chosen": -6895200.5, "logits/rejected": 2370568.75, "logps/chosen": -209.2598114013672, "logps/rejected": -233.41766357421875, "loss": 0.2624, "rewards/chosen": 1.1535128355026245, "rewards/margins": 2.862181782722473, "rewards/rejected": -1.7086689472198486, "step": 13879 }, { "epoch": 0.7356955450136485, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19445472.0, "logits/rejected": -3930332.4, "logps/chosen": -244.92215983072916, "logps/rejected": -362.543310546875, "loss": 0.2464, "rewards/chosen": 0.254553218682607, "rewards/margins": 3.028104015191396, "rewards/rejected": -2.773550796508789, "step": 13880 }, { "epoch": 0.7357485490154506, "grad_norm": 67.5, "kl": 2.021575927734375, "learning_rate": 5e-07, "logits/chosen": -23768867.2, "logits/rejected": -27067098.666666668, "logps/chosen": -245.83232421875, "logps/rejected": -251.02242024739584, "loss": 0.3525, "rewards/chosen": 0.05680079460144043, "rewards/margins": 2.3417092482248942, "rewards/rejected": -2.2849084536234536, "step": 13881 }, { "epoch": 0.7358015530172528, "grad_norm": 92.5, "kl": 2.0920562744140625, "learning_rate": 5e-07, "logits/chosen": -15876465.6, "logits/rejected": -19673536.0, "logps/chosen": -347.102978515625, "logps/rejected": -228.70157877604166, "loss": 0.264, "rewards/chosen": 0.9173138618469239, "rewards/margins": 2.7565523465474446, "rewards/rejected": -1.8392384847005208, "step": 13882 }, { "epoch": 0.7358545570190549, "grad_norm": 32.0, "kl": 1.4221343994140625, "learning_rate": 5e-07, "logits/chosen": -41033896.0, "logits/rejected": -22004516.57142857, "logps/chosen": -1200.52685546875, "logps/rejected": -370.1412876674107, "loss": 0.0856, "rewards/chosen": 4.790332317352295, "rewards/margins": 7.276663167136056, "rewards/rejected": -2.4863308497837613, "step": 13883 }, { "epoch": 0.7359075610208571, "grad_norm": 56.25, "kl": 3.6273574829101562, "learning_rate": 5e-07, "logits/chosen": 31935398.0, "logits/rejected": -9647342.0, "logps/chosen": -265.8600769042969, "logps/rejected": -192.71612548828125, "loss": 0.422, "rewards/chosen": 0.35430651903152466, "rewards/margins": 1.5428122878074646, "rewards/rejected": -1.18850576877594, "step": 13884 }, { "epoch": 0.7359605650226592, "grad_norm": 48.0, "kl": 1.5349159240722656, "learning_rate": 5e-07, "logits/chosen": -10414005.6, "logits/rejected": 13318098.666666666, "logps/chosen": -466.9162109375, "logps/rejected": -121.14569091796875, "loss": 0.2084, "rewards/chosen": 1.2673643112182618, "rewards/margins": 3.6918638229370115, "rewards/rejected": -2.42449951171875, "step": 13885 }, { "epoch": 0.7360135690244614, "grad_norm": 46.0, "kl": 1.4144210815429688, "learning_rate": 5e-07, "logits/chosen": -23801131.2, "logits/rejected": -12540684.0, "logps/chosen": -381.740478515625, "logps/rejected": -564.5895182291666, "loss": 0.2207, "rewards/chosen": 0.9743570327758789, "rewards/margins": 4.290926043192545, "rewards/rejected": -3.3165690104166665, "step": 13886 }, { "epoch": 0.7360665730262634, "grad_norm": 42.5, "kl": 1.0635976791381836, "learning_rate": 5e-07, "logits/chosen": -58923392.0, "logits/rejected": -40372532.0, "logps/chosen": -219.83456420898438, "logps/rejected": -397.2326965332031, "loss": 0.22, "rewards/chosen": 0.7405796051025391, "rewards/margins": 3.5069780349731445, "rewards/rejected": -2.7663984298706055, "step": 13887 }, { "epoch": 0.7361195770280656, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73581696.0, "logits/rejected": -13130070.0, "logps/chosen": -312.7159118652344, "logps/rejected": -164.672607421875, "loss": 0.2837, "rewards/chosen": 0.0945281982421875, "rewards/margins": 2.4876456260681152, "rewards/rejected": -2.3931174278259277, "step": 13888 }, { "epoch": 0.7361725810298677, "grad_norm": 52.5, "kl": 1.6684322357177734, "learning_rate": 5e-07, "logits/chosen": -41968346.666666664, "logits/rejected": 13483114.0, "logps/chosen": -267.1108805338542, "logps/rejected": -180.15155029296875, "loss": 0.4412, "rewards/chosen": -0.046597421169281006, "rewards/margins": 1.3523032069206238, "rewards/rejected": -1.3989006280899048, "step": 13889 }, { "epoch": 0.7362255850316699, "grad_norm": 44.75, "kl": 1.2430458068847656, "learning_rate": 5e-07, "logits/chosen": -30823538.0, "logits/rejected": -55229272.0, "logps/chosen": -240.51803588867188, "logps/rejected": -314.50830078125, "loss": 0.2826, "rewards/chosen": 0.2952040135860443, "rewards/margins": 2.9646194875240326, "rewards/rejected": -2.6694154739379883, "step": 13890 }, { "epoch": 0.736278589033472, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55183704.0, "logits/rejected": -31237784.0, "logps/chosen": -202.9293212890625, "logps/rejected": -254.33721923828125, "loss": 0.2638, "rewards/chosen": 0.3677421510219574, "rewards/margins": 2.7852111756801605, "rewards/rejected": -2.417469024658203, "step": 13891 }, { "epoch": 0.7363315930352742, "grad_norm": 52.75, "kl": 0.7883491516113281, "learning_rate": 5e-07, "logits/chosen": -13637402.666666666, "logits/rejected": -79230643.2, "logps/chosen": -413.4510091145833, "logps/rejected": -237.2023681640625, "loss": 0.2431, "rewards/chosen": 0.540924072265625, "rewards/margins": 2.2647216796875, "rewards/rejected": -1.723797607421875, "step": 13892 }, { "epoch": 0.7363845970370763, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 551819.75, "logits/rejected": 1218948.3333333333, "logps/chosen": -119.00178527832031, "logps/rejected": -424.3634440104167, "loss": 0.1553, "rewards/chosen": 0.6365599036216736, "rewards/margins": 3.6533254981040955, "rewards/rejected": -3.016765594482422, "step": 13893 }, { "epoch": 0.7364376010388785, "grad_norm": 74.0, "kl": 1.0283308029174805, "learning_rate": 5e-07, "logits/chosen": -3343845.6666666665, "logits/rejected": 1708236.0, "logps/chosen": -148.41315714518228, "logps/rejected": -71.62544250488281, "loss": 0.3792, "rewards/chosen": 0.470858097076416, "rewards/margins": 2.1431851387023926, "rewards/rejected": -1.6723270416259766, "step": 13894 }, { "epoch": 0.7364906050406805, "grad_norm": 55.5, "kl": 2.6857337951660156, "learning_rate": 5e-07, "logits/chosen": -36659692.8, "logits/rejected": -29290512.0, "logps/chosen": -230.93544921875, "logps/rejected": -297.93967692057294, "loss": 0.3589, "rewards/chosen": 0.6612979412078858, "rewards/margins": 2.224609676996867, "rewards/rejected": -1.5633117357889812, "step": 13895 }, { "epoch": 0.7365436090424827, "grad_norm": 26.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12885215.0, "logits/rejected": -38444288.0, "logps/chosen": -17.052692413330078, "logps/rejected": -420.4094935825893, "loss": 0.1886, "rewards/chosen": -0.0208740234375, "rewards/margins": 2.533377238682338, "rewards/rejected": -2.554251262119838, "step": 13896 }, { "epoch": 0.7365966130442848, "grad_norm": 35.25, "kl": 1.1448841094970703, "learning_rate": 5e-07, "logits/chosen": -49250661.333333336, "logits/rejected": -6694985.2, "logps/chosen": -195.2106730143229, "logps/rejected": -159.4839599609375, "loss": 0.3182, "rewards/chosen": -0.07132607698440552, "rewards/margins": 1.6937190890312195, "rewards/rejected": -1.765045166015625, "step": 13897 }, { "epoch": 0.736649617046087, "grad_norm": 48.75, "kl": 0.037872314453125, "learning_rate": 5e-07, "logits/chosen": -18689966.0, "logits/rejected": -57807120.0, "logps/chosen": -120.14982604980469, "logps/rejected": -413.20654296875, "loss": 0.2182, "rewards/chosen": 0.4256626069545746, "rewards/margins": 3.9960547387599945, "rewards/rejected": -3.57039213180542, "step": 13898 }, { "epoch": 0.7367026210478891, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44261712.0, "logits/rejected": -3245121.3333333335, "logps/chosen": -76.91400146484375, "logps/rejected": -406.5045166015625, "loss": 0.2078, "rewards/chosen": 0.012988854199647903, "rewards/margins": 2.6384345379968486, "rewards/rejected": -2.6254456837972007, "step": 13899 }, { "epoch": 0.7367556250496913, "grad_norm": 53.0, "kl": 0.37877655029296875, "learning_rate": 5e-07, "logits/chosen": -16361305.0, "logits/rejected": -32029872.0, "logps/chosen": -378.2770080566406, "logps/rejected": -445.4280090332031, "loss": 0.2333, "rewards/chosen": 0.8823603987693787, "rewards/margins": 3.2996641993522644, "rewards/rejected": -2.4173038005828857, "step": 13900 }, { "epoch": 0.7368086290514934, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47583764.0, "logits/rejected": -48803092.0, "logps/chosen": -230.1773681640625, "logps/rejected": -331.5003662109375, "loss": 0.2965, "rewards/chosen": -0.012011539191007614, "rewards/margins": 2.950830925256014, "rewards/rejected": -2.9628424644470215, "step": 13901 }, { "epoch": 0.7368616330532956, "grad_norm": 36.75, "kl": 2.679452896118164, "learning_rate": 5e-07, "logits/chosen": -20719057.6, "logits/rejected": -58482437.333333336, "logps/chosen": -167.8222412109375, "logps/rejected": -553.5630289713541, "loss": 0.3086, "rewards/chosen": 0.3379435777664185, "rewards/margins": 4.238851094245911, "rewards/rejected": -3.900907516479492, "step": 13902 }, { "epoch": 0.7369146370550976, "grad_norm": 77.0, "kl": 0.7570466995239258, "learning_rate": 5e-07, "logits/chosen": -61041452.0, "logits/rejected": -12510745.0, "logps/chosen": -488.7450256347656, "logps/rejected": -211.0194549560547, "loss": 0.2935, "rewards/chosen": 0.5955970287322998, "rewards/margins": 2.5652908086776733, "rewards/rejected": -1.9696937799453735, "step": 13903 }, { "epoch": 0.7369676410568998, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35966221.333333336, "logits/rejected": -51500473.6, "logps/chosen": -362.8993326822917, "logps/rejected": -272.1449462890625, "loss": 0.2468, "rewards/chosen": 0.40235475699106854, "rewards/margins": 2.7727134148279826, "rewards/rejected": -2.3703586578369142, "step": 13904 }, { "epoch": 0.7370206450587019, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17648306.0, "logits/rejected": -27183314.0, "logps/chosen": -196.38943481445312, "logps/rejected": -261.71868896484375, "loss": 0.2791, "rewards/chosen": 0.16229481995105743, "rewards/margins": 2.792294457554817, "rewards/rejected": -2.6299996376037598, "step": 13905 }, { "epoch": 0.7370736490605041, "grad_norm": 41.0, "kl": 3.8121891021728516, "learning_rate": 5e-07, "logits/chosen": -33997648.0, "logits/rejected": 1974632.75, "logps/chosen": -392.7252604166667, "logps/rejected": -46.821590423583984, "loss": 0.4674, "rewards/chosen": 0.38412487506866455, "rewards/margins": 1.9003478288650513, "rewards/rejected": -1.5162229537963867, "step": 13906 }, { "epoch": 0.7371266530623062, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55696085.333333336, "logits/rejected": -92343256.0, "logps/chosen": -247.1004638671875, "logps/rejected": -827.0255126953125, "loss": 0.3446, "rewards/chosen": 0.29325244824091595, "rewards/margins": 4.217840413252513, "rewards/rejected": -3.9245879650115967, "step": 13907 }, { "epoch": 0.7371796570641084, "grad_norm": 59.25, "kl": 4.073551177978516, "learning_rate": 5e-07, "logits/chosen": -61876058.666666664, "logits/rejected": 2059062.625, "logps/chosen": -544.2920328776041, "logps/rejected": -48.636043548583984, "loss": 0.4206, "rewards/chosen": 0.57531209786733, "rewards/margins": 1.3191826542218528, "rewards/rejected": -0.7438705563545227, "step": 13908 }, { "epoch": 0.7372326610659105, "grad_norm": 42.0, "kl": 1.8197498321533203, "learning_rate": 5e-07, "logits/chosen": -47550168.0, "logits/rejected": -30473384.0, "logps/chosen": -328.495361328125, "logps/rejected": -284.1901550292969, "loss": 0.2771, "rewards/chosen": 0.5918530225753784, "rewards/margins": 3.3412145376205444, "rewards/rejected": -2.749361515045166, "step": 13909 }, { "epoch": 0.7372856650677126, "grad_norm": 43.5, "kl": 1.8035430908203125, "learning_rate": 5e-07, "logits/chosen": -33971644.0, "logits/rejected": -27676064.0, "logps/chosen": -411.45587158203125, "logps/rejected": -668.15771484375, "loss": 0.2128, "rewards/chosen": 0.8077551126480103, "rewards/margins": 4.397358059883118, "rewards/rejected": -3.5896029472351074, "step": 13910 }, { "epoch": 0.7373386690695147, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49071584.0, "logits/rejected": -81667014.4, "logps/chosen": -350.6969401041667, "logps/rejected": -295.9822021484375, "loss": 0.213, "rewards/chosen": 0.1794171134630839, "rewards/margins": 3.4948567191759743, "rewards/rejected": -3.3154396057128905, "step": 13911 }, { "epoch": 0.7373916730713169, "grad_norm": 38.75, "kl": 1.1414222717285156, "learning_rate": 5e-07, "logits/chosen": -3306653.6, "logits/rejected": -3660027.6666666665, "logps/chosen": -162.6589111328125, "logps/rejected": -253.66780598958334, "loss": 0.32, "rewards/chosen": 0.5115867614746094, "rewards/margins": 2.3732019424438477, "rewards/rejected": -1.8616151809692383, "step": 13912 }, { "epoch": 0.737444677073119, "grad_norm": 45.25, "kl": 1.3937416076660156, "learning_rate": 5e-07, "logits/chosen": -41218521.6, "logits/rejected": -509165.1666666667, "logps/chosen": -290.728759765625, "logps/rejected": -102.88511149088542, "loss": 0.3383, "rewards/chosen": 0.37164318561553955, "rewards/margins": 3.5691694815953574, "rewards/rejected": -3.197526295979818, "step": 13913 }, { "epoch": 0.7374976810749212, "grad_norm": 52.25, "kl": 1.2690448760986328, "learning_rate": 5e-07, "logits/chosen": -39068515.2, "logits/rejected": -5405404.0, "logps/chosen": -409.648046875, "logps/rejected": -418.5857340494792, "loss": 0.3184, "rewards/chosen": 0.4518467426300049, "rewards/margins": 2.425659481684367, "rewards/rejected": -1.973812739054362, "step": 13914 }, { "epoch": 0.7375506850767233, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20517652.0, "logits/rejected": -25271434.666666668, "logps/chosen": -211.6554718017578, "logps/rejected": -268.4659016927083, "loss": 0.1332, "rewards/chosen": 1.4480907917022705, "rewards/margins": 4.161212841669718, "rewards/rejected": -2.7131220499674478, "step": 13915 }, { "epoch": 0.7376036890785255, "grad_norm": 54.0, "kl": 1.824127197265625, "learning_rate": 5e-07, "logits/chosen": -14292619.2, "logits/rejected": -101104586.66666667, "logps/chosen": -435.6314453125, "logps/rejected": -101.65260823567708, "loss": 0.2317, "rewards/chosen": 1.1132769584655762, "rewards/margins": 3.8773814837137857, "rewards/rejected": -2.7641045252482095, "step": 13916 }, { "epoch": 0.7376566930803276, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17778066.0, "logits/rejected": -7677974.666666667, "logps/chosen": -23.52462387084961, "logps/rejected": -236.4725545247396, "loss": 0.2558, "rewards/chosen": -0.11430082470178604, "rewards/margins": 2.231822393834591, "rewards/rejected": -2.346123218536377, "step": 13917 }, { "epoch": 0.7377096970821297, "grad_norm": 41.25, "kl": 0.5509452819824219, "learning_rate": 5e-07, "logits/chosen": -25466482.0, "logits/rejected": -28246298.0, "logps/chosen": -371.3420715332031, "logps/rejected": -542.6702880859375, "loss": 0.2978, "rewards/chosen": 0.4273303747177124, "rewards/margins": 2.7822526693344116, "rewards/rejected": -2.354922294616699, "step": 13918 }, { "epoch": 0.7377627010839318, "grad_norm": 46.0, "kl": 1.7310237884521484, "learning_rate": 5e-07, "logits/chosen": -65365596.0, "logits/rejected": -3015106.75, "logps/chosen": -640.1409912109375, "logps/rejected": -113.76554870605469, "loss": 0.2839, "rewards/chosen": 1.1757726669311523, "rewards/margins": 2.5649763345718384, "rewards/rejected": -1.389203667640686, "step": 13919 }, { "epoch": 0.737815705085734, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80380874.66666667, "logits/rejected": -9110413.6, "logps/chosen": -277.97414143880206, "logps/rejected": -170.9186767578125, "loss": 0.2786, "rewards/chosen": -0.14549662669499716, "rewards/margins": 2.4281914989153544, "rewards/rejected": -2.5736881256103517, "step": 13920 }, { "epoch": 0.7378687090875361, "grad_norm": 62.75, "kl": 0.5494976043701172, "learning_rate": 5e-07, "logits/chosen": -12560573.0, "logits/rejected": -34545260.0, "logps/chosen": -121.70772552490234, "logps/rejected": -533.358642578125, "loss": 0.3514, "rewards/chosen": -0.2045934647321701, "rewards/margins": 2.6533205062150955, "rewards/rejected": -2.8579139709472656, "step": 13921 }, { "epoch": 0.7379217130893383, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68322416.0, "logits/rejected": -19733186.285714287, "logps/chosen": -641.4910888671875, "logps/rejected": -172.36952427455358, "loss": 0.1437, "rewards/chosen": 0.163604736328125, "rewards/margins": 2.6843910217285156, "rewards/rejected": -2.5207862854003906, "step": 13922 }, { "epoch": 0.7379747170911404, "grad_norm": 45.75, "kl": 1.6775131225585938, "learning_rate": 5e-07, "logits/chosen": -44569608.0, "logits/rejected": -13965812.0, "logps/chosen": -257.9295349121094, "logps/rejected": -219.86781311035156, "loss": 0.2506, "rewards/chosen": 0.7136996984481812, "rewards/margins": 3.115352988243103, "rewards/rejected": -2.401653289794922, "step": 13923 }, { "epoch": 0.7380277210929426, "grad_norm": 57.25, "kl": 0.4429473876953125, "learning_rate": 5e-07, "logits/chosen": -31367891.2, "logits/rejected": -26244533.333333332, "logps/chosen": -321.726171875, "logps/rejected": -165.62914021809897, "loss": 0.3911, "rewards/chosen": -0.06778464913368225, "rewards/margins": 1.8622396568457287, "rewards/rejected": -1.9300243059794109, "step": 13924 }, { "epoch": 0.7380807250947446, "grad_norm": 53.5, "kl": 0.39141273498535156, "learning_rate": 5e-07, "logits/chosen": -21306544.0, "logits/rejected": -25290950.4, "logps/chosen": -357.7527669270833, "logps/rejected": -370.15751953125, "loss": 0.2414, "rewards/chosen": 0.5087341467539469, "rewards/margins": 3.0974134604136148, "rewards/rejected": -2.588679313659668, "step": 13925 }, { "epoch": 0.7381337290965468, "grad_norm": 49.25, "kl": 0.48223114013671875, "learning_rate": 5e-07, "logits/chosen": -46861689.6, "logits/rejected": -1789296.6666666667, "logps/chosen": -353.708447265625, "logps/rejected": -463.7765706380208, "loss": 0.2256, "rewards/chosen": 0.8493674278259278, "rewards/margins": 4.262968349456787, "rewards/rejected": -3.4136009216308594, "step": 13926 }, { "epoch": 0.7381867330983489, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3145430.0, "logits/rejected": -90726816.0, "logps/chosen": -196.81282552083334, "logps/rejected": -456.039111328125, "loss": 0.1615, "rewards/chosen": 1.0463109016418457, "rewards/margins": 3.409735584259033, "rewards/rejected": -2.3634246826171874, "step": 13927 }, { "epoch": 0.7382397371001511, "grad_norm": 56.0, "kl": 1.9068584442138672, "learning_rate": 5e-07, "logits/chosen": -20470900.8, "logits/rejected": -41741664.0, "logps/chosen": -263.2383544921875, "logps/rejected": -142.01444498697916, "loss": 0.349, "rewards/chosen": 0.5397891044616699, "rewards/margins": 1.5995652516682943, "rewards/rejected": -1.0597761472066243, "step": 13928 }, { "epoch": 0.7382927411019532, "grad_norm": 45.25, "kl": 2.792543411254883, "learning_rate": 5e-07, "logits/chosen": -32047987.2, "logits/rejected": -24391200.0, "logps/chosen": -229.8234375, "logps/rejected": -258.42616780598956, "loss": 0.2891, "rewards/chosen": 0.734286880493164, "rewards/margins": 4.505497550964355, "rewards/rejected": -3.7712106704711914, "step": 13929 }, { "epoch": 0.7383457451037553, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22225820.0, "logits/rejected": -22209569.6, "logps/chosen": -241.10919189453125, "logps/rejected": -138.67412109375, "loss": 0.2355, "rewards/chosen": 0.4171437819798787, "rewards/margins": 2.6842298110326133, "rewards/rejected": -2.2670860290527344, "step": 13930 }, { "epoch": 0.7383987491055575, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55791514.666666664, "logits/rejected": -40723478.4, "logps/chosen": -307.995361328125, "logps/rejected": -291.747509765625, "loss": 0.286, "rewards/chosen": 0.19739766915639242, "rewards/margins": 2.8980890830357873, "rewards/rejected": -2.7006914138793947, "step": 13931 }, { "epoch": 0.7384517531073596, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28144792.0, "logits/rejected": -32400838.0, "logps/chosen": -166.9818878173828, "logps/rejected": -442.4173583984375, "loss": 0.2119, "rewards/chosen": 1.027557611465454, "rewards/margins": 3.5014243125915527, "rewards/rejected": -2.4738667011260986, "step": 13932 }, { "epoch": 0.7385047571091617, "grad_norm": 47.75, "kl": 0.46489715576171875, "learning_rate": 5e-07, "logits/chosen": -38326376.0, "logits/rejected": -55197080.0, "logps/chosen": -505.3455505371094, "logps/rejected": -277.97027587890625, "loss": 0.2367, "rewards/chosen": 0.6821007132530212, "rewards/margins": 2.8240696787834167, "rewards/rejected": -2.1419689655303955, "step": 13933 }, { "epoch": 0.7385577611109638, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31373384.0, "logits/rejected": -29595619.2, "logps/chosen": -306.96645100911456, "logps/rejected": -322.682421875, "loss": 0.2179, "rewards/chosen": 0.2573506434758504, "rewards/margins": 2.9262728770573934, "rewards/rejected": -2.668922233581543, "step": 13934 }, { "epoch": 0.738610765112766, "grad_norm": 40.25, "kl": 0.4549436569213867, "learning_rate": 5e-07, "logits/chosen": 6504736.0, "logits/rejected": -74693904.0, "logps/chosen": -74.18328857421875, "logps/rejected": -352.6202392578125, "loss": 0.2773, "rewards/chosen": 0.5959795713424683, "rewards/margins": 2.393918752670288, "rewards/rejected": -1.7979391813278198, "step": 13935 }, { "epoch": 0.7386637691145681, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30460432.0, "logits/rejected": -20445924.0, "logps/chosen": -351.73529052734375, "logps/rejected": -441.4412536621094, "loss": 0.242, "rewards/chosen": 0.5569177865982056, "rewards/margins": 3.518955111503601, "rewards/rejected": -2.9620373249053955, "step": 13936 }, { "epoch": 0.7387167731163703, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27603877.333333332, "logits/rejected": -42085088.0, "logps/chosen": -410.0477701822917, "logps/rejected": -350.06513671875, "loss": 0.1347, "rewards/chosen": 1.9955204327901204, "rewards/margins": 4.260699876149495, "rewards/rejected": -2.265179443359375, "step": 13937 }, { "epoch": 0.7387697771181724, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22903720.0, "logits/rejected": 80134156.8, "logps/chosen": -204.73225911458334, "logps/rejected": -370.911083984375, "loss": 0.2176, "rewards/chosen": 0.44865373770395917, "rewards/margins": 2.7157866875330607, "rewards/rejected": -2.2671329498291017, "step": 13938 }, { "epoch": 0.7388227811199746, "grad_norm": 41.5, "kl": 2.91339111328125, "learning_rate": 5e-07, "logits/chosen": -9720926.666666666, "logits/rejected": -42765078.4, "logps/chosen": -602.973388671875, "logps/rejected": -428.99921875, "loss": 0.1515, "rewards/chosen": 2.3560543060302734, "rewards/margins": 5.791413116455078, "rewards/rejected": -3.4353588104248045, "step": 13939 }, { "epoch": 0.7388757851217767, "grad_norm": 59.5, "kl": 1.2676448822021484, "learning_rate": 5e-07, "logits/chosen": -35958960.0, "logits/rejected": -78759088.0, "logps/chosen": -323.4600524902344, "logps/rejected": -568.9774169921875, "loss": 0.2672, "rewards/chosen": 0.37426358461380005, "rewards/margins": 3.1408491730690002, "rewards/rejected": -2.7665855884552, "step": 13940 }, { "epoch": 0.7389287891235788, "grad_norm": 57.0, "kl": 1.30743408203125, "learning_rate": 5e-07, "logits/chosen": -1240183.5, "logits/rejected": -13513093.333333334, "logps/chosen": -233.76309204101562, "logps/rejected": -297.8056233723958, "loss": 0.271, "rewards/chosen": 0.03737753629684448, "rewards/margins": 1.8078551491101582, "rewards/rejected": -1.7704776128133137, "step": 13941 }, { "epoch": 0.7389817931253809, "grad_norm": 53.0, "kl": 0.8516387939453125, "learning_rate": 5e-07, "logits/chosen": -30015540.0, "logits/rejected": -58336680.0, "logps/chosen": -412.42633056640625, "logps/rejected": -455.1944580078125, "loss": 0.2576, "rewards/chosen": 0.386713445186615, "rewards/margins": 2.695933759212494, "rewards/rejected": -2.309220314025879, "step": 13942 }, { "epoch": 0.7390347971271831, "grad_norm": 41.0, "kl": 0.20002365112304688, "learning_rate": 5e-07, "logits/chosen": -36836312.0, "logits/rejected": -25817920.0, "logps/chosen": -301.098876953125, "logps/rejected": -282.4336344401042, "loss": 0.1834, "rewards/chosen": 0.6100303530693054, "rewards/margins": 3.0234524607658386, "rewards/rejected": -2.413422107696533, "step": 13943 }, { "epoch": 0.7390878011289852, "grad_norm": 44.0, "kl": 0.38895416259765625, "learning_rate": 5e-07, "logits/chosen": -36911112.0, "logits/rejected": -709280.0, "logps/chosen": -501.3711853027344, "logps/rejected": -383.8381042480469, "loss": 0.1612, "rewards/chosen": 1.6335196495056152, "rewards/margins": 3.981614589691162, "rewards/rejected": -2.348094940185547, "step": 13944 }, { "epoch": 0.7391408051307874, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64659.25, "logits/rejected": -20531501.333333332, "logps/chosen": -373.697265625, "logps/rejected": -296.8031412760417, "loss": 0.2676, "rewards/chosen": 0.4434448480606079, "rewards/margins": 2.126397093137105, "rewards/rejected": -1.6829522450764973, "step": 13945 }, { "epoch": 0.7391938091325895, "grad_norm": 40.75, "kl": 2.1845436096191406, "learning_rate": 5e-07, "logits/chosen": -20923214.4, "logits/rejected": -47397610.666666664, "logps/chosen": -121.1194091796875, "logps/rejected": -523.9579264322916, "loss": 0.2753, "rewards/chosen": 0.7228579521179199, "rewards/margins": 3.2170840899149575, "rewards/rejected": -2.4942261377970376, "step": 13946 }, { "epoch": 0.7392468131343917, "grad_norm": 58.0, "kl": 1.1092071533203125, "learning_rate": 5e-07, "logits/chosen": -10508537.142857144, "logits/rejected": -4957532.0, "logps/chosen": -355.485107421875, "logps/rejected": -179.87705993652344, "loss": 0.3731, "rewards/chosen": 0.5401332718985421, "rewards/margins": 6.208730970110212, "rewards/rejected": -5.66859769821167, "step": 13947 }, { "epoch": 0.7392998171361937, "grad_norm": 79.0, "kl": 3.896717071533203, "learning_rate": 5e-07, "logits/chosen": -18255733.333333332, "logits/rejected": -3034482.5, "logps/chosen": -320.8892008463542, "logps/rejected": -288.29443359375, "loss": 0.3041, "rewards/chosen": 0.911517063776652, "rewards/margins": 4.66070834795634, "rewards/rejected": -3.7491912841796875, "step": 13948 }, { "epoch": 0.7393528211379959, "grad_norm": 49.0, "kl": 1.3535633087158203, "learning_rate": 5e-07, "logits/chosen": -13687096.0, "logits/rejected": -23706662.0, "logps/chosen": -139.35391235351562, "logps/rejected": -423.9012451171875, "loss": 0.2261, "rewards/chosen": 0.999850869178772, "rewards/margins": 4.022797226905823, "rewards/rejected": -3.022946357727051, "step": 13949 }, { "epoch": 0.739405825139798, "grad_norm": 46.75, "kl": 0.7067756652832031, "learning_rate": 5e-07, "logits/chosen": -37691412.0, "logits/rejected": -61228364.0, "logps/chosen": -291.25787353515625, "logps/rejected": -471.68218994140625, "loss": 0.2411, "rewards/chosen": 0.6488169431686401, "rewards/margins": 3.7301875352859497, "rewards/rejected": -3.0813705921173096, "step": 13950 }, { "epoch": 0.7394588291416002, "grad_norm": 53.75, "kl": 0.4478750228881836, "learning_rate": 5e-07, "logits/chosen": -17464867.2, "logits/rejected": -63477130.666666664, "logps/chosen": -215.062255859375, "logps/rejected": -367.88720703125, "loss": 0.2915, "rewards/chosen": 0.540879774093628, "rewards/margins": 2.5409733931223553, "rewards/rejected": -2.000093619028727, "step": 13951 }, { "epoch": 0.7395118331434023, "grad_norm": 41.75, "kl": 2.2360572814941406, "learning_rate": 5e-07, "logits/chosen": -22097808.0, "logits/rejected": -21077794.666666668, "logps/chosen": -242.915966796875, "logps/rejected": -267.29595947265625, "loss": 0.3199, "rewards/chosen": 0.5668000221252442, "rewards/margins": 2.374466323852539, "rewards/rejected": -1.807666301727295, "step": 13952 }, { "epoch": 0.7395648371452045, "grad_norm": 51.0, "kl": 0.4305000305175781, "learning_rate": 5e-07, "logits/chosen": -12792729.0, "logits/rejected": -26752062.0, "logps/chosen": -308.72625732421875, "logps/rejected": -295.8543701171875, "loss": 0.3138, "rewards/chosen": 0.12059421837329865, "rewards/margins": 2.3051361590623856, "rewards/rejected": -2.184541940689087, "step": 13953 }, { "epoch": 0.7396178411470066, "grad_norm": 40.75, "kl": 0.5347633361816406, "learning_rate": 5e-07, "logits/chosen": -9041082.0, "logits/rejected": -15081881.0, "logps/chosen": -348.19317626953125, "logps/rejected": -487.1549072265625, "loss": 0.2146, "rewards/chosen": 1.4122660160064697, "rewards/margins": 4.670293092727661, "rewards/rejected": -3.2580270767211914, "step": 13954 }, { "epoch": 0.7396708451488088, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5171732.5, "logits/rejected": 5156761.0, "logps/chosen": -62.293121337890625, "logps/rejected": -163.61138916015625, "loss": 0.272, "rewards/chosen": 0.5637205243110657, "rewards/margins": 1.8567336201667786, "rewards/rejected": -1.293013095855713, "step": 13955 }, { "epoch": 0.7397238491506108, "grad_norm": 34.75, "kl": 0.2958183288574219, "learning_rate": 5e-07, "logits/chosen": -24204922.666666668, "logits/rejected": -43620761.6, "logps/chosen": -140.3183797200521, "logps/rejected": -265.8798095703125, "loss": 0.2662, "rewards/chosen": 0.08107287685076396, "rewards/margins": 3.461952088276545, "rewards/rejected": -3.380879211425781, "step": 13956 }, { "epoch": 0.739776853152413, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25577384.0, "logits/rejected": -3723078.8, "logps/chosen": -547.6332600911459, "logps/rejected": -293.3425537109375, "loss": 0.2793, "rewards/chosen": 0.8715312480926514, "rewards/margins": 2.4841758251190185, "rewards/rejected": -1.612644577026367, "step": 13957 }, { "epoch": 0.7398298571542151, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42949254.4, "logits/rejected": -45381530.666666664, "logps/chosen": -343.7966552734375, "logps/rejected": -432.7722574869792, "loss": 0.2656, "rewards/chosen": 0.5957022190093995, "rewards/margins": 2.8649913946787517, "rewards/rejected": -2.269289175669352, "step": 13958 }, { "epoch": 0.7398828611560173, "grad_norm": 81.0, "kl": 5.301206588745117, "learning_rate": 5e-07, "logits/chosen": -6914983.333333333, "logits/rejected": -14321088.0, "logps/chosen": -173.3501993815104, "logps/rejected": -368.6453369140625, "loss": 0.235, "rewards/chosen": 1.5283352533976238, "rewards/margins": 4.709971300760905, "rewards/rejected": -3.1816360473632814, "step": 13959 }, { "epoch": 0.7399358651578194, "grad_norm": 50.0, "kl": 0.12285995483398438, "learning_rate": 5e-07, "logits/chosen": -10621726.0, "logits/rejected": -27925100.8, "logps/chosen": -211.1279296875, "logps/rejected": -355.3526611328125, "loss": 0.3086, "rewards/chosen": -0.15168259541193643, "rewards/margins": 2.5311270435651143, "rewards/rejected": -2.6828096389770506, "step": 13960 }, { "epoch": 0.7399888691596216, "grad_norm": 44.25, "kl": 1.6721000671386719, "learning_rate": 5e-07, "logits/chosen": -29757811.2, "logits/rejected": -12632250.666666666, "logps/chosen": -389.1301513671875, "logps/rejected": -292.67156982421875, "loss": 0.264, "rewards/chosen": 0.7918646812438965, "rewards/margins": 3.741242694854736, "rewards/rejected": -2.94937801361084, "step": 13961 }, { "epoch": 0.7400418731614237, "grad_norm": 54.0, "kl": 6.265813827514648, "learning_rate": 5e-07, "logits/chosen": 7937578.5, "logits/rejected": -123815648.0, "logps/chosen": -448.20648193359375, "logps/rejected": -294.1592102050781, "loss": 0.2348, "rewards/chosen": 1.3292133808135986, "rewards/margins": 3.9999632835388184, "rewards/rejected": -2.6707499027252197, "step": 13962 }, { "epoch": 0.7400948771632259, "grad_norm": 44.0, "kl": 4.779058456420898, "learning_rate": 5e-07, "logits/chosen": 5981466.4, "logits/rejected": 5518949.333333333, "logps/chosen": -481.93818359375, "logps/rejected": -421.0109049479167, "loss": 0.2444, "rewards/chosen": 1.1432546615600585, "rewards/margins": 4.713359896341959, "rewards/rejected": -3.570105234781901, "step": 13963 }, { "epoch": 0.7401478811650279, "grad_norm": 43.75, "kl": 0.7047309875488281, "learning_rate": 5e-07, "logits/chosen": -15543850.0, "logits/rejected": -94734752.0, "logps/chosen": -195.32652282714844, "logps/rejected": -255.97769165039062, "loss": 0.2372, "rewards/chosen": 0.6047887206077576, "rewards/margins": 3.172336995601654, "rewards/rejected": -2.5675482749938965, "step": 13964 }, { "epoch": 0.7402008851668301, "grad_norm": 42.25, "kl": 0.009883880615234375, "learning_rate": 5e-07, "logits/chosen": -17534838.0, "logits/rejected": -35739336.0, "logps/chosen": -341.78076171875, "logps/rejected": -298.517578125, "loss": 0.1849, "rewards/chosen": 1.306605339050293, "rewards/margins": 3.6730451583862305, "rewards/rejected": -2.3664398193359375, "step": 13965 }, { "epoch": 0.7402538891686322, "grad_norm": 42.75, "kl": 0.8403520584106445, "learning_rate": 5e-07, "logits/chosen": -44946816.0, "logits/rejected": -2978682.8, "logps/chosen": -321.63979085286456, "logps/rejected": -134.10284423828125, "loss": 0.1956, "rewards/chosen": 1.3452259699503581, "rewards/margins": 3.157114664713542, "rewards/rejected": -1.8118886947631836, "step": 13966 }, { "epoch": 0.7403068931704344, "grad_norm": 49.0, "kl": 0.37212085723876953, "learning_rate": 5e-07, "logits/chosen": 3282672.3333333335, "logits/rejected": -14869576.0, "logps/chosen": -185.250732421875, "logps/rejected": -185.93367919921874, "loss": 0.3739, "rewards/chosen": -0.3470843235651652, "rewards/margins": 1.065209682782491, "rewards/rejected": -1.4122940063476563, "step": 13967 }, { "epoch": 0.7403598971722365, "grad_norm": 133.0, "kl": 1.598236083984375, "learning_rate": 5e-07, "logits/chosen": -31171542.4, "logits/rejected": -21517689.333333332, "logps/chosen": -190.3521728515625, "logps/rejected": -123.458984375, "loss": 0.4038, "rewards/chosen": 0.09559504985809326, "rewards/margins": 1.9823258797327679, "rewards/rejected": -1.8867308298746746, "step": 13968 }, { "epoch": 0.7404129011740387, "grad_norm": 57.25, "kl": 1.4268951416015625, "learning_rate": 5e-07, "logits/chosen": -19997488.0, "logits/rejected": -22011756.0, "logps/chosen": -197.0145721435547, "logps/rejected": -205.3878631591797, "loss": 0.2866, "rewards/chosen": 0.7376439571380615, "rewards/margins": 1.9536175727844238, "rewards/rejected": -1.2159736156463623, "step": 13969 }, { "epoch": 0.7404659051758408, "grad_norm": 56.25, "kl": 1.0630111694335938, "learning_rate": 5e-07, "logits/chosen": -51781659.428571425, "logits/rejected": -26912420.0, "logps/chosen": -340.5322963169643, "logps/rejected": -306.4054870605469, "loss": 0.3714, "rewards/chosen": 0.4979636328560965, "rewards/margins": 2.4622856037957326, "rewards/rejected": -1.9643219709396362, "step": 13970 }, { "epoch": 0.740518909177643, "grad_norm": 91.5, "kl": 0.5866889953613281, "learning_rate": 5e-07, "logits/chosen": -67939800.0, "logits/rejected": 8175437.333333333, "logps/chosen": -675.9479370117188, "logps/rejected": -237.80863444010416, "loss": 0.2313, "rewards/chosen": 1.4692809581756592, "rewards/margins": 2.696592092514038, "rewards/rejected": -1.227311134338379, "step": 13971 }, { "epoch": 0.740571913179445, "grad_norm": 47.0, "kl": 0.13479995727539062, "learning_rate": 5e-07, "logits/chosen": -89556360.0, "logits/rejected": -11847675.0, "logps/chosen": -449.20379638671875, "logps/rejected": -320.8963623046875, "loss": 0.1563, "rewards/chosen": 1.7428672313690186, "rewards/margins": 4.033419609069824, "rewards/rejected": -2.2905523777008057, "step": 13972 }, { "epoch": 0.7406249171812472, "grad_norm": 57.75, "kl": 6.287991523742676, "learning_rate": 5e-07, "logits/chosen": -22019179.42857143, "logits/rejected": -2805948.5, "logps/chosen": -297.3543178013393, "logps/rejected": -78.0994873046875, "loss": 0.4185, "rewards/chosen": 1.3420124053955078, "rewards/margins": 3.248307466506958, "rewards/rejected": -1.9062950611114502, "step": 13973 }, { "epoch": 0.7406779211830493, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25909158.0, "logits/rejected": 2411377.0, "logps/chosen": -424.37591552734375, "logps/rejected": -193.1939697265625, "loss": 0.2927, "rewards/chosen": 0.4848613440990448, "rewards/margins": 2.2906578481197357, "rewards/rejected": -1.805796504020691, "step": 13974 }, { "epoch": 0.7407309251848515, "grad_norm": 43.5, "kl": 3.3685569763183594, "learning_rate": 5e-07, "logits/chosen": -16280864.0, "logits/rejected": -27238224.0, "logps/chosen": -357.4989420572917, "logps/rejected": -507.579248046875, "loss": 0.2213, "rewards/chosen": 1.111143986384074, "rewards/margins": 3.6730897108713787, "rewards/rejected": -2.5619457244873045, "step": 13975 }, { "epoch": 0.7407839291866536, "grad_norm": 35.5, "kl": 2.803314208984375, "learning_rate": 5e-07, "logits/chosen": 5144606.0, "logits/rejected": -43145458.666666664, "logps/chosen": -672.97373046875, "logps/rejected": -411.693115234375, "loss": 0.2656, "rewards/chosen": 1.0629907608032227, "rewards/margins": 5.482965405782064, "rewards/rejected": -4.419974644978841, "step": 13976 }, { "epoch": 0.7408369331884558, "grad_norm": 41.5, "kl": 0.5572147369384766, "learning_rate": 5e-07, "logits/chosen": -58816852.0, "logits/rejected": -46471248.0, "logps/chosen": -210.86708068847656, "logps/rejected": -529.53125, "loss": 0.2508, "rewards/chosen": 0.8661904335021973, "rewards/margins": 4.805659294128418, "rewards/rejected": -3.9394688606262207, "step": 13977 }, { "epoch": 0.7408899371902579, "grad_norm": 54.0, "kl": 0.8590316772460938, "learning_rate": 5e-07, "logits/chosen": -105802128.0, "logits/rejected": -35461068.0, "logps/chosen": -395.68438720703125, "logps/rejected": -219.51651000976562, "loss": 0.2861, "rewards/chosen": 0.6746193170547485, "rewards/margins": 2.285489797592163, "rewards/rejected": -1.6108704805374146, "step": 13978 }, { "epoch": 0.74094294119206, "grad_norm": 60.0, "kl": 1.1337909698486328, "learning_rate": 5e-07, "logits/chosen": -2274574.5, "logits/rejected": -31953747.2, "logps/chosen": -763.3639322916666, "logps/rejected": -543.051611328125, "loss": 0.223, "rewards/chosen": 1.1852020422617595, "rewards/margins": 4.422666947046916, "rewards/rejected": -3.2374649047851562, "step": 13979 }, { "epoch": 0.7409959451938621, "grad_norm": 48.25, "kl": 0.7324867248535156, "learning_rate": 5e-07, "logits/chosen": -5586575.0, "logits/rejected": -50683620.0, "logps/chosen": -304.22418212890625, "logps/rejected": -305.6348876953125, "loss": 0.3216, "rewards/chosen": -0.19048666954040527, "rewards/margins": 2.5717780590057373, "rewards/rejected": -2.7622647285461426, "step": 13980 }, { "epoch": 0.7410489491956642, "grad_norm": 51.25, "kl": 0.24087142944335938, "learning_rate": 5e-07, "logits/chosen": 11014352.0, "logits/rejected": -7658487.0, "logps/chosen": -440.134033203125, "logps/rejected": -138.4544677734375, "loss": 0.3174, "rewards/chosen": 1.1855005025863647, "rewards/margins": 1.9930288791656494, "rewards/rejected": -0.8075283765792847, "step": 13981 }, { "epoch": 0.7411019531974664, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78220800.0, "logits/rejected": -12329694.666666666, "logps/chosen": -362.6669677734375, "logps/rejected": -272.8181966145833, "loss": 0.2983, "rewards/chosen": 0.5495989799499512, "rewards/margins": 3.8621225039164226, "rewards/rejected": -3.312523523966471, "step": 13982 }, { "epoch": 0.7411549571992685, "grad_norm": 48.5, "kl": 1.356536865234375, "learning_rate": 5e-07, "logits/chosen": -13580144.0, "logits/rejected": 51774276.0, "logps/chosen": -204.2414093017578, "logps/rejected": -356.086181640625, "loss": 0.3223, "rewards/chosen": 0.16547150909900665, "rewards/margins": 2.590587332844734, "rewards/rejected": -2.4251158237457275, "step": 13983 }, { "epoch": 0.7412079612010707, "grad_norm": 42.0, "kl": 0.29898643493652344, "learning_rate": 5e-07, "logits/chosen": -46804645.333333336, "logits/rejected": -37411081.6, "logps/chosen": -199.3807576497396, "logps/rejected": -329.3376708984375, "loss": 0.2378, "rewards/chosen": 0.23898514111836752, "rewards/margins": 2.738440783818563, "rewards/rejected": -2.4994556427001955, "step": 13984 }, { "epoch": 0.7412609652028728, "grad_norm": 45.75, "kl": 0.28862762451171875, "learning_rate": 5e-07, "logits/chosen": -66918645.333333336, "logits/rejected": -53982182.4, "logps/chosen": -529.4957682291666, "logps/rejected": -315.4186279296875, "loss": 0.2573, "rewards/chosen": 0.8847224712371826, "rewards/margins": 3.0007091045379637, "rewards/rejected": -2.115986633300781, "step": 13985 }, { "epoch": 0.741313969204675, "grad_norm": 38.75, "kl": 2.8474044799804688, "learning_rate": 5e-07, "logits/chosen": -2764225.5, "logits/rejected": -52857514.666666664, "logps/chosen": -646.0305786132812, "logps/rejected": -311.9624430338542, "loss": 0.1197, "rewards/chosen": 1.8192658424377441, "rewards/margins": 4.8966288566589355, "rewards/rejected": -3.0773630142211914, "step": 13986 }, { "epoch": 0.741366973206477, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17432886.4, "logits/rejected": -20097240.0, "logps/chosen": -227.291162109375, "logps/rejected": -253.83060709635416, "loss": 0.2608, "rewards/chosen": 0.6237675666809082, "rewards/margins": 3.4764859199523928, "rewards/rejected": -2.8527183532714844, "step": 13987 }, { "epoch": 0.7414199772082792, "grad_norm": 42.0, "kl": 0.7790794372558594, "learning_rate": 5e-07, "logits/chosen": -26820901.333333332, "logits/rejected": -24379209.6, "logps/chosen": -245.746337890625, "logps/rejected": -255.699560546875, "loss": 0.2372, "rewards/chosen": 0.1397282878557841, "rewards/margins": 2.9844111720720927, "rewards/rejected": -2.8446828842163088, "step": 13988 }, { "epoch": 0.7414729812100813, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41520410.666666664, "logits/rejected": -19240011.2, "logps/chosen": -309.09014892578125, "logps/rejected": -232.6634765625, "loss": 0.2419, "rewards/chosen": 0.6271860599517822, "rewards/margins": 2.691339349746704, "rewards/rejected": -2.064153289794922, "step": 13989 }, { "epoch": 0.7415259852118835, "grad_norm": 44.5, "kl": 1.7509222030639648, "learning_rate": 5e-07, "logits/chosen": -18483233.6, "logits/rejected": -62668165.333333336, "logps/chosen": -247.712646484375, "logps/rejected": -263.50897216796875, "loss": 0.3723, "rewards/chosen": 0.21587090492248534, "rewards/margins": 1.895563332239787, "rewards/rejected": -1.6796924273173015, "step": 13990 }, { "epoch": 0.7415789892136856, "grad_norm": 38.25, "kl": 0.6800918579101562, "learning_rate": 5e-07, "logits/chosen": 3442720.25, "logits/rejected": -17551564.0, "logps/chosen": -117.91297912597656, "logps/rejected": -291.524658203125, "loss": 0.3155, "rewards/chosen": -0.21628372371196747, "rewards/margins": 2.8694167882204056, "rewards/rejected": -3.085700511932373, "step": 13991 }, { "epoch": 0.7416319932154878, "grad_norm": 60.25, "kl": 0.3367118835449219, "learning_rate": 5e-07, "logits/chosen": -9209898.4, "logits/rejected": -40311594.666666664, "logps/chosen": -342.437158203125, "logps/rejected": -579.0637613932291, "loss": 0.3171, "rewards/chosen": 0.3328793287277222, "rewards/margins": 2.7421639204025268, "rewards/rejected": -2.4092845916748047, "step": 13992 }, { "epoch": 0.7416849972172899, "grad_norm": 59.75, "kl": 0.5288238525390625, "learning_rate": 5e-07, "logits/chosen": -8225477.5, "logits/rejected": -19726786.0, "logps/chosen": -161.95291137695312, "logps/rejected": -269.30889892578125, "loss": 0.2849, "rewards/chosen": 0.13471774756908417, "rewards/margins": 3.607042595744133, "rewards/rejected": -3.472324848175049, "step": 13993 }, { "epoch": 0.7417380012190921, "grad_norm": 47.75, "kl": 0.6197052001953125, "learning_rate": 5e-07, "logits/chosen": -24019544.0, "logits/rejected": -34883912.0, "logps/chosen": -274.2636311848958, "logps/rejected": -297.8269958496094, "loss": 0.3653, "rewards/chosen": 0.6921922365824381, "rewards/margins": 2.4671639601389566, "rewards/rejected": -1.7749717235565186, "step": 13994 }, { "epoch": 0.7417910052208941, "grad_norm": 90.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34271812.0, "logits/rejected": -15586978.0, "logps/chosen": -332.7642822265625, "logps/rejected": -324.091796875, "loss": 0.2946, "rewards/chosen": 0.31492388248443604, "rewards/margins": 2.546486020088196, "rewards/rejected": -2.2315621376037598, "step": 13995 }, { "epoch": 0.7418440092226963, "grad_norm": 29.125, "kl": 2.728555679321289, "learning_rate": 5e-07, "logits/chosen": -16740172.0, "logits/rejected": -4406580.0, "logps/chosen": -237.1734619140625, "logps/rejected": -389.722607421875, "loss": 0.2392, "rewards/chosen": 0.7785172462463379, "rewards/margins": 3.2319170951843263, "rewards/rejected": -2.4533998489379885, "step": 13996 }, { "epoch": 0.7418970132244984, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7339833.5, "logits/rejected": -27639272.0, "logps/chosen": -129.3780517578125, "logps/rejected": -215.49302673339844, "loss": 0.2531, "rewards/chosen": 0.6051211357116699, "rewards/margins": 2.736651659011841, "rewards/rejected": -2.131530523300171, "step": 13997 }, { "epoch": 0.7419500172263006, "grad_norm": 36.5, "kl": 1.1376380920410156, "learning_rate": 5e-07, "logits/chosen": -7431259.2, "logits/rejected": -8563253.333333334, "logps/chosen": -101.42681274414062, "logps/rejected": -207.76053873697916, "loss": 0.2984, "rewards/chosen": 0.4900486469268799, "rewards/margins": 3.420314073562622, "rewards/rejected": -2.930265426635742, "step": 13998 }, { "epoch": 0.7420030212281027, "grad_norm": 81.0, "kl": 0.24581241607666016, "learning_rate": 5e-07, "logits/chosen": 28649491.2, "logits/rejected": -36221754.666666664, "logps/chosen": -548.18232421875, "logps/rejected": -256.2367757161458, "loss": 0.3621, "rewards/chosen": 0.24128615856170654, "rewards/margins": 1.7086598475774128, "rewards/rejected": -1.4673736890157063, "step": 13999 }, { "epoch": 0.7420560252299049, "grad_norm": 56.75, "kl": 0.14592742919921875, "learning_rate": 5e-07, "logits/chosen": -51432339.2, "logits/rejected": -133768234.66666667, "logps/chosen": -331.5649169921875, "logps/rejected": -315.5643717447917, "loss": 0.2583, "rewards/chosen": 0.8653826713562012, "rewards/margins": 2.9560901323954263, "rewards/rejected": -2.090707461039225, "step": 14000 }, { "epoch": 0.742109029231707, "grad_norm": 52.25, "kl": 2.2666702270507812, "learning_rate": 5e-07, "logits/chosen": -23492742.4, "logits/rejected": -42844978.666666664, "logps/chosen": -232.7342529296875, "logps/rejected": -271.09763590494794, "loss": 0.3029, "rewards/chosen": 0.503701114654541, "rewards/margins": 2.4288923581441244, "rewards/rejected": -1.9251912434895833, "step": 14001 }, { "epoch": 0.7421620332335092, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25725517.333333332, "logits/rejected": 26831066.0, "logps/chosen": -288.6614176432292, "logps/rejected": -505.4466552734375, "loss": 0.3585, "rewards/chosen": 0.6897384325663248, "rewards/margins": 1.4509628216425576, "rewards/rejected": -0.7612243890762329, "step": 14002 }, { "epoch": 0.7422150372353112, "grad_norm": 56.5, "kl": 2.108367919921875, "learning_rate": 5e-07, "logits/chosen": -28684866.666666668, "logits/rejected": -6801808.0, "logps/chosen": -186.66455078125, "logps/rejected": -81.94871520996094, "loss": 0.4929, "rewards/chosen": -0.08225884040196736, "rewards/margins": 1.5525233546892803, "rewards/rejected": -1.6347821950912476, "step": 14003 }, { "epoch": 0.7422680412371134, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48980890.666666664, "logits/rejected": -6959752.8, "logps/chosen": -406.632080078125, "logps/rejected": -77.35968017578125, "loss": 0.3145, "rewards/chosen": -0.06635131935278575, "rewards/margins": 1.8402809133132298, "rewards/rejected": -1.9066322326660157, "step": 14004 }, { "epoch": 0.7423210452389155, "grad_norm": 53.5, "kl": 2.0021486282348633, "learning_rate": 5e-07, "logits/chosen": -35193536.0, "logits/rejected": -6826461.333333333, "logps/chosen": -293.907421875, "logps/rejected": -192.6430867513021, "loss": 0.3318, "rewards/chosen": 0.569656229019165, "rewards/margins": 1.8338167031606036, "rewards/rejected": -1.2641604741414387, "step": 14005 }, { "epoch": 0.7423740492407177, "grad_norm": 38.5, "kl": 3.2760791778564453, "learning_rate": 5e-07, "logits/chosen": 3538524.8571428573, "logits/rejected": -94541216.0, "logps/chosen": -149.56846400669642, "logps/rejected": -1090.912109375, "loss": 0.4095, "rewards/chosen": 0.6993717466081891, "rewards/margins": 7.717664173671177, "rewards/rejected": -7.018292427062988, "step": 14006 }, { "epoch": 0.7424270532425198, "grad_norm": 97.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32954458.666666668, "logits/rejected": -42655480.0, "logps/chosen": -216.506591796875, "logps/rejected": -872.027587890625, "loss": 0.3157, "rewards/chosen": 0.36435214678446454, "rewards/margins": 4.557922283808391, "rewards/rejected": -4.193570137023926, "step": 14007 }, { "epoch": 0.742480057244322, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1522953.0, "logits/rejected": -10451298.666666666, "logps/chosen": -215.43731689453125, "logps/rejected": -213.3731892903646, "loss": 0.1262, "rewards/chosen": 1.3513025045394897, "rewards/margins": 3.986004869143168, "rewards/rejected": -2.6347023646036782, "step": 14008 }, { "epoch": 0.7425330612461241, "grad_norm": 43.0, "kl": 2.000102996826172, "learning_rate": 5e-07, "logits/chosen": -11448489.6, "logits/rejected": -39296437.333333336, "logps/chosen": -501.32490234375, "logps/rejected": -501.1046549479167, "loss": 0.233, "rewards/chosen": 1.2739255905151368, "rewards/margins": 4.21007391611735, "rewards/rejected": -2.9361483256022134, "step": 14009 }, { "epoch": 0.7425860652479263, "grad_norm": 49.5, "kl": 2.601654052734375, "learning_rate": 5e-07, "logits/chosen": -14117155.0, "logits/rejected": -27259380.0, "logps/chosen": -187.03829956054688, "logps/rejected": -413.3070068359375, "loss": 0.2989, "rewards/chosen": 0.6737533807754517, "rewards/margins": 2.3599789142608643, "rewards/rejected": -1.6862255334854126, "step": 14010 }, { "epoch": 0.7426390692497283, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16287280.0, "logits/rejected": -41254162.666666664, "logps/chosen": -156.06028747558594, "logps/rejected": -493.5250244140625, "loss": 0.1911, "rewards/chosen": -0.11895827949047089, "rewards/margins": 3.3686458617448807, "rewards/rejected": -3.4876041412353516, "step": 14011 }, { "epoch": 0.7426920732515305, "grad_norm": 37.5, "kl": 2.71652889251709, "learning_rate": 5e-07, "logits/chosen": -10502686.4, "logits/rejected": -94890218.66666667, "logps/chosen": -296.1429931640625, "logps/rejected": -338.5074055989583, "loss": 0.3004, "rewards/chosen": 0.9896323204040527, "rewards/margins": 3.4558298110961916, "rewards/rejected": -2.4661974906921387, "step": 14012 }, { "epoch": 0.7427450772533326, "grad_norm": 35.0, "kl": 0.2160472869873047, "learning_rate": 5e-07, "logits/chosen": -4318321.5, "logits/rejected": -16801708.0, "logps/chosen": -155.4307861328125, "logps/rejected": -217.32393391927084, "loss": 0.1268, "rewards/chosen": 1.968756914138794, "rewards/margins": 4.7967277367909755, "rewards/rejected": -2.827970822652181, "step": 14013 }, { "epoch": 0.7427980812551348, "grad_norm": 83.0, "kl": 2.8848514556884766, "learning_rate": 5e-07, "logits/chosen": -49216060.0, "logits/rejected": 2178039.0, "logps/chosen": -521.74609375, "logps/rejected": -278.0162353515625, "loss": 0.1991, "rewards/chosen": 1.072335124015808, "rewards/margins": 3.9710806608200073, "rewards/rejected": -2.898745536804199, "step": 14014 }, { "epoch": 0.7428510852569369, "grad_norm": 56.0, "kl": 0.4101715087890625, "learning_rate": 5e-07, "logits/chosen": -28404314.0, "logits/rejected": 12269262.0, "logps/chosen": -257.40313720703125, "logps/rejected": -243.4176483154297, "loss": 0.3572, "rewards/chosen": 0.11975429952144623, "rewards/margins": 1.467785581946373, "rewards/rejected": -1.3480312824249268, "step": 14015 }, { "epoch": 0.7429040892587391, "grad_norm": 70.0, "kl": 2.6262989044189453, "learning_rate": 5e-07, "logits/chosen": -17457904.0, "logits/rejected": -38734120.0, "logps/chosen": -289.2987060546875, "logps/rejected": -247.68344116210938, "loss": 0.3263, "rewards/chosen": 0.775775671005249, "rewards/margins": 3.2851054668426514, "rewards/rejected": -2.5093297958374023, "step": 14016 }, { "epoch": 0.7429570932605412, "grad_norm": 47.5, "kl": 0.9914708137512207, "learning_rate": 5e-07, "logits/chosen": -7914998.666666667, "logits/rejected": -18876729.6, "logps/chosen": -275.0345865885417, "logps/rejected": -277.4654541015625, "loss": 0.2478, "rewards/chosen": 1.0196447372436523, "rewards/margins": 3.3473028182983398, "rewards/rejected": -2.3276580810546874, "step": 14017 }, { "epoch": 0.7430100972623433, "grad_norm": 93.5, "kl": 1.8949356079101562, "learning_rate": 5e-07, "logits/chosen": 18805090.666666668, "logits/rejected": -48676728.0, "logps/chosen": -392.6244710286458, "logps/rejected": -215.05117797851562, "loss": 0.3667, "rewards/chosen": 0.5437285105387369, "rewards/margins": 2.1470443407694497, "rewards/rejected": -1.603315830230713, "step": 14018 }, { "epoch": 0.7430631012641454, "grad_norm": 39.5, "kl": 0.3435783386230469, "learning_rate": 5e-07, "logits/chosen": 5983378.5, "logits/rejected": -20058536.0, "logps/chosen": -131.82284545898438, "logps/rejected": -509.7808532714844, "loss": 0.2239, "rewards/chosen": 0.7047733664512634, "rewards/margins": 3.713488519191742, "rewards/rejected": -3.0087151527404785, "step": 14019 }, { "epoch": 0.7431161052659476, "grad_norm": 40.5, "kl": 0.9424619674682617, "learning_rate": 5e-07, "logits/chosen": -36950768.0, "logits/rejected": -79001075.2, "logps/chosen": -216.85371907552084, "logps/rejected": -430.0783203125, "loss": 0.2029, "rewards/chosen": 0.3810444275538127, "rewards/margins": 2.8864784638086953, "rewards/rejected": -2.505434036254883, "step": 14020 }, { "epoch": 0.7431691092677497, "grad_norm": 54.5, "kl": 2.511920928955078, "learning_rate": 5e-07, "logits/chosen": -2240388.0, "logits/rejected": -14218947.2, "logps/chosen": -418.8526611328125, "logps/rejected": -215.760498046875, "loss": 0.2526, "rewards/chosen": 1.1342213948567708, "rewards/margins": 3.0244746526082356, "rewards/rejected": -1.8902532577514648, "step": 14021 }, { "epoch": 0.7432221132695519, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33264752.0, "logits/rejected": -12673198.666666666, "logps/chosen": -441.837158203125, "logps/rejected": -515.1387125651041, "loss": 0.2767, "rewards/chosen": 0.40429062843322755, "rewards/margins": 3.27913613319397, "rewards/rejected": -2.874845504760742, "step": 14022 }, { "epoch": 0.743275117271354, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24016520.0, "logits/rejected": -60411920.0, "logps/chosen": -209.7578887939453, "logps/rejected": -373.7191162109375, "loss": 0.3651, "rewards/chosen": -0.22366906702518463, "rewards/margins": 1.5547754615545273, "rewards/rejected": -1.778444528579712, "step": 14023 }, { "epoch": 0.7433281212731562, "grad_norm": 41.25, "kl": 1.450439453125, "learning_rate": 5e-07, "logits/chosen": -31097352.0, "logits/rejected": -30761308.0, "logps/chosen": -121.94439697265625, "logps/rejected": -368.7423400878906, "loss": 0.3411, "rewards/chosen": 0.08067923039197922, "rewards/margins": 2.381056122481823, "rewards/rejected": -2.3003768920898438, "step": 14024 }, { "epoch": 0.7433811252749583, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1343205.6666666667, "logits/rejected": -19851406.4, "logps/chosen": -293.78594970703125, "logps/rejected": -169.464404296875, "loss": 0.2562, "rewards/chosen": 0.4641968011856079, "rewards/margins": 2.361249566078186, "rewards/rejected": -1.8970527648925781, "step": 14025 }, { "epoch": 0.7434341292767604, "grad_norm": 56.75, "kl": 0.18542098999023438, "learning_rate": 5e-07, "logits/chosen": -39418761.6, "logits/rejected": -21037728.0, "logps/chosen": -347.1130615234375, "logps/rejected": -298.4021402994792, "loss": 0.2692, "rewards/chosen": 0.5126519680023194, "rewards/margins": 3.744845279057821, "rewards/rejected": -3.2321933110555015, "step": 14026 }, { "epoch": 0.7434871332785625, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36127804.0, "logits/rejected": 14311964.0, "logps/chosen": -373.5619201660156, "logps/rejected": -215.01358032226562, "loss": 0.2686, "rewards/chosen": 0.4844014048576355, "rewards/margins": 2.4555978178977966, "rewards/rejected": -1.9711964130401611, "step": 14027 }, { "epoch": 0.7435401372803647, "grad_norm": 33.75, "kl": 0.8701133728027344, "learning_rate": 5e-07, "logits/chosen": -23967450.0, "logits/rejected": -20445234.0, "logps/chosen": -197.32237243652344, "logps/rejected": -316.3123779296875, "loss": 0.223, "rewards/chosen": 0.5468613505363464, "rewards/margins": 3.7485219836235046, "rewards/rejected": -3.201660633087158, "step": 14028 }, { "epoch": 0.7435931412821668, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36058708.0, "logits/rejected": -52435946.666666664, "logps/chosen": -241.9581756591797, "logps/rejected": -361.4131673177083, "loss": 0.2494, "rewards/chosen": 0.3320499658584595, "rewards/margins": 2.821592926979065, "rewards/rejected": -2.4895429611206055, "step": 14029 }, { "epoch": 0.7436461452839689, "grad_norm": 47.0, "kl": 0.0047149658203125, "learning_rate": 5e-07, "logits/chosen": -50519176.0, "logits/rejected": -23011808.0, "logps/chosen": -626.61376953125, "logps/rejected": -197.01885986328125, "loss": 0.2615, "rewards/chosen": 1.296687364578247, "rewards/margins": 2.622969388961792, "rewards/rejected": -1.326282024383545, "step": 14030 }, { "epoch": 0.7436991492857711, "grad_norm": 44.75, "kl": 1.2779436111450195, "learning_rate": 5e-07, "logits/chosen": 30529619.2, "logits/rejected": -44699605.333333336, "logps/chosen": -70.39707641601562, "logps/rejected": -125.1167500813802, "loss": 0.2885, "rewards/chosen": 0.8259540557861328, "rewards/margins": 3.3385342915852867, "rewards/rejected": -2.512580235799154, "step": 14031 }, { "epoch": 0.7437521532875732, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42093308.0, "logits/rejected": -25890842.666666668, "logps/chosen": -393.9525146484375, "logps/rejected": -376.0450846354167, "loss": 0.1756, "rewards/chosen": 0.3433723449707031, "rewards/margins": 3.1809139251708984, "rewards/rejected": -2.8375415802001953, "step": 14032 }, { "epoch": 0.7438051572893754, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38617844.0, "logits/rejected": -34753909.333333336, "logps/chosen": -418.34228515625, "logps/rejected": -239.50846354166666, "loss": 0.1208, "rewards/chosen": 1.2914352416992188, "rewards/margins": 4.1695960362752285, "rewards/rejected": -2.8781607945760093, "step": 14033 }, { "epoch": 0.7438581612911774, "grad_norm": 56.0, "kl": 1.3339290618896484, "learning_rate": 5e-07, "logits/chosen": -4441864.666666667, "logits/rejected": -2374662.0, "logps/chosen": -261.6899820963542, "logps/rejected": -126.94107666015626, "loss": 0.2185, "rewards/chosen": 0.922021230061849, "rewards/margins": 2.774638303120931, "rewards/rejected": -1.852617073059082, "step": 14034 }, { "epoch": 0.7439111652929796, "grad_norm": 64.5, "kl": 2.37139892578125, "learning_rate": 5e-07, "logits/chosen": -132965600.0, "logits/rejected": -9389239.0, "logps/chosen": -1031.9954833984375, "logps/rejected": -387.065673828125, "loss": 0.2225, "rewards/chosen": 1.4703065156936646, "rewards/margins": 3.4399075508117676, "rewards/rejected": -1.969601035118103, "step": 14035 }, { "epoch": 0.7439641692947817, "grad_norm": 77.0, "kl": 0.07867145538330078, "learning_rate": 5e-07, "logits/chosen": -24417818.666666668, "logits/rejected": -37146307.2, "logps/chosen": -220.68656412760416, "logps/rejected": -159.62337646484374, "loss": 0.2382, "rewards/chosen": 0.9445071220397949, "rewards/margins": 2.5149245262145996, "rewards/rejected": -1.5704174041748047, "step": 14036 }, { "epoch": 0.7440171732965839, "grad_norm": 95.0, "kl": 1.59283447265625, "learning_rate": 5e-07, "logits/chosen": -22890224.0, "logits/rejected": 6945359.0, "logps/chosen": -213.32954915364584, "logps/rejected": -111.3967056274414, "loss": 0.4622, "rewards/chosen": 0.044141982992490135, "rewards/margins": 0.5871833910544714, "rewards/rejected": -0.5430414080619812, "step": 14037 }, { "epoch": 0.744070177298386, "grad_norm": 27.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3076576.0, "logits/rejected": -43328169.6, "logps/chosen": -73.07196553548177, "logps/rejected": -237.9689453125, "loss": 0.2325, "rewards/chosen": 0.1390599807103475, "rewards/margins": 3.168768080075582, "rewards/rejected": -3.0297080993652346, "step": 14038 }, { "epoch": 0.7441231813001882, "grad_norm": 47.75, "kl": 1.7835988998413086, "learning_rate": 5e-07, "logits/chosen": -2868556.4, "logits/rejected": -38138805.333333336, "logps/chosen": -238.7491943359375, "logps/rejected": -186.37418619791666, "loss": 0.3701, "rewards/chosen": -0.0994404673576355, "rewards/margins": 3.0443580428759254, "rewards/rejected": -3.143798510233561, "step": 14039 }, { "epoch": 0.7441761853019903, "grad_norm": 51.75, "kl": 3.956289291381836, "learning_rate": 5e-07, "logits/chosen": 23304174.0, "logits/rejected": -56384992.0, "logps/chosen": -1222.3055419921875, "logps/rejected": -518.442626953125, "loss": 0.1722, "rewards/chosen": 1.5177381038665771, "rewards/margins": 5.156784772872925, "rewards/rejected": -3.6390466690063477, "step": 14040 }, { "epoch": 0.7442291893037924, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11558928.8, "logits/rejected": -44597936.0, "logps/chosen": -236.177490234375, "logps/rejected": -394.9969482421875, "loss": 0.3885, "rewards/chosen": -0.24527482986450194, "rewards/margins": 1.9918303807576498, "rewards/rejected": -2.237105210622152, "step": 14041 }, { "epoch": 0.7442821933055945, "grad_norm": 62.5, "kl": 0.1884765625, "learning_rate": 5e-07, "logits/chosen": -46188857.6, "logits/rejected": -12918222.666666666, "logps/chosen": -648.41015625, "logps/rejected": -252.22408040364584, "loss": 0.2994, "rewards/chosen": 0.31573853492736814, "rewards/margins": 3.701969321568807, "rewards/rejected": -3.386230786641439, "step": 14042 }, { "epoch": 0.7443351973073967, "grad_norm": 50.75, "kl": 2.323831558227539, "learning_rate": 5e-07, "logits/chosen": -54522248.0, "logits/rejected": -10803165.0, "logps/chosen": -524.2265625, "logps/rejected": -318.16375732421875, "loss": 0.2438, "rewards/chosen": 1.1269497871398926, "rewards/margins": 3.1510815620422363, "rewards/rejected": -2.0241317749023438, "step": 14043 }, { "epoch": 0.7443882013091988, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51574053.333333336, "logits/rejected": -16037132.8, "logps/chosen": -685.509521484375, "logps/rejected": -264.6052734375, "loss": 0.1807, "rewards/chosen": 0.5101491610209147, "rewards/margins": 3.4511424700419107, "rewards/rejected": -2.940993309020996, "step": 14044 }, { "epoch": 0.744441205311001, "grad_norm": 37.5, "kl": 4.462990760803223, "learning_rate": 5e-07, "logits/chosen": -6449128.4, "logits/rejected": -9542674.0, "logps/chosen": -129.1142578125, "logps/rejected": -184.15620930989584, "loss": 0.1914, "rewards/chosen": 1.463509750366211, "rewards/margins": 5.671272150675456, "rewards/rejected": -4.207762400309245, "step": 14045 }, { "epoch": 0.7444942093128031, "grad_norm": 43.75, "kl": 3.2303967475891113, "learning_rate": 5e-07, "logits/chosen": -131247072.0, "logits/rejected": -1199277.8333333333, "logps/chosen": -1163.8489990234375, "logps/rejected": -110.443115234375, "loss": 0.2931, "rewards/chosen": 0.968188464641571, "rewards/margins": 1.922172168890635, "rewards/rejected": -0.9539837042490641, "step": 14046 }, { "epoch": 0.7445472133146053, "grad_norm": 50.5, "kl": 1.6672821044921875, "learning_rate": 5e-07, "logits/chosen": -52341560.0, "logits/rejected": -21348788.0, "logps/chosen": -380.00732421875, "logps/rejected": -542.25537109375, "loss": 0.2395, "rewards/chosen": 0.6175982356071472, "rewards/margins": 4.19511216878891, "rewards/rejected": -3.5775139331817627, "step": 14047 }, { "epoch": 0.7446002173164074, "grad_norm": 30.5, "kl": 1.3170204162597656, "learning_rate": 5e-07, "logits/chosen": -1379757.75, "logits/rejected": -11731178.666666666, "logps/chosen": -418.5605163574219, "logps/rejected": -393.1995035807292, "loss": 0.1173, "rewards/chosen": 1.4148391485214233, "rewards/margins": 4.004597703615824, "rewards/rejected": -2.589758555094401, "step": 14048 }, { "epoch": 0.7446532213182095, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22056377.6, "logits/rejected": -21647078.666666668, "logps/chosen": -259.804296875, "logps/rejected": -236.79364013671875, "loss": 0.3225, "rewards/chosen": 0.475386381149292, "rewards/margins": 2.1336861451466875, "rewards/rejected": -1.6582997639973958, "step": 14049 }, { "epoch": 0.7447062253200116, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66596389.333333336, "logits/rejected": -26238041.6, "logps/chosen": -544.6434733072916, "logps/rejected": -376.946484375, "loss": 0.2497, "rewards/chosen": -0.05917969346046448, "rewards/margins": 2.7395868241786956, "rewards/rejected": -2.79876651763916, "step": 14050 }, { "epoch": 0.7447592293218138, "grad_norm": 62.0, "kl": 1.9952926635742188, "learning_rate": 5e-07, "logits/chosen": -55783052.8, "logits/rejected": -34475104.0, "logps/chosen": -344.94306640625, "logps/rejected": -327.9786376953125, "loss": 0.3349, "rewards/chosen": 0.17701447010040283, "rewards/margins": 2.3304818073908486, "rewards/rejected": -2.153467337290446, "step": 14051 }, { "epoch": 0.7448122333236159, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 706754.0625, "logits/rejected": 80708757.33333333, "logps/chosen": -74.74742126464844, "logps/rejected": -600.4056396484375, "loss": 0.2068, "rewards/chosen": 0.7250422239303589, "rewards/margins": 3.057161053021749, "rewards/rejected": -2.33211882909139, "step": 14052 }, { "epoch": 0.7448652373254181, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1367862.5, "logits/rejected": 7677114.285714285, "logps/chosen": -31.364158630371094, "logps/rejected": -223.04872349330358, "loss": 0.1899, "rewards/chosen": -0.26064014434814453, "rewards/margins": 1.7941132954188754, "rewards/rejected": -2.05475343976702, "step": 14053 }, { "epoch": 0.7449182413272202, "grad_norm": 46.25, "kl": 0.8030261993408203, "learning_rate": 5e-07, "logits/chosen": -15431957.333333334, "logits/rejected": -54036518.4, "logps/chosen": -234.72513834635416, "logps/rejected": -337.058203125, "loss": 0.1811, "rewards/chosen": 1.3080353736877441, "rewards/margins": 3.360716724395752, "rewards/rejected": -2.0526813507080077, "step": 14054 }, { "epoch": 0.7449712453290224, "grad_norm": 37.25, "kl": 2.2370834350585938, "learning_rate": 5e-07, "logits/chosen": 2715228.75, "logits/rejected": -17239208.0, "logps/chosen": -90.17135620117188, "logps/rejected": -256.14898681640625, "loss": 0.3145, "rewards/chosen": 0.6070940494537354, "rewards/margins": 2.1311392784118652, "rewards/rejected": -1.5240452289581299, "step": 14055 }, { "epoch": 0.7450242493308245, "grad_norm": 35.5, "kl": 0.49565887451171875, "learning_rate": 5e-07, "logits/chosen": 148127.5, "logits/rejected": -31028976.0, "logps/chosen": -249.08404541015625, "logps/rejected": -435.2410481770833, "loss": 0.1229, "rewards/chosen": 2.2345657348632812, "rewards/margins": 4.8001556396484375, "rewards/rejected": -2.5655899047851562, "step": 14056 }, { "epoch": 0.7450772533326266, "grad_norm": 37.75, "kl": 5.318591117858887, "learning_rate": 5e-07, "logits/chosen": 3170230.6666666665, "logits/rejected": -17293278.0, "logps/chosen": -327.9923095703125, "logps/rejected": -300.34375, "loss": 0.2581, "rewards/chosen": 1.5376294453938801, "rewards/margins": 3.5018900235493975, "rewards/rejected": -1.9642605781555176, "step": 14057 }, { "epoch": 0.7451302573344287, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -23943672.0, "logps/rejected": -502.4215087890625, "loss": 0.0972, "rewards/rejected": -2.636509418487549, "step": 14058 }, { "epoch": 0.7451832613362309, "grad_norm": 44.5, "kl": 1.905242919921875, "learning_rate": 5e-07, "logits/chosen": -74239797.33333333, "logits/rejected": -30500950.4, "logps/chosen": -561.5247395833334, "logps/rejected": -333.9689453125, "loss": 0.1678, "rewards/chosen": 0.8733134269714355, "rewards/margins": 3.7687907218933105, "rewards/rejected": -2.895477294921875, "step": 14059 }, { "epoch": 0.745236265338033, "grad_norm": 47.5, "kl": 4.043586730957031, "learning_rate": 5e-07, "logits/chosen": -23913349.333333332, "logits/rejected": -8544632.0, "logps/chosen": -263.09814453125, "logps/rejected": -113.8726577758789, "loss": 0.4779, "rewards/chosen": 0.10187691450119019, "rewards/margins": 2.654017388820648, "rewards/rejected": -2.552140474319458, "step": 14060 }, { "epoch": 0.7452892693398352, "grad_norm": 47.5, "kl": 0.5869808197021484, "learning_rate": 5e-07, "logits/chosen": -788804.0, "logits/rejected": -23482888.0, "logps/chosen": -120.63507080078125, "logps/rejected": -429.08575439453125, "loss": 0.3344, "rewards/chosen": 0.5233396689097086, "rewards/margins": 2.3938620487848916, "rewards/rejected": -1.870522379875183, "step": 14061 }, { "epoch": 0.7453422733416373, "grad_norm": 45.5, "kl": 0.30803871154785156, "learning_rate": 5e-07, "logits/chosen": -28546902.4, "logits/rejected": -26284653.333333332, "logps/chosen": -141.1845703125, "logps/rejected": -256.2567952473958, "loss": 0.3456, "rewards/chosen": 0.4255840301513672, "rewards/margins": 2.040284506479899, "rewards/rejected": -1.614700476328532, "step": 14062 }, { "epoch": 0.7453952773434395, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1345881.25, "logits/rejected": -16937453.714285713, "logps/chosen": -43.76319122314453, "logps/rejected": -323.5206821986607, "loss": 0.1161, "rewards/chosen": 1.9841701984405518, "rewards/margins": 4.3874973228999545, "rewards/rejected": -2.4033271244594028, "step": 14063 }, { "epoch": 0.7454482813452415, "grad_norm": 79.5, "kl": 0.3313102722167969, "learning_rate": 5e-07, "logits/chosen": -29333836.0, "logits/rejected": -583344.375, "logps/chosen": -290.6228942871094, "logps/rejected": -283.7031656901042, "loss": 0.2525, "rewards/chosen": 0.351826012134552, "rewards/margins": 1.9668330947558086, "rewards/rejected": -1.6150070826212566, "step": 14064 }, { "epoch": 0.7455012853470437, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94176800.0, "logits/rejected": -34723330.28571428, "logps/chosen": -840.4090576171875, "logps/rejected": -318.86924525669644, "loss": 0.1264, "rewards/chosen": 1.8858642578125, "rewards/margins": 4.2432626996721545, "rewards/rejected": -2.357398441859654, "step": 14065 }, { "epoch": 0.7455542893488458, "grad_norm": 37.75, "kl": 0.8401679992675781, "learning_rate": 5e-07, "logits/chosen": -38456674.666666664, "logits/rejected": -23508025.6, "logps/chosen": -335.8381754557292, "logps/rejected": -376.785107421875, "loss": 0.205, "rewards/chosen": 0.5260701974232992, "rewards/margins": 3.6971158822377523, "rewards/rejected": -3.171045684814453, "step": 14066 }, { "epoch": 0.745607293350648, "grad_norm": 48.25, "kl": 4.978730201721191, "learning_rate": 5e-07, "logits/chosen": -2994150.0, "logits/rejected": -36707808.0, "logps/chosen": -549.4869995117188, "logps/rejected": -319.5046691894531, "loss": 0.206, "rewards/chosen": 1.5527379512786865, "rewards/margins": 4.8884313106536865, "rewards/rejected": -3.335693359375, "step": 14067 }, { "epoch": 0.7456602973524501, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81852360.0, "logits/rejected": -35292168.0, "logps/chosen": -384.3907470703125, "logps/rejected": -420.783935546875, "loss": 0.3475, "rewards/chosen": 0.013766102492809296, "rewards/margins": 2.1313059851527214, "rewards/rejected": -2.117539882659912, "step": 14068 }, { "epoch": 0.7457133013542523, "grad_norm": 23.0, "kl": 0.9610595703125, "learning_rate": 5e-07, "logits/chosen": 4664107.0, "logits/rejected": -13594502.0, "logps/chosen": -145.70388793945312, "logps/rejected": -86.92049407958984, "loss": 0.1417, "rewards/chosen": 1.291308879852295, "rewards/margins": 5.038615703582764, "rewards/rejected": -3.7473068237304688, "step": 14069 }, { "epoch": 0.7457663053560544, "grad_norm": 65.5, "kl": 1.6805400848388672, "learning_rate": 5e-07, "logits/chosen": -35792629.333333336, "logits/rejected": -33723640.0, "logps/chosen": -252.76175944010416, "logps/rejected": -349.916259765625, "loss": 0.3397, "rewards/chosen": 0.6348704099655151, "rewards/margins": 2.700239062309265, "rewards/rejected": -2.06536865234375, "step": 14070 }, { "epoch": 0.7458193093578566, "grad_norm": 42.5, "kl": 1.6511154174804688, "learning_rate": 5e-07, "logits/chosen": 2210885.0, "logits/rejected": -10774242.0, "logps/chosen": -121.47907257080078, "logps/rejected": -286.5793151855469, "loss": 0.2288, "rewards/chosen": 0.7774550318717957, "rewards/margins": 3.650506913661957, "rewards/rejected": -2.873051881790161, "step": 14071 }, { "epoch": 0.7458723133596586, "grad_norm": 38.75, "kl": 5.271232604980469, "learning_rate": 5e-07, "logits/chosen": -38035462.4, "logits/rejected": -57796245.333333336, "logps/chosen": -617.823681640625, "logps/rejected": -349.1961669921875, "loss": 0.2775, "rewards/chosen": 1.6063024520874023, "rewards/margins": 5.015035502115885, "rewards/rejected": -3.408733050028483, "step": 14072 }, { "epoch": 0.7459253173614608, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43764708.0, "logits/rejected": -6364927.5, "logps/chosen": -522.4871826171875, "logps/rejected": -243.97882080078125, "loss": 0.2278, "rewards/chosen": 1.0658832788467407, "rewards/margins": 3.0548670291900635, "rewards/rejected": -1.9889837503433228, "step": 14073 }, { "epoch": 0.7459783213632629, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23008146.0, "logits/rejected": -59581772.0, "logps/chosen": -412.77838134765625, "logps/rejected": -668.6168212890625, "loss": 0.2612, "rewards/chosen": 0.2981685698032379, "rewards/margins": 3.1732617914676666, "rewards/rejected": -2.8750932216644287, "step": 14074 }, { "epoch": 0.7460313253650651, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84324464.0, "logits/rejected": -39900784.0, "logps/chosen": -298.69879150390625, "logps/rejected": -420.872314453125, "loss": 0.2326, "rewards/chosen": 0.8496311902999878, "rewards/margins": 2.8859604597091675, "rewards/rejected": -2.0363292694091797, "step": 14075 }, { "epoch": 0.7460843293668672, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 32867661.333333332, "logits/rejected": -5084302.8, "logps/chosen": -374.0108235677083, "logps/rejected": -363.9101318359375, "loss": 0.3287, "rewards/chosen": -0.3948984940846761, "rewards/margins": 1.5501069227854412, "rewards/rejected": -1.9450054168701172, "step": 14076 }, { "epoch": 0.7461373333686694, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52692112.0, "logits/rejected": -28836096.0, "logps/chosen": -948.19775390625, "logps/rejected": -537.51787109375, "loss": 0.2214, "rewards/chosen": 0.607037345568339, "rewards/margins": 3.241227332750956, "rewards/rejected": -2.634189987182617, "step": 14077 }, { "epoch": 0.7461903373704715, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81975568.0, "logits/rejected": -22065930.666666668, "logps/chosen": -334.549072265625, "logps/rejected": -242.1447550455729, "loss": 0.2714, "rewards/chosen": -0.5022659301757812, "rewards/margins": 1.8239696820576987, "rewards/rejected": -2.32623561223348, "step": 14078 }, { "epoch": 0.7462433413722737, "grad_norm": 44.5, "kl": 2.404438018798828, "learning_rate": 5e-07, "logits/chosen": -26278892.0, "logits/rejected": -92876456.0, "logps/chosen": -213.97573852539062, "logps/rejected": -444.13458251953125, "loss": 0.2901, "rewards/chosen": 0.4471012353897095, "rewards/margins": 2.4268654584884644, "rewards/rejected": -1.9797642230987549, "step": 14079 }, { "epoch": 0.7462963453740757, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26846314.666666668, "logits/rejected": -24492281.6, "logps/chosen": -366.6506754557292, "logps/rejected": -638.93154296875, "loss": 0.1337, "rewards/chosen": 1.3920227686564128, "rewards/margins": 5.150436464945475, "rewards/rejected": -3.7584136962890624, "step": 14080 }, { "epoch": 0.7463493493758778, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50236851.2, "logits/rejected": -32084000.0, "logps/chosen": -392.14169921875, "logps/rejected": -377.3057454427083, "loss": 0.2614, "rewards/chosen": 0.4707186698913574, "rewards/margins": 3.538043944040934, "rewards/rejected": -3.0673252741495767, "step": 14081 }, { "epoch": 0.74640235337768, "grad_norm": 42.0, "kl": 0.1845378875732422, "learning_rate": 5e-07, "logits/chosen": -41615324.0, "logits/rejected": -25312877.333333332, "logps/chosen": -195.34982299804688, "logps/rejected": -239.6966552734375, "loss": 0.2013, "rewards/chosen": 0.7069997787475586, "rewards/margins": 3.1237212816874185, "rewards/rejected": -2.41672150293986, "step": 14082 }, { "epoch": 0.7464553573794821, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4665128.0, "logits/rejected": -20810932.0, "logps/chosen": -620.17685546875, "logps/rejected": -466.8659261067708, "loss": 0.2591, "rewards/chosen": 0.9237316131591797, "rewards/margins": 3.75250670115153, "rewards/rejected": -2.82877508799235, "step": 14083 }, { "epoch": 0.7465083613812843, "grad_norm": 58.25, "kl": 5.503242492675781, "learning_rate": 5e-07, "logits/chosen": -17546724.0, "logps/chosen": -240.3270263671875, "loss": 0.4599, "rewards/chosen": 0.7617963552474976, "step": 14084 }, { "epoch": 0.7465613653830864, "grad_norm": 50.5, "kl": 4.127740859985352, "learning_rate": 5e-07, "logits/chosen": -16975618.666666668, "logits/rejected": -18678746.0, "logps/chosen": -427.7742919921875, "logps/rejected": -250.4154815673828, "loss": 0.238, "rewards/chosen": 1.2403249740600586, "rewards/margins": 4.60104775428772, "rewards/rejected": -3.360722780227661, "step": 14085 }, { "epoch": 0.7466143693848886, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7909454.0, "logits/rejected": -349811.90625, "logps/chosen": -327.7828063964844, "logps/rejected": -121.71328735351562, "loss": 0.1967, "rewards/chosen": 1.0144497156143188, "rewards/margins": 3.440903067588806, "rewards/rejected": -2.4264533519744873, "step": 14086 }, { "epoch": 0.7466673733866906, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21153080.0, "logits/rejected": -9112238.666666666, "logps/chosen": -502.8668518066406, "logps/rejected": -267.09311930338544, "loss": 0.2379, "rewards/chosen": 0.05476227402687073, "rewards/margins": 2.1458381712436676, "rewards/rejected": -2.091075897216797, "step": 14087 }, { "epoch": 0.7467203773884928, "grad_norm": 138.0, "kl": 0.49884605407714844, "learning_rate": 5e-07, "logits/chosen": -21962508.0, "logits/rejected": -32133651.2, "logps/chosen": -392.4878743489583, "logps/rejected": -349.1473388671875, "loss": 0.2274, "rewards/chosen": 1.0751088460286458, "rewards/margins": 2.9158345540364583, "rewards/rejected": -1.8407257080078125, "step": 14088 }, { "epoch": 0.7467733813902949, "grad_norm": 46.0, "kl": 0.5854110717773438, "learning_rate": 5e-07, "logits/chosen": -13791028.0, "logits/rejected": 17496448.0, "logps/chosen": -311.0909016927083, "logps/rejected": -282.73369140625, "loss": 0.302, "rewards/chosen": 0.42855580647786456, "rewards/margins": 2.002182420094808, "rewards/rejected": -1.5736266136169434, "step": 14089 }, { "epoch": 0.7468263853920971, "grad_norm": 22.125, "kl": 0.38199615478515625, "learning_rate": 5e-07, "logits/chosen": -15674394.0, "logits/rejected": -47237840.0, "logps/chosen": -224.60328674316406, "logps/rejected": -327.69846598307294, "loss": 0.1487, "rewards/chosen": 1.2707794904708862, "rewards/margins": 4.330011566480001, "rewards/rejected": -3.0592320760091147, "step": 14090 }, { "epoch": 0.7468793893938992, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34235348.0, "logits/rejected": -24391234.285714287, "logps/chosen": -515.6401977539062, "logps/rejected": -436.41702706473217, "loss": 0.2281, "rewards/chosen": -2.19512939453125, "rewards/margins": 0.6296517508370534, "rewards/rejected": -2.8247811453683034, "step": 14091 }, { "epoch": 0.7469323933957014, "grad_norm": 42.75, "kl": 1.9950332641601562, "learning_rate": 5e-07, "logits/chosen": -25464508.0, "logits/rejected": -27546562.0, "logps/chosen": -279.8092041015625, "logps/rejected": -249.5723114013672, "loss": 0.2352, "rewards/chosen": 0.8458776473999023, "rewards/margins": 2.7971749305725098, "rewards/rejected": -1.9512972831726074, "step": 14092 }, { "epoch": 0.7469853973975035, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40137168.0, "logits/rejected": -34738904.0, "logps/chosen": -364.6916198730469, "logps/rejected": -457.281494140625, "loss": 0.2488, "rewards/chosen": 0.5682762861251831, "rewards/margins": 2.845279097557068, "rewards/rejected": -2.2770028114318848, "step": 14093 }, { "epoch": 0.7470384013993057, "grad_norm": 90.5, "kl": 3.0058517456054688, "learning_rate": 5e-07, "logits/chosen": -78391948.8, "logits/rejected": 16901858.666666668, "logps/chosen": -513.39716796875, "logps/rejected": -536.7943522135416, "loss": 0.2259, "rewards/chosen": 1.6220230102539062, "rewards/margins": 4.032793744405111, "rewards/rejected": -2.4107707341512046, "step": 14094 }, { "epoch": 0.7470914054011077, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47336490.666666664, "logits/rejected": -8503582.4, "logps/chosen": -280.3163655598958, "logps/rejected": -341.7298828125, "loss": 0.2259, "rewards/chosen": 0.33876752853393555, "rewards/margins": 3.905904674530029, "rewards/rejected": -3.5671371459960937, "step": 14095 }, { "epoch": 0.7471444094029099, "grad_norm": 49.25, "kl": 2.6919803619384766, "learning_rate": 5e-07, "logits/chosen": -10235234.666666666, "logits/rejected": -29932130.0, "logps/chosen": -120.74086507161458, "logps/rejected": -401.6739501953125, "loss": 0.4266, "rewards/chosen": 0.13503981630007425, "rewards/margins": 3.290533790985743, "rewards/rejected": -3.155493974685669, "step": 14096 }, { "epoch": 0.747197413404712, "grad_norm": 38.75, "kl": 1.8010921478271484, "learning_rate": 5e-07, "logits/chosen": -12623072.0, "logits/rejected": -25219342.0, "logps/chosen": -211.3765869140625, "logps/rejected": -301.37939453125, "loss": 0.2869, "rewards/chosen": 0.6547331809997559, "rewards/margins": 2.4400689601898193, "rewards/rejected": -1.7853357791900635, "step": 14097 }, { "epoch": 0.7472504174065142, "grad_norm": 36.75, "kl": 1.1604156494140625, "learning_rate": 5e-07, "logits/chosen": -31011148.8, "logits/rejected": -40017173.333333336, "logps/chosen": -438.59873046875, "logps/rejected": -356.7528889973958, "loss": 0.2366, "rewards/chosen": 1.2591000556945802, "rewards/margins": 3.806860065460205, "rewards/rejected": -2.547760009765625, "step": 14098 }, { "epoch": 0.7473034214083163, "grad_norm": 33.5, "kl": 1.6454944610595703, "learning_rate": 5e-07, "logits/chosen": 4154382.6666666665, "logits/rejected": -9692710.4, "logps/chosen": -178.4679972330729, "logps/rejected": -203.59910888671874, "loss": 0.193, "rewards/chosen": 0.6358062823613485, "rewards/margins": 4.0486953814824425, "rewards/rejected": -3.412889099121094, "step": 14099 }, { "epoch": 0.7473564254101185, "grad_norm": 51.75, "kl": 0.16335010528564453, "learning_rate": 5e-07, "logits/chosen": -7819688.8, "logits/rejected": -14791862.666666666, "logps/chosen": -258.1574462890625, "logps/rejected": -317.09466552734375, "loss": 0.3234, "rewards/chosen": 0.27499687671661377, "rewards/margins": 2.373588045438131, "rewards/rejected": -2.098591168721517, "step": 14100 }, { "epoch": 0.7474094294119206, "grad_norm": 58.25, "kl": 2.0307235717773438, "learning_rate": 5e-07, "logits/chosen": -58140889.6, "logits/rejected": -11996572.0, "logps/chosen": -394.675732421875, "logps/rejected": -146.7301228841146, "loss": 0.4263, "rewards/chosen": 0.16450011730194092, "rewards/margins": 1.9139193296432495, "rewards/rejected": -1.7494192123413086, "step": 14101 }, { "epoch": 0.7474624334137228, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53312328.0, "logits/rejected": 4547582.0, "logps/chosen": -282.80657958984375, "logps/rejected": -266.8570556640625, "loss": 0.2619, "rewards/chosen": 0.6782298684120178, "rewards/margins": 2.6521851420402527, "rewards/rejected": -1.9739552736282349, "step": 14102 }, { "epoch": 0.7475154374155248, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31144020.0, "logits/rejected": -30540398.0, "logps/chosen": -285.60577392578125, "logps/rejected": -514.2587890625, "loss": 0.2435, "rewards/chosen": 0.8837189674377441, "rewards/margins": 3.0677638053894043, "rewards/rejected": -2.18404483795166, "step": 14103 }, { "epoch": 0.747568441417327, "grad_norm": 35.0, "kl": 0.7153730392456055, "learning_rate": 5e-07, "logits/chosen": 3355598.75, "logits/rejected": -5891771.5, "logps/chosen": -217.61703491210938, "logps/rejected": -235.2499542236328, "loss": 0.2494, "rewards/chosen": 0.5829606056213379, "rewards/margins": 3.0969324111938477, "rewards/rejected": -2.5139718055725098, "step": 14104 }, { "epoch": 0.7476214454191291, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78982080.0, "logits/rejected": -55438246.4, "logps/chosen": -378.650146484375, "logps/rejected": -292.939306640625, "loss": 0.2189, "rewards/chosen": 0.5915331840515137, "rewards/margins": 2.7167201042175293, "rewards/rejected": -2.1251869201660156, "step": 14105 }, { "epoch": 0.7476744494209313, "grad_norm": 33.75, "kl": 3.3361434936523438, "learning_rate": 5e-07, "logits/chosen": -12917738.4, "logits/rejected": -19321413.333333332, "logps/chosen": -208.539111328125, "logps/rejected": -459.05859375, "loss": 0.3119, "rewards/chosen": 0.8117235183715821, "rewards/margins": 3.8799417495727537, "rewards/rejected": -3.068218231201172, "step": 14106 }, { "epoch": 0.7477274534227334, "grad_norm": 50.5, "kl": 1.6472206115722656, "learning_rate": 5e-07, "logits/chosen": -26755454.0, "logits/rejected": -32787509.333333332, "logps/chosen": -527.52978515625, "logps/rejected": -331.06394449869794, "loss": 0.1716, "rewards/chosen": 1.810464859008789, "rewards/margins": 4.109889030456543, "rewards/rejected": -2.299424171447754, "step": 14107 }, { "epoch": 0.7477804574245356, "grad_norm": 25.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13276976.0, "logits/rejected": -12001314.666666666, "logps/chosen": -166.88143920898438, "logps/rejected": -385.5320638020833, "loss": 0.143, "rewards/chosen": 0.31129884719848633, "rewards/margins": 3.7128793398539224, "rewards/rejected": -3.401580492655436, "step": 14108 }, { "epoch": 0.7478334614263377, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3359019.75, "logits/rejected": -47077453.71428572, "logps/chosen": -151.62258911132812, "logps/rejected": -382.994140625, "loss": 0.1718, "rewards/chosen": -0.1256561279296875, "rewards/margins": 2.321944372994559, "rewards/rejected": -2.4476005009242465, "step": 14109 }, { "epoch": 0.7478864654281399, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75829696.0, "logits/rejected": -33589360.0, "logps/chosen": -741.1044921875, "logps/rejected": -454.50848388671875, "loss": 0.2558, "rewards/chosen": 0.9606174230575562, "rewards/margins": 3.1765111684799194, "rewards/rejected": -2.2158937454223633, "step": 14110 }, { "epoch": 0.7479394694299419, "grad_norm": 41.75, "kl": 0.5304183959960938, "learning_rate": 5e-07, "logits/chosen": 9927406.0, "logits/rejected": -10976802.0, "logps/chosen": -247.16806030273438, "logps/rejected": -148.9625244140625, "loss": 0.2911, "rewards/chosen": 0.30997544527053833, "rewards/margins": 2.3962185978889465, "rewards/rejected": -2.086243152618408, "step": 14111 }, { "epoch": 0.7479924734317441, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37441540.0, "logits/rejected": -18944108.0, "logps/chosen": -381.5704040527344, "logps/rejected": -217.63069661458334, "loss": 0.3484, "rewards/chosen": -0.8423141837120056, "rewards/margins": 0.7563591599464417, "rewards/rejected": -1.5986733436584473, "step": 14112 }, { "epoch": 0.7480454774335462, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -802983.0, "logits/rejected": 423803.0, "logps/chosen": -134.52186584472656, "logps/rejected": -357.89337158203125, "loss": 0.2628, "rewards/chosen": 0.6260108947753906, "rewards/margins": 2.655413866043091, "rewards/rejected": -2.0294029712677, "step": 14113 }, { "epoch": 0.7480984814353484, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50324492.0, "logits/rejected": -13179393.333333334, "logps/chosen": -272.96649169921875, "logps/rejected": -222.99479166666666, "loss": 0.2484, "rewards/chosen": -0.25746995210647583, "rewards/margins": 1.773323873678843, "rewards/rejected": -2.030793825785319, "step": 14114 }, { "epoch": 0.7481514854371505, "grad_norm": 45.5, "kl": 0.6570663452148438, "learning_rate": 5e-07, "logits/chosen": -4676069.333333333, "logits/rejected": -56191078.4, "logps/chosen": -353.1711832682292, "logps/rejected": -429.0970703125, "loss": 0.1919, "rewards/chosen": 0.4157544771830241, "rewards/margins": 3.5343373934427897, "rewards/rejected": -3.118582916259766, "step": 14115 }, { "epoch": 0.7482044894389527, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59609141.333333336, "logits/rejected": 163322521.6, "logps/chosen": -578.045654296875, "logps/rejected": -439.229150390625, "loss": 0.3409, "rewards/chosen": -0.3425516684850057, "rewards/margins": 1.6230151573816936, "rewards/rejected": -1.9655668258666992, "step": 14116 }, { "epoch": 0.7482574934407548, "grad_norm": 27.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7674227.333333333, "logits/rejected": -47325392.0, "logps/chosen": -162.5073038736979, "logps/rejected": -234.8140869140625, "loss": 0.143, "rewards/chosen": 1.0508073170979817, "rewards/margins": 4.419667180379232, "rewards/rejected": -3.36885986328125, "step": 14117 }, { "epoch": 0.748310497442557, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64519877.333333336, "logits/rejected": -59290944.0, "logps/chosen": -500.03173828125, "logps/rejected": -361.6540283203125, "loss": 0.2107, "rewards/chosen": 1.3365929921468098, "rewards/margins": 3.09097417195638, "rewards/rejected": -1.7543811798095703, "step": 14118 }, { "epoch": 0.748363501444359, "grad_norm": 46.75, "kl": 0.9281063079833984, "learning_rate": 5e-07, "logits/chosen": -16907316.0, "logits/rejected": -2859047.5, "logps/chosen": -228.43294270833334, "logps/rejected": -83.66487121582031, "loss": 0.4856, "rewards/chosen": -0.23568489154179892, "rewards/margins": 2.2081219951311746, "rewards/rejected": -2.4438068866729736, "step": 14119 }, { "epoch": 0.7484165054461612, "grad_norm": 77.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 45990425.6, "logits/rejected": -32200485.333333332, "logps/chosen": -318.11728515625, "logps/rejected": -306.7816569010417, "loss": 0.3782, "rewards/chosen": 0.3021167039871216, "rewards/margins": 1.4726767460505168, "rewards/rejected": -1.1705600420633953, "step": 14120 }, { "epoch": 0.7484695094479633, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15082090.666666666, "logits/rejected": -45743904.0, "logps/chosen": -518.6610514322916, "logps/rejected": -439.965771484375, "loss": 0.1973, "rewards/chosen": 0.7164067427317301, "rewards/margins": 3.501815525690714, "rewards/rejected": -2.785408782958984, "step": 14121 }, { "epoch": 0.7485225134497655, "grad_norm": 48.5, "kl": 0.93865966796875, "learning_rate": 5e-07, "logits/chosen": -39446197.333333336, "logits/rejected": -12500856.0, "logps/chosen": -316.14361572265625, "logps/rejected": -228.9529052734375, "loss": 0.2578, "rewards/chosen": 0.5506952603658041, "rewards/margins": 2.714887650807699, "rewards/rejected": -2.1641923904418947, "step": 14122 }, { "epoch": 0.7485755174515676, "grad_norm": 81.5, "kl": 2.761343002319336, "learning_rate": 5e-07, "logits/chosen": -9490233.6, "logits/rejected": 8560122.666666666, "logps/chosen": -396.768896484375, "logps/rejected": -595.6818033854166, "loss": 0.2941, "rewards/chosen": 1.003587532043457, "rewards/margins": 2.756722259521484, "rewards/rejected": -1.7531347274780273, "step": 14123 }, { "epoch": 0.7486285214533698, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10853792.666666666, "logits/rejected": -23148998.4, "logps/chosen": -219.50504557291666, "logps/rejected": -246.713134765625, "loss": 0.2691, "rewards/chosen": -0.07694536944230397, "rewards/margins": 2.3773696253697074, "rewards/rejected": -2.4543149948120115, "step": 14124 }, { "epoch": 0.7486815254551719, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13629168.0, "logits/rejected": -23852938.0, "logps/chosen": -221.70838928222656, "logps/rejected": -340.7930603027344, "loss": 0.2423, "rewards/chosen": 0.37659722566604614, "rewards/margins": 3.303009569644928, "rewards/rejected": -2.926412343978882, "step": 14125 }, { "epoch": 0.748734529456974, "grad_norm": 63.5, "kl": 0.1603870391845703, "learning_rate": 5e-07, "logits/chosen": -69875605.33333333, "logits/rejected": 18416620.0, "logps/chosen": -339.17055257161456, "logps/rejected": -191.5230712890625, "loss": 0.4504, "rewards/chosen": -0.029623548189798992, "rewards/margins": 0.9894247849782308, "rewards/rejected": -1.0190483331680298, "step": 14126 }, { "epoch": 0.7487875334587761, "grad_norm": 44.75, "kl": 3.4701499938964844, "learning_rate": 5e-07, "logits/chosen": -7160516.8, "logits/rejected": -6620968.0, "logps/chosen": -156.9094970703125, "logps/rejected": -132.36536661783853, "loss": 0.365, "rewards/chosen": 0.596946382522583, "rewards/margins": 1.7063708146413168, "rewards/rejected": -1.1094244321187336, "step": 14127 }, { "epoch": 0.7488405374605783, "grad_norm": 41.0, "kl": 1.580718994140625, "learning_rate": 5e-07, "logits/chosen": -16748917.333333334, "logits/rejected": -14131558.4, "logps/chosen": -441.0340983072917, "logps/rejected": -157.66851806640625, "loss": 0.1707, "rewards/chosen": 1.4829589525858562, "rewards/margins": 3.472671095530192, "rewards/rejected": -1.9897121429443358, "step": 14128 }, { "epoch": 0.7488935414623804, "grad_norm": 41.5, "kl": 1.6909637451171875, "learning_rate": 5e-07, "logits/chosen": -8513530.0, "logits/rejected": -42521616.0, "logps/chosen": -176.1869354248047, "logps/rejected": -303.11553955078125, "loss": 0.3322, "rewards/chosen": -0.21779517829418182, "rewards/margins": 2.81473608314991, "rewards/rejected": -3.032531261444092, "step": 14129 }, { "epoch": 0.7489465454641826, "grad_norm": 43.0, "kl": 1.547698974609375, "learning_rate": 5e-07, "logits/chosen": -40380824.0, "logits/rejected": -43667328.0, "logps/chosen": -367.6947021484375, "logps/rejected": -428.1585693359375, "loss": 0.2215, "rewards/chosen": 0.8075107336044312, "rewards/margins": 4.357118010520935, "rewards/rejected": -3.549607276916504, "step": 14130 }, { "epoch": 0.7489995494659847, "grad_norm": 45.0, "kl": 1.5902233123779297, "learning_rate": 5e-07, "logits/chosen": -27727588.0, "logits/rejected": -11489470.0, "logps/chosen": -294.42010498046875, "logps/rejected": -430.8134765625, "loss": 0.3075, "rewards/chosen": 0.3625478744506836, "rewards/margins": 2.2624435424804688, "rewards/rejected": -1.8998956680297852, "step": 14131 }, { "epoch": 0.7490525534677868, "grad_norm": 43.75, "kl": 2.0451316833496094, "learning_rate": 5e-07, "logits/chosen": -15855399.0, "logits/rejected": -58069592.0, "logps/chosen": -745.9751586914062, "logps/rejected": -327.7242126464844, "loss": 0.2584, "rewards/chosen": 0.7394741773605347, "rewards/margins": 3.1174176931381226, "rewards/rejected": -2.377943515777588, "step": 14132 }, { "epoch": 0.749105557469589, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26627203.2, "logits/rejected": -18503145.333333332, "logps/chosen": -206.140478515625, "logps/rejected": -359.3740234375, "loss": 0.3091, "rewards/chosen": 0.13570184707641603, "rewards/margins": 3.2058051745096843, "rewards/rejected": -3.070103327433268, "step": 14133 }, { "epoch": 0.749158561471391, "grad_norm": 59.75, "kl": 1.3542003631591797, "learning_rate": 5e-07, "logits/chosen": -23904716.8, "logits/rejected": -30956048.0, "logps/chosen": -219.2544677734375, "logps/rejected": -635.3129069010416, "loss": 0.3951, "rewards/chosen": -0.1278616428375244, "rewards/margins": 2.921732600529989, "rewards/rejected": -3.049594243367513, "step": 14134 }, { "epoch": 0.7492115654731932, "grad_norm": 51.0, "kl": 2.8789138793945312, "learning_rate": 5e-07, "logits/chosen": -6979050.0, "logits/rejected": -17882812.0, "logps/chosen": -138.3037109375, "logps/rejected": -250.83975219726562, "loss": 0.3964, "rewards/chosen": 0.08965131640434265, "rewards/margins": 2.401614397764206, "rewards/rejected": -2.3119630813598633, "step": 14135 }, { "epoch": 0.7492645694749953, "grad_norm": 45.75, "kl": 1.1254043579101562, "learning_rate": 5e-07, "logits/chosen": -39736256.0, "logits/rejected": -33283262.0, "logps/chosen": -287.58160400390625, "logps/rejected": -379.09698486328125, "loss": 0.1816, "rewards/chosen": 1.3342856168746948, "rewards/margins": 3.621111273765564, "rewards/rejected": -2.286825656890869, "step": 14136 }, { "epoch": 0.7493175734767975, "grad_norm": 70.5, "kl": 2.751702308654785, "learning_rate": 5e-07, "logits/chosen": -11123202.0, "logits/rejected": -9986152.0, "logps/chosen": -265.9148864746094, "logps/rejected": -365.4091491699219, "loss": 0.2623, "rewards/chosen": 0.9450498819351196, "rewards/margins": 3.381271481513977, "rewards/rejected": -2.4362215995788574, "step": 14137 }, { "epoch": 0.7493705774785996, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3664439.6666666665, "logits/rejected": -46131232.0, "logps/chosen": -130.5478515625, "logps/rejected": -412.28388671875, "loss": 0.237, "rewards/chosen": 0.7928628921508789, "rewards/margins": 2.870220756530762, "rewards/rejected": -2.077357864379883, "step": 14138 }, { "epoch": 0.7494235814804018, "grad_norm": 29.875, "kl": 0.25077247619628906, "learning_rate": 5e-07, "logits/chosen": -9498564.0, "logits/rejected": -37515290.666666664, "logps/chosen": -141.54161071777344, "logps/rejected": -305.9512939453125, "loss": 0.2249, "rewards/chosen": -0.48565903306007385, "rewards/margins": 2.969109147787094, "rewards/rejected": -3.454768180847168, "step": 14139 }, { "epoch": 0.7494765854822039, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55970604.0, "logits/rejected": -8289034.666666667, "logps/chosen": -345.903564453125, "logps/rejected": -317.7544352213542, "loss": 0.258, "rewards/chosen": 0.2154388427734375, "rewards/margins": 2.017553647359212, "rewards/rejected": -1.8021148045857747, "step": 14140 }, { "epoch": 0.749529589484006, "grad_norm": 33.0, "kl": 1.3335342407226562, "learning_rate": 5e-07, "logits/chosen": 2014551.5, "logits/rejected": -12171825.333333334, "logps/chosen": -28.312559127807617, "logps/rejected": -208.51318359375, "loss": 0.2389, "rewards/chosen": 0.10658597946166992, "rewards/margins": 2.183895746866862, "rewards/rejected": -2.077309767405192, "step": 14141 }, { "epoch": 0.7495825934858081, "grad_norm": 52.5, "kl": 1.750101089477539, "learning_rate": 5e-07, "logits/chosen": -48825712.0, "logits/rejected": -3639177.6, "logps/chosen": -672.81298828125, "logps/rejected": -182.3386474609375, "loss": 0.1973, "rewards/chosen": 0.8618062337239584, "rewards/margins": 3.286652692159017, "rewards/rejected": -2.4248464584350584, "step": 14142 }, { "epoch": 0.7496355974876103, "grad_norm": 41.75, "kl": 2.121278762817383, "learning_rate": 5e-07, "logits/chosen": -24032244.0, "logits/rejected": -34568592.0, "logps/chosen": -213.42762756347656, "logps/rejected": -472.09979248046875, "loss": 0.3034, "rewards/chosen": 0.5833563208580017, "rewards/margins": 2.900082290172577, "rewards/rejected": -2.316725969314575, "step": 14143 }, { "epoch": 0.7496886014894124, "grad_norm": 44.5, "kl": 0.7242536544799805, "learning_rate": 5e-07, "logits/chosen": -9135759.0, "logits/rejected": -19898540.0, "logps/chosen": -218.0106201171875, "logps/rejected": -180.4398193359375, "loss": 0.2675, "rewards/chosen": 0.9509448409080505, "rewards/margins": 3.386014401912689, "rewards/rejected": -2.4350695610046387, "step": 14144 }, { "epoch": 0.7497416054912146, "grad_norm": 42.0, "kl": 0.17287826538085938, "learning_rate": 5e-07, "logits/chosen": 4438357.0, "logits/rejected": -34162194.666666664, "logps/chosen": -165.0727996826172, "logps/rejected": -390.1580403645833, "loss": 0.1695, "rewards/chosen": 0.5212342143058777, "rewards/margins": 3.0750640432039895, "rewards/rejected": -2.553829828898112, "step": 14145 }, { "epoch": 0.7497946094930167, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6339726.666666667, "logits/rejected": 6239917.6, "logps/chosen": -673.9883626302084, "logps/rejected": -230.1518310546875, "loss": 0.2056, "rewards/chosen": 1.8292198181152344, "rewards/margins": 3.650528335571289, "rewards/rejected": -1.8213085174560546, "step": 14146 }, { "epoch": 0.7498476134948189, "grad_norm": 67.5, "kl": 1.7757911682128906, "learning_rate": 5e-07, "logits/chosen": -67542028.8, "logits/rejected": -13162890.666666666, "logps/chosen": -558.8255859375, "logps/rejected": -132.8681844075521, "loss": 0.3559, "rewards/chosen": 0.7751635551452637, "rewards/margins": 1.505103095372518, "rewards/rejected": -0.7299395402272543, "step": 14147 }, { "epoch": 0.749900617496621, "grad_norm": 47.0, "kl": 0.9181480407714844, "learning_rate": 5e-07, "logits/chosen": -35215112.0, "logits/rejected": 2613023.6, "logps/chosen": -213.94879150390625, "logps/rejected": -132.43497314453126, "loss": 0.2113, "rewards/chosen": 0.8141164779663086, "rewards/margins": 3.0429563522338867, "rewards/rejected": -2.228839874267578, "step": 14148 }, { "epoch": 0.7499536214984232, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19491891.2, "logits/rejected": -10901358.666666666, "logps/chosen": -124.79512939453124, "logps/rejected": -146.8933308919271, "loss": 0.3503, "rewards/chosen": 0.29927637577056887, "rewards/margins": 1.611977283159892, "rewards/rejected": -1.312700907389323, "step": 14149 }, { "epoch": 0.7500066255002252, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14231234.666666666, "logits/rejected": -12462567.2, "logps/chosen": -204.03946940104166, "logps/rejected": -452.549755859375, "loss": 0.2015, "rewards/chosen": 0.7456972599029541, "rewards/margins": 3.6132997989654543, "rewards/rejected": -2.8676025390625, "step": 14150 }, { "epoch": 0.7500596295020274, "grad_norm": 69.0, "kl": 2.417387008666992, "learning_rate": 5e-07, "logits/chosen": -57759040.0, "logits/rejected": 29663096.0, "logps/chosen": -304.14432198660717, "logps/rejected": -257.95208740234375, "loss": 0.3954, "rewards/chosen": 0.654407228742327, "rewards/margins": 1.5194233911378043, "rewards/rejected": -0.8650161623954773, "step": 14151 }, { "epoch": 0.7501126335038295, "grad_norm": 83.5, "kl": 1.4979400634765625, "learning_rate": 5e-07, "logits/chosen": -21716608.0, "logits/rejected": -47617834.666666664, "logps/chosen": -429.68349609375, "logps/rejected": -397.7134195963542, "loss": 0.2311, "rewards/chosen": 1.2652776718139649, "rewards/margins": 3.5713104883829754, "rewards/rejected": -2.3060328165690103, "step": 14152 }, { "epoch": 0.7501656375056317, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28724482.0, "logits/rejected": -58462074.666666664, "logps/chosen": -219.65963745117188, "logps/rejected": -456.624267578125, "loss": 0.1618, "rewards/chosen": 0.27268752455711365, "rewards/margins": 3.097714990377426, "rewards/rejected": -2.8250274658203125, "step": 14153 }, { "epoch": 0.7502186415074338, "grad_norm": 42.25, "kl": 0.6021566390991211, "learning_rate": 5e-07, "logits/chosen": 7162370.666666667, "logits/rejected": -4465414.0, "logps/chosen": -185.63655598958334, "logps/rejected": -86.162890625, "loss": 0.2788, "rewards/chosen": 0.7521680196126302, "rewards/margins": 2.324273618062337, "rewards/rejected": -1.572105598449707, "step": 14154 }, { "epoch": 0.750271645509236, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17442875.2, "logits/rejected": -3378116.3333333335, "logps/chosen": -265.3859130859375, "logps/rejected": -191.3053995768229, "loss": 0.3636, "rewards/chosen": 0.06630245447158814, "rewards/margins": 2.106606638431549, "rewards/rejected": -2.040304183959961, "step": 14155 }, { "epoch": 0.750324649511038, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1434533.0, "logits/rejected": -42003789.333333336, "logps/chosen": -209.5633544921875, "logps/rejected": -221.01566569010416, "loss": 0.1767, "rewards/chosen": 1.1001085042953491, "rewards/margins": 3.2976983785629272, "rewards/rejected": -2.197589874267578, "step": 14156 }, { "epoch": 0.7503776535128402, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49979576.0, "logits/rejected": 4468062.0, "logps/chosen": -436.767333984375, "logps/rejected": -321.7156066894531, "loss": 0.2746, "rewards/chosen": 0.6693024039268494, "rewards/margins": 2.539642035961151, "rewards/rejected": -1.8703396320343018, "step": 14157 }, { "epoch": 0.7504306575146423, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51618144.0, "logits/rejected": -23351606.4, "logps/chosen": -308.6997477213542, "logps/rejected": -515.66904296875, "loss": 0.2019, "rewards/chosen": 0.6752594312032064, "rewards/margins": 3.1756840070088708, "rewards/rejected": -2.5004245758056642, "step": 14158 }, { "epoch": 0.7504836615164445, "grad_norm": 31.125, "kl": 0.04467964172363281, "learning_rate": 5e-07, "logits/chosen": -30576614.0, "logits/rejected": -49061962.666666664, "logps/chosen": -94.27557373046875, "logps/rejected": -465.2960611979167, "loss": 0.2119, "rewards/chosen": 0.4671753942966461, "rewards/margins": 2.866333077351252, "rewards/rejected": -2.399157683054606, "step": 14159 }, { "epoch": 0.7505366655182466, "grad_norm": 47.5, "kl": 0.12105655670166016, "learning_rate": 5e-07, "logits/chosen": -18230742.4, "logits/rejected": -5739848.0, "logps/chosen": -307.4232666015625, "logps/rejected": -93.45399983723958, "loss": 0.2394, "rewards/chosen": 0.6363548755645752, "rewards/margins": 4.25361803372701, "rewards/rejected": -3.617263158162435, "step": 14160 }, { "epoch": 0.7505896695200488, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5402958.666666667, "logits/rejected": -14212398.4, "logps/chosen": -203.33878580729166, "logps/rejected": -166.860986328125, "loss": 0.2859, "rewards/chosen": -0.026448885599772137, "rewards/margins": 2.7421222050984704, "rewards/rejected": -2.7685710906982424, "step": 14161 }, { "epoch": 0.7506426735218509, "grad_norm": 57.25, "kl": 2.663106918334961, "learning_rate": 5e-07, "logits/chosen": -15094588.0, "logits/rejected": -33047308.0, "logps/chosen": -304.8775634765625, "logps/rejected": -94.2951889038086, "loss": 0.3821, "rewards/chosen": 0.5921437740325928, "rewards/margins": 3.1576523780822754, "rewards/rejected": -2.5655086040496826, "step": 14162 }, { "epoch": 0.7506956775236531, "grad_norm": 57.25, "kl": 4.413539886474609, "learning_rate": 5e-07, "logits/chosen": 24573624.0, "logits/rejected": 3902054.0, "logps/chosen": -268.5089111328125, "logps/rejected": -211.21408081054688, "loss": 0.3305, "rewards/chosen": 1.1414947509765625, "rewards/margins": 2.8674798011779785, "rewards/rejected": -1.725985050201416, "step": 14163 }, { "epoch": 0.7507486815254552, "grad_norm": 54.25, "kl": 1.5194711685180664, "learning_rate": 5e-07, "logits/chosen": -390505.5, "logits/rejected": 18040542.0, "logps/chosen": -147.34857177734375, "logps/rejected": -88.80918884277344, "loss": 0.3665, "rewards/chosen": 0.8232738971710205, "rewards/margins": 1.1885541677474976, "rewards/rejected": -0.36528027057647705, "step": 14164 }, { "epoch": 0.7508016855272573, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62860316.0, "logits/rejected": -20287117.333333332, "logps/chosen": -660.4415283203125, "logps/rejected": -310.3380940755208, "loss": 0.1152, "rewards/chosen": 1.0540587902069092, "rewards/margins": 4.469651301701864, "rewards/rejected": -3.4155925114949546, "step": 14165 }, { "epoch": 0.7508546895290594, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33947104.0, "logits/rejected": -27024012.8, "logps/chosen": -220.9399210611979, "logps/rejected": -78.2846923828125, "loss": 0.345, "rewards/chosen": -0.34858906269073486, "rewards/margins": 1.3518912076950074, "rewards/rejected": -1.7004802703857422, "step": 14166 }, { "epoch": 0.7509076935308616, "grad_norm": 39.0, "kl": 1.2537946701049805, "learning_rate": 5e-07, "logits/chosen": -18166136.0, "logits/rejected": -35191436.0, "logps/chosen": -321.6788330078125, "logps/rejected": -549.606201171875, "loss": 0.2364, "rewards/chosen": 0.8560927510261536, "rewards/margins": 4.104269802570343, "rewards/rejected": -3.2481770515441895, "step": 14167 }, { "epoch": 0.7509606975326637, "grad_norm": 56.75, "kl": 4.661650657653809, "learning_rate": 5e-07, "logits/chosen": -18265482.666666668, "logits/rejected": -28310412.8, "logps/chosen": -197.02703857421875, "logps/rejected": -547.4240234375, "loss": 0.3496, "rewards/chosen": 0.8229853312174479, "rewards/margins": 2.284467856089274, "rewards/rejected": -1.4614825248718262, "step": 14168 }, { "epoch": 0.7510137015344659, "grad_norm": 41.75, "kl": 2.1512069702148438, "learning_rate": 5e-07, "logits/chosen": -37219028.0, "logits/rejected": -13762878.0, "logps/chosen": -304.1113586425781, "logps/rejected": -265.2431640625, "loss": 0.3126, "rewards/chosen": 0.9607855081558228, "rewards/margins": 3.47506844997406, "rewards/rejected": -2.5142829418182373, "step": 14169 }, { "epoch": 0.751066705536268, "grad_norm": 47.75, "kl": 0.20784378051757812, "learning_rate": 5e-07, "logits/chosen": -31957102.0, "logits/rejected": -36468992.0, "logps/chosen": -586.1267700195312, "logps/rejected": -439.3360900878906, "loss": 0.2131, "rewards/chosen": 0.8623878359794617, "rewards/margins": 3.654566466808319, "rewards/rejected": -2.7921786308288574, "step": 14170 }, { "epoch": 0.7511197095380702, "grad_norm": 54.5, "kl": 2.7352142333984375, "learning_rate": 5e-07, "logits/chosen": -21450574.0, "logits/rejected": -39418544.0, "logps/chosen": -412.23760986328125, "logps/rejected": -397.3331604003906, "loss": 0.2544, "rewards/chosen": 0.8606849908828735, "rewards/margins": 3.321487545967102, "rewards/rejected": -2.4608025550842285, "step": 14171 }, { "epoch": 0.7511727135398722, "grad_norm": 64.0, "kl": 0.18270111083984375, "learning_rate": 5e-07, "logits/chosen": -66577744.0, "logits/rejected": 5087451.0, "logps/chosen": -588.5650024414062, "logps/rejected": -210.2254180908203, "loss": 0.3465, "rewards/chosen": 0.631848931312561, "rewards/margins": 1.869961142539978, "rewards/rejected": -1.238112211227417, "step": 14172 }, { "epoch": 0.7512257175416744, "grad_norm": 35.0, "kl": 2.484421730041504, "learning_rate": 5e-07, "logits/chosen": -10205361.0, "logits/rejected": -37416112.0, "logps/chosen": -126.99592590332031, "logps/rejected": -341.421875, "loss": 0.2207, "rewards/chosen": 1.072885513305664, "rewards/margins": 3.180483818054199, "rewards/rejected": -2.107598304748535, "step": 14173 }, { "epoch": 0.7512787215434765, "grad_norm": 45.75, "kl": 1.3665962219238281, "learning_rate": 5e-07, "logits/chosen": -61204774.4, "logits/rejected": -42965509.333333336, "logps/chosen": -284.7535888671875, "logps/rejected": -683.2000325520834, "loss": 0.347, "rewards/chosen": 0.11212357282638549, "rewards/margins": 3.679256204764048, "rewards/rejected": -3.5671326319376626, "step": 14174 }, { "epoch": 0.7513317255452787, "grad_norm": 47.0, "kl": 2.834930419921875, "learning_rate": 5e-07, "logits/chosen": -64806300.0, "logits/rejected": -63159424.0, "logps/chosen": -234.80966186523438, "logps/rejected": -440.1325988769531, "loss": 0.2566, "rewards/chosen": 0.7661738991737366, "rewards/margins": 3.0563300251960754, "rewards/rejected": -2.290156126022339, "step": 14175 }, { "epoch": 0.7513847295470808, "grad_norm": 37.75, "kl": 2.532815933227539, "learning_rate": 5e-07, "logits/chosen": -9740152.0, "logits/rejected": -23215124.0, "logps/chosen": -127.14109802246094, "logps/rejected": -390.9173583984375, "loss": 0.324, "rewards/chosen": 0.22578011453151703, "rewards/margins": 4.31096325814724, "rewards/rejected": -4.085183143615723, "step": 14176 }, { "epoch": 0.751437733548883, "grad_norm": 44.25, "kl": 0.13797664642333984, "learning_rate": 5e-07, "logits/chosen": -18372942.0, "logits/rejected": 120478504.0, "logps/chosen": -293.01727294921875, "logps/rejected": -358.2369384765625, "loss": 0.3024, "rewards/chosen": 0.2553401291370392, "rewards/margins": 2.467422991991043, "rewards/rejected": -2.212082862854004, "step": 14177 }, { "epoch": 0.7514907375506851, "grad_norm": 54.25, "kl": 1.3203182220458984, "learning_rate": 5e-07, "logits/chosen": -6793576.0, "logits/rejected": -51805032.0, "logps/chosen": -397.6546325683594, "logps/rejected": -342.35528564453125, "loss": 0.3269, "rewards/chosen": 0.17779769003391266, "rewards/margins": 2.4203044325113297, "rewards/rejected": -2.242506742477417, "step": 14178 }, { "epoch": 0.7515437415524873, "grad_norm": 53.0, "kl": 7.418932914733887, "learning_rate": 5e-07, "logits/chosen": -29072636.8, "logits/rejected": -57778165.333333336, "logps/chosen": -409.79990234375, "logps/rejected": -378.9888509114583, "loss": 0.3941, "rewards/chosen": 0.756423044204712, "rewards/margins": 2.117898670832316, "rewards/rejected": -1.3614756266276042, "step": 14179 }, { "epoch": 0.7515967455542893, "grad_norm": 72.5, "kl": 1.9200000762939453, "learning_rate": 5e-07, "logits/chosen": -3796236.8, "logits/rejected": -15164822.666666666, "logps/chosen": -398.209912109375, "logps/rejected": -292.8420817057292, "loss": 0.3316, "rewards/chosen": 0.5022344589233398, "rewards/margins": 2.7856222788492837, "rewards/rejected": -2.283387819925944, "step": 14180 }, { "epoch": 0.7516497495560915, "grad_norm": 39.5, "kl": 0.37535667419433594, "learning_rate": 5e-07, "logits/chosen": -44217605.333333336, "logits/rejected": -28177750.4, "logps/chosen": -409.995361328125, "logps/rejected": -171.33914794921876, "loss": 0.3001, "rewards/chosen": -0.12223585446675618, "rewards/margins": 2.2297098318735755, "rewards/rejected": -2.351945686340332, "step": 14181 }, { "epoch": 0.7517027535578936, "grad_norm": 45.0, "kl": 2.5934810638427734, "learning_rate": 5e-07, "logits/chosen": 1687821.1666666667, "logits/rejected": -1030357.6, "logps/chosen": -60.34748331705729, "logps/rejected": -301.318994140625, "loss": 0.2727, "rewards/chosen": 0.13906834522883096, "rewards/margins": 3.4659837047259012, "rewards/rejected": -3.3269153594970704, "step": 14182 }, { "epoch": 0.7517557575596957, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11390556.0, "logits/rejected": -46329920.0, "logps/chosen": -293.5475769042969, "logps/rejected": -509.3537902832031, "loss": 0.2682, "rewards/chosen": 0.14864087104797363, "rewards/margins": 3.2684426307678223, "rewards/rejected": -3.1198017597198486, "step": 14183 }, { "epoch": 0.7518087615614979, "grad_norm": 33.25, "kl": 0.8842582702636719, "learning_rate": 5e-07, "logits/chosen": -8808666.0, "logits/rejected": -57714420.0, "logps/chosen": -610.2030029296875, "logps/rejected": -479.08807373046875, "loss": 0.209, "rewards/chosen": 1.0106427669525146, "rewards/margins": 4.6563475131988525, "rewards/rejected": -3.645704746246338, "step": 14184 }, { "epoch": 0.7518617655633, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5551438.0, "logits/rejected": -59751475.2, "logps/chosen": -52.677876790364586, "logps/rejected": -360.013671875, "loss": 0.3122, "rewards/chosen": -0.15908255179723105, "rewards/margins": 2.312083868185679, "rewards/rejected": -2.47116641998291, "step": 14185 }, { "epoch": 0.7519147695651022, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28293760.0, "logits/rejected": -30149635.2, "logps/chosen": -436.9315592447917, "logps/rejected": -235.8294677734375, "loss": 0.2263, "rewards/chosen": 0.21901257832845053, "rewards/margins": 2.9828059514363607, "rewards/rejected": -2.76379337310791, "step": 14186 }, { "epoch": 0.7519677735669043, "grad_norm": 27.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27427380.0, "logits/rejected": -7268534.666666667, "logps/chosen": -95.59603881835938, "logps/rejected": -358.184326171875, "loss": 0.1483, "rewards/chosen": 0.7874796390533447, "rewards/margins": 3.6980342070261636, "rewards/rejected": -2.910554567972819, "step": 14187 }, { "epoch": 0.7520207775687064, "grad_norm": 44.0, "kl": 1.2807159423828125, "learning_rate": 5e-07, "logits/chosen": 6334708.666666667, "logits/rejected": 1938972.6, "logps/chosen": -124.66732788085938, "logps/rejected": -539.327001953125, "loss": 0.3227, "rewards/chosen": -0.046939343214035034, "rewards/margins": 2.852655917406082, "rewards/rejected": -2.899595260620117, "step": 14188 }, { "epoch": 0.7520737815705085, "grad_norm": 32.5, "kl": 1.978926658630371, "learning_rate": 5e-07, "logits/chosen": 10437798.666666666, "logits/rejected": -79678598.4, "logps/chosen": -65.22908528645833, "logps/rejected": -419.791748046875, "loss": 0.1678, "rewards/chosen": 1.0963919162750244, "rewards/margins": 4.313649606704712, "rewards/rejected": -3.2172576904296877, "step": 14189 }, { "epoch": 0.7521267855723107, "grad_norm": 46.0, "kl": 1.1154956817626953, "learning_rate": 5e-07, "logits/chosen": 3369658.0, "logits/rejected": -22232000.0, "logps/chosen": -262.62689208984375, "logps/rejected": -299.42840576171875, "loss": 0.2509, "rewards/chosen": 1.2589612007141113, "rewards/margins": 3.609523057937622, "rewards/rejected": -2.3505618572235107, "step": 14190 }, { "epoch": 0.7521797895741128, "grad_norm": 57.25, "kl": 1.0774250030517578, "learning_rate": 5e-07, "logits/chosen": -24426581.333333332, "logits/rejected": -33630520.0, "logps/chosen": -249.5369873046875, "logps/rejected": -609.3072509765625, "loss": 0.355, "rewards/chosen": 0.34395356973012287, "rewards/margins": 3.8253836234410605, "rewards/rejected": -3.4814300537109375, "step": 14191 }, { "epoch": 0.752232793575915, "grad_norm": 51.0, "kl": 0.7125225067138672, "learning_rate": 5e-07, "logits/chosen": -37105848.0, "logits/rejected": -7378691.0, "logps/chosen": -346.3551025390625, "logps/rejected": -145.89016723632812, "loss": 0.2925, "rewards/chosen": 0.17787352204322815, "rewards/margins": 2.8649963438510895, "rewards/rejected": -2.6871228218078613, "step": 14192 }, { "epoch": 0.7522857975777171, "grad_norm": 40.5, "kl": 0.8335447311401367, "learning_rate": 5e-07, "logits/chosen": -52824172.0, "logits/rejected": -72584021.33333333, "logps/chosen": -144.4636993408203, "logps/rejected": -245.535400390625, "loss": 0.2495, "rewards/chosen": 0.2639608681201935, "rewards/margins": 2.065567682186763, "rewards/rejected": -1.801606814066569, "step": 14193 }, { "epoch": 0.7523388015795193, "grad_norm": 37.75, "kl": 1.2376899719238281, "learning_rate": 5e-07, "logits/chosen": -65958608.0, "logits/rejected": -17046244.8, "logps/chosen": -696.4637858072916, "logps/rejected": -223.1086181640625, "loss": 0.1262, "rewards/chosen": 1.444481372833252, "rewards/margins": 5.032928562164306, "rewards/rejected": -3.5884471893310548, "step": 14194 }, { "epoch": 0.7523918055813213, "grad_norm": 51.5, "kl": 3.2944412231445312, "learning_rate": 5e-07, "logits/chosen": -23364632.0, "logits/rejected": -92163256.0, "logps/chosen": -620.2646484375, "logps/rejected": -669.2521362304688, "loss": 0.2144, "rewards/chosen": 1.7654611269632976, "rewards/margins": 3.8484002749125166, "rewards/rejected": -2.0829391479492188, "step": 14195 }, { "epoch": 0.7524448095831235, "grad_norm": 49.25, "kl": 2.5127811431884766, "learning_rate": 5e-07, "logits/chosen": -11892751.2, "logits/rejected": -17260753.333333332, "logps/chosen": -260.82412109375, "logps/rejected": -403.009521484375, "loss": 0.3286, "rewards/chosen": 0.497640323638916, "rewards/margins": 3.2294645627339684, "rewards/rejected": -2.7318242390950522, "step": 14196 }, { "epoch": 0.7524978135849256, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15563181.333333334, "logits/rejected": -32168380.8, "logps/chosen": -216.46136474609375, "logps/rejected": -339.0490234375, "loss": 0.238, "rewards/chosen": -0.041631569465001426, "rewards/margins": 3.2047177652517953, "rewards/rejected": -3.246349334716797, "step": 14197 }, { "epoch": 0.7525508175867278, "grad_norm": 58.0, "kl": 0.8301525115966797, "learning_rate": 5e-07, "logits/chosen": -59410725.333333336, "logits/rejected": -7182514.0, "logps/chosen": -235.5025634765625, "logps/rejected": -142.8235321044922, "loss": 0.4236, "rewards/chosen": 0.0486914316813151, "rewards/margins": 1.7619728247324626, "rewards/rejected": -1.7132813930511475, "step": 14198 }, { "epoch": 0.7526038215885299, "grad_norm": 43.75, "kl": 2.7605514526367188, "learning_rate": 5e-07, "logits/chosen": -10477028.0, "logits/rejected": -42404890.666666664, "logps/chosen": -184.10340576171876, "logps/rejected": -403.6338704427083, "loss": 0.3054, "rewards/chosen": 0.6723249435424805, "rewards/margins": 3.5725909550984705, "rewards/rejected": -2.9002660115559897, "step": 14199 }, { "epoch": 0.7526568255903321, "grad_norm": 43.0, "kl": 0.3984088897705078, "learning_rate": 5e-07, "logits/chosen": -21111385.333333332, "logits/rejected": -16047228.0, "logps/chosen": -334.6487630208333, "logps/rejected": -250.64866638183594, "loss": 0.2638, "rewards/chosen": 0.8794504006703695, "rewards/margins": 3.2952233155568442, "rewards/rejected": -2.4157729148864746, "step": 14200 }, { "epoch": 0.7527098295921342, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -33560512.0, "logps/rejected": -348.2680969238281, "loss": 0.1621, "rewards/rejected": -2.1450324058532715, "step": 14201 }, { "epoch": 0.7527628335939364, "grad_norm": 47.25, "kl": 1.5646705627441406, "learning_rate": 5e-07, "logits/chosen": -64958090.666666664, "logits/rejected": -28980640.0, "logps/chosen": -531.4930419921875, "logps/rejected": -455.7837890625, "loss": 0.217, "rewards/chosen": 1.5197885831197102, "rewards/margins": 4.04730323155721, "rewards/rejected": -2.5275146484375, "step": 14202 }, { "epoch": 0.7528158375957384, "grad_norm": 58.0, "kl": 4.779156684875488, "learning_rate": 5e-07, "logits/chosen": -14558065.333333334, "logits/rejected": -39948596.0, "logps/chosen": -308.9910888671875, "logps/rejected": -534.8304443359375, "loss": 0.319, "rewards/chosen": 1.0509045918782551, "rewards/margins": 3.758560021718343, "rewards/rejected": -2.707655429840088, "step": 14203 }, { "epoch": 0.7528688415975406, "grad_norm": 36.25, "kl": 0.8743247985839844, "learning_rate": 5e-07, "logits/chosen": -8033295.0, "logits/rejected": -27468196.57142857, "logps/chosen": -302.2043762207031, "logps/rejected": -333.38657924107144, "loss": 0.1462, "rewards/chosen": 1.8064240217208862, "rewards/margins": 4.087368845939636, "rewards/rejected": -2.28094482421875, "step": 14204 }, { "epoch": 0.7529218455993427, "grad_norm": 70.5, "kl": 0.9724044799804688, "learning_rate": 5e-07, "logits/chosen": -24670724.8, "logits/rejected": -57058656.0, "logps/chosen": -356.0339111328125, "logps/rejected": -255.36808268229166, "loss": 0.284, "rewards/chosen": 0.5691000938415527, "rewards/margins": 3.3529180208841955, "rewards/rejected": -2.783817927042643, "step": 14205 }, { "epoch": 0.7529748496011449, "grad_norm": 49.5, "kl": 3.648061752319336, "learning_rate": 5e-07, "logits/chosen": -16375222.4, "logits/rejected": -17431092.0, "logps/chosen": -267.6970947265625, "logps/rejected": -349.6800130208333, "loss": 0.3178, "rewards/chosen": 0.542622184753418, "rewards/margins": 3.6464439392089845, "rewards/rejected": -3.1038217544555664, "step": 14206 }, { "epoch": 0.753027853602947, "grad_norm": 50.75, "kl": 3.136113166809082, "learning_rate": 5e-07, "logits/chosen": -47814457.6, "logits/rejected": -90146176.0, "logps/chosen": -377.6437744140625, "logps/rejected": -516.240966796875, "loss": 0.266, "rewards/chosen": 1.0866355895996094, "rewards/margins": 3.4586362838745117, "rewards/rejected": -2.3720006942749023, "step": 14207 }, { "epoch": 0.7530808576047492, "grad_norm": 61.25, "kl": 4.035350799560547, "learning_rate": 5e-07, "logits/chosen": -43965045.333333336, "logits/rejected": -2700013.25, "logps/chosen": -422.2390950520833, "logps/rejected": -169.15353393554688, "loss": 0.3556, "rewards/chosen": 0.6552244027455648, "rewards/margins": 3.1481133302052817, "rewards/rejected": -2.492888927459717, "step": 14208 }, { "epoch": 0.7531338616065513, "grad_norm": 54.75, "kl": 5.576972961425781, "learning_rate": 5e-07, "logits/chosen": -39175858.666666664, "logits/rejected": -17696910.0, "logps/chosen": -346.8889973958333, "logps/rejected": -203.87986755371094, "loss": 0.4306, "rewards/chosen": 0.542086124420166, "rewards/margins": 2.7710647583007812, "rewards/rejected": -2.2289786338806152, "step": 14209 }, { "epoch": 0.7531868656083535, "grad_norm": 45.25, "kl": 1.0723686218261719, "learning_rate": 5e-07, "logits/chosen": -18631043.2, "logits/rejected": -6190681.333333333, "logps/chosen": -415.10888671875, "logps/rejected": -159.3210652669271, "loss": 0.315, "rewards/chosen": 0.9612292289733887, "rewards/margins": 2.7500154813130697, "rewards/rejected": -1.788786252339681, "step": 14210 }, { "epoch": 0.7532398696101555, "grad_norm": 70.5, "kl": 1.2591629028320312, "learning_rate": 5e-07, "logits/chosen": -26164314.666666668, "logits/rejected": -46151689.6, "logps/chosen": -375.1174723307292, "logps/rejected": -504.55830078125, "loss": 0.1681, "rewards/chosen": 1.1238800684611003, "rewards/margins": 4.040633837381999, "rewards/rejected": -2.9167537689208984, "step": 14211 }, { "epoch": 0.7532928736119577, "grad_norm": 46.5, "kl": 1.1161918640136719, "learning_rate": 5e-07, "logits/chosen": -91440408.0, "logits/rejected": -42801216.0, "logps/chosen": -445.70513916015625, "logps/rejected": -406.333740234375, "loss": 0.1834, "rewards/chosen": 1.3749901056289673, "rewards/margins": 3.552307963371277, "rewards/rejected": -2.1773178577423096, "step": 14212 }, { "epoch": 0.7533458776137598, "grad_norm": 65.0, "kl": 6.26988410949707, "learning_rate": 5e-07, "logits/chosen": -18339784.0, "logits/rejected": -38978296.0, "logps/chosen": -508.4341796875, "logps/rejected": -166.5488484700521, "loss": 0.2734, "rewards/chosen": 1.4926164627075196, "rewards/margins": 3.0658074696858724, "rewards/rejected": -1.5731910069783528, "step": 14213 }, { "epoch": 0.753398881615562, "grad_norm": 43.75, "kl": 0.3857574462890625, "learning_rate": 5e-07, "logits/chosen": -37195609.6, "logits/rejected": -22905557.333333332, "logps/chosen": -311.3136474609375, "logps/rejected": -203.3952840169271, "loss": 0.2185, "rewards/chosen": 0.9227962493896484, "rewards/margins": 5.141989390055339, "rewards/rejected": -4.21919314066569, "step": 14214 }, { "epoch": 0.7534518856173641, "grad_norm": 54.5, "kl": 0.15740966796875, "learning_rate": 5e-07, "logits/chosen": -99142387.2, "logits/rejected": -16957353.333333332, "logps/chosen": -336.0150390625, "logps/rejected": -256.9516194661458, "loss": 0.3296, "rewards/chosen": 0.14620559215545653, "rewards/margins": 3.7063105980555213, "rewards/rejected": -3.560105005900065, "step": 14215 }, { "epoch": 0.7535048896191663, "grad_norm": 43.0, "kl": 1.0959892272949219, "learning_rate": 5e-07, "logits/chosen": -20269596.0, "logits/rejected": -50450800.0, "logps/chosen": -255.57278442382812, "logps/rejected": -367.370849609375, "loss": 0.13, "rewards/chosen": 1.3703696727752686, "rewards/margins": 4.290281057357788, "rewards/rejected": -2.9199113845825195, "step": 14216 }, { "epoch": 0.7535578936209684, "grad_norm": 67.5, "kl": 0.8227100372314453, "learning_rate": 5e-07, "logits/chosen": -76144204.8, "logits/rejected": -25100365.333333332, "logps/chosen": -401.696044921875, "logps/rejected": -120.58695475260417, "loss": 0.3886, "rewards/chosen": -0.009730207920074462, "rewards/margins": 2.2607116421063744, "rewards/rejected": -2.2704418500264487, "step": 14217 }, { "epoch": 0.7536108976227706, "grad_norm": 28.125, "kl": 0.7767095565795898, "learning_rate": 5e-07, "logits/chosen": -7221852.5, "logits/rejected": -8017834.5, "logps/chosen": -100.54452514648438, "logps/rejected": -184.3346710205078, "loss": 0.2262, "rewards/chosen": 1.1128787994384766, "rewards/margins": 3.4770050048828125, "rewards/rejected": -2.364126205444336, "step": 14218 }, { "epoch": 0.7536639016245726, "grad_norm": 62.25, "kl": 2.502635955810547, "learning_rate": 5e-07, "logits/chosen": -17605474.0, "logits/rejected": -2441230.5, "logps/chosen": -336.5759582519531, "logps/rejected": -228.12551879882812, "loss": 0.2277, "rewards/chosen": 0.8051807880401611, "rewards/margins": 4.575784921646118, "rewards/rejected": -3.770604133605957, "step": 14219 }, { "epoch": 0.7537169056263748, "grad_norm": 49.75, "kl": 1.2174034118652344, "learning_rate": 5e-07, "logits/chosen": -34945160.0, "logits/rejected": -1854568.4, "logps/chosen": -571.9493408203125, "logps/rejected": -215.35908203125, "loss": 0.2586, "rewards/chosen": 0.4298590421676636, "rewards/margins": 2.333413004875183, "rewards/rejected": -1.9035539627075195, "step": 14220 }, { "epoch": 0.7537699096281769, "grad_norm": 38.5, "kl": 0.7029628753662109, "learning_rate": 5e-07, "logits/chosen": -62431385.6, "logits/rejected": -49220368.0, "logps/chosen": -368.981640625, "logps/rejected": -271.6924641927083, "loss": 0.1875, "rewards/chosen": 1.1798124313354492, "rewards/margins": 4.409452756245932, "rewards/rejected": -3.229640324910482, "step": 14221 }, { "epoch": 0.7538229136299791, "grad_norm": 43.75, "kl": 0.7117691040039062, "learning_rate": 5e-07, "logits/chosen": -6587323.2, "logits/rejected": -55692789.333333336, "logps/chosen": -138.40634765625, "logps/rejected": -355.4951985677083, "loss": 0.3297, "rewards/chosen": 0.2242872714996338, "rewards/margins": 3.1812075773874917, "rewards/rejected": -2.956920305887858, "step": 14222 }, { "epoch": 0.7538759176317812, "grad_norm": 47.25, "kl": 4.789581298828125, "learning_rate": 5e-07, "logits/chosen": -63953888.0, "logits/rejected": -51892520.0, "logps/chosen": -854.5515747070312, "logps/rejected": -626.022216796875, "loss": 0.1818, "rewards/chosen": 1.7883388996124268, "rewards/margins": 4.694088459014893, "rewards/rejected": -2.905749559402466, "step": 14223 }, { "epoch": 0.7539289216335834, "grad_norm": 36.25, "kl": 1.0692329406738281, "learning_rate": 5e-07, "logits/chosen": -44241200.0, "logits/rejected": -61928384.0, "logps/chosen": -184.4251912434896, "logps/rejected": -187.3052001953125, "loss": 0.2351, "rewards/chosen": 0.4596695899963379, "rewards/margins": 2.492242527008057, "rewards/rejected": -2.032572937011719, "step": 14224 }, { "epoch": 0.7539819256353855, "grad_norm": 59.25, "kl": 1.2270870208740234, "learning_rate": 5e-07, "logits/chosen": -31203072.0, "logits/rejected": -17909402.666666668, "logps/chosen": -382.0850830078125, "logps/rejected": -255.189697265625, "loss": 0.2464, "rewards/chosen": 1.0151843070983886, "rewards/margins": 2.955501619974772, "rewards/rejected": -1.9403173128763835, "step": 14225 }, { "epoch": 0.7540349296371877, "grad_norm": 49.0, "kl": 2.513143539428711, "learning_rate": 5e-07, "logits/chosen": -36536960.0, "logits/rejected": -61250192.0, "logps/chosen": -487.2752990722656, "logps/rejected": -456.4512939453125, "loss": 0.2247, "rewards/chosen": 0.7997097969055176, "rewards/margins": 3.529710054397583, "rewards/rejected": -2.7300002574920654, "step": 14226 }, { "epoch": 0.7540879336389897, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 72394928.0, "logits/rejected": -16576616.0, "logps/chosen": -481.6579996744792, "logps/rejected": -495.739990234375, "loss": 0.2262, "rewards/chosen": 0.981005589167277, "rewards/margins": 3.8822259108225503, "rewards/rejected": -2.9012203216552734, "step": 14227 }, { "epoch": 0.7541409376407919, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29980652.8, "logits/rejected": -37526218.666666664, "logps/chosen": -396.1345458984375, "logps/rejected": -289.56591796875, "loss": 0.3646, "rewards/chosen": 0.13019589185714722, "rewards/margins": 2.1391985774040223, "rewards/rejected": -2.009002685546875, "step": 14228 }, { "epoch": 0.754193941642594, "grad_norm": 49.5, "kl": 1.5806159973144531, "learning_rate": 5e-07, "logits/chosen": 2191005.0, "logits/rejected": -49374700.0, "logps/chosen": -206.8811492919922, "logps/rejected": -306.1878662109375, "loss": 0.3599, "rewards/chosen": -0.01684938371181488, "rewards/margins": 1.4802901893854141, "rewards/rejected": -1.497139573097229, "step": 14229 }, { "epoch": 0.7542469456443962, "grad_norm": 60.25, "kl": 3.4853057861328125, "learning_rate": 5e-07, "logits/chosen": 8709737.6, "logits/rejected": -56341338.666666664, "logps/chosen": -315.8146728515625, "logps/rejected": -282.07973225911456, "loss": 0.3947, "rewards/chosen": 0.17554905414581298, "rewards/margins": 3.1513587872187294, "rewards/rejected": -2.9758097330729165, "step": 14230 }, { "epoch": 0.7542999496461983, "grad_norm": 42.75, "kl": 0.8307571411132812, "learning_rate": 5e-07, "logits/chosen": -19354265.333333332, "logits/rejected": -9679529.6, "logps/chosen": -281.1566569010417, "logps/rejected": -143.7513427734375, "loss": 0.3492, "rewards/chosen": 0.12592238187789917, "rewards/margins": 1.5455438494682312, "rewards/rejected": -1.419621467590332, "step": 14231 }, { "epoch": 0.7543529536480005, "grad_norm": 38.5, "kl": 0.6636428833007812, "learning_rate": 5e-07, "logits/chosen": -41748506.666666664, "logits/rejected": -63728403.2, "logps/chosen": -475.056396484375, "logps/rejected": -408.3938232421875, "loss": 0.1469, "rewards/chosen": 1.193305492401123, "rewards/margins": 4.5504168510437015, "rewards/rejected": -3.357111358642578, "step": 14232 }, { "epoch": 0.7544059576498026, "grad_norm": 37.5, "kl": 0.29898834228515625, "learning_rate": 5e-07, "logits/chosen": -100630944.0, "logits/rejected": -40464652.0, "logps/chosen": -338.052490234375, "logps/rejected": -337.1026611328125, "loss": 0.2223, "rewards/chosen": 0.6176645755767822, "rewards/margins": 3.7380714416503906, "rewards/rejected": -3.1204068660736084, "step": 14233 }, { "epoch": 0.7544589616516046, "grad_norm": 54.25, "kl": 0.6036758422851562, "learning_rate": 5e-07, "logits/chosen": -26876821.333333332, "logits/rejected": -1173871.75, "logps/chosen": -326.4120279947917, "logps/rejected": -187.42874145507812, "loss": 0.3644, "rewards/chosen": 0.6378785769144694, "rewards/margins": 1.6707272926966348, "rewards/rejected": -1.0328487157821655, "step": 14234 }, { "epoch": 0.7545119656534068, "grad_norm": 49.0, "kl": 5.079168319702148, "learning_rate": 5e-07, "logits/chosen": -55327680.0, "logits/rejected": 1533961.6666666667, "logps/chosen": -426.807568359375, "logps/rejected": -137.3275146484375, "loss": 0.3413, "rewards/chosen": 0.8983868598937989, "rewards/margins": 2.912277921040853, "rewards/rejected": -2.013891061147054, "step": 14235 }, { "epoch": 0.7545649696552089, "grad_norm": 97.0, "kl": 4.169317245483398, "learning_rate": 5e-07, "logits/chosen": -32132797.333333332, "logits/rejected": -18034118.0, "logps/chosen": -396.0324300130208, "logps/rejected": -408.60894775390625, "loss": 0.3079, "rewards/chosen": 1.219042698542277, "rewards/margins": 3.4761838118235273, "rewards/rejected": -2.25714111328125, "step": 14236 }, { "epoch": 0.7546179736570111, "grad_norm": 39.25, "kl": 2.333629608154297, "learning_rate": 5e-07, "logits/chosen": -16018046.0, "logits/rejected": -33493168.0, "logps/chosen": -202.53317260742188, "logps/rejected": -161.202880859375, "loss": 0.3182, "rewards/chosen": 0.2210974544286728, "rewards/margins": 2.2748687118291855, "rewards/rejected": -2.0537712574005127, "step": 14237 }, { "epoch": 0.7546709776588132, "grad_norm": 57.75, "kl": 3.2415313720703125, "learning_rate": 5e-07, "logits/chosen": 6666445.0, "logits/rejected": -7191842.666666667, "logps/chosen": -556.8511352539062, "logps/rejected": -171.81197102864584, "loss": 0.1922, "rewards/chosen": 1.8786712884902954, "rewards/margins": 3.978917717933655, "rewards/rejected": -2.1002464294433594, "step": 14238 }, { "epoch": 0.7547239816606154, "grad_norm": 46.0, "kl": 0.41701507568359375, "learning_rate": 5e-07, "logits/chosen": -36040080.0, "logits/rejected": -11658659.0, "logps/chosen": -256.7974853515625, "logps/rejected": -264.47003173828125, "loss": 0.3141, "rewards/chosen": 0.1946345418691635, "rewards/margins": 1.90803824365139, "rewards/rejected": -1.7134037017822266, "step": 14239 }, { "epoch": 0.7547769856624175, "grad_norm": 57.0, "kl": 2.531961441040039, "learning_rate": 5e-07, "logits/chosen": -27237806.0, "logps/chosen": -278.28704833984375, "loss": 0.5104, "rewards/chosen": 0.20467829704284668, "step": 14240 }, { "epoch": 0.7548299896642197, "grad_norm": 65.0, "kl": 4.893058776855469, "learning_rate": 5e-07, "logits/chosen": -45609112.0, "logits/rejected": 4435065.0, "logps/chosen": -550.9132080078125, "logps/rejected": -133.5823974609375, "loss": 0.3088, "rewards/chosen": 1.4699835777282715, "rewards/margins": 3.2186416387557983, "rewards/rejected": -1.7486580610275269, "step": 14241 }, { "epoch": 0.7548829936660217, "grad_norm": 54.0, "kl": 3.8888769149780273, "learning_rate": 5e-07, "logits/chosen": -18910434.666666668, "logits/rejected": -18113752.0, "logps/chosen": -346.44873046875, "logps/rejected": -218.3374786376953, "loss": 0.4153, "rewards/chosen": 0.3840155204137166, "rewards/margins": 3.499067743619283, "rewards/rejected": -3.1150522232055664, "step": 14242 }, { "epoch": 0.7549359976678239, "grad_norm": 29.125, "kl": 1.6068267822265625, "learning_rate": 5e-07, "logits/chosen": -8735497.6, "logits/rejected": -32488794.666666668, "logps/chosen": -176.41776123046876, "logps/rejected": -493.0383707682292, "loss": 0.2523, "rewards/chosen": 0.8148564338684082, "rewards/margins": 5.144443734486898, "rewards/rejected": -4.329587300618489, "step": 14243 }, { "epoch": 0.754989001669626, "grad_norm": 43.75, "kl": 1.3058738708496094, "learning_rate": 5e-07, "logits/chosen": -41673776.0, "logits/rejected": -23777352.0, "logps/chosen": -256.4127197265625, "logps/rejected": -263.4988098144531, "loss": 0.2562, "rewards/chosen": 0.3753877580165863, "rewards/margins": 3.6456333100795746, "rewards/rejected": -3.2702455520629883, "step": 14244 }, { "epoch": 0.7550420056714282, "grad_norm": 29.875, "kl": 1.3131179809570312, "learning_rate": 5e-07, "logits/chosen": -12246712.0, "logits/rejected": -18865192.0, "logps/chosen": -231.72445678710938, "logps/rejected": -422.5799967447917, "loss": 0.2224, "rewards/chosen": -0.6511551141738892, "rewards/margins": 2.3956181605656943, "rewards/rejected": -3.0467732747395835, "step": 14245 }, { "epoch": 0.7550950096732303, "grad_norm": 33.5, "kl": 1.2049598693847656, "learning_rate": 5e-07, "logits/chosen": -6703800.0, "logits/rejected": -13445547.2, "logps/chosen": -682.1603597005209, "logps/rejected": -170.31109619140625, "loss": 0.128, "rewards/chosen": 2.2217464447021484, "rewards/margins": 4.656761360168457, "rewards/rejected": -2.4350149154663088, "step": 14246 }, { "epoch": 0.7551480136750325, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28163792.0, "logits/rejected": -10124496.8, "logps/chosen": -330.9546305338542, "logps/rejected": -182.806689453125, "loss": 0.2121, "rewards/chosen": 0.4522993564605713, "rewards/margins": 3.6238062381744385, "rewards/rejected": -3.171506881713867, "step": 14247 }, { "epoch": 0.7552010176768346, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6294937.6, "logits/rejected": -20419444.0, "logps/chosen": -197.3376708984375, "logps/rejected": -306.2017008463542, "loss": 0.3598, "rewards/chosen": -0.02699218988418579, "rewards/margins": 2.382092924912771, "rewards/rejected": -2.4090851147969565, "step": 14248 }, { "epoch": 0.7552540216786368, "grad_norm": 54.75, "kl": 1.2529983520507812, "learning_rate": 5e-07, "logits/chosen": -67802634.66666667, "logits/rejected": -26665649.6, "logps/chosen": -627.2736002604166, "logps/rejected": -225.806298828125, "loss": 0.2498, "rewards/chosen": 0.6130279699961344, "rewards/margins": 2.2622604529062906, "rewards/rejected": -1.6492324829101563, "step": 14249 }, { "epoch": 0.7553070256804388, "grad_norm": 47.75, "kl": 0.7114944458007812, "learning_rate": 5e-07, "logits/chosen": -55859456.0, "logits/rejected": -2571410.4, "logps/chosen": -447.1024983723958, "logps/rejected": -331.5878173828125, "loss": 0.2485, "rewards/chosen": 0.4210622310638428, "rewards/margins": 2.5568908214569093, "rewards/rejected": -2.1358285903930665, "step": 14250 }, { "epoch": 0.755360029682241, "grad_norm": 47.0, "kl": 1.4602794647216797, "learning_rate": 5e-07, "logits/chosen": -19441748.8, "logits/rejected": -26274810.666666668, "logps/chosen": -215.290771484375, "logps/rejected": -491.9200032552083, "loss": 0.2402, "rewards/chosen": 0.8755118370056152, "rewards/margins": 5.120864899953206, "rewards/rejected": -4.245353062947591, "step": 14251 }, { "epoch": 0.7554130336840431, "grad_norm": 64.0, "kl": 6.453165054321289, "learning_rate": 5e-07, "logits/chosen": -14223968.0, "logits/rejected": 285145216.0, "logps/chosen": -482.06611328125, "logps/rejected": -320.629638671875, "loss": 0.2779, "rewards/chosen": 0.9317151069641113, "rewards/margins": 2.7572766304016114, "rewards/rejected": -1.8255615234375, "step": 14252 }, { "epoch": 0.7554660376858453, "grad_norm": 77.5, "kl": 3.842466354370117, "learning_rate": 5e-07, "logits/chosen": -25628050.285714287, "logits/rejected": -4613954.5, "logps/chosen": -465.0160435267857, "logps/rejected": -60.99479293823242, "loss": 0.4153, "rewards/chosen": 0.6463063103812081, "rewards/margins": 3.1071019513266425, "rewards/rejected": -2.4607956409454346, "step": 14253 }, { "epoch": 0.7555190416876474, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4661837.5, "logits/rejected": -25434378.666666668, "logps/chosen": -343.6838073730469, "logps/rejected": -289.3799235026042, "loss": 0.2006, "rewards/chosen": 1.0536915063858032, "rewards/margins": 3.148915727933248, "rewards/rejected": -2.095224221547445, "step": 14254 }, { "epoch": 0.7555720456894496, "grad_norm": 43.75, "kl": 0.5287494659423828, "learning_rate": 5e-07, "logits/chosen": -8134089.5, "logits/rejected": -2839836.0, "logps/chosen": -50.31974792480469, "logps/rejected": -269.05364990234375, "loss": 0.2731, "rewards/chosen": 0.9957759380340576, "rewards/margins": 2.3276126384735107, "rewards/rejected": -1.3318367004394531, "step": 14255 }, { "epoch": 0.7556250496912517, "grad_norm": 56.75, "kl": 0.6849746704101562, "learning_rate": 5e-07, "logits/chosen": -31172035.2, "logits/rejected": -49280490.666666664, "logps/chosen": -403.142724609375, "logps/rejected": -568.0033772786459, "loss": 0.3082, "rewards/chosen": 0.3090569019317627, "rewards/margins": 3.13122493426005, "rewards/rejected": -2.8221680323282876, "step": 14256 }, { "epoch": 0.7556780536930539, "grad_norm": 54.25, "kl": 0.2590789794921875, "learning_rate": 5e-07, "logits/chosen": -17206142.4, "logits/rejected": -36279989.333333336, "logps/chosen": -209.31728515625, "logps/rejected": -384.9563395182292, "loss": 0.3577, "rewards/chosen": 0.2097501277923584, "rewards/margins": 1.9907639344533283, "rewards/rejected": -1.78101380666097, "step": 14257 }, { "epoch": 0.7557310576948559, "grad_norm": 47.25, "kl": 1.1988029479980469, "learning_rate": 5e-07, "logits/chosen": -30330249.6, "logits/rejected": -27734392.0, "logps/chosen": -238.2942626953125, "logps/rejected": -534.9166666666666, "loss": 0.2639, "rewards/chosen": 0.7882931709289551, "rewards/margins": 3.3320544242858885, "rewards/rejected": -2.5437612533569336, "step": 14258 }, { "epoch": 0.7557840616966581, "grad_norm": 37.0, "kl": 3.3199119567871094, "learning_rate": 5e-07, "logits/chosen": -5436736.5, "logits/rejected": -13985772.0, "logps/chosen": -343.14971923828125, "logps/rejected": -372.5018005371094, "loss": 0.2628, "rewards/chosen": 0.8396391868591309, "rewards/margins": 4.0796709060668945, "rewards/rejected": -3.2400317192077637, "step": 14259 }, { "epoch": 0.7558370656984602, "grad_norm": 70.5, "kl": 1.0542526245117188, "learning_rate": 5e-07, "logits/chosen": 10021690.0, "logits/rejected": -24870490.0, "logps/chosen": -442.2175598144531, "logps/rejected": -266.1258544921875, "loss": 0.2684, "rewards/chosen": 0.6489806771278381, "rewards/margins": 3.3073158860206604, "rewards/rejected": -2.6583352088928223, "step": 14260 }, { "epoch": 0.7558900697002624, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49464132.0, "logits/rejected": -33926868.0, "logps/chosen": -527.6048583984375, "logps/rejected": -366.85882568359375, "loss": 0.2457, "rewards/chosen": 0.4219345152378082, "rewards/margins": 2.9836415350437164, "rewards/rejected": -2.561707019805908, "step": 14261 }, { "epoch": 0.7559430737020645, "grad_norm": 61.25, "kl": 5.168850898742676, "learning_rate": 5e-07, "logits/chosen": 204535.5, "logits/rejected": -8551589.0, "logps/chosen": -206.3104248046875, "logps/rejected": -158.50759887695312, "loss": 0.3962, "rewards/chosen": 0.6464080015818278, "rewards/margins": 3.2861003081003823, "rewards/rejected": -2.6396923065185547, "step": 14262 }, { "epoch": 0.7559960777038667, "grad_norm": 54.0, "kl": 0.7137966156005859, "learning_rate": 5e-07, "logits/chosen": 8706756.0, "logits/rejected": -7829814.666666667, "logps/chosen": -211.3043975830078, "logps/rejected": -277.6971028645833, "loss": 0.2038, "rewards/chosen": 0.6878408193588257, "rewards/margins": 2.6154024203618365, "rewards/rejected": -1.927561601003011, "step": 14263 }, { "epoch": 0.7560490817056688, "grad_norm": 41.0, "kl": 3.399625778198242, "learning_rate": 5e-07, "logits/chosen": -13560945.333333334, "logits/rejected": -46760118.4, "logps/chosen": -489.6582438151042, "logps/rejected": -219.3780517578125, "loss": 0.2051, "rewards/chosen": 1.2584784825642903, "rewards/margins": 3.5796836217244463, "rewards/rejected": -2.3212051391601562, "step": 14264 }, { "epoch": 0.756102085707471, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58648288.0, "logits/rejected": -36789513.14285714, "logps/chosen": -318.5048828125, "logps/rejected": -399.9908970424107, "loss": 0.1698, "rewards/chosen": 0.40907594561576843, "rewards/margins": 2.6628485449722836, "rewards/rejected": -2.253772599356515, "step": 14265 }, { "epoch": 0.756155089709273, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47612528.0, "logits/rejected": -40586960.0, "logps/chosen": -364.308837890625, "logps/rejected": -249.81138610839844, "loss": 0.3899, "rewards/chosen": -0.30873680114746094, "rewards/margins": 1.2637383937835693, "rewards/rejected": -1.5724751949310303, "step": 14266 }, { "epoch": 0.7562080937110752, "grad_norm": 52.0, "kl": 0.7567901611328125, "learning_rate": 5e-07, "logits/chosen": -15057157.333333334, "logits/rejected": 93577968.0, "logps/chosen": -345.4075113932292, "logps/rejected": -774.541015625, "loss": 0.3102, "rewards/chosen": 0.8477976322174072, "rewards/margins": 4.3347694873809814, "rewards/rejected": -3.486971855163574, "step": 14267 }, { "epoch": 0.7562610977128773, "grad_norm": 39.75, "kl": 0.47917652130126953, "learning_rate": 5e-07, "logits/chosen": -17521240.0, "logits/rejected": 11366945.333333334, "logps/chosen": -225.12997436523438, "logps/rejected": -307.7769775390625, "loss": 0.2664, "rewards/chosen": -0.11510926485061646, "rewards/margins": 2.0192214846611023, "rewards/rejected": -2.1343307495117188, "step": 14268 }, { "epoch": 0.7563141017146795, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32228256.0, "logits/rejected": -35439606.85714286, "logps/chosen": -281.14654541015625, "logps/rejected": -331.39913504464283, "loss": 0.1277, "rewards/chosen": 0.141876220703125, "rewards/margins": 2.9381686619349887, "rewards/rejected": -2.7962924412318637, "step": 14269 }, { "epoch": 0.7563671057164816, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41081336.0, "logits/rejected": -3516265.0, "logps/chosen": -358.741455078125, "logps/rejected": -93.40518188476562, "loss": 0.286, "rewards/chosen": 0.40599822998046875, "rewards/margins": 2.6298115253448486, "rewards/rejected": -2.22381329536438, "step": 14270 }, { "epoch": 0.7564201097182838, "grad_norm": 37.0, "kl": 2.687654495239258, "learning_rate": 5e-07, "logits/chosen": -25315382.0, "logits/rejected": -65060732.0, "logps/chosen": -331.2199401855469, "logps/rejected": -480.638916015625, "loss": 0.2274, "rewards/chosen": 0.7428922653198242, "rewards/margins": 3.974703550338745, "rewards/rejected": -3.231811285018921, "step": 14271 }, { "epoch": 0.7564731137200859, "grad_norm": 67.0, "kl": 4.986083984375, "learning_rate": 5e-07, "logits/chosen": -22792130.666666668, "logits/rejected": -30761908.0, "logps/chosen": -353.295166015625, "logps/rejected": -239.73495483398438, "loss": 0.3445, "rewards/chosen": 0.9500762621561686, "rewards/margins": 3.4704219500223794, "rewards/rejected": -2.520345687866211, "step": 14272 }, { "epoch": 0.756526117721888, "grad_norm": 38.25, "kl": 4.147322654724121, "learning_rate": 5e-07, "logits/chosen": -31776704.0, "logits/rejected": -35935378.666666664, "logps/chosen": -180.3356201171875, "logps/rejected": -150.364013671875, "loss": 0.4627, "rewards/chosen": -0.007918515801429748, "rewards/margins": 2.7517873456080757, "rewards/rejected": -2.7597058614095054, "step": 14273 }, { "epoch": 0.7565791217236901, "grad_norm": 71.0, "kl": 4.639982223510742, "learning_rate": 5e-07, "logits/chosen": -34697910.85714286, "logits/rejected": -876600.75, "logps/chosen": -294.42361886160717, "logps/rejected": -100.44161987304688, "loss": 0.4147, "rewards/chosen": 0.826643535069057, "rewards/margins": 1.6821596111570085, "rewards/rejected": -0.8555160760879517, "step": 14274 }, { "epoch": 0.7566321257254923, "grad_norm": 49.25, "kl": 0.1574249267578125, "learning_rate": 5e-07, "logits/chosen": -38074792.0, "logits/rejected": -19827876.0, "logps/chosen": -323.1494445800781, "logps/rejected": -260.32952880859375, "loss": 0.2218, "rewards/chosen": 0.6434528231620789, "rewards/margins": 3.4544422030448914, "rewards/rejected": -2.8109893798828125, "step": 14275 }, { "epoch": 0.7566851297272944, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21629456.0, "logits/rejected": -10997750.0, "logps/chosen": -464.0316162109375, "logps/rejected": -389.12060546875, "loss": 0.1873, "rewards/chosen": 0.7728058099746704, "rewards/margins": 3.14189875125885, "rewards/rejected": -2.3690929412841797, "step": 14276 }, { "epoch": 0.7567381337290966, "grad_norm": 52.25, "kl": 2.508626937866211, "learning_rate": 5e-07, "logits/chosen": 9789795.2, "logits/rejected": 8140682.0, "logps/chosen": -305.002294921875, "logps/rejected": -347.5658772786458, "loss": 0.3003, "rewards/chosen": 0.6555130481719971, "rewards/margins": 3.492525657018026, "rewards/rejected": -2.837012608846029, "step": 14277 }, { "epoch": 0.7567911377308987, "grad_norm": 40.25, "kl": 1.4408988952636719, "learning_rate": 5e-07, "logits/chosen": -25588653.333333332, "logits/rejected": -2835459.0, "logps/chosen": -253.25056966145834, "logps/rejected": -181.716845703125, "loss": 0.235, "rewards/chosen": 1.1425479253133137, "rewards/margins": 3.6287123998006185, "rewards/rejected": -2.4861644744873046, "step": 14278 }, { "epoch": 0.7568441417327009, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64140328.0, "logits/rejected": -7176772.666666667, "logps/chosen": -437.8669128417969, "logps/rejected": -183.91975911458334, "loss": 0.2491, "rewards/chosen": 0.275299072265625, "rewards/margins": 2.251493453979492, "rewards/rejected": -1.9761943817138672, "step": 14279 }, { "epoch": 0.756897145734503, "grad_norm": 36.0, "kl": 1.019063949584961, "learning_rate": 5e-07, "logits/chosen": -11157600.666666666, "logits/rejected": -20848348.8, "logps/chosen": -206.5225830078125, "logps/rejected": -310.4181884765625, "loss": 0.2199, "rewards/chosen": 0.4569867451985677, "rewards/margins": 3.107886060078939, "rewards/rejected": -2.650899314880371, "step": 14280 }, { "epoch": 0.7569501497363051, "grad_norm": 51.25, "kl": 0.6039485931396484, "learning_rate": 5e-07, "logits/chosen": -50680256.0, "logits/rejected": -30059182.0, "logps/chosen": -561.9439697265625, "logps/rejected": -289.0579528808594, "loss": 0.1757, "rewards/chosen": 1.369300127029419, "rewards/margins": 3.693415641784668, "rewards/rejected": -2.324115514755249, "step": 14281 }, { "epoch": 0.7570031537381072, "grad_norm": 39.5, "kl": 1.1048288345336914, "learning_rate": 5e-07, "logits/chosen": -7022835.2, "logits/rejected": 10678465.333333334, "logps/chosen": -232.714794921875, "logps/rejected": -508.9990641276042, "loss": 0.1962, "rewards/chosen": 1.2929150581359863, "rewards/margins": 3.812797419230143, "rewards/rejected": -2.5198823610941568, "step": 14282 }, { "epoch": 0.7570561577399093, "grad_norm": 42.5, "kl": 0.2614326477050781, "learning_rate": 5e-07, "logits/chosen": -11214112.0, "logits/rejected": -18233632.0, "logps/chosen": -200.4899139404297, "logps/rejected": -471.33892822265625, "loss": 0.3709, "rewards/chosen": 0.24480438232421875, "rewards/margins": 1.8327027559280396, "rewards/rejected": -1.5878983736038208, "step": 14283 }, { "epoch": 0.7571091617417115, "grad_norm": 74.0, "kl": 0.6525688171386719, "learning_rate": 5e-07, "logits/chosen": -4956844.0, "logits/rejected": -8135035.5, "logps/chosen": -87.99515533447266, "logps/rejected": -238.86351013183594, "loss": 0.3892, "rewards/chosen": -0.09381432831287384, "rewards/margins": 1.4326093643903732, "rewards/rejected": -1.526423692703247, "step": 14284 }, { "epoch": 0.7571621657435136, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17611216.0, "logits/rejected": -31725459.2, "logps/chosen": -133.3394978841146, "logps/rejected": -457.56201171875, "loss": 0.2717, "rewards/chosen": -0.09897512197494507, "rewards/margins": 2.100055754184723, "rewards/rejected": -2.199030876159668, "step": 14285 }, { "epoch": 0.7572151697453158, "grad_norm": 56.75, "kl": 0.9110946655273438, "learning_rate": 5e-07, "logits/chosen": -50795116.0, "logits/rejected": -14042847.0, "logps/chosen": -403.5135192871094, "logps/rejected": -242.93028259277344, "loss": 0.2462, "rewards/chosen": 0.7777634263038635, "rewards/margins": 3.2791925072669983, "rewards/rejected": -2.5014290809631348, "step": 14286 }, { "epoch": 0.7572681737471179, "grad_norm": 37.25, "kl": 1.99188232421875, "learning_rate": 5e-07, "logits/chosen": -6872249.0, "logits/rejected": -55477788.0, "logps/chosen": -197.02584838867188, "logps/rejected": -561.7578125, "loss": 0.305, "rewards/chosen": 0.26021525263786316, "rewards/margins": 2.7026219069957733, "rewards/rejected": -2.44240665435791, "step": 14287 }, { "epoch": 0.75732117774892, "grad_norm": 30.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2837809.5, "logits/rejected": -32022477.714285713, "logps/chosen": -48.10956954956055, "logps/rejected": -442.11317661830356, "loss": 0.1528, "rewards/chosen": -0.5561931729316711, "rewards/margins": 2.1389351316860745, "rewards/rejected": -2.6951283046177457, "step": 14288 }, { "epoch": 0.7573741817507221, "grad_norm": 73.5, "kl": 7.792764663696289, "learning_rate": 5e-07, "logits/chosen": -28383910.85714286, "logits/rejected": -38535068.0, "logps/chosen": -345.5262974330357, "logps/rejected": -117.60581970214844, "loss": 0.4027, "rewards/chosen": 1.0665627207074846, "rewards/margins": 5.063820668629237, "rewards/rejected": -3.997257947921753, "step": 14289 }, { "epoch": 0.7574271857525243, "grad_norm": 49.25, "kl": 8.556528091430664, "learning_rate": 5e-07, "logits/chosen": 3774410.6666666665, "logits/rejected": -59827016.0, "logps/chosen": -163.18734741210938, "logps/rejected": -607.9979248046875, "loss": 0.441, "rewards/chosen": 0.8112053871154785, "rewards/margins": 4.915076732635498, "rewards/rejected": -4.1038713455200195, "step": 14290 }, { "epoch": 0.7574801897543264, "grad_norm": 48.0, "kl": 1.8031196594238281, "learning_rate": 5e-07, "logits/chosen": 36803512.0, "logits/rejected": -4708128.5, "logps/chosen": -276.2687683105469, "logps/rejected": -319.04803466796875, "loss": 0.2517, "rewards/chosen": 0.8959261178970337, "rewards/margins": 3.2776297330856323, "rewards/rejected": -2.3817036151885986, "step": 14291 }, { "epoch": 0.7575331937561286, "grad_norm": 144.0, "kl": 1.9795398712158203, "learning_rate": 5e-07, "logits/chosen": -19389806.0, "logits/rejected": -33913472.0, "logps/chosen": -101.572265625, "logps/rejected": -431.890869140625, "loss": 0.3455, "rewards/chosen": 0.2619999945163727, "rewards/margins": 2.6820692121982574, "rewards/rejected": -2.4200692176818848, "step": 14292 }, { "epoch": 0.7575861977579307, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12514018.666666666, "logits/rejected": -48443462.4, "logps/chosen": -303.9847412109375, "logps/rejected": -425.68779296875, "loss": 0.1825, "rewards/chosen": 0.6143010457356771, "rewards/margins": 3.3236836751302086, "rewards/rejected": -2.7093826293945313, "step": 14293 }, { "epoch": 0.7576392017597329, "grad_norm": 30.625, "kl": 3.7054004669189453, "learning_rate": 5e-07, "logits/chosen": 7211552.0, "logits/rejected": -20235838.0, "logps/chosen": -27.42669105529785, "logps/rejected": -299.3903503417969, "loss": 0.2915, "rewards/chosen": 0.9143304824829102, "rewards/margins": 2.6943849325180054, "rewards/rejected": -1.7800544500350952, "step": 14294 }, { "epoch": 0.757692205761535, "grad_norm": 41.25, "kl": 3.4400634765625, "learning_rate": 5e-07, "logits/chosen": -57215636.0, "logits/rejected": -1571135.0, "logps/chosen": -719.60009765625, "logps/rejected": -272.2324523925781, "loss": 0.2156, "rewards/chosen": 1.226369023323059, "rewards/margins": 3.185171961784363, "rewards/rejected": -1.9588029384613037, "step": 14295 }, { "epoch": 0.7577452097633371, "grad_norm": 31.625, "kl": 1.8726005554199219, "learning_rate": 5e-07, "logits/chosen": -16925038.666666668, "logits/rejected": -18683804.8, "logps/chosen": -238.1355997721354, "logps/rejected": -314.814501953125, "loss": 0.2434, "rewards/chosen": 1.0093461672465007, "rewards/margins": 3.8328267733256025, "rewards/rejected": -2.8234806060791016, "step": 14296 }, { "epoch": 0.7577982137651392, "grad_norm": 42.75, "kl": 2.4159088134765625, "learning_rate": 5e-07, "logits/chosen": -35579285.333333336, "logits/rejected": -29741539.2, "logps/chosen": -284.534423828125, "logps/rejected": -410.46943359375, "loss": 0.1955, "rewards/chosen": 1.0683612028757732, "rewards/margins": 4.375720516840617, "rewards/rejected": -3.307359313964844, "step": 14297 }, { "epoch": 0.7578512177669414, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31985616.0, "logits/rejected": -6969817.6, "logps/chosen": -347.1221516927083, "logps/rejected": -376.357666015625, "loss": 0.2964, "rewards/chosen": -0.4783465067545573, "rewards/margins": 1.6597609837849936, "rewards/rejected": -2.138107490539551, "step": 14298 }, { "epoch": 0.7579042217687435, "grad_norm": 40.0, "kl": 4.918560981750488, "learning_rate": 5e-07, "logits/chosen": -16852549.333333332, "logits/rejected": -83773792.0, "logps/chosen": -160.14220174153647, "logps/rejected": -433.4156188964844, "loss": 0.4619, "rewards/chosen": 0.266329566637675, "rewards/margins": 2.6528664032618203, "rewards/rejected": -2.3865368366241455, "step": 14299 }, { "epoch": 0.7579572257705457, "grad_norm": 39.5, "kl": 0.5471076965332031, "learning_rate": 5e-07, "logits/chosen": -17623716.0, "logits/rejected": -37217592.0, "logps/chosen": -239.86534118652344, "logps/rejected": -248.80892944335938, "loss": 0.3199, "rewards/chosen": -0.09687485545873642, "rewards/margins": 3.1913960948586464, "rewards/rejected": -3.288270950317383, "step": 14300 }, { "epoch": 0.7580102297723478, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94170176.0, "logits/rejected": -53727242.666666664, "logps/chosen": -382.90283203125, "logps/rejected": -532.663818359375, "loss": 0.2986, "rewards/chosen": 0.17876526117324829, "rewards/margins": 3.660156532128652, "rewards/rejected": -3.481391270955404, "step": 14301 }, { "epoch": 0.75806323377415, "grad_norm": 77.5, "kl": 2.1000747680664062, "learning_rate": 5e-07, "logits/chosen": 6711697.142857143, "logits/rejected": -22322614.0, "logps/chosen": -396.00362723214283, "logps/rejected": -499.2723693847656, "loss": 0.4556, "rewards/chosen": 0.29160942350115093, "rewards/margins": 2.1591173580714633, "rewards/rejected": -1.8675079345703125, "step": 14302 }, { "epoch": 0.758116237775952, "grad_norm": 39.75, "kl": 0.5828437805175781, "learning_rate": 5e-07, "logits/chosen": -18474541.333333332, "logits/rejected": -17015801.6, "logps/chosen": -317.20871988932294, "logps/rejected": -264.6408447265625, "loss": 0.1721, "rewards/chosen": 0.9788747628529867, "rewards/margins": 4.031971343358357, "rewards/rejected": -3.053096580505371, "step": 14303 }, { "epoch": 0.7581692417777542, "grad_norm": 29.5, "kl": 0.021536827087402344, "learning_rate": 5e-07, "logits/chosen": -15535862.666666666, "logits/rejected": -24793864.0, "logps/chosen": -167.1607462565104, "logps/rejected": -313.7763427734375, "loss": 0.186, "rewards/chosen": 0.649341901143392, "rewards/margins": 3.2668271382649743, "rewards/rejected": -2.617485237121582, "step": 14304 }, { "epoch": 0.7582222457795563, "grad_norm": 51.5, "kl": 1.9382305145263672, "learning_rate": 5e-07, "logits/chosen": -16736132.0, "logits/rejected": -56685488.0, "logps/chosen": -377.00543212890625, "logps/rejected": -387.284912109375, "loss": 0.2329, "rewards/chosen": 1.1899795532226562, "rewards/margins": 3.3327183723449707, "rewards/rejected": -2.1427388191223145, "step": 14305 }, { "epoch": 0.7582752497813585, "grad_norm": 47.5, "kl": 2.023042678833008, "learning_rate": 5e-07, "logits/chosen": -3728411.25, "logits/rejected": 1388734.375, "logps/chosen": -180.31626892089844, "logps/rejected": -234.99440002441406, "loss": 0.3764, "rewards/chosen": 0.015203908085823059, "rewards/margins": 1.3808654099702835, "rewards/rejected": -1.3656615018844604, "step": 14306 }, { "epoch": 0.7583282537831606, "grad_norm": 42.0, "kl": 1.9773063659667969, "learning_rate": 5e-07, "logits/chosen": -35251192.0, "logits/rejected": -53441580.0, "logps/chosen": -455.10430908203125, "logps/rejected": -591.223876953125, "loss": 0.195, "rewards/chosen": 1.1203348636627197, "rewards/margins": 4.178439617156982, "rewards/rejected": -3.0581047534942627, "step": 14307 }, { "epoch": 0.7583812577849628, "grad_norm": 26.625, "kl": 3.5267105102539062, "learning_rate": 5e-07, "logits/chosen": -2806104.75, "logits/rejected": -29191936.0, "logps/chosen": -309.8712158203125, "logps/rejected": -190.82968139648438, "loss": 0.2539, "rewards/chosen": 1.1453090906143188, "rewards/margins": 3.422812581062317, "rewards/rejected": -2.277503490447998, "step": 14308 }, { "epoch": 0.7584342617867649, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24829280.0, "logits/rejected": -41029756.0, "logps/chosen": -338.0669860839844, "logps/rejected": -256.22344970703125, "loss": 0.256, "rewards/chosen": 0.5420078635215759, "rewards/margins": 3.0112733244895935, "rewards/rejected": -2.4692654609680176, "step": 14309 }, { "epoch": 0.7584872657885671, "grad_norm": 54.25, "kl": 0.4506874084472656, "learning_rate": 5e-07, "logits/chosen": -72194976.0, "logits/rejected": -2729522.0, "logps/chosen": -552.211669921875, "logps/rejected": -390.6678161621094, "loss": 0.3407, "rewards/chosen": -0.009989947080612183, "rewards/margins": 2.547428399324417, "rewards/rejected": -2.5574183464050293, "step": 14310 }, { "epoch": 0.7585402697903691, "grad_norm": 48.75, "kl": 2.32757568359375, "learning_rate": 5e-07, "logits/chosen": -26643498.666666668, "logits/rejected": -38265545.6, "logps/chosen": -260.35870361328125, "logps/rejected": -567.95849609375, "loss": 0.1633, "rewards/chosen": 1.49748961130778, "rewards/margins": 4.353562227884929, "rewards/rejected": -2.8560726165771486, "step": 14311 }, { "epoch": 0.7585932737921713, "grad_norm": 69.5, "kl": 2.198974609375, "learning_rate": 5e-07, "logits/chosen": 3527474.285714286, "logits/rejected": -105368816.0, "logps/chosen": -400.43673270089283, "logps/rejected": -619.9893798828125, "loss": 0.3476, "rewards/chosen": 0.6866156033107212, "rewards/margins": 5.430634157998221, "rewards/rejected": -4.7440185546875, "step": 14312 }, { "epoch": 0.7586462777939734, "grad_norm": 49.75, "kl": 2.5130434036254883, "learning_rate": 5e-07, "logits/chosen": -56562488.0, "logits/rejected": -3348755.0, "logps/chosen": -891.288330078125, "logps/rejected": -159.12931315104166, "loss": 0.1908, "rewards/chosen": 3.3983917236328125, "rewards/margins": 4.5469536781311035, "rewards/rejected": -1.148561954498291, "step": 14313 }, { "epoch": 0.7586992817957756, "grad_norm": 62.25, "kl": 3.7983217239379883, "learning_rate": 5e-07, "logits/chosen": -51039219.2, "logits/rejected": -64469632.0, "logps/chosen": -82.59945678710938, "logps/rejected": -628.4704182942709, "loss": 0.3157, "rewards/chosen": 0.5619219779968262, "rewards/margins": 3.891734663645426, "rewards/rejected": -3.3298126856486, "step": 14314 }, { "epoch": 0.7587522857975777, "grad_norm": 45.5, "kl": 0.9352436065673828, "learning_rate": 5e-07, "logits/chosen": -28258186.666666668, "logits/rejected": -16227299.2, "logps/chosen": -224.51556396484375, "logps/rejected": -271.2856689453125, "loss": 0.3035, "rewards/chosen": -0.026259342829386394, "rewards/margins": 1.9642178376515707, "rewards/rejected": -1.9904771804809571, "step": 14315 }, { "epoch": 0.7588052897993799, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24087064.0, "logits/rejected": -4832852.666666667, "logps/chosen": -151.25428771972656, "logps/rejected": -186.3169148763021, "loss": 0.2692, "rewards/chosen": -0.2000427395105362, "rewards/margins": 1.6171331256628036, "rewards/rejected": -1.8171758651733398, "step": 14316 }, { "epoch": 0.758858293801182, "grad_norm": 67.5, "kl": 0.9396514892578125, "learning_rate": 5e-07, "logits/chosen": -24689845.333333332, "logits/rejected": -15281369.6, "logps/chosen": -451.8205159505208, "logps/rejected": -311.5662841796875, "loss": 0.3084, "rewards/chosen": 0.25814135869344074, "rewards/margins": 1.6397371610005695, "rewards/rejected": -1.3815958023071289, "step": 14317 }, { "epoch": 0.7589112978029842, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75997061.33333333, "logits/rejected": -14008252.8, "logps/chosen": -463.4945475260417, "logps/rejected": -234.5403564453125, "loss": 0.2174, "rewards/chosen": 0.592395027478536, "rewards/margins": 2.490402992566427, "rewards/rejected": -1.8980079650878907, "step": 14318 }, { "epoch": 0.7589643018047862, "grad_norm": 51.75, "kl": 0.7559890747070312, "learning_rate": 5e-07, "logits/chosen": -47403622.4, "logits/rejected": -11578388.0, "logps/chosen": -348.1960205078125, "logps/rejected": -207.65625, "loss": 0.4033, "rewards/chosen": 0.29818224906921387, "rewards/margins": 1.1864253679911294, "rewards/rejected": -0.8882431189219157, "step": 14319 }, { "epoch": 0.7590173058065884, "grad_norm": 65.0, "kl": 3.647777557373047, "learning_rate": 5e-07, "logits/chosen": -34417728.0, "logits/rejected": -24727416.0, "logps/chosen": -315.7943115234375, "logps/rejected": -314.896240234375, "loss": 0.338, "rewards/chosen": 0.9028372764587402, "rewards/margins": 2.876136064529419, "rewards/rejected": -1.9732987880706787, "step": 14320 }, { "epoch": 0.7590703098083905, "grad_norm": 54.75, "kl": 1.87799072265625, "learning_rate": 5e-07, "logits/chosen": -59685254.4, "logits/rejected": -19510377.333333332, "logps/chosen": -659.90302734375, "logps/rejected": -318.16363525390625, "loss": 0.2872, "rewards/chosen": 0.6019076824188232, "rewards/margins": 3.9624111970265705, "rewards/rejected": -3.3605035146077475, "step": 14321 }, { "epoch": 0.7591233138101927, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49304520.0, "logits/rejected": -20547030.85714286, "logps/chosen": -456.61395263671875, "logps/rejected": -253.29499162946428, "loss": 0.1414, "rewards/chosen": 0.41378480195999146, "rewards/margins": 2.853380160672324, "rewards/rejected": -2.4395953587123325, "step": 14322 }, { "epoch": 0.7591763178119948, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31485626.666666668, "logits/rejected": -50001964.8, "logps/chosen": -155.1903076171875, "logps/rejected": -327.2341064453125, "loss": 0.2771, "rewards/chosen": 0.12789340813954672, "rewards/margins": 2.2126420577367147, "rewards/rejected": -2.084748649597168, "step": 14323 }, { "epoch": 0.759229321813797, "grad_norm": 65.5, "kl": 0.8085746765136719, "learning_rate": 5e-07, "logits/chosen": -75322272.0, "logits/rejected": -7284218.0, "logps/chosen": -312.3545837402344, "logps/rejected": -290.1601867675781, "loss": 0.3075, "rewards/chosen": 0.2331131100654602, "rewards/margins": 2.1397979855537415, "rewards/rejected": -1.9066848754882812, "step": 14324 }, { "epoch": 0.7592823258155991, "grad_norm": 51.75, "kl": 0.3216419219970703, "learning_rate": 5e-07, "logits/chosen": -23075677.333333332, "logits/rejected": 5938479.0, "logps/chosen": -260.6785481770833, "logps/rejected": -160.9814453125, "loss": 0.3529, "rewards/chosen": 0.6737628777821859, "rewards/margins": 1.5225703914960227, "rewards/rejected": -0.8488075137138367, "step": 14325 }, { "epoch": 0.7593353298174013, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2809142.5, "logits/rejected": -3902660.0, "logps/chosen": -80.43118286132812, "logps/rejected": -128.4642333984375, "loss": 0.2289, "rewards/chosen": -0.4101608395576477, "rewards/margins": 2.394197960694631, "rewards/rejected": -2.804358800252279, "step": 14326 }, { "epoch": 0.7593883338192033, "grad_norm": 69.5, "kl": 0.7020339965820312, "learning_rate": 5e-07, "logits/chosen": -22085966.0, "logits/rejected": -60890936.0, "logps/chosen": -213.53350830078125, "logps/rejected": -337.78729248046875, "loss": 0.4282, "rewards/chosen": -0.4859251081943512, "rewards/margins": 1.4458807408809662, "rewards/rejected": -1.9318058490753174, "step": 14327 }, { "epoch": 0.7594413378210055, "grad_norm": 56.25, "kl": 0.7686882019042969, "learning_rate": 5e-07, "logits/chosen": -23201508.0, "logits/rejected": -11457504.0, "logps/chosen": -359.7353515625, "logps/rejected": -231.76780700683594, "loss": 0.3126, "rewards/chosen": 0.6652504205703735, "rewards/margins": 2.143041491508484, "rewards/rejected": -1.4777910709381104, "step": 14328 }, { "epoch": 0.7594943418228076, "grad_norm": 42.75, "kl": 0.4309577941894531, "learning_rate": 5e-07, "logits/chosen": -3404194.3333333335, "logits/rejected": -33839315.2, "logps/chosen": -406.217529296875, "logps/rejected": -424.2498046875, "loss": 0.2288, "rewards/chosen": 0.8542738755544027, "rewards/margins": 3.4075890382130942, "rewards/rejected": -2.5533151626586914, "step": 14329 }, { "epoch": 0.7595473458246098, "grad_norm": 42.5, "kl": 1.833404541015625, "learning_rate": 5e-07, "logits/chosen": -40036109.333333336, "logits/rejected": -87934048.0, "logps/chosen": -306.6736653645833, "logps/rejected": -685.78955078125, "loss": 0.3215, "rewards/chosen": 0.5872930288314819, "rewards/margins": 4.296445250511169, "rewards/rejected": -3.7091522216796875, "step": 14330 }, { "epoch": 0.7596003498264119, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45378308.0, "logits/rejected": -23975131.42857143, "logps/chosen": -311.2566223144531, "logps/rejected": -313.2610560825893, "loss": 0.1576, "rewards/chosen": 0.33792421221733093, "rewards/margins": 2.850840232201985, "rewards/rejected": -2.512916019984654, "step": 14331 }, { "epoch": 0.7596533538282141, "grad_norm": 54.5, "kl": 2.9549341201782227, "learning_rate": 5e-07, "logits/chosen": -61095589.333333336, "logits/rejected": -47894184.0, "logps/chosen": -330.4000651041667, "logps/rejected": -371.089111328125, "loss": 0.3959, "rewards/chosen": 0.3284778594970703, "rewards/margins": 3.5627593994140625, "rewards/rejected": -3.234281539916992, "step": 14332 }, { "epoch": 0.7597063578300162, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6356513.0, "logits/rejected": -13404284.57142857, "logps/chosen": -34.58362579345703, "logps/rejected": -362.21407645089283, "loss": 0.1369, "rewards/chosen": 0.23827628791332245, "rewards/margins": 3.655689999461174, "rewards/rejected": -3.4174137115478516, "step": 14333 }, { "epoch": 0.7597593618318182, "grad_norm": 59.25, "kl": 2.8848323822021484, "learning_rate": 5e-07, "logits/chosen": -56529190.4, "logits/rejected": -14990792.0, "logps/chosen": -425.862109375, "logps/rejected": -447.6656087239583, "loss": 0.2559, "rewards/chosen": 0.9988163948059082, "rewards/margins": 3.760949993133545, "rewards/rejected": -2.7621335983276367, "step": 14334 }, { "epoch": 0.7598123658336204, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11205542.0, "logits/rejected": -18029280.0, "logps/chosen": -259.544677734375, "logps/rejected": -446.6534946986607, "loss": 0.1017, "rewards/chosen": 1.3529785871505737, "rewards/margins": 4.3111297232764105, "rewards/rejected": -2.9581511361258372, "step": 14335 }, { "epoch": 0.7598653698354225, "grad_norm": 51.5, "kl": 0.08226776123046875, "learning_rate": 5e-07, "logits/chosen": -76971520.0, "logits/rejected": -52721788.0, "logps/chosen": -356.92803955078125, "logps/rejected": -348.1347961425781, "loss": 0.3003, "rewards/chosen": 0.23386625945568085, "rewards/margins": 2.0269662588834763, "rewards/rejected": -1.7930999994277954, "step": 14336 }, { "epoch": 0.7599183738372247, "grad_norm": 44.75, "kl": 0.5948247909545898, "learning_rate": 5e-07, "logits/chosen": -38884732.0, "logits/rejected": 129922184.0, "logps/chosen": -200.00192260742188, "logps/rejected": -348.2313232421875, "loss": 0.3191, "rewards/chosen": 0.17649325728416443, "rewards/margins": 2.517058938741684, "rewards/rejected": -2.3405656814575195, "step": 14337 }, { "epoch": 0.7599713778390268, "grad_norm": 55.25, "kl": 1.509063720703125, "learning_rate": 5e-07, "logits/chosen": -48344997.333333336, "logits/rejected": 1068896.0, "logps/chosen": -307.02978515625, "logps/rejected": -576.8328247070312, "loss": 0.41, "rewards/chosen": 0.11339696248372395, "rewards/margins": 2.7162748177846274, "rewards/rejected": -2.6028778553009033, "step": 14338 }, { "epoch": 0.760024381840829, "grad_norm": 29.125, "kl": 0.3296089172363281, "learning_rate": 5e-07, "logits/chosen": -1701303.5, "logits/rejected": -36527957.333333336, "logps/chosen": -237.44955444335938, "logps/rejected": -466.962646484375, "loss": 0.1342, "rewards/chosen": 0.9558154940605164, "rewards/margins": 3.521229604880015, "rewards/rejected": -2.5654141108194985, "step": 14339 }, { "epoch": 0.7600773858426311, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65184088.0, "logits/rejected": -66404932.0, "logps/chosen": -528.8084716796875, "logps/rejected": -609.27734375, "loss": 0.1898, "rewards/chosen": 0.8144821524620056, "rewards/margins": 4.147827327251434, "rewards/rejected": -3.3333451747894287, "step": 14340 }, { "epoch": 0.7601303898444333, "grad_norm": 44.5, "kl": 0.3870105743408203, "learning_rate": 5e-07, "logits/chosen": -40671546.666666664, "logits/rejected": -15863940.8, "logps/chosen": -229.0238037109375, "logps/rejected": -235.088623046875, "loss": 0.2395, "rewards/chosen": 0.9513605435689291, "rewards/margins": 2.6786074002583824, "rewards/rejected": -1.7272468566894532, "step": 14341 }, { "epoch": 0.7601833938462353, "grad_norm": 64.0, "kl": 1.7208442687988281, "learning_rate": 5e-07, "logits/chosen": -41060345.6, "logits/rejected": 47653344.0, "logps/chosen": -337.579443359375, "logps/rejected": -774.5897623697916, "loss": 0.2441, "rewards/chosen": 1.177809715270996, "rewards/margins": 3.163655598958333, "rewards/rejected": -1.9858458836873372, "step": 14342 }, { "epoch": 0.7602363978480375, "grad_norm": 47.75, "kl": 0.33905601501464844, "learning_rate": 5e-07, "logits/chosen": -18197321.6, "logits/rejected": -34517336.0, "logps/chosen": -314.9400390625, "logps/rejected": -454.0008951822917, "loss": 0.3256, "rewards/chosen": 0.16939079761505127, "rewards/margins": 4.44564163684845, "rewards/rejected": -4.276250839233398, "step": 14343 }, { "epoch": 0.7602894018498396, "grad_norm": 43.25, "kl": 2.5571250915527344, "learning_rate": 5e-07, "logits/chosen": -982804.8, "logits/rejected": -19364861.333333332, "logps/chosen": -176.144677734375, "logps/rejected": -192.7076416015625, "loss": 0.3434, "rewards/chosen": 0.8176054954528809, "rewards/margins": 2.3875800768534345, "rewards/rejected": -1.5699745814005535, "step": 14344 }, { "epoch": 0.7603424058516418, "grad_norm": 42.25, "kl": 2.2163925170898438, "learning_rate": 5e-07, "logits/chosen": -5033468.0, "logits/rejected": -6303599.0, "logps/chosen": -158.41668701171875, "logps/rejected": -123.04670715332031, "loss": 0.297, "rewards/chosen": 0.7804977297782898, "rewards/margins": 2.715767562389374, "rewards/rejected": -1.935269832611084, "step": 14345 }, { "epoch": 0.7603954098534439, "grad_norm": 83.5, "kl": 2.2634544372558594, "learning_rate": 5e-07, "logits/chosen": -36463209.14285714, "logits/rejected": -62933200.0, "logps/chosen": -425.87123325892856, "logps/rejected": -684.82470703125, "loss": 0.314, "rewards/chosen": 0.9283838272094727, "rewards/margins": 4.786715269088745, "rewards/rejected": -3.8583314418792725, "step": 14346 }, { "epoch": 0.7604484138552461, "grad_norm": 61.75, "kl": 4.7635955810546875, "learning_rate": 5e-07, "logits/chosen": -44877017.6, "logits/rejected": -21351598.666666668, "logps/chosen": -609.185205078125, "logps/rejected": -342.0135904947917, "loss": 0.2735, "rewards/chosen": 1.005499267578125, "rewards/margins": 4.080607477823893, "rewards/rejected": -3.075108210245768, "step": 14347 }, { "epoch": 0.7605014178570482, "grad_norm": 41.0, "kl": 0.7572784423828125, "learning_rate": 5e-07, "logits/chosen": -25857754.666666668, "logits/rejected": -27030546.0, "logps/chosen": -292.54738362630206, "logps/rejected": -375.5991516113281, "loss": 0.2923, "rewards/chosen": 0.8707111676534017, "rewards/margins": 3.2954066594441733, "rewards/rejected": -2.4246954917907715, "step": 14348 }, { "epoch": 0.7605544218588504, "grad_norm": 52.75, "kl": 3.1828956604003906, "learning_rate": 5e-07, "logits/chosen": -14755244.0, "logits/rejected": -47010000.0, "logps/chosen": -209.77836100260416, "logps/rejected": -256.1741943359375, "loss": 0.4168, "rewards/chosen": 0.552041252454122, "rewards/margins": 1.7440762122472129, "rewards/rejected": -1.1920349597930908, "step": 14349 }, { "epoch": 0.7606074258606524, "grad_norm": 59.0, "kl": 2.3971633911132812, "learning_rate": 5e-07, "logits/chosen": -4883656.0, "logits/rejected": -11959240.0, "logps/chosen": -327.26605224609375, "logps/rejected": -122.11317443847656, "loss": 0.3555, "rewards/chosen": 0.22108766436576843, "rewards/margins": 2.6315252482891083, "rewards/rejected": -2.41043758392334, "step": 14350 }, { "epoch": 0.7606604298624546, "grad_norm": 48.0, "kl": 1.7987213134765625, "learning_rate": 5e-07, "logits/chosen": -28557778.0, "logits/rejected": -32083048.0, "logps/chosen": -684.7609252929688, "logps/rejected": -250.8523712158203, "loss": 0.2684, "rewards/chosen": 0.9219753742218018, "rewards/margins": 3.023171901702881, "rewards/rejected": -2.101196527481079, "step": 14351 }, { "epoch": 0.7607134338642567, "grad_norm": 69.0, "kl": 5.355220794677734, "learning_rate": 5e-07, "logits/chosen": -1496929.6, "logits/rejected": -47077397.333333336, "logps/chosen": -314.271044921875, "logps/rejected": -488.1972249348958, "loss": 0.2567, "rewards/chosen": 1.215432834625244, "rewards/margins": 3.906579685211182, "rewards/rejected": -2.6911468505859375, "step": 14352 }, { "epoch": 0.7607664378660589, "grad_norm": 52.75, "kl": 2.82672119140625, "learning_rate": 5e-07, "logits/chosen": -14547353.6, "logits/rejected": -26310696.0, "logps/chosen": -311.899072265625, "logps/rejected": -720.7062174479166, "loss": 0.3322, "rewards/chosen": 0.49766931533813474, "rewards/margins": 3.229859129587809, "rewards/rejected": -2.7321898142496743, "step": 14353 }, { "epoch": 0.760819441867861, "grad_norm": 58.5, "kl": 3.882063865661621, "learning_rate": 5e-07, "logits/chosen": -41664473.6, "logits/rejected": -6223268.666666667, "logps/chosen": -249.5545166015625, "logps/rejected": -163.5908406575521, "loss": 0.4628, "rewards/chosen": 0.35012266635894773, "rewards/margins": 1.314324434598287, "rewards/rejected": -0.9642017682393392, "step": 14354 }, { "epoch": 0.7608724458696632, "grad_norm": 59.25, "kl": 3.4025230407714844, "learning_rate": 5e-07, "logits/chosen": -19154614.4, "logits/rejected": -78091008.0, "logps/chosen": -604.06474609375, "logps/rejected": -349.5496419270833, "loss": 0.3363, "rewards/chosen": 0.9800344467163086, "rewards/margins": 2.844047482808431, "rewards/rejected": -1.8640130360921223, "step": 14355 }, { "epoch": 0.7609254498714653, "grad_norm": 33.5, "kl": 0.22961807250976562, "learning_rate": 5e-07, "logits/chosen": -9497726.666666666, "logits/rejected": -17926264.0, "logps/chosen": -340.30470784505206, "logps/rejected": -305.9610107421875, "loss": 0.1387, "rewards/chosen": 1.2226659456888835, "rewards/margins": 4.5078561464945475, "rewards/rejected": -3.285190200805664, "step": 14356 }, { "epoch": 0.7609784538732675, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61602744.0, "logits/rejected": -33989397.333333336, "logps/chosen": -237.72447204589844, "logps/rejected": -240.4275105794271, "loss": 0.2238, "rewards/chosen": -0.04568175971508026, "rewards/margins": 2.2966326425472894, "rewards/rejected": -2.3423144022623696, "step": 14357 }, { "epoch": 0.7610314578750695, "grad_norm": 41.0, "kl": 2.111806869506836, "learning_rate": 5e-07, "logits/chosen": -17705096.0, "logits/rejected": -58920986.666666664, "logps/chosen": -213.7069580078125, "logps/rejected": -340.32175699869794, "loss": 0.3346, "rewards/chosen": 0.6301580905914307, "rewards/margins": 2.7199657599131264, "rewards/rejected": -2.089807669321696, "step": 14358 }, { "epoch": 0.7610844618768717, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57434368.0, "logits/rejected": -18849827.2, "logps/chosen": -287.2166748046875, "logps/rejected": -303.408544921875, "loss": 0.3066, "rewards/chosen": -0.09709777434666951, "rewards/margins": 1.784109119574229, "rewards/rejected": -1.8812068939208983, "step": 14359 }, { "epoch": 0.7611374658786738, "grad_norm": 53.5, "kl": 0.04718208312988281, "learning_rate": 5e-07, "logits/chosen": -30947200.0, "logits/rejected": -9356490.0, "logps/chosen": -289.749609375, "logps/rejected": -348.7823486328125, "loss": 0.3667, "rewards/chosen": 0.12528417110443116, "rewards/margins": 2.066693600018819, "rewards/rejected": -1.941409428914388, "step": 14360 }, { "epoch": 0.761190469880476, "grad_norm": 57.25, "kl": 1.652374267578125, "learning_rate": 5e-07, "logits/chosen": -12692974.0, "logits/rejected": -27578050.0, "logps/chosen": -149.63015747070312, "logps/rejected": -308.64990234375, "loss": 0.201, "rewards/chosen": 1.3783860206604004, "rewards/margins": 4.639647960662842, "rewards/rejected": -3.2612619400024414, "step": 14361 }, { "epoch": 0.7612434738822781, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46664256.0, "logits/rejected": -32573792.0, "logps/chosen": -840.5006510416666, "logps/rejected": -479.137109375, "loss": 0.2096, "rewards/chosen": 0.532355785369873, "rewards/margins": 3.419096088409424, "rewards/rejected": -2.886740303039551, "step": 14362 }, { "epoch": 0.7612964778840803, "grad_norm": 67.0, "kl": 0.5920066833496094, "learning_rate": 5e-07, "logits/chosen": 9932589.0, "logits/rejected": -20340101.333333332, "logps/chosen": -33.32771682739258, "logps/rejected": -383.4961751302083, "loss": 0.3384, "rewards/chosen": 0.06644965708255768, "rewards/margins": 1.0736468782027562, "rewards/rejected": -1.0071972211201985, "step": 14363 }, { "epoch": 0.7613494818858824, "grad_norm": 58.25, "kl": 1.0330257415771484, "learning_rate": 5e-07, "logits/chosen": -10487386.0, "logits/rejected": -51748944.0, "logps/chosen": -194.01271057128906, "logps/rejected": -369.86309814453125, "loss": 0.4564, "rewards/chosen": -0.6399785280227661, "rewards/margins": 0.6615382432937622, "rewards/rejected": -1.3015167713165283, "step": 14364 }, { "epoch": 0.7614024858876846, "grad_norm": 58.5, "kl": 3.4756698608398438, "learning_rate": 5e-07, "logits/chosen": -38895476.0, "logits/rejected": -52415752.0, "logps/chosen": -442.29339599609375, "logps/rejected": -456.3020935058594, "loss": 0.1974, "rewards/chosen": 1.561315894126892, "rewards/margins": 3.7648357152938843, "rewards/rejected": -2.203519821166992, "step": 14365 }, { "epoch": 0.7614554898894866, "grad_norm": 26.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19156753.333333332, "logits/rejected": -38024848.0, "logps/chosen": -313.760009765625, "logps/rejected": -334.457568359375, "loss": 0.1225, "rewards/chosen": 1.184097687403361, "rewards/margins": 4.76858827273051, "rewards/rejected": -3.5844905853271483, "step": 14366 }, { "epoch": 0.7615084938912888, "grad_norm": 48.0, "kl": 1.2356910705566406, "learning_rate": 5e-07, "logits/chosen": -33794060.0, "logits/rejected": -26565044.0, "logps/chosen": -548.9072875976562, "logps/rejected": -209.25430297851562, "loss": 0.3038, "rewards/chosen": 0.6335010528564453, "rewards/margins": 2.6499269008636475, "rewards/rejected": -2.016425848007202, "step": 14367 }, { "epoch": 0.7615614978930909, "grad_norm": 44.5, "kl": 3.374448776245117, "learning_rate": 5e-07, "logits/chosen": -12914834.285714285, "logits/rejected": -11430714.0, "logps/chosen": -199.24093191964286, "logps/rejected": -109.8285903930664, "loss": 0.4231, "rewards/chosen": 0.5876484598432269, "rewards/margins": 2.2834660496030534, "rewards/rejected": -1.6958175897598267, "step": 14368 }, { "epoch": 0.7616145018948931, "grad_norm": 36.25, "kl": 0.7233724594116211, "learning_rate": 5e-07, "logits/chosen": -56018000.0, "logits/rejected": -7455848.0, "logps/chosen": -298.33905029296875, "logps/rejected": -380.85546875, "loss": 0.2285, "rewards/chosen": 0.6980881094932556, "rewards/margins": 2.668407380580902, "rewards/rejected": -1.9703192710876465, "step": 14369 }, { "epoch": 0.7616675058966952, "grad_norm": 41.75, "kl": 2.5405149459838867, "learning_rate": 5e-07, "logits/chosen": -13821579.2, "logits/rejected": -26400456.0, "logps/chosen": -139.57359619140624, "logps/rejected": -401.8482259114583, "loss": 0.3191, "rewards/chosen": 0.44214601516723634, "rewards/margins": 4.73189738591512, "rewards/rejected": -4.289751370747884, "step": 14370 }, { "epoch": 0.7617205098984974, "grad_norm": 50.25, "kl": 2.596158981323242, "learning_rate": 5e-07, "logits/chosen": -25319371.42857143, "logits/rejected": -57374672.0, "logps/chosen": -238.98202078683036, "logps/rejected": -626.1292724609375, "loss": 0.3791, "rewards/chosen": 0.6904360226222447, "rewards/margins": 3.5194581917354038, "rewards/rejected": -2.829022169113159, "step": 14371 }, { "epoch": 0.7617735139002995, "grad_norm": 82.0, "kl": 2.2316884994506836, "learning_rate": 5e-07, "logits/chosen": -65667091.2, "logits/rejected": -16894602.666666668, "logps/chosen": -191.094384765625, "logps/rejected": -235.80672200520834, "loss": 0.3204, "rewards/chosen": 0.7294404029846191, "rewards/margins": 1.982343069712321, "rewards/rejected": -1.252902666727702, "step": 14372 }, { "epoch": 0.7618265179021017, "grad_norm": 42.5, "kl": 2.2791566848754883, "learning_rate": 5e-07, "logits/chosen": -19429041.333333332, "logits/rejected": -33445126.4, "logps/chosen": -183.53426106770834, "logps/rejected": -264.1749755859375, "loss": 0.2367, "rewards/chosen": 1.118634859720866, "rewards/margins": 2.765916124979655, "rewards/rejected": -1.6472812652587892, "step": 14373 }, { "epoch": 0.7618795219039037, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12279370.666666666, "logits/rejected": -14351139.2, "logps/chosen": -453.961669921875, "logps/rejected": -149.2166748046875, "loss": 0.2664, "rewards/chosen": 0.23485819498697916, "rewards/margins": 2.9551092783610025, "rewards/rejected": -2.7202510833740234, "step": 14374 }, { "epoch": 0.7619325259057059, "grad_norm": 44.75, "kl": 2.1324968338012695, "learning_rate": 5e-07, "logits/chosen": -16029632.0, "logits/rejected": -21066321.333333332, "logps/chosen": -221.889697265625, "logps/rejected": -177.4473876953125, "loss": 0.3771, "rewards/chosen": 0.2270440101623535, "rewards/margins": 2.180951976776123, "rewards/rejected": -1.9539079666137695, "step": 14375 }, { "epoch": 0.761985529907508, "grad_norm": 50.5, "kl": 3.0234384536743164, "learning_rate": 5e-07, "logits/chosen": -18445001.6, "logits/rejected": -22465042.666666668, "logps/chosen": -227.5882080078125, "logps/rejected": -370.4148763020833, "loss": 0.303, "rewards/chosen": 0.8111028671264648, "rewards/margins": 4.739703814188639, "rewards/rejected": -3.9286009470621743, "step": 14376 }, { "epoch": 0.7620385339093102, "grad_norm": 55.75, "kl": 2.4875316619873047, "learning_rate": 5e-07, "logits/chosen": -35573040.0, "logps/chosen": -248.62973022460938, "loss": 0.4507, "rewards/chosen": 0.4866969585418701, "step": 14377 }, { "epoch": 0.7620915379111123, "grad_norm": 51.0, "kl": 0.9394989013671875, "learning_rate": 5e-07, "logits/chosen": -54838444.8, "logits/rejected": -45782970.666666664, "logps/chosen": -404.79189453125, "logps/rejected": -276.59613037109375, "loss": 0.3682, "rewards/chosen": -0.06326147317886352, "rewards/margins": 3.0574902256329857, "rewards/rejected": -3.120751698811849, "step": 14378 }, { "epoch": 0.7621445419129145, "grad_norm": 50.0, "kl": 2.643310546875, "learning_rate": 5e-07, "logits/chosen": -79929200.0, "logits/rejected": -31497840.0, "logps/chosen": -436.1204833984375, "logps/rejected": -424.6152648925781, "loss": 0.1793, "rewards/chosen": 1.321143388748169, "rewards/margins": 3.9721357822418213, "rewards/rejected": -2.6509923934936523, "step": 14379 }, { "epoch": 0.7621975459147166, "grad_norm": 41.5, "kl": 1.0342941284179688, "learning_rate": 5e-07, "logits/chosen": -25271072.0, "logits/rejected": -29945408.0, "logps/chosen": -173.93893432617188, "logps/rejected": -253.33065795898438, "loss": 0.2369, "rewards/chosen": 0.7247999310493469, "rewards/margins": 3.114758789539337, "rewards/rejected": -2.3899588584899902, "step": 14380 }, { "epoch": 0.7622505499165187, "grad_norm": 39.5, "kl": 1.5190200805664062, "learning_rate": 5e-07, "logits/chosen": -45996341.333333336, "logits/rejected": -15774870.4, "logps/chosen": -351.176025390625, "logps/rejected": -280.0591796875, "loss": 0.1391, "rewards/chosen": 2.4759623209635415, "rewards/margins": 4.890494028727213, "rewards/rejected": -2.414531707763672, "step": 14381 }, { "epoch": 0.7623035539183208, "grad_norm": 49.0, "kl": 3.415679931640625, "learning_rate": 5e-07, "logits/chosen": -29323923.2, "logits/rejected": -51618880.0, "logps/chosen": -234.2281005859375, "logps/rejected": -330.1103922526042, "loss": 0.2673, "rewards/chosen": 1.0943575859069825, "rewards/margins": 3.397471364339193, "rewards/rejected": -2.3031137784322104, "step": 14382 }, { "epoch": 0.762356557920123, "grad_norm": 47.75, "kl": 2.7375411987304688, "learning_rate": 5e-07, "logits/chosen": -26878166.4, "logits/rejected": -17845720.0, "logps/chosen": -151.279541015625, "logps/rejected": -206.7584025065104, "loss": 0.4453, "rewards/chosen": 0.038074266910552976, "rewards/margins": 1.243235917886098, "rewards/rejected": -1.2051616509755452, "step": 14383 }, { "epoch": 0.7624095619219251, "grad_norm": 66.5, "kl": 3.8271255493164062, "learning_rate": 5e-07, "logits/chosen": -29810926.0, "logits/rejected": -27483908.0, "logps/chosen": -632.6270751953125, "logps/rejected": -264.4434509277344, "loss": 0.332, "rewards/chosen": 0.647832453250885, "rewards/margins": 3.1732357144355774, "rewards/rejected": -2.5254032611846924, "step": 14384 }, { "epoch": 0.7624625659237272, "grad_norm": 37.25, "kl": 0.9534759521484375, "learning_rate": 5e-07, "logits/chosen": -47522741.333333336, "logits/rejected": 9093891.2, "logps/chosen": -522.6029866536459, "logps/rejected": -381.61005859375, "loss": 0.2471, "rewards/chosen": 1.0520462195078533, "rewards/margins": 3.15157052675883, "rewards/rejected": -2.0995243072509764, "step": 14385 }, { "epoch": 0.7625155699255294, "grad_norm": 49.0, "kl": 2.3223609924316406, "learning_rate": 5e-07, "logits/chosen": 21359904.0, "logits/rejected": -11967560.0, "logps/chosen": -255.12319946289062, "logps/rejected": -450.2966003417969, "loss": 0.247, "rewards/chosen": 0.8305172920227051, "rewards/margins": 4.866455554962158, "rewards/rejected": -4.035938262939453, "step": 14386 }, { "epoch": 0.7625685739273315, "grad_norm": 48.0, "kl": 0.5575523376464844, "learning_rate": 5e-07, "logits/chosen": -48495696.0, "logits/rejected": -34680392.0, "logps/chosen": -388.989013671875, "logps/rejected": -690.67919921875, "loss": 0.2971, "rewards/chosen": 0.3462463617324829, "rewards/margins": 3.3071130514144897, "rewards/rejected": -2.960866689682007, "step": 14387 }, { "epoch": 0.7626215779291337, "grad_norm": 43.25, "kl": 0.5831737518310547, "learning_rate": 5e-07, "logits/chosen": 28582710.0, "logits/rejected": 49041816.0, "logps/chosen": -249.8800811767578, "logps/rejected": -216.11346435546875, "loss": 0.2564, "rewards/chosen": 0.7591428756713867, "rewards/margins": 2.676042318344116, "rewards/rejected": -1.9168994426727295, "step": 14388 }, { "epoch": 0.7626745819309357, "grad_norm": 58.25, "kl": 3.7486448287963867, "learning_rate": 5e-07, "logits/chosen": 849339.375, "logits/rejected": -47776108.0, "logps/chosen": -385.28076171875, "logps/rejected": -607.3953857421875, "loss": 0.2486, "rewards/chosen": 1.0333911180496216, "rewards/margins": 3.9889761209487915, "rewards/rejected": -2.95558500289917, "step": 14389 }, { "epoch": 0.7627275859327379, "grad_norm": 45.75, "kl": 0.21825408935546875, "learning_rate": 5e-07, "logits/chosen": -14704089.0, "logits/rejected": -24437800.0, "logps/chosen": -305.5207214355469, "logps/rejected": -253.37814331054688, "loss": 0.3265, "rewards/chosen": 0.4429013431072235, "rewards/margins": 1.8165449798107147, "rewards/rejected": -1.3736436367034912, "step": 14390 }, { "epoch": 0.76278058993454, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52110835.2, "logits/rejected": -37930128.0, "logps/chosen": -415.43056640625, "logps/rejected": -148.85075887044272, "loss": 0.3905, "rewards/chosen": 0.09046890139579773, "rewards/margins": 2.540580449501673, "rewards/rejected": -2.4501115481058755, "step": 14391 }, { "epoch": 0.7628335939363422, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2181829.5, "logits/rejected": -30710485.333333332, "logps/chosen": -218.62265014648438, "logps/rejected": -277.73895263671875, "loss": 0.1744, "rewards/chosen": 1.2094619274139404, "rewards/margins": 3.54352339108785, "rewards/rejected": -2.3340614636739097, "step": 14392 }, { "epoch": 0.7628865979381443, "grad_norm": 60.75, "kl": 0.4181995391845703, "learning_rate": 5e-07, "logits/chosen": -47958176.0, "logits/rejected": -11849453.0, "logps/chosen": -355.54132080078125, "logps/rejected": -144.09207153320312, "loss": 0.3309, "rewards/chosen": 0.20767110586166382, "rewards/margins": 1.8155606389045715, "rewards/rejected": -1.6078895330429077, "step": 14393 }, { "epoch": 0.7629396019399465, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61614261.333333336, "logits/rejected": -34029257.6, "logps/chosen": -515.5707194010416, "logps/rejected": -194.296826171875, "loss": 0.282, "rewards/chosen": 0.3723967870076497, "rewards/margins": 2.333830197652181, "rewards/rejected": -1.9614334106445312, "step": 14394 }, { "epoch": 0.7629926059417486, "grad_norm": 23.5, "kl": 0.15300941467285156, "learning_rate": 5e-07, "logits/chosen": -6418538.0, "logits/rejected": -20314752.0, "logps/chosen": -118.193359375, "logps/rejected": -233.79779052734375, "loss": 0.2185, "rewards/chosen": 0.5838695168495178, "rewards/margins": 3.502235233783722, "rewards/rejected": -2.918365716934204, "step": 14395 }, { "epoch": 0.7630456099435508, "grad_norm": 45.25, "kl": 0.9943180084228516, "learning_rate": 5e-07, "logits/chosen": -17377778.0, "logits/rejected": -15595330.666666666, "logps/chosen": -818.1300048828125, "logps/rejected": -478.6114501953125, "loss": 0.1673, "rewards/chosen": 3.0593855381011963, "rewards/margins": 5.06203850110372, "rewards/rejected": -2.002652963002523, "step": 14396 }, { "epoch": 0.7630986139453528, "grad_norm": 27.0, "kl": 1.6707372665405273, "learning_rate": 5e-07, "logits/chosen": -27294858.666666668, "logits/rejected": -6949140.8, "logps/chosen": -738.7163899739584, "logps/rejected": -295.705810546875, "loss": 0.2026, "rewards/chosen": 1.8730015754699707, "rewards/margins": 4.52849645614624, "rewards/rejected": -2.6554948806762697, "step": 14397 }, { "epoch": 0.763151617947155, "grad_norm": 63.75, "kl": 5.476705551147461, "learning_rate": 5e-07, "logits/chosen": -40361573.333333336, "logits/rejected": -345261.375, "logps/chosen": -338.7119954427083, "logps/rejected": -92.89311218261719, "loss": 0.402, "rewards/chosen": 0.6174576679865519, "rewards/margins": 1.8130811850229898, "rewards/rejected": -1.195623517036438, "step": 14398 }, { "epoch": 0.7632046219489571, "grad_norm": 49.75, "kl": 1.9396591186523438, "learning_rate": 5e-07, "logits/chosen": -17731075.2, "logits/rejected": 9495628.0, "logps/chosen": -555.24453125, "logps/rejected": -252.20149739583334, "loss": 0.2697, "rewards/chosen": 1.3533227920532227, "rewards/margins": 3.521936384836833, "rewards/rejected": -2.16861359278361, "step": 14399 }, { "epoch": 0.7632576259507593, "grad_norm": 50.75, "kl": 5.212060928344727, "learning_rate": 5e-07, "logits/chosen": -13900461.714285715, "logits/rejected": -11232878.0, "logps/chosen": -246.83510044642858, "logps/rejected": -258.8670959472656, "loss": 0.3748, "rewards/chosen": 1.0273914337158203, "rewards/margins": 2.522981643676758, "rewards/rejected": -1.4955902099609375, "step": 14400 }, { "epoch": 0.7633106299525614, "grad_norm": 47.75, "kl": 2.270620346069336, "learning_rate": 5e-07, "logits/chosen": 10407471.333333334, "logits/rejected": -24311865.6, "logps/chosen": -289.07578531901044, "logps/rejected": -346.308447265625, "loss": 0.2493, "rewards/chosen": 1.0716382662455242, "rewards/margins": 2.966484800974528, "rewards/rejected": -1.894846534729004, "step": 14401 }, { "epoch": 0.7633636339543636, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11264382.666666666, "logits/rejected": -17084403.2, "logps/chosen": -377.4584554036458, "logps/rejected": -266.46220703125, "loss": 0.2324, "rewards/chosen": 0.4658999443054199, "rewards/margins": 2.643257999420166, "rewards/rejected": -2.177358055114746, "step": 14402 }, { "epoch": 0.7634166379561657, "grad_norm": 42.5, "kl": 0.9601917266845703, "learning_rate": 5e-07, "logits/chosen": -37029730.666666664, "logits/rejected": -26523888.0, "logps/chosen": -369.7658284505208, "logps/rejected": -266.383544921875, "loss": 0.2424, "rewards/chosen": 1.1330976486206055, "rewards/margins": 3.5771069526672363, "rewards/rejected": -2.444009304046631, "step": 14403 }, { "epoch": 0.7634696419579678, "grad_norm": 49.25, "kl": 0.15552711486816406, "learning_rate": 5e-07, "logits/chosen": -52610876.0, "logits/rejected": -28611284.0, "logps/chosen": -368.40130615234375, "logps/rejected": -365.8840637207031, "loss": 0.27, "rewards/chosen": 0.40471401810646057, "rewards/margins": 2.9414009153842926, "rewards/rejected": -2.536686897277832, "step": 14404 }, { "epoch": 0.7635226459597699, "grad_norm": 29.125, "kl": 2.3209104537963867, "learning_rate": 5e-07, "logits/chosen": 3699655.6, "logits/rejected": -22564080.0, "logps/chosen": -102.21090087890624, "logps/rejected": -364.3767903645833, "loss": 0.1664, "rewards/chosen": 1.5624822616577148, "rewards/margins": 4.554548327128092, "rewards/rejected": -2.9920660654703775, "step": 14405 }, { "epoch": 0.7635756499615721, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79884680.0, "logits/rejected": -12071508.57142857, "logps/chosen": -1886.8544921875, "logps/rejected": -265.19505092075894, "loss": 0.1573, "rewards/chosen": 3.201464891433716, "rewards/margins": 4.948235613959176, "rewards/rejected": -1.7467707225254603, "step": 14406 }, { "epoch": 0.7636286539633742, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8827996.666666666, "logits/rejected": -29802739.2, "logps/chosen": -77.30812072753906, "logps/rejected": -313.8890869140625, "loss": 0.2981, "rewards/chosen": -0.18085453907648721, "rewards/margins": 1.6908190449078877, "rewards/rejected": -1.871673583984375, "step": 14407 }, { "epoch": 0.7636816579651764, "grad_norm": 48.25, "kl": 0.12957000732421875, "learning_rate": 5e-07, "logits/chosen": -13551352.0, "logits/rejected": -8710940.0, "logps/chosen": -321.4839680989583, "logps/rejected": -63.15037155151367, "loss": 0.3502, "rewards/chosen": 0.4185752868652344, "rewards/margins": 2.087505578994751, "rewards/rejected": -1.6689302921295166, "step": 14408 }, { "epoch": 0.7637346619669785, "grad_norm": 47.75, "kl": 0.01674365997314453, "learning_rate": 5e-07, "logits/chosen": -46658032.0, "logits/rejected": -12352008.0, "logps/chosen": -309.9731750488281, "logps/rejected": -235.0001220703125, "loss": 0.2839, "rewards/chosen": 0.4792160093784332, "rewards/margins": 2.3389711678028107, "rewards/rejected": -1.8597551584243774, "step": 14409 }, { "epoch": 0.7637876659687807, "grad_norm": 50.0, "kl": 1.2165050506591797, "learning_rate": 5e-07, "logits/chosen": -68649221.33333333, "logits/rejected": -17433321.6, "logps/chosen": -601.9728597005209, "logps/rejected": -252.5419677734375, "loss": 0.1954, "rewards/chosen": 0.9199136892954508, "rewards/margins": 3.113252274195353, "rewards/rejected": -2.1933385848999025, "step": 14410 }, { "epoch": 0.7638406699705828, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84029104.0, "logits/rejected": -35578864.0, "logps/chosen": -445.4886474609375, "logps/rejected": -375.9214782714844, "loss": 0.3426, "rewards/chosen": -0.31497690081596375, "rewards/margins": 3.012185364961624, "rewards/rejected": -3.327162265777588, "step": 14411 }, { "epoch": 0.763893673972385, "grad_norm": 56.0, "kl": 0.5578818321228027, "learning_rate": 5e-07, "logits/chosen": -35589029.333333336, "logits/rejected": -5793810.8, "logps/chosen": -203.51127115885416, "logps/rejected": -194.0917724609375, "loss": 0.2494, "rewards/chosen": 0.48435767491658527, "rewards/margins": 2.668273957570394, "rewards/rejected": -2.1839162826538088, "step": 14412 }, { "epoch": 0.763946677974187, "grad_norm": 47.75, "kl": 2.4637088775634766, "learning_rate": 5e-07, "logits/chosen": -38771013.333333336, "logits/rejected": -32235212.8, "logps/chosen": -344.0503743489583, "logps/rejected": -300.005224609375, "loss": 0.2209, "rewards/chosen": 0.9620888233184814, "rewards/margins": 3.4561434268951414, "rewards/rejected": -2.49405460357666, "step": 14413 }, { "epoch": 0.7639996819759892, "grad_norm": 31.75, "kl": 1.5231924057006836, "learning_rate": 5e-07, "logits/chosen": 8005137.333333333, "logits/rejected": -17460766.4, "logps/chosen": -38.25181579589844, "logps/rejected": -575.679296875, "loss": 0.1953, "rewards/chosen": 0.2738426923751831, "rewards/margins": 4.515620493888855, "rewards/rejected": -4.241777801513672, "step": 14414 }, { "epoch": 0.7640526859777913, "grad_norm": 41.0, "kl": 2.9612693786621094, "learning_rate": 5e-07, "logits/chosen": -34565730.666666664, "logits/rejected": -40797344.0, "logps/chosen": -250.444091796875, "logps/rejected": -453.0611328125, "loss": 0.2229, "rewards/chosen": 0.6730147997538248, "rewards/margins": 2.9958255449930826, "rewards/rejected": -2.322810745239258, "step": 14415 }, { "epoch": 0.7641056899795935, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24789456.0, "logits/rejected": -15127932.8, "logps/chosen": -190.16166178385416, "logps/rejected": -221.1239990234375, "loss": 0.3128, "rewards/chosen": -0.031535844008127846, "rewards/margins": 2.245581312974294, "rewards/rejected": -2.277117156982422, "step": 14416 }, { "epoch": 0.7641586939813956, "grad_norm": 57.25, "kl": 0.8033809661865234, "learning_rate": 5e-07, "logits/chosen": -46246326.85714286, "logits/rejected": -39142124.0, "logps/chosen": -295.0553501674107, "logps/rejected": -999.7770385742188, "loss": 0.3925, "rewards/chosen": 0.28889029366629465, "rewards/margins": 4.515507629939488, "rewards/rejected": -4.226617336273193, "step": 14417 }, { "epoch": 0.7642116979831978, "grad_norm": 46.25, "kl": 1.5909042358398438, "learning_rate": 5e-07, "logits/chosen": -36303356.0, "logits/rejected": -17885452.0, "logps/chosen": -635.434814453125, "logps/rejected": -261.98773193359375, "loss": 0.2687, "rewards/chosen": 1.5482635498046875, "rewards/margins": 3.647428035736084, "rewards/rejected": -2.0991644859313965, "step": 14418 }, { "epoch": 0.7642647019849999, "grad_norm": 56.75, "kl": 2.359283447265625, "learning_rate": 5e-07, "logits/chosen": -58256064.0, "logits/rejected": -6280032.666666667, "logps/chosen": -466.84267578125, "logps/rejected": -167.505126953125, "loss": 0.4106, "rewards/chosen": -0.1952539086341858, "rewards/margins": 2.711211621761322, "rewards/rejected": -2.906465530395508, "step": 14419 }, { "epoch": 0.764317705986802, "grad_norm": 61.25, "kl": 1.485147476196289, "learning_rate": 5e-07, "logits/chosen": -29681493.333333332, "logits/rejected": -7609845.0, "logps/chosen": -211.00946044921875, "logps/rejected": -208.29556274414062, "loss": 0.3624, "rewards/chosen": 0.5052743355433146, "rewards/margins": 1.8524918953577676, "rewards/rejected": -1.3472175598144531, "step": 14420 }, { "epoch": 0.7643707099886041, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71405280.0, "logits/rejected": -68727097.6, "logps/chosen": -481.6728515625, "logps/rejected": -404.846875, "loss": 0.3382, "rewards/chosen": -0.2745620608329773, "rewards/margins": 1.4670868039131164, "rewards/rejected": -1.7416488647460937, "step": 14421 }, { "epoch": 0.7644237139904063, "grad_norm": 41.25, "kl": 1.7944297790527344, "learning_rate": 5e-07, "logits/chosen": -2837623.3333333335, "logits/rejected": -19573384.0, "logps/chosen": -541.0830891927084, "logps/rejected": -471.6338806152344, "loss": 0.2824, "rewards/chosen": 1.1126627922058105, "rewards/margins": 3.719633102416992, "rewards/rejected": -2.6069703102111816, "step": 14422 }, { "epoch": 0.7644767179922084, "grad_norm": 48.25, "kl": 2.8952255249023438, "learning_rate": 5e-07, "logits/chosen": -8306808.0, "logits/rejected": -3828720.6666666665, "logps/chosen": -207.3370849609375, "logps/rejected": -125.89501953125, "loss": 0.4752, "rewards/chosen": 0.40780930519104003, "rewards/margins": 0.7394410053888957, "rewards/rejected": -0.33163170019785565, "step": 14423 }, { "epoch": 0.7645297219940106, "grad_norm": 43.25, "kl": 0.19236373901367188, "learning_rate": 5e-07, "logits/chosen": -27264044.8, "logits/rejected": -3637496.6666666665, "logps/chosen": -168.4636962890625, "logps/rejected": -148.63323974609375, "loss": 0.3552, "rewards/chosen": -0.09112336635589599, "rewards/margins": 2.756424164772034, "rewards/rejected": -2.8475475311279297, "step": 14424 }, { "epoch": 0.7645827259958127, "grad_norm": 36.5, "kl": 1.166548728942871, "learning_rate": 5e-07, "logits/chosen": -2000944.0, "logits/rejected": -47020120.0, "logps/chosen": -76.591064453125, "logps/rejected": -589.9783935546875, "loss": 0.2874, "rewards/chosen": 0.0624941810965538, "rewards/margins": 2.6829184517264366, "rewards/rejected": -2.620424270629883, "step": 14425 }, { "epoch": 0.7646357299976149, "grad_norm": 56.5, "kl": 2.4083762168884277, "learning_rate": 5e-07, "logits/chosen": -44519876.571428575, "logits/rejected": -31095630.0, "logps/chosen": -348.16029575892856, "logps/rejected": -193.4220428466797, "loss": 0.4317, "rewards/chosen": 0.5447825023106166, "rewards/margins": 1.8537974187305997, "rewards/rejected": -1.309014916419983, "step": 14426 }, { "epoch": 0.764688733999417, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7889032.5, "logits/rejected": -23226965.333333332, "logps/chosen": -244.09812927246094, "logps/rejected": -214.35176595052084, "loss": 0.2307, "rewards/chosen": 0.15315744280815125, "rewards/margins": 2.192242354154587, "rewards/rejected": -2.0390849113464355, "step": 14427 }, { "epoch": 0.7647417380012191, "grad_norm": 80.0, "kl": 7.15226936340332, "learning_rate": 5e-07, "logits/chosen": -50269241.6, "logits/rejected": -91712832.0, "logps/chosen": -684.188427734375, "logps/rejected": -241.91654459635416, "loss": 0.2492, "rewards/chosen": 1.7634878158569336, "rewards/margins": 4.732796351114908, "rewards/rejected": -2.969308535257975, "step": 14428 }, { "epoch": 0.7647947420030212, "grad_norm": 40.25, "kl": 0.31134796142578125, "learning_rate": 5e-07, "logits/chosen": -53281916.0, "logits/rejected": -18884396.0, "logps/chosen": -204.33155822753906, "logps/rejected": -338.52244059244794, "loss": 0.1562, "rewards/chosen": 0.27856940031051636, "rewards/margins": 3.5773666898409524, "rewards/rejected": -3.298797289530436, "step": 14429 }, { "epoch": 0.7648477460048234, "grad_norm": 38.25, "kl": 0.27112340927124023, "learning_rate": 5e-07, "logits/chosen": -9117437.333333334, "logits/rejected": -30146118.4, "logps/chosen": -159.02486165364584, "logps/rejected": -200.44052734375, "loss": 0.2293, "rewards/chosen": 0.4351789156595866, "rewards/margins": 2.4250905672709147, "rewards/rejected": -1.9899116516113282, "step": 14430 }, { "epoch": 0.7649007500066255, "grad_norm": 65.5, "kl": 0.41845130920410156, "learning_rate": 5e-07, "logits/chosen": -18950285.333333332, "logits/rejected": 17399458.0, "logps/chosen": -365.8590087890625, "logps/rejected": -494.91619873046875, "loss": 0.3258, "rewards/chosen": 0.687952438990275, "rewards/margins": 2.6123096148173013, "rewards/rejected": -1.9243571758270264, "step": 14431 }, { "epoch": 0.7649537540084277, "grad_norm": 41.25, "kl": 0.6585540771484375, "learning_rate": 5e-07, "logits/chosen": -43945152.0, "logits/rejected": -32564714.666666668, "logps/chosen": -467.0372314453125, "logps/rejected": -441.0416259765625, "loss": 0.1607, "rewards/chosen": 0.9359542727470398, "rewards/margins": 3.4830421010653176, "rewards/rejected": -2.547087828318278, "step": 14432 }, { "epoch": 0.7650067580102298, "grad_norm": 49.0, "kl": 0.6063804626464844, "learning_rate": 5e-07, "logits/chosen": -26793789.333333332, "logits/rejected": -31049942.4, "logps/chosen": -296.9513346354167, "logps/rejected": -596.551806640625, "loss": 0.2603, "rewards/chosen": 0.4948565165201823, "rewards/margins": 4.210389582316081, "rewards/rejected": -3.7155330657958983, "step": 14433 }, { "epoch": 0.765059762012032, "grad_norm": 35.75, "kl": 1.2855949401855469, "learning_rate": 5e-07, "logits/chosen": -9558014.666666666, "logits/rejected": -51389702.4, "logps/chosen": -65.06337483723958, "logps/rejected": -273.3962646484375, "loss": 0.2887, "rewards/chosen": 0.5740234057108561, "rewards/margins": 2.3067859331766765, "rewards/rejected": -1.7327625274658203, "step": 14434 }, { "epoch": 0.765112766013834, "grad_norm": 64.5, "kl": 0.2791748046875, "learning_rate": 5e-07, "logits/chosen": -136987840.0, "logits/rejected": 1875695.0, "logps/chosen": -612.6217651367188, "logps/rejected": -488.8258056640625, "loss": 0.1997, "rewards/chosen": 0.7018440365791321, "rewards/margins": 4.089858591556549, "rewards/rejected": -3.388014554977417, "step": 14435 }, { "epoch": 0.7651657700156361, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19460668.0, "logits/rejected": -17577168.0, "logps/chosen": -183.76806640625, "logps/rejected": -392.3873596191406, "loss": 0.3049, "rewards/chosen": 0.12528815865516663, "rewards/margins": 3.0604497492313385, "rewards/rejected": -2.935161590576172, "step": 14436 }, { "epoch": 0.7652187740174383, "grad_norm": 44.5, "kl": 0.2756509780883789, "learning_rate": 5e-07, "logits/chosen": -12446253.333333334, "logits/rejected": -15192489.6, "logps/chosen": -172.1754353841146, "logps/rejected": -292.7923095703125, "loss": 0.2428, "rewards/chosen": 1.243312915166219, "rewards/margins": 2.699356253941854, "rewards/rejected": -1.4560433387756349, "step": 14437 }, { "epoch": 0.7652717780192404, "grad_norm": 73.5, "kl": 2.053546905517578, "learning_rate": 5e-07, "logits/chosen": -50304432.0, "logits/rejected": -15418429.0, "logps/chosen": -363.01336669921875, "logps/rejected": -126.12977600097656, "loss": 0.2151, "rewards/chosen": 1.1791610717773438, "rewards/margins": 3.7017970085144043, "rewards/rejected": -2.5226359367370605, "step": 14438 }, { "epoch": 0.7653247820210426, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4514726.0, "logits/rejected": -32963168.0, "logps/chosen": -283.34443359375, "logps/rejected": -453.79150390625, "loss": 0.2756, "rewards/chosen": 0.39725046157836913, "rewards/margins": 3.8078825950622557, "rewards/rejected": -3.4106321334838867, "step": 14439 }, { "epoch": 0.7653777860228447, "grad_norm": 57.0, "kl": 2.2348480224609375, "learning_rate": 5e-07, "logits/chosen": 1041001.6, "logits/rejected": -28095312.0, "logps/chosen": -85.66280517578124, "logps/rejected": -228.27494303385416, "loss": 0.307, "rewards/chosen": 0.787771463394165, "rewards/margins": 2.4925698439280195, "rewards/rejected": -1.7047983805338542, "step": 14440 }, { "epoch": 0.7654307900246469, "grad_norm": 65.5, "kl": 4.458967208862305, "learning_rate": 5e-07, "logits/chosen": -65963568.0, "logits/rejected": -21150806.0, "logps/chosen": -464.0377604166667, "logps/rejected": -45.32521438598633, "loss": 0.4991, "rewards/chosen": 0.3126642902692159, "rewards/margins": 1.0941797097524006, "rewards/rejected": -0.7815154194831848, "step": 14441 }, { "epoch": 0.765483794026449, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6286529.0, "logits/rejected": 6872710.285714285, "logps/chosen": -12.137237548828125, "logps/rejected": -395.18310546875, "loss": 0.2702, "rewards/chosen": -0.32186564803123474, "rewards/margins": 1.4995106075491225, "rewards/rejected": -1.8213762555803572, "step": 14442 }, { "epoch": 0.7655367980282511, "grad_norm": 33.0, "kl": 1.5476303100585938, "learning_rate": 5e-07, "logits/chosen": 11940330.666666666, "logits/rejected": -15090636.8, "logps/chosen": -151.27850341796875, "logps/rejected": -283.6886474609375, "loss": 0.2822, "rewards/chosen": 0.36254727840423584, "rewards/margins": 3.2032970190048218, "rewards/rejected": -2.840749740600586, "step": 14443 }, { "epoch": 0.7655898020300532, "grad_norm": 63.5, "kl": 2.5194435119628906, "learning_rate": 5e-07, "logits/chosen": -36985552.0, "logits/rejected": -34814701.333333336, "logps/chosen": -65.119140625, "logps/rejected": -245.8537801106771, "loss": 0.4253, "rewards/chosen": -0.014740371704101562, "rewards/margins": 2.014059638977051, "rewards/rejected": -2.0288000106811523, "step": 14444 }, { "epoch": 0.7656428060318554, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20647460.0, "logits/rejected": -13218606.0, "logps/chosen": -245.59649658203125, "logps/rejected": -363.7261657714844, "loss": 0.4248, "rewards/chosen": 0.06787001093228658, "rewards/margins": 1.2007641692956288, "rewards/rejected": -1.1328941583633423, "step": 14445 }, { "epoch": 0.7656958100336575, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50815376.0, "logits/rejected": -39505346.666666664, "logps/chosen": -529.522705078125, "logps/rejected": -338.92165120442706, "loss": 0.2567, "rewards/chosen": -0.05539856106042862, "rewards/margins": 2.3118692065278688, "rewards/rejected": -2.3672677675882974, "step": 14446 }, { "epoch": 0.7657488140354597, "grad_norm": 38.25, "kl": 4.248420715332031, "learning_rate": 5e-07, "logits/chosen": -30881305.6, "logits/rejected": 2154806.3333333335, "logps/chosen": -547.747900390625, "logps/rejected": -113.9027811686198, "loss": 0.3042, "rewards/chosen": 1.0242477416992188, "rewards/margins": 3.5805941581726075, "rewards/rejected": -2.5563464164733887, "step": 14447 }, { "epoch": 0.7658018180372618, "grad_norm": 87.5, "kl": 3.4019460678100586, "learning_rate": 5e-07, "logits/chosen": -22773632.0, "logits/rejected": -25319470.4, "logps/chosen": -189.0604451497396, "logps/rejected": -459.7013671875, "loss": 0.2394, "rewards/chosen": 0.9436597029368082, "rewards/margins": 3.3965897719065348, "rewards/rejected": -2.4529300689697267, "step": 14448 }, { "epoch": 0.765854822039064, "grad_norm": 50.0, "kl": 2.7954978942871094, "learning_rate": 5e-07, "logits/chosen": -4198225.5, "logits/rejected": -17791306.0, "logps/chosen": -482.61468505859375, "logps/rejected": -148.55813598632812, "loss": 0.2401, "rewards/chosen": 1.6394453048706055, "rewards/margins": 3.911590099334717, "rewards/rejected": -2.2721447944641113, "step": 14449 }, { "epoch": 0.765907826040866, "grad_norm": 46.75, "kl": 2.9668197631835938, "learning_rate": 5e-07, "logits/chosen": 8783576.0, "logits/rejected": -8632294.0, "logps/chosen": -182.0762481689453, "logps/rejected": -288.76080322265625, "loss": 0.251, "rewards/chosen": 1.3372142314910889, "rewards/margins": 2.8420965671539307, "rewards/rejected": -1.5048823356628418, "step": 14450 }, { "epoch": 0.7659608300426682, "grad_norm": 89.5, "kl": 1.0435333251953125, "learning_rate": 5e-07, "logits/chosen": -83254329.6, "logits/rejected": -45970837.333333336, "logps/chosen": -630.90498046875, "logps/rejected": -245.09733072916666, "loss": 0.4089, "rewards/chosen": -0.14000978469848632, "rewards/margins": 2.0365034103393556, "rewards/rejected": -2.176513195037842, "step": 14451 }, { "epoch": 0.7660138340444703, "grad_norm": 38.0, "kl": 2.2560601234436035, "learning_rate": 5e-07, "logits/chosen": -2934559.8, "logits/rejected": 1202533.3333333333, "logps/chosen": -75.36680908203125, "logps/rejected": -553.2099609375, "loss": 0.3392, "rewards/chosen": 0.2882744789123535, "rewards/margins": 3.245146147410075, "rewards/rejected": -2.956871668497721, "step": 14452 }, { "epoch": 0.7660668380462725, "grad_norm": 71.0, "kl": 4.217540740966797, "learning_rate": 5e-07, "logits/chosen": -23507227.42857143, "logits/rejected": -15330582.0, "logps/chosen": -427.67661830357144, "logps/rejected": -114.72805786132812, "loss": 0.4247, "rewards/chosen": 0.5891742025102887, "rewards/margins": 3.867747953959874, "rewards/rejected": -3.278573751449585, "step": 14453 }, { "epoch": 0.7661198420480746, "grad_norm": 87.0, "kl": 2.7987117767333984, "learning_rate": 5e-07, "logits/chosen": 52359060.0, "logits/rejected": -4366113.5, "logps/chosen": -420.4461975097656, "logps/rejected": -94.18717193603516, "loss": 0.3458, "rewards/chosen": 0.8598592877388, "rewards/margins": 1.9431366324424744, "rewards/rejected": -1.0832773447036743, "step": 14454 }, { "epoch": 0.7661728460498768, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32950521.6, "logits/rejected": -29111114.666666668, "logps/chosen": -383.4092041015625, "logps/rejected": -527.7337239583334, "loss": 0.266, "rewards/chosen": 0.5309631824493408, "rewards/margins": 3.887671709060669, "rewards/rejected": -3.356708526611328, "step": 14455 }, { "epoch": 0.7662258500516789, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51117824.0, "logits/rejected": 2467996.6666666665, "logps/chosen": -149.49607849121094, "logps/rejected": -552.5314127604166, "loss": 0.1789, "rewards/chosen": 0.8405933976173401, "rewards/margins": 3.354562819004059, "rewards/rejected": -2.5139694213867188, "step": 14456 }, { "epoch": 0.7662788540534811, "grad_norm": 56.5, "kl": 2.140460968017578, "learning_rate": 5e-07, "logits/chosen": -37484096.0, "logits/rejected": -23023814.0, "logps/chosen": -290.4136047363281, "logps/rejected": -477.3785400390625, "loss": 0.243, "rewards/chosen": 0.81464684009552, "rewards/margins": 3.779342770576477, "rewards/rejected": -2.964695930480957, "step": 14457 }, { "epoch": 0.7663318580552831, "grad_norm": 46.0, "kl": 0.7648320198059082, "learning_rate": 5e-07, "logits/chosen": -29738569.6, "logits/rejected": -2788184.5, "logps/chosen": -362.8452880859375, "logps/rejected": -121.47739664713542, "loss": 0.284, "rewards/chosen": 0.8414811134338379, "rewards/margins": 2.626435979207357, "rewards/rejected": -1.7849548657735188, "step": 14458 }, { "epoch": 0.7663848620570853, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20531440.0, "logits/rejected": -53276960.0, "logps/chosen": -296.1111246744792, "logps/rejected": -468.068994140625, "loss": 0.258, "rewards/chosen": 0.18833414713541666, "rewards/margins": 2.1475926081339516, "rewards/rejected": -1.959258460998535, "step": 14459 }, { "epoch": 0.7664378660588874, "grad_norm": 44.0, "kl": 2.45263671875, "learning_rate": 5e-07, "logits/chosen": -25582398.0, "logits/rejected": 19276132.0, "logps/chosen": -314.87066650390625, "logps/rejected": -190.7821502685547, "loss": 0.323, "rewards/chosen": 0.49184390902519226, "rewards/margins": 2.4127430617809296, "rewards/rejected": -1.9208991527557373, "step": 14460 }, { "epoch": 0.7664908700606896, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23288480.0, "logits/rejected": -14328076.8, "logps/chosen": -259.5636393229167, "logps/rejected": -292.393896484375, "loss": 0.2071, "rewards/chosen": 0.6435597737630209, "rewards/margins": 3.357245190938314, "rewards/rejected": -2.713685417175293, "step": 14461 }, { "epoch": 0.7665438740624917, "grad_norm": 36.25, "kl": 0.5996589660644531, "learning_rate": 5e-07, "logits/chosen": -9775376.0, "logits/rejected": -26334299.42857143, "logps/chosen": -219.96014404296875, "logps/rejected": -439.47021484375, "loss": 0.1073, "rewards/chosen": 3.748089551925659, "rewards/margins": 6.274376085826329, "rewards/rejected": -2.5262865339006697, "step": 14462 }, { "epoch": 0.7665968780642939, "grad_norm": 37.75, "kl": 1.475825309753418, "learning_rate": 5e-07, "logits/chosen": -12308334.0, "logits/rejected": -23244524.0, "logps/chosen": -87.66954803466797, "logps/rejected": -195.71359252929688, "loss": 0.3156, "rewards/chosen": 0.25721269845962524, "rewards/margins": 2.0451332926750183, "rewards/rejected": -1.787920594215393, "step": 14463 }, { "epoch": 0.766649882066096, "grad_norm": 51.5, "kl": 4.14388370513916, "learning_rate": 5e-07, "logits/chosen": 23465861.333333332, "logits/rejected": -44402200.0, "logps/chosen": -181.02945963541666, "logps/rejected": -703.0543212890625, "loss": 0.4041, "rewards/chosen": 0.31348860263824463, "rewards/margins": 3.2170103788375854, "rewards/rejected": -2.903521776199341, "step": 14464 }, { "epoch": 0.7667028860678982, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77086944.0, "logits/rejected": -10498270.4, "logps/chosen": -491.156494140625, "logps/rejected": -253.355712890625, "loss": 0.2221, "rewards/chosen": 0.5465271472930908, "rewards/margins": 2.952911043167114, "rewards/rejected": -2.4063838958740233, "step": 14465 }, { "epoch": 0.7667558900697002, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22088092.0, "logits/rejected": -33005939.2, "logps/chosen": -162.50761922200522, "logps/rejected": -596.028125, "loss": 0.2042, "rewards/chosen": 0.4900476932525635, "rewards/margins": 5.033533716201783, "rewards/rejected": -4.543486022949219, "step": 14466 }, { "epoch": 0.7668088940715024, "grad_norm": 40.75, "kl": 4.137420654296875, "learning_rate": 5e-07, "logits/chosen": -18791585.6, "logits/rejected": -51646741.333333336, "logps/chosen": -311.340771484375, "logps/rejected": -746.3984375, "loss": 0.2985, "rewards/chosen": 0.9038608551025391, "rewards/margins": 4.688227780659994, "rewards/rejected": -3.7843669255574546, "step": 14467 }, { "epoch": 0.7668618980733045, "grad_norm": 90.5, "kl": 5.156089782714844, "learning_rate": 5e-07, "logits/chosen": -45892342.85714286, "logits/rejected": -46963668.0, "logps/chosen": -578.2963518415179, "logps/rejected": -406.2884521484375, "loss": 0.2441, "rewards/chosen": 1.6311578750610352, "rewards/margins": 4.662029504776001, "rewards/rejected": -3.030871629714966, "step": 14468 }, { "epoch": 0.7669149020751067, "grad_norm": 42.0, "kl": 2.376267433166504, "learning_rate": 5e-07, "logits/chosen": -18573493.333333332, "logits/rejected": -71108819.2, "logps/chosen": -337.4319254557292, "logps/rejected": -210.2273193359375, "loss": 0.2384, "rewards/chosen": 1.0984748204549153, "rewards/margins": 2.6448931058247886, "rewards/rejected": -1.5464182853698731, "step": 14469 }, { "epoch": 0.7669679060769088, "grad_norm": 70.0, "kl": 0.9903488159179688, "learning_rate": 5e-07, "logits/chosen": -41541238.85714286, "logits/rejected": -43894888.0, "logps/chosen": -345.17128208705356, "logps/rejected": -406.6063232421875, "loss": 0.5158, "rewards/chosen": -0.15152490139007568, "rewards/margins": 1.0273630619049072, "rewards/rejected": -1.178887963294983, "step": 14470 }, { "epoch": 0.767020910078711, "grad_norm": 56.75, "kl": 2.352476119995117, "learning_rate": 5e-07, "logits/chosen": -22260414.0, "logits/rejected": -19047460.0, "logps/chosen": -441.21832275390625, "logps/rejected": -305.2330017089844, "loss": 0.2423, "rewards/chosen": 1.1260876655578613, "rewards/margins": 3.4802591800689697, "rewards/rejected": -2.3541715145111084, "step": 14471 }, { "epoch": 0.7670739140805131, "grad_norm": 39.0, "kl": 0.8904037475585938, "learning_rate": 5e-07, "logits/chosen": -5338343.0, "logits/rejected": 160504.8125, "logps/chosen": -218.52920532226562, "logps/rejected": -95.61384582519531, "loss": 0.1911, "rewards/chosen": 1.1743919849395752, "rewards/margins": 4.2967143058776855, "rewards/rejected": -3.1223223209381104, "step": 14472 }, { "epoch": 0.7671269180823153, "grad_norm": 63.25, "kl": 1.1100177764892578, "learning_rate": 5e-07, "logits/chosen": 10484583.2, "logits/rejected": -18206329.333333332, "logps/chosen": -179.6255126953125, "logps/rejected": -302.83473714192706, "loss": 0.3358, "rewards/chosen": 0.6400857925415039, "rewards/margins": 2.2476933161417643, "rewards/rejected": -1.6076075236002605, "step": 14473 }, { "epoch": 0.7671799220841173, "grad_norm": 38.75, "kl": 1.5782318115234375, "learning_rate": 5e-07, "logits/chosen": -45802741.333333336, "logits/rejected": -27696700.8, "logps/chosen": -257.0638834635417, "logps/rejected": -512.9296875, "loss": 0.3042, "rewards/chosen": -0.21659223238627115, "rewards/margins": 2.2747926553090414, "rewards/rejected": -2.4913848876953124, "step": 14474 }, { "epoch": 0.7672329260859195, "grad_norm": 35.25, "kl": 0.7581253051757812, "learning_rate": 5e-07, "logits/chosen": -17565180.0, "logits/rejected": -89337292.8, "logps/chosen": -344.6578776041667, "logps/rejected": -459.78759765625, "loss": 0.1654, "rewards/chosen": 0.8314643700917562, "rewards/margins": 4.004685195287069, "rewards/rejected": -3.1732208251953127, "step": 14475 }, { "epoch": 0.7672859300877216, "grad_norm": 50.25, "kl": 4.500602722167969, "learning_rate": 5e-07, "logits/chosen": -29579970.666666668, "logits/rejected": -63188848.0, "logps/chosen": -332.1915283203125, "logps/rejected": -431.8213806152344, "loss": 0.42, "rewards/chosen": 0.581798235575358, "rewards/margins": 3.7944416205088296, "rewards/rejected": -3.2126433849334717, "step": 14476 }, { "epoch": 0.7673389340895238, "grad_norm": 35.25, "kl": 0.3543052673339844, "learning_rate": 5e-07, "logits/chosen": 2454315.6666666665, "logits/rejected": 17141654.4, "logps/chosen": -116.03633626302083, "logps/rejected": -411.908984375, "loss": 0.2263, "rewards/chosen": 0.6175240278244019, "rewards/margins": 2.8086106061935423, "rewards/rejected": -2.1910865783691404, "step": 14477 }, { "epoch": 0.7673919380913259, "grad_norm": 27.25, "kl": 1.0892820358276367, "learning_rate": 5e-07, "logits/chosen": -9153631.0, "logits/rejected": -24424701.333333332, "logps/chosen": -37.875972747802734, "logps/rejected": -397.1672770182292, "loss": 0.1889, "rewards/chosen": 0.2510121464729309, "rewards/margins": 2.8450361688931785, "rewards/rejected": -2.5940240224202475, "step": 14478 }, { "epoch": 0.7674449420931281, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103706092.8, "logits/rejected": -17167853.333333332, "logps/chosen": -337.2040283203125, "logps/rejected": -235.53641764322916, "loss": 0.3555, "rewards/chosen": 0.014035958051681518, "rewards/margins": 1.9846971213817597, "rewards/rejected": -1.9706611633300781, "step": 14479 }, { "epoch": 0.7674979460949302, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54071288.0, "logits/rejected": -23689356.0, "logps/chosen": -478.1556396484375, "logps/rejected": -333.06610107421875, "loss": 0.2761, "rewards/chosen": 0.4719230532646179, "rewards/margins": 2.3246116042137146, "rewards/rejected": -1.8526885509490967, "step": 14480 }, { "epoch": 0.7675509500967324, "grad_norm": 56.25, "kl": 2.978090286254883, "learning_rate": 5e-07, "logits/chosen": -7232009.333333333, "logits/rejected": -30755432.0, "logps/chosen": -279.92372639973956, "logps/rejected": -195.0619659423828, "loss": 0.3742, "rewards/chosen": 0.43287694454193115, "rewards/margins": 4.021585822105408, "rewards/rejected": -3.5887088775634766, "step": 14481 }, { "epoch": 0.7676039540985344, "grad_norm": 52.25, "kl": 0.6308708190917969, "learning_rate": 5e-07, "logits/chosen": -9017520.0, "logits/rejected": -29063658.666666668, "logps/chosen": -313.3128356933594, "logps/rejected": -349.346923828125, "loss": 0.195, "rewards/chosen": 0.9507851004600525, "rewards/margins": 3.0787128806114197, "rewards/rejected": -2.127927780151367, "step": 14482 }, { "epoch": 0.7676569581003366, "grad_norm": 55.5, "kl": 0.7736320495605469, "learning_rate": 5e-07, "logits/chosen": -37308576.0, "logits/rejected": -39890836.0, "logps/chosen": -222.39862060546875, "logps/rejected": -333.41473388671875, "loss": 0.3461, "rewards/chosen": -0.05545195937156677, "rewards/margins": 2.2014171183109283, "rewards/rejected": -2.256869077682495, "step": 14483 }, { "epoch": 0.7677099621021387, "grad_norm": 56.75, "kl": 2.0444469451904297, "learning_rate": 5e-07, "logits/chosen": -43956626.28571428, "logits/rejected": -10303486.0, "logps/chosen": -333.58530970982144, "logps/rejected": -180.51759338378906, "loss": 0.3959, "rewards/chosen": 0.5846729278564453, "rewards/margins": 3.0406131744384766, "rewards/rejected": -2.4559402465820312, "step": 14484 }, { "epoch": 0.7677629661039409, "grad_norm": 46.75, "kl": 2.2784175872802734, "learning_rate": 5e-07, "logits/chosen": -1852217.5, "logits/rejected": -27231964.0, "logps/chosen": -264.46490478515625, "logps/rejected": -256.24261474609375, "loss": 0.2816, "rewards/chosen": 0.655677855014801, "rewards/margins": 2.6775529980659485, "rewards/rejected": -2.0218751430511475, "step": 14485 }, { "epoch": 0.767815970105743, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64774986.666666664, "logits/rejected": 19997275.2, "logps/chosen": -314.72605387369794, "logps/rejected": -254.020458984375, "loss": 0.3954, "rewards/chosen": -0.575069785118103, "rewards/margins": 0.7434623956680297, "rewards/rejected": -1.3185321807861328, "step": 14486 }, { "epoch": 0.7678689741075451, "grad_norm": 52.25, "kl": 3.8262338638305664, "learning_rate": 5e-07, "logits/chosen": -6216796.666666667, "logits/rejected": -753533.125, "logps/chosen": -286.16310628255206, "logps/rejected": -106.31592559814453, "loss": 0.3903, "rewards/chosen": 0.6189524332682291, "rewards/margins": 2.84755531946818, "rewards/rejected": -2.228602886199951, "step": 14487 }, { "epoch": 0.7679219781093473, "grad_norm": 62.25, "kl": 1.7422122955322266, "learning_rate": 5e-07, "logits/chosen": -93397235.2, "logits/rejected": -66375424.0, "logps/chosen": -402.07783203125, "logps/rejected": -645.2062174479166, "loss": 0.2835, "rewards/chosen": 0.5797431945800782, "rewards/margins": 3.616215833028158, "rewards/rejected": -3.0364726384480796, "step": 14488 }, { "epoch": 0.7679749821111493, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7631628.0, "logits/rejected": -8028102.285714285, "logps/chosen": -5.706661224365234, "logps/rejected": -192.14920479910714, "loss": 0.2079, "rewards/chosen": -0.1309823989868164, "rewards/margins": 2.447178840637207, "rewards/rejected": -2.5781612396240234, "step": 14489 }, { "epoch": 0.7680279861129515, "grad_norm": 47.75, "kl": 3.425294876098633, "learning_rate": 5e-07, "logits/chosen": -25019376.0, "logits/rejected": -24301622.0, "logps/chosen": -384.58209228515625, "logps/rejected": -359.6161804199219, "loss": 0.3885, "rewards/chosen": 0.22791290283203125, "rewards/margins": 1.904000997543335, "rewards/rejected": -1.6760880947113037, "step": 14490 }, { "epoch": 0.7680809901147536, "grad_norm": 40.25, "kl": 0.12572717666625977, "learning_rate": 5e-07, "logits/chosen": -21732496.0, "logits/rejected": -39426996.0, "logps/chosen": -175.49307250976562, "logps/rejected": -255.02488708496094, "loss": 0.3111, "rewards/chosen": 0.056227631866931915, "rewards/margins": 2.5354885533452034, "rewards/rejected": -2.4792609214782715, "step": 14491 }, { "epoch": 0.7681339941165558, "grad_norm": 43.0, "kl": 3.396712303161621, "learning_rate": 5e-07, "logits/chosen": -13535669.333333334, "logits/rejected": -28586592.0, "logps/chosen": -195.80364990234375, "logps/rejected": -264.7833251953125, "loss": 0.4182, "rewards/chosen": 0.43852198123931885, "rewards/margins": 2.13718843460083, "rewards/rejected": -1.6986664533615112, "step": 14492 }, { "epoch": 0.7681869981183579, "grad_norm": 41.5, "kl": 0.515568733215332, "learning_rate": 5e-07, "logits/chosen": 4502044.0, "logits/rejected": -36258453.333333336, "logps/chosen": -18.127357482910156, "logps/rejected": -344.308837890625, "loss": 0.2687, "rewards/chosen": 0.13363569974899292, "rewards/margins": 2.7346102595329285, "rewards/rejected": -2.6009745597839355, "step": 14493 }, { "epoch": 0.7682400021201601, "grad_norm": 48.75, "kl": 0.2068634033203125, "learning_rate": 5e-07, "logits/chosen": 18177702.0, "logits/rejected": -27456844.0, "logps/chosen": -341.379638671875, "logps/rejected": -157.08291625976562, "loss": 0.2257, "rewards/chosen": 1.1320992708206177, "rewards/margins": 2.81902015209198, "rewards/rejected": -1.6869208812713623, "step": 14494 }, { "epoch": 0.7682930061219622, "grad_norm": 44.0, "kl": 1.975799560546875, "learning_rate": 5e-07, "logits/chosen": -21585184.0, "logits/rejected": -33265344.0, "logps/chosen": -249.2855224609375, "logps/rejected": -421.3958333333333, "loss": 0.3262, "rewards/chosen": 0.25211548805236816, "rewards/margins": 4.798975865046184, "rewards/rejected": -4.546860376993815, "step": 14495 }, { "epoch": 0.7683460101237644, "grad_norm": 32.25, "kl": 1.3815593719482422, "learning_rate": 5e-07, "logits/chosen": -32396924.0, "logits/rejected": -48106800.0, "logps/chosen": -227.9758758544922, "logps/rejected": -306.7597351074219, "loss": 0.2796, "rewards/chosen": 0.30640897154808044, "rewards/margins": 3.713622659444809, "rewards/rejected": -3.4072136878967285, "step": 14496 }, { "epoch": 0.7683990141255664, "grad_norm": 39.25, "kl": 1.1789665222167969, "learning_rate": 5e-07, "logits/chosen": -14041064.0, "logits/rejected": -24087763.2, "logps/chosen": -316.30788167317706, "logps/rejected": -336.617333984375, "loss": 0.2518, "rewards/chosen": 0.6302276452382406, "rewards/margins": 2.8124622186024983, "rewards/rejected": -2.1822345733642576, "step": 14497 }, { "epoch": 0.7684520181273686, "grad_norm": 48.0, "kl": 2.891343116760254, "learning_rate": 5e-07, "logits/chosen": 9917767.333333334, "logits/rejected": -7145688.8, "logps/chosen": -125.7609354654948, "logps/rejected": -154.724365234375, "loss": 0.2114, "rewards/chosen": 1.7947934468587239, "rewards/margins": 2.9947256406148277, "rewards/rejected": -1.1999321937561036, "step": 14498 }, { "epoch": 0.7685050221291707, "grad_norm": 38.5, "kl": 0.07848739624023438, "learning_rate": 5e-07, "logits/chosen": -30146766.0, "logits/rejected": -14528390.0, "logps/chosen": -226.1718292236328, "logps/rejected": -283.24261474609375, "loss": 0.262, "rewards/chosen": 0.24568557739257812, "rewards/margins": 3.1143319606781006, "rewards/rejected": -2.8686463832855225, "step": 14499 }, { "epoch": 0.7685580261309729, "grad_norm": 52.0, "kl": 0.8337326049804688, "learning_rate": 5e-07, "logits/chosen": -64370636.8, "logits/rejected": -16837954.666666668, "logps/chosen": -330.16533203125, "logps/rejected": -163.83941650390625, "loss": 0.3489, "rewards/chosen": 0.43677387237548826, "rewards/margins": 2.2341312726338702, "rewards/rejected": -1.797357400258382, "step": 14500 }, { "epoch": 0.768611030132775, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21764574.0, "logits/rejected": -20536053.333333332, "logps/chosen": -202.80328369140625, "logps/rejected": -622.5652669270834, "loss": 0.2027, "rewards/chosen": 0.24119627475738525, "rewards/margins": 3.5426024993260703, "rewards/rejected": -3.301406224568685, "step": 14501 }, { "epoch": 0.7686640341345772, "grad_norm": 46.0, "kl": 4.195917129516602, "learning_rate": 5e-07, "logits/chosen": -3719922.0, "logits/rejected": -41606716.8, "logps/chosen": -307.2804768880208, "logps/rejected": -447.71796875, "loss": 0.2318, "rewards/chosen": 0.8822382291158041, "rewards/margins": 3.8479917844136557, "rewards/rejected": -2.9657535552978516, "step": 14502 }, { "epoch": 0.7687170381363793, "grad_norm": 56.5, "kl": 3.448169708251953, "learning_rate": 5e-07, "logits/chosen": -21920750.0, "logps/chosen": -383.78021240234375, "loss": 0.427, "rewards/chosen": 0.6915631294250488, "step": 14503 }, { "epoch": 0.7687700421381815, "grad_norm": 50.25, "kl": 2.98807430267334, "learning_rate": 5e-07, "logits/chosen": -22573228.8, "logits/rejected": -40973034.666666664, "logps/chosen": -230.03515625, "logps/rejected": -357.8157145182292, "loss": 0.3948, "rewards/chosen": 0.41260480880737305, "rewards/margins": 2.5268430709838867, "rewards/rejected": -2.1142382621765137, "step": 14504 }, { "epoch": 0.7688230461399835, "grad_norm": 55.5, "kl": 3.125223159790039, "learning_rate": 5e-07, "logits/chosen": -30918370.666666668, "logits/rejected": -26001184.0, "logps/chosen": -236.46195475260416, "logps/rejected": -563.519775390625, "loss": 0.4133, "rewards/chosen": 0.2864419221878052, "rewards/margins": 3.2915674448013306, "rewards/rejected": -3.0051255226135254, "step": 14505 }, { "epoch": 0.7688760501417857, "grad_norm": 47.0, "kl": 1.2912158966064453, "learning_rate": 5e-07, "logits/chosen": -8744561.333333334, "logits/rejected": -47306521.6, "logps/chosen": -103.24990844726562, "logps/rejected": -280.71494140625, "loss": 0.2887, "rewards/chosen": 0.7170490423838297, "rewards/margins": 2.5322256247202555, "rewards/rejected": -1.8151765823364259, "step": 14506 }, { "epoch": 0.7689290541435878, "grad_norm": 45.5, "kl": 0.7868986129760742, "learning_rate": 5e-07, "logits/chosen": -8898154.0, "logits/rejected": -33136706.0, "logps/chosen": -299.86383056640625, "logps/rejected": -231.2706298828125, "loss": 0.3109, "rewards/chosen": 0.02385411038994789, "rewards/margins": 2.9461019970476627, "rewards/rejected": -2.922247886657715, "step": 14507 }, { "epoch": 0.76898205814539, "grad_norm": 55.75, "kl": 1.2969837188720703, "learning_rate": 5e-07, "logits/chosen": 24160800.0, "logits/rejected": -22222800.0, "logps/chosen": -1421.0084228515625, "logps/rejected": -291.6371256510417, "loss": 0.1823, "rewards/chosen": 1.91510009765625, "rewards/margins": 3.9617697397867837, "rewards/rejected": -2.0466696421305337, "step": 14508 }, { "epoch": 0.7690350621471921, "grad_norm": 40.0, "kl": 0.3227996826171875, "learning_rate": 5e-07, "logits/chosen": -53746016.0, "logits/rejected": -6924652.8, "logps/chosen": -360.5911458333333, "logps/rejected": -172.26461181640624, "loss": 0.2586, "rewards/chosen": 0.6196838219960531, "rewards/margins": 2.1554044564565022, "rewards/rejected": -1.5357206344604493, "step": 14509 }, { "epoch": 0.7690880661489943, "grad_norm": 70.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12228411.2, "logits/rejected": -40581725.333333336, "logps/chosen": -498.99248046875, "logps/rejected": -365.7850748697917, "loss": 0.2458, "rewards/chosen": 0.6967798233032226, "rewards/margins": 3.378413454691569, "rewards/rejected": -2.681633631388346, "step": 14510 }, { "epoch": 0.7691410701507964, "grad_norm": 72.5, "kl": 1.4811859130859375, "learning_rate": 5e-07, "logits/chosen": -47605059.2, "logits/rejected": -15101960.0, "logps/chosen": -342.878125, "logps/rejected": -327.10483805338544, "loss": 0.4969, "rewards/chosen": -0.04315109252929687, "rewards/margins": 0.5042190233866374, "rewards/rejected": -0.5473701159159342, "step": 14511 }, { "epoch": 0.7691940741525986, "grad_norm": 51.5, "kl": 0.413726806640625, "learning_rate": 5e-07, "logits/chosen": -41523874.666666664, "logits/rejected": -27376547.2, "logps/chosen": -391.5764567057292, "logps/rejected": -420.704736328125, "loss": 0.2039, "rewards/chosen": 0.5085469881693522, "rewards/margins": 4.424017111460368, "rewards/rejected": -3.9154701232910156, "step": 14512 }, { "epoch": 0.7692470781544006, "grad_norm": 54.0, "kl": 1.9905548095703125, "learning_rate": 5e-07, "logits/chosen": -46434742.85714286, "logits/rejected": -99672256.0, "logps/chosen": -533.3826032366071, "logps/rejected": -519.7294311523438, "loss": 0.4036, "rewards/chosen": 0.5842032432556152, "rewards/margins": 4.220165252685547, "rewards/rejected": -3.6359620094299316, "step": 14513 }, { "epoch": 0.7693000821562028, "grad_norm": 55.5, "kl": 1.1137142181396484, "learning_rate": 5e-07, "logits/chosen": 21606179.2, "logits/rejected": -11489701.333333334, "logps/chosen": -448.1263671875, "logps/rejected": -200.29667154947916, "loss": 0.4005, "rewards/chosen": 0.092711341381073, "rewards/margins": 1.24033758242925, "rewards/rejected": -1.147626241048177, "step": 14514 }, { "epoch": 0.7693530861580049, "grad_norm": 37.0, "kl": 1.6934471130371094, "learning_rate": 5e-07, "logits/chosen": -23431818.0, "logits/rejected": -4115158.0, "logps/chosen": -296.3698425292969, "logps/rejected": -344.6025390625, "loss": 0.238, "rewards/chosen": 0.9070588946342468, "rewards/margins": 3.5301095843315125, "rewards/rejected": -2.6230506896972656, "step": 14515 }, { "epoch": 0.7694060901598071, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30234958.0, "logits/rejected": -24382826.666666668, "logps/chosen": -427.538330078125, "logps/rejected": -243.81868489583334, "loss": 0.2178, "rewards/chosen": 0.31801146268844604, "rewards/margins": 2.589065412680308, "rewards/rejected": -2.271053949991862, "step": 14516 }, { "epoch": 0.7694590941616092, "grad_norm": 47.75, "kl": 0.5317583084106445, "learning_rate": 5e-07, "logits/chosen": -21461397.333333332, "logits/rejected": -498468.0, "logps/chosen": -294.1508382161458, "logps/rejected": -287.85458984375, "loss": 0.2951, "rewards/chosen": 0.37587698300679523, "rewards/margins": 1.8564498742421467, "rewards/rejected": -1.4805728912353515, "step": 14517 }, { "epoch": 0.7695120981634114, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13520854.4, "logits/rejected": -7547867.333333333, "logps/chosen": -277.76787109375, "logps/rejected": -99.440185546875, "loss": 0.3096, "rewards/chosen": 0.7088477134704589, "rewards/margins": 2.7389479001363117, "rewards/rejected": -2.030100186665853, "step": 14518 }, { "epoch": 0.7695651021652135, "grad_norm": 58.75, "kl": 0.9586715698242188, "learning_rate": 5e-07, "logits/chosen": -44561168.0, "logits/rejected": -55840360.0, "logps/chosen": -445.61712646484375, "logps/rejected": -316.5290832519531, "loss": 0.2349, "rewards/chosen": 0.5494896173477173, "rewards/margins": 3.864124894142151, "rewards/rejected": -3.3146352767944336, "step": 14519 }, { "epoch": 0.7696181061670156, "grad_norm": 42.25, "kl": 2.0954742431640625, "learning_rate": 5e-07, "logits/chosen": -41959760.0, "logits/rejected": -5712708.0, "logps/chosen": -409.5330078125, "logps/rejected": -141.0853271484375, "loss": 0.2362, "rewards/chosen": 1.1723394393920898, "rewards/margins": 5.017695744832357, "rewards/rejected": -3.845356305440267, "step": 14520 }, { "epoch": 0.7696711101688177, "grad_norm": 57.25, "kl": 5.008792877197266, "learning_rate": 5e-07, "logits/chosen": -17106859.2, "logits/rejected": -73705242.66666667, "logps/chosen": -445.448681640625, "logps/rejected": -270.2528076171875, "loss": 0.4249, "rewards/chosen": 0.46777987480163574, "rewards/margins": 2.2186082998911543, "rewards/rejected": -1.7508284250895183, "step": 14521 }, { "epoch": 0.7697241141706199, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10321593.333333334, "logits/rejected": -7620130.4, "logps/chosen": -111.1886494954427, "logps/rejected": -317.62744140625, "loss": 0.2755, "rewards/chosen": -0.34337568283081055, "rewards/margins": 2.386933994293213, "rewards/rejected": -2.7303096771240236, "step": 14522 }, { "epoch": 0.769777118172422, "grad_norm": 51.75, "kl": 1.31683349609375, "learning_rate": 5e-07, "logits/chosen": -19872910.0, "logits/rejected": -6461213.0, "logps/chosen": -339.69342041015625, "logps/rejected": -142.35757446289062, "loss": 0.2834, "rewards/chosen": 0.7959940433502197, "rewards/margins": 2.0812597274780273, "rewards/rejected": -1.2852656841278076, "step": 14523 }, { "epoch": 0.7698301221742242, "grad_norm": 87.0, "kl": 2.2225351333618164, "learning_rate": 5e-07, "logits/chosen": -20113372.0, "logits/rejected": -18761096.0, "logps/chosen": -440.358154296875, "logps/rejected": -250.67660522460938, "loss": 0.3286, "rewards/chosen": 0.482781320810318, "rewards/margins": 2.2382594645023346, "rewards/rejected": -1.7554781436920166, "step": 14524 }, { "epoch": 0.7698831261760263, "grad_norm": 45.25, "kl": 2.40725040435791, "learning_rate": 5e-07, "logits/chosen": 6813546.666666667, "logits/rejected": -27670899.2, "logps/chosen": -40.484962463378906, "logps/rejected": -361.1240234375, "loss": 0.2856, "rewards/chosen": 0.2629363536834717, "rewards/margins": 2.7899856090545656, "rewards/rejected": -2.527049255371094, "step": 14525 }, { "epoch": 0.7699361301778285, "grad_norm": 50.0, "kl": 2.8999738693237305, "learning_rate": 5e-07, "logits/chosen": -10772012.666666666, "logits/rejected": -16164051.0, "logps/chosen": -232.6881103515625, "logps/rejected": -546.6141967773438, "loss": 0.3357, "rewards/chosen": 0.8965522448221842, "rewards/margins": 3.6022791067759194, "rewards/rejected": -2.7057268619537354, "step": 14526 }, { "epoch": 0.7699891341796306, "grad_norm": 58.5, "kl": 2.010580062866211, "learning_rate": 5e-07, "logits/chosen": -22294938.0, "logits/rejected": -29487800.0, "logps/chosen": -212.13714599609375, "logps/rejected": -579.9002685546875, "loss": 0.2832, "rewards/chosen": 0.6048699617385864, "rewards/margins": 3.425763249397278, "rewards/rejected": -2.8208932876586914, "step": 14527 }, { "epoch": 0.7700421381814327, "grad_norm": 65.0, "kl": 1.9715089797973633, "learning_rate": 5e-07, "logits/chosen": -19744448.0, "logits/rejected": 111465520.0, "logps/chosen": -365.5504557291667, "logps/rejected": -370.8905944824219, "loss": 0.3505, "rewards/chosen": 0.6038643519083658, "rewards/margins": 2.81717316309611, "rewards/rejected": -2.213308811187744, "step": 14528 }, { "epoch": 0.7700951421832348, "grad_norm": 40.75, "kl": 2.7558345794677734, "learning_rate": 5e-07, "logits/chosen": -18867.666666666668, "logits/rejected": -60059865.6, "logps/chosen": -184.77421061197916, "logps/rejected": -391.419970703125, "loss": 0.2448, "rewards/chosen": 1.084320306777954, "rewards/margins": 3.4170877933502197, "rewards/rejected": -2.3327674865722656, "step": 14529 }, { "epoch": 0.770148146185037, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21149259.2, "logits/rejected": -26989405.333333332, "logps/chosen": -184.6616943359375, "logps/rejected": -432.7710774739583, "loss": 0.321, "rewards/chosen": -0.02054092139005661, "rewards/margins": 4.0411688004930815, "rewards/rejected": -4.061709721883138, "step": 14530 }, { "epoch": 0.7702011501868391, "grad_norm": 93.0, "kl": 2.59328556060791, "learning_rate": 5e-07, "logits/chosen": -64864037.333333336, "logits/rejected": -4979641.0, "logps/chosen": -541.8762613932291, "logps/rejected": -105.91036987304688, "loss": 0.2731, "rewards/chosen": 1.0935614903767903, "rewards/margins": 2.9449556668599444, "rewards/rejected": -1.8513941764831543, "step": 14531 }, { "epoch": 0.7702541541886413, "grad_norm": 53.5, "kl": 0.8669929504394531, "learning_rate": 5e-07, "logits/chosen": -42906160.0, "logits/rejected": -50567450.666666664, "logps/chosen": -456.70830078125, "logps/rejected": -559.0797526041666, "loss": 0.2759, "rewards/chosen": 0.7788360595703125, "rewards/margins": 3.0949862480163572, "rewards/rejected": -2.316150188446045, "step": 14532 }, { "epoch": 0.7703071581904434, "grad_norm": 50.0, "kl": 1.057302474975586, "learning_rate": 5e-07, "logits/chosen": 582673.625, "logits/rejected": -22553842.666666668, "logps/chosen": -393.5109558105469, "logps/rejected": -218.27787272135416, "loss": 0.2044, "rewards/chosen": 0.9534587264060974, "rewards/margins": 2.6561199227968855, "rewards/rejected": -1.7026611963907878, "step": 14533 }, { "epoch": 0.7703601621922456, "grad_norm": 46.75, "kl": 4.320918083190918, "learning_rate": 5e-07, "logits/chosen": -28738331.42857143, "logits/rejected": -7336609.5, "logps/chosen": -272.2002650669643, "logps/rejected": -102.74544525146484, "loss": 0.4667, "rewards/chosen": 0.39552903175354004, "rewards/margins": 1.244259536266327, "rewards/rejected": -0.8487305045127869, "step": 14534 }, { "epoch": 0.7704131661940476, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 20925140.0, "logits/rejected": -9176988.0, "logps/chosen": -266.79901123046875, "logps/rejected": -333.45186941964283, "loss": 0.1711, "rewards/chosen": 0.414083868265152, "rewards/margins": 2.645762013537543, "rewards/rejected": -2.231678145272391, "step": 14535 }, { "epoch": 0.7704661701958498, "grad_norm": 39.5, "kl": 1.7140026092529297, "learning_rate": 5e-07, "logits/chosen": -17838989.333333332, "logits/rejected": -34939696.0, "logps/chosen": -261.29408772786456, "logps/rejected": -414.83046875, "loss": 0.1702, "rewards/chosen": 1.154769738515218, "rewards/margins": 3.772519715627034, "rewards/rejected": -2.6177499771118162, "step": 14536 }, { "epoch": 0.7705191741976519, "grad_norm": 53.75, "kl": 2.094217300415039, "learning_rate": 5e-07, "logits/chosen": -51035477.333333336, "logits/rejected": -31748441.6, "logps/chosen": -591.3645833333334, "logps/rejected": -220.48955078125, "loss": 0.2381, "rewards/chosen": 1.9130210876464844, "rewards/margins": 3.1533841133117675, "rewards/rejected": -1.2403630256652831, "step": 14537 }, { "epoch": 0.770572178199454, "grad_norm": 50.0, "kl": 1.6389923095703125, "learning_rate": 5e-07, "logits/chosen": -13950822.666666666, "logits/rejected": -26898876.0, "logps/chosen": -222.660400390625, "logps/rejected": -182.07810974121094, "loss": 0.2844, "rewards/chosen": 1.308758020401001, "rewards/margins": 2.096911132335663, "rewards/rejected": -0.7881531119346619, "step": 14538 }, { "epoch": 0.7706251822012562, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26627682.0, "logits/rejected": -41780708.0, "logps/chosen": -278.490234375, "logps/rejected": -276.9673156738281, "loss": 0.3251, "rewards/chosen": -0.09142078459262848, "rewards/margins": 2.1423603147268295, "rewards/rejected": -2.233781099319458, "step": 14539 }, { "epoch": 0.7706781862030583, "grad_norm": 60.0, "kl": 5.112109184265137, "learning_rate": 5e-07, "logits/chosen": -10922092.0, "logits/rejected": -33949384.0, "logps/chosen": -166.95394897460938, "logps/rejected": -382.7757568359375, "loss": 0.3812, "rewards/chosen": 0.4835401475429535, "rewards/margins": 2.412571042776108, "rewards/rejected": -1.9290308952331543, "step": 14540 }, { "epoch": 0.7707311902048605, "grad_norm": 73.0, "kl": 2.544144630432129, "learning_rate": 5e-07, "logits/chosen": -35906930.28571428, "logits/rejected": -31041946.0, "logps/chosen": -387.4247349330357, "logps/rejected": -709.2616577148438, "loss": 0.463, "rewards/chosen": 0.25176182815006803, "rewards/margins": 2.202616368021284, "rewards/rejected": -1.9508545398712158, "step": 14541 }, { "epoch": 0.7707841942066626, "grad_norm": 48.5, "kl": 0.757049560546875, "learning_rate": 5e-07, "logits/chosen": -53263676.0, "logits/rejected": -21222604.0, "logps/chosen": -362.0975036621094, "logps/rejected": -573.4208984375, "loss": 0.2202, "rewards/chosen": 0.7607576251029968, "rewards/margins": 4.371101796627045, "rewards/rejected": -3.610344171524048, "step": 14542 }, { "epoch": 0.7708371982084647, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11590019.0, "logits/rejected": 1954876.375, "logps/chosen": -76.16919708251953, "logps/rejected": -411.4756774902344, "loss": 0.2793, "rewards/chosen": 0.2314773052930832, "rewards/margins": 4.167918726801872, "rewards/rejected": -3.936441421508789, "step": 14543 }, { "epoch": 0.7708902022102668, "grad_norm": 35.75, "kl": 1.8893938064575195, "learning_rate": 5e-07, "logits/chosen": -5177954.0, "logits/rejected": -20649270.4, "logps/chosen": -241.36991373697916, "logps/rejected": -329.728271484375, "loss": 0.1845, "rewards/chosen": 1.2520445187886555, "rewards/margins": 3.892570718129476, "rewards/rejected": -2.6405261993408202, "step": 14544 }, { "epoch": 0.770943206212069, "grad_norm": 24.625, "kl": 2.7324304580688477, "learning_rate": 5e-07, "logits/chosen": 6399427.0, "logits/rejected": -8368439.0, "logps/chosen": -230.7891387939453, "logps/rejected": -454.830322265625, "loss": 0.165, "rewards/chosen": 1.938637375831604, "rewards/margins": 4.92501437664032, "rewards/rejected": -2.986377000808716, "step": 14545 }, { "epoch": 0.7709962102138711, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -866715.75, "logits/rejected": -43516933.333333336, "logps/chosen": -226.28138732910156, "logps/rejected": -248.76558430989584, "loss": 0.2131, "rewards/chosen": 1.4382761716842651, "rewards/margins": 2.8516450325647993, "rewards/rejected": -1.413368860880534, "step": 14546 }, { "epoch": 0.7710492142156733, "grad_norm": 75.5, "kl": 4.53101921081543, "learning_rate": 5e-07, "logits/chosen": -36436160.0, "logits/rejected": -42941802.666666664, "logps/chosen": -434.3068359375, "logps/rejected": -357.0064290364583, "loss": 0.223, "rewards/chosen": 1.6591535568237306, "rewards/margins": 4.144276555379232, "rewards/rejected": -2.4851229985555015, "step": 14547 }, { "epoch": 0.7711022182174754, "grad_norm": 61.25, "kl": 4.232588291168213, "learning_rate": 5e-07, "logits/chosen": 11620312.0, "logits/rejected": -18367794.0, "logps/chosen": -243.70321655273438, "logps/rejected": -230.73764038085938, "loss": 0.2814, "rewards/chosen": 0.8569908738136292, "rewards/margins": 2.8298738598823547, "rewards/rejected": -1.9728829860687256, "step": 14548 }, { "epoch": 0.7711552222192776, "grad_norm": 40.0, "kl": 2.2100868225097656, "learning_rate": 5e-07, "logits/chosen": -2885798.6666666665, "logits/rejected": -43704275.2, "logps/chosen": -179.45222981770834, "logps/rejected": -145.57958984375, "loss": 0.2926, "rewards/chosen": 0.8910363515218099, "rewards/margins": 3.110317738850912, "rewards/rejected": -2.2192813873291017, "step": 14549 }, { "epoch": 0.7712082262210797, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33548846.0, "logits/rejected": -26442944.0, "logps/chosen": -208.1350860595703, "logps/rejected": -278.9700927734375, "loss": 0.2558, "rewards/chosen": -0.3924552798271179, "rewards/margins": 1.7159867087999978, "rewards/rejected": -2.1084419886271157, "step": 14550 }, { "epoch": 0.7712612302228818, "grad_norm": 49.0, "kl": 1.416579246520996, "learning_rate": 5e-07, "logits/chosen": -22306105.6, "logits/rejected": -20361668.0, "logps/chosen": -216.137255859375, "logps/rejected": -498.8273111979167, "loss": 0.3154, "rewards/chosen": 0.3160261154174805, "rewards/margins": 4.266165479024251, "rewards/rejected": -3.950139363606771, "step": 14551 }, { "epoch": 0.7713142342246839, "grad_norm": 64.0, "kl": 2.1571044921875, "learning_rate": 5e-07, "logits/chosen": -22738486.0, "logps/chosen": -648.3482055664062, "loss": 0.3744, "rewards/chosen": 0.9955054521560669, "step": 14552 }, { "epoch": 0.7713672382264861, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19375972.0, "logits/rejected": -7988581.0, "logps/chosen": -242.36073303222656, "logps/rejected": -139.3169403076172, "loss": 0.3165, "rewards/chosen": 0.2046460509300232, "rewards/margins": 2.0853282809257507, "rewards/rejected": -1.8806822299957275, "step": 14553 }, { "epoch": 0.7714202422282882, "grad_norm": 48.0, "kl": 3.8017024993896484, "learning_rate": 5e-07, "logits/chosen": -39495139.2, "logits/rejected": -20471385.333333332, "logps/chosen": -280.3062744140625, "logps/rejected": -274.93109130859375, "loss": 0.4644, "rewards/chosen": 0.008777880668640136, "rewards/margins": 1.9792630910873412, "rewards/rejected": -1.9704852104187012, "step": 14554 }, { "epoch": 0.7714732462300904, "grad_norm": 40.5, "kl": 2.0326480865478516, "learning_rate": 5e-07, "logits/chosen": -4195066.0, "logits/rejected": -2982009.5, "logps/chosen": -175.39767456054688, "logps/rejected": -117.28913879394531, "loss": 0.2727, "rewards/chosen": 1.2463713884353638, "rewards/margins": 3.0906232595443726, "rewards/rejected": -1.8442518711090088, "step": 14555 }, { "epoch": 0.7715262502318925, "grad_norm": 58.25, "kl": 5.579538345336914, "learning_rate": 5e-07, "logits/chosen": 7774997.333333333, "logits/rejected": -152204992.0, "logps/chosen": -317.4168701171875, "logps/rejected": -160.86915588378906, "loss": 0.4169, "rewards/chosen": 0.8847298622131348, "rewards/margins": 2.0514367818832397, "rewards/rejected": -1.166706919670105, "step": 14556 }, { "epoch": 0.7715792542336947, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -126017048.0, "logits/rejected": -16133848.0, "logps/chosen": -332.30987548828125, "logps/rejected": -372.2857360839844, "loss": 0.2477, "rewards/chosen": 0.4746112823486328, "rewards/margins": 3.12788724899292, "rewards/rejected": -2.653275966644287, "step": 14557 }, { "epoch": 0.7716322582354967, "grad_norm": 38.25, "kl": 1.22430419921875, "learning_rate": 5e-07, "logits/chosen": -8137682.0, "logits/rejected": -22222724.0, "logps/chosen": -198.33364868164062, "logps/rejected": -101.98784637451172, "loss": 0.279, "rewards/chosen": 0.39924904704093933, "rewards/margins": 2.5548166930675507, "rewards/rejected": -2.1555676460266113, "step": 14558 }, { "epoch": 0.7716852622372989, "grad_norm": 67.0, "kl": 1.5000495910644531, "learning_rate": 5e-07, "logits/chosen": -35043680.0, "logits/rejected": -11188188.0, "logps/chosen": -281.81370035807294, "logps/rejected": -197.5396728515625, "loss": 0.155, "rewards/chosen": 1.275993824005127, "rewards/margins": 4.126781940460205, "rewards/rejected": -2.850788116455078, "step": 14559 }, { "epoch": 0.771738266239101, "grad_norm": 54.5, "kl": 5.57867431640625, "learning_rate": 5e-07, "logits/chosen": -615845.65, "logits/rejected": -30763429.333333332, "logps/chosen": -634.79736328125, "logps/rejected": -250.0776570638021, "loss": 0.2162, "rewards/chosen": 1.90747013092041, "rewards/margins": 3.0603217919667562, "rewards/rejected": -1.152851661046346, "step": 14560 }, { "epoch": 0.7717912702409032, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -149189653.33333334, "logits/rejected": -24588118.4, "logps/chosen": -413.5011800130208, "logps/rejected": -206.999072265625, "loss": 0.2427, "rewards/chosen": 1.0393839677174885, "rewards/margins": 2.571958049138387, "rewards/rejected": -1.5325740814208983, "step": 14561 }, { "epoch": 0.7718442742427053, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46449061.333333336, "logits/rejected": -21265857.6, "logps/chosen": -289.24403889973956, "logps/rejected": -305.1025146484375, "loss": 0.195, "rewards/chosen": 0.5356892744700114, "rewards/margins": 3.6499409834543863, "rewards/rejected": -3.114251708984375, "step": 14562 }, { "epoch": 0.7718972782445075, "grad_norm": 54.0, "kl": 1.4626960754394531, "learning_rate": 5e-07, "logits/chosen": -26982019.2, "logits/rejected": -24415472.0, "logps/chosen": -251.86513671875, "logps/rejected": -96.2877705891927, "loss": 0.3697, "rewards/chosen": 0.34527747631072997, "rewards/margins": 1.4554556290308636, "rewards/rejected": -1.1101781527201335, "step": 14563 }, { "epoch": 0.7719502822463096, "grad_norm": 36.25, "kl": 0.8313522338867188, "learning_rate": 5e-07, "logits/chosen": -18330086.0, "logits/rejected": -29315352.0, "logps/chosen": -387.8638000488281, "logps/rejected": -352.8966064453125, "loss": 0.1894, "rewards/chosen": 1.1137686967849731, "rewards/margins": 3.955487370491028, "rewards/rejected": -2.8417186737060547, "step": 14564 }, { "epoch": 0.7720032862481118, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9781224.0, "logits/rejected": -36793146.666666664, "logps/chosen": -122.19783782958984, "logps/rejected": -168.5634765625, "loss": 0.1728, "rewards/chosen": 0.7525795102119446, "rewards/margins": 2.9532440304756165, "rewards/rejected": -2.200664520263672, "step": 14565 }, { "epoch": 0.7720562902499138, "grad_norm": 50.0, "kl": 3.3515968322753906, "learning_rate": 5e-07, "logits/chosen": 1720355.2, "logits/rejected": -15787657.333333334, "logps/chosen": -249.894677734375, "logps/rejected": -199.4495849609375, "loss": 0.2439, "rewards/chosen": 1.37581787109375, "rewards/margins": 3.465893363952637, "rewards/rejected": -2.0900754928588867, "step": 14566 }, { "epoch": 0.772109294251716, "grad_norm": 35.75, "kl": 1.9407310485839844, "learning_rate": 5e-07, "logits/chosen": 12129350.0, "logits/rejected": -6693781.0, "logps/chosen": -183.57423400878906, "logps/rejected": -182.5557098388672, "loss": 0.2384, "rewards/chosen": 0.9311779141426086, "rewards/margins": 3.892135202884674, "rewards/rejected": -2.9609572887420654, "step": 14567 }, { "epoch": 0.7721622982535181, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3016711.5, "logits/rejected": -16937813.333333332, "logps/chosen": -120.40422058105469, "logps/rejected": -179.56158447265625, "loss": 0.244, "rewards/chosen": 0.3107917606830597, "rewards/margins": 1.9376396636168163, "rewards/rejected": -1.6268479029337566, "step": 14568 }, { "epoch": 0.7722153022553203, "grad_norm": 76.0, "kl": 1.0711126327514648, "learning_rate": 5e-07, "logits/chosen": -15792471.0, "logits/rejected": -36777304.0, "logps/chosen": -327.91717529296875, "logps/rejected": -399.8209533691406, "loss": 0.1855, "rewards/chosen": 1.4583648443222046, "rewards/margins": 3.7393354177474976, "rewards/rejected": -2.280970573425293, "step": 14569 }, { "epoch": 0.7722683062571224, "grad_norm": 48.0, "kl": 1.7019519805908203, "learning_rate": 5e-07, "logits/chosen": -17429849.6, "logits/rejected": -30217037.333333332, "logps/chosen": -196.27025146484374, "logps/rejected": -397.3556315104167, "loss": 0.3037, "rewards/chosen": 0.43129539489746094, "rewards/margins": 2.883120536804199, "rewards/rejected": -2.4518251419067383, "step": 14570 }, { "epoch": 0.7723213102589246, "grad_norm": 47.25, "kl": 4.237748146057129, "learning_rate": 5e-07, "logits/chosen": -16980768.0, "logits/rejected": -43145084.0, "logps/chosen": -216.996337890625, "logps/rejected": -275.12506103515625, "loss": 0.3463, "rewards/chosen": 0.6098229090372721, "rewards/margins": 3.3505708376566568, "rewards/rejected": -2.7407479286193848, "step": 14571 }, { "epoch": 0.7723743142607267, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59737624.0, "logits/rejected": -54631072.0, "logps/chosen": -424.56268310546875, "logps/rejected": -440.1654866536458, "loss": 0.1186, "rewards/chosen": 0.6092849969863892, "rewards/margins": 4.018527309099833, "rewards/rejected": -3.409242312113444, "step": 14572 }, { "epoch": 0.7724273182625289, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67149029.33333333, "logits/rejected": -30541481.6, "logps/chosen": -328.9375406901042, "logps/rejected": -437.22177734375, "loss": 0.2755, "rewards/chosen": -0.23112944761912027, "rewards/margins": 3.2263950427373254, "rewards/rejected": -3.4575244903564455, "step": 14573 }, { "epoch": 0.7724803222643309, "grad_norm": 39.75, "kl": 1.1102581024169922, "learning_rate": 5e-07, "logits/chosen": -10220860.0, "logits/rejected": 526545.5, "logps/chosen": -243.78348795572916, "logps/rejected": -87.09895629882813, "loss": 0.2588, "rewards/chosen": 0.4644145965576172, "rewards/margins": 3.6077136993408203, "rewards/rejected": -3.143299102783203, "step": 14574 }, { "epoch": 0.7725333262661331, "grad_norm": 43.0, "kl": 2.094484329223633, "learning_rate": 5e-07, "logits/chosen": -19564202.0, "logits/rejected": -31975716.0, "logps/chosen": -520.3694458007812, "logps/rejected": -485.3084716796875, "loss": 0.2467, "rewards/chosen": 1.2206605672836304, "rewards/margins": 3.793697953224182, "rewards/rejected": -2.5730373859405518, "step": 14575 }, { "epoch": 0.7725863302679352, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21080589.333333332, "logits/rejected": -50901206.4, "logps/chosen": -380.287353515625, "logps/rejected": -332.8189453125, "loss": 0.2568, "rewards/chosen": 0.4981842041015625, "rewards/margins": 2.4545124053955076, "rewards/rejected": -1.9563282012939454, "step": 14576 }, { "epoch": 0.7726393342697374, "grad_norm": 49.75, "kl": 2.580678939819336, "learning_rate": 5e-07, "logits/chosen": 1493264.0, "logits/rejected": -42299203.2, "logps/chosen": -248.6300252278646, "logps/rejected": -272.2456298828125, "loss": 0.2416, "rewards/chosen": 0.6441070636113485, "rewards/margins": 3.798147209485372, "rewards/rejected": -3.1540401458740233, "step": 14577 }, { "epoch": 0.7726923382715395, "grad_norm": 38.0, "kl": 2.811819076538086, "learning_rate": 5e-07, "logits/chosen": -8756658.0, "logits/rejected": -44065628.0, "logps/chosen": -256.510986328125, "logps/rejected": -265.6605224609375, "loss": 0.3172, "rewards/chosen": 0.789019763469696, "rewards/margins": 2.7824036478996277, "rewards/rejected": -1.9933838844299316, "step": 14578 }, { "epoch": 0.7727453422733417, "grad_norm": 41.75, "kl": 1.2439298629760742, "learning_rate": 5e-07, "logits/chosen": -41697312.0, "logits/rejected": -39890965.333333336, "logps/chosen": -233.6433563232422, "logps/rejected": -433.22021484375, "loss": 0.2157, "rewards/chosen": 0.5331804156303406, "rewards/margins": 3.0020504196484885, "rewards/rejected": -2.468870004018148, "step": 14579 }, { "epoch": 0.7727983462751438, "grad_norm": 51.75, "kl": 0.2948455810546875, "learning_rate": 5e-07, "logits/chosen": -49920868.0, "logits/rejected": -29631936.0, "logps/chosen": -323.748291015625, "logps/rejected": -249.29786682128906, "loss": 0.2384, "rewards/chosen": 0.5674968957901001, "rewards/margins": 3.324706196784973, "rewards/rejected": -2.757209300994873, "step": 14580 }, { "epoch": 0.772851350276946, "grad_norm": 59.25, "kl": 1.2975540161132812, "learning_rate": 5e-07, "logits/chosen": -49182984.0, "logits/rejected": -36240744.0, "logps/chosen": -478.19354248046875, "logps/rejected": -471.6332092285156, "loss": 0.2884, "rewards/chosen": 0.8328918218612671, "rewards/margins": 2.6895257234573364, "rewards/rejected": -1.8566339015960693, "step": 14581 }, { "epoch": 0.772904354278748, "grad_norm": 31.125, "kl": 1.5495338439941406, "learning_rate": 5e-07, "logits/chosen": 4802683.0, "logits/rejected": -38741645.71428572, "logps/chosen": -35.41217803955078, "logps/rejected": -429.93784877232144, "loss": 0.1732, "rewards/chosen": -0.5704788565635681, "rewards/margins": 2.361422530242375, "rewards/rejected": -2.931901386805943, "step": 14582 }, { "epoch": 0.7729573582805502, "grad_norm": 45.25, "kl": 0.16927242279052734, "learning_rate": 5e-07, "logits/chosen": 3842928.0, "logits/rejected": -41859602.28571428, "logps/chosen": -307.49029541015625, "logps/rejected": -335.56124441964283, "loss": 0.2298, "rewards/chosen": 0.04826660081744194, "rewards/margins": 2.610875047211136, "rewards/rejected": -2.562608446393694, "step": 14583 }, { "epoch": 0.7730103622823523, "grad_norm": 44.75, "kl": 0.5870370864868164, "learning_rate": 5e-07, "logits/chosen": 6234490.666666667, "logits/rejected": -2482140.4, "logps/chosen": -322.1988525390625, "logps/rejected": -120.64468994140626, "loss": 0.1841, "rewards/chosen": 0.7762706279754639, "rewards/margins": 3.16065993309021, "rewards/rejected": -2.384389305114746, "step": 14584 }, { "epoch": 0.7730633662841545, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21527318.666666668, "logits/rejected": -23708172.8, "logps/chosen": -122.65160115559895, "logps/rejected": -401.9671142578125, "loss": 0.2262, "rewards/chosen": 0.38473625977834064, "rewards/margins": 2.793722160657247, "rewards/rejected": -2.408985900878906, "step": 14585 }, { "epoch": 0.7731163702859566, "grad_norm": 47.0, "kl": 0.06963539123535156, "learning_rate": 5e-07, "logits/chosen": -62447648.0, "logits/rejected": -50822186.666666664, "logps/chosen": -250.7475128173828, "logps/rejected": -490.1975504557292, "loss": 0.166, "rewards/chosen": 0.9052789807319641, "rewards/margins": 3.3796826799710593, "rewards/rejected": -2.474403699239095, "step": 14586 }, { "epoch": 0.7731693742877587, "grad_norm": 47.5, "kl": 0.5687160491943359, "learning_rate": 5e-07, "logits/chosen": -22274820.0, "logits/rejected": -35463696.0, "logps/chosen": -151.84210205078125, "logps/rejected": -303.76776123046875, "loss": 0.3122, "rewards/chosen": 0.1074599176645279, "rewards/margins": 2.090114966034889, "rewards/rejected": -1.9826550483703613, "step": 14587 }, { "epoch": 0.7732223782895609, "grad_norm": 58.5, "kl": 2.638019561767578, "learning_rate": 5e-07, "logits/chosen": -30398544.0, "logits/rejected": 4917594.0, "logps/chosen": -466.3870035807292, "logps/rejected": -483.86407470703125, "loss": 0.2724, "rewards/chosen": 1.1987125873565674, "rewards/margins": 3.9515445232391357, "rewards/rejected": -2.7528319358825684, "step": 14588 }, { "epoch": 0.773275382291363, "grad_norm": 36.25, "kl": 2.9503841400146484, "learning_rate": 5e-07, "logits/chosen": -57302069.333333336, "logits/rejected": -36138345.6, "logps/chosen": -70.67261759440105, "logps/rejected": -360.0689697265625, "loss": 0.3234, "rewards/chosen": 0.19337991873423258, "rewards/margins": 1.7903382698694865, "rewards/rejected": -1.5969583511352539, "step": 14589 }, { "epoch": 0.7733283862931651, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43102792.0, "logits/rejected": -31795734.85714286, "logps/chosen": -281.06011962890625, "logps/rejected": -455.52413504464283, "loss": 0.1935, "rewards/chosen": -0.8600708246231079, "rewards/margins": 1.726819259779794, "rewards/rejected": -2.586890084402902, "step": 14590 }, { "epoch": 0.7733813902949672, "grad_norm": 51.0, "kl": 0.948699951171875, "learning_rate": 5e-07, "logits/chosen": -106205216.0, "logits/rejected": -7402063.428571428, "logps/chosen": -686.5625, "logps/rejected": -260.17771693638394, "loss": 0.1977, "rewards/chosen": 0.90325927734375, "rewards/margins": 3.1022802080426897, "rewards/rejected": -2.1990209306989397, "step": 14591 }, { "epoch": 0.7734343942967694, "grad_norm": 52.5, "kl": 3.23635196685791, "learning_rate": 5e-07, "logits/chosen": -15965658.285714285, "logits/rejected": -62808320.0, "logps/chosen": -336.49267578125, "logps/rejected": -558.429443359375, "loss": 0.3247, "rewards/chosen": 1.1649270738874162, "rewards/margins": 4.149546214512416, "rewards/rejected": -2.984619140625, "step": 14592 }, { "epoch": 0.7734873982985715, "grad_norm": 65.5, "kl": 3.384004592895508, "learning_rate": 5e-07, "logits/chosen": -6029710.8, "logits/rejected": -29572032.0, "logps/chosen": -592.482080078125, "logps/rejected": -464.9247233072917, "loss": 0.2688, "rewards/chosen": 0.7886788845062256, "rewards/margins": 3.412432909011841, "rewards/rejected": -2.6237540245056152, "step": 14593 }, { "epoch": 0.7735404023003737, "grad_norm": 40.25, "kl": 5.62730598449707, "learning_rate": 5e-07, "logits/chosen": -50194250.666666664, "logits/rejected": -35544656.0, "logps/chosen": -311.6537272135417, "logps/rejected": -316.6553466796875, "loss": 0.3852, "rewards/chosen": -0.20112754901250204, "rewards/margins": 1.9407963554064434, "rewards/rejected": -2.1419239044189453, "step": 14594 }, { "epoch": 0.7735934063021758, "grad_norm": 44.0, "kl": 0.35486698150634766, "learning_rate": 5e-07, "logits/chosen": 3047439.6666666665, "logits/rejected": 2137737.6, "logps/chosen": -28.65924580891927, "logps/rejected": -265.985595703125, "loss": 0.3314, "rewards/chosen": -0.28643176952997845, "rewards/margins": 1.5504828651746114, "rewards/rejected": -1.83691463470459, "step": 14595 }, { "epoch": 0.773646410303978, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47454136.0, "logits/rejected": -31108608.0, "logps/chosen": -414.1131591796875, "logps/rejected": -274.7920619419643, "loss": 0.1758, "rewards/chosen": -0.37222597002983093, "rewards/margins": 2.08692952139037, "rewards/rejected": -2.459155491420201, "step": 14596 }, { "epoch": 0.77369941430578, "grad_norm": 56.75, "kl": 2.001955032348633, "learning_rate": 5e-07, "logits/chosen": -60358666.666666664, "logits/rejected": -8930070.0, "logps/chosen": -231.40034993489584, "logps/rejected": -169.30923461914062, "loss": 0.3961, "rewards/chosen": 0.5643272399902344, "rewards/margins": 1.6754181385040283, "rewards/rejected": -1.111090898513794, "step": 14597 }, { "epoch": 0.7737524183075822, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60610596.0, "logits/rejected": 51563480.0, "logps/chosen": -357.6239013671875, "logps/rejected": -261.36529541015625, "loss": 0.336, "rewards/chosen": 0.3828117549419403, "rewards/margins": 1.617954820394516, "rewards/rejected": -1.2351430654525757, "step": 14598 }, { "epoch": 0.7738054223093843, "grad_norm": 47.75, "kl": 1.36529541015625, "learning_rate": 5e-07, "logits/chosen": -45280266.666666664, "logits/rejected": -16756729.6, "logps/chosen": -407.916748046875, "logps/rejected": -258.6156494140625, "loss": 0.1919, "rewards/chosen": 1.0458088715871174, "rewards/margins": 3.5382006486256916, "rewards/rejected": -2.4923917770385744, "step": 14599 }, { "epoch": 0.7738584263111865, "grad_norm": 42.5, "kl": 1.272913932800293, "learning_rate": 5e-07, "logits/chosen": -43225798.4, "logits/rejected": -24663496.0, "logps/chosen": -180.92440185546874, "logps/rejected": -329.3765055338542, "loss": 0.374, "rewards/chosen": -0.02369370013475418, "rewards/margins": 3.312970499694347, "rewards/rejected": -3.3366641998291016, "step": 14600 }, { "epoch": 0.7739114303129886, "grad_norm": 63.5, "kl": 3.74951171875, "learning_rate": 5e-07, "logits/chosen": -24059445.333333332, "logits/rejected": -42918208.0, "logps/chosen": -319.59783935546875, "logps/rejected": -75.63227844238281, "loss": 0.4457, "rewards/chosen": 0.3129219214121501, "rewards/margins": 2.0621394316355386, "rewards/rejected": -1.7492175102233887, "step": 14601 }, { "epoch": 0.7739644343147908, "grad_norm": 45.25, "kl": 1.4562873840332031, "learning_rate": 5e-07, "logits/chosen": -18718320.0, "logits/rejected": -63144853.333333336, "logps/chosen": -162.08277587890626, "logps/rejected": -339.7874348958333, "loss": 0.4168, "rewards/chosen": -0.299191951751709, "rewards/margins": 2.464062531789144, "rewards/rejected": -2.763254483540853, "step": 14602 }, { "epoch": 0.7740174383165929, "grad_norm": 67.5, "kl": 1.7108840942382812, "learning_rate": 5e-07, "logits/chosen": -22228654.0, "logits/rejected": -12606868.0, "logps/chosen": -349.85040283203125, "logps/rejected": -312.8474426269531, "loss": 0.3639, "rewards/chosen": 0.08345870673656464, "rewards/margins": 2.6639337986707687, "rewards/rejected": -2.580475091934204, "step": 14603 }, { "epoch": 0.7740704423183951, "grad_norm": 61.25, "kl": 0.48541831970214844, "learning_rate": 5e-07, "logits/chosen": -39910493.333333336, "logits/rejected": -33946195.2, "logps/chosen": -752.2069498697916, "logps/rejected": -390.453125, "loss": 0.1572, "rewards/chosen": 1.1992228825887044, "rewards/margins": 4.308872159322103, "rewards/rejected": -3.1096492767333985, "step": 14604 }, { "epoch": 0.7741234463201971, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49624796.8, "logits/rejected": -7009142.666666667, "logps/chosen": -333.3322998046875, "logps/rejected": -269.9697672526042, "loss": 0.2894, "rewards/chosen": 0.41412935256958006, "rewards/margins": 2.9730314572652183, "rewards/rejected": -2.558902104695638, "step": 14605 }, { "epoch": 0.7741764503219993, "grad_norm": 51.75, "kl": 0.9676036834716797, "learning_rate": 5e-07, "logits/chosen": -8815869.333333334, "logits/rejected": 3725558.4, "logps/chosen": -741.9558919270834, "logps/rejected": -457.81669921875, "loss": 0.2034, "rewards/chosen": 1.3863881429036458, "rewards/margins": 3.9502063115437824, "rewards/rejected": -2.563818168640137, "step": 14606 }, { "epoch": 0.7742294543238014, "grad_norm": 76.0, "kl": 1.8700637817382812, "learning_rate": 5e-07, "logits/chosen": 36860608.0, "logits/rejected": -52280408.0, "logps/chosen": -368.0231526692708, "logps/rejected": -238.0126190185547, "loss": 0.397, "rewards/chosen": 0.2907348871231079, "rewards/margins": 2.293888211250305, "rewards/rejected": -2.0031533241271973, "step": 14607 }, { "epoch": 0.7742824583256036, "grad_norm": 28.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28073922.0, "logits/rejected": -4812568.0, "logps/chosen": -403.15838623046875, "logps/rejected": -459.136474609375, "loss": 0.0767, "rewards/chosen": 1.70133376121521, "rewards/margins": 5.183666626612345, "rewards/rejected": -3.4823328653971353, "step": 14608 }, { "epoch": 0.7743354623274057, "grad_norm": 52.25, "kl": 5.322887420654297, "learning_rate": 5e-07, "logits/chosen": -1802777.625, "logps/chosen": -116.20745086669922, "loss": 0.5958, "rewards/chosen": 0.12512174248695374, "step": 14609 }, { "epoch": 0.7743884663292079, "grad_norm": 46.75, "kl": 6.230561256408691, "learning_rate": 5e-07, "logits/chosen": -57572138.666666664, "logits/rejected": -27772346.0, "logps/chosen": -265.70977783203125, "logps/rejected": -319.03997802734375, "loss": 0.3337, "rewards/chosen": 1.3391911188761394, "rewards/margins": 4.946798483530681, "rewards/rejected": -3.607607364654541, "step": 14610 }, { "epoch": 0.77444147033101, "grad_norm": 28.25, "kl": 0.8568353652954102, "learning_rate": 5e-07, "logits/chosen": -4352574.4, "logits/rejected": -32751544.0, "logps/chosen": -159.4913330078125, "logps/rejected": -299.4449462890625, "loss": 0.202, "rewards/chosen": 1.273188877105713, "rewards/margins": 4.13596658706665, "rewards/rejected": -2.8627777099609375, "step": 14611 }, { "epoch": 0.7744944743328122, "grad_norm": 44.75, "kl": 0.8248357772827148, "learning_rate": 5e-07, "logits/chosen": -27464692.0, "logits/rejected": -27779204.0, "logps/chosen": -300.23040771484375, "logps/rejected": -367.8826599121094, "loss": 0.3437, "rewards/chosen": -0.024451136589050293, "rewards/margins": 2.034390091896057, "rewards/rejected": -2.0588412284851074, "step": 14612 }, { "epoch": 0.7745474783346142, "grad_norm": 44.5, "kl": 0.30863189697265625, "learning_rate": 5e-07, "logits/chosen": -17102702.0, "logits/rejected": -21496732.0, "logps/chosen": -532.853515625, "logps/rejected": -217.21834309895834, "loss": 0.1653, "rewards/chosen": 1.14862060546875, "rewards/margins": 3.7368386586507163, "rewards/rejected": -2.5882180531819663, "step": 14613 }, { "epoch": 0.7746004823364164, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93694776.0, "logits/rejected": -32946996.0, "logps/chosen": -443.7179870605469, "logps/rejected": -326.82049560546875, "loss": 0.2174, "rewards/chosen": 0.5393722653388977, "rewards/margins": 3.6425865292549133, "rewards/rejected": -3.1032142639160156, "step": 14614 }, { "epoch": 0.7746534863382185, "grad_norm": 42.25, "kl": 6.588069915771484, "learning_rate": 5e-07, "logits/chosen": -69637753.6, "logits/rejected": -7856148.0, "logps/chosen": -1181.76689453125, "logps/rejected": -430.7610270182292, "loss": 0.1944, "rewards/chosen": 2.32822208404541, "rewards/margins": 4.405846563975016, "rewards/rejected": -2.077624479929606, "step": 14615 }, { "epoch": 0.7747064903400207, "grad_norm": 65.0, "kl": 3.6614742279052734, "learning_rate": 5e-07, "logits/chosen": -34797160.0, "logits/rejected": -15170461.0, "logps/chosen": -346.888427734375, "logps/rejected": -240.32261657714844, "loss": 0.3629, "rewards/chosen": 0.9108982086181641, "rewards/margins": 2.110081672668457, "rewards/rejected": -1.199183464050293, "step": 14616 }, { "epoch": 0.7747594943418228, "grad_norm": 66.0, "kl": 1.3369827270507812, "learning_rate": 5e-07, "logits/chosen": -16831480.0, "logits/rejected": -42016509.333333336, "logps/chosen": -360.0537109375, "logps/rejected": -419.0321044921875, "loss": 0.2649, "rewards/chosen": 1.011329460144043, "rewards/margins": 3.362618414560954, "rewards/rejected": -2.3512889544169107, "step": 14617 }, { "epoch": 0.774812498343625, "grad_norm": 46.25, "kl": 0.8854856491088867, "learning_rate": 5e-07, "logits/chosen": -37581081.6, "logits/rejected": 31560850.666666668, "logps/chosen": -377.89619140625, "logps/rejected": -399.7663981119792, "loss": 0.2667, "rewards/chosen": 0.540357255935669, "rewards/margins": 3.284776989618937, "rewards/rejected": -2.744419733683268, "step": 14618 }, { "epoch": 0.7748655023454271, "grad_norm": 94.0, "kl": 2.1784133911132812, "learning_rate": 5e-07, "logits/chosen": -44042284.8, "logits/rejected": -9594004.0, "logps/chosen": -308.128369140625, "logps/rejected": -212.17097981770834, "loss": 0.2721, "rewards/chosen": 0.770648193359375, "rewards/margins": 2.4660382588704426, "rewards/rejected": -1.6953900655110676, "step": 14619 }, { "epoch": 0.7749185063472293, "grad_norm": 45.5, "kl": 3.9223289489746094, "learning_rate": 5e-07, "logits/chosen": -31182964.0, "logits/rejected": -18592896.0, "logps/chosen": -277.0785827636719, "logps/rejected": -170.83851623535156, "loss": 0.3106, "rewards/chosen": 1.0203275680541992, "rewards/margins": 3.0141921043395996, "rewards/rejected": -1.9938645362854004, "step": 14620 }, { "epoch": 0.7749715103490313, "grad_norm": 43.25, "kl": 0.7720880508422852, "learning_rate": 5e-07, "logits/chosen": -28062611.2, "logits/rejected": -5605346.0, "logps/chosen": -222.8729248046875, "logps/rejected": -179.7564697265625, "loss": 0.2431, "rewards/chosen": 0.800963306427002, "rewards/margins": 3.8145970344543456, "rewards/rejected": -3.0136337280273438, "step": 14621 }, { "epoch": 0.7750245143508335, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28823034.0, "logits/rejected": -71184584.0, "logps/chosen": -303.65142822265625, "logps/rejected": -188.03448486328125, "loss": 0.2749, "rewards/chosen": 0.8883737921714783, "rewards/margins": 2.738481819629669, "rewards/rejected": -1.850108027458191, "step": 14622 }, { "epoch": 0.7750775183526356, "grad_norm": 60.25, "kl": 1.0628681182861328, "learning_rate": 5e-07, "logits/chosen": -48666213.333333336, "logits/rejected": -23091468.0, "logps/chosen": -396.7756754557292, "logps/rejected": -443.4356689453125, "loss": 0.4265, "rewards/chosen": -0.12143649657567342, "rewards/margins": 3.813623527685801, "rewards/rejected": -3.9350600242614746, "step": 14623 }, { "epoch": 0.7751305223544378, "grad_norm": 63.5, "kl": 0.49370574951171875, "learning_rate": 5e-07, "logits/chosen": -6877052.0, "logits/rejected": -18637940.0, "logps/chosen": -279.3547668457031, "logps/rejected": -519.2988891601562, "loss": 0.2339, "rewards/chosen": 0.823356032371521, "rewards/margins": 4.025720000267029, "rewards/rejected": -3.202363967895508, "step": 14624 }, { "epoch": 0.7751835263562399, "grad_norm": 41.0, "kl": 1.5342445373535156, "learning_rate": 5e-07, "logits/chosen": -10407366.0, "logits/rejected": -17237494.4, "logps/chosen": -751.0286458333334, "logps/rejected": -75.28434448242187, "loss": 0.2987, "rewards/chosen": 1.2265966733296711, "rewards/margins": 2.5333382924397787, "rewards/rejected": -1.3067416191101073, "step": 14625 }, { "epoch": 0.7752365303580421, "grad_norm": 84.0, "kl": 4.925378799438477, "learning_rate": 5e-07, "logits/chosen": -23625498.666666668, "logits/rejected": -40143568.0, "logps/chosen": -322.1104329427083, "logps/rejected": -221.654541015625, "loss": 0.3471, "rewards/chosen": 1.0965065161387126, "rewards/margins": 2.226940194765727, "rewards/rejected": -1.1304336786270142, "step": 14626 }, { "epoch": 0.7752895343598442, "grad_norm": 57.75, "kl": 0.05541229248046875, "learning_rate": 5e-07, "logits/chosen": -26699956.0, "logits/rejected": 3792889.6666666665, "logps/chosen": -317.9715576171875, "logps/rejected": -202.7287801106771, "loss": 0.247, "rewards/chosen": 0.9715080857276917, "rewards/margins": 2.6212047537167864, "rewards/rejected": -1.649696667989095, "step": 14627 }, { "epoch": 0.7753425383616463, "grad_norm": 60.5, "kl": 7.031865119934082, "learning_rate": 5e-07, "logits/chosen": -43079392.0, "logits/rejected": -3499631.5, "logps/chosen": -308.12998453776044, "logps/rejected": -564.7388916015625, "loss": 0.3366, "rewards/chosen": 1.5811055501302083, "rewards/margins": 3.184309919675191, "rewards/rejected": -1.603204369544983, "step": 14628 }, { "epoch": 0.7753955423634484, "grad_norm": 51.5, "kl": 2.389288902282715, "learning_rate": 5e-07, "logits/chosen": -22126232.0, "logits/rejected": -5254393.2, "logps/chosen": -242.6605021158854, "logps/rejected": -149.18853759765625, "loss": 0.2645, "rewards/chosen": 0.44773586591084796, "rewards/margins": 2.7621090730031335, "rewards/rejected": -2.3143732070922853, "step": 14629 }, { "epoch": 0.7754485463652506, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76769664.0, "logits/rejected": -27197656.0, "logps/chosen": -386.119091796875, "logps/rejected": -478.8389485677083, "loss": 0.3625, "rewards/chosen": 0.05636078119277954, "rewards/margins": 2.4331873854001365, "rewards/rejected": -2.376826604207357, "step": 14630 }, { "epoch": 0.7755015503670527, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7859846.666666667, "logits/rejected": -39501766.4, "logps/chosen": -266.256591796875, "logps/rejected": -426.44560546875, "loss": 0.1779, "rewards/chosen": 1.053946892420451, "rewards/margins": 3.7309021155039472, "rewards/rejected": -2.676955223083496, "step": 14631 }, { "epoch": 0.7755545543688549, "grad_norm": 65.5, "kl": 0.6348876953125, "learning_rate": 5e-07, "logits/chosen": -40736682.666666664, "logits/rejected": -12249143.0, "logps/chosen": -347.6416015625, "logps/rejected": -120.08084106445312, "loss": 0.3483, "rewards/chosen": 0.25999120871225995, "rewards/margins": 4.029836932818095, "rewards/rejected": -3.769845724105835, "step": 14632 }, { "epoch": 0.775607558370657, "grad_norm": 41.25, "kl": 2.3265628814697266, "learning_rate": 5e-07, "logits/chosen": -17903219.2, "logits/rejected": -24023448.0, "logps/chosen": -173.193212890625, "logps/rejected": -266.13909912109375, "loss": 0.2374, "rewards/chosen": 0.7861630916595459, "rewards/margins": 4.270310799280802, "rewards/rejected": -3.4841477076212564, "step": 14633 }, { "epoch": 0.7756605623724592, "grad_norm": 86.0, "kl": 0.4718189239501953, "learning_rate": 5e-07, "logits/chosen": -49168392.0, "logits/rejected": -13103293.0, "logps/chosen": -607.8484497070312, "logps/rejected": -167.54763793945312, "loss": 0.2364, "rewards/chosen": 0.6970623135566711, "rewards/margins": 3.1387298703193665, "rewards/rejected": -2.4416675567626953, "step": 14634 }, { "epoch": 0.7757135663742613, "grad_norm": 54.75, "kl": 4.927163124084473, "learning_rate": 5e-07, "logits/chosen": -41872458.666666664, "logits/rejected": -46913020.0, "logps/chosen": -333.7151692708333, "logps/rejected": -635.1614990234375, "loss": 0.385, "rewards/chosen": 0.7266205151875814, "rewards/margins": 5.4840396245320635, "rewards/rejected": -4.757419109344482, "step": 14635 }, { "epoch": 0.7757665703760634, "grad_norm": 39.0, "kl": 1.730031967163086, "learning_rate": 5e-07, "logits/chosen": -28247865.6, "logits/rejected": -80614629.33333333, "logps/chosen": -413.07451171875, "logps/rejected": -371.3406982421875, "loss": 0.2788, "rewards/chosen": 1.0949851036071778, "rewards/margins": 3.6676490465799967, "rewards/rejected": -2.572663942972819, "step": 14636 }, { "epoch": 0.7758195743778655, "grad_norm": 26.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39185136.0, "logits/rejected": -14436563.42857143, "logps/chosen": -332.3953857421875, "logps/rejected": -426.26517159598217, "loss": 0.1552, "rewards/chosen": -1.046850562095642, "rewards/margins": 2.4664823498044695, "rewards/rejected": -3.5133329119001115, "step": 14637 }, { "epoch": 0.7758725783796676, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1870876.0, "logits/rejected": -29719810.0, "logps/chosen": -498.45477294921875, "logps/rejected": -267.5633239746094, "loss": 0.2751, "rewards/chosen": 0.5370559692382812, "rewards/margins": 3.2322006225585938, "rewards/rejected": -2.6951446533203125, "step": 14638 }, { "epoch": 0.7759255823814698, "grad_norm": 39.5, "kl": 2.4602298736572266, "learning_rate": 5e-07, "logits/chosen": -39139929.6, "logits/rejected": -78119509.33333333, "logps/chosen": -323.6190185546875, "logps/rejected": -537.3211263020834, "loss": 0.2537, "rewards/chosen": 0.9235417366027832, "rewards/margins": 5.119188340504964, "rewards/rejected": -4.195646603902181, "step": 14639 }, { "epoch": 0.7759785863832719, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69177008.0, "logits/rejected": -36909717.333333336, "logps/chosen": -421.6060791015625, "logps/rejected": -339.69594319661456, "loss": 0.2396, "rewards/chosen": 0.14950866997241974, "rewards/margins": 2.2007748633623123, "rewards/rejected": -2.0512661933898926, "step": 14640 }, { "epoch": 0.7760315903850741, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53264928.0, "logits/rejected": -13617630.0, "logps/chosen": -355.1858825683594, "logps/rejected": -288.87408447265625, "loss": 0.2557, "rewards/chosen": 0.5962957143783569, "rewards/margins": 3.1380776166915894, "rewards/rejected": -2.5417819023132324, "step": 14641 }, { "epoch": 0.7760845943868762, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36364556.0, "logits/rejected": -31784252.0, "logps/chosen": -306.3490905761719, "logps/rejected": -450.54315185546875, "loss": 0.2806, "rewards/chosen": 0.09847506880760193, "rewards/margins": 3.004891484975815, "rewards/rejected": -2.906416416168213, "step": 14642 }, { "epoch": 0.7761375983886784, "grad_norm": 70.0, "kl": 3.744950294494629, "learning_rate": 5e-07, "logits/chosen": -29953868.8, "logits/rejected": -14089416.0, "logps/chosen": -372.471044921875, "logps/rejected": -122.30677286783855, "loss": 0.3725, "rewards/chosen": 0.4628001689910889, "rewards/margins": 3.030221446355184, "rewards/rejected": -2.567421277364095, "step": 14643 }, { "epoch": 0.7761906023904804, "grad_norm": 96.0, "kl": 6.23529052734375, "learning_rate": 5e-07, "logits/chosen": -48275632.0, "logits/rejected": 163934992.0, "logps/chosen": -399.29461669921875, "logps/rejected": -284.05731201171875, "loss": 0.3108, "rewards/chosen": 1.6370296478271484, "rewards/margins": 3.705817461013794, "rewards/rejected": -2.0687878131866455, "step": 14644 }, { "epoch": 0.7762436063922826, "grad_norm": 49.0, "kl": 1.7689886093139648, "learning_rate": 5e-07, "logits/chosen": -32529404.8, "logits/rejected": -48505589.333333336, "logps/chosen": -241.3703857421875, "logps/rejected": -468.57080078125, "loss": 0.3611, "rewards/chosen": 0.5083637237548828, "rewards/margins": 1.977400779724121, "rewards/rejected": -1.4690370559692383, "step": 14645 }, { "epoch": 0.7762966103940847, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4853164.5, "logits/rejected": -31003211.42857143, "logps/chosen": -15.545915603637695, "logps/rejected": -543.6691545758929, "loss": 0.1473, "rewards/chosen": 0.7401546835899353, "rewards/margins": 3.7558442609650746, "rewards/rejected": -3.0156895773751393, "step": 14646 }, { "epoch": 0.7763496143958869, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52735348.0, "logits/rejected": -67148827.42857143, "logps/chosen": -334.21636962890625, "logps/rejected": -384.73238699776783, "loss": 0.1855, "rewards/chosen": -0.28715822100639343, "rewards/margins": 2.5565492723669325, "rewards/rejected": -2.843707493373326, "step": 14647 }, { "epoch": 0.776402618397689, "grad_norm": 44.75, "kl": 2.0560131072998047, "learning_rate": 5e-07, "logits/chosen": -68809.6, "logits/rejected": -27940514.666666668, "logps/chosen": -176.17208251953124, "logps/rejected": -405.4749348958333, "loss": 0.3676, "rewards/chosen": 0.2919055938720703, "rewards/margins": 2.05519806543986, "rewards/rejected": -1.7632924715677898, "step": 14648 }, { "epoch": 0.7764556223994912, "grad_norm": 47.0, "kl": 0.5479736328125, "learning_rate": 5e-07, "logits/chosen": -22405626.666666668, "logits/rejected": -50912236.8, "logps/chosen": -385.9455159505208, "logps/rejected": -478.4306640625, "loss": 0.1686, "rewards/chosen": 0.8917126655578613, "rewards/margins": 3.6693814277648924, "rewards/rejected": -2.777668762207031, "step": 14649 }, { "epoch": 0.7765086264012933, "grad_norm": 52.5, "kl": 3.135873794555664, "learning_rate": 5e-07, "logits/chosen": -39501350.4, "logits/rejected": -728992.0, "logps/chosen": -141.078369140625, "logps/rejected": -72.02435302734375, "loss": 0.3717, "rewards/chosen": 0.7430234909057617, "rewards/margins": 1.8675198713938395, "rewards/rejected": -1.1244963804880779, "step": 14650 }, { "epoch": 0.7765616304030954, "grad_norm": 43.5, "kl": 1.4100971221923828, "learning_rate": 5e-07, "logits/chosen": -1108866.4, "logits/rejected": -1481914.8333333333, "logps/chosen": -273.080322265625, "logps/rejected": -170.04668172200522, "loss": 0.2367, "rewards/chosen": 1.064048671722412, "rewards/margins": 4.439514700571696, "rewards/rejected": -3.3754660288492837, "step": 14651 }, { "epoch": 0.7766146344048975, "grad_norm": 53.0, "kl": 0.19826507568359375, "learning_rate": 5e-07, "logits/chosen": -56770416.0, "logits/rejected": -32543024.0, "logps/chosen": -443.902587890625, "logps/rejected": -362.67236328125, "loss": 0.2889, "rewards/chosen": 0.11468658844629924, "rewards/margins": 1.9348403970400494, "rewards/rejected": -1.82015380859375, "step": 14652 }, { "epoch": 0.7766676384066997, "grad_norm": 49.5, "kl": 0.1981182098388672, "learning_rate": 5e-07, "logits/chosen": 31813192.0, "logits/rejected": -37944333.71428572, "logps/chosen": -336.962646484375, "logps/rejected": -171.22086007254464, "loss": 0.3258, "rewards/chosen": -0.2712036073207855, "rewards/margins": 0.7880681369985854, "rewards/rejected": -1.0592717443193709, "step": 14653 }, { "epoch": 0.7767206424085018, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4958718.5, "logits/rejected": -27255318.85714286, "logps/chosen": -13.94035530090332, "logps/rejected": -231.05691964285714, "loss": 0.1481, "rewards/chosen": 0.5261940360069275, "rewards/margins": 2.876574524811336, "rewards/rejected": -2.3503804888044084, "step": 14654 }, { "epoch": 0.776773646410304, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93894016.0, "logits/rejected": -54017331.2, "logps/chosen": -350.9710286458333, "logps/rejected": -486.225390625, "loss": 0.2555, "rewards/chosen": -0.16776329278945923, "rewards/margins": 2.979373013973236, "rewards/rejected": -3.1471363067626954, "step": 14655 }, { "epoch": 0.7768266504121061, "grad_norm": 58.25, "kl": 4.359264373779297, "learning_rate": 5e-07, "logits/chosen": -1998825.0, "logits/rejected": -35299772.8, "logps/chosen": -340.17315673828125, "logps/rejected": -322.840625, "loss": 0.2227, "rewards/chosen": 1.2487347920735676, "rewards/margins": 3.6181886037190756, "rewards/rejected": -2.3694538116455077, "step": 14656 }, { "epoch": 0.7768796544139083, "grad_norm": 56.25, "kl": 0.15406036376953125, "learning_rate": 5e-07, "logits/chosen": -42795304.0, "logits/rejected": -16596025.0, "logps/chosen": -348.0860595703125, "logps/rejected": -262.249267578125, "loss": 0.2909, "rewards/chosen": 0.603212833404541, "rewards/margins": 2.3645726442337036, "rewards/rejected": -1.7613598108291626, "step": 14657 }, { "epoch": 0.7769326584157104, "grad_norm": 56.75, "kl": 0.02271270751953125, "learning_rate": 5e-07, "logits/chosen": -18409852.0, "logits/rejected": 6920256.0, "logps/chosen": -335.62652587890625, "logps/rejected": -223.2316691080729, "loss": 0.248, "rewards/chosen": 1.0310958623886108, "rewards/margins": 2.7340719302495318, "rewards/rejected": -1.7029760678609211, "step": 14658 }, { "epoch": 0.7769856624175125, "grad_norm": 90.5, "kl": 13.523094177246094, "learning_rate": 5e-07, "logits/chosen": -34553065.6, "logits/rejected": -33461432.0, "logps/chosen": -954.83701171875, "logps/rejected": -527.601806640625, "loss": 0.2985, "rewards/chosen": 2.061336135864258, "rewards/margins": 6.849684651692709, "rewards/rejected": -4.78834851582845, "step": 14659 }, { "epoch": 0.7770386664193146, "grad_norm": 55.5, "kl": 0.23615264892578125, "learning_rate": 5e-07, "logits/chosen": -9198628.0, "logits/rejected": -6683054.0, "logps/chosen": -477.234375, "logps/rejected": -193.86715698242188, "loss": 0.334, "rewards/chosen": -0.1304733157157898, "rewards/margins": 2.4335878491401672, "rewards/rejected": -2.564061164855957, "step": 14660 }, { "epoch": 0.7770916704211168, "grad_norm": 67.5, "kl": 5.325065612792969, "learning_rate": 5e-07, "logits/chosen": -75810480.0, "logits/rejected": -27024604.0, "logps/chosen": -678.2055053710938, "logps/rejected": -304.89898681640625, "loss": 0.1985, "rewards/chosen": 1.9545230865478516, "rewards/margins": 4.705681324005127, "rewards/rejected": -2.7511582374572754, "step": 14661 }, { "epoch": 0.7771446744229189, "grad_norm": 51.0, "kl": 2.758288621902466, "learning_rate": 5e-07, "logits/chosen": -13961630.4, "logits/rejected": -630092.6666666666, "logps/chosen": -251.3310791015625, "logps/rejected": -160.07402547200522, "loss": 0.3218, "rewards/chosen": 1.0222882270812987, "rewards/margins": 3.2069961865743, "rewards/rejected": -2.1847079594930015, "step": 14662 }, { "epoch": 0.7771976784247211, "grad_norm": 48.25, "kl": 1.66790771484375, "learning_rate": 5e-07, "logits/chosen": -3244696.0, "logits/rejected": -28468466.0, "logps/chosen": -288.0413818359375, "logps/rejected": -467.17950439453125, "loss": 0.3389, "rewards/chosen": 0.22578315436840057, "rewards/margins": 2.0542855709791183, "rewards/rejected": -1.8285024166107178, "step": 14663 }, { "epoch": 0.7772506824265232, "grad_norm": 43.25, "kl": 0.9828376770019531, "learning_rate": 5e-07, "logits/chosen": -33370936.0, "logits/rejected": -53224006.4, "logps/chosen": -1169.5037434895833, "logps/rejected": -356.8383056640625, "loss": 0.1925, "rewards/chosen": 1.2545730272928874, "rewards/margins": 4.243901220957438, "rewards/rejected": -2.9893281936645506, "step": 14664 }, { "epoch": 0.7773036864283254, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79540608.0, "logits/rejected": 34816314.666666664, "logps/chosen": -489.54046630859375, "logps/rejected": -373.0474446614583, "loss": 0.1961, "rewards/chosen": 0.8033599853515625, "rewards/margins": 2.5639591217041016, "rewards/rejected": -1.760599136352539, "step": 14665 }, { "epoch": 0.7773566904301275, "grad_norm": 53.5, "kl": 4.086521148681641, "learning_rate": 5e-07, "logits/chosen": -23840979.2, "logits/rejected": -38314373.333333336, "logps/chosen": -76.2139404296875, "logps/rejected": -323.1269124348958, "loss": 0.3128, "rewards/chosen": 0.9597747802734375, "rewards/margins": 3.183728504180908, "rewards/rejected": -2.2239537239074707, "step": 14666 }, { "epoch": 0.7774096944319296, "grad_norm": 44.25, "kl": 3.539358139038086, "learning_rate": 5e-07, "logits/chosen": -21351634.0, "logits/rejected": -27414016.0, "logps/chosen": -353.9101257324219, "logps/rejected": -229.6305389404297, "loss": 0.224, "rewards/chosen": 1.4487439393997192, "rewards/margins": 3.7610808610916138, "rewards/rejected": -2.3123369216918945, "step": 14667 }, { "epoch": 0.7774626984337317, "grad_norm": 30.375, "kl": 0.632904052734375, "learning_rate": 5e-07, "logits/chosen": 7464550.0, "logits/rejected": 27560796.8, "logps/chosen": -150.7704060872396, "logps/rejected": -470.666552734375, "loss": 0.2126, "rewards/chosen": 0.8771677017211914, "rewards/margins": 3.461012840270996, "rewards/rejected": -2.5838451385498047, "step": 14668 }, { "epoch": 0.7775157024355339, "grad_norm": 44.25, "kl": 0.9290771484375, "learning_rate": 5e-07, "logits/chosen": -1155328.0, "logits/rejected": -33280106.666666668, "logps/chosen": -1250.92041015625, "logps/rejected": -307.2822265625, "loss": 0.1504, "rewards/chosen": 2.0064423084259033, "rewards/margins": 4.414523363113403, "rewards/rejected": -2.4080810546875, "step": 14669 }, { "epoch": 0.777568706437336, "grad_norm": 56.75, "kl": 4.120243072509766, "learning_rate": 5e-07, "logits/chosen": -24820133.333333332, "logits/rejected": -9534657.0, "logps/chosen": -473.2436116536458, "logps/rejected": -179.98751831054688, "loss": 0.3392, "rewards/chosen": 1.1374691327412922, "rewards/margins": 2.629974802335103, "rewards/rejected": -1.492505669593811, "step": 14670 }, { "epoch": 0.7776217104391382, "grad_norm": 52.25, "kl": 1.6551284790039062, "learning_rate": 5e-07, "logits/chosen": 1433474.1666666667, "logits/rejected": -8721960.8, "logps/chosen": -205.04886881510416, "logps/rejected": -283.7819580078125, "loss": 0.2615, "rewards/chosen": 0.6342129707336426, "rewards/margins": 2.5715012550354004, "rewards/rejected": -1.9372882843017578, "step": 14671 }, { "epoch": 0.7776747144409403, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30194270.0, "logits/rejected": -17599848.0, "logps/chosen": -204.42910766601562, "logps/rejected": -232.8122355143229, "loss": 0.1979, "rewards/chosen": 0.18489301204681396, "rewards/margins": 2.4421765406926474, "rewards/rejected": -2.2572835286458335, "step": 14672 }, { "epoch": 0.7777277184427425, "grad_norm": 46.5, "kl": 2.1485514640808105, "learning_rate": 5e-07, "logits/chosen": -18150222.666666668, "logits/rejected": -25226699.2, "logps/chosen": -262.2471923828125, "logps/rejected": -331.239599609375, "loss": 0.221, "rewards/chosen": 1.6027075449625652, "rewards/margins": 3.308008639017741, "rewards/rejected": -1.7053010940551758, "step": 14673 }, { "epoch": 0.7777807224445445, "grad_norm": 43.0, "kl": 0.28981590270996094, "learning_rate": 5e-07, "logits/chosen": -55401850.666666664, "logits/rejected": -23642646.4, "logps/chosen": -222.74409993489584, "logps/rejected": -461.362109375, "loss": 0.248, "rewards/chosen": 0.29768164952596027, "rewards/margins": 3.6402090390523276, "rewards/rejected": -3.342527389526367, "step": 14674 }, { "epoch": 0.7778337264463467, "grad_norm": 151.0, "kl": 6.62750244140625, "learning_rate": 5e-07, "logits/chosen": -40380992.0, "logits/rejected": -21264307.2, "logps/chosen": -1067.6642252604167, "logps/rejected": -314.8067626953125, "loss": 0.2495, "rewards/chosen": 2.1486124992370605, "rewards/margins": 4.198643016815185, "rewards/rejected": -2.050030517578125, "step": 14675 }, { "epoch": 0.7778867304481488, "grad_norm": 59.0, "kl": 3.1797561645507812, "learning_rate": 5e-07, "logits/chosen": -5223586.0, "logits/rejected": -21858742.0, "logps/chosen": -111.54381561279297, "logps/rejected": -322.2281494140625, "loss": 0.3602, "rewards/chosen": 0.17741726338863373, "rewards/margins": 3.65278123319149, "rewards/rejected": -3.4753639698028564, "step": 14676 }, { "epoch": 0.777939734449951, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -22437992.0, "logps/rejected": -239.3065185546875, "loss": 0.1675, "rewards/rejected": -1.926751732826233, "step": 14677 }, { "epoch": 0.7779927384517531, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25613516.0, "logits/rejected": -65209232.0, "logps/chosen": -287.3600158691406, "logps/rejected": -221.96339416503906, "loss": 0.3206, "rewards/chosen": -0.08243637531995773, "rewards/margins": 2.1188227608799934, "rewards/rejected": -2.201259136199951, "step": 14678 }, { "epoch": 0.7780457424535553, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2523439.1666666665, "logits/rejected": -27236912.0, "logps/chosen": -101.01546223958333, "logps/rejected": -373.8625, "loss": 0.1829, "rewards/chosen": 0.6732354164123535, "rewards/margins": 3.859171199798584, "rewards/rejected": -3.1859357833862303, "step": 14679 }, { "epoch": 0.7780987464553574, "grad_norm": 49.25, "kl": 1.2868061065673828, "learning_rate": 5e-07, "logits/chosen": -16043412.0, "logits/rejected": -35745785.6, "logps/chosen": -116.6370137532552, "logps/rejected": -375.2129150390625, "loss": 0.2519, "rewards/chosen": 0.28589340051015216, "rewards/margins": 2.7388158400853477, "rewards/rejected": -2.4529224395751954, "step": 14680 }, { "epoch": 0.7781517504571596, "grad_norm": 38.75, "kl": 0.4818229675292969, "learning_rate": 5e-07, "logits/chosen": 14909673.0, "logits/rejected": -23034101.333333332, "logps/chosen": -281.5853576660156, "logps/rejected": -281.66864013671875, "loss": 0.1751, "rewards/chosen": 1.641268253326416, "rewards/margins": 3.3237026532491045, "rewards/rejected": -1.6824343999226887, "step": 14681 }, { "epoch": 0.7782047544589616, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33363056.0, "logits/rejected": -18346442.285714287, "logps/chosen": -207.75074768066406, "logps/rejected": -311.98311941964283, "loss": 0.1304, "rewards/chosen": -0.0026397705078125, "rewards/margins": 3.269879477364676, "rewards/rejected": -3.2725192478724887, "step": 14682 }, { "epoch": 0.7782577584607638, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26699744.0, "logits/rejected": -16676451.2, "logps/chosen": -353.3027750651042, "logps/rejected": -457.06279296875, "loss": 0.1826, "rewards/chosen": 0.40752077102661133, "rewards/margins": 3.5434754371643065, "rewards/rejected": -3.135954666137695, "step": 14683 }, { "epoch": 0.7783107624625659, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29522744.0, "logits/rejected": -15818324.0, "logps/chosen": -247.45963541666666, "logps/rejected": -242.6480712890625, "loss": 0.4068, "rewards/chosen": 0.02687181035677592, "rewards/margins": 3.320395360390345, "rewards/rejected": -3.2935235500335693, "step": 14684 }, { "epoch": 0.7783637664643681, "grad_norm": 75.0, "kl": 0.012243270874023438, "learning_rate": 5e-07, "logits/chosen": -23660442.666666668, "logits/rejected": -27945017.6, "logps/chosen": -424.9834798177083, "logps/rejected": -334.6169677734375, "loss": 0.3109, "rewards/chosen": 0.036712646484375, "rewards/margins": 1.829060173034668, "rewards/rejected": -1.792347526550293, "step": 14685 }, { "epoch": 0.7784167704661702, "grad_norm": 33.25, "kl": 1.853987693786621, "learning_rate": 5e-07, "logits/chosen": -33280642.666666668, "logits/rejected": -48405923.2, "logps/chosen": -198.18094889322916, "logps/rejected": -484.031884765625, "loss": 0.2186, "rewards/chosen": 0.24197353919347128, "rewards/margins": 3.232208295663198, "rewards/rejected": -2.9902347564697265, "step": 14686 }, { "epoch": 0.7784697744679724, "grad_norm": 49.75, "kl": 0.6305618286132812, "learning_rate": 5e-07, "logits/chosen": 6568794.8, "logits/rejected": -42061616.0, "logps/chosen": -371.62783203125, "logps/rejected": -736.28662109375, "loss": 0.2828, "rewards/chosen": 0.5722026348114013, "rewards/margins": 3.677475563685099, "rewards/rejected": -3.1052729288736978, "step": 14687 }, { "epoch": 0.7785227784697745, "grad_norm": 45.75, "kl": 2.335519790649414, "learning_rate": 5e-07, "logits/chosen": 9268632.0, "logits/rejected": -29147018.0, "logps/chosen": -287.4794006347656, "logps/rejected": -370.2843933105469, "loss": 0.3047, "rewards/chosen": 0.3928025960922241, "rewards/margins": 2.597319483757019, "rewards/rejected": -2.204516887664795, "step": 14688 }, { "epoch": 0.7785757824715765, "grad_norm": 47.5, "kl": 1.2659778594970703, "learning_rate": 5e-07, "logits/chosen": -30810794.666666668, "logits/rejected": -21580852.0, "logps/chosen": -359.3182373046875, "logps/rejected": -226.05734252929688, "loss": 0.2695, "rewards/chosen": 0.9307411511739095, "rewards/margins": 4.002941211064656, "rewards/rejected": -3.072200059890747, "step": 14689 }, { "epoch": 0.7786287864733787, "grad_norm": 26.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15209853.0, "logits/rejected": -6712953.333333333, "logps/chosen": -311.13726806640625, "logps/rejected": -162.348388671875, "loss": 0.1143, "rewards/chosen": 1.0940377712249756, "rewards/margins": 4.1763951778411865, "rewards/rejected": -3.082357406616211, "step": 14690 }, { "epoch": 0.7786817904751808, "grad_norm": 39.25, "kl": 0.45513343811035156, "learning_rate": 5e-07, "logits/chosen": -22662510.4, "logits/rejected": -9312245.333333334, "logps/chosen": -250.3817626953125, "logps/rejected": -654.9021809895834, "loss": 0.2422, "rewards/chosen": 0.8467130661010742, "rewards/margins": 3.6859893798828125, "rewards/rejected": -2.8392763137817383, "step": 14691 }, { "epoch": 0.778734794476983, "grad_norm": 29.625, "kl": 2.116910934448242, "learning_rate": 5e-07, "logits/chosen": -17780592.0, "logits/rejected": -38737104.0, "logps/chosen": -562.2470703125, "logps/rejected": -250.82626342773438, "loss": 0.246, "rewards/chosen": 0.886756181716919, "rewards/margins": 4.170138597488403, "rewards/rejected": -3.2833824157714844, "step": 14692 }, { "epoch": 0.7787877984787851, "grad_norm": 52.0, "kl": 1.2477302551269531, "learning_rate": 5e-07, "logits/chosen": -55830184.0, "logits/rejected": -3578111.75, "logps/chosen": -573.8038330078125, "logps/rejected": -272.294921875, "loss": 0.2861, "rewards/chosen": 0.7384069561958313, "rewards/margins": 2.625851809978485, "rewards/rejected": -1.8874448537826538, "step": 14693 }, { "epoch": 0.7788408024805873, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15618336.0, "logits/rejected": -51384571.428571425, "logps/chosen": -176.8181610107422, "logps/rejected": -393.7648228236607, "loss": 0.1562, "rewards/chosen": -0.30370789766311646, "rewards/margins": 2.2449251328195845, "rewards/rejected": -2.548633030482701, "step": 14694 }, { "epoch": 0.7788938064823894, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78545808.0, "logits/rejected": -13675346.285714285, "logps/chosen": -480.0850830078125, "logps/rejected": -304.26112583705356, "loss": 0.2, "rewards/chosen": 0.65997314453125, "rewards/margins": 2.7266412462506975, "rewards/rejected": -2.0666681017194475, "step": 14695 }, { "epoch": 0.7789468104841916, "grad_norm": 38.25, "kl": 1.4553499221801758, "learning_rate": 5e-07, "logits/chosen": -25073072.0, "logits/rejected": -27491973.333333332, "logps/chosen": -114.84500122070312, "logps/rejected": -394.1881510416667, "loss": 0.2179, "rewards/chosen": -0.0330832302570343, "rewards/margins": 2.5314967334270477, "rewards/rejected": -2.564579963684082, "step": 14696 }, { "epoch": 0.7789998144859936, "grad_norm": 54.0, "kl": 3.2181930541992188, "learning_rate": 5e-07, "logits/chosen": -27897952.0, "logits/rejected": -39944170.666666664, "logps/chosen": -238.980615234375, "logps/rejected": -461.6959635416667, "loss": 0.4071, "rewards/chosen": 0.09857966899871826, "rewards/margins": 2.7552276690800985, "rewards/rejected": -2.6566480000813804, "step": 14697 }, { "epoch": 0.7790528184877958, "grad_norm": 42.25, "kl": 0.5022029876708984, "learning_rate": 5e-07, "logits/chosen": -55412864.0, "logits/rejected": -4957090.4, "logps/chosen": -320.4691162109375, "logps/rejected": -205.5, "loss": 0.2149, "rewards/chosen": 0.6583221753438314, "rewards/margins": 2.664684518178304, "rewards/rejected": -2.0063623428344726, "step": 14698 }, { "epoch": 0.7791058224895979, "grad_norm": 42.75, "kl": 1.0042409896850586, "learning_rate": 5e-07, "logits/chosen": -26449536.0, "logits/rejected": 446025.3333333333, "logps/chosen": -143.50592041015625, "logps/rejected": -177.98189290364584, "loss": 0.3498, "rewards/chosen": 0.18491785526275634, "rewards/margins": 2.7403479496637977, "rewards/rejected": -2.5554300944010415, "step": 14699 }, { "epoch": 0.7791588264914001, "grad_norm": 38.0, "kl": 4.141977310180664, "learning_rate": 5e-07, "logits/chosen": -4821502.333333333, "logits/rejected": -65395560.0, "logps/chosen": -349.98828125, "logps/rejected": -376.0058288574219, "loss": 0.3742, "rewards/chosen": 0.8956063588460287, "rewards/margins": 3.357626517613729, "rewards/rejected": -2.4620201587677, "step": 14700 }, { "epoch": 0.7792118304932022, "grad_norm": 53.75, "kl": 0.08199501037597656, "learning_rate": 5e-07, "logits/chosen": -21606318.0, "logits/rejected": -24181458.0, "logps/chosen": -287.4061584472656, "logps/rejected": -186.14686584472656, "loss": 0.2786, "rewards/chosen": 0.4096843898296356, "rewards/margins": 2.398824065923691, "rewards/rejected": -1.9891396760940552, "step": 14701 }, { "epoch": 0.7792648344950044, "grad_norm": 36.75, "kl": 0.18718433380126953, "learning_rate": 5e-07, "logits/chosen": -70978624.0, "logits/rejected": -34811008.0, "logps/chosen": -292.53639729817706, "logps/rejected": -199.7139892578125, "loss": 0.2236, "rewards/chosen": 0.5302045345306396, "rewards/margins": 2.846424627304077, "rewards/rejected": -2.3162200927734373, "step": 14702 }, { "epoch": 0.7793178384968065, "grad_norm": 51.5, "kl": 0.9194622039794922, "learning_rate": 5e-07, "logits/chosen": -25612232.0, "logits/rejected": -42155244.0, "logps/chosen": -189.17591857910156, "logps/rejected": -338.6104431152344, "loss": 0.366, "rewards/chosen": -0.0727817490696907, "rewards/margins": 2.0375931784510612, "rewards/rejected": -2.110374927520752, "step": 14703 }, { "epoch": 0.7793708424986087, "grad_norm": 69.5, "kl": 1.6091842651367188, "learning_rate": 5e-07, "logits/chosen": 7032250.0, "logits/rejected": -31551808.0, "logps/chosen": -258.2852478027344, "logps/rejected": -244.48307291666666, "loss": 0.2942, "rewards/chosen": 0.6262214779853821, "rewards/margins": 1.9432963331540425, "rewards/rejected": -1.3170748551686604, "step": 14704 }, { "epoch": 0.7794238465004107, "grad_norm": 42.0, "kl": 0.4895820617675781, "learning_rate": 5e-07, "logits/chosen": 3589213.25, "logits/rejected": -3890412.5714285714, "logps/chosen": -49.112030029296875, "logps/rejected": -220.80841936383928, "loss": 0.2546, "rewards/chosen": -0.5480033755302429, "rewards/margins": 1.0999611020088196, "rewards/rejected": -1.6479644775390625, "step": 14705 }, { "epoch": 0.7794768505022129, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56274856.0, "logits/rejected": -24005349.333333332, "logps/chosen": -284.1943054199219, "logps/rejected": -258.04246012369794, "loss": 0.1574, "rewards/chosen": 0.648516833782196, "rewards/margins": 3.8538015882174173, "rewards/rejected": -3.205284754435221, "step": 14706 }, { "epoch": 0.779529854504015, "grad_norm": 49.75, "kl": 3.3773040771484375, "learning_rate": 5e-07, "logits/chosen": -16677301.333333334, "logits/rejected": -47913337.6, "logps/chosen": -764.8255208333334, "logps/rejected": -237.438916015625, "loss": 0.2472, "rewards/chosen": 1.2846955458323162, "rewards/margins": 3.6309373060862224, "rewards/rejected": -2.346241760253906, "step": 14707 }, { "epoch": 0.7795828585058172, "grad_norm": 62.5, "kl": 1.7700958251953125, "learning_rate": 5e-07, "logits/chosen": -27463882.0, "logits/rejected": -60000976.0, "logps/chosen": -410.1865234375, "logps/rejected": -583.0892333984375, "loss": 0.2143, "rewards/chosen": 0.9886882901191711, "rewards/margins": 4.696491062641144, "rewards/rejected": -3.7078027725219727, "step": 14708 }, { "epoch": 0.7796358625076193, "grad_norm": 50.0, "kl": 1.8047599792480469, "learning_rate": 5e-07, "logits/chosen": -12911278.666666666, "logits/rejected": -15781192.0, "logps/chosen": -136.59967041015625, "logps/rejected": -214.551904296875, "loss": 0.2216, "rewards/chosen": 0.43992793560028076, "rewards/margins": 3.266734766960144, "rewards/rejected": -2.826806831359863, "step": 14709 }, { "epoch": 0.7796888665094215, "grad_norm": 45.25, "kl": 0.7738685607910156, "learning_rate": 5e-07, "logits/chosen": -28954878.0, "logits/rejected": -11015404.0, "logps/chosen": -294.5520324707031, "logps/rejected": -186.8032989501953, "loss": 0.3529, "rewards/chosen": -0.22512571513652802, "rewards/margins": 1.7817035466432571, "rewards/rejected": -2.006829261779785, "step": 14710 }, { "epoch": 0.7797418705112236, "grad_norm": 47.25, "kl": 0.16407394409179688, "learning_rate": 5e-07, "logits/chosen": -66150368.0, "logits/rejected": -18088866.0, "logps/chosen": -289.21136474609375, "logps/rejected": -223.375732421875, "loss": 0.2247, "rewards/chosen": 0.5588135123252869, "rewards/margins": 3.397801101207733, "rewards/rejected": -2.8389875888824463, "step": 14711 }, { "epoch": 0.7797948745130258, "grad_norm": 35.25, "kl": 0.37145042419433594, "learning_rate": 5e-07, "logits/chosen": -1324274.3333333333, "logits/rejected": -19688273.6, "logps/chosen": -190.529296875, "logps/rejected": -277.5137939453125, "loss": 0.1986, "rewards/chosen": 0.9445083936055502, "rewards/margins": 3.2351402600606285, "rewards/rejected": -2.290631866455078, "step": 14712 }, { "epoch": 0.7798478785148278, "grad_norm": 51.5, "kl": 0.2239990234375, "learning_rate": 5e-07, "logits/chosen": -118736864.0, "logits/rejected": -26559534.0, "logps/chosen": -360.50439453125, "logps/rejected": -310.8424987792969, "loss": 0.2481, "rewards/chosen": 0.616106390953064, "rewards/margins": 3.2902740240097046, "rewards/rejected": -2.6741676330566406, "step": 14713 }, { "epoch": 0.77990088251663, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17552792.0, "logits/rejected": -36402117.333333336, "logps/chosen": -241.68043518066406, "logps/rejected": -381.3074544270833, "loss": 0.1727, "rewards/chosen": 0.593473494052887, "rewards/margins": 3.455511152744293, "rewards/rejected": -2.8620376586914062, "step": 14714 }, { "epoch": 0.7799538865184321, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14315150.0, "logits/rejected": -34295930.666666664, "logps/chosen": -486.0645446777344, "logps/rejected": -282.24481201171875, "loss": 0.2235, "rewards/chosen": 0.14044341444969177, "rewards/margins": 2.5176545083522797, "rewards/rejected": -2.377211093902588, "step": 14715 }, { "epoch": 0.7800068905202343, "grad_norm": 47.0, "kl": 0.7436141967773438, "learning_rate": 5e-07, "logits/chosen": -10122414.0, "logits/rejected": -18483800.0, "logps/chosen": -250.62026977539062, "logps/rejected": -241.2266082763672, "loss": 0.2594, "rewards/chosen": 0.5315746068954468, "rewards/margins": 2.458365321159363, "rewards/rejected": -1.926790714263916, "step": 14716 }, { "epoch": 0.7800598945220364, "grad_norm": 43.75, "kl": 1.507223129272461, "learning_rate": 5e-07, "logits/chosen": -41584691.2, "logits/rejected": -27410146.666666668, "logps/chosen": -205.6893798828125, "logps/rejected": -449.609130859375, "loss": 0.319, "rewards/chosen": 0.17822937965393065, "rewards/margins": 3.7861439863840736, "rewards/rejected": -3.607914606730143, "step": 14717 }, { "epoch": 0.7801128985238386, "grad_norm": 50.0, "kl": 2.084564208984375, "learning_rate": 5e-07, "logits/chosen": -37134000.0, "logits/rejected": -59402508.0, "logps/chosen": -511.5170084635417, "logps/rejected": -309.82177734375, "loss": 0.2693, "rewards/chosen": 1.147827943166097, "rewards/margins": 3.906496127446492, "rewards/rejected": -2.7586681842803955, "step": 14718 }, { "epoch": 0.7801659025256407, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25211968.0, "logits/rejected": -43654048.0, "logps/chosen": -266.1073811848958, "logps/rejected": -505.91396484375, "loss": 0.2698, "rewards/chosen": -0.06163128217061361, "rewards/margins": 2.4309950987497966, "rewards/rejected": -2.4926263809204103, "step": 14719 }, { "epoch": 0.7802189065274429, "grad_norm": 42.75, "kl": 0.15494918823242188, "learning_rate": 5e-07, "logits/chosen": -12121374.0, "logits/rejected": -582054.1428571428, "logps/chosen": -259.9080810546875, "logps/rejected": -280.7681884765625, "loss": 0.1208, "rewards/chosen": 1.3438202142715454, "rewards/margins": 3.6745348487581526, "rewards/rejected": -2.330714634486607, "step": 14720 }, { "epoch": 0.7802719105292449, "grad_norm": 43.25, "kl": 2.4036788940429688, "learning_rate": 5e-07, "logits/chosen": -63980581.333333336, "logits/rejected": -51716182.4, "logps/chosen": -548.6077067057291, "logps/rejected": -235.6273681640625, "loss": 0.2225, "rewards/chosen": 1.8414672215779622, "rewards/margins": 3.042987569173177, "rewards/rejected": -1.2015203475952148, "step": 14721 }, { "epoch": 0.7803249145310471, "grad_norm": 68.0, "kl": 0.1532421112060547, "learning_rate": 5e-07, "logits/chosen": 34394148.0, "logits/rejected": -20457430.0, "logps/chosen": -325.4939880371094, "logps/rejected": -201.27667236328125, "loss": 0.3962, "rewards/chosen": -0.05169610679149628, "rewards/margins": 1.3086463958024979, "rewards/rejected": -1.3603425025939941, "step": 14722 }, { "epoch": 0.7803779185328492, "grad_norm": 39.75, "kl": 0.6578102111816406, "learning_rate": 5e-07, "logits/chosen": -2782360.0, "logits/rejected": -60733624.0, "logps/chosen": -152.0200958251953, "logps/rejected": -425.329345703125, "loss": 0.2273, "rewards/chosen": 0.4614495635032654, "rewards/margins": 3.8081406950950623, "rewards/rejected": -3.346691131591797, "step": 14723 }, { "epoch": 0.7804309225346514, "grad_norm": 36.0, "kl": 1.3707046508789062, "learning_rate": 5e-07, "logits/chosen": -23732876.8, "logits/rejected": -80364954.66666667, "logps/chosen": -208.1792236328125, "logps/rejected": -169.03032430013022, "loss": 0.2671, "rewards/chosen": 0.8426645278930665, "rewards/margins": 3.447102355957031, "rewards/rejected": -2.604437828063965, "step": 14724 }, { "epoch": 0.7804839265364535, "grad_norm": 49.25, "kl": 5.885321617126465, "learning_rate": 5e-07, "logits/chosen": -20667398.0, "logits/rejected": -103765232.0, "logps/chosen": -228.49163818359375, "logps/rejected": -224.89247131347656, "loss": 0.3444, "rewards/chosen": 1.033638834953308, "rewards/margins": 2.9021717309951782, "rewards/rejected": -1.8685328960418701, "step": 14725 }, { "epoch": 0.7805369305382557, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61621542.4, "logits/rejected": -25560416.0, "logps/chosen": -490.99873046875, "logps/rejected": -183.4968465169271, "loss": 0.3311, "rewards/chosen": 0.23882415294647216, "rewards/margins": 2.181408985455831, "rewards/rejected": -1.9425848325093586, "step": 14726 }, { "epoch": 0.7805899345400578, "grad_norm": 50.5, "kl": 0.79681396484375, "learning_rate": 5e-07, "logits/chosen": -11026611.2, "logits/rejected": -29633344.0, "logps/chosen": -309.80556640625, "logps/rejected": -286.6785481770833, "loss": 0.2548, "rewards/chosen": 0.6261401653289795, "rewards/margins": 4.120403178532919, "rewards/rejected": -3.494263013203939, "step": 14727 }, { "epoch": 0.78064293854186, "grad_norm": 133.0, "kl": 12.981634140014648, "learning_rate": 5e-07, "logits/chosen": -46177952.0, "logits/rejected": -54801322.666666664, "logps/chosen": -878.60078125, "logps/rejected": -430.457275390625, "loss": 0.2946, "rewards/chosen": 2.629940223693848, "rewards/margins": 4.957859706878662, "rewards/rejected": -2.3279194831848145, "step": 14728 }, { "epoch": 0.780695942543662, "grad_norm": 47.5, "kl": 0.9138660430908203, "learning_rate": 5e-07, "logits/chosen": -22882072.0, "logits/rejected": -23662238.0, "logps/chosen": -355.7969665527344, "logps/rejected": -125.81770324707031, "loss": 0.1967, "rewards/chosen": 0.8910737037658691, "rewards/margins": 3.5146613121032715, "rewards/rejected": -2.6235876083374023, "step": 14729 }, { "epoch": 0.7807489465454642, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36633068.0, "logits/rejected": 18667062.666666668, "logps/chosen": -358.36126708984375, "logps/rejected": -394.7948811848958, "loss": 0.2491, "rewards/chosen": -0.7329727411270142, "rewards/margins": 1.5743515094121299, "rewards/rejected": -2.307324250539144, "step": 14730 }, { "epoch": 0.7808019505472663, "grad_norm": 53.0, "kl": 1.0258064270019531, "learning_rate": 5e-07, "logits/chosen": -35978546.666666664, "logits/rejected": -56285416.0, "logps/chosen": -316.92567952473956, "logps/rejected": -512.7528076171875, "loss": 0.3469, "rewards/chosen": 0.4156246582667033, "rewards/margins": 4.061491052309672, "rewards/rejected": -3.6458663940429688, "step": 14731 }, { "epoch": 0.7808549545490685, "grad_norm": 38.0, "kl": 3.06536865234375, "learning_rate": 5e-07, "logits/chosen": 8504530.0, "logits/rejected": 1223585.0, "logps/chosen": -177.26048278808594, "logps/rejected": -182.1198272705078, "loss": 0.3506, "rewards/chosen": 0.044100768864154816, "rewards/margins": 2.282518394291401, "rewards/rejected": -2.238417625427246, "step": 14732 }, { "epoch": 0.7809079585508706, "grad_norm": 46.5, "kl": 0.30429840087890625, "learning_rate": 5e-07, "logits/chosen": -253311.90625, "logits/rejected": -5494117.333333333, "logps/chosen": -75.57587432861328, "logps/rejected": -345.6833902994792, "loss": 0.2289, "rewards/chosen": -0.23320266604423523, "rewards/margins": 2.147981435060501, "rewards/rejected": -2.3811841011047363, "step": 14733 }, { "epoch": 0.7809609625526728, "grad_norm": 42.5, "kl": 1.6432771682739258, "learning_rate": 5e-07, "logits/chosen": -11757152.0, "logits/rejected": -21217410.0, "logps/chosen": -185.85545349121094, "logps/rejected": -353.43951416015625, "loss": 0.2327, "rewards/chosen": 0.8005824089050293, "rewards/margins": 3.393604278564453, "rewards/rejected": -2.593021869659424, "step": 14734 }, { "epoch": 0.7810139665544749, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78476528.0, "logits/rejected": -5924178.0, "logps/chosen": -507.1346740722656, "logps/rejected": -295.34779866536456, "loss": 0.2184, "rewards/chosen": -0.42102357745170593, "rewards/margins": 2.1911299526691437, "rewards/rejected": -2.6121535301208496, "step": 14735 }, { "epoch": 0.781066970556277, "grad_norm": 55.0, "kl": 0.8194503784179688, "learning_rate": 5e-07, "logits/chosen": 4870926.0, "logits/rejected": -13343434.0, "logps/chosen": -121.57728576660156, "logps/rejected": -277.38543701171875, "loss": 0.2827, "rewards/chosen": 0.8089549541473389, "rewards/margins": 2.55004620552063, "rewards/rejected": -1.741091251373291, "step": 14736 }, { "epoch": 0.7811199745580791, "grad_norm": 31.5, "kl": 3.8078603744506836, "learning_rate": 5e-07, "logits/chosen": -17292524.8, "logits/rejected": -12701276.0, "logps/chosen": -127.69854736328125, "logps/rejected": -301.4721272786458, "loss": 0.3506, "rewards/chosen": 0.2987198352813721, "rewards/margins": 3.936985445022583, "rewards/rejected": -3.638265609741211, "step": 14737 }, { "epoch": 0.7811729785598813, "grad_norm": 73.0, "kl": 3.0084667205810547, "learning_rate": 5e-07, "logits/chosen": -38659242.666666664, "logits/rejected": -35645120.0, "logps/chosen": -448.3955078125, "logps/rejected": -205.71568298339844, "loss": 0.3282, "rewards/chosen": 0.9202124277750651, "rewards/margins": 3.2132712999979653, "rewards/rejected": -2.2930588722229004, "step": 14738 }, { "epoch": 0.7812259825616834, "grad_norm": 44.5, "kl": 1.1113767623901367, "learning_rate": 5e-07, "logits/chosen": -33742936.0, "logits/rejected": -32426842.0, "logps/chosen": -201.88412475585938, "logps/rejected": -275.8004150390625, "loss": 0.3153, "rewards/chosen": 0.15041598677635193, "rewards/margins": 2.4334984719753265, "rewards/rejected": -2.2830824851989746, "step": 14739 }, { "epoch": 0.7812789865634855, "grad_norm": 39.0, "kl": 0.18519210815429688, "learning_rate": 5e-07, "logits/chosen": -25339094.4, "logits/rejected": -35783522.666666664, "logps/chosen": -157.68773193359374, "logps/rejected": -500.0752766927083, "loss": 0.3274, "rewards/chosen": 0.17384281158447265, "rewards/margins": 3.4435638427734374, "rewards/rejected": -3.269721031188965, "step": 14740 }, { "epoch": 0.7813319905652877, "grad_norm": 56.0, "kl": 1.2304344177246094, "learning_rate": 5e-07, "logits/chosen": -41374392.0, "logits/rejected": -20332280.0, "logps/chosen": -454.5492858886719, "logps/rejected": -320.6548767089844, "loss": 0.2867, "rewards/chosen": 0.44210129976272583, "rewards/margins": 3.4144099354743958, "rewards/rejected": -2.97230863571167, "step": 14741 }, { "epoch": 0.7813849945670898, "grad_norm": 42.25, "kl": 2.088656425476074, "learning_rate": 5e-07, "logits/chosen": -22546915.2, "logits/rejected": -44885189.333333336, "logps/chosen": -294.4543701171875, "logps/rejected": -459.3902180989583, "loss": 0.237, "rewards/chosen": 0.8331254959106446, "rewards/margins": 3.881888771057129, "rewards/rejected": -3.0487632751464844, "step": 14742 }, { "epoch": 0.781437998568892, "grad_norm": 54.5, "kl": 1.9875402450561523, "learning_rate": 5e-07, "logits/chosen": -5557826.5, "logits/rejected": -26271464.0, "logps/chosen": -249.59872436523438, "logps/rejected": -404.49163818359375, "loss": 0.2899, "rewards/chosen": 0.42254775762557983, "rewards/margins": 3.074405610561371, "rewards/rejected": -2.651857852935791, "step": 14743 }, { "epoch": 0.781491002570694, "grad_norm": 40.75, "kl": 0.24990367889404297, "learning_rate": 5e-07, "logits/chosen": -3053286.3333333335, "logits/rejected": -31209043.2, "logps/chosen": -99.4432373046875, "logps/rejected": -479.10576171875, "loss": 0.1882, "rewards/chosen": 1.1738153298695881, "rewards/margins": 3.549097998936971, "rewards/rejected": -2.3752826690673827, "step": 14744 }, { "epoch": 0.7815440065724962, "grad_norm": 40.0, "kl": 0.09931182861328125, "learning_rate": 5e-07, "logits/chosen": -20547533.333333332, "logits/rejected": -8558657.6, "logps/chosen": -207.31770833333334, "logps/rejected": -158.02340087890624, "loss": 0.2398, "rewards/chosen": -0.06910146276156108, "rewards/margins": 3.7510466237862907, "rewards/rejected": -3.8201480865478517, "step": 14745 }, { "epoch": 0.7815970105742983, "grad_norm": 49.5, "kl": 0.4988822937011719, "learning_rate": 5e-07, "logits/chosen": -45292124.0, "logits/rejected": -36025568.0, "logps/chosen": -398.92724609375, "logps/rejected": -316.8380126953125, "loss": 0.2583, "rewards/chosen": 0.5222465395927429, "rewards/margins": 2.9864131808280945, "rewards/rejected": -2.4641666412353516, "step": 14746 }, { "epoch": 0.7816500145761005, "grad_norm": 36.75, "kl": 1.4784259796142578, "learning_rate": 5e-07, "logits/chosen": -9136576.0, "logits/rejected": -54456888.0, "logps/chosen": -708.4090576171875, "logps/rejected": -221.13278198242188, "loss": 0.1645, "rewards/chosen": 1.4692089557647705, "rewards/margins": 4.616171360015869, "rewards/rejected": -3.1469624042510986, "step": 14747 }, { "epoch": 0.7817030185779026, "grad_norm": 45.75, "kl": 1.8460578918457031, "learning_rate": 5e-07, "logits/chosen": -11489144.0, "logits/rejected": 623752.75, "logps/chosen": -286.41680908203125, "logps/rejected": -58.770938873291016, "loss": 0.3441, "rewards/chosen": 0.44280868768692017, "rewards/margins": 2.3972155451774597, "rewards/rejected": -1.9544068574905396, "step": 14748 }, { "epoch": 0.7817560225797048, "grad_norm": 61.25, "kl": 3.179623603820801, "learning_rate": 5e-07, "logits/chosen": -96056432.0, "logits/rejected": -23087914.0, "logps/chosen": -729.547607421875, "logps/rejected": -261.2419128417969, "loss": 0.2797, "rewards/chosen": 1.1528375148773193, "rewards/margins": 2.1194652318954468, "rewards/rejected": -0.9666277170181274, "step": 14749 }, { "epoch": 0.7818090265815069, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56075568.0, "logits/rejected": -48383660.0, "logps/chosen": -294.40032958984375, "logps/rejected": -431.7764892578125, "loss": 0.3014, "rewards/chosen": 0.07640542834997177, "rewards/margins": 2.574175499379635, "rewards/rejected": -2.497770071029663, "step": 14750 }, { "epoch": 0.781862030583309, "grad_norm": 91.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21029470.0, "logits/rejected": -15220014.0, "logps/chosen": -661.5145263671875, "logps/rejected": -252.25808715820312, "loss": 0.2983, "rewards/chosen": 0.03976362198591232, "rewards/margins": 2.4779800698161125, "rewards/rejected": -2.4382164478302, "step": 14751 }, { "epoch": 0.7819150345851111, "grad_norm": 37.0, "kl": 0.6811285018920898, "learning_rate": 5e-07, "logits/chosen": 11426166.666666666, "logits/rejected": -8947688.8, "logps/chosen": -36.37135569254557, "logps/rejected": -160.6673828125, "loss": 0.3216, "rewards/chosen": -0.3221687475840251, "rewards/margins": 2.0238813241322835, "rewards/rejected": -2.3460500717163084, "step": 14752 }, { "epoch": 0.7819680385869133, "grad_norm": 54.25, "kl": 2.4174041748046875, "learning_rate": 5e-07, "logits/chosen": -12353032.0, "logits/rejected": -41322420.0, "logps/chosen": -239.40357971191406, "logps/rejected": -390.1153564453125, "loss": 0.239, "rewards/chosen": 0.7969828248023987, "rewards/margins": 3.4208629727363586, "rewards/rejected": -2.62388014793396, "step": 14753 }, { "epoch": 0.7820210425887154, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35664760.0, "logits/rejected": -23058826.666666668, "logps/chosen": -394.5697326660156, "logps/rejected": -239.72420247395834, "loss": 0.3077, "rewards/chosen": -0.9943087697029114, "rewards/margins": 1.5300372640291848, "rewards/rejected": -2.524346033732096, "step": 14754 }, { "epoch": 0.7820740465905176, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34686008.0, "logits/rejected": -34549808.0, "logps/chosen": -411.651611328125, "logps/rejected": -411.02911376953125, "loss": 0.2776, "rewards/chosen": 0.22685356438159943, "rewards/margins": 3.008548215031624, "rewards/rejected": -2.7816946506500244, "step": 14755 }, { "epoch": 0.7821270505923197, "grad_norm": 50.5, "kl": 0.9385833740234375, "learning_rate": 5e-07, "logits/chosen": -4801740.0, "logits/rejected": 6471228.666666667, "logps/chosen": -908.0208129882812, "logps/rejected": -332.21478271484375, "loss": 0.1721, "rewards/chosen": 1.779219150543213, "rewards/margins": 4.480734348297119, "rewards/rejected": -2.7015151977539062, "step": 14756 }, { "epoch": 0.7821800545941219, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11488437.333333334, "logits/rejected": -47571193.6, "logps/chosen": -266.60992431640625, "logps/rejected": -559.51103515625, "loss": 0.2624, "rewards/chosen": -0.09924530982971191, "rewards/margins": 2.5306695461273194, "rewards/rejected": -2.6299148559570313, "step": 14757 }, { "epoch": 0.782233058595924, "grad_norm": 29.0, "kl": 1.0442924499511719, "learning_rate": 5e-07, "logits/chosen": -98340056.0, "logits/rejected": -20854264.0, "logps/chosen": -69.28624725341797, "logps/rejected": -372.2369689941406, "loss": 0.2668, "rewards/chosen": 0.197806254029274, "rewards/margins": 3.9510630518198013, "rewards/rejected": -3.7532567977905273, "step": 14758 }, { "epoch": 0.7822860625977262, "grad_norm": 67.0, "kl": 5.0637407302856445, "learning_rate": 5e-07, "logits/chosen": -30188962.285714287, "logits/rejected": -2586430.0, "logps/chosen": -500.4193638392857, "logps/rejected": -91.06097412109375, "loss": 0.3023, "rewards/chosen": 1.415325437273298, "rewards/margins": 4.08419064113072, "rewards/rejected": -2.668865203857422, "step": 14759 }, { "epoch": 0.7823390665995282, "grad_norm": 68.0, "kl": 1.0940933227539062, "learning_rate": 5e-07, "logits/chosen": -21099026.666666668, "logits/rejected": -42789160.0, "logps/chosen": -422.6418863932292, "logps/rejected": -713.8380126953125, "loss": 0.3766, "rewards/chosen": 0.2500406901041667, "rewards/margins": 2.79770294825236, "rewards/rejected": -2.5476622581481934, "step": 14760 }, { "epoch": 0.7823920706013304, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2907887.75, "logits/rejected": -36504472.0, "logps/chosen": -259.6807861328125, "logps/rejected": -443.6424560546875, "loss": 0.2559, "rewards/chosen": 0.6952418684959412, "rewards/margins": 2.7982720732688904, "rewards/rejected": -2.103030204772949, "step": 14761 }, { "epoch": 0.7824450746031325, "grad_norm": 47.0, "kl": 1.1858882904052734, "learning_rate": 5e-07, "logits/chosen": -27650565.333333332, "logits/rejected": -5869335.2, "logps/chosen": -959.819580078125, "logps/rejected": -394.7589599609375, "loss": 0.1885, "rewards/chosen": 1.6726818084716797, "rewards/margins": 4.443143844604492, "rewards/rejected": -2.7704620361328125, "step": 14762 }, { "epoch": 0.7824980786049347, "grad_norm": 62.75, "kl": 3.462390899658203, "learning_rate": 5e-07, "logits/chosen": -6832216.8, "logits/rejected": -6284356.666666667, "logps/chosen": -288.7491943359375, "logps/rejected": -160.64539591471353, "loss": 0.3547, "rewards/chosen": 0.5204963684082031, "rewards/margins": 2.616562843322754, "rewards/rejected": -2.096066474914551, "step": 14763 }, { "epoch": 0.7825510826067368, "grad_norm": 51.0, "kl": 3.957855224609375, "learning_rate": 5e-07, "logits/chosen": -44076533.333333336, "logits/rejected": 14232940.8, "logps/chosen": -639.0605875651041, "logps/rejected": -321.4212890625, "loss": 0.2433, "rewards/chosen": 1.4346168835957844, "rewards/margins": 3.396170647939046, "rewards/rejected": -1.9615537643432617, "step": 14764 }, { "epoch": 0.782604086608539, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20404412.0, "logits/rejected": -53088819.2, "logps/chosen": -143.10728963216147, "logps/rejected": -456.74736328125, "loss": 0.2309, "rewards/chosen": -0.03990962107976278, "rewards/margins": 2.8380692760149637, "rewards/rejected": -2.8779788970947267, "step": 14765 }, { "epoch": 0.782657090610341, "grad_norm": 47.25, "kl": 1.4791297912597656, "learning_rate": 5e-07, "logits/chosen": -3036007.2, "logits/rejected": -18440160.0, "logps/chosen": -198.350390625, "logps/rejected": -175.3427734375, "loss": 0.3951, "rewards/chosen": 0.29312725067138673, "rewards/margins": 1.5563355604807536, "rewards/rejected": -1.263208309809367, "step": 14766 }, { "epoch": 0.7827100946121432, "grad_norm": 58.5, "kl": 3.6196842193603516, "learning_rate": 5e-07, "logits/chosen": -22706228.8, "logits/rejected": -6405900.0, "logps/chosen": -208.56796875, "logps/rejected": -217.0613810221354, "loss": 0.3938, "rewards/chosen": 0.5827295780181885, "rewards/margins": 1.6494181156158447, "rewards/rejected": -1.0666885375976562, "step": 14767 }, { "epoch": 0.7827630986139453, "grad_norm": 52.5, "kl": 5.901212692260742, "learning_rate": 5e-07, "logits/chosen": -12044660.0, "logits/rejected": -23154260.8, "logps/chosen": -380.244384765625, "logps/rejected": -598.3392578125, "loss": 0.2456, "rewards/chosen": 1.4265478452046711, "rewards/margins": 4.9389719327290855, "rewards/rejected": -3.512424087524414, "step": 14768 }, { "epoch": 0.7828161026157475, "grad_norm": 61.5, "kl": 0.511444091796875, "learning_rate": 5e-07, "logits/chosen": 16093932.0, "logits/rejected": 5559362.0, "logps/chosen": -343.096435546875, "logps/rejected": -177.8475830078125, "loss": 0.3334, "rewards/chosen": 0.16162516673405966, "rewards/margins": 1.7689127643903095, "rewards/rejected": -1.60728759765625, "step": 14769 }, { "epoch": 0.7828691066175496, "grad_norm": 46.0, "kl": 2.2779388427734375, "learning_rate": 5e-07, "logits/chosen": -18630146.0, "logits/rejected": -56511780.0, "logps/chosen": -431.84503173828125, "logps/rejected": -330.0682678222656, "loss": 0.2513, "rewards/chosen": 0.8047606945037842, "rewards/margins": 2.824840784072876, "rewards/rejected": -2.020080089569092, "step": 14770 }, { "epoch": 0.7829221106193518, "grad_norm": 45.5, "kl": 2.9139633178710938, "learning_rate": 5e-07, "logits/chosen": -29130466.666666668, "logits/rejected": -13723612.0, "logps/chosen": -380.1957194010417, "logps/rejected": -209.01611328125, "loss": 0.3341, "rewards/chosen": 1.0360093116760254, "rewards/margins": 2.4891072511672974, "rewards/rejected": -1.453097939491272, "step": 14771 }, { "epoch": 0.7829751146211539, "grad_norm": 49.25, "kl": 5.02479362487793, "learning_rate": 5e-07, "logits/chosen": -10224081.333333334, "logits/rejected": -51902372.0, "logps/chosen": -163.61517333984375, "logps/rejected": -324.76995849609375, "loss": 0.4896, "rewards/chosen": 0.23056228955586752, "rewards/margins": 1.2345677216847737, "rewards/rejected": -1.0040054321289062, "step": 14772 }, { "epoch": 0.7830281186229561, "grad_norm": 46.25, "kl": 1.8830223083496094, "learning_rate": 5e-07, "logits/chosen": -49358214.4, "logits/rejected": -35505496.0, "logps/chosen": -624.055078125, "logps/rejected": -227.12516276041666, "loss": 0.3307, "rewards/chosen": 0.8731366157531738, "rewards/margins": 2.642643388112386, "rewards/rejected": -1.7695067723592122, "step": 14773 }, { "epoch": 0.7830811226247582, "grad_norm": 50.25, "kl": 0.688868522644043, "learning_rate": 5e-07, "logits/chosen": -14551106.0, "logits/rejected": -48260012.0, "logps/chosen": -86.75498962402344, "logps/rejected": -419.65283203125, "loss": 0.22, "rewards/chosen": 0.6212892532348633, "rewards/margins": 3.5866050720214844, "rewards/rejected": -2.965315818786621, "step": 14774 }, { "epoch": 0.7831341266265603, "grad_norm": 36.75, "kl": 0.4488868713378906, "learning_rate": 5e-07, "logits/chosen": -16915488.0, "logits/rejected": -48906888.0, "logps/chosen": -190.5941619873047, "logps/rejected": -271.0843505859375, "loss": 0.2795, "rewards/chosen": 0.5739185810089111, "rewards/margins": 2.4363880157470703, "rewards/rejected": -1.8624694347381592, "step": 14775 }, { "epoch": 0.7831871306283624, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42452392.0, "logits/rejected": -18118516.0, "logps/chosen": -294.21533203125, "logps/rejected": -267.068603515625, "loss": 0.3099, "rewards/chosen": 0.15894660353660583, "rewards/margins": 2.261108487844467, "rewards/rejected": -2.1021618843078613, "step": 14776 }, { "epoch": 0.7832401346301646, "grad_norm": 41.25, "kl": 3.315408706665039, "learning_rate": 5e-07, "logits/chosen": 6327026.666666667, "logits/rejected": -27327452.8, "logps/chosen": -46.83895365397135, "logps/rejected": -350.701318359375, "loss": 0.2673, "rewards/chosen": 0.46449120839436847, "rewards/margins": 2.559066899617513, "rewards/rejected": -2.0945756912231444, "step": 14777 }, { "epoch": 0.7832931386319667, "grad_norm": 39.75, "kl": 2.46915340423584, "learning_rate": 5e-07, "logits/chosen": 3020955.5, "logits/rejected": -15581374.0, "logps/chosen": -116.6937255859375, "logps/rejected": -257.35955810546875, "loss": 0.3138, "rewards/chosen": 0.47717979550361633, "rewards/margins": 2.5104039013385773, "rewards/rejected": -2.033224105834961, "step": 14778 }, { "epoch": 0.7833461426337689, "grad_norm": 62.25, "kl": 4.376770973205566, "learning_rate": 5e-07, "logits/chosen": -31606393.6, "logits/rejected": -42243845.333333336, "logps/chosen": -513.588916015625, "logps/rejected": -472.831298828125, "loss": 0.2675, "rewards/chosen": 1.3247210502624511, "rewards/margins": 3.7815056165059406, "rewards/rejected": -2.4567845662434897, "step": 14779 }, { "epoch": 0.783399146635571, "grad_norm": 61.0, "kl": 2.9603538513183594, "learning_rate": 5e-07, "logits/chosen": -46045548.8, "logits/rejected": -51416858.666666664, "logps/chosen": -497.0677734375, "logps/rejected": -664.7928059895834, "loss": 0.3779, "rewards/chosen": 0.4245631217956543, "rewards/margins": 3.714791329701742, "rewards/rejected": -3.2902282079060874, "step": 14780 }, { "epoch": 0.7834521506373732, "grad_norm": 100.5, "kl": 1.9026374816894531, "learning_rate": 5e-07, "logits/chosen": -12514892.0, "logits/rejected": -17419746.0, "logps/chosen": -180.24337768554688, "logps/rejected": -704.3182983398438, "loss": 0.2265, "rewards/chosen": 0.9193839430809021, "rewards/margins": 6.655401170253754, "rewards/rejected": -5.736017227172852, "step": 14781 }, { "epoch": 0.7835051546391752, "grad_norm": 39.0, "kl": 3.753063201904297, "learning_rate": 5e-07, "logits/chosen": 4890650.0, "logits/rejected": -16094699.2, "logps/chosen": -143.56863403320312, "logps/rejected": -205.0135009765625, "loss": 0.2247, "rewards/chosen": 0.43270917733510333, "rewards/margins": 3.6653267463048302, "rewards/rejected": -3.2326175689697267, "step": 14782 }, { "epoch": 0.7835581586409774, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31331592.0, "logits/rejected": -27474020.0, "logps/chosen": -335.84332275390625, "logps/rejected": -215.08731079101562, "loss": 0.309, "rewards/chosen": 0.11558380722999573, "rewards/margins": 2.225342184305191, "rewards/rejected": -2.1097583770751953, "step": 14783 }, { "epoch": 0.7836111626427795, "grad_norm": 35.75, "kl": 3.4530715942382812, "learning_rate": 5e-07, "logits/chosen": -12759858.0, "logits/rejected": -4474136.0, "logps/chosen": -100.6053237915039, "logps/rejected": -200.21029663085938, "loss": 0.336, "rewards/chosen": 0.47830069065093994, "rewards/margins": 2.912077784538269, "rewards/rejected": -2.433777093887329, "step": 14784 }, { "epoch": 0.7836641666445817, "grad_norm": 53.25, "kl": 0.8921794891357422, "learning_rate": 5e-07, "logits/chosen": -53208985.6, "logits/rejected": 30071802.666666668, "logps/chosen": -327.2550048828125, "logps/rejected": -279.02602132161456, "loss": 0.3012, "rewards/chosen": 0.5506351470947266, "rewards/margins": 2.840488402048747, "rewards/rejected": -2.28985325495402, "step": 14785 }, { "epoch": 0.7837171706463838, "grad_norm": 44.0, "kl": 2.7858963012695312, "learning_rate": 5e-07, "logits/chosen": -22373174.0, "logits/rejected": -8715656.0, "logps/chosen": -195.40792846679688, "logps/rejected": -164.83238220214844, "loss": 0.3251, "rewards/chosen": 0.5179908275604248, "rewards/margins": 2.8170483112335205, "rewards/rejected": -2.2990574836730957, "step": 14786 }, { "epoch": 0.783770174648186, "grad_norm": 53.25, "kl": 2.632902145385742, "learning_rate": 5e-07, "logits/chosen": 21386738.0, "logits/rejected": -23599796.0, "logps/chosen": -220.26829528808594, "logps/rejected": -351.611328125, "loss": 0.2931, "rewards/chosen": 0.7323952913284302, "rewards/margins": 2.8068286180496216, "rewards/rejected": -2.0744333267211914, "step": 14787 }, { "epoch": 0.7838231786499881, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50682176.0, "logits/rejected": -28468752.0, "logps/chosen": -445.71142578125, "logps/rejected": -398.7191162109375, "loss": 0.1823, "rewards/chosen": 0.09174041450023651, "rewards/margins": 3.0403024007876716, "rewards/rejected": -2.948561986287435, "step": 14788 }, { "epoch": 0.7838761826517903, "grad_norm": 38.0, "kl": 0.23745155334472656, "learning_rate": 5e-07, "logits/chosen": 2011817.0, "logits/rejected": -21385296.0, "logps/chosen": -271.6134033203125, "logps/rejected": -283.334228515625, "loss": 0.168, "rewards/chosen": 0.6576614379882812, "rewards/margins": 3.32253360748291, "rewards/rejected": -2.664872169494629, "step": 14789 }, { "epoch": 0.7839291866535923, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36263504.0, "logits/rejected": -38767852.0, "logps/chosen": -696.0018920898438, "logps/rejected": -373.11767578125, "loss": 0.2065, "rewards/chosen": 1.5149269104003906, "rewards/margins": 3.5329995155334473, "rewards/rejected": -2.0180726051330566, "step": 14790 }, { "epoch": 0.7839821906553944, "grad_norm": 43.25, "kl": 0.18804550170898438, "learning_rate": 5e-07, "logits/chosen": -32170986.666666668, "logits/rejected": -25707491.2, "logps/chosen": -316.1363118489583, "logps/rejected": -346.627880859375, "loss": 0.2136, "rewards/chosen": 0.5010265906651815, "rewards/margins": 2.794487245877584, "rewards/rejected": -2.293460655212402, "step": 14791 }, { "epoch": 0.7840351946571966, "grad_norm": 37.25, "kl": 3.6095638275146484, "learning_rate": 5e-07, "logits/chosen": -40198156.8, "logits/rejected": -9580117.333333334, "logps/chosen": -300.181201171875, "logps/rejected": -173.1815185546875, "loss": 0.312, "rewards/chosen": 1.1506782531738282, "rewards/margins": 2.712236022949219, "rewards/rejected": -1.5615577697753906, "step": 14792 }, { "epoch": 0.7840881986589987, "grad_norm": 59.0, "kl": 1.1507749557495117, "learning_rate": 5e-07, "logits/chosen": 10118025.6, "logits/rejected": -58825045.333333336, "logps/chosen": -350.200439453125, "logps/rejected": -433.3685302734375, "loss": 0.3217, "rewards/chosen": 0.12923741340637207, "rewards/margins": 3.7107377847035727, "rewards/rejected": -3.5815003712972007, "step": 14793 }, { "epoch": 0.7841412026608009, "grad_norm": 56.5, "kl": 1.3857011795043945, "learning_rate": 5e-07, "logits/chosen": 15011845.0, "logits/rejected": -50476304.0, "logps/chosen": -257.51324462890625, "logps/rejected": -250.2026824951172, "loss": 0.3144, "rewards/chosen": 0.2646227777004242, "rewards/margins": 2.2070902287960052, "rewards/rejected": -1.942467451095581, "step": 14794 }, { "epoch": 0.784194206662603, "grad_norm": 55.5, "kl": 1.653594970703125, "learning_rate": 5e-07, "logits/chosen": -75644016.0, "logits/rejected": -24119020.8, "logps/chosen": -752.3155924479166, "logps/rejected": -161.628955078125, "loss": 0.1886, "rewards/chosen": 1.0497070948282878, "rewards/margins": 3.6412575403849283, "rewards/rejected": -2.5915504455566407, "step": 14795 }, { "epoch": 0.7842472106644052, "grad_norm": 67.0, "kl": 1.1953582763671875, "learning_rate": 5e-07, "logits/chosen": -23039938.285714287, "logits/rejected": 2678853.5, "logps/chosen": -360.61471121651783, "logps/rejected": -61.57282257080078, "loss": 0.3745, "rewards/chosen": 0.5552193096705845, "rewards/margins": 2.653202908379691, "rewards/rejected": -2.0979835987091064, "step": 14796 }, { "epoch": 0.7843002146662073, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5257387.333333333, "logits/rejected": -18833984.0, "logps/chosen": -134.31224568684897, "logps/rejected": -280.0752197265625, "loss": 0.2433, "rewards/chosen": 0.44921926657358807, "rewards/margins": 2.9485394875208537, "rewards/rejected": -2.499320220947266, "step": 14797 }, { "epoch": 0.7843532186680094, "grad_norm": 35.75, "kl": 0.3235788345336914, "learning_rate": 5e-07, "logits/chosen": -197707712.0, "logits/rejected": -72332876.8, "logps/chosen": -187.88409423828125, "logps/rejected": -525.854052734375, "loss": 0.2125, "rewards/chosen": 0.3914538621902466, "rewards/margins": 3.915250325202942, "rewards/rejected": -3.5237964630126952, "step": 14798 }, { "epoch": 0.7844062226698115, "grad_norm": 50.75, "kl": 0.7670936584472656, "learning_rate": 5e-07, "logits/chosen": -28897472.0, "logits/rejected": -49091132.0, "logps/chosen": -332.031494140625, "logps/rejected": -400.5198974609375, "loss": 0.2973, "rewards/chosen": 1.0104138851165771, "rewards/margins": 3.2730000019073486, "rewards/rejected": -2.2625861167907715, "step": 14799 }, { "epoch": 0.7844592266716137, "grad_norm": 40.5, "kl": 0.5433712005615234, "learning_rate": 5e-07, "logits/chosen": -33243662.0, "logits/rejected": -39486052.0, "logps/chosen": -266.0732727050781, "logps/rejected": -468.10150146484375, "loss": 0.2451, "rewards/chosen": 0.7556747794151306, "rewards/margins": 3.3151578307151794, "rewards/rejected": -2.559483051300049, "step": 14800 }, { "epoch": 0.7845122306734158, "grad_norm": 58.75, "kl": 5.583930969238281, "learning_rate": 5e-07, "logits/chosen": -25553621.333333332, "logits/rejected": -34711288.0, "logps/chosen": -560.2037353515625, "logps/rejected": -272.9266662597656, "loss": 0.3792, "rewards/chosen": 0.915395180384318, "rewards/margins": 3.7181147734324136, "rewards/rejected": -2.8027195930480957, "step": 14801 }, { "epoch": 0.784565234675218, "grad_norm": 63.5, "kl": 1.2931466102600098, "learning_rate": 5e-07, "logits/chosen": -38378506.666666664, "logits/rejected": 1127329.125, "logps/chosen": -322.33152262369794, "logps/rejected": -70.99845886230469, "loss": 0.4746, "rewards/chosen": 0.05725173155466715, "rewards/margins": 0.29817255834738415, "rewards/rejected": -0.24092082679271698, "step": 14802 }, { "epoch": 0.7846182386770201, "grad_norm": 49.75, "kl": 0.12987899780273438, "learning_rate": 5e-07, "logits/chosen": -5740837.0, "logits/rejected": -32817580.0, "logps/chosen": -715.904052734375, "logps/rejected": -389.65582275390625, "loss": 0.213, "rewards/chosen": 1.1403807401657104, "rewards/margins": 3.706762671470642, "rewards/rejected": -2.5663819313049316, "step": 14803 }, { "epoch": 0.7846712426788223, "grad_norm": 54.0, "kl": 2.581331253051758, "learning_rate": 5e-07, "logits/chosen": -18256741.333333332, "logits/rejected": -21969443.2, "logps/chosen": -293.7760416666667, "logps/rejected": -324.3875244140625, "loss": 0.3172, "rewards/chosen": 0.5029637018839518, "rewards/margins": 2.383693758646647, "rewards/rejected": -1.8807300567626952, "step": 14804 }, { "epoch": 0.7847242466806243, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50846544.0, "logits/rejected": -31307552.0, "logps/chosen": -466.39129638671875, "logps/rejected": -260.65525309244794, "loss": 0.2012, "rewards/chosen": 0.9650710821151733, "rewards/margins": 3.294777750968933, "rewards/rejected": -2.3297066688537598, "step": 14805 }, { "epoch": 0.7847772506824265, "grad_norm": 51.0, "kl": 0.8047237396240234, "learning_rate": 5e-07, "logits/chosen": -39972168.0, "logits/rejected": -13429219.0, "logps/chosen": -347.14556884765625, "logps/rejected": -338.0596923828125, "loss": 0.2788, "rewards/chosen": 0.2977554500102997, "rewards/margins": 3.476246565580368, "rewards/rejected": -3.1784911155700684, "step": 14806 }, { "epoch": 0.7848302546842286, "grad_norm": 39.25, "kl": 1.4630703926086426, "learning_rate": 5e-07, "logits/chosen": -36202392.0, "logits/rejected": 2170579.5, "logps/chosen": -284.59149169921875, "logps/rejected": -282.0175476074219, "loss": 0.2246, "rewards/chosen": 1.065621018409729, "rewards/margins": 3.007287859916687, "rewards/rejected": -1.941666841506958, "step": 14807 }, { "epoch": 0.7848832586860308, "grad_norm": 44.0, "kl": 4.948372840881348, "learning_rate": 5e-07, "logits/chosen": -34925472.0, "logits/rejected": -58467656.0, "logps/chosen": -447.82470703125, "logps/rejected": -505.3894958496094, "loss": 0.4093, "rewards/chosen": 1.041663578578404, "rewards/margins": 3.9756082807268416, "rewards/rejected": -2.9339447021484375, "step": 14808 }, { "epoch": 0.7849362626878329, "grad_norm": 35.25, "kl": 0.313690185546875, "learning_rate": 5e-07, "logits/chosen": -28415022.0, "logits/rejected": -33717093.333333336, "logps/chosen": -178.9408416748047, "logps/rejected": -282.9081217447917, "loss": 0.1882, "rewards/chosen": 0.2012806087732315, "rewards/margins": 2.9300006379683814, "rewards/rejected": -2.72872002919515, "step": 14809 }, { "epoch": 0.7849892666896351, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -15158562.0, "logps/rejected": -235.987060546875, "loss": 0.1384, "rewards/rejected": -2.2721469402313232, "step": 14810 }, { "epoch": 0.7850422706914372, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9034315.0, "logits/rejected": -12087292.0, "logps/chosen": -38.455406188964844, "logps/rejected": -165.8019816080729, "loss": 0.2359, "rewards/chosen": -0.17804202437400818, "rewards/margins": 2.8057248493035636, "rewards/rejected": -2.9837668736775718, "step": 14811 }, { "epoch": 0.7850952746932394, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9257852.666666666, "logits/rejected": -17121280.0, "logps/chosen": -211.65983072916666, "logps/rejected": -349.410791015625, "loss": 0.238, "rewards/chosen": 0.22818297147750854, "rewards/margins": 2.911018741130829, "rewards/rejected": -2.6828357696533205, "step": 14812 }, { "epoch": 0.7851482786950414, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17412700.0, "logits/rejected": -26841336.0, "logps/chosen": -148.5391642252604, "logps/rejected": -390.00048828125, "loss": 0.2075, "rewards/chosen": 0.0237645482023557, "rewards/margins": 3.720226035018762, "rewards/rejected": -3.696461486816406, "step": 14813 }, { "epoch": 0.7852012826968436, "grad_norm": 56.0, "kl": 3.2810707092285156, "learning_rate": 5e-07, "logits/chosen": -24387938.0, "logits/rejected": -24419022.0, "logps/chosen": -446.5517272949219, "logps/rejected": -363.8636779785156, "loss": 0.2663, "rewards/chosen": 0.6886677145957947, "rewards/margins": 4.36819714307785, "rewards/rejected": -3.6795294284820557, "step": 14814 }, { "epoch": 0.7852542866986457, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25520056.0, "logits/rejected": -26578800.0, "logps/chosen": -113.2042236328125, "logps/rejected": -702.509033203125, "loss": 0.4126, "rewards/chosen": -0.17457761764526367, "rewards/margins": 4.742624378204345, "rewards/rejected": -4.917201995849609, "step": 14815 }, { "epoch": 0.7853072907004479, "grad_norm": 47.0, "kl": 2.7838058471679688, "learning_rate": 5e-07, "logits/chosen": -10898824.0, "logits/rejected": -36251189.333333336, "logps/chosen": -139.3414794921875, "logps/rejected": -349.0929361979167, "loss": 0.4022, "rewards/chosen": -0.11751606464385986, "rewards/margins": 2.9603485822677613, "rewards/rejected": -3.077864646911621, "step": 14816 }, { "epoch": 0.78536029470225, "grad_norm": 51.25, "kl": 0.8979148864746094, "learning_rate": 5e-07, "logits/chosen": 2442905.0, "logits/rejected": -33042963.2, "logps/chosen": -56.575358072916664, "logps/rejected": -177.3265380859375, "loss": 0.307, "rewards/chosen": 0.008703045547008514, "rewards/margins": 1.6505674406886102, "rewards/rejected": -1.6418643951416017, "step": 14817 }, { "epoch": 0.7854132987040522, "grad_norm": 38.5, "kl": 2.065887451171875, "learning_rate": 5e-07, "logits/chosen": -50136368.0, "logits/rejected": 10378800.0, "logps/chosen": -245.3080078125, "logps/rejected": -449.6731770833333, "loss": 0.2621, "rewards/chosen": 0.8176115036010743, "rewards/margins": 3.873355038960775, "rewards/rejected": -3.0557435353597007, "step": 14818 }, { "epoch": 0.7854663027058543, "grad_norm": 49.0, "kl": 1.5665016174316406, "learning_rate": 5e-07, "logits/chosen": -31748010.666666668, "logits/rejected": -45703331.2, "logps/chosen": -320.9520263671875, "logps/rejected": -409.198876953125, "loss": 0.2001, "rewards/chosen": 1.4193851153055828, "rewards/margins": 3.7672202746073404, "rewards/rejected": -2.347835159301758, "step": 14819 }, { "epoch": 0.7855193067076565, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73008474.66666667, "logits/rejected": -36135872.0, "logps/chosen": -356.9273274739583, "logps/rejected": -333.2287109375, "loss": 0.1891, "rewards/chosen": 0.6473429600397745, "rewards/margins": 3.393415919939677, "rewards/rejected": -2.7460729598999025, "step": 14820 }, { "epoch": 0.7855723107094585, "grad_norm": 27.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17482018.666666668, "logits/rejected": -13044436.0, "logps/chosen": -292.84230550130206, "logps/rejected": -150.80858154296874, "loss": 0.1115, "rewards/chosen": 1.5594994227091472, "rewards/margins": 4.515709559122722, "rewards/rejected": -2.956210136413574, "step": 14821 }, { "epoch": 0.7856253147112607, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37443289.6, "logits/rejected": -4816094.0, "logps/chosen": -172.8033447265625, "logps/rejected": -97.97756958007812, "loss": 0.3379, "rewards/chosen": 0.061428380012512204, "rewards/margins": 2.763747843106588, "rewards/rejected": -2.7023194630940757, "step": 14822 }, { "epoch": 0.7856783187130628, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3783868.5, "logits/rejected": -19037772.0, "logps/chosen": -424.75775146484375, "logps/rejected": -304.70762125651044, "loss": 0.2656, "rewards/chosen": 0.7199485898017883, "rewards/margins": 2.0481125315030413, "rewards/rejected": -1.3281639417012532, "step": 14823 }, { "epoch": 0.785731322714865, "grad_norm": 45.75, "kl": 1.802816390991211, "learning_rate": 5e-07, "logits/chosen": -16225710.666666666, "logits/rejected": -39926728.0, "logps/chosen": -189.79374186197916, "logps/rejected": -647.72607421875, "loss": 0.4206, "rewards/chosen": 0.012721240520477295, "rewards/margins": 4.434613406658173, "rewards/rejected": -4.421892166137695, "step": 14824 }, { "epoch": 0.7857843267166671, "grad_norm": 43.5, "kl": 2.4463181495666504, "learning_rate": 5e-07, "logits/chosen": -30290778.666666668, "logits/rejected": -40200848.0, "logps/chosen": -407.3665771484375, "logps/rejected": -464.1134948730469, "loss": 0.3159, "rewards/chosen": 1.007983684539795, "rewards/margins": 5.9058003425598145, "rewards/rejected": -4.8978166580200195, "step": 14825 }, { "epoch": 0.7858373307184693, "grad_norm": 52.25, "kl": 2.3919448852539062, "learning_rate": 5e-07, "logits/chosen": -29375048.0, "logits/rejected": -38208068.0, "logps/chosen": -791.8846435546875, "logps/rejected": -258.171875, "loss": 0.2064, "rewards/chosen": 1.2001686096191406, "rewards/margins": 4.365583419799805, "rewards/rejected": -3.165414810180664, "step": 14826 }, { "epoch": 0.7858903347202714, "grad_norm": 46.0, "kl": 1.4698495864868164, "learning_rate": 5e-07, "logits/chosen": -10385056.8, "logits/rejected": -50181877.333333336, "logps/chosen": -218.669287109375, "logps/rejected": -228.6524861653646, "loss": 0.2861, "rewards/chosen": 1.1263490676879884, "rewards/margins": 2.3992936770121256, "rewards/rejected": -1.2729446093241374, "step": 14827 }, { "epoch": 0.7859433387220736, "grad_norm": 37.75, "kl": 1.0248794555664062, "learning_rate": 5e-07, "logits/chosen": 4365063.333333333, "logits/rejected": -13761268.8, "logps/chosen": -144.81697591145834, "logps/rejected": -247.2162841796875, "loss": 0.2517, "rewards/chosen": 0.6817220052083334, "rewards/margins": 2.937676175435384, "rewards/rejected": -2.2559541702270507, "step": 14828 }, { "epoch": 0.7859963427238756, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40632140.0, "logits/rejected": -34402696.0, "logps/chosen": -1003.802001953125, "logps/rejected": -565.5518798828125, "loss": 0.1714, "rewards/chosen": 1.1343094110488892, "rewards/margins": 5.884254097938538, "rewards/rejected": -4.749944686889648, "step": 14829 }, { "epoch": 0.7860493467256778, "grad_norm": 47.75, "kl": 3.5148353576660156, "learning_rate": 5e-07, "logits/chosen": -31936400.0, "logits/rejected": -18909150.0, "logps/chosen": -300.627197265625, "logps/rejected": -219.25331115722656, "loss": 0.3134, "rewards/chosen": 0.8413544495900472, "rewards/margins": 3.811777671178182, "rewards/rejected": -2.9704232215881348, "step": 14830 }, { "epoch": 0.7861023507274799, "grad_norm": 48.75, "kl": 1.5570087432861328, "learning_rate": 5e-07, "logits/chosen": -7592097.333333333, "logits/rejected": -10772007.2, "logps/chosen": -144.52166748046875, "logps/rejected": -221.014111328125, "loss": 0.326, "rewards/chosen": 0.2774081031481425, "rewards/margins": 2.007917288939158, "rewards/rejected": -1.7305091857910155, "step": 14831 }, { "epoch": 0.7861553547292821, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21142352.0, "logits/rejected": -24273804.8, "logps/chosen": -462.821533203125, "logps/rejected": -288.116064453125, "loss": 0.2033, "rewards/chosen": 0.7828929424285889, "rewards/margins": 3.196076536178589, "rewards/rejected": -2.41318359375, "step": 14832 }, { "epoch": 0.7862083587310842, "grad_norm": 51.5, "kl": 0.10985183715820312, "learning_rate": 5e-07, "logits/chosen": 6554446.0, "logits/rejected": -27605424.0, "logps/chosen": -89.96553548177083, "logps/rejected": -284.74375, "loss": 0.3164, "rewards/chosen": -0.24702219168345133, "rewards/margins": 1.4923595984776814, "rewards/rejected": -1.7393817901611328, "step": 14833 }, { "epoch": 0.7862613627328864, "grad_norm": 61.75, "kl": 3.0546321868896484, "learning_rate": 5e-07, "logits/chosen": -52173586.28571428, "logits/rejected": -2746022.5, "logps/chosen": -215.92560686383928, "logps/rejected": -120.33903503417969, "loss": 0.4559, "rewards/chosen": 0.30632168906075613, "rewards/margins": 3.471837588718959, "rewards/rejected": -3.165515899658203, "step": 14834 }, { "epoch": 0.7863143667346885, "grad_norm": 44.75, "kl": 2.359569549560547, "learning_rate": 5e-07, "logits/chosen": -10330268.0, "logits/rejected": -18337104.0, "logps/chosen": -184.1709228515625, "logps/rejected": -255.8162841796875, "loss": 0.2279, "rewards/chosen": 0.8762234687805176, "rewards/margins": 4.25871680577596, "rewards/rejected": -3.382493336995443, "step": 14835 }, { "epoch": 0.7863673707364907, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": 3483606.5, "logps/rejected": -267.6471252441406, "loss": 0.1632, "rewards/rejected": -2.4078116416931152, "step": 14836 }, { "epoch": 0.7864203747382927, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53492297.6, "logits/rejected": -125162250.66666667, "logps/chosen": -408.1860107421875, "logps/rejected": -356.7578938802083, "loss": 0.3128, "rewards/chosen": 0.17100249528884887, "rewards/margins": 2.9363951166470845, "rewards/rejected": -2.765392621358236, "step": 14837 }, { "epoch": 0.7864733787400949, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46555093.333333336, "logits/rejected": 3795450.75, "logps/chosen": -366.9805501302083, "logps/rejected": -61.877899169921875, "loss": 0.3521, "rewards/chosen": 0.22336788972218832, "rewards/margins": 3.41472597916921, "rewards/rejected": -3.1913580894470215, "step": 14838 }, { "epoch": 0.786526382741897, "grad_norm": 113.5, "kl": 4.938560485839844, "learning_rate": 5e-07, "logits/chosen": -42916864.0, "logits/rejected": -25256088.0, "logps/chosen": -387.117431640625, "logps/rejected": -285.8701171875, "loss": 0.3709, "rewards/chosen": 0.36992707252502444, "rewards/margins": 3.0573501427968344, "rewards/rejected": -2.68742307027181, "step": 14839 }, { "epoch": 0.7865793867436991, "grad_norm": 40.25, "kl": 6.215787887573242, "learning_rate": 5e-07, "logits/chosen": -20611562.0, "logits/rejected": -17551364.0, "logps/chosen": -498.3953857421875, "logps/rejected": -241.79476928710938, "loss": 0.2281, "rewards/chosen": 1.9430420398712158, "rewards/margins": 3.7399133443832397, "rewards/rejected": -1.796871304512024, "step": 14840 }, { "epoch": 0.7866323907455013, "grad_norm": 55.0, "kl": 1.321131706237793, "learning_rate": 5e-07, "logits/chosen": 13975823.0, "logits/rejected": -9522793.0, "logps/chosen": -644.2303466796875, "logps/rejected": -647.2717895507812, "loss": 0.2224, "rewards/chosen": 0.9445633292198181, "rewards/margins": 4.22088235616684, "rewards/rejected": -3.2763190269470215, "step": 14841 }, { "epoch": 0.7866853947473034, "grad_norm": 46.0, "kl": 1.8559722900390625, "learning_rate": 5e-07, "logits/chosen": -7492136.0, "logits/rejected": -46867786.666666664, "logps/chosen": -188.9981201171875, "logps/rejected": -272.5068359375, "loss": 0.3486, "rewards/chosen": 0.5638621330261231, "rewards/margins": 2.6605672518412273, "rewards/rejected": -2.096705118815104, "step": 14842 }, { "epoch": 0.7867383987491056, "grad_norm": 53.0, "kl": 3.781982421875, "learning_rate": 5e-07, "logits/chosen": -46819642.666666664, "logits/rejected": -13192121.0, "logps/chosen": -455.2317301432292, "logps/rejected": -168.43453979492188, "loss": 0.407, "rewards/chosen": 0.5144937833150228, "rewards/margins": 2.0938928922017417, "rewards/rejected": -1.5793991088867188, "step": 14843 }, { "epoch": 0.7867914027509076, "grad_norm": 56.5, "kl": 0.8798580169677734, "learning_rate": 5e-07, "logits/chosen": -5489497.6, "logits/rejected": 6988198.0, "logps/chosen": -249.207373046875, "logps/rejected": -249.30120849609375, "loss": 0.294, "rewards/chosen": 0.36715562343597413, "rewards/margins": 3.9692559957504274, "rewards/rejected": -3.602100372314453, "step": 14844 }, { "epoch": 0.7868444067527098, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1639626.5, "logits/rejected": -32618200.0, "logps/chosen": -122.05680847167969, "logps/rejected": -442.9062906901042, "loss": 0.1717, "rewards/chosen": 0.4378192126750946, "rewards/margins": 3.6028681298096976, "rewards/rejected": -3.165048917134603, "step": 14845 }, { "epoch": 0.7868974107545119, "grad_norm": 42.0, "kl": 3.271923542022705, "learning_rate": 5e-07, "logits/chosen": -15422318.0, "logits/rejected": -64962968.0, "logps/chosen": -176.5099639892578, "logps/rejected": -394.66046142578125, "loss": 0.2675, "rewards/chosen": 0.7487870454788208, "rewards/margins": 4.018328070640564, "rewards/rejected": -3.269541025161743, "step": 14846 }, { "epoch": 0.7869504147563141, "grad_norm": 39.0, "kl": 2.4450912475585938, "learning_rate": 5e-07, "logits/chosen": 14581860.0, "logits/rejected": -44740564.0, "logps/chosen": -271.449462890625, "logps/rejected": -478.66326904296875, "loss": 0.226, "rewards/chosen": 1.1832168102264404, "rewards/margins": 3.8652241230010986, "rewards/rejected": -2.682007312774658, "step": 14847 }, { "epoch": 0.7870034187581162, "grad_norm": 56.25, "kl": 2.9076576232910156, "learning_rate": 5e-07, "logits/chosen": -4793855.0, "logits/rejected": -4677280.5, "logps/chosen": -258.0760498046875, "logps/rejected": -77.11902618408203, "loss": 0.3353, "rewards/chosen": 0.7557647824287415, "rewards/margins": 2.300106465816498, "rewards/rejected": -1.5443416833877563, "step": 14848 }, { "epoch": 0.7870564227599184, "grad_norm": 61.75, "kl": 1.3903045654296875, "learning_rate": 5e-07, "logits/chosen": -43083436.0, "logits/rejected": 3483236.5, "logps/chosen": -239.40184020996094, "logps/rejected": -346.54583740234375, "loss": 0.3053, "rewards/chosen": 0.555767297744751, "rewards/margins": 2.5435900688171387, "rewards/rejected": -1.9878227710723877, "step": 14849 }, { "epoch": 0.7871094267617205, "grad_norm": 44.25, "kl": 0.9854354858398438, "learning_rate": 5e-07, "logits/chosen": -18210801.6, "logits/rejected": -75886480.0, "logps/chosen": -234.3229248046875, "logps/rejected": -417.7115885416667, "loss": 0.2742, "rewards/chosen": 0.9469945907592774, "rewards/margins": 2.953149763743083, "rewards/rejected": -2.006155172983805, "step": 14850 }, { "epoch": 0.7871624307635227, "grad_norm": 49.5, "kl": 1.600301742553711, "learning_rate": 5e-07, "logits/chosen": -15996971.0, "logits/rejected": -6540148.5, "logps/chosen": -214.66969299316406, "logps/rejected": -130.83453369140625, "loss": 0.3863, "rewards/chosen": 0.07537674903869629, "rewards/margins": 2.4740240573883057, "rewards/rejected": -2.3986473083496094, "step": 14851 }, { "epoch": 0.7872154347653247, "grad_norm": 61.25, "kl": 0.3336334228515625, "learning_rate": 5e-07, "logits/chosen": -6166720.0, "logits/rejected": -7285438.0, "logps/chosen": -432.91796875, "logps/rejected": -223.0894775390625, "loss": 0.2788, "rewards/chosen": 0.7042596340179443, "rewards/margins": 2.7770087718963623, "rewards/rejected": -2.072749137878418, "step": 14852 }, { "epoch": 0.7872684387671269, "grad_norm": 49.5, "kl": 3.0309085845947266, "learning_rate": 5e-07, "logits/chosen": -33535824.0, "logits/rejected": -17794318.0, "logps/chosen": -201.6107381184896, "logps/rejected": -232.16720581054688, "loss": 0.3064, "rewards/chosen": 0.7002358436584473, "rewards/margins": 3.8714141845703125, "rewards/rejected": -3.1711783409118652, "step": 14853 }, { "epoch": 0.787321442768929, "grad_norm": 84.5, "kl": 0.9483051300048828, "learning_rate": 5e-07, "logits/chosen": 1676486.0, "logits/rejected": -13262146.4, "logps/chosen": -40.232879638671875, "logps/rejected": -239.8961181640625, "loss": 0.3167, "rewards/chosen": -0.03268000235160192, "rewards/margins": 1.6928803766767184, "rewards/rejected": -1.7255603790283203, "step": 14854 }, { "epoch": 0.7873744467707312, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67738976.0, "logits/rejected": -35952580.571428575, "logps/chosen": -433.7679443359375, "logps/rejected": -306.9775390625, "loss": 0.1839, "rewards/chosen": 2.1780030727386475, "rewards/margins": 3.6845153059278215, "rewards/rejected": -1.506512233189174, "step": 14855 }, { "epoch": 0.7874274507725333, "grad_norm": 39.5, "kl": 1.7410707473754883, "learning_rate": 5e-07, "logits/chosen": -16922056.0, "logits/rejected": -44096504.0, "logps/chosen": -363.3161315917969, "logps/rejected": -271.9622497558594, "loss": 0.2171, "rewards/chosen": 1.0253527164459229, "rewards/margins": 3.33437180519104, "rewards/rejected": -2.309019088745117, "step": 14856 }, { "epoch": 0.7874804547743355, "grad_norm": 115.5, "kl": 0.9943904876708984, "learning_rate": 5e-07, "logits/chosen": -61697.5, "logits/rejected": -8214965.333333333, "logps/chosen": -44.47859191894531, "logps/rejected": -204.4647013346354, "loss": 0.2874, "rewards/chosen": 0.7450618743896484, "rewards/margins": 2.059581597646077, "rewards/rejected": -1.314519723256429, "step": 14857 }, { "epoch": 0.7875334587761376, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3181275.5, "logits/rejected": -43835816.0, "logps/chosen": -93.05481719970703, "logps/rejected": -487.79736328125, "loss": 0.2386, "rewards/chosen": 0.6152259707450867, "rewards/margins": 3.2233261466026306, "rewards/rejected": -2.608100175857544, "step": 14858 }, { "epoch": 0.7875864627779398, "grad_norm": 34.5, "kl": 2.490598678588867, "learning_rate": 5e-07, "logits/chosen": 1237041.6666666667, "logits/rejected": -25178128.0, "logps/chosen": -29.605026245117188, "logps/rejected": -147.1385498046875, "loss": 0.3219, "rewards/chosen": 0.16799044609069824, "rewards/margins": 2.268558073043823, "rewards/rejected": -2.100567626953125, "step": 14859 }, { "epoch": 0.7876394667797418, "grad_norm": 38.25, "kl": 0.4455299377441406, "learning_rate": 5e-07, "logits/chosen": -3061626.0, "logits/rejected": -8709850.666666666, "logps/chosen": -135.13648986816406, "logps/rejected": -280.175048828125, "loss": 0.2045, "rewards/chosen": 0.06631536781787872, "rewards/margins": 2.750094766418139, "rewards/rejected": -2.6837793986002603, "step": 14860 }, { "epoch": 0.787692470781544, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50575028.0, "logits/rejected": -8734444.0, "logps/chosen": -323.1153869628906, "logps/rejected": -175.50296020507812, "loss": 0.275, "rewards/chosen": 0.4164300858974457, "rewards/margins": 2.6760304868221283, "rewards/rejected": -2.2596004009246826, "step": 14861 }, { "epoch": 0.7877454747833461, "grad_norm": 49.25, "kl": 1.8470649719238281, "learning_rate": 5e-07, "logits/chosen": -30890136.0, "logits/rejected": -27068098.0, "logps/chosen": -314.1751708984375, "logps/rejected": -79.54891204833984, "loss": 0.3208, "rewards/chosen": 0.9806058406829834, "rewards/margins": 2.658169984817505, "rewards/rejected": -1.6775641441345215, "step": 14862 }, { "epoch": 0.7877984787851483, "grad_norm": 48.5, "kl": 2.0654144287109375, "learning_rate": 5e-07, "logits/chosen": -12244738.666666666, "logits/rejected": -66282888.0, "logps/chosen": -387.0678304036458, "logps/rejected": -94.7441635131836, "loss": 0.2752, "rewards/chosen": 1.0972239176432292, "rewards/margins": 3.8688584963480634, "rewards/rejected": -2.771634578704834, "step": 14863 }, { "epoch": 0.7878514827869504, "grad_norm": 54.5, "kl": 3.6918697357177734, "learning_rate": 5e-07, "logits/chosen": -339037.7, "logits/rejected": -6915739.333333333, "logps/chosen": -38.27001953125, "logps/rejected": -494.4717610677083, "loss": 0.3312, "rewards/chosen": 0.4715579986572266, "rewards/margins": 3.557114346822103, "rewards/rejected": -3.0855563481648765, "step": 14864 }, { "epoch": 0.7879044867887526, "grad_norm": 71.5, "kl": 0.4805450439453125, "learning_rate": 5e-07, "logits/chosen": -146501.5, "logits/rejected": -11291054.0, "logps/chosen": -604.839599609375, "logps/rejected": -190.88613891601562, "loss": 0.2922, "rewards/chosen": 0.6626579761505127, "rewards/margins": 2.294809341430664, "rewards/rejected": -1.6321513652801514, "step": 14865 }, { "epoch": 0.7879574907905547, "grad_norm": 52.75, "kl": 2.692234992980957, "learning_rate": 5e-07, "logits/chosen": 8125899.333333333, "logits/rejected": -39983604.0, "logps/chosen": -232.3425089518229, "logps/rejected": -638.8175048828125, "loss": 0.3949, "rewards/chosen": 0.32229578495025635, "rewards/margins": 4.157232403755188, "rewards/rejected": -3.8349366188049316, "step": 14866 }, { "epoch": 0.7880104947923569, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7903531.5, "logits/rejected": -28674880.0, "logps/chosen": -222.27037048339844, "logps/rejected": -345.9559733072917, "loss": 0.1929, "rewards/chosen": 0.46046602725982666, "rewards/margins": 2.578877409299215, "rewards/rejected": -2.118411382039388, "step": 14867 }, { "epoch": 0.7880634987941589, "grad_norm": 52.25, "kl": 1.30413818359375, "learning_rate": 5e-07, "logits/chosen": -21795485.333333332, "logits/rejected": -68657744.0, "logps/chosen": -458.4655354817708, "logps/rejected": -522.8612060546875, "loss": 0.319, "rewards/chosen": 0.7223339875539144, "rewards/margins": 3.6250606377919516, "rewards/rejected": -2.902726650238037, "step": 14868 }, { "epoch": 0.7881165027959611, "grad_norm": 36.0, "kl": 0.8283252716064453, "learning_rate": 5e-07, "logits/chosen": -28096485.333333332, "logits/rejected": -11075070.4, "logps/chosen": -251.844482421875, "logps/rejected": -259.742529296875, "loss": 0.1916, "rewards/chosen": 1.0081018606821697, "rewards/margins": 2.9807640234629313, "rewards/rejected": -1.9726621627807617, "step": 14869 }, { "epoch": 0.7881695067977632, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4906770.0, "logits/rejected": -39072768.0, "logps/chosen": -313.54541015625, "logps/rejected": -132.87208048502603, "loss": 0.3396, "rewards/chosen": 0.11352448463439942, "rewards/margins": 2.285807339350382, "rewards/rejected": -2.172282854715983, "step": 14870 }, { "epoch": 0.7882225107995654, "grad_norm": 50.5, "kl": 3.3505678176879883, "learning_rate": 5e-07, "logits/chosen": 3177994.0, "logits/rejected": 59470568.0, "logps/chosen": -65.69105529785156, "logps/rejected": -266.92633056640625, "loss": 0.3283, "rewards/chosen": 0.8564269542694092, "rewards/margins": 2.0803312063217163, "rewards/rejected": -1.2239042520523071, "step": 14871 }, { "epoch": 0.7882755148013675, "grad_norm": 53.75, "kl": 2.588937759399414, "learning_rate": 5e-07, "logits/chosen": -12940870.4, "logits/rejected": -51242645.333333336, "logps/chosen": -415.875341796875, "logps/rejected": -497.317626953125, "loss": 0.2262, "rewards/chosen": 1.2279985427856446, "rewards/margins": 3.7782821337382, "rewards/rejected": -2.550283590952555, "step": 14872 }, { "epoch": 0.7883285188031697, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29656268.8, "logits/rejected": -29526018.666666668, "logps/chosen": -434.979296875, "logps/rejected": -154.35688273111978, "loss": 0.2469, "rewards/chosen": 0.8664431571960449, "rewards/margins": 3.7449399630228677, "rewards/rejected": -2.8784968058268228, "step": 14873 }, { "epoch": 0.7883815228049718, "grad_norm": 56.25, "kl": 1.3268280029296875, "learning_rate": 5e-07, "logits/chosen": -18646504.0, "logits/rejected": -9991880.0, "logps/chosen": -240.37538364955358, "logps/rejected": -318.565185546875, "loss": 0.5028, "rewards/chosen": 0.023793759090559825, "rewards/margins": 0.9076988122292927, "rewards/rejected": -0.8839050531387329, "step": 14874 }, { "epoch": 0.788434526806774, "grad_norm": 37.75, "kl": 2.7227563858032227, "learning_rate": 5e-07, "logits/chosen": -1772662.5, "logits/rejected": 13954661.0, "logps/chosen": -174.91983032226562, "logps/rejected": -232.10032653808594, "loss": 0.3319, "rewards/chosen": -0.02224603295326233, "rewards/margins": 2.7660301625728607, "rewards/rejected": -2.788276195526123, "step": 14875 }, { "epoch": 0.788487530808576, "grad_norm": 39.5, "kl": 2.4357614517211914, "learning_rate": 5e-07, "logits/chosen": 2680923.25, "logits/rejected": 34499680.0, "logps/chosen": -41.18134307861328, "logps/rejected": -490.5229085286458, "loss": 0.2428, "rewards/chosen": -0.038245584815740585, "rewards/margins": 2.8949466682970524, "rewards/rejected": -2.933192253112793, "step": 14876 }, { "epoch": 0.7885405348103782, "grad_norm": 49.25, "kl": 2.8069705963134766, "learning_rate": 5e-07, "logits/chosen": 52648746.666666664, "logits/rejected": -38792468.0, "logps/chosen": -330.922119140625, "logps/rejected": -398.4530944824219, "loss": 0.3074, "rewards/chosen": 0.9247599442799886, "rewards/margins": 5.26819904645284, "rewards/rejected": -4.343439102172852, "step": 14877 }, { "epoch": 0.7885935388121803, "grad_norm": 37.5, "kl": 0.01971435546875, "learning_rate": 5e-07, "logits/chosen": -607258.6875, "logits/rejected": 58773096.0, "logps/chosen": -245.3363037109375, "logps/rejected": -447.3358154296875, "loss": 0.1994, "rewards/chosen": 1.6788091659545898, "rewards/margins": 3.356275796890259, "rewards/rejected": -1.677466630935669, "step": 14878 }, { "epoch": 0.7886465428139825, "grad_norm": 47.5, "kl": 1.27838134765625, "learning_rate": 5e-07, "logits/chosen": -39852499.2, "logits/rejected": 5958408.0, "logps/chosen": -279.83427734375, "logps/rejected": -151.77701822916666, "loss": 0.3214, "rewards/chosen": 0.5883748054504394, "rewards/margins": 3.2378290812174475, "rewards/rejected": -2.6494542757670083, "step": 14879 }, { "epoch": 0.7886995468157846, "grad_norm": 57.25, "kl": 2.192294120788574, "learning_rate": 5e-07, "logits/chosen": -29533118.0, "logits/rejected": -47563564.0, "logps/chosen": -295.408447265625, "logps/rejected": -496.5552978515625, "loss": 0.3169, "rewards/chosen": 0.10409761220216751, "rewards/margins": 2.842448003590107, "rewards/rejected": -2.7383503913879395, "step": 14880 }, { "epoch": 0.7887525508175868, "grad_norm": 60.25, "kl": 1.8295269012451172, "learning_rate": 5e-07, "logits/chosen": -63789888.0, "logits/rejected": -27439573.333333332, "logps/chosen": -438.5427734375, "logps/rejected": -266.81146240234375, "loss": 0.3319, "rewards/chosen": 0.5603078842163086, "rewards/margins": 2.3607585271199545, "rewards/rejected": -1.8004506429036458, "step": 14881 }, { "epoch": 0.7888055548193889, "grad_norm": 54.75, "kl": 0.1972064971923828, "learning_rate": 5e-07, "logits/chosen": -6225861.0, "logits/rejected": -58174528.0, "logps/chosen": -395.7437744140625, "logps/rejected": -392.88983154296875, "loss": 0.3038, "rewards/chosen": 0.28910571336746216, "rewards/margins": 2.1670575737953186, "rewards/rejected": -1.8779518604278564, "step": 14882 }, { "epoch": 0.788858558821191, "grad_norm": 36.5, "kl": 0.20950698852539062, "learning_rate": 5e-07, "logits/chosen": -6824183.333333333, "logits/rejected": -37034809.6, "logps/chosen": -791.0789388020834, "logps/rejected": -277.9173828125, "loss": 0.1258, "rewards/chosen": 3.0547701517740884, "rewards/margins": 5.13688112894694, "rewards/rejected": -2.0821109771728517, "step": 14883 }, { "epoch": 0.7889115628229931, "grad_norm": 79.0, "kl": 1.4045734405517578, "learning_rate": 5e-07, "logits/chosen": -64892760.0, "logits/rejected": 6281282.666666667, "logps/chosen": -427.93621826171875, "logps/rejected": -341.3994140625, "loss": 0.2176, "rewards/chosen": 0.7243202328681946, "rewards/margins": 2.6605237126350403, "rewards/rejected": -1.9362034797668457, "step": 14884 }, { "epoch": 0.7889645668247953, "grad_norm": 36.0, "kl": 0.5988373756408691, "learning_rate": 5e-07, "logits/chosen": -9820684.0, "logits/rejected": -26526256.0, "logps/chosen": -115.8677978515625, "logps/rejected": -338.5267333984375, "loss": 0.2756, "rewards/chosen": 0.5945021629333496, "rewards/margins": 3.8041334470113117, "rewards/rejected": -3.2096312840779624, "step": 14885 }, { "epoch": 0.7890175708265974, "grad_norm": 43.75, "kl": 0.9747505187988281, "learning_rate": 5e-07, "logits/chosen": -45691584.0, "logits/rejected": -29704742.4, "logps/chosen": -337.5970458984375, "logps/rejected": -195.81759033203124, "loss": 0.2262, "rewards/chosen": 1.0235992272694905, "rewards/margins": 2.748984130223592, "rewards/rejected": -1.7253849029541015, "step": 14886 }, { "epoch": 0.7890705748283996, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12911623.0, "logits/rejected": -18871869.333333332, "logps/chosen": -260.09112548828125, "logps/rejected": -416.4160970052083, "loss": 0.1583, "rewards/chosen": 0.3467613458633423, "rewards/margins": 3.7649484872817993, "rewards/rejected": -3.418187141418457, "step": 14887 }, { "epoch": 0.7891235788302017, "grad_norm": 42.75, "kl": 3.5882186889648438, "learning_rate": 5e-07, "logits/chosen": -39514244.0, "logits/rejected": -18863592.0, "logps/chosen": -148.7696533203125, "logps/rejected": -372.7880554199219, "loss": 0.2107, "rewards/chosen": 1.0971360206604004, "rewards/margins": 3.7069451808929443, "rewards/rejected": -2.609809160232544, "step": 14888 }, { "epoch": 0.7891765828320039, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12004092.0, "logits/rejected": -50475112.0, "logps/chosen": -270.8085021972656, "logps/rejected": -483.5970458984375, "loss": 0.2389, "rewards/chosen": 0.49976199865341187, "rewards/margins": 3.9381174445152283, "rewards/rejected": -3.4383554458618164, "step": 14889 }, { "epoch": 0.789229586833806, "grad_norm": 31.625, "kl": 2.3173255920410156, "learning_rate": 5e-07, "logits/chosen": -8618134.0, "logits/rejected": -47111164.0, "logps/chosen": -157.0914764404297, "logps/rejected": -381.2633056640625, "loss": 0.2073, "rewards/chosen": 0.8803790807723999, "rewards/margins": 4.1017807722091675, "rewards/rejected": -3.2214016914367676, "step": 14890 }, { "epoch": 0.789282590835608, "grad_norm": 51.75, "kl": 2.486988067626953, "learning_rate": 5e-07, "logits/chosen": -19228364.0, "logits/rejected": -37716992.0, "logps/chosen": -575.865966796875, "logps/rejected": -449.98162841796875, "loss": 0.2265, "rewards/chosen": 0.7795934677124023, "rewards/margins": 3.550015926361084, "rewards/rejected": -2.7704224586486816, "step": 14891 }, { "epoch": 0.7893355948374102, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 129581216.0, "logits/rejected": -30268827.42857143, "logps/chosen": -879.23828125, "logps/rejected": -388.72415597098217, "loss": 0.1389, "rewards/chosen": -0.2614379823207855, "rewards/margins": 2.904510340520314, "rewards/rejected": -3.1659483228410994, "step": 14892 }, { "epoch": 0.7893885988392123, "grad_norm": 48.75, "kl": 4.406888008117676, "learning_rate": 5e-07, "logits/chosen": 211394.8, "logits/rejected": -30337813.333333332, "logps/chosen": -169.18223876953124, "logps/rejected": -198.1552937825521, "loss": 0.4682, "rewards/chosen": 0.4701823711395264, "rewards/margins": 1.2098872979482016, "rewards/rejected": -0.7397049268086752, "step": 14893 }, { "epoch": 0.7894416028410145, "grad_norm": 61.25, "kl": 2.6504344940185547, "learning_rate": 5e-07, "logits/chosen": -35764875.428571425, "logits/rejected": -4232764.5, "logps/chosen": -467.1112583705357, "logps/rejected": -248.57412719726562, "loss": 0.5052, "rewards/chosen": 0.009245412690298898, "rewards/margins": 1.1647813490458898, "rewards/rejected": -1.1555359363555908, "step": 14894 }, { "epoch": 0.7894946068428166, "grad_norm": 68.0, "kl": 0.22728729248046875, "learning_rate": 5e-07, "logits/chosen": -31985600.0, "logits/rejected": -110859728.0, "logps/chosen": -319.54945882161456, "logps/rejected": -376.26324462890625, "loss": 0.3234, "rewards/chosen": 0.36637930075327557, "rewards/margins": 4.180342713991801, "rewards/rejected": -3.8139634132385254, "step": 14895 }, { "epoch": 0.7895476108446188, "grad_norm": 44.0, "kl": 1.2771530151367188, "learning_rate": 5e-07, "logits/chosen": -12731076.0, "logits/rejected": -9892800.666666666, "logps/chosen": -108.26082611083984, "logps/rejected": -381.7810872395833, "loss": 0.2121, "rewards/chosen": 0.3165312707424164, "rewards/margins": 3.415588468313217, "rewards/rejected": -3.099057197570801, "step": 14896 }, { "epoch": 0.7896006148464209, "grad_norm": 33.25, "kl": 3.107794761657715, "learning_rate": 5e-07, "logits/chosen": -12821733.6, "logits/rejected": -8831760.0, "logps/chosen": -250.1805419921875, "logps/rejected": -126.58441162109375, "loss": 0.3444, "rewards/chosen": 0.38074069023132323, "rewards/margins": 3.630228885014852, "rewards/rejected": -3.249488194783529, "step": 14897 }, { "epoch": 0.789653618848223, "grad_norm": 41.0, "kl": 3.264224052429199, "learning_rate": 5e-07, "logits/chosen": -10686033.6, "logits/rejected": -41925621.333333336, "logps/chosen": -191.95908203125, "logps/rejected": -590.7283528645834, "loss": 0.3434, "rewards/chosen": 0.2085340738296509, "rewards/margins": 3.7746217966079714, "rewards/rejected": -3.5660877227783203, "step": 14898 }, { "epoch": 0.7897066228500251, "grad_norm": 60.75, "kl": 2.7550487518310547, "learning_rate": 5e-07, "logits/chosen": -33550518.4, "logits/rejected": -38054269.333333336, "logps/chosen": -316.41337890625, "logps/rejected": -309.8750813802083, "loss": 0.3421, "rewards/chosen": 0.7691338062286377, "rewards/margins": 2.386935790379842, "rewards/rejected": -1.6178019841512044, "step": 14899 }, { "epoch": 0.7897596268518273, "grad_norm": 64.5, "kl": 6.217506408691406, "learning_rate": 5e-07, "logits/chosen": -43331621.333333336, "logits/rejected": -25034518.0, "logps/chosen": -540.2200113932291, "logps/rejected": -141.48272705078125, "loss": 0.3959, "rewards/chosen": 0.9326523145039877, "rewards/margins": 3.1468423207600913, "rewards/rejected": -2.2141900062561035, "step": 14900 }, { "epoch": 0.7898126308536294, "grad_norm": 40.0, "kl": 3.154275894165039, "learning_rate": 5e-07, "logits/chosen": -19799410.0, "logits/rejected": -31150860.0, "logps/chosen": -330.7950134277344, "logps/rejected": -240.32774353027344, "loss": 0.2668, "rewards/chosen": 1.2428675889968872, "rewards/margins": 3.589525580406189, "rewards/rejected": -2.3466579914093018, "step": 14901 }, { "epoch": 0.7898656348554316, "grad_norm": 26.625, "kl": 4.028210639953613, "learning_rate": 5e-07, "logits/chosen": 8735560.0, "logits/rejected": -24317528.0, "logps/chosen": -38.75200271606445, "logps/rejected": -333.2470296223958, "loss": 0.1875, "rewards/chosen": 1.0346404314041138, "rewards/margins": 3.2947519222895303, "rewards/rejected": -2.2601114908854165, "step": 14902 }, { "epoch": 0.7899186388572337, "grad_norm": 60.75, "kl": 2.1599578857421875, "learning_rate": 5e-07, "logits/chosen": -74073632.0, "logits/rejected": -12444240.0, "logps/chosen": -769.7115478515625, "logps/rejected": -208.4804890950521, "loss": 0.2565, "rewards/chosen": 1.2976402044296265, "rewards/margins": 3.0128717819849653, "rewards/rejected": -1.7152315775553386, "step": 14903 }, { "epoch": 0.7899716428590359, "grad_norm": 57.5, "kl": 1.1047172546386719, "learning_rate": 5e-07, "logits/chosen": -25676058.666666668, "logits/rejected": -13844428.8, "logps/chosen": -459.0480143229167, "logps/rejected": -350.40703125, "loss": 0.2853, "rewards/chosen": 0.24871269861857095, "rewards/margins": 2.075646177927653, "rewards/rejected": -1.8269334793090821, "step": 14904 }, { "epoch": 0.790024646860838, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1371539.5, "logits/rejected": -25991385.6, "logps/chosen": -357.52197265625, "logps/rejected": -399.546923828125, "loss": 0.2861, "rewards/chosen": 0.24071502685546875, "rewards/margins": 2.367197799682617, "rewards/rejected": -2.1264827728271483, "step": 14905 }, { "epoch": 0.7900776508626401, "grad_norm": 38.0, "kl": 1.700230598449707, "learning_rate": 5e-07, "logits/chosen": -28556272.0, "logits/rejected": -20370636.8, "logps/chosen": -284.3639322916667, "logps/rejected": -284.2154296875, "loss": 0.25, "rewards/chosen": 0.6594542264938354, "rewards/margins": 3.25676486492157, "rewards/rejected": -2.5973106384277345, "step": 14906 }, { "epoch": 0.7901306548644422, "grad_norm": 41.25, "kl": 1.3652229309082031, "learning_rate": 5e-07, "logits/chosen": -14332251.2, "logits/rejected": -54308826.666666664, "logps/chosen": -301.9881103515625, "logps/rejected": -384.835205078125, "loss": 0.1982, "rewards/chosen": 1.3962116241455078, "rewards/margins": 3.7400309244791665, "rewards/rejected": -2.3438193003336587, "step": 14907 }, { "epoch": 0.7901836588662444, "grad_norm": 47.0, "kl": 3.1850051879882812, "learning_rate": 5e-07, "logits/chosen": -28269436.0, "logits/rejected": -21739696.0, "logps/chosen": -563.5750732421875, "logps/rejected": -305.0654296875, "loss": 0.1847, "rewards/chosen": 1.8083606958389282, "rewards/margins": 3.91952121257782, "rewards/rejected": -2.1111605167388916, "step": 14908 }, { "epoch": 0.7902366628680465, "grad_norm": 55.75, "kl": 2.9290781021118164, "learning_rate": 5e-07, "logits/chosen": -12815800.0, "logits/rejected": -47578840.0, "logps/chosen": -435.8304036458333, "logps/rejected": -182.86520385742188, "loss": 0.3413, "rewards/chosen": 0.9946964581807455, "rewards/margins": 2.5232498248418174, "rewards/rejected": -1.5285533666610718, "step": 14909 }, { "epoch": 0.7902896668698487, "grad_norm": 58.75, "kl": 6.156890392303467, "learning_rate": 5e-07, "logits/chosen": -21570420.8, "logits/rejected": 608764.3333333334, "logps/chosen": -621.79521484375, "logps/rejected": -76.40877787272136, "loss": 0.3948, "rewards/chosen": 1.1040549278259277, "rewards/margins": 2.03503147761027, "rewards/rejected": -0.9309765497843424, "step": 14910 }, { "epoch": 0.7903426708716508, "grad_norm": 58.0, "kl": 0.025613784790039062, "learning_rate": 5e-07, "logits/chosen": -33149085.333333332, "logits/rejected": -30280496.0, "logps/chosen": -304.8897298177083, "logps/rejected": -425.775341796875, "loss": 0.2418, "rewards/chosen": -0.13189233342806497, "rewards/margins": 2.969459213813146, "rewards/rejected": -3.1013515472412108, "step": 14911 }, { "epoch": 0.790395674873453, "grad_norm": 35.5, "kl": 0.7445659637451172, "learning_rate": 5e-07, "logits/chosen": -9390438.4, "logits/rejected": 10503769.333333334, "logps/chosen": -177.04808349609374, "logps/rejected": -334.59222412109375, "loss": 0.2478, "rewards/chosen": 0.9199123382568359, "rewards/margins": 3.6085227330525718, "rewards/rejected": -2.688610394795736, "step": 14912 }, { "epoch": 0.790448678875255, "grad_norm": 45.75, "kl": 1.1011428833007812, "learning_rate": 5e-07, "logits/chosen": -77851440.0, "logits/rejected": -36196780.8, "logps/chosen": -372.4339192708333, "logps/rejected": -520.45146484375, "loss": 0.1703, "rewards/chosen": 0.849872350692749, "rewards/margins": 3.9796212673187257, "rewards/rejected": -3.1297489166259767, "step": 14913 }, { "epoch": 0.7905016828770572, "grad_norm": 36.75, "kl": 2.098599433898926, "learning_rate": 5e-07, "logits/chosen": -31623666.666666668, "logits/rejected": 3366673.6, "logps/chosen": -172.4975382486979, "logps/rejected": -345.85771484375, "loss": 0.2111, "rewards/chosen": 0.49586717287699383, "rewards/margins": 3.1768795172373454, "rewards/rejected": -2.6810123443603517, "step": 14914 }, { "epoch": 0.7905546868788593, "grad_norm": 48.5, "kl": 0.3053302764892578, "learning_rate": 5e-07, "logits/chosen": -13820029.0, "logits/rejected": -19042614.0, "logps/chosen": -217.22227478027344, "logps/rejected": -403.7509765625, "loss": 0.3176, "rewards/chosen": -0.07717356830835342, "rewards/margins": 2.6956061348319054, "rewards/rejected": -2.772779703140259, "step": 14915 }, { "epoch": 0.7906076908806615, "grad_norm": 57.5, "kl": 0.12502288818359375, "learning_rate": 5e-07, "logits/chosen": -27796739.2, "logits/rejected": -11515022.666666666, "logps/chosen": -281.1220703125, "logps/rejected": -233.96638997395834, "loss": 0.3149, "rewards/chosen": 0.3555154800415039, "rewards/margins": 2.4154810905456543, "rewards/rejected": -2.0599656105041504, "step": 14916 }, { "epoch": 0.7906606948824636, "grad_norm": 39.25, "kl": 3.568312644958496, "learning_rate": 5e-07, "logits/chosen": -29766419.2, "logits/rejected": -14801622.666666666, "logps/chosen": -615.88603515625, "logps/rejected": -301.665283203125, "loss": 0.2589, "rewards/chosen": 1.2135461807250976, "rewards/margins": 5.5528870264689125, "rewards/rejected": -4.339340845743815, "step": 14917 }, { "epoch": 0.7907136988842658, "grad_norm": 45.75, "kl": 4.0081024169921875, "learning_rate": 5e-07, "logits/chosen": -26518922.666666668, "logits/rejected": -9638260.8, "logps/chosen": -302.2324625651042, "logps/rejected": -544.13349609375, "loss": 0.2274, "rewards/chosen": 1.283925215403239, "rewards/margins": 4.1039449373881025, "rewards/rejected": -2.8200197219848633, "step": 14918 }, { "epoch": 0.7907667028860679, "grad_norm": 47.25, "kl": 2.2733421325683594, "learning_rate": 5e-07, "logits/chosen": -64197368.0, "logits/rejected": -47724064.0, "logps/chosen": -605.6048583984375, "logps/rejected": -556.4017944335938, "loss": 0.1983, "rewards/chosen": 1.4824211597442627, "rewards/margins": 4.180310964584351, "rewards/rejected": -2.697889804840088, "step": 14919 }, { "epoch": 0.7908197068878701, "grad_norm": 79.0, "kl": 1.7738218307495117, "learning_rate": 5e-07, "logits/chosen": -38626600.0, "logits/rejected": -21397808.0, "logps/chosen": -238.43898010253906, "logps/rejected": -299.30889892578125, "loss": 0.3206, "rewards/chosen": 0.31247225403785706, "rewards/margins": 2.895275741815567, "rewards/rejected": -2.58280348777771, "step": 14920 }, { "epoch": 0.7908727108896721, "grad_norm": 53.25, "kl": 0.6937236785888672, "learning_rate": 5e-07, "logits/chosen": -58964357.333333336, "logits/rejected": 879699.625, "logps/chosen": -435.4652506510417, "logps/rejected": -136.52862548828125, "loss": 0.3223, "rewards/chosen": 0.4678858518600464, "rewards/margins": 3.3844443559646606, "rewards/rejected": -2.9165585041046143, "step": 14921 }, { "epoch": 0.7909257148914743, "grad_norm": 98.5, "kl": 10.743865966796875, "learning_rate": 5e-07, "logits/chosen": -9569345.714285715, "logits/rejected": 5915658.0, "logps/chosen": -380.6768275669643, "logps/rejected": -2.6173019409179688, "loss": 0.4923, "rewards/chosen": 1.1498569761003767, "rewards/margins": 1.1305828363235508, "rewards/rejected": 0.019274139776825905, "step": 14922 }, { "epoch": 0.7909787188932764, "grad_norm": 50.25, "kl": 2.18084716796875, "learning_rate": 5e-07, "logits/chosen": -15280610.666666666, "logits/rejected": -36595248.0, "logps/chosen": -343.8933512369792, "logps/rejected": -493.6518249511719, "loss": 0.2854, "rewards/chosen": 1.0799427032470703, "rewards/margins": 3.1784658432006836, "rewards/rejected": -2.0985231399536133, "step": 14923 }, { "epoch": 0.7910317228950786, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47548832.0, "logits/rejected": -18336973.333333332, "logps/chosen": -380.797021484375, "logps/rejected": -287.7424723307292, "loss": 0.2368, "rewards/chosen": 0.6733036041259766, "rewards/margins": 4.719174067179362, "rewards/rejected": -4.045870463053386, "step": 14924 }, { "epoch": 0.7910847268968807, "grad_norm": 53.0, "kl": 4.451423645019531, "learning_rate": 5e-07, "logits/chosen": -47333840.0, "logits/rejected": 119554741.33333333, "logps/chosen": -392.5145263671875, "logps/rejected": -305.5659586588542, "loss": 0.3702, "rewards/chosen": 0.5879961967468261, "rewards/margins": 3.112003676096598, "rewards/rejected": -2.524007479349772, "step": 14925 }, { "epoch": 0.7911377308986829, "grad_norm": 27.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -39031000.0, "logps/rejected": -392.31549072265625, "loss": 0.063, "rewards/rejected": -2.911098003387451, "step": 14926 }, { "epoch": 0.791190734900485, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42294536.0, "logits/rejected": -11895937.0, "logps/chosen": -276.4197998046875, "logps/rejected": -214.43582153320312, "loss": 0.289, "rewards/chosen": 0.5637897253036499, "rewards/margins": 2.0553966760635376, "rewards/rejected": -1.4916069507598877, "step": 14927 }, { "epoch": 0.7912437389022872, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6947145.0, "logits/rejected": -10374530.666666666, "logps/chosen": -330.2503662109375, "logps/rejected": -224.41304524739584, "loss": 0.144, "rewards/chosen": 0.639874279499054, "rewards/margins": 3.558653970559438, "rewards/rejected": -2.9187796910603843, "step": 14928 }, { "epoch": 0.7912967429040892, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13324390.0, "logits/rejected": -8319063.333333333, "logps/chosen": -245.89198303222656, "logps/rejected": -178.89595540364584, "loss": 0.2117, "rewards/chosen": 0.1657826453447342, "rewards/margins": 2.63154573738575, "rewards/rejected": -2.4657630920410156, "step": 14929 }, { "epoch": 0.7913497469058914, "grad_norm": 58.0, "kl": 1.7540206909179688, "learning_rate": 5e-07, "logits/chosen": -65920998.4, "logits/rejected": -1115214.3333333333, "logps/chosen": -530.553271484375, "logps/rejected": -157.644775390625, "loss": 0.434, "rewards/chosen": 0.29723329544067384, "rewards/margins": 1.3749919255574543, "rewards/rejected": -1.0777586301167805, "step": 14930 }, { "epoch": 0.7914027509076935, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27377032.0, "logits/rejected": -19801610.0, "logps/chosen": -400.70989990234375, "logps/rejected": -153.0420379638672, "loss": 0.2991, "rewards/chosen": 0.29955655336380005, "rewards/margins": 2.802662193775177, "rewards/rejected": -2.503105640411377, "step": 14931 }, { "epoch": 0.7914557549094957, "grad_norm": 59.0, "kl": 2.723931312561035, "learning_rate": 5e-07, "logits/chosen": -13050445.0, "logits/rejected": -22116280.0, "logps/chosen": -289.75701904296875, "logps/rejected": -106.70863342285156, "loss": 0.3474, "rewards/chosen": 0.5550030469894409, "rewards/margins": 3.053476929664612, "rewards/rejected": -2.498473882675171, "step": 14932 }, { "epoch": 0.7915087589112978, "grad_norm": 46.75, "kl": 0.7958803176879883, "learning_rate": 5e-07, "logits/chosen": -5108496.666666667, "logits/rejected": -39680620.8, "logps/chosen": -305.53794352213544, "logps/rejected": -198.82138671875, "loss": 0.2007, "rewards/chosen": 1.327579180399577, "rewards/margins": 3.027611223856608, "rewards/rejected": -1.7000320434570313, "step": 14933 }, { "epoch": 0.7915617629131, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35909464.0, "logits/rejected": -24618110.0, "logps/chosen": -321.901123046875, "logps/rejected": -316.5196228027344, "loss": 0.3825, "rewards/chosen": -0.7553537487983704, "rewards/margins": 2.316030204296112, "rewards/rejected": -3.0713839530944824, "step": 14934 }, { "epoch": 0.7916147669149021, "grad_norm": 51.5, "kl": 1.0409049987792969, "learning_rate": 5e-07, "logits/chosen": -34405952.0, "logits/rejected": -10471092.8, "logps/chosen": -213.7584228515625, "logps/rejected": -256.8720947265625, "loss": 0.3048, "rewards/chosen": 0.27724558115005493, "rewards/margins": 1.7914141297340394, "rewards/rejected": -1.5141685485839844, "step": 14935 }, { "epoch": 0.7916677709167043, "grad_norm": 37.5, "kl": 2.1851606369018555, "learning_rate": 5e-07, "logits/chosen": -72963418.66666667, "logits/rejected": -24747020.8, "logps/chosen": -456.6305338541667, "logps/rejected": -247.514111328125, "loss": 0.1869, "rewards/chosen": 1.386823336283366, "rewards/margins": 3.947126833597819, "rewards/rejected": -2.560303497314453, "step": 14936 }, { "epoch": 0.7917207749185063, "grad_norm": 44.0, "kl": 2.189220428466797, "learning_rate": 5e-07, "logits/chosen": -43466202.666666664, "logits/rejected": -15283891.2, "logps/chosen": -239.89217122395834, "logps/rejected": -289.607763671875, "loss": 0.2473, "rewards/chosen": 0.8813326358795166, "rewards/margins": 3.3244951725006104, "rewards/rejected": -2.443162536621094, "step": 14937 }, { "epoch": 0.7917737789203085, "grad_norm": 55.5, "kl": 3.7232894897460938, "learning_rate": 5e-07, "logits/chosen": -29317516.0, "logits/rejected": -32614284.0, "logps/chosen": -472.75592041015625, "logps/rejected": -314.6152648925781, "loss": 0.1992, "rewards/chosen": 1.8425242900848389, "rewards/margins": 4.018913984298706, "rewards/rejected": -2.176389694213867, "step": 14938 }, { "epoch": 0.7918267829221106, "grad_norm": 42.75, "kl": 0.34999752044677734, "learning_rate": 5e-07, "logits/chosen": -41851381.333333336, "logits/rejected": -47443974.4, "logps/chosen": -83.39231363932292, "logps/rejected": -353.303955078125, "loss": 0.3156, "rewards/chosen": -0.03003820280234019, "rewards/margins": 1.8159357657035191, "rewards/rejected": -1.8459739685058594, "step": 14939 }, { "epoch": 0.7918797869239128, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28711676.0, "logits/rejected": -25863048.0, "logps/chosen": -321.93890380859375, "logps/rejected": -231.11930338541666, "loss": 0.2242, "rewards/chosen": 0.5001755356788635, "rewards/margins": 2.3935035665829973, "rewards/rejected": -1.893328030904134, "step": 14940 }, { "epoch": 0.7919327909257149, "grad_norm": 31.5, "kl": 2.4930362701416016, "learning_rate": 5e-07, "logits/chosen": -22261664.0, "logits/rejected": -15222498.666666666, "logps/chosen": -93.44662475585938, "logps/rejected": -352.3980305989583, "loss": 0.1801, "rewards/chosen": 0.5806758403778076, "rewards/margins": 3.536367495854696, "rewards/rejected": -2.955691655476888, "step": 14941 }, { "epoch": 0.791985794927517, "grad_norm": 46.0, "kl": 0.06631851196289062, "learning_rate": 5e-07, "logits/chosen": -27482902.0, "logits/rejected": -11854427.0, "logps/chosen": -316.0553283691406, "logps/rejected": -155.05343627929688, "loss": 0.3036, "rewards/chosen": 0.36860352754592896, "rewards/margins": 2.886340916156769, "rewards/rejected": -2.51773738861084, "step": 14942 }, { "epoch": 0.7920387989293192, "grad_norm": 86.0, "kl": 0.2055034637451172, "learning_rate": 5e-07, "logits/chosen": -33577130.666666664, "logits/rejected": 40596220.0, "logps/chosen": -294.6404622395833, "logps/rejected": -477.6083984375, "loss": 0.3761, "rewards/chosen": 0.6317716439565023, "rewards/margins": 0.9528455634911855, "rewards/rejected": -0.3210739195346832, "step": 14943 }, { "epoch": 0.7920918029311212, "grad_norm": 49.75, "kl": 1.6321535110473633, "learning_rate": 5e-07, "logits/chosen": -24161537.6, "logits/rejected": -9122170.666666666, "logps/chosen": -224.796240234375, "logps/rejected": -262.94606526692706, "loss": 0.337, "rewards/chosen": 0.4365635871887207, "rewards/margins": 2.487132485707601, "rewards/rejected": -2.0505688985188804, "step": 14944 }, { "epoch": 0.7921448069329234, "grad_norm": 51.25, "kl": 1.8708610534667969, "learning_rate": 5e-07, "logits/chosen": -43146080.0, "logits/rejected": -25930100.8, "logps/chosen": -752.9982096354166, "logps/rejected": -359.4744873046875, "loss": 0.2003, "rewards/chosen": 0.5479731559753418, "rewards/margins": 3.8597365379333497, "rewards/rejected": -3.311763381958008, "step": 14945 }, { "epoch": 0.7921978109347255, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40062432.0, "logits/rejected": -37111802.666666664, "logps/chosen": -207.91329956054688, "logps/rejected": -283.7596842447917, "loss": 0.1695, "rewards/chosen": 0.7126686573028564, "rewards/margins": 3.2904934088389077, "rewards/rejected": -2.5778247515360513, "step": 14946 }, { "epoch": 0.7922508149365277, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13049406.0, "logits/rejected": -49225546.666666664, "logps/chosen": -373.5627746582031, "logps/rejected": -392.7184244791667, "loss": 0.1523, "rewards/chosen": 1.0624985694885254, "rewards/margins": 3.740370591481527, "rewards/rejected": -2.6778720219930015, "step": 14947 }, { "epoch": 0.7923038189383298, "grad_norm": 46.0, "kl": 2.248506546020508, "learning_rate": 5e-07, "logits/chosen": -37323072.0, "logits/rejected": -10942648.0, "logps/chosen": -228.5888214111328, "logps/rejected": -176.93008422851562, "loss": 0.3657, "rewards/chosen": 0.3771231770515442, "rewards/margins": 1.7535900473594666, "rewards/rejected": -1.3764668703079224, "step": 14948 }, { "epoch": 0.792356822940132, "grad_norm": 53.5, "kl": 2.147982597351074, "learning_rate": 5e-07, "logits/chosen": -22471440.0, "logits/rejected": -70302880.0, "logps/chosen": -288.80606515066967, "logps/rejected": -316.740478515625, "loss": 0.4892, "rewards/chosen": -0.04299664923122951, "rewards/margins": 2.3307125525815144, "rewards/rejected": -2.373709201812744, "step": 14949 }, { "epoch": 0.7924098269419341, "grad_norm": 41.5, "kl": 0.3017730712890625, "learning_rate": 5e-07, "logits/chosen": -134578901.33333334, "logits/rejected": -46934240.0, "logps/chosen": -398.4600016276042, "logps/rejected": -582.044384765625, "loss": 0.1817, "rewards/chosen": 0.4413228432337443, "rewards/margins": 4.572952024141948, "rewards/rejected": -4.131629180908203, "step": 14950 }, { "epoch": 0.7924628309437363, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -126800512.0, "logits/rejected": 1604897.125, "logps/chosen": -230.26443481445312, "logps/rejected": -76.8108901977539, "loss": 0.3444, "rewards/chosen": -0.10054702311754227, "rewards/margins": 2.099118523299694, "rewards/rejected": -2.1996655464172363, "step": 14951 }, { "epoch": 0.7925158349455383, "grad_norm": 39.75, "kl": 5.134223937988281, "learning_rate": 5e-07, "logits/chosen": 8868573.333333334, "logits/rejected": -31890172.8, "logps/chosen": -53.815633138020836, "logps/rejected": -402.9155029296875, "loss": 0.291, "rewards/chosen": 1.1092856725056965, "rewards/margins": 3.6617934544881185, "rewards/rejected": -2.552507781982422, "step": 14952 }, { "epoch": 0.7925688389473405, "grad_norm": 38.0, "kl": 2.089590072631836, "learning_rate": 5e-07, "logits/chosen": -8467083.2, "logits/rejected": -62926128.0, "logps/chosen": -181.7084716796875, "logps/rejected": -583.5528971354166, "loss": 0.3503, "rewards/chosen": 0.19819273948669433, "rewards/margins": 4.061838610967, "rewards/rejected": -3.863645871480306, "step": 14953 }, { "epoch": 0.7926218429491426, "grad_norm": 36.25, "kl": 3.2268447875976562, "learning_rate": 5e-07, "logits/chosen": -26236942.4, "logits/rejected": -7554662.666666667, "logps/chosen": -624.39228515625, "logps/rejected": -271.35491943359375, "loss": 0.1675, "rewards/chosen": 1.7030179977416993, "rewards/margins": 4.6801450729370115, "rewards/rejected": -2.9771270751953125, "step": 14954 }, { "epoch": 0.7926748469509448, "grad_norm": 44.25, "kl": 1.7296314239501953, "learning_rate": 5e-07, "logits/chosen": -4756623.6, "logits/rejected": -10349124.666666666, "logps/chosen": -119.80128173828125, "logps/rejected": -58.061991373697914, "loss": 0.3522, "rewards/chosen": 0.7849663257598877, "rewards/margins": 1.6988583564758302, "rewards/rejected": -0.9138920307159424, "step": 14955 }, { "epoch": 0.7927278509527469, "grad_norm": 39.25, "kl": 0.9931411743164062, "learning_rate": 5e-07, "logits/chosen": -1209717.3333333333, "logits/rejected": -14943014.4, "logps/chosen": -200.7681884765625, "logps/rejected": -199.5913330078125, "loss": 0.2587, "rewards/chosen": 0.8583002885182699, "rewards/margins": 2.653944476445516, "rewards/rejected": -1.795644187927246, "step": 14956 }, { "epoch": 0.7927808549545491, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -2127797.5, "logps/rejected": -261.0382995605469, "loss": 0.2598, "rewards/rejected": -1.2872008085250854, "step": 14957 }, { "epoch": 0.7928338589563512, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21713022.0, "logits/rejected": -30263928.0, "logps/chosen": -238.7638397216797, "logps/rejected": -388.3651123046875, "loss": 0.2932, "rewards/chosen": 0.035907357931137085, "rewards/margins": 2.3894778192043304, "rewards/rejected": -2.3535704612731934, "step": 14958 }, { "epoch": 0.7928868629581534, "grad_norm": 39.5, "kl": 3.2400121688842773, "learning_rate": 5e-07, "logits/chosen": 3506652.8, "logits/rejected": -4544946.666666667, "logps/chosen": -187.74881591796876, "logps/rejected": -195.2159423828125, "loss": 0.3701, "rewards/chosen": 0.6135915756225586, "rewards/margins": 3.2048977851867675, "rewards/rejected": -2.591306209564209, "step": 14959 }, { "epoch": 0.7929398669599554, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12636205.0, "logits/rejected": -38179746.666666664, "logps/chosen": -28.306278228759766, "logps/rejected": -380.9066975911458, "loss": 0.1625, "rewards/chosen": 0.589580774307251, "rewards/margins": 2.9890766938527427, "rewards/rejected": -2.3994959195454917, "step": 14960 }, { "epoch": 0.7929928709617576, "grad_norm": 44.5, "kl": 0.42281341552734375, "learning_rate": 5e-07, "logits/chosen": -15740379.0, "logits/rejected": -3269454.0, "logps/chosen": -390.02398681640625, "logps/rejected": -235.24951171875, "loss": 0.22, "rewards/chosen": 0.9614416360855103, "rewards/margins": 3.1660720109939575, "rewards/rejected": -2.2046303749084473, "step": 14961 }, { "epoch": 0.7930458749635597, "grad_norm": 58.5, "kl": 2.224273681640625, "learning_rate": 5e-07, "logits/chosen": -52772339.2, "logits/rejected": -2264332.6666666665, "logps/chosen": -554.41142578125, "logps/rejected": -273.2119954427083, "loss": 0.2662, "rewards/chosen": 1.6344659805297852, "rewards/margins": 3.187129783630371, "rewards/rejected": -1.552663803100586, "step": 14962 }, { "epoch": 0.7930988789653619, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2093753.5, "logits/rejected": -8494718.0, "logps/chosen": -172.0626220703125, "logps/rejected": -94.39248657226562, "loss": 0.3864, "rewards/chosen": -0.1265043318271637, "rewards/margins": 1.182998150587082, "rewards/rejected": -1.3095024824142456, "step": 14963 }, { "epoch": 0.793151882967164, "grad_norm": 51.75, "kl": 1.7554435729980469, "learning_rate": 5e-07, "logits/chosen": -51068522.666666664, "logits/rejected": -16509455.0, "logps/chosen": -318.2377115885417, "logps/rejected": -339.36749267578125, "loss": 0.3355, "rewards/chosen": 0.6928977171579996, "rewards/margins": 2.753382126490275, "rewards/rejected": -2.0604844093322754, "step": 14964 }, { "epoch": 0.7932048869689662, "grad_norm": 35.0, "kl": 0.26001739501953125, "learning_rate": 5e-07, "logits/chosen": -9321632.0, "logits/rejected": -9341512.0, "logps/chosen": -114.08006286621094, "logps/rejected": -179.90438842773438, "loss": 0.3354, "rewards/chosen": 0.08932846784591675, "rewards/margins": 1.9048983454704285, "rewards/rejected": -1.8155698776245117, "step": 14965 }, { "epoch": 0.7932578909707683, "grad_norm": 34.25, "kl": 3.6403274536132812, "learning_rate": 5e-07, "logits/chosen": 682234.3333333334, "logits/rejected": -22137228.8, "logps/chosen": -456.2452392578125, "logps/rejected": -288.934521484375, "loss": 0.1823, "rewards/chosen": 1.9581157366434734, "rewards/margins": 4.177700964609782, "rewards/rejected": -2.2195852279663084, "step": 14966 }, { "epoch": 0.7933108949725705, "grad_norm": 52.5, "kl": 0.5049514770507812, "learning_rate": 5e-07, "logits/chosen": -28600146.666666668, "logits/rejected": 7478252.8, "logps/chosen": -458.0594889322917, "logps/rejected": -450.741015625, "loss": 0.2141, "rewards/chosen": 0.6790781815846761, "rewards/margins": 3.2690861543019616, "rewards/rejected": -2.5900079727172853, "step": 14967 }, { "epoch": 0.7933638989743725, "grad_norm": 79.0, "kl": 0.4556131362915039, "learning_rate": 5e-07, "logits/chosen": 7247022.0, "logits/rejected": -6916449.0, "logps/chosen": -417.177978515625, "logps/rejected": -197.09864807128906, "loss": 0.3909, "rewards/chosen": 0.16657714545726776, "rewards/margins": 1.025019809603691, "rewards/rejected": -0.8584426641464233, "step": 14968 }, { "epoch": 0.7934169029761747, "grad_norm": 34.75, "kl": 0.6268653869628906, "learning_rate": 5e-07, "logits/chosen": -37156844.0, "logits/rejected": -23177832.0, "logps/chosen": -312.941650390625, "logps/rejected": -222.94400024414062, "loss": 0.2014, "rewards/chosen": 0.7887989282608032, "rewards/margins": 5.04074490070343, "rewards/rejected": -4.251945972442627, "step": 14969 }, { "epoch": 0.7934699069779768, "grad_norm": 52.0, "kl": 1.023904800415039, "learning_rate": 5e-07, "logits/chosen": -31085548.8, "logits/rejected": -32106122.666666668, "logps/chosen": -222.3661865234375, "logps/rejected": -491.4944254557292, "loss": 0.3141, "rewards/chosen": 0.2772304773330688, "rewards/margins": 4.377689894040425, "rewards/rejected": -4.1004594167073565, "step": 14970 }, { "epoch": 0.793522910979779, "grad_norm": 55.5, "kl": 3.823963165283203, "learning_rate": 5e-07, "logits/chosen": -22434660.8, "logits/rejected": -33740200.0, "logps/chosen": -378.928662109375, "logps/rejected": -468.639892578125, "loss": 0.38, "rewards/chosen": 0.604296875, "rewards/margins": 2.7868703524271643, "rewards/rejected": -2.1825734774271646, "step": 14971 }, { "epoch": 0.7935759149815811, "grad_norm": 28.75, "kl": 0.352447509765625, "learning_rate": 5e-07, "logits/chosen": -16445296.0, "logits/rejected": -37186728.0, "logps/chosen": -224.4873046875, "logps/rejected": -424.9776204427083, "loss": 0.1376, "rewards/chosen": 1.4836605787277222, "rewards/margins": 4.596878250439962, "rewards/rejected": -3.1132176717122397, "step": 14972 }, { "epoch": 0.7936289189833833, "grad_norm": 49.75, "kl": 0.8349037170410156, "learning_rate": 5e-07, "logits/chosen": -45288896.0, "logits/rejected": -9302409.333333334, "logps/chosen": -884.0516357421875, "logps/rejected": -176.59281412760416, "loss": 0.2216, "rewards/chosen": 0.9921646118164062, "rewards/margins": 2.9187177022298174, "rewards/rejected": -1.9265530904134114, "step": 14973 }, { "epoch": 0.7936819229851854, "grad_norm": 61.25, "kl": 4.636142730712891, "learning_rate": 5e-07, "logits/chosen": -34514970.666666664, "logits/rejected": -19498082.0, "logps/chosen": -419.6763509114583, "logps/rejected": -238.74757385253906, "loss": 0.4464, "rewards/chosen": 0.46719853083292645, "rewards/margins": 2.057769457499186, "rewards/rejected": -1.5905709266662598, "step": 14974 }, { "epoch": 0.7937349269869876, "grad_norm": 45.5, "kl": 0.471771240234375, "learning_rate": 5e-07, "logits/chosen": -45058568.0, "logits/rejected": -472963.0, "logps/chosen": -416.202392578125, "logps/rejected": -131.358154296875, "loss": 0.1803, "rewards/chosen": 1.1461067199707031, "rewards/margins": 4.122804880142212, "rewards/rejected": -2.976698160171509, "step": 14975 }, { "epoch": 0.7937879309887896, "grad_norm": 49.0, "kl": 0.7656440734863281, "learning_rate": 5e-07, "logits/chosen": -48748660.0, "logits/rejected": -38969368.0, "logps/chosen": -264.54364013671875, "logps/rejected": -435.1222229003906, "loss": 0.2901, "rewards/chosen": 0.3167160153388977, "rewards/margins": 2.822827160358429, "rewards/rejected": -2.5061111450195312, "step": 14976 }, { "epoch": 0.7938409349905918, "grad_norm": 43.25, "kl": 0.6756477355957031, "learning_rate": 5e-07, "logits/chosen": -38273682.666666664, "logits/rejected": 1004990.4, "logps/chosen": -298.23411051432294, "logps/rejected": -418.8943359375, "loss": 0.3292, "rewards/chosen": -0.39207406838734943, "rewards/margins": 1.7417687813440959, "rewards/rejected": -2.1338428497314452, "step": 14977 }, { "epoch": 0.7938939389923939, "grad_norm": 54.75, "kl": 3.789691925048828, "learning_rate": 5e-07, "logits/chosen": -25523570.285714287, "logits/rejected": 31346150.0, "logps/chosen": -347.45054408482144, "logps/rejected": -129.79513549804688, "loss": 0.3587, "rewards/chosen": 1.0521792684282576, "rewards/margins": 2.405264667102269, "rewards/rejected": -1.3530853986740112, "step": 14978 }, { "epoch": 0.7939469429941961, "grad_norm": 49.75, "kl": 2.9426803588867188, "learning_rate": 5e-07, "logits/chosen": -33761716.571428575, "logits/rejected": -32528734.0, "logps/chosen": -373.19594029017856, "logps/rejected": -668.36572265625, "loss": 0.4071, "rewards/chosen": 0.6272326878138951, "rewards/margins": 7.005595820290702, "rewards/rejected": -6.378363132476807, "step": 14979 }, { "epoch": 0.7939999469959982, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61546528.0, "logits/rejected": -19092389.333333332, "logps/chosen": -216.52825927734375, "logps/rejected": -305.92681884765625, "loss": 0.2334, "rewards/chosen": -0.6241989731788635, "rewards/margins": 2.203999141852061, "rewards/rejected": -2.8281981150309243, "step": 14980 }, { "epoch": 0.7940529509978004, "grad_norm": 61.0, "kl": 2.701568603515625, "learning_rate": 5e-07, "logits/chosen": -34248080.0, "logits/rejected": -53932138.666666664, "logps/chosen": -629.37275390625, "logps/rejected": -530.6836344401041, "loss": 0.1909, "rewards/chosen": 1.4185751914978026, "rewards/margins": 5.906520748138428, "rewards/rejected": -4.487945556640625, "step": 14981 }, { "epoch": 0.7941059549996025, "grad_norm": 50.5, "kl": 4.233295440673828, "learning_rate": 5e-07, "logits/chosen": -28448864.0, "logits/rejected": -29820381.333333332, "logps/chosen": -327.161474609375, "logps/rejected": -369.8658040364583, "loss": 0.3808, "rewards/chosen": 0.2625410795211792, "rewards/margins": 4.8598927895228075, "rewards/rejected": -4.597351710001628, "step": 14982 }, { "epoch": 0.7941589590014047, "grad_norm": 42.75, "kl": 0.43703460693359375, "learning_rate": 5e-07, "logits/chosen": -37162288.0, "logits/rejected": -52308940.0, "logps/chosen": -177.54225158691406, "logps/rejected": -264.90582275390625, "loss": 0.2725, "rewards/chosen": 0.7086647748947144, "rewards/margins": 2.4008578062057495, "rewards/rejected": -1.6921930313110352, "step": 14983 }, { "epoch": 0.7942119630032067, "grad_norm": 52.25, "kl": 2.6119728088378906, "learning_rate": 5e-07, "logits/chosen": -48941925.333333336, "logits/rejected": -19408718.4, "logps/chosen": -234.00732421875, "logps/rejected": -328.435693359375, "loss": 0.2656, "rewards/chosen": 0.3498830397923787, "rewards/margins": 3.3495790084203088, "rewards/rejected": -2.99969596862793, "step": 14984 }, { "epoch": 0.7942649670050089, "grad_norm": 48.5, "kl": 2.7388668060302734, "learning_rate": 5e-07, "logits/chosen": 1756002.625, "logits/rejected": -39228344.0, "logps/chosen": -345.71832275390625, "logps/rejected": -310.8397521972656, "loss": 0.2687, "rewards/chosen": 1.0766254663467407, "rewards/margins": 2.57332444190979, "rewards/rejected": -1.4966989755630493, "step": 14985 }, { "epoch": 0.794317971006811, "grad_norm": 91.0, "kl": 7.258445739746094, "learning_rate": 5e-07, "logits/chosen": -16996360.0, "logits/rejected": -85283992.0, "logps/chosen": -311.44512939453125, "logps/rejected": -788.174072265625, "loss": 0.2297, "rewards/chosen": 1.8733938535054524, "rewards/margins": 5.698198636372884, "rewards/rejected": -3.8248047828674316, "step": 14986 }, { "epoch": 0.7943709750086132, "grad_norm": 40.0, "kl": 1.18487548828125, "learning_rate": 5e-07, "logits/chosen": -1555190.875, "logits/rejected": -13667092.0, "logps/chosen": -181.25717163085938, "logps/rejected": -276.5875549316406, "loss": 0.2638, "rewards/chosen": 0.8534319400787354, "rewards/margins": 3.048308849334717, "rewards/rejected": -2.1948769092559814, "step": 14987 }, { "epoch": 0.7944239790104153, "grad_norm": 37.25, "kl": 1.4326286315917969, "learning_rate": 5e-07, "logits/chosen": -30104606.0, "logits/rejected": -21025106.666666668, "logps/chosen": -254.2733154296875, "logps/rejected": -415.0867919921875, "loss": 0.1546, "rewards/chosen": 0.9607982635498047, "rewards/margins": 3.3554724057515464, "rewards/rejected": -2.3946741422017417, "step": 14988 }, { "epoch": 0.7944769830122175, "grad_norm": 61.25, "kl": 2.413249969482422, "learning_rate": 5e-07, "logits/chosen": -71028448.0, "logits/rejected": -19215768.0, "logps/chosen": -620.0830078125, "logps/rejected": -156.78072509765624, "loss": 0.2737, "rewards/chosen": 0.627039631207784, "rewards/margins": 2.8254096110661826, "rewards/rejected": -2.1983699798583984, "step": 14989 }, { "epoch": 0.7945299870140196, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39445920.0, "logits/rejected": 127819680.0, "logps/chosen": -389.7580261230469, "logps/rejected": -488.0449523925781, "loss": 0.226, "rewards/chosen": 0.6518791317939758, "rewards/margins": 3.337816298007965, "rewards/rejected": -2.6859371662139893, "step": 14990 }, { "epoch": 0.7945829910158217, "grad_norm": 38.5, "kl": 2.202594757080078, "learning_rate": 5e-07, "logits/chosen": -22448838.0, "logits/rejected": -7178524.0, "logps/chosen": -246.77517700195312, "logps/rejected": -225.80741373697916, "loss": 0.2035, "rewards/chosen": 0.5073604583740234, "rewards/margins": 4.163543701171875, "rewards/rejected": -3.6561832427978516, "step": 14991 }, { "epoch": 0.7946359950176238, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 29405810.0, "logits/rejected": -48550480.0, "logps/chosen": -233.6155548095703, "logps/rejected": -198.5128377278646, "loss": 0.2218, "rewards/chosen": 0.3722274899482727, "rewards/margins": 2.218399027983348, "rewards/rejected": -1.846171538035075, "step": 14992 }, { "epoch": 0.7946889990194259, "grad_norm": 53.5, "kl": 1.0399055480957031, "learning_rate": 5e-07, "logits/chosen": -48244083.2, "logits/rejected": -10857000.666666666, "logps/chosen": -569.436279296875, "logps/rejected": -160.10889689127603, "loss": 0.306, "rewards/chosen": 0.7696516513824463, "rewards/margins": 3.0121103127797446, "rewards/rejected": -2.2424586613972983, "step": 14993 }, { "epoch": 0.7947420030212281, "grad_norm": 71.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7024181.333333333, "logits/rejected": -122623360.0, "logps/chosen": -461.4081217447917, "logps/rejected": -217.9967041015625, "loss": 0.1993, "rewards/chosen": 0.4640905062357585, "rewards/margins": 3.622788588205973, "rewards/rejected": -3.158698081970215, "step": 14994 }, { "epoch": 0.7947950070230302, "grad_norm": 56.0, "kl": 3.7266407012939453, "learning_rate": 5e-07, "logits/chosen": -39885122.666666664, "logits/rejected": -29572592.0, "logps/chosen": -200.2569580078125, "logps/rejected": -173.32757568359375, "loss": 0.3654, "rewards/chosen": 0.756401538848877, "rewards/margins": 2.6262850761413574, "rewards/rejected": -1.8698835372924805, "step": 14995 }, { "epoch": 0.7948480110248324, "grad_norm": 35.5, "kl": 0.0351409912109375, "learning_rate": 5e-07, "logits/chosen": 38397690.666666664, "logits/rejected": -24231819.2, "logps/chosen": -173.78767903645834, "logps/rejected": -241.042041015625, "loss": 0.2161, "rewards/chosen": 0.7597026030222574, "rewards/margins": 3.1069422880808513, "rewards/rejected": -2.347239685058594, "step": 14996 }, { "epoch": 0.7949010150266345, "grad_norm": 49.25, "kl": 2.4108238220214844, "learning_rate": 5e-07, "logits/chosen": -23384564.8, "logits/rejected": -51056650.666666664, "logps/chosen": -187.1831787109375, "logps/rejected": -421.9791259765625, "loss": 0.3784, "rewards/chosen": 0.36122634410858157, "rewards/margins": 2.308853570620219, "rewards/rejected": -1.9476272265116374, "step": 14997 }, { "epoch": 0.7949540190284367, "grad_norm": 36.75, "kl": 2.947819709777832, "learning_rate": 5e-07, "logits/chosen": -17697033.6, "logits/rejected": -62089957.333333336, "logps/chosen": -220.8400390625, "logps/rejected": -547.4839680989584, "loss": 0.2461, "rewards/chosen": 0.990573501586914, "rewards/margins": 3.6934085845947267, "rewards/rejected": -2.7028350830078125, "step": 14998 }, { "epoch": 0.7950070230302387, "grad_norm": 32.5, "kl": 3.4756832122802734, "learning_rate": 5e-07, "logits/chosen": -27123651.2, "logits/rejected": -37706389.333333336, "logps/chosen": -115.2392333984375, "logps/rejected": -523.3983561197916, "loss": 0.3729, "rewards/chosen": 0.18511768579483032, "rewards/margins": 3.3468058546384176, "rewards/rejected": -3.1616881688435874, "step": 14999 }, { "epoch": 0.7950600270320409, "grad_norm": 44.75, "kl": 1.6518340110778809, "learning_rate": 5e-07, "logits/chosen": -20118393.6, "logits/rejected": -25734432.0, "logps/chosen": -163.043505859375, "logps/rejected": -241.24576822916666, "loss": 0.3783, "rewards/chosen": 0.1739100217819214, "rewards/margins": 2.3314064741134644, "rewards/rejected": -2.157496452331543, "step": 15000 }, { "epoch": 0.795113031033843, "grad_norm": 38.0, "kl": 1.4990167617797852, "learning_rate": 5e-07, "logits/chosen": -15657338.666666666, "logits/rejected": -8314605.0, "logps/chosen": -261.52964274088544, "logps/rejected": -140.03155517578125, "loss": 0.2039, "rewards/chosen": 1.436251958211263, "rewards/margins": 5.8249281247456866, "rewards/rejected": -4.388676166534424, "step": 15001 }, { "epoch": 0.7951660350356452, "grad_norm": 56.75, "kl": 0.9881858825683594, "learning_rate": 5e-07, "logits/chosen": -77284064.0, "logits/rejected": -2260590.6666666665, "logps/chosen": -291.00323486328125, "logps/rejected": -261.8829345703125, "loss": 0.27, "rewards/chosen": 0.11641769111156464, "rewards/margins": 1.7979979167381923, "rewards/rejected": -1.6815802256266277, "step": 15002 }, { "epoch": 0.7952190390374473, "grad_norm": 40.25, "kl": 2.407907485961914, "learning_rate": 5e-07, "logits/chosen": -5532029.0, "logits/rejected": -32189256.0, "logps/chosen": -341.22467041015625, "logps/rejected": -287.2174987792969, "loss": 0.3004, "rewards/chosen": 1.0054813623428345, "rewards/margins": 2.9056549072265625, "rewards/rejected": -1.900173544883728, "step": 15003 }, { "epoch": 0.7952720430392495, "grad_norm": 36.75, "kl": 1.0686101913452148, "learning_rate": 5e-07, "logits/chosen": -19905710.0, "logits/rejected": -8272660.5, "logps/chosen": -214.2969970703125, "logps/rejected": -259.99285888671875, "loss": 0.185, "rewards/chosen": 0.9146604537963867, "rewards/margins": 4.028864860534668, "rewards/rejected": -3.1142044067382812, "step": 15004 }, { "epoch": 0.7953250470410516, "grad_norm": 29.25, "kl": 3.5878353118896484, "learning_rate": 5e-07, "logits/chosen": -22017530.666666668, "logits/rejected": -4739226.4, "logps/chosen": -778.2549641927084, "logps/rejected": -189.29456787109376, "loss": 0.1474, "rewards/chosen": 1.941506067911784, "rewards/margins": 4.64866320292155, "rewards/rejected": -2.7071571350097656, "step": 15005 }, { "epoch": 0.7953780510428538, "grad_norm": 35.25, "kl": 0.9518594741821289, "learning_rate": 5e-07, "logits/chosen": -11200453.333333334, "logits/rejected": -25270776.0, "logps/chosen": -192.2082722981771, "logps/rejected": -310.81767578125, "loss": 0.2274, "rewards/chosen": 0.8165807723999023, "rewards/margins": 2.9914695739746096, "rewards/rejected": -2.174888801574707, "step": 15006 }, { "epoch": 0.7954310550446558, "grad_norm": 52.25, "kl": 1.9699268341064453, "learning_rate": 5e-07, "logits/chosen": -34925884.0, "logits/rejected": -49834180.0, "logps/chosen": -336.0220947265625, "logps/rejected": -329.9674377441406, "loss": 0.4098, "rewards/chosen": -0.21449652314186096, "rewards/margins": 2.2521398961544037, "rewards/rejected": -2.4666364192962646, "step": 15007 }, { "epoch": 0.795484059046458, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11722307.0, "logits/rejected": -15790460.0, "logps/chosen": -277.7696533203125, "logps/rejected": -283.7527262369792, "loss": 0.2365, "rewards/chosen": -0.3544151186943054, "rewards/margins": 1.9906906882921853, "rewards/rejected": -2.3451058069864907, "step": 15008 }, { "epoch": 0.7955370630482601, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34088288.0, "logits/rejected": -26804668.0, "logps/chosen": -206.02938842773438, "logps/rejected": -449.5338134765625, "loss": 0.2029, "rewards/chosen": 0.5460029244422913, "rewards/margins": 4.069491922855377, "rewards/rejected": -3.523488998413086, "step": 15009 }, { "epoch": 0.7955900670500623, "grad_norm": 67.0, "kl": 1.760904312133789, "learning_rate": 5e-07, "logits/chosen": -44555833.6, "logits/rejected": -3846471.3333333335, "logps/chosen": -266.9212646484375, "logps/rejected": -458.0208333333333, "loss": 0.3411, "rewards/chosen": 0.34217774868011475, "rewards/margins": 2.7509411573410034, "rewards/rejected": -2.4087634086608887, "step": 15010 }, { "epoch": 0.7956430710518644, "grad_norm": 44.25, "kl": 0.43395519256591797, "learning_rate": 5e-07, "logits/chosen": 2687726.4, "logits/rejected": 961023.6666666666, "logps/chosen": -134.0901611328125, "logps/rejected": -128.27226765950522, "loss": 0.3595, "rewards/chosen": 0.2384413242340088, "rewards/margins": 2.160926675796509, "rewards/rejected": -1.9224853515625, "step": 15011 }, { "epoch": 0.7956960750536666, "grad_norm": 47.75, "kl": 0.8142738342285156, "learning_rate": 5e-07, "logits/chosen": -31008236.8, "logits/rejected": -13840813.333333334, "logps/chosen": -383.968017578125, "logps/rejected": -391.8643391927083, "loss": 0.2661, "rewards/chosen": 0.543470811843872, "rewards/margins": 3.5481632073720295, "rewards/rejected": -3.0046923955281577, "step": 15012 }, { "epoch": 0.7957490790554687, "grad_norm": 36.25, "kl": 2.1374053955078125, "learning_rate": 5e-07, "logits/chosen": -367884.5, "logits/rejected": -31830466.0, "logps/chosen": -82.72063446044922, "logps/rejected": -331.207763671875, "loss": 0.2535, "rewards/chosen": 0.8623797297477722, "rewards/margins": 3.166808068752289, "rewards/rejected": -2.3044283390045166, "step": 15013 }, { "epoch": 0.7958020830572708, "grad_norm": 62.75, "kl": 0.8754425048828125, "learning_rate": 5e-07, "logits/chosen": -24570273.6, "logits/rejected": -48017642.666666664, "logps/chosen": -310.004638671875, "logps/rejected": -375.7662760416667, "loss": 0.3403, "rewards/chosen": 0.4048623561859131, "rewards/margins": 2.4014071941375734, "rewards/rejected": -1.9965448379516602, "step": 15014 }, { "epoch": 0.7958550870590729, "grad_norm": 33.25, "kl": 0.8360090255737305, "learning_rate": 5e-07, "logits/chosen": 11579.0, "logits/rejected": 526098.6666666666, "logps/chosen": -167.0538330078125, "logps/rejected": -382.6895751953125, "loss": 0.1087, "rewards/chosen": 1.595613956451416, "rewards/margins": 4.708347797393799, "rewards/rejected": -3.112733840942383, "step": 15015 }, { "epoch": 0.7959080910608751, "grad_norm": 42.75, "kl": 3.8605499267578125, "learning_rate": 5e-07, "logits/chosen": -34825748.0, "logits/rejected": -51176176.0, "logps/chosen": -382.83282470703125, "logps/rejected": -465.9678039550781, "loss": 0.2242, "rewards/chosen": 1.1865017414093018, "rewards/margins": 3.5199594497680664, "rewards/rejected": -2.3334577083587646, "step": 15016 }, { "epoch": 0.7959610950626772, "grad_norm": 40.75, "kl": 1.2070674896240234, "learning_rate": 5e-07, "logits/chosen": -37448696.0, "logits/rejected": -59260424.0, "logps/chosen": -186.41615295410156, "logps/rejected": -466.88470458984375, "loss": 0.2888, "rewards/chosen": 0.18239755928516388, "rewards/margins": 2.8794607669115067, "rewards/rejected": -2.6970632076263428, "step": 15017 }, { "epoch": 0.7960140990644794, "grad_norm": 77.0, "kl": 3.27630615234375, "learning_rate": 5e-07, "logits/chosen": -18190344.0, "logits/rejected": -22460538.0, "logps/chosen": -215.79391479492188, "logps/rejected": -331.58380126953125, "loss": 0.3413, "rewards/chosen": 0.4268094599246979, "rewards/margins": 1.8688508570194244, "rewards/rejected": -1.4420413970947266, "step": 15018 }, { "epoch": 0.7960671030662815, "grad_norm": 28.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4105487.3333333335, "logits/rejected": -32132300.8, "logps/chosen": -240.26383463541666, "logps/rejected": -396.1270751953125, "loss": 0.1154, "rewards/chosen": 1.542945384979248, "rewards/margins": 4.444050121307373, "rewards/rejected": -2.901104736328125, "step": 15019 }, { "epoch": 0.7961201070680837, "grad_norm": 54.75, "kl": 4.115804672241211, "learning_rate": 5e-07, "logits/chosen": -19088878.666666668, "logits/rejected": -4225786.8, "logps/chosen": -210.44466145833334, "logps/rejected": -329.2775390625, "loss": 0.2359, "rewards/chosen": 1.0902475516001384, "rewards/margins": 3.3890691916147873, "rewards/rejected": -2.2988216400146486, "step": 15020 }, { "epoch": 0.7961731110698858, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30397874.666666668, "logits/rejected": -58109209.6, "logps/chosen": -413.9135335286458, "logps/rejected": -324.497998046875, "loss": 0.2461, "rewards/chosen": 0.4484110673268636, "rewards/margins": 3.3988112290700276, "rewards/rejected": -2.950400161743164, "step": 15021 }, { "epoch": 0.796226115071688, "grad_norm": 26.125, "kl": 1.784113883972168, "learning_rate": 5e-07, "logits/chosen": -13521345.333333334, "logits/rejected": -36591462.4, "logps/chosen": -71.44538879394531, "logps/rejected": -299.8625, "loss": 0.2759, "rewards/chosen": 0.24158714214960733, "rewards/margins": 3.1314791480700173, "rewards/rejected": -2.88989200592041, "step": 15022 }, { "epoch": 0.79627911907349, "grad_norm": 35.75, "kl": 1.3454532623291016, "learning_rate": 5e-07, "logits/chosen": -11694170.0, "logits/rejected": -26322914.666666668, "logps/chosen": -230.00157165527344, "logps/rejected": -178.51505533854166, "loss": 0.2079, "rewards/chosen": 0.8502875566482544, "rewards/margins": 3.006797432899475, "rewards/rejected": -2.1565098762512207, "step": 15023 }, { "epoch": 0.7963321230752922, "grad_norm": 46.25, "kl": 2.3357467651367188, "learning_rate": 5e-07, "logits/chosen": 6496466.0, "logits/rejected": -35230072.0, "logps/chosen": -514.928271484375, "logps/rejected": -252.57759602864584, "loss": 0.2936, "rewards/chosen": 0.9450994491577148, "rewards/margins": 3.4104823430379234, "rewards/rejected": -2.4653828938802085, "step": 15024 }, { "epoch": 0.7963851270770943, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88676800.0, "logits/rejected": -45736480.0, "logps/chosen": -800.8571166992188, "logps/rejected": -330.84633382161456, "loss": 0.1758, "rewards/chosen": 0.8557944893836975, "rewards/margins": 3.430991232395172, "rewards/rejected": -2.5751967430114746, "step": 15025 }, { "epoch": 0.7964381310788965, "grad_norm": 54.75, "kl": 3.2202606201171875, "learning_rate": 5e-07, "logits/chosen": -10106957.333333334, "logits/rejected": -39977288.0, "logps/chosen": -270.427734375, "logps/rejected": -454.9974060058594, "loss": 0.3081, "rewards/chosen": 0.8981602986653646, "rewards/margins": 4.273885567982991, "rewards/rejected": -3.375725269317627, "step": 15026 }, { "epoch": 0.7964911350806986, "grad_norm": 46.5, "kl": 2.5863723754882812, "learning_rate": 5e-07, "logits/chosen": -19584360.0, "logits/rejected": -1721156.0, "logps/chosen": -208.82357788085938, "logps/rejected": -206.673828125, "loss": 0.3848, "rewards/chosen": 0.29625335335731506, "rewards/margins": 2.643758922815323, "rewards/rejected": -2.347505569458008, "step": 15027 }, { "epoch": 0.7965441390825008, "grad_norm": 44.5, "kl": 1.9123544692993164, "learning_rate": 5e-07, "logits/chosen": -14316169.6, "logits/rejected": -2043842.1666666667, "logps/chosen": -213.628857421875, "logps/rejected": -213.03165690104166, "loss": 0.3552, "rewards/chosen": 0.3552859783172607, "rewards/margins": 2.5803032080332438, "rewards/rejected": -2.225017229715983, "step": 15028 }, { "epoch": 0.7965971430843029, "grad_norm": 49.5, "kl": 1.0180473327636719, "learning_rate": 5e-07, "logits/chosen": -32389241.6, "logits/rejected": -24479600.0, "logps/chosen": -479.40068359375, "logps/rejected": -212.0589803059896, "loss": 0.2232, "rewards/chosen": 1.3733728408813477, "rewards/margins": 3.303109614054362, "rewards/rejected": -1.9297367731730144, "step": 15029 }, { "epoch": 0.796650147086105, "grad_norm": 32.25, "kl": 0.5454216003417969, "learning_rate": 5e-07, "logits/chosen": -1578611.25, "logits/rejected": -32506594.666666668, "logps/chosen": -114.58990478515625, "logps/rejected": -496.0992024739583, "loss": 0.1478, "rewards/chosen": 0.4027307629585266, "rewards/margins": 3.8646397391955056, "rewards/rejected": -3.461908976236979, "step": 15030 }, { "epoch": 0.7967031510879071, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21440926.0, "logps/chosen": -356.7049560546875, "loss": 0.3687, "rewards/chosen": 0.825431227684021, "step": 15031 }, { "epoch": 0.7967561550897093, "grad_norm": 53.5, "kl": 0.06809520721435547, "learning_rate": 5e-07, "logits/chosen": 3868683.0, "logits/rejected": 12535780.0, "logps/chosen": -126.59759521484375, "logps/rejected": -296.96038818359375, "loss": 0.3609, "rewards/chosen": 0.036572642624378204, "rewards/margins": 1.7103797867894173, "rewards/rejected": -1.673807144165039, "step": 15032 }, { "epoch": 0.7968091590915114, "grad_norm": 76.0, "kl": 1.6237096786499023, "learning_rate": 5e-07, "logits/chosen": -15474992.0, "logits/rejected": -933346.375, "logps/chosen": -279.7115478515625, "logps/rejected": -166.0988006591797, "loss": 0.3216, "rewards/chosen": 0.7053316235542297, "rewards/margins": 2.1045204997062683, "rewards/rejected": -1.3991888761520386, "step": 15033 }, { "epoch": 0.7968621630933136, "grad_norm": 63.5, "kl": 2.714385986328125, "learning_rate": 5e-07, "logits/chosen": -13247692.0, "logits/rejected": -34901984.0, "logps/chosen": -320.26171875, "logps/rejected": -339.2744140625, "loss": 0.2772, "rewards/chosen": 1.0159121751785278, "rewards/margins": 2.939326763153076, "rewards/rejected": -1.9234145879745483, "step": 15034 }, { "epoch": 0.7969151670951157, "grad_norm": 41.5, "kl": 1.025038719177246, "learning_rate": 5e-07, "logits/chosen": -31327628.0, "logits/rejected": 17406790.0, "logps/chosen": -139.11904907226562, "logps/rejected": -146.94241333007812, "loss": 0.2662, "rewards/chosen": 0.25302258133888245, "rewards/margins": 3.080075651407242, "rewards/rejected": -2.8270530700683594, "step": 15035 }, { "epoch": 0.7969681710969179, "grad_norm": 36.0, "kl": 0.8788108825683594, "learning_rate": 5e-07, "logits/chosen": 11800547.0, "logits/rejected": -31408054.0, "logps/chosen": -150.85218811035156, "logps/rejected": -324.55926513671875, "loss": 0.2352, "rewards/chosen": 0.5770397186279297, "rewards/margins": 3.6000587940216064, "rewards/rejected": -3.0230190753936768, "step": 15036 }, { "epoch": 0.79702117509872, "grad_norm": 49.5, "kl": 0.4489288330078125, "learning_rate": 5e-07, "logits/chosen": 10535176.0, "logits/rejected": -28148556.8, "logps/chosen": -269.0537923177083, "logps/rejected": -371.828759765625, "loss": 0.1965, "rewards/chosen": 0.4856869379679362, "rewards/margins": 4.417695299784342, "rewards/rejected": -3.9320083618164063, "step": 15037 }, { "epoch": 0.7970741791005221, "grad_norm": 41.75, "kl": 2.18817138671875, "learning_rate": 5e-07, "logits/chosen": -25198082.0, "logits/rejected": -609316.25, "logps/chosen": -244.3310546875, "logps/rejected": -146.0447235107422, "loss": 0.2866, "rewards/chosen": 1.5232704877853394, "rewards/margins": 2.9109121561050415, "rewards/rejected": -1.3876416683197021, "step": 15038 }, { "epoch": 0.7971271831023242, "grad_norm": 55.75, "kl": 2.54793643951416, "learning_rate": 5e-07, "logits/chosen": -87218032.0, "logits/rejected": -5013917.0, "logps/chosen": -325.67962646484375, "logps/rejected": -224.39686584472656, "loss": 0.4308, "rewards/chosen": 0.05901620785395304, "rewards/margins": 3.5061750213305154, "rewards/rejected": -3.4471588134765625, "step": 15039 }, { "epoch": 0.7971801871041264, "grad_norm": 66.5, "kl": 2.9893722534179688, "learning_rate": 5e-07, "logits/chosen": -13142253.333333334, "logits/rejected": -44700368.0, "logps/chosen": -317.2130533854167, "logps/rejected": -226.67848205566406, "loss": 0.4459, "rewards/chosen": 0.43105022112528485, "rewards/margins": 1.3233101765314739, "rewards/rejected": -0.892259955406189, "step": 15040 }, { "epoch": 0.7972331911059285, "grad_norm": 47.75, "kl": 1.4183616638183594, "learning_rate": 5e-07, "logits/chosen": -26477376.0, "logits/rejected": 13100054.666666666, "logps/chosen": -180.6009521484375, "logps/rejected": -74.08727010091145, "loss": 0.3487, "rewards/chosen": 0.5723155498504638, "rewards/margins": 2.2896655559539796, "rewards/rejected": -1.7173500061035156, "step": 15041 }, { "epoch": 0.7972861951077307, "grad_norm": 43.25, "kl": 2.0373287200927734, "learning_rate": 5e-07, "logits/chosen": -16603896.0, "logits/rejected": -31324499.2, "logps/chosen": -333.9777425130208, "logps/rejected": -241.4018310546875, "loss": 0.2986, "rewards/chosen": 0.9058722654978434, "rewards/margins": 2.283329693476359, "rewards/rejected": -1.3774574279785157, "step": 15042 }, { "epoch": 0.7973391991095328, "grad_norm": 34.75, "kl": 3.653156280517578, "learning_rate": 5e-07, "logits/chosen": -20745962.0, "logits/rejected": -26715792.0, "logps/chosen": -1393.362060546875, "logps/rejected": -269.6348063151042, "loss": 0.1054, "rewards/chosen": 2.4339020252227783, "rewards/margins": 5.338417291641235, "rewards/rejected": -2.904515266418457, "step": 15043 }, { "epoch": 0.7973922031113349, "grad_norm": 44.5, "kl": 4.937657356262207, "learning_rate": 5e-07, "logits/chosen": -4780630.4, "logits/rejected": 50900464.0, "logps/chosen": -359.8723876953125, "logps/rejected": -789.1282552083334, "loss": 0.2725, "rewards/chosen": 1.2335176467895508, "rewards/margins": 4.536223411560059, "rewards/rejected": -3.302705764770508, "step": 15044 }, { "epoch": 0.797445207113137, "grad_norm": 30.25, "kl": 0.03814983367919922, "learning_rate": 5e-07, "logits/chosen": 1955181.75, "logits/rejected": -9164080.666666666, "logps/chosen": -162.64794921875, "logps/rejected": -188.89847819010416, "loss": 0.2605, "rewards/chosen": 0.41857069730758667, "rewards/margins": 2.7797524333000183, "rewards/rejected": -2.3611817359924316, "step": 15045 }, { "epoch": 0.7974982111149391, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11991583.0, "logits/rejected": -15536584.0, "logps/chosen": -87.74687194824219, "logps/rejected": -352.6195475260417, "loss": 0.3248, "rewards/chosen": -0.628903865814209, "rewards/margins": 1.1060949961344402, "rewards/rejected": -1.7349988619486492, "step": 15046 }, { "epoch": 0.7975512151167413, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 820021.125, "logits/rejected": -10427115.333333334, "logps/chosen": -230.04110717773438, "logps/rejected": -197.59735107421875, "loss": 0.3341, "rewards/chosen": -0.6267669796943665, "rewards/margins": 1.1628089149792988, "rewards/rejected": -1.7895758946736653, "step": 15047 }, { "epoch": 0.7976042191185434, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32655946.666666668, "logits/rejected": -21426256.0, "logps/chosen": -297.3676350911458, "logps/rejected": -520.8078125, "loss": 0.1902, "rewards/chosen": 0.5581227540969849, "rewards/margins": 3.9988526582717894, "rewards/rejected": -3.4407299041748045, "step": 15048 }, { "epoch": 0.7976572231203456, "grad_norm": 84.0, "kl": 7.985435485839844, "learning_rate": 5e-07, "logits/chosen": -36803544.0, "logits/rejected": -8597899.0, "logps/chosen": -601.47412109375, "logps/rejected": -192.73605346679688, "loss": 0.3092, "rewards/chosen": 1.559578537940979, "rewards/margins": 3.7453240156173706, "rewards/rejected": -2.1857454776763916, "step": 15049 }, { "epoch": 0.7977102271221477, "grad_norm": 65.0, "kl": 0.458282470703125, "learning_rate": 5e-07, "logits/chosen": -35960410.666666664, "logits/rejected": -16362608.0, "logps/chosen": -567.4254557291666, "logps/rejected": -203.3509033203125, "loss": 0.3491, "rewards/chosen": 0.04248708486557007, "rewards/margins": 1.5778885245323182, "rewards/rejected": -1.5354014396667481, "step": 15050 }, { "epoch": 0.7977632311239499, "grad_norm": 75.0, "kl": 1.9488677978515625, "learning_rate": 5e-07, "logits/chosen": -33063738.666666668, "logits/rejected": 22166024.0, "logps/chosen": -334.12017822265625, "logps/rejected": -512.9891357421875, "loss": 0.3551, "rewards/chosen": 0.558448870976766, "rewards/margins": 2.390864690144857, "rewards/rejected": -1.8324158191680908, "step": 15051 }, { "epoch": 0.797816235125752, "grad_norm": 41.5, "kl": 1.2917060852050781, "learning_rate": 5e-07, "logits/chosen": -23942560.0, "logits/rejected": -28171264.0, "logps/chosen": -294.8141174316406, "logps/rejected": -203.05208333333334, "loss": 0.2586, "rewards/chosen": 0.3564831018447876, "rewards/margins": 1.9977445205052693, "rewards/rejected": -1.6412614186604817, "step": 15052 }, { "epoch": 0.7978692391275541, "grad_norm": 73.0, "kl": 1.3149547576904297, "learning_rate": 5e-07, "logits/chosen": -1508348.25, "logits/rejected": -44276716.0, "logps/chosen": -257.53790283203125, "logps/rejected": -327.00762939453125, "loss": 0.2277, "rewards/chosen": 1.484344720840454, "rewards/margins": 3.2411916255950928, "rewards/rejected": -1.7568469047546387, "step": 15053 }, { "epoch": 0.7979222431293562, "grad_norm": 66.0, "kl": 2.2516050338745117, "learning_rate": 5e-07, "logits/chosen": -26061698.666666668, "logits/rejected": -22971472.0, "logps/chosen": -345.0045166015625, "logps/rejected": -184.31541442871094, "loss": 0.3298, "rewards/chosen": 0.6868008772532145, "rewards/margins": 3.8033804098765054, "rewards/rejected": -3.116579532623291, "step": 15054 }, { "epoch": 0.7979752471311584, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15389324.0, "logits/rejected": -20162728.0, "logps/chosen": -279.2141418457031, "logps/rejected": -567.6533203125, "loss": 0.2264, "rewards/chosen": 0.6765142679214478, "rewards/margins": 3.808977246284485, "rewards/rejected": -3.132462978363037, "step": 15055 }, { "epoch": 0.7980282511329605, "grad_norm": 37.5, "kl": 1.3597240447998047, "learning_rate": 5e-07, "logits/chosen": 3709373.0, "logits/rejected": -26030598.85714286, "logps/chosen": -37.03130340576172, "logps/rejected": -287.8358677455357, "loss": 0.1348, "rewards/chosen": 1.308214545249939, "rewards/margins": 3.489428469112941, "rewards/rejected": -2.181213923863002, "step": 15056 }, { "epoch": 0.7980812551347627, "grad_norm": 41.25, "kl": 0.0536956787109375, "learning_rate": 5e-07, "logits/chosen": -49530581.333333336, "logits/rejected": -42890092.8, "logps/chosen": -384.8616536458333, "logps/rejected": -397.8708984375, "loss": 0.1674, "rewards/chosen": 0.6586222251256307, "rewards/margins": 4.035274751981099, "rewards/rejected": -3.3766525268554686, "step": 15057 }, { "epoch": 0.7981342591365648, "grad_norm": 38.0, "kl": 0.3480978012084961, "learning_rate": 5e-07, "logits/chosen": -23376216.0, "logits/rejected": -44650368.0, "logps/chosen": -177.50775146484375, "logps/rejected": -303.39813232421875, "loss": 0.287, "rewards/chosen": 0.36807674169540405, "rewards/margins": 2.8838791251182556, "rewards/rejected": -2.5158023834228516, "step": 15058 }, { "epoch": 0.798187263138367, "grad_norm": 41.0, "kl": 1.6315650939941406, "learning_rate": 5e-07, "logits/chosen": -25211592.0, "logits/rejected": -12887499.2, "logps/chosen": -219.24137369791666, "logps/rejected": -253.6833984375, "loss": 0.297, "rewards/chosen": -0.033247376481691994, "rewards/margins": 2.012882231672605, "rewards/rejected": -2.046129608154297, "step": 15059 }, { "epoch": 0.798240267140169, "grad_norm": 51.5, "kl": 3.9779281616210938, "learning_rate": 5e-07, "logits/chosen": -25411426.666666668, "logits/rejected": -667798.5, "logps/chosen": -236.26529947916666, "logps/rejected": -244.40658569335938, "loss": 0.3665, "rewards/chosen": 0.7317715485890707, "rewards/margins": 3.3387747605641684, "rewards/rejected": -2.6070032119750977, "step": 15060 }, { "epoch": 0.7982932711419712, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -126291424.0, "logits/rejected": 11576281.6, "logps/chosen": -473.1516927083333, "logps/rejected": -230.780322265625, "loss": 0.3712, "rewards/chosen": 0.35283203919728595, "rewards/margins": 1.0121682723363241, "rewards/rejected": -0.6593362331390381, "step": 15061 }, { "epoch": 0.7983462751437733, "grad_norm": 31.125, "kl": 2.1756057739257812, "learning_rate": 5e-07, "logits/chosen": 3924981.3333333335, "logits/rejected": -34016230.4, "logps/chosen": -37.646280924479164, "logps/rejected": -205.05576171875, "loss": 0.3426, "rewards/chosen": 0.8805471261342367, "rewards/margins": 2.0770604928334553, "rewards/rejected": -1.1965133666992187, "step": 15062 }, { "epoch": 0.7983992791455755, "grad_norm": 46.75, "kl": 3.620769500732422, "learning_rate": 5e-07, "logits/chosen": -20707789.333333332, "logits/rejected": -6509804.0, "logps/chosen": -235.1397705078125, "logps/rejected": -344.0975036621094, "loss": 0.4005, "rewards/chosen": 0.5714619159698486, "rewards/margins": 2.353913187980652, "rewards/rejected": -1.7824512720108032, "step": 15063 }, { "epoch": 0.7984522831473776, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25805608.0, "logits/rejected": -7236096.0, "logps/chosen": -271.37054443359375, "logps/rejected": -123.48485565185547, "loss": 0.2732, "rewards/chosen": 1.1219030618667603, "rewards/margins": 2.929356575012207, "rewards/rejected": -1.8074535131454468, "step": 15064 }, { "epoch": 0.7985052871491798, "grad_norm": 50.25, "kl": 2.5823135375976562, "learning_rate": 5e-07, "logits/chosen": -37257088.0, "logits/rejected": -25683356.0, "logps/chosen": -298.9991048177083, "logps/rejected": -515.01513671875, "loss": 0.3705, "rewards/chosen": 0.5507886409759521, "rewards/margins": 3.3891003131866455, "rewards/rejected": -2.8383116722106934, "step": 15065 }, { "epoch": 0.7985582911509819, "grad_norm": 36.25, "kl": 0.1827545166015625, "learning_rate": 5e-07, "logits/chosen": -21420050.0, "logits/rejected": -9823833.0, "logps/chosen": -252.22027587890625, "logps/rejected": -194.2616729736328, "loss": 0.2601, "rewards/chosen": 1.1244001388549805, "rewards/margins": 3.596832752227783, "rewards/rejected": -2.4724326133728027, "step": 15066 }, { "epoch": 0.7986112951527841, "grad_norm": 73.0, "kl": 1.8840789794921875, "learning_rate": 5e-07, "logits/chosen": -20135984.0, "logits/rejected": -24507752.0, "logps/chosen": -425.24124581473217, "logps/rejected": -285.9518737792969, "loss": 0.3273, "rewards/chosen": 0.80766419001988, "rewards/margins": 2.0690930230276923, "rewards/rejected": -1.2614288330078125, "step": 15067 }, { "epoch": 0.7986642991545861, "grad_norm": 50.0, "kl": 0.009061813354492188, "learning_rate": 5e-07, "logits/chosen": -18771738.0, "logits/rejected": -1204500.0, "logps/chosen": -50.54674530029297, "logps/rejected": -322.04807535807294, "loss": 0.2762, "rewards/chosen": 0.4011361002922058, "rewards/margins": 1.8588628967603047, "rewards/rejected": -1.4577267964680989, "step": 15068 }, { "epoch": 0.7987173031563883, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -139550538.66666666, "logits/rejected": -35291958.4, "logps/chosen": -331.04087320963544, "logps/rejected": -299.914013671875, "loss": 0.2942, "rewards/chosen": -0.4320339361826579, "rewards/margins": 2.4180000146230065, "rewards/rejected": -2.8500339508056642, "step": 15069 }, { "epoch": 0.7987703071581904, "grad_norm": 39.5, "kl": 1.0951118469238281, "learning_rate": 5e-07, "logits/chosen": -24620256.0, "logits/rejected": -15295858.0, "logps/chosen": -336.04364013671875, "logps/rejected": -259.9875793457031, "loss": 0.1925, "rewards/chosen": 1.522270679473877, "rewards/margins": 4.64215087890625, "rewards/rejected": -3.119880199432373, "step": 15070 }, { "epoch": 0.7988233111599926, "grad_norm": 75.5, "kl": 0.8083553314208984, "learning_rate": 5e-07, "logits/chosen": 9787593.333333334, "logits/rejected": -18608953.6, "logps/chosen": -512.7814127604166, "logps/rejected": -302.030029296875, "loss": 0.239, "rewards/chosen": 0.7290106614430746, "rewards/margins": 2.379142077763875, "rewards/rejected": -1.6501314163208007, "step": 15071 }, { "epoch": 0.7988763151617947, "grad_norm": 23.875, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -17708860.0, "logps/rejected": -251.04124450683594, "loss": 0.1247, "rewards/rejected": -3.076785087585449, "step": 15072 }, { "epoch": 0.7989293191635969, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52184124.0, "logits/rejected": -5423413.0, "logps/chosen": -374.9412841796875, "logps/rejected": -228.54646809895834, "loss": 0.236, "rewards/chosen": 0.23304443061351776, "rewards/margins": 2.2263456632693606, "rewards/rejected": -1.993301232655843, "step": 15073 }, { "epoch": 0.798982323165399, "grad_norm": 54.75, "kl": 1.1684465408325195, "learning_rate": 5e-07, "logits/chosen": -26268293.333333332, "logits/rejected": -88187400.0, "logps/chosen": -171.68599446614584, "logps/rejected": -447.3779296875, "loss": 0.4414, "rewards/chosen": -0.08126677076021831, "rewards/margins": 2.025618185599645, "rewards/rejected": -2.1068849563598633, "step": 15074 }, { "epoch": 0.7990353271672012, "grad_norm": 40.25, "kl": 2.183108329772949, "learning_rate": 5e-07, "logits/chosen": -30911292.8, "logits/rejected": -40727306.666666664, "logps/chosen": -192.260595703125, "logps/rejected": -353.0456949869792, "loss": 0.3688, "rewards/chosen": 0.2040628433227539, "rewards/margins": 2.6925551732381185, "rewards/rejected": -2.4884923299153647, "step": 15075 }, { "epoch": 0.7990883311690032, "grad_norm": 41.75, "kl": 0.13335418701171875, "learning_rate": 5e-07, "logits/chosen": -22471096.0, "logits/rejected": -37684242.666666664, "logps/chosen": -116.1890380859375, "logps/rejected": -421.9355875651042, "loss": 0.3583, "rewards/chosen": 0.05221603512763977, "rewards/margins": 2.786883177359899, "rewards/rejected": -2.7346671422322593, "step": 15076 }, { "epoch": 0.7991413351708054, "grad_norm": 53.0, "kl": 3.537899971008301, "learning_rate": 5e-07, "logits/chosen": -41698236.8, "logits/rejected": -13359386.666666666, "logps/chosen": -512.35146484375, "logps/rejected": -134.06464640299478, "loss": 0.2727, "rewards/chosen": 1.1282973289489746, "rewards/margins": 2.5670822461446123, "rewards/rejected": -1.438784917195638, "step": 15077 }, { "epoch": 0.7991943391726075, "grad_norm": 40.0, "kl": 1.2808780670166016, "learning_rate": 5e-07, "logits/chosen": -61588371.2, "logits/rejected": -11502248.0, "logps/chosen": -205.64970703125, "logps/rejected": -305.6842447916667, "loss": 0.2891, "rewards/chosen": 0.5931657314300537, "rewards/margins": 2.739885981877645, "rewards/rejected": -2.1467202504475913, "step": 15078 }, { "epoch": 0.7992473431744097, "grad_norm": 51.25, "kl": 2.17266845703125, "learning_rate": 5e-07, "logits/chosen": -92799048.0, "logits/rejected": -26837456.0, "logps/chosen": -317.22283935546875, "logps/rejected": -277.14617919921875, "loss": 0.2529, "rewards/chosen": 0.7561763525009155, "rewards/margins": 3.765957474708557, "rewards/rejected": -3.0097811222076416, "step": 15079 }, { "epoch": 0.7993003471762118, "grad_norm": 45.5, "kl": 0.3947896957397461, "learning_rate": 5e-07, "logits/chosen": -9003532.0, "logits/rejected": -9345372.8, "logps/chosen": -273.1862386067708, "logps/rejected": -306.8984375, "loss": 0.2172, "rewards/chosen": 0.532754103342692, "rewards/margins": 3.962161604563395, "rewards/rejected": -3.429407501220703, "step": 15080 }, { "epoch": 0.799353351178014, "grad_norm": 40.25, "kl": 4.782744407653809, "learning_rate": 5e-07, "logits/chosen": -27847926.4, "logits/rejected": -68741541.33333333, "logps/chosen": -385.4259521484375, "logps/rejected": -536.2921549479166, "loss": 0.2398, "rewards/chosen": 1.2263833045959474, "rewards/margins": 4.493591086069743, "rewards/rejected": -3.2672077814737954, "step": 15081 }, { "epoch": 0.7994063551798161, "grad_norm": 31.125, "kl": 2.4267215728759766, "learning_rate": 5e-07, "logits/chosen": -17483720.0, "logits/rejected": -16073070.666666666, "logps/chosen": -212.88710021972656, "logps/rejected": -188.87691243489584, "loss": 0.1472, "rewards/chosen": 1.2115780115127563, "rewards/margins": 4.015353163083395, "rewards/rejected": -2.803775151570638, "step": 15082 }, { "epoch": 0.7994593591816183, "grad_norm": 50.5, "kl": 2.4938955307006836, "learning_rate": 5e-07, "logits/chosen": -22455880.0, "logits/rejected": -6654647.0, "logps/chosen": -586.8388264973959, "logps/rejected": -292.81402587890625, "loss": 0.3263, "rewards/chosen": 1.0150771141052246, "rewards/margins": 5.26238489151001, "rewards/rejected": -4.247307777404785, "step": 15083 }, { "epoch": 0.7995123631834203, "grad_norm": 68.5, "kl": 1.5450057983398438, "learning_rate": 5e-07, "logits/chosen": -28218253.333333332, "logits/rejected": -40383033.6, "logps/chosen": -278.84751383463544, "logps/rejected": -239.332177734375, "loss": 0.2465, "rewards/chosen": 0.6142543951670328, "rewards/margins": 2.6953212896982826, "rewards/rejected": -2.08106689453125, "step": 15084 }, { "epoch": 0.7995653671852225, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -132477176.0, "logits/rejected": -38761608.0, "logps/chosen": -406.82086181640625, "logps/rejected": -398.63201904296875, "loss": 0.2688, "rewards/chosen": 0.4897777736186981, "rewards/margins": 2.632095068693161, "rewards/rejected": -2.142317295074463, "step": 15085 }, { "epoch": 0.7996183711870246, "grad_norm": 70.5, "kl": 4.559076309204102, "learning_rate": 5e-07, "logits/chosen": 71523264.0, "logits/rejected": -48098122.666666664, "logps/chosen": -424.913916015625, "logps/rejected": -146.021484375, "loss": 0.5053, "rewards/chosen": 0.45136513710021975, "rewards/margins": 0.8643156528472901, "rewards/rejected": -0.4129505157470703, "step": 15086 }, { "epoch": 0.7996713751888268, "grad_norm": 44.25, "kl": 0.8799333572387695, "learning_rate": 5e-07, "logits/chosen": -32481168.0, "logits/rejected": -16551700.0, "logps/chosen": -173.6655731201172, "logps/rejected": -202.1101837158203, "loss": 0.3128, "rewards/chosen": 0.3838922679424286, "rewards/margins": 2.7326912581920624, "rewards/rejected": -2.348798990249634, "step": 15087 }, { "epoch": 0.7997243791906289, "grad_norm": 40.25, "kl": 2.216672897338867, "learning_rate": 5e-07, "logits/chosen": -4658502.5, "logits/rejected": -34909056.0, "logps/chosen": -173.67144775390625, "logps/rejected": -625.7882690429688, "loss": 0.2434, "rewards/chosen": 0.9283401966094971, "rewards/margins": 3.8805887699127197, "rewards/rejected": -2.9522485733032227, "step": 15088 }, { "epoch": 0.7997773831924311, "grad_norm": 59.5, "kl": 1.137054443359375, "learning_rate": 5e-07, "logits/chosen": -16605282.666666666, "logits/rejected": -32594536.0, "logps/chosen": -424.8931477864583, "logps/rejected": -124.13580322265625, "loss": 0.4079, "rewards/chosen": 0.030095080534617107, "rewards/margins": 2.510911444822947, "rewards/rejected": -2.48081636428833, "step": 15089 }, { "epoch": 0.7998303871942332, "grad_norm": 46.75, "kl": 3.386333465576172, "learning_rate": 5e-07, "logits/chosen": -11813954.0, "logits/rejected": -30311786.0, "logps/chosen": -190.940185546875, "logps/rejected": -363.25341796875, "loss": 0.1984, "rewards/chosen": 1.7100329399108887, "rewards/margins": 3.559159755706787, "rewards/rejected": -1.8491268157958984, "step": 15090 }, { "epoch": 0.7998833911960354, "grad_norm": 53.0, "kl": 0.019618988037109375, "learning_rate": 5e-07, "logits/chosen": -33480195.2, "logits/rejected": -15610117.333333334, "logps/chosen": -271.2671630859375, "logps/rejected": -147.3492431640625, "loss": 0.3609, "rewards/chosen": 0.14557907581329346, "rewards/margins": 2.676031486193339, "rewards/rejected": -2.5304524103800454, "step": 15091 }, { "epoch": 0.7999363951978374, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72500778.66666667, "logits/rejected": -30642915.2, "logps/chosen": -338.33612060546875, "logps/rejected": -434.8515625, "loss": 0.1681, "rewards/chosen": 0.9022556940714518, "rewards/margins": 4.084643618265788, "rewards/rejected": -3.182387924194336, "step": 15092 }, { "epoch": 0.7999893991996396, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -130611720.0, "logits/rejected": -10537906.666666666, "logps/chosen": -174.32696533203125, "logps/rejected": -281.3116048177083, "loss": 0.2615, "rewards/chosen": -0.4840965270996094, "rewards/margins": 1.873377799987793, "rewards/rejected": -2.3574743270874023, "step": 15093 }, { "epoch": 0.8000424032014417, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95291856.0, "logits/rejected": -11563271.0, "logps/chosen": -313.10540771484375, "logps/rejected": -187.82371520996094, "loss": 0.3689, "rewards/chosen": -0.13741493225097656, "rewards/margins": 1.7783606052398682, "rewards/rejected": -1.9157755374908447, "step": 15094 }, { "epoch": 0.8000954072032438, "grad_norm": 59.75, "kl": 2.7692222595214844, "learning_rate": 5e-07, "logits/chosen": -12539757.6, "logits/rejected": 28861765.333333332, "logps/chosen": -434.247216796875, "logps/rejected": -234.08548990885416, "loss": 0.3616, "rewards/chosen": 0.15405097007751464, "rewards/margins": 2.600342575709025, "rewards/rejected": -2.4462916056315103, "step": 15095 }, { "epoch": 0.800148411205046, "grad_norm": 58.0, "kl": 0.2784881591796875, "learning_rate": 5e-07, "logits/chosen": -33171414.0, "logits/rejected": 28599078.0, "logps/chosen": -213.39871215820312, "logps/rejected": -344.6330261230469, "loss": 0.206, "rewards/chosen": 0.9375863075256348, "rewards/margins": 3.343661069869995, "rewards/rejected": -2.4060747623443604, "step": 15096 }, { "epoch": 0.8002014152068481, "grad_norm": 54.5, "kl": 5.11505651473999, "learning_rate": 5e-07, "logits/chosen": -3737260.8, "logits/rejected": -33474416.0, "logps/chosen": -611.77783203125, "logps/rejected": -306.59682210286456, "loss": 0.3178, "rewards/chosen": 1.1399307250976562, "rewards/margins": 3.279296398162842, "rewards/rejected": -2.1393656730651855, "step": 15097 }, { "epoch": 0.8002544192086503, "grad_norm": 50.5, "kl": 0.5955562591552734, "learning_rate": 5e-07, "logits/chosen": -17872710.0, "logits/rejected": -56584104.0, "logps/chosen": -297.1478271484375, "logps/rejected": -499.8293762207031, "loss": 0.223, "rewards/chosen": 0.594422459602356, "rewards/margins": 3.6804338693618774, "rewards/rejected": -3.0860114097595215, "step": 15098 }, { "epoch": 0.8003074232104523, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11682942.0, "logits/rejected": -37261184.0, "logps/chosen": -263.9124755859375, "logps/rejected": -312.9393005371094, "loss": 0.2874, "rewards/chosen": 0.11840228736400604, "rewards/margins": 2.6972816437482834, "rewards/rejected": -2.5788793563842773, "step": 15099 }, { "epoch": 0.8003604272122545, "grad_norm": 56.75, "kl": 0.041400909423828125, "learning_rate": 5e-07, "logits/chosen": -42291468.8, "logits/rejected": -8172330.666666667, "logps/chosen": -327.6527099609375, "logps/rejected": -269.7655029296875, "loss": 0.3003, "rewards/chosen": 1.007878875732422, "rewards/margins": 2.4548439343770347, "rewards/rejected": -1.4469650586446126, "step": 15100 }, { "epoch": 0.8004134312140566, "grad_norm": 45.75, "kl": 2.084745407104492, "learning_rate": 5e-07, "logits/chosen": 4633604.0, "logits/rejected": -32343548.0, "logps/chosen": -93.69347381591797, "logps/rejected": -260.5447998046875, "loss": 0.283, "rewards/chosen": 0.5306026339530945, "rewards/margins": 3.2142497897148132, "rewards/rejected": -2.6836471557617188, "step": 15101 }, { "epoch": 0.8004664352158588, "grad_norm": 67.0, "kl": 0.8711814880371094, "learning_rate": 5e-07, "logits/chosen": -42870092.8, "logits/rejected": -14850544.0, "logps/chosen": -389.9380126953125, "logps/rejected": -586.5492757161459, "loss": 0.2764, "rewards/chosen": 0.6810003280639648, "rewards/margins": 4.026452318827311, "rewards/rejected": -3.345451990763346, "step": 15102 }, { "epoch": 0.8005194392176609, "grad_norm": 46.25, "kl": 0.7636337280273438, "learning_rate": 5e-07, "logits/chosen": -16188172.0, "logits/rejected": -16048191.0, "logps/chosen": -371.73956298828125, "logps/rejected": -150.8057403564453, "loss": 0.1809, "rewards/chosen": 1.271267294883728, "rewards/margins": 4.595858454704285, "rewards/rejected": -3.3245911598205566, "step": 15103 }, { "epoch": 0.8005724432194631, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23101350.0, "logits/rejected": 2285302.5, "logps/chosen": -196.37753295898438, "logps/rejected": -485.2693684895833, "loss": 0.1587, "rewards/chosen": 0.3746691048145294, "rewards/margins": 4.008331010739008, "rewards/rejected": -3.633661905924479, "step": 15104 }, { "epoch": 0.8006254472212652, "grad_norm": 52.75, "kl": 0.9515380859375, "learning_rate": 5e-07, "logits/chosen": -41946601.6, "logits/rejected": -8645833.333333334, "logps/chosen": -344.337451171875, "logps/rejected": -182.60664876302084, "loss": 0.4067, "rewards/chosen": -0.08635733127593995, "rewards/margins": 2.0604503393173217, "rewards/rejected": -2.1468076705932617, "step": 15105 }, { "epoch": 0.8006784512230674, "grad_norm": 54.5, "kl": 3.7660789489746094, "learning_rate": 5e-07, "logits/chosen": -67418432.0, "logits/rejected": -60674485.333333336, "logps/chosen": -410.05078125, "logps/rejected": -450.6495768229167, "loss": 0.2353, "rewards/chosen": 1.3073969841003419, "rewards/margins": 4.510991191864013, "rewards/rejected": -3.203594207763672, "step": 15106 }, { "epoch": 0.8007314552248694, "grad_norm": 47.5, "kl": 2.480195999145508, "learning_rate": 5e-07, "logits/chosen": -42587152.0, "logits/rejected": -4593013.333333333, "logps/chosen": -277.6138671875, "logps/rejected": -83.71649169921875, "loss": 0.3684, "rewards/chosen": 0.26960315704345705, "rewards/margins": 2.899802748362223, "rewards/rejected": -2.630199591318766, "step": 15107 }, { "epoch": 0.8007844592266716, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72300672.0, "logits/rejected": -73002234.66666667, "logps/chosen": -488.2685852050781, "logps/rejected": -337.23105875651044, "loss": 0.1921, "rewards/chosen": 0.23957443237304688, "rewards/margins": 2.792260011037191, "rewards/rejected": -2.552685578664144, "step": 15108 }, { "epoch": 0.8008374632284737, "grad_norm": 35.5, "kl": 1.8099899291992188, "learning_rate": 5e-07, "logits/chosen": 816214.25, "logits/rejected": -7825870.5, "logps/chosen": -182.05426025390625, "logps/rejected": -481.403076171875, "loss": 0.2114, "rewards/chosen": 0.8616102933883667, "rewards/margins": 5.828105807304382, "rewards/rejected": -4.966495513916016, "step": 15109 }, { "epoch": 0.8008904672302759, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25065968.0, "logits/rejected": -14788585.6, "logps/chosen": -531.1954752604166, "logps/rejected": -171.88538818359376, "loss": 0.2043, "rewards/chosen": 0.823766311009725, "rewards/margins": 3.6673187096913655, "rewards/rejected": -2.8435523986816404, "step": 15110 }, { "epoch": 0.800943471232078, "grad_norm": 51.75, "kl": 0.6433258056640625, "learning_rate": 5e-07, "logits/chosen": -66238392.0, "logits/rejected": -14962716.0, "logps/chosen": -810.1394653320312, "logps/rejected": -256.6508483886719, "loss": 0.1864, "rewards/chosen": 1.3646996021270752, "rewards/margins": 4.547635793685913, "rewards/rejected": -3.182936191558838, "step": 15111 }, { "epoch": 0.8009964752338802, "grad_norm": 53.75, "kl": 1.9018983840942383, "learning_rate": 5e-07, "logits/chosen": -5794172.0, "logits/rejected": -10438817.333333334, "logps/chosen": -284.5704650878906, "logps/rejected": -291.1282145182292, "loss": 0.1996, "rewards/chosen": 0.6025449633598328, "rewards/margins": 3.4050250252087912, "rewards/rejected": -2.8024800618489585, "step": 15112 }, { "epoch": 0.8010494792356823, "grad_norm": 51.25, "kl": 3.6316184997558594, "learning_rate": 5e-07, "logits/chosen": -10714246.857142856, "logits/rejected": 20924352.0, "logps/chosen": -197.86648995535714, "logps/rejected": -555.528564453125, "loss": 0.4602, "rewards/chosen": 0.44118213653564453, "rewards/margins": 2.9144060611724854, "rewards/rejected": -2.473223924636841, "step": 15113 }, { "epoch": 0.8011024832374845, "grad_norm": 38.75, "kl": 1.3399887084960938, "learning_rate": 5e-07, "logits/chosen": -9971826.4, "logits/rejected": -70137728.0, "logps/chosen": -115.701220703125, "logps/rejected": -556.6822509765625, "loss": 0.2833, "rewards/chosen": 0.3334357261657715, "rewards/margins": 5.663072299957276, "rewards/rejected": -5.329636573791504, "step": 15114 }, { "epoch": 0.8011554872392865, "grad_norm": 43.25, "kl": 1.4582414627075195, "learning_rate": 5e-07, "logits/chosen": -14466736.0, "logits/rejected": -7365296.666666667, "logps/chosen": -230.0747314453125, "logps/rejected": -113.22684733072917, "loss": 0.3018, "rewards/chosen": 0.6297124862670899, "rewards/margins": 2.9680208206176757, "rewards/rejected": -2.338308334350586, "step": 15115 }, { "epoch": 0.8012084912410887, "grad_norm": 53.5, "kl": 4.224109649658203, "learning_rate": 5e-07, "logits/chosen": -39276709.333333336, "logits/rejected": -22628814.4, "logps/chosen": -839.7661946614584, "logps/rejected": -245.969873046875, "loss": 0.2638, "rewards/chosen": 2.417574087778727, "rewards/margins": 3.405266157786051, "rewards/rejected": -0.9876920700073242, "step": 15116 }, { "epoch": 0.8012614952428908, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18245169.333333332, "logits/rejected": -23347508.8, "logps/chosen": -280.7368570963542, "logps/rejected": -253.546630859375, "loss": 0.2048, "rewards/chosen": 0.6919345060984293, "rewards/margins": 3.2098181883494057, "rewards/rejected": -2.5178836822509765, "step": 15117 }, { "epoch": 0.801314499244693, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17956936.0, "logits/rejected": -60478192.0, "logps/chosen": -244.26632690429688, "logps/rejected": -342.033203125, "loss": 0.1819, "rewards/chosen": 0.3423828184604645, "rewards/margins": 2.9859797855218253, "rewards/rejected": -2.643596967061361, "step": 15118 }, { "epoch": 0.8013675032464951, "grad_norm": 55.0, "kl": 1.3974618911743164, "learning_rate": 5e-07, "logits/chosen": -34292073.6, "logits/rejected": -21783686.666666668, "logps/chosen": -336.599267578125, "logps/rejected": -368.4124755859375, "loss": 0.3778, "rewards/chosen": 0.06497547626495362, "rewards/margins": 2.479761497179667, "rewards/rejected": -2.4147860209147134, "step": 15119 }, { "epoch": 0.8014205072482973, "grad_norm": 39.0, "kl": 0.6362018585205078, "learning_rate": 5e-07, "logits/chosen": -53478092.0, "logits/rejected": -18556612.0, "logps/chosen": -203.88075256347656, "logps/rejected": -362.64801025390625, "loss": 0.3207, "rewards/chosen": 0.27219733595848083, "rewards/margins": 2.3747960031032562, "rewards/rejected": -2.1025986671447754, "step": 15120 }, { "epoch": 0.8014735112500994, "grad_norm": 51.75, "kl": 2.1650352478027344, "learning_rate": 5e-07, "logits/chosen": -34731980.0, "logits/rejected": -7579722.0, "logps/chosen": -334.6649169921875, "logps/rejected": -228.24913024902344, "loss": 0.3279, "rewards/chosen": 0.8265380859375, "rewards/margins": 2.2782622575759888, "rewards/rejected": -1.4517241716384888, "step": 15121 }, { "epoch": 0.8015265152519016, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51255450.666666664, "logits/rejected": -13034242.4, "logps/chosen": -271.8964436848958, "logps/rejected": -167.94652099609374, "loss": 0.2611, "rewards/chosen": 0.39612770080566406, "rewards/margins": 2.8275598526000976, "rewards/rejected": -2.4314321517944335, "step": 15122 }, { "epoch": 0.8015795192537036, "grad_norm": 66.0, "kl": 0.6314620971679688, "learning_rate": 5e-07, "logits/chosen": -25107453.333333332, "logits/rejected": -36554652.8, "logps/chosen": -567.8561604817709, "logps/rejected": -728.135498046875, "loss": 0.111, "rewards/chosen": 1.521285057067871, "rewards/margins": 5.171236228942871, "rewards/rejected": -3.649951171875, "step": 15123 }, { "epoch": 0.8016325232555058, "grad_norm": 45.0, "kl": 1.2631263732910156, "learning_rate": 5e-07, "logits/chosen": 3422487.25, "logits/rejected": -35274032.0, "logps/chosen": -117.26919555664062, "logps/rejected": -199.68052673339844, "loss": 0.3242, "rewards/chosen": 0.457940936088562, "rewards/margins": 1.9468640089035034, "rewards/rejected": -1.4889230728149414, "step": 15124 }, { "epoch": 0.8016855272573079, "grad_norm": 51.0, "kl": 1.6750240325927734, "learning_rate": 5e-07, "logits/chosen": -55052448.0, "logits/rejected": -28314100.0, "logps/chosen": -265.9927164713542, "logps/rejected": -385.40850830078125, "loss": 0.4284, "rewards/chosen": 0.04833844800790151, "rewards/margins": 2.269117866953214, "rewards/rejected": -2.2207794189453125, "step": 15125 }, { "epoch": 0.8017385312591101, "grad_norm": 46.75, "kl": 1.068746566772461, "learning_rate": 5e-07, "logits/chosen": 2831176.8, "logits/rejected": -26289205.333333332, "logps/chosen": -186.61583251953124, "logps/rejected": -373.3705240885417, "loss": 0.2765, "rewards/chosen": 0.6736244678497314, "rewards/margins": 4.0500661055246985, "rewards/rejected": -3.3764416376749673, "step": 15126 }, { "epoch": 0.8017915352609122, "grad_norm": 56.75, "kl": 0.8034095764160156, "learning_rate": 5e-07, "logits/chosen": -57556672.0, "logits/rejected": -9080477.333333334, "logps/chosen": -360.93671875, "logps/rejected": -224.19598388671875, "loss": 0.3257, "rewards/chosen": 0.2916464328765869, "rewards/margins": 2.692312033971151, "rewards/rejected": -2.400665601094564, "step": 15127 }, { "epoch": 0.8018445392627144, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42552914.666666664, "logits/rejected": -15554094.4, "logps/chosen": -481.1326904296875, "logps/rejected": -219.8642822265625, "loss": 0.1646, "rewards/chosen": 0.6598368485768636, "rewards/margins": 3.554067118962606, "rewards/rejected": -2.894230270385742, "step": 15128 }, { "epoch": 0.8018975432645165, "grad_norm": 54.75, "kl": 3.9666709899902344, "learning_rate": 5e-07, "logits/chosen": -25044920.0, "logits/rejected": -37010122.666666664, "logps/chosen": -433.1388671875, "logps/rejected": -385.1988932291667, "loss": 0.3224, "rewards/chosen": 1.4319000244140625, "rewards/margins": 3.1714245478312177, "rewards/rejected": -1.739524523417155, "step": 15129 }, { "epoch": 0.8019505472663186, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41047232.0, "logits/rejected": -21164172.0, "logps/chosen": -524.60107421875, "logps/rejected": -331.92795817057294, "loss": 0.1324, "rewards/chosen": 1.3102173805236816, "rewards/margins": 3.8823698361714682, "rewards/rejected": -2.5721524556477866, "step": 15130 }, { "epoch": 0.8020035512681207, "grad_norm": 38.25, "kl": 1.0729408264160156, "learning_rate": 5e-07, "logits/chosen": 22099882.666666668, "logits/rejected": -21245862.4, "logps/chosen": -118.90321858723958, "logps/rejected": -221.5569091796875, "loss": 0.2299, "rewards/chosen": 0.6077302694320679, "rewards/margins": 3.436889624595642, "rewards/rejected": -2.8291593551635743, "step": 15131 }, { "epoch": 0.8020565552699229, "grad_norm": 41.25, "kl": 0.4732074737548828, "learning_rate": 5e-07, "logits/chosen": -8704286.0, "logits/rejected": -14390686.0, "logps/chosen": -228.41622924804688, "logps/rejected": -599.03076171875, "loss": 0.2536, "rewards/chosen": 0.6534175872802734, "rewards/margins": 3.7603919506073, "rewards/rejected": -3.1069743633270264, "step": 15132 }, { "epoch": 0.802109559271725, "grad_norm": 55.25, "kl": 3.3651981353759766, "learning_rate": 5e-07, "logits/chosen": -13487944.0, "logits/rejected": 13292004.0, "logps/chosen": -216.880859375, "logps/rejected": -726.8129272460938, "loss": 0.3446, "rewards/chosen": 0.602046807607015, "rewards/margins": 4.37520964940389, "rewards/rejected": -3.773162841796875, "step": 15133 }, { "epoch": 0.8021625632735272, "grad_norm": 104.0, "kl": 11.545829772949219, "learning_rate": 5e-07, "logits/chosen": -39983341.71428572, "logits/rejected": 2250792.75, "logps/chosen": -720.109375, "logps/rejected": -37.4075813293457, "loss": 0.4163, "rewards/chosen": 1.4026712690080916, "rewards/margins": 3.4130136285509383, "rewards/rejected": -2.0103423595428467, "step": 15134 }, { "epoch": 0.8022155672753293, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43189376.0, "logits/rejected": -33633398.4, "logps/chosen": -366.0673421223958, "logps/rejected": -270.436083984375, "loss": 0.252, "rewards/chosen": 0.0651789406935374, "rewards/margins": 2.5651430825392403, "rewards/rejected": -2.499964141845703, "step": 15135 }, { "epoch": 0.8022685712771315, "grad_norm": 56.75, "kl": 1.1057586669921875, "learning_rate": 5e-07, "logits/chosen": -73670256.0, "logits/rejected": -11049616.0, "logps/chosen": -586.128662109375, "logps/rejected": -368.53729248046875, "loss": 0.2038, "rewards/chosen": 1.2110542058944702, "rewards/margins": 3.630842089653015, "rewards/rejected": -2.419787883758545, "step": 15136 }, { "epoch": 0.8023215752789336, "grad_norm": 50.5, "kl": 6.215890884399414, "learning_rate": 5e-07, "logits/chosen": -15393616.0, "logits/rejected": 8144025.0, "logps/chosen": -267.7911376953125, "logps/rejected": -681.2359619140625, "loss": 0.3248, "rewards/chosen": 1.0727425416310628, "rewards/margins": 3.782373984654744, "rewards/rejected": -2.7096314430236816, "step": 15137 }, { "epoch": 0.8023745792807357, "grad_norm": 42.25, "kl": 1.2559852600097656, "learning_rate": 5e-07, "logits/chosen": -9096460.8, "logits/rejected": -108285045.33333333, "logps/chosen": -156.92474365234375, "logps/rejected": -400.4229329427083, "loss": 0.286, "rewards/chosen": 0.4448460578918457, "rewards/margins": 5.350230757395427, "rewards/rejected": -4.905384699503581, "step": 15138 }, { "epoch": 0.8024275832825378, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50511428.0, "logits/rejected": -9056756.57142857, "logps/chosen": -446.55767822265625, "logps/rejected": -329.19845145089283, "loss": 0.1521, "rewards/chosen": 0.768511950969696, "rewards/margins": 3.3505494679723467, "rewards/rejected": -2.5820375170026506, "step": 15139 }, { "epoch": 0.80248058728434, "grad_norm": 41.25, "kl": 0.3749551773071289, "learning_rate": 5e-07, "logits/chosen": -36939497.6, "logits/rejected": -65093082.666666664, "logps/chosen": -200.09561767578126, "logps/rejected": -232.27730305989584, "loss": 0.3793, "rewards/chosen": 0.0077389955520629885, "rewards/margins": 3.0192294994990028, "rewards/rejected": -3.01149050394694, "step": 15140 }, { "epoch": 0.8025335912861421, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11048993.0, "logits/rejected": -9067313.714285715, "logps/chosen": -361.95758056640625, "logps/rejected": -405.56529017857144, "loss": 0.1035, "rewards/chosen": 2.1156556606292725, "rewards/margins": 5.027951955795288, "rewards/rejected": -2.9122962951660156, "step": 15141 }, { "epoch": 0.8025865952879443, "grad_norm": 46.0, "kl": 0.2953500747680664, "learning_rate": 5e-07, "logits/chosen": -44501397.333333336, "logits/rejected": -16622638.4, "logps/chosen": -277.13079833984375, "logps/rejected": -91.70635375976562, "loss": 0.279, "rewards/chosen": 0.4134920835494995, "rewards/margins": 2.232749819755554, "rewards/rejected": -1.8192577362060547, "step": 15142 }, { "epoch": 0.8026395992897464, "grad_norm": 63.25, "kl": 1.1929140090942383, "learning_rate": 5e-07, "logits/chosen": -238100960.0, "logits/rejected": 1121408.142857143, "logps/chosen": -728.3349609375, "logps/rejected": -151.25887625558036, "loss": 0.2101, "rewards/chosen": 0.743743896484375, "rewards/margins": 2.692809922354562, "rewards/rejected": -1.9490660258701868, "step": 15143 }, { "epoch": 0.8026926032915485, "grad_norm": 41.75, "kl": 0.3519859313964844, "learning_rate": 5e-07, "logits/chosen": -39106053.333333336, "logits/rejected": -46129312.0, "logps/chosen": -389.5084635416667, "logps/rejected": -271.998779296875, "loss": 0.2126, "rewards/chosen": 0.5403275092442831, "rewards/margins": 3.1178055365880333, "rewards/rejected": -2.57747802734375, "step": 15144 }, { "epoch": 0.8027456072933506, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14667822.0, "logits/rejected": -23603684.0, "logps/chosen": -428.1589050292969, "logps/rejected": -269.9901428222656, "loss": 0.1976, "rewards/chosen": 0.9636486172676086, "rewards/margins": 3.7529950737953186, "rewards/rejected": -2.78934645652771, "step": 15145 }, { "epoch": 0.8027986112951527, "grad_norm": 40.0, "kl": 2.705900192260742, "learning_rate": 5e-07, "logits/chosen": 373475.6666666667, "logits/rejected": 3558032.0, "logps/chosen": -54.6547597249349, "logps/rejected": -235.1437255859375, "loss": 0.3017, "rewards/chosen": -0.14654326438903809, "rewards/margins": 1.8104912281036376, "rewards/rejected": -1.9570344924926757, "step": 15146 }, { "epoch": 0.8028516152969549, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 57216012.0, "logits/rejected": -20689021.333333332, "logps/chosen": -242.66712951660156, "logps/rejected": -282.3334554036458, "loss": 0.16, "rewards/chosen": 0.9349663257598877, "rewards/margins": 3.8567707538604736, "rewards/rejected": -2.921804428100586, "step": 15147 }, { "epoch": 0.802904619298757, "grad_norm": 44.75, "kl": 1.213796615600586, "learning_rate": 5e-07, "logits/chosen": -7083872.5, "logits/rejected": -59249436.0, "logps/chosen": -204.8414306640625, "logps/rejected": -356.5880126953125, "loss": 0.3407, "rewards/chosen": 0.44622188806533813, "rewards/margins": 2.045004665851593, "rewards/rejected": -1.5987827777862549, "step": 15148 }, { "epoch": 0.8029576233005592, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -62459980.0, "logps/rejected": -315.5458984375, "loss": 0.1949, "rewards/rejected": -1.8931820392608643, "step": 15149 }, { "epoch": 0.8030106273023613, "grad_norm": 43.75, "kl": 0.22637462615966797, "learning_rate": 5e-07, "logits/chosen": -26167968.0, "logits/rejected": -31074610.666666668, "logps/chosen": -243.5415771484375, "logps/rejected": -257.2784830729167, "loss": 0.3497, "rewards/chosen": 0.3851041316986084, "rewards/margins": 2.604061237970988, "rewards/rejected": -2.2189571062723794, "step": 15150 }, { "epoch": 0.8030636313041635, "grad_norm": 47.75, "kl": 1.7455921173095703, "learning_rate": 5e-07, "logits/chosen": -14039564.0, "logits/rejected": -12555490.0, "logps/chosen": -283.78564453125, "logps/rejected": -142.9766082763672, "loss": 0.3146, "rewards/chosen": 0.5486846566200256, "rewards/margins": 2.4529319405555725, "rewards/rejected": -1.9042472839355469, "step": 15151 }, { "epoch": 0.8031166353059656, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25566866.0, "logits/rejected": -12630489.333333334, "logps/chosen": -329.7825927734375, "logps/rejected": -350.8062744140625, "loss": 0.2758, "rewards/chosen": -0.14952696859836578, "rewards/margins": 1.4831140289704006, "rewards/rejected": -1.6326409975687664, "step": 15152 }, { "epoch": 0.8031696393077677, "grad_norm": 46.5, "kl": 3.578817367553711, "learning_rate": 5e-07, "logits/chosen": -11271296.0, "logits/rejected": -25020739.2, "logps/chosen": -411.2918294270833, "logps/rejected": -274.38154296875, "loss": 0.283, "rewards/chosen": 1.0618136723836262, "rewards/margins": 3.022874768575033, "rewards/rejected": -1.9610610961914063, "step": 15153 }, { "epoch": 0.8032226433095698, "grad_norm": 58.5, "kl": 1.9331932067871094, "learning_rate": 5e-07, "logits/chosen": 1773138.125, "logits/rejected": -25855410.285714287, "logps/chosen": -920.175048828125, "logps/rejected": -353.77880859375, "loss": 0.1135, "rewards/chosen": 2.3967225551605225, "rewards/margins": 4.953302689961024, "rewards/rejected": -2.556580134800502, "step": 15154 }, { "epoch": 0.803275647311372, "grad_norm": 36.0, "kl": 1.42950439453125, "learning_rate": 5e-07, "logits/chosen": -52599120.0, "logits/rejected": -35529465.6, "logps/chosen": -303.3680419921875, "logps/rejected": -394.047607421875, "loss": 0.274, "rewards/chosen": 0.043667614459991455, "rewards/margins": 3.479393780231476, "rewards/rejected": -3.4357261657714844, "step": 15155 }, { "epoch": 0.8033286513131741, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59589146.666666664, "logits/rejected": -11347095.2, "logps/chosen": -371.1729736328125, "logps/rejected": -367.1751953125, "loss": 0.2419, "rewards/chosen": 0.16900940736134848, "rewards/margins": 2.5459652026494344, "rewards/rejected": -2.3769557952880858, "step": 15156 }, { "epoch": 0.8033816553149763, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48224217.6, "logits/rejected": -51313616.0, "logps/chosen": -512.894140625, "logps/rejected": -386.1850992838542, "loss": 0.2709, "rewards/chosen": 0.5834668159484864, "rewards/margins": 3.160459645589193, "rewards/rejected": -2.5769928296407065, "step": 15157 }, { "epoch": 0.8034346593167784, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20413326.0, "logits/rejected": -10876881.333333334, "logps/chosen": -457.7601013183594, "logps/rejected": -502.4110107421875, "loss": 0.1706, "rewards/chosen": 1.096014380455017, "rewards/margins": 3.3277211586634317, "rewards/rejected": -2.2317067782084146, "step": 15158 }, { "epoch": 0.8034876633185806, "grad_norm": 57.75, "kl": 2.566242218017578, "learning_rate": 5e-07, "logits/chosen": -28407756.0, "logits/rejected": 5677103.0, "logps/chosen": -299.61431884765625, "logps/rejected": -625.9237060546875, "loss": 0.2859, "rewards/chosen": 0.8144572377204895, "rewards/margins": 2.776087462902069, "rewards/rejected": -1.9616302251815796, "step": 15159 }, { "epoch": 0.8035406673203827, "grad_norm": 45.75, "kl": 0.8198013305664062, "learning_rate": 5e-07, "logits/chosen": -26470340.0, "logits/rejected": -16824389.333333332, "logps/chosen": -867.491455078125, "logps/rejected": -407.3138834635417, "loss": 0.1763, "rewards/chosen": 1.7288178205490112, "rewards/margins": 4.132779002189636, "rewards/rejected": -2.403961181640625, "step": 15160 }, { "epoch": 0.8035936713221848, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34965680.0, "logits/rejected": -27865315.2, "logps/chosen": -289.5514322916667, "logps/rejected": -190.2961181640625, "loss": 0.2643, "rewards/chosen": 0.5777575174967448, "rewards/margins": 2.3848950068155923, "rewards/rejected": -1.8071374893188477, "step": 15161 }, { "epoch": 0.8036466753239869, "grad_norm": 49.0, "kl": 0.8547897338867188, "learning_rate": 5e-07, "logits/chosen": -36853356.0, "logits/rejected": -40067260.0, "logps/chosen": -251.21868896484375, "logps/rejected": -279.7421569824219, "loss": 0.284, "rewards/chosen": 0.5617332458496094, "rewards/margins": 2.1353986263275146, "rewards/rejected": -1.5736653804779053, "step": 15162 }, { "epoch": 0.8036996793257891, "grad_norm": 56.25, "kl": 1.4888496398925781, "learning_rate": 5e-07, "logits/chosen": -54837964.8, "logits/rejected": -62601034.666666664, "logps/chosen": -325.217626953125, "logps/rejected": -521.02734375, "loss": 0.3284, "rewards/chosen": 0.46885318756103517, "rewards/margins": 2.572806231180827, "rewards/rejected": -2.1039530436197915, "step": 15163 }, { "epoch": 0.8037526833275912, "grad_norm": 56.75, "kl": 5.174870491027832, "learning_rate": 5e-07, "logits/chosen": -85739488.0, "logits/rejected": -43903760.0, "logps/chosen": -527.278076171875, "logps/rejected": -361.9123840332031, "loss": 0.2597, "rewards/chosen": 1.1623917818069458, "rewards/margins": 3.352523684501648, "rewards/rejected": -2.190131902694702, "step": 15164 }, { "epoch": 0.8038056873293934, "grad_norm": 55.0, "kl": 2.609231948852539, "learning_rate": 5e-07, "logits/chosen": -32440166.4, "logits/rejected": -26430205.333333332, "logps/chosen": -245.145166015625, "logps/rejected": -442.7587076822917, "loss": 0.2985, "rewards/chosen": 0.960389518737793, "rewards/margins": 3.0253118515014648, "rewards/rejected": -2.064922332763672, "step": 15165 }, { "epoch": 0.8038586913311955, "grad_norm": 93.5, "kl": 0.46079349517822266, "learning_rate": 5e-07, "logits/chosen": 14929140.0, "logits/rejected": -75225957.33333333, "logps/chosen": -49.053855895996094, "logps/rejected": -212.4765625, "loss": 0.2836, "rewards/chosen": 0.3040965795516968, "rewards/margins": 2.0809084971745806, "rewards/rejected": -1.776811917622884, "step": 15166 }, { "epoch": 0.8039116953329977, "grad_norm": 65.0, "kl": 3.718865394592285, "learning_rate": 5e-07, "logits/chosen": -32333498.666666668, "logits/rejected": 237142560.0, "logps/chosen": -390.0814208984375, "logps/rejected": -336.19140625, "loss": 0.4087, "rewards/chosen": 0.4299808740615845, "rewards/margins": 3.1381386518478394, "rewards/rejected": -2.708157777786255, "step": 15167 }, { "epoch": 0.8039646993347997, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18290340.0, "logits/rejected": -70051408.0, "logps/chosen": -656.13818359375, "logps/rejected": -516.7472534179688, "loss": 0.2197, "rewards/chosen": 0.5900947451591492, "rewards/margins": 3.341132342815399, "rewards/rejected": -2.75103759765625, "step": 15168 }, { "epoch": 0.8040177033366019, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20635724.0, "logits/rejected": 52575274.666666664, "logps/chosen": -192.8094024658203, "logps/rejected": -475.0035400390625, "loss": 0.2442, "rewards/chosen": -0.23450812697410583, "rewards/margins": 2.999251753091812, "rewards/rejected": -3.233759880065918, "step": 15169 }, { "epoch": 0.804070707338404, "grad_norm": 36.0, "kl": 0.44271183013916016, "learning_rate": 5e-07, "logits/chosen": -16309420.0, "logits/rejected": -54940217.6, "logps/chosen": -185.4072469075521, "logps/rejected": -407.728955078125, "loss": 0.1706, "rewards/chosen": 0.815198024113973, "rewards/margins": 4.0980055650075276, "rewards/rejected": -3.282807540893555, "step": 15170 }, { "epoch": 0.8041237113402062, "grad_norm": 44.75, "kl": 0.3136749267578125, "learning_rate": 5e-07, "logits/chosen": -51857160.0, "logits/rejected": -50561562.666666664, "logps/chosen": -511.3519592285156, "logps/rejected": -417.5615234375, "loss": 0.1632, "rewards/chosen": 0.6772628426551819, "rewards/margins": 3.1071111957232156, "rewards/rejected": -2.4298483530680337, "step": 15171 }, { "epoch": 0.8041767153420083, "grad_norm": 64.0, "kl": 6.039649963378906, "learning_rate": 5e-07, "logits/chosen": -59511142.4, "logits/rejected": -16896596.0, "logps/chosen": -596.39033203125, "logps/rejected": -256.0712890625, "loss": 0.2877, "rewards/chosen": 1.3428918838500976, "rewards/margins": 3.584716033935547, "rewards/rejected": -2.241824150085449, "step": 15172 }, { "epoch": 0.8042297193438105, "grad_norm": 31.75, "kl": 0.08961772918701172, "learning_rate": 5e-07, "logits/chosen": 4811425.0, "logits/rejected": -14597860.0, "logps/chosen": -62.89578628540039, "logps/rejected": -96.59131622314453, "loss": 0.2228, "rewards/chosen": 0.9358516931533813, "rewards/margins": 2.948250889778137, "rewards/rejected": -2.012399196624756, "step": 15173 }, { "epoch": 0.8042827233456126, "grad_norm": 45.25, "kl": 2.2134952545166016, "learning_rate": 5e-07, "logits/chosen": -31562992.0, "logits/rejected": -15402124.0, "logps/chosen": -227.62626139322916, "logps/rejected": -290.11419677734375, "loss": 0.3981, "rewards/chosen": 0.12113908926645915, "rewards/margins": 3.7514124313990274, "rewards/rejected": -3.6302733421325684, "step": 15174 }, { "epoch": 0.8043357273474148, "grad_norm": 53.75, "kl": 1.9480056762695312, "learning_rate": 5e-07, "logits/chosen": -19916181.333333332, "logits/rejected": -42521062.4, "logps/chosen": -274.19549560546875, "logps/rejected": -609.31220703125, "loss": 0.2021, "rewards/chosen": 0.6569412151972452, "rewards/margins": 3.8957557598749797, "rewards/rejected": -3.2388145446777346, "step": 15175 }, { "epoch": 0.8043887313492168, "grad_norm": 51.25, "kl": 2.77085018157959, "learning_rate": 5e-07, "logits/chosen": -28327219.2, "logits/rejected": 3219888.3333333335, "logps/chosen": -333.975244140625, "logps/rejected": -264.3570963541667, "loss": 0.3114, "rewards/chosen": 0.5654460906982421, "rewards/margins": 2.690509668986002, "rewards/rejected": -2.1250635782877603, "step": 15176 }, { "epoch": 0.804441735351019, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22733360.0, "logits/rejected": -2938312.6, "logps/chosen": -390.5359293619792, "logps/rejected": -191.69013671875, "loss": 0.2227, "rewards/chosen": 0.4655062754948934, "rewards/margins": 3.0179450114568076, "rewards/rejected": -2.552438735961914, "step": 15177 }, { "epoch": 0.8044947393528211, "grad_norm": 50.25, "kl": 2.621882438659668, "learning_rate": 5e-07, "logits/chosen": -16465634.666666666, "logits/rejected": -48902440.0, "logps/chosen": -282.5662027994792, "logps/rejected": -386.16070556640625, "loss": 0.4417, "rewards/chosen": 0.3372645378112793, "rewards/margins": 2.7387828826904297, "rewards/rejected": -2.4015183448791504, "step": 15178 }, { "epoch": 0.8045477433546233, "grad_norm": 59.5, "kl": 1.8418750762939453, "learning_rate": 5e-07, "logits/chosen": -11712259.42857143, "logits/rejected": -57178512.0, "logps/chosen": -249.14688546316964, "logps/rejected": -422.4219970703125, "loss": 0.4426, "rewards/chosen": 0.22291835716792516, "rewards/margins": 2.9507962124688283, "rewards/rejected": -2.7278778553009033, "step": 15179 }, { "epoch": 0.8046007473564254, "grad_norm": 39.75, "kl": 3.1125755310058594, "learning_rate": 5e-07, "logits/chosen": -24853421.333333332, "logits/rejected": -23312204.8, "logps/chosen": -237.22635904947916, "logps/rejected": -350.8238525390625, "loss": 0.2451, "rewards/chosen": 0.561718225479126, "rewards/margins": 3.188920259475708, "rewards/rejected": -2.627202033996582, "step": 15180 }, { "epoch": 0.8046537513582276, "grad_norm": 58.75, "kl": 0.9368629455566406, "learning_rate": 5e-07, "logits/chosen": -25402986.666666668, "logits/rejected": -27072374.0, "logps/chosen": -378.3869222005208, "logps/rejected": -272.9646911621094, "loss": 0.3485, "rewards/chosen": 0.31680768728256226, "rewards/margins": 3.712686002254486, "rewards/rejected": -3.395878314971924, "step": 15181 }, { "epoch": 0.8047067553600297, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37670572.8, "logits/rejected": -16947369.333333332, "logps/chosen": -278.796044921875, "logps/rejected": -559.4653727213541, "loss": 0.2883, "rewards/chosen": 0.31752355098724366, "rewards/margins": 3.250219257672628, "rewards/rejected": -2.9326957066853843, "step": 15182 }, { "epoch": 0.8047597593618319, "grad_norm": 51.25, "kl": 0.011264801025390625, "learning_rate": 5e-07, "logits/chosen": -43284202.666666664, "logits/rejected": -41478662.4, "logps/chosen": -447.8352457682292, "logps/rejected": -456.602978515625, "loss": 0.2189, "rewards/chosen": 0.4848896265029907, "rewards/margins": 2.795067620277405, "rewards/rejected": -2.3101779937744142, "step": 15183 }, { "epoch": 0.8048127633636339, "grad_norm": 51.0, "kl": 1.0811271667480469, "learning_rate": 5e-07, "logits/chosen": -7980217.0, "logits/rejected": -20113706.666666668, "logps/chosen": -342.75579833984375, "logps/rejected": -284.43701171875, "loss": 0.1925, "rewards/chosen": 1.1885693073272705, "rewards/margins": 3.096956491470337, "rewards/rejected": -1.9083871841430664, "step": 15184 }, { "epoch": 0.8048657673654361, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8616626.666666666, "logits/rejected": -238796.8, "logps/chosen": -251.6229248046875, "logps/rejected": -222.79599609375, "loss": 0.2194, "rewards/chosen": 1.064300537109375, "rewards/margins": 2.973170280456543, "rewards/rejected": -1.908869743347168, "step": 15185 }, { "epoch": 0.8049187713672382, "grad_norm": 49.25, "kl": 0.9996824264526367, "learning_rate": 5e-07, "logits/chosen": -31353946.666666668, "logits/rejected": -27633164.8, "logps/chosen": -296.04974365234375, "logps/rejected": -201.37747802734376, "loss": 0.2784, "rewards/chosen": 0.44750694433848065, "rewards/margins": 2.3790972153345744, "rewards/rejected": -1.9315902709960937, "step": 15186 }, { "epoch": 0.8049717753690404, "grad_norm": 56.0, "kl": 0.4833412170410156, "learning_rate": 5e-07, "logits/chosen": -2569561.75, "logits/rejected": -17726172.0, "logps/chosen": -212.27597045898438, "logps/rejected": -347.5205383300781, "loss": 0.2456, "rewards/chosen": 0.5347973108291626, "rewards/margins": 3.0395156145095825, "rewards/rejected": -2.50471830368042, "step": 15187 }, { "epoch": 0.8050247793708425, "grad_norm": 44.25, "kl": 1.3907585144042969, "learning_rate": 5e-07, "logits/chosen": -12367720.0, "logits/rejected": -18402146.666666668, "logps/chosen": -711.7280883789062, "logps/rejected": -358.0384928385417, "loss": 0.1666, "rewards/chosen": 1.2274622917175293, "rewards/margins": 3.5120447476704917, "rewards/rejected": -2.2845824559529624, "step": 15188 }, { "epoch": 0.8050777833726447, "grad_norm": 51.75, "kl": 4.590427398681641, "learning_rate": 5e-07, "logits/chosen": -22209572.57142857, "logits/rejected": -36596312.0, "logps/chosen": -397.75927734375, "logps/rejected": -395.1927795410156, "loss": 0.4371, "rewards/chosen": 0.7264089584350586, "rewards/margins": 2.252449631690979, "rewards/rejected": -1.5260406732559204, "step": 15189 }, { "epoch": 0.8051307873744468, "grad_norm": 57.75, "kl": 1.3370513916015625, "learning_rate": 5e-07, "logits/chosen": -35933410.666666664, "logits/rejected": -16174299.2, "logps/chosen": -234.30485026041666, "logps/rejected": -194.13536376953124, "loss": 0.3594, "rewards/chosen": 0.31223082542419434, "rewards/margins": 1.2447030544281006, "rewards/rejected": -0.9324722290039062, "step": 15190 }, { "epoch": 0.805183791376249, "grad_norm": 29.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4238624.0, "logits/rejected": -27890368.0, "logps/chosen": -24.787647247314453, "logps/rejected": -232.15727887834822, "loss": 0.1567, "rewards/chosen": 0.6711344122886658, "rewards/margins": 3.3841961537088667, "rewards/rejected": -2.713061741420201, "step": 15191 }, { "epoch": 0.805236795378051, "grad_norm": 60.5, "kl": 1.6958045959472656, "learning_rate": 5e-07, "logits/chosen": -6999742.666666667, "logits/rejected": -26216696.0, "logps/chosen": -351.9410807291667, "logps/rejected": -404.4388916015625, "loss": 0.2817, "rewards/chosen": 0.6523025830586752, "rewards/margins": 2.5983524640401203, "rewards/rejected": -1.9460498809814453, "step": 15192 }, { "epoch": 0.8052897993798532, "grad_norm": 46.0, "kl": 0.6321868896484375, "learning_rate": 5e-07, "logits/chosen": -49212437.333333336, "logits/rejected": 69452416.0, "logps/chosen": -251.94034830729166, "logps/rejected": -322.749755859375, "loss": 0.2096, "rewards/chosen": 0.5394384066263834, "rewards/margins": 3.357318655649821, "rewards/rejected": -2.8178802490234376, "step": 15193 }, { "epoch": 0.8053428033816553, "grad_norm": 55.0, "kl": 2.4388580322265625, "learning_rate": 5e-07, "logits/chosen": 18850185.6, "logits/rejected": -7562470.666666667, "logps/chosen": -53.406982421875, "logps/rejected": -199.8106892903646, "loss": 0.4042, "rewards/chosen": 0.3780962944030762, "rewards/margins": 1.875800323486328, "rewards/rejected": -1.497704029083252, "step": 15194 }, { "epoch": 0.8053958073834574, "grad_norm": 51.0, "kl": 4.402961730957031, "learning_rate": 5e-07, "logits/chosen": -47639733.333333336, "logits/rejected": -50305352.0, "logps/chosen": -400.6388346354167, "logps/rejected": -765.4198608398438, "loss": 0.3813, "rewards/chosen": 0.5949017206827799, "rewards/margins": 4.766030470530192, "rewards/rejected": -4.171128749847412, "step": 15195 }, { "epoch": 0.8054488113852596, "grad_norm": 23.75, "kl": 3.3712730407714844, "learning_rate": 5e-07, "logits/chosen": 8060044.666666667, "logits/rejected": -41366995.2, "logps/chosen": -24.739789326985676, "logps/rejected": -417.7494140625, "loss": 0.2562, "rewards/chosen": 0.4443865617116292, "rewards/margins": 3.1971630891164144, "rewards/rejected": -2.752776527404785, "step": 15196 }, { "epoch": 0.8055018153870617, "grad_norm": 46.25, "kl": 1.7570323944091797, "learning_rate": 5e-07, "logits/chosen": -10945433.333333334, "logits/rejected": -40048688.0, "logps/chosen": -278.2524007161458, "logps/rejected": -395.94072265625, "loss": 0.2234, "rewards/chosen": 0.5832773844401041, "rewards/margins": 3.387001864115397, "rewards/rejected": -2.803724479675293, "step": 15197 }, { "epoch": 0.8055548193888639, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21509360.0, "logits/rejected": -9233073.6, "logps/chosen": -239.71244303385416, "logps/rejected": -265.342529296875, "loss": 0.3198, "rewards/chosen": -0.08950094381968181, "rewards/margins": 1.9076548178990682, "rewards/rejected": -1.99715576171875, "step": 15198 }, { "epoch": 0.805607823390666, "grad_norm": 49.0, "kl": 1.2641410827636719, "learning_rate": 5e-07, "logits/chosen": -29115712.0, "logits/rejected": -33530296.0, "logps/chosen": -266.6398111979167, "logps/rejected": -470.77569580078125, "loss": 0.4474, "rewards/chosen": -0.12884039680163065, "rewards/margins": 2.33865695198377, "rewards/rejected": -2.4674973487854004, "step": 15199 }, { "epoch": 0.8056608273924681, "grad_norm": 41.75, "kl": 0.9949398040771484, "learning_rate": 5e-07, "logits/chosen": -4692945.0, "logits/rejected": -39693912.0, "logps/chosen": -398.65814208984375, "logps/rejected": -416.6962076822917, "loss": 0.2161, "rewards/chosen": 0.37816354632377625, "rewards/margins": 2.602501759926478, "rewards/rejected": -2.2243382136027017, "step": 15200 }, { "epoch": 0.8057138313942702, "grad_norm": 56.25, "kl": 2.076244354248047, "learning_rate": 5e-07, "logits/chosen": -20431844.0, "logits/rejected": -45918892.0, "logps/chosen": -274.4520670572917, "logps/rejected": -415.64990234375, "loss": 0.4053, "rewards/chosen": 0.15211517612139383, "rewards/margins": 3.082232783238093, "rewards/rejected": -2.930117607116699, "step": 15201 }, { "epoch": 0.8057668353960724, "grad_norm": 47.0, "kl": 1.3198518753051758, "learning_rate": 5e-07, "logits/chosen": -15999673.333333334, "logits/rejected": -18516142.0, "logps/chosen": -247.20125325520834, "logps/rejected": -155.38087463378906, "loss": 0.3317, "rewards/chosen": 0.619082530339559, "rewards/margins": 2.9422386487325034, "rewards/rejected": -2.3231561183929443, "step": 15202 }, { "epoch": 0.8058198393978745, "grad_norm": 51.5, "kl": 1.4525260925292969, "learning_rate": 5e-07, "logits/chosen": -28315090.0, "logits/rejected": -9559077.0, "logps/chosen": -297.1780090332031, "logps/rejected": -96.02263641357422, "loss": 0.3763, "rewards/chosen": 0.034092046320438385, "rewards/margins": 1.8587401881814003, "rewards/rejected": -1.824648141860962, "step": 15203 }, { "epoch": 0.8058728433996767, "grad_norm": 55.25, "kl": 0.8389396667480469, "learning_rate": 5e-07, "logits/chosen": -21137641.6, "logits/rejected": 4482097.0, "logps/chosen": -350.0984375, "logps/rejected": -551.4958902994791, "loss": 0.2362, "rewards/chosen": 0.9650615692138672, "rewards/margins": 3.081617418924967, "rewards/rejected": -2.1165558497111, "step": 15204 }, { "epoch": 0.8059258474014788, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 395539.2, "logits/rejected": -37769157.333333336, "logps/chosen": -65.2623046875, "logps/rejected": -248.501708984375, "loss": 0.3435, "rewards/chosen": 0.0342461109161377, "rewards/margins": 2.2724135557810463, "rewards/rejected": -2.2381674448649087, "step": 15205 }, { "epoch": 0.805978851403281, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15265934.0, "logits/rejected": -7383266.0, "logps/chosen": -257.6499938964844, "logps/rejected": -365.33184814453125, "loss": 0.2522, "rewards/chosen": 0.6298290491104126, "rewards/margins": 2.829643130302429, "rewards/rejected": -2.1998140811920166, "step": 15206 }, { "epoch": 0.806031855405083, "grad_norm": 28.375, "kl": 0.17380285263061523, "learning_rate": 5e-07, "logits/chosen": -7263072.666666667, "logits/rejected": -33369888.0, "logps/chosen": -213.8591105143229, "logps/rejected": -602.71455078125, "loss": 0.1241, "rewards/chosen": 1.121517578760783, "rewards/margins": 5.184428421656291, "rewards/rejected": -4.062910842895508, "step": 15207 }, { "epoch": 0.8060848594068852, "grad_norm": 55.5, "kl": 2.9248533248901367, "learning_rate": 5e-07, "logits/chosen": 6957549.333333333, "logits/rejected": -21954459.2, "logps/chosen": -315.3114827473958, "logps/rejected": -307.498193359375, "loss": 0.2906, "rewards/chosen": 0.7673870722452799, "rewards/margins": 2.509734789530436, "rewards/rejected": -1.7423477172851562, "step": 15208 }, { "epoch": 0.8061378634086873, "grad_norm": 40.25, "kl": 2.8867149353027344, "learning_rate": 5e-07, "logits/chosen": -9384970.4, "logits/rejected": -79561498.66666667, "logps/chosen": -223.6268310546875, "logps/rejected": -339.7648111979167, "loss": 0.2828, "rewards/chosen": 0.7878552913665772, "rewards/margins": 3.4057885646820067, "rewards/rejected": -2.6179332733154297, "step": 15209 }, { "epoch": 0.8061908674104895, "grad_norm": 25.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72717568.0, "logits/rejected": -20891016.0, "logps/chosen": -168.50387573242188, "logps/rejected": -323.92681884765625, "loss": 0.1152, "rewards/chosen": 1.1434917449951172, "rewards/margins": 4.82427724202474, "rewards/rejected": -3.6807854970296225, "step": 15210 }, { "epoch": 0.8062438714122916, "grad_norm": 53.5, "kl": 1.8691787719726562, "learning_rate": 5e-07, "logits/chosen": 8079323.2, "logits/rejected": -35049162.666666664, "logps/chosen": -221.7775634765625, "logps/rejected": -169.1087849934896, "loss": 0.381, "rewards/chosen": -0.014652371406555176, "rewards/margins": 2.126231869061788, "rewards/rejected": -2.1408842404683432, "step": 15211 }, { "epoch": 0.8062968754140938, "grad_norm": 41.25, "kl": 2.598052978515625, "learning_rate": 5e-07, "logits/chosen": -6084378.0, "logits/rejected": -28859046.4, "logps/chosen": -209.70772298177084, "logps/rejected": -308.834814453125, "loss": 0.2802, "rewards/chosen": 0.03364690144856771, "rewards/margins": 2.8989516576131185, "rewards/rejected": -2.8653047561645506, "step": 15212 }, { "epoch": 0.8063498794158959, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52145436.0, "logits/rejected": -12838958.857142856, "logps/chosen": -481.3802490234375, "logps/rejected": -237.25969587053572, "loss": 0.1398, "rewards/chosen": 0.4037536680698395, "rewards/margins": 2.9784375386578694, "rewards/rejected": -2.57468387058803, "step": 15213 }, { "epoch": 0.8064028834176981, "grad_norm": 46.75, "kl": 0.6322860717773438, "learning_rate": 5e-07, "logits/chosen": -34286864.0, "logits/rejected": -7324882.0, "logps/chosen": -320.91766357421875, "logps/rejected": -115.26686096191406, "loss": 0.2052, "rewards/chosen": 1.2207540273666382, "rewards/margins": 4.490120053291321, "rewards/rejected": -3.2693660259246826, "step": 15214 }, { "epoch": 0.8064558874195001, "grad_norm": 77.5, "kl": 7.730076789855957, "learning_rate": 5e-07, "logits/chosen": -14177658.0, "logps/chosen": -384.4289855957031, "loss": 0.4031, "rewards/chosen": 1.248356819152832, "step": 15215 }, { "epoch": 0.8065088914213023, "grad_norm": 31.125, "kl": 0.12520599365234375, "learning_rate": 5e-07, "logits/chosen": -43260496.0, "logits/rejected": -5006608.8, "logps/chosen": -426.0406494140625, "logps/rejected": -481.0720703125, "loss": 0.0748, "rewards/chosen": 2.1961803436279297, "rewards/margins": 6.083573532104492, "rewards/rejected": -3.8873931884765627, "step": 15216 }, { "epoch": 0.8065618954231044, "grad_norm": 34.25, "kl": 0.28394412994384766, "learning_rate": 5e-07, "logits/chosen": -26593148.0, "logits/rejected": -7268677.333333333, "logps/chosen": -440.6192626953125, "logps/rejected": -343.6118977864583, "loss": 0.1273, "rewards/chosen": 1.9415614604949951, "rewards/margins": 4.383478085199991, "rewards/rejected": -2.4419166247049966, "step": 15217 }, { "epoch": 0.8066148994249066, "grad_norm": 66.0, "kl": 0.4509162902832031, "learning_rate": 5e-07, "logits/chosen": -12761198.0, "logits/rejected": -22651008.0, "logps/chosen": -234.4562530517578, "logps/rejected": -392.7168273925781, "loss": 0.3596, "rewards/chosen": -0.10846538841724396, "rewards/margins": 1.5687599629163742, "rewards/rejected": -1.6772253513336182, "step": 15218 }, { "epoch": 0.8066679034267087, "grad_norm": 29.0, "kl": 2.112706184387207, "learning_rate": 5e-07, "logits/chosen": 1769503.5, "logits/rejected": -22588393.6, "logps/chosen": -150.40485636393228, "logps/rejected": -166.3124267578125, "loss": 0.2663, "rewards/chosen": 0.8125652472178141, "rewards/margins": 2.444214646021525, "rewards/rejected": -1.6316493988037108, "step": 15219 }, { "epoch": 0.8067209074285109, "grad_norm": 47.75, "kl": 0.47301673889160156, "learning_rate": 5e-07, "logits/chosen": 8442225.333333334, "logits/rejected": -7347740.8, "logps/chosen": -58.81226603190104, "logps/rejected": -416.401318359375, "loss": 0.278, "rewards/chosen": -0.047262380520502724, "rewards/margins": 2.234418680270513, "rewards/rejected": -2.2816810607910156, "step": 15220 }, { "epoch": 0.806773911430313, "grad_norm": 56.0, "kl": 0.8506040573120117, "learning_rate": 5e-07, "logits/chosen": 7639401.0, "logits/rejected": -22828940.0, "logps/chosen": -282.53363037109375, "logps/rejected": -185.28582763671875, "loss": 0.3106, "rewards/chosen": 0.3813701868057251, "rewards/margins": 2.1863629817962646, "rewards/rejected": -1.8049927949905396, "step": 15221 }, { "epoch": 0.8068269154321152, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18842697.333333332, "logits/rejected": -24333004.8, "logps/chosen": -294.2195231119792, "logps/rejected": -415.49521484375, "loss": 0.2353, "rewards/chosen": 0.7562857468922933, "rewards/margins": 2.538981898625692, "rewards/rejected": -1.7826961517333983, "step": 15222 }, { "epoch": 0.8068799194339172, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63464904.0, "logits/rejected": -3458677.6666666665, "logps/chosen": -395.9052734375, "logps/rejected": -157.71728515625, "loss": 0.1882, "rewards/chosen": 0.49592363834381104, "rewards/margins": 3.1439950863520303, "rewards/rejected": -2.6480714480082193, "step": 15223 }, { "epoch": 0.8069329234357194, "grad_norm": 34.0, "kl": 4.334806442260742, "learning_rate": 5e-07, "logits/chosen": -2316744.0, "logits/rejected": -14876436.0, "logps/chosen": -327.3974609375, "logps/rejected": -209.25955200195312, "loss": 0.2813, "rewards/chosen": 1.0800490379333496, "rewards/margins": 3.200726270675659, "rewards/rejected": -2.1206772327423096, "step": 15224 }, { "epoch": 0.8069859274375215, "grad_norm": 75.5, "kl": 3.6762895584106445, "learning_rate": 5e-07, "logits/chosen": -25376299.2, "logits/rejected": -4089169.0, "logps/chosen": -316.08271484375, "logps/rejected": -127.90224202473958, "loss": 0.395, "rewards/chosen": 0.3702138900756836, "rewards/margins": 2.0724042892456054, "rewards/rejected": -1.7021903991699219, "step": 15225 }, { "epoch": 0.8070389314393237, "grad_norm": 51.25, "kl": 4.125414848327637, "learning_rate": 5e-07, "logits/chosen": -30817069.714285713, "logits/rejected": -81493176.0, "logps/chosen": -303.9724818638393, "logps/rejected": -478.64630126953125, "loss": 0.5004, "rewards/chosen": 0.23302977425711496, "rewards/margins": 2.6494452272142683, "rewards/rejected": -2.4164154529571533, "step": 15226 }, { "epoch": 0.8070919354411258, "grad_norm": 37.25, "kl": 1.758331298828125, "learning_rate": 5e-07, "logits/chosen": -37624525.333333336, "logits/rejected": -19743073.6, "logps/chosen": -422.4016927083333, "logps/rejected": -125.37608642578125, "loss": 0.19, "rewards/chosen": 1.363252321879069, "rewards/margins": 3.798353258768717, "rewards/rejected": -2.4351009368896483, "step": 15227 }, { "epoch": 0.807144939442928, "grad_norm": 35.5, "kl": 5.77279806137085, "learning_rate": 5e-07, "logits/chosen": -3044745.6, "logits/rejected": -7812557.333333333, "logps/chosen": -290.061962890625, "logps/rejected": -85.52198282877605, "loss": 0.3776, "rewards/chosen": 0.6865839004516602, "rewards/margins": 3.743977928161621, "rewards/rejected": -3.057394027709961, "step": 15228 }, { "epoch": 0.8071979434447301, "grad_norm": 56.0, "kl": 2.6603927612304688, "learning_rate": 5e-07, "logits/chosen": -16287561.333333334, "logits/rejected": -886384.6875, "logps/chosen": -500.5345052083333, "logps/rejected": -80.74508666992188, "loss": 0.3535, "rewards/chosen": 0.6463151375452677, "rewards/margins": 3.3257956902186074, "rewards/rejected": -2.67948055267334, "step": 15229 }, { "epoch": 0.8072509474465323, "grad_norm": 44.25, "kl": 0.19573593139648438, "learning_rate": 5e-07, "logits/chosen": -78001432.0, "logits/rejected": -20439964.0, "logps/chosen": -383.3470764160156, "logps/rejected": -245.33457946777344, "loss": 0.2018, "rewards/chosen": 0.6287292242050171, "rewards/margins": 4.27176034450531, "rewards/rejected": -3.643031120300293, "step": 15230 }, { "epoch": 0.8073039514483343, "grad_norm": 64.5, "kl": 1.211653709411621, "learning_rate": 5e-07, "logits/chosen": -14640659.0, "logits/rejected": -17733054.0, "logps/chosen": -170.5253143310547, "logps/rejected": -411.2359619140625, "loss": 0.2649, "rewards/chosen": 0.3147681951522827, "rewards/margins": 4.771594405174255, "rewards/rejected": -4.456826210021973, "step": 15231 }, { "epoch": 0.8073569554501365, "grad_norm": 46.5, "kl": 1.1587696075439453, "learning_rate": 5e-07, "logits/chosen": -83925525.33333333, "logits/rejected": -16061523.2, "logps/chosen": -307.0393880208333, "logps/rejected": -278.15595703125, "loss": 0.2775, "rewards/chosen": 0.5503626664479574, "rewards/margins": 2.400238116582235, "rewards/rejected": -1.8498754501342773, "step": 15232 }, { "epoch": 0.8074099594519386, "grad_norm": 67.0, "kl": 1.1243457794189453, "learning_rate": 5e-07, "logits/chosen": -30922085.333333332, "logits/rejected": -1903070.4, "logps/chosen": -307.7979736328125, "logps/rejected": -175.464013671875, "loss": 0.3134, "rewards/chosen": 0.10510913530985515, "rewards/margins": 2.0986485640207926, "rewards/rejected": -1.9935394287109376, "step": 15233 }, { "epoch": 0.8074629634537408, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57079696.0, "logits/rejected": -12458688.0, "logps/chosen": -337.7630920410156, "logps/rejected": -279.1687316894531, "loss": 0.2636, "rewards/chosen": 0.6566311120986938, "rewards/margins": 3.2249239683151245, "rewards/rejected": -2.5682928562164307, "step": 15234 }, { "epoch": 0.8075159674555429, "grad_norm": 53.5, "kl": 4.1106719970703125, "learning_rate": 5e-07, "logits/chosen": -54335402.666666664, "logits/rejected": -123382648.0, "logps/chosen": -394.0030924479167, "logps/rejected": -367.0587158203125, "loss": 0.5207, "rewards/chosen": 0.023254116376241047, "rewards/margins": 1.4736021359761555, "rewards/rejected": -1.4503480195999146, "step": 15235 }, { "epoch": 0.8075689714573451, "grad_norm": 42.25, "kl": 2.2150192260742188, "learning_rate": 5e-07, "logits/chosen": -42406940.0, "logits/rejected": -19730260.0, "logps/chosen": -303.63665771484375, "logps/rejected": -273.2263488769531, "loss": 0.3078, "rewards/chosen": 0.6321277618408203, "rewards/margins": 2.5567240715026855, "rewards/rejected": -1.9245963096618652, "step": 15236 }, { "epoch": 0.8076219754591472, "grad_norm": 51.25, "kl": 0.7661170959472656, "learning_rate": 5e-07, "logits/chosen": -28950648.0, "logits/rejected": -7211742.5, "logps/chosen": -216.38899739583334, "logps/rejected": -190.95782470703125, "loss": 0.3358, "rewards/chosen": 0.5717445214589437, "rewards/margins": 2.697247346242269, "rewards/rejected": -2.125502824783325, "step": 15237 }, { "epoch": 0.8076749794609493, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12444064.0, "logits/rejected": -35342213.333333336, "logps/chosen": -367.7150573730469, "logps/rejected": -266.7191975911458, "loss": 0.1609, "rewards/chosen": 0.8947548270225525, "rewards/margins": 3.291938563187917, "rewards/rejected": -2.3971837361653647, "step": 15238 }, { "epoch": 0.8077279834627514, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18521024.0, "logits/rejected": -45030188.8, "logps/chosen": -286.2924397786458, "logps/rejected": -392.155859375, "loss": 0.1718, "rewards/chosen": 0.5090454419453939, "rewards/margins": 4.277022202809651, "rewards/rejected": -3.767976760864258, "step": 15239 }, { "epoch": 0.8077809874645536, "grad_norm": 45.25, "kl": 1.4897918701171875, "learning_rate": 5e-07, "logits/chosen": -42540528.0, "logits/rejected": -27560736.0, "logps/chosen": -352.748583984375, "logps/rejected": -326.43654378255206, "loss": 0.2854, "rewards/chosen": 0.5630889892578125, "rewards/margins": 3.5097525278727213, "rewards/rejected": -2.9466635386149087, "step": 15240 }, { "epoch": 0.8078339914663557, "grad_norm": 44.75, "kl": 3.3223419189453125, "learning_rate": 5e-07, "logits/chosen": -6358962.0, "logits/rejected": -15167610.666666666, "logps/chosen": -1249.681640625, "logps/rejected": -285.693603515625, "loss": 0.1631, "rewards/chosen": 2.9737656116485596, "rewards/margins": 5.001756111780802, "rewards/rejected": -2.0279905001322427, "step": 15241 }, { "epoch": 0.8078869954681579, "grad_norm": 33.0, "kl": 2.8122735023498535, "learning_rate": 5e-07, "logits/chosen": -3264754.75, "logits/rejected": -32729220.0, "logps/chosen": -173.41159057617188, "logps/rejected": -278.0061950683594, "loss": 0.232, "rewards/chosen": 1.0118054151535034, "rewards/margins": 3.469873309135437, "rewards/rejected": -2.4580678939819336, "step": 15242 }, { "epoch": 0.80793999946996, "grad_norm": 54.25, "kl": 3.8449249267578125, "learning_rate": 5e-07, "logits/chosen": -23229672.0, "logits/rejected": -34787404.0, "logps/chosen": -418.8206380208333, "logps/rejected": -592.5595092773438, "loss": 0.3205, "rewards/chosen": 1.2078490257263184, "rewards/margins": 4.825639009475708, "rewards/rejected": -3.6177899837493896, "step": 15243 }, { "epoch": 0.8079930034717622, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36476741.333333336, "logits/rejected": -17804046.4, "logps/chosen": -270.3658040364583, "logps/rejected": -344.6736083984375, "loss": 0.2053, "rewards/chosen": 0.6630651156107584, "rewards/margins": 3.668502203623454, "rewards/rejected": -3.0054370880126955, "step": 15244 }, { "epoch": 0.8080460074735643, "grad_norm": 63.25, "kl": 0.3952484130859375, "learning_rate": 5e-07, "logits/chosen": -24602213.333333332, "logits/rejected": -14882780.8, "logps/chosen": -469.1303304036458, "logps/rejected": -443.49208984375, "loss": 0.234, "rewards/chosen": 0.38290127118428546, "rewards/margins": 2.972470553716024, "rewards/rejected": -2.5895692825317385, "step": 15245 }, { "epoch": 0.8080990114753663, "grad_norm": 54.0, "kl": 1.3971023559570312, "learning_rate": 5e-07, "logits/chosen": -735807.3125, "logits/rejected": -12601298.0, "logps/chosen": -161.6278533935547, "logps/rejected": -273.3592529296875, "loss": 0.2542, "rewards/chosen": 0.6680475473403931, "rewards/margins": 3.270950198173523, "rewards/rejected": -2.60290265083313, "step": 15246 }, { "epoch": 0.8081520154771685, "grad_norm": 39.5, "kl": 2.7779502868652344, "learning_rate": 5e-07, "logits/chosen": -9124288.0, "logits/rejected": -73135056.0, "logps/chosen": -238.37642415364584, "logps/rejected": -373.4552001953125, "loss": 0.369, "rewards/chosen": 0.5136842330296835, "rewards/margins": 2.9543413718541465, "rewards/rejected": -2.440657138824463, "step": 15247 }, { "epoch": 0.8082050194789706, "grad_norm": 51.25, "kl": 2.454838752746582, "learning_rate": 5e-07, "logits/chosen": -3072574.5714285714, "logits/rejected": -63498296.0, "logps/chosen": -238.56291852678572, "logps/rejected": -942.410400390625, "loss": 0.3701, "rewards/chosen": 0.6796023505074638, "rewards/margins": 5.416034426007952, "rewards/rejected": -4.736432075500488, "step": 15248 }, { "epoch": 0.8082580234807728, "grad_norm": 41.0, "kl": 2.012754440307617, "learning_rate": 5e-07, "logits/chosen": -117460768.0, "logits/rejected": -19483137.333333332, "logps/chosen": -566.3814697265625, "logps/rejected": -341.761962890625, "loss": 0.161, "rewards/chosen": 1.3952244520187378, "rewards/margins": 4.3805372317632045, "rewards/rejected": -2.9853127797444663, "step": 15249 }, { "epoch": 0.8083110274825749, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15753080.0, "logits/rejected": -5830353.6, "logps/chosen": -252.08369954427084, "logps/rejected": -225.51396484375, "loss": 0.3065, "rewards/chosen": 0.6104461749394735, "rewards/margins": 1.6404150088628136, "rewards/rejected": -1.02996883392334, "step": 15250 }, { "epoch": 0.8083640314843771, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2133766.0, "logits/rejected": -9299453.0, "logps/chosen": -307.01739501953125, "logps/rejected": -121.24494934082031, "loss": 0.294, "rewards/chosen": 0.20162302255630493, "rewards/margins": 2.815564215183258, "rewards/rejected": -2.613941192626953, "step": 15251 }, { "epoch": 0.8084170354861792, "grad_norm": 65.0, "kl": 1.6861190795898438, "learning_rate": 5e-07, "logits/chosen": -21811734.85714286, "logits/rejected": -1703799.375, "logps/chosen": -266.4298793247768, "logps/rejected": -104.21544647216797, "loss": 0.3853, "rewards/chosen": 0.5744913646153041, "rewards/margins": 2.284208995955331, "rewards/rejected": -1.7097176313400269, "step": 15252 }, { "epoch": 0.8084700394879814, "grad_norm": 45.0, "kl": 0.8785972595214844, "learning_rate": 5e-07, "logits/chosen": 5055456.4, "logits/rejected": -16753941.333333334, "logps/chosen": -153.9702392578125, "logps/rejected": -417.10986328125, "loss": 0.2056, "rewards/chosen": 1.3988444328308105, "rewards/margins": 3.953150145212809, "rewards/rejected": -2.5543057123819985, "step": 15253 }, { "epoch": 0.8085230434897834, "grad_norm": 51.5, "kl": 6.923801422119141, "learning_rate": 5e-07, "logits/chosen": -1332720.0, "logits/rejected": -29812480.0, "logps/chosen": -465.66611328125, "logps/rejected": -240.93916829427084, "loss": 0.3128, "rewards/chosen": 1.4153918266296386, "rewards/margins": 3.354075495402018, "rewards/rejected": -1.9386836687723796, "step": 15254 }, { "epoch": 0.8085760474915856, "grad_norm": 73.0, "kl": 1.8339004516601562, "learning_rate": 5e-07, "logits/chosen": -39727494.4, "logits/rejected": -20939365.333333332, "logps/chosen": -361.956640625, "logps/rejected": -256.7107747395833, "loss": 0.2178, "rewards/chosen": 1.132000732421875, "rewards/margins": 3.6619411786397302, "rewards/rejected": -2.529940446217855, "step": 15255 }, { "epoch": 0.8086290514933877, "grad_norm": 61.0, "kl": 4.485984802246094, "learning_rate": 5e-07, "logits/chosen": -5322616.0, "logits/rejected": -21838412.0, "logps/chosen": -626.09228515625, "logps/rejected": -343.4952392578125, "loss": 0.2047, "rewards/chosen": 1.7563682794570923, "rewards/margins": 4.281388878822327, "rewards/rejected": -2.5250205993652344, "step": 15256 }, { "epoch": 0.8086820554951899, "grad_norm": 47.75, "kl": 0.3452177047729492, "learning_rate": 5e-07, "logits/chosen": 3772521.6, "logits/rejected": -40676517.333333336, "logps/chosen": -257.74814453125, "logps/rejected": -260.2967936197917, "loss": 0.3338, "rewards/chosen": 0.21476829051971436, "rewards/margins": 2.2830830017725625, "rewards/rejected": -2.068314711252848, "step": 15257 }, { "epoch": 0.808735059496992, "grad_norm": 51.5, "kl": 4.015186309814453, "learning_rate": 5e-07, "logits/chosen": 711630.4, "logits/rejected": -5782156.666666667, "logps/chosen": -338.80673828125, "logps/rejected": -279.7205403645833, "loss": 0.273, "rewards/chosen": 1.0935583114624023, "rewards/margins": 4.010616302490234, "rewards/rejected": -2.917057991027832, "step": 15258 }, { "epoch": 0.8087880634987942, "grad_norm": 37.25, "kl": 3.537128448486328, "learning_rate": 5e-07, "logits/chosen": -12843448.0, "logits/rejected": -39405920.0, "logps/chosen": -425.3060302734375, "logps/rejected": -441.7872314453125, "loss": 0.3178, "rewards/chosen": 1.2689477602640789, "rewards/margins": 4.252244154612224, "rewards/rejected": -2.9832963943481445, "step": 15259 }, { "epoch": 0.8088410675005963, "grad_norm": 72.5, "kl": 6.4192657470703125, "learning_rate": 5e-07, "logits/chosen": -14917206.857142856, "logits/rejected": -715513.1875, "logps/chosen": -382.40220424107144, "logps/rejected": -181.60105895996094, "loss": 0.4307, "rewards/chosen": 0.7767807415553502, "rewards/margins": 5.08914818082537, "rewards/rejected": -4.3123674392700195, "step": 15260 }, { "epoch": 0.8088940715023984, "grad_norm": 48.75, "kl": 2.7013702392578125, "learning_rate": 5e-07, "logits/chosen": -43015960.0, "logits/rejected": -51964440.0, "logps/chosen": -305.3831481933594, "logps/rejected": -284.61175537109375, "loss": 0.2523, "rewards/chosen": 0.8440348505973816, "rewards/margins": 2.930905044078827, "rewards/rejected": -2.0868701934814453, "step": 15261 }, { "epoch": 0.8089470755042005, "grad_norm": 68.0, "kl": 1.2405166625976562, "learning_rate": 5e-07, "logits/chosen": 5662505.333333333, "logits/rejected": 50048304.0, "logps/chosen": -192.3231201171875, "logps/rejected": -323.280322265625, "loss": 0.2479, "rewards/chosen": 1.5060523351033528, "rewards/margins": 2.728458531697591, "rewards/rejected": -1.2224061965942383, "step": 15262 }, { "epoch": 0.8090000795060027, "grad_norm": 59.25, "kl": 0.146575927734375, "learning_rate": 5e-07, "logits/chosen": -25287726.0, "logits/rejected": -24257928.0, "logps/chosen": -262.577880859375, "logps/rejected": -271.9964599609375, "loss": 0.3828, "rewards/chosen": 0.17490731179714203, "rewards/margins": 1.390018567442894, "rewards/rejected": -1.215111255645752, "step": 15263 }, { "epoch": 0.8090530835078048, "grad_norm": 48.5, "kl": 0.8512458801269531, "learning_rate": 5e-07, "logits/chosen": -45851036.8, "logits/rejected": -17088005.333333332, "logps/chosen": -408.2940673828125, "logps/rejected": -209.37333170572916, "loss": 0.2844, "rewards/chosen": 0.5658331394195557, "rewards/margins": 3.788597472508749, "rewards/rejected": -3.222764333089193, "step": 15264 }, { "epoch": 0.809106087509607, "grad_norm": 62.25, "kl": 2.9058399200439453, "learning_rate": 5e-07, "logits/chosen": -13531784.0, "logits/rejected": -7564407.5, "logps/chosen": -325.363525390625, "logps/rejected": -435.36114501953125, "loss": 0.3929, "rewards/chosen": 0.12975679834683737, "rewards/margins": 3.5651796956857047, "rewards/rejected": -3.435422897338867, "step": 15265 }, { "epoch": 0.8091590915114091, "grad_norm": 45.75, "kl": 0.8766326904296875, "learning_rate": 5e-07, "logits/chosen": -31342400.0, "logits/rejected": -27413936.0, "logps/chosen": -347.950927734375, "logps/rejected": -586.27724609375, "loss": 0.214, "rewards/chosen": 0.4727012713750203, "rewards/margins": 4.71651230653127, "rewards/rejected": -4.24381103515625, "step": 15266 }, { "epoch": 0.8092120955132113, "grad_norm": 55.5, "kl": 2.3110599517822266, "learning_rate": 5e-07, "logits/chosen": 8758262.666666666, "logits/rejected": -32852322.0, "logps/chosen": -317.3105875651042, "logps/rejected": -156.81895446777344, "loss": 0.3386, "rewards/chosen": 0.88319993019104, "rewards/margins": 1.9367369413375854, "rewards/rejected": -1.0535370111465454, "step": 15267 }, { "epoch": 0.8092650995150134, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43562320.0, "logits/rejected": -33309609.14285714, "logps/chosen": -562.5006103515625, "logps/rejected": -390.5340053013393, "loss": 0.1019, "rewards/chosen": 0.607159435749054, "rewards/margins": 3.6584491303988864, "rewards/rejected": -3.0512896946498325, "step": 15268 }, { "epoch": 0.8093181035168155, "grad_norm": 36.0, "kl": 3.720094680786133, "learning_rate": 5e-07, "logits/chosen": 6853512.0, "logits/rejected": -66628249.6, "logps/chosen": -106.44088745117188, "logps/rejected": -666.98935546875, "loss": 0.2508, "rewards/chosen": 0.44540834426879883, "rewards/margins": 3.260731029510498, "rewards/rejected": -2.815322685241699, "step": 15269 }, { "epoch": 0.8093711075186176, "grad_norm": 41.75, "kl": 2.8749914169311523, "learning_rate": 5e-07, "logits/chosen": -7938443.333333333, "logits/rejected": -64873753.6, "logps/chosen": -506.5244954427083, "logps/rejected": -378.50615234375, "loss": 0.2151, "rewards/chosen": 1.2538951237996419, "rewards/margins": 3.4241824467976887, "rewards/rejected": -2.170287322998047, "step": 15270 }, { "epoch": 0.8094241115204198, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40095616.0, "logits/rejected": -14013089.333333334, "logps/chosen": -457.24395751953125, "logps/rejected": -272.8498942057292, "loss": 0.1754, "rewards/chosen": 1.0419189929962158, "rewards/margins": 3.260027805964152, "rewards/rejected": -2.218108812967936, "step": 15271 }, { "epoch": 0.8094771155222219, "grad_norm": 43.25, "kl": 1.1391716003417969, "learning_rate": 5e-07, "logits/chosen": -19693398.666666668, "logits/rejected": -17346993.6, "logps/chosen": -239.1822509765625, "logps/rejected": -217.959716796875, "loss": 0.2276, "rewards/chosen": 0.985871156056722, "rewards/margins": 3.859562714894613, "rewards/rejected": -2.8736915588378906, "step": 15272 }, { "epoch": 0.8095301195240241, "grad_norm": 52.75, "kl": 4.376382827758789, "learning_rate": 5e-07, "logits/chosen": -22149852.0, "logits/rejected": -8558068.0, "logps/chosen": -499.7388000488281, "logps/rejected": -119.86601257324219, "loss": 0.2939, "rewards/chosen": 1.9198200702667236, "rewards/margins": 2.929769992828369, "rewards/rejected": -1.0099499225616455, "step": 15273 }, { "epoch": 0.8095831235258262, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49249600.0, "logits/rejected": -25917502.4, "logps/chosen": -343.4157307942708, "logps/rejected": -306.56416015625, "loss": 0.1515, "rewards/chosen": 0.829112688700358, "rewards/margins": 3.948483149210612, "rewards/rejected": -3.119370460510254, "step": 15274 }, { "epoch": 0.8096361275276284, "grad_norm": 50.5, "kl": 0.4135904312133789, "learning_rate": 5e-07, "logits/chosen": -29262158.0, "logits/rejected": -2317557.5, "logps/chosen": -306.83514404296875, "logps/rejected": -249.32943725585938, "loss": 0.2548, "rewards/chosen": 0.7575251460075378, "rewards/margins": 3.7458195090293884, "rewards/rejected": -2.9882943630218506, "step": 15275 }, { "epoch": 0.8096891315294305, "grad_norm": 50.25, "kl": 0.97381591796875, "learning_rate": 5e-07, "logits/chosen": -31295012.0, "logits/rejected": -35566868.0, "logps/chosen": -299.21014404296875, "logps/rejected": -294.1674499511719, "loss": 0.2402, "rewards/chosen": 0.9632774591445923, "rewards/margins": 3.0327190160751343, "rewards/rejected": -2.069441556930542, "step": 15276 }, { "epoch": 0.8097421355312326, "grad_norm": 59.0, "kl": 3.8023452758789062, "learning_rate": 5e-07, "logits/chosen": -23213824.0, "logits/rejected": 1605220.75, "logps/chosen": -215.65478515625, "logps/rejected": -91.39039611816406, "loss": 0.4548, "rewards/chosen": 0.2978505690892537, "rewards/margins": 2.1831186612447104, "rewards/rejected": -1.8852680921554565, "step": 15277 }, { "epoch": 0.8097951395330347, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25789998.0, "logits/rejected": -56507962.666666664, "logps/chosen": -325.65618896484375, "logps/rejected": -631.5638020833334, "loss": 0.1373, "rewards/chosen": 0.5163467526435852, "rewards/margins": 3.543474018573761, "rewards/rejected": -3.027127265930176, "step": 15278 }, { "epoch": 0.8098481435348369, "grad_norm": 55.5, "kl": 1.614924430847168, "learning_rate": 5e-07, "logits/chosen": -21636174.0, "logits/rejected": -15225590.666666666, "logps/chosen": -395.17877197265625, "logps/rejected": -198.89007568359375, "loss": 0.1479, "rewards/chosen": 0.6596770882606506, "rewards/margins": 4.211191713809967, "rewards/rejected": -3.5515146255493164, "step": 15279 }, { "epoch": 0.809901147536639, "grad_norm": 36.5, "kl": 3.283684730529785, "learning_rate": 5e-07, "logits/chosen": 9187676.0, "logits/rejected": -47660508.0, "logps/chosen": -78.44625854492188, "logps/rejected": -373.406494140625, "loss": 0.2876, "rewards/chosen": 0.4254511594772339, "rewards/margins": 4.186354279518127, "rewards/rejected": -3.7609031200408936, "step": 15280 }, { "epoch": 0.8099541515384412, "grad_norm": 64.0, "kl": 0.3298530578613281, "learning_rate": 5e-07, "logits/chosen": -5616506.666666667, "logits/rejected": -11420759.2, "logps/chosen": -359.6190592447917, "logps/rejected": -114.9812744140625, "loss": 0.3157, "rewards/chosen": 0.04092700034379959, "rewards/margins": 2.387334890663624, "rewards/rejected": -2.346407890319824, "step": 15281 }, { "epoch": 0.8100071555402433, "grad_norm": 48.75, "kl": 1.4975872039794922, "learning_rate": 5e-07, "logits/chosen": -76411878.4, "logits/rejected": -3512611.3333333335, "logps/chosen": -281.43310546875, "logps/rejected": -275.5721028645833, "loss": 0.3692, "rewards/chosen": 0.06201965808868408, "rewards/margins": 2.80861345132192, "rewards/rejected": -2.746593793233236, "step": 15282 }, { "epoch": 0.8100601595420455, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26631184.0, "logits/rejected": -55798360.0, "logps/chosen": -294.9659423828125, "logps/rejected": -327.5933837890625, "loss": 0.2944, "rewards/chosen": 0.2898082733154297, "rewards/margins": 2.6264266967773438, "rewards/rejected": -2.336618423461914, "step": 15283 }, { "epoch": 0.8101131635438475, "grad_norm": 51.25, "kl": 6.303937911987305, "learning_rate": 5e-07, "logits/chosen": -48160678.4, "logits/rejected": -36965325.333333336, "logps/chosen": -503.57451171875, "logps/rejected": -427.1233317057292, "loss": 0.3509, "rewards/chosen": 1.2217746734619142, "rewards/margins": 3.798016325632731, "rewards/rejected": -2.576241652170817, "step": 15284 }, { "epoch": 0.8101661675456497, "grad_norm": 93.0, "kl": 3.0035629272460938, "learning_rate": 5e-07, "logits/chosen": -28403520.0, "logits/rejected": -5580164.333333333, "logps/chosen": -635.410205078125, "logps/rejected": -109.3269551595052, "loss": 0.2767, "rewards/chosen": 1.00501708984375, "rewards/margins": 4.0210318247477215, "rewards/rejected": -3.016014734903971, "step": 15285 }, { "epoch": 0.8102191715474518, "grad_norm": 54.5, "kl": 6.025993347167969, "learning_rate": 5e-07, "logits/chosen": -4341956.857142857, "logits/rejected": -36868804.0, "logps/chosen": -238.48636300223214, "logps/rejected": -591.8358764648438, "loss": 0.367, "rewards/chosen": 1.0775727544512068, "rewards/margins": 3.7596957002367293, "rewards/rejected": -2.6821229457855225, "step": 15286 }, { "epoch": 0.810272175549254, "grad_norm": 53.75, "kl": 0.4067955017089844, "learning_rate": 5e-07, "logits/chosen": -29637218.285714287, "logits/rejected": -11800768.0, "logps/chosen": -414.4857700892857, "logps/rejected": -119.14124298095703, "loss": 0.2222, "rewards/chosen": 1.3292474746704102, "rewards/margins": 5.208998441696167, "rewards/rejected": -3.879750967025757, "step": 15287 }, { "epoch": 0.8103251795510561, "grad_norm": 51.25, "kl": 2.579936981201172, "learning_rate": 5e-07, "logits/chosen": -34943216.0, "logits/rejected": -16804872.0, "logps/chosen": -416.5464782714844, "logps/rejected": -313.28057861328125, "loss": 0.391, "rewards/chosen": 0.11060018837451935, "rewards/margins": 2.360331729054451, "rewards/rejected": -2.2497315406799316, "step": 15288 }, { "epoch": 0.8103781835528583, "grad_norm": 45.75, "kl": 5.943368911743164, "learning_rate": 5e-07, "logits/chosen": -13013443.2, "logits/rejected": 885492.3333333334, "logps/chosen": -531.309130859375, "logps/rejected": -95.08734130859375, "loss": 0.2665, "rewards/chosen": 1.422805404663086, "rewards/margins": 4.608736610412597, "rewards/rejected": -3.1859312057495117, "step": 15289 }, { "epoch": 0.8104311875546604, "grad_norm": 50.5, "kl": 0.5580291748046875, "learning_rate": 5e-07, "logits/chosen": -38842777.6, "logits/rejected": -19237761.333333332, "logps/chosen": -247.681298828125, "logps/rejected": -273.9640299479167, "loss": 0.4163, "rewards/chosen": -0.31469926834106443, "rewards/margins": 1.9446999867757162, "rewards/rejected": -2.2593992551167807, "step": 15290 }, { "epoch": 0.8104841915564626, "grad_norm": 42.25, "kl": 1.9684410095214844, "learning_rate": 5e-07, "logits/chosen": -47206696.0, "logits/rejected": -17547484.0, "logps/chosen": -606.7186889648438, "logps/rejected": -198.9738311767578, "loss": 0.2415, "rewards/chosen": 1.0751659870147705, "rewards/margins": 3.1989076137542725, "rewards/rejected": -2.123741626739502, "step": 15291 }, { "epoch": 0.8105371955582646, "grad_norm": 47.0, "kl": 1.6023187637329102, "learning_rate": 5e-07, "logits/chosen": -8349033.333333333, "logits/rejected": 28697756.8, "logps/chosen": -307.5177408854167, "logps/rejected": -427.340234375, "loss": 0.2856, "rewards/chosen": 0.013000808656215668, "rewards/margins": 2.1181125029921533, "rewards/rejected": -2.1051116943359376, "step": 15292 }, { "epoch": 0.8105901995600668, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8182226.666666667, "logits/rejected": -22002636.8, "logps/chosen": -332.1648763020833, "logps/rejected": -277.359814453125, "loss": 0.3386, "rewards/chosen": -0.09192453821500142, "rewards/margins": 1.5074330667654674, "rewards/rejected": -1.5993576049804688, "step": 15293 }, { "epoch": 0.8106432035618689, "grad_norm": 53.5, "kl": 0.72454833984375, "learning_rate": 5e-07, "logits/chosen": -74064456.0, "logits/rejected": -1275789.0, "logps/chosen": -220.992919921875, "logps/rejected": -163.04469299316406, "loss": 0.4548, "rewards/chosen": -0.7267544269561768, "rewards/margins": 0.07863688468933105, "rewards/rejected": -0.8053913116455078, "step": 15294 }, { "epoch": 0.8106962075636711, "grad_norm": 35.5, "kl": 3.092156410217285, "learning_rate": 5e-07, "logits/chosen": -8718041.6, "logits/rejected": 1210974.5, "logps/chosen": -95.01551513671875, "logps/rejected": -139.80828857421875, "loss": 0.369, "rewards/chosen": 0.3169680118560791, "rewards/margins": 3.604886738459269, "rewards/rejected": -3.28791872660319, "step": 15295 }, { "epoch": 0.8107492115654732, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -176525040.0, "logits/rejected": -12944048.0, "logps/chosen": -448.93115234375, "logps/rejected": -268.83770751953125, "loss": 0.2313, "rewards/chosen": 0.5710842609405518, "rewards/margins": 3.377530336380005, "rewards/rejected": -2.806446075439453, "step": 15296 }, { "epoch": 0.8108022155672753, "grad_norm": 56.75, "kl": 3.157674789428711, "learning_rate": 5e-07, "logits/chosen": -37271379.2, "logits/rejected": -17140192.0, "logps/chosen": -444.045654296875, "logps/rejected": -202.94677734375, "loss": 0.313, "rewards/chosen": 1.2064369201660157, "rewards/margins": 3.2542538324991863, "rewards/rejected": -2.0478169123331704, "step": 15297 }, { "epoch": 0.8108552195690775, "grad_norm": 56.5, "kl": 0.6043815612792969, "learning_rate": 5e-07, "logits/chosen": 1584232.1666666667, "logits/rejected": -44947667.2, "logps/chosen": -331.56972249348956, "logps/rejected": -405.89873046875, "loss": 0.1923, "rewards/chosen": 1.032881736755371, "rewards/margins": 3.891265296936035, "rewards/rejected": -2.858383560180664, "step": 15298 }, { "epoch": 0.8109082235708795, "grad_norm": 49.0, "kl": 6.561643600463867, "learning_rate": 5e-07, "logits/chosen": 1017590.5, "logits/rejected": -21799972.0, "logps/chosen": -477.3683776855469, "logps/rejected": -170.16371154785156, "loss": 0.3674, "rewards/chosen": 0.8736574053764343, "rewards/margins": 3.5188536047935486, "rewards/rejected": -2.6451961994171143, "step": 15299 }, { "epoch": 0.8109612275726817, "grad_norm": 65.0, "kl": 6.119804382324219, "learning_rate": 5e-07, "logits/chosen": -71186008.0, "logits/rejected": 27931206.0, "logps/chosen": -948.8043823242188, "logps/rejected": -257.7086486816406, "loss": 0.2908, "rewards/chosen": 1.7191636562347412, "rewards/margins": 3.8444855213165283, "rewards/rejected": -2.125321865081787, "step": 15300 }, { "epoch": 0.8110142315744838, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11400656.0, "logits/rejected": 7771254.5, "logps/chosen": -216.32421875, "logps/rejected": -203.2957763671875, "loss": 0.3678, "rewards/chosen": 0.06276091933250427, "rewards/margins": 1.4125377833843231, "rewards/rejected": -1.3497768640518188, "step": 15301 }, { "epoch": 0.811067235576286, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38182410.666666664, "logits/rejected": -1190681.75, "logps/chosen": -301.96893310546875, "logps/rejected": -61.37079620361328, "loss": 0.3533, "rewards/chosen": 0.23249910275141397, "rewards/margins": 3.136528472105662, "rewards/rejected": -2.904029369354248, "step": 15302 }, { "epoch": 0.8111202395780881, "grad_norm": 60.5, "kl": 1.7325248718261719, "learning_rate": 5e-07, "logits/chosen": -23643396.8, "logits/rejected": 43754229.333333336, "logps/chosen": -189.89727783203125, "logps/rejected": -459.37109375, "loss": 0.3995, "rewards/chosen": 0.04372376799583435, "rewards/margins": 2.226235733429591, "rewards/rejected": -2.1825119654337564, "step": 15303 }, { "epoch": 0.8111732435798903, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4557288.0, "logits/rejected": -15700697.6, "logps/chosen": -298.15749104817706, "logps/rejected": -258.4365234375, "loss": 0.1864, "rewards/chosen": 0.6568059126536051, "rewards/margins": 3.70075790087382, "rewards/rejected": -3.043951988220215, "step": 15304 }, { "epoch": 0.8112262475816924, "grad_norm": 53.5, "kl": 2.1794395446777344, "learning_rate": 5e-07, "logits/chosen": -16134504.0, "logits/rejected": -9832602.666666666, "logps/chosen": -803.77314453125, "logps/rejected": -306.80588785807294, "loss": 0.2945, "rewards/chosen": 0.9804569244384765, "rewards/margins": 3.0716882705688477, "rewards/rejected": -2.091231346130371, "step": 15305 }, { "epoch": 0.8112792515834946, "grad_norm": 33.5, "kl": 1.9849815368652344, "learning_rate": 5e-07, "logits/chosen": -11948892.0, "logits/rejected": -8482654.0, "logps/chosen": -410.7472229003906, "logps/rejected": -201.05296325683594, "loss": 0.2893, "rewards/chosen": 0.9732078313827515, "rewards/margins": 3.526089310646057, "rewards/rejected": -2.5528814792633057, "step": 15306 }, { "epoch": 0.8113322555852966, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6599742.0, "logits/rejected": -44353568.0, "logps/chosen": -116.57499694824219, "logps/rejected": -399.6229654947917, "loss": 0.1668, "rewards/chosen": 0.054584801197052, "rewards/margins": 2.888860364754995, "rewards/rejected": -2.834275563557943, "step": 15307 }, { "epoch": 0.8113852595870988, "grad_norm": 39.25, "kl": 1.3020515441894531, "learning_rate": 5e-07, "logits/chosen": -35256973.333333336, "logits/rejected": -20063246.4, "logps/chosen": -242.19217936197916, "logps/rejected": -265.75849609375, "loss": 0.2409, "rewards/chosen": 0.7347198327382406, "rewards/margins": 3.1717422326405846, "rewards/rejected": -2.437022399902344, "step": 15308 }, { "epoch": 0.8114382635889009, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3343139.3333333335, "logits/rejected": -5795538.4, "logps/chosen": -354.428955078125, "logps/rejected": -135.99593505859374, "loss": 0.1523, "rewards/chosen": 1.2348899841308594, "rewards/margins": 4.588246154785156, "rewards/rejected": -3.3533561706542967, "step": 15309 }, { "epoch": 0.8114912675907031, "grad_norm": 100.0, "kl": 0.44103240966796875, "learning_rate": 5e-07, "logits/chosen": -32311205.333333332, "logits/rejected": -15858254.4, "logps/chosen": -204.9722900390625, "logps/rejected": -221.356884765625, "loss": 0.3516, "rewards/chosen": 0.3486960728963216, "rewards/margins": 1.6569602330525715, "rewards/rejected": -1.30826416015625, "step": 15310 }, { "epoch": 0.8115442715925052, "grad_norm": 78.5, "kl": 5.453559875488281, "learning_rate": 5e-07, "logits/chosen": -54082619.428571425, "logits/rejected": -2801610.25, "logps/chosen": -598.8544224330357, "logps/rejected": -102.52812957763672, "loss": 0.3379, "rewards/chosen": 1.14015075138637, "rewards/margins": 5.350594248090472, "rewards/rejected": -4.210443496704102, "step": 15311 }, { "epoch": 0.8115972755943074, "grad_norm": 33.0, "kl": 0.60595703125, "learning_rate": 5e-07, "logits/chosen": -9558436.0, "logits/rejected": -55873830.4, "logps/chosen": -142.70758056640625, "logps/rejected": -362.7003173828125, "loss": 0.2338, "rewards/chosen": 0.4762403170267741, "rewards/margins": 3.4186856905619303, "rewards/rejected": -2.9424453735351563, "step": 15312 }, { "epoch": 0.8116502795961095, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35638018.666666664, "logits/rejected": -13041285.6, "logps/chosen": -385.0188395182292, "logps/rejected": -378.38369140625, "loss": 0.1133, "rewards/chosen": 1.3096130688985188, "rewards/margins": 5.121939500172933, "rewards/rejected": -3.812326431274414, "step": 15313 }, { "epoch": 0.8117032835979117, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59546936.0, "logits/rejected": -42290588.0, "logps/chosen": -226.8631591796875, "logps/rejected": -582.2332153320312, "loss": 0.2115, "rewards/chosen": 0.6191920638084412, "rewards/margins": 3.9502028822898865, "rewards/rejected": -3.3310108184814453, "step": 15314 }, { "epoch": 0.8117562875997137, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13004281.333333334, "logits/rejected": -48320563.2, "logps/chosen": -419.1476236979167, "logps/rejected": -255.9400634765625, "loss": 0.1526, "rewards/chosen": 1.5639047622680664, "rewards/margins": 3.632485580444336, "rewards/rejected": -2.0685808181762697, "step": 15315 }, { "epoch": 0.8118092916015159, "grad_norm": 49.0, "kl": 1.1941947937011719, "learning_rate": 5e-07, "logits/chosen": -11369605.333333334, "logits/rejected": -16557978.0, "logps/chosen": -231.49393717447916, "logps/rejected": -357.03131103515625, "loss": 0.3701, "rewards/chosen": 0.3645556370417277, "rewards/margins": 3.488584796587626, "rewards/rejected": -3.1240291595458984, "step": 15316 }, { "epoch": 0.811862295603318, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5483449.5, "logits/rejected": -50389193.14285714, "logps/chosen": -37.351806640625, "logps/rejected": -422.74937220982144, "loss": 0.093, "rewards/chosen": 0.1528060883283615, "rewards/margins": 3.602009034582547, "rewards/rejected": -3.4492029462541853, "step": 15317 }, { "epoch": 0.8119152996051202, "grad_norm": 48.75, "kl": 0.08675384521484375, "learning_rate": 5e-07, "logits/chosen": -52452640.0, "logits/rejected": -10861313.6, "logps/chosen": -370.2128499348958, "logps/rejected": -213.053564453125, "loss": 0.2307, "rewards/chosen": 0.28625690937042236, "rewards/margins": 3.0832974672317506, "rewards/rejected": -2.797040557861328, "step": 15318 }, { "epoch": 0.8119683036069223, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12010071.0, "logits/rejected": -4484802.5, "logps/chosen": -282.5878601074219, "logps/rejected": -125.26017761230469, "loss": 0.1647, "rewards/chosen": 1.2457571029663086, "rewards/margins": 4.393346309661865, "rewards/rejected": -3.1475892066955566, "step": 15319 }, { "epoch": 0.8120213076087245, "grad_norm": 46.25, "kl": 2.6172943115234375, "learning_rate": 5e-07, "logits/chosen": -26351364.0, "logits/rejected": -15863595.0, "logps/chosen": -148.4003143310547, "logps/rejected": -408.1239013671875, "loss": 0.2128, "rewards/chosen": 1.0963252782821655, "rewards/margins": 3.6495953798294067, "rewards/rejected": -2.553270101547241, "step": 15320 }, { "epoch": 0.8120743116105266, "grad_norm": 47.75, "kl": 6.5671539306640625, "learning_rate": 5e-07, "logits/chosen": -30403757.714285713, "logits/rejected": -57801328.0, "logps/chosen": -299.0074986049107, "logps/rejected": -379.93841552734375, "loss": 0.48, "rewards/chosen": 0.6382847513471331, "rewards/margins": 3.056937047413417, "rewards/rejected": -2.418652296066284, "step": 15321 }, { "epoch": 0.8121273156123288, "grad_norm": 50.5, "kl": 2.8882617950439453, "learning_rate": 5e-07, "logits/chosen": -53220732.0, "logits/rejected": -12629740.0, "logps/chosen": -484.6064453125, "logps/rejected": -165.09228515625, "loss": 0.3096, "rewards/chosen": 0.4194504916667938, "rewards/margins": 3.704076737165451, "rewards/rejected": -3.2846262454986572, "step": 15322 }, { "epoch": 0.8121803196141308, "grad_norm": 40.75, "kl": 4.154714584350586, "learning_rate": 5e-07, "logits/chosen": -12139722.4, "logits/rejected": -54032906.666666664, "logps/chosen": -260.5978271484375, "logps/rejected": -528.343505859375, "loss": 0.3112, "rewards/chosen": 0.7970822334289551, "rewards/margins": 3.422599124908447, "rewards/rejected": -2.625516891479492, "step": 15323 }, { "epoch": 0.812233323615933, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8045568.0, "logits/rejected": -6404825.2, "logps/chosen": -214.35538736979166, "logps/rejected": -270.231201171875, "loss": 0.2957, "rewards/chosen": 0.4989182949066162, "rewards/margins": 2.1143863201141357, "rewards/rejected": -1.6154680252075195, "step": 15324 }, { "epoch": 0.8122863276177351, "grad_norm": 56.25, "kl": 0.6904811859130859, "learning_rate": 5e-07, "logits/chosen": -21464628.0, "logits/rejected": -5900354.5, "logps/chosen": -197.2924346923828, "logps/rejected": -290.2196044921875, "loss": 0.3057, "rewards/chosen": 0.21927575767040253, "rewards/margins": 2.4398055523633957, "rewards/rejected": -2.220529794692993, "step": 15325 }, { "epoch": 0.8123393316195373, "grad_norm": 48.25, "kl": 0.06300926208496094, "learning_rate": 5e-07, "logits/chosen": -14786248.0, "logits/rejected": -42034264.0, "logps/chosen": -264.0759684244792, "logps/rejected": -407.26947021484375, "loss": 0.3307, "rewards/chosen": 0.44724929332733154, "rewards/margins": 3.519924759864807, "rewards/rejected": -3.0726754665374756, "step": 15326 }, { "epoch": 0.8123923356213394, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48640821.333333336, "logits/rejected": 751291.25, "logps/chosen": -199.1044718424479, "logps/rejected": -113.5589111328125, "loss": 0.2485, "rewards/chosen": -0.14903965592384338, "rewards/margins": 2.891299623250961, "rewards/rejected": -3.0403392791748045, "step": 15327 }, { "epoch": 0.8124453396231416, "grad_norm": 41.25, "kl": 7.304067611694336, "learning_rate": 5e-07, "logits/chosen": -35161382.4, "logits/rejected": -22504021.333333332, "logps/chosen": -888.423046875, "logps/rejected": -305.0003255208333, "loss": 0.2261, "rewards/chosen": 1.5552426338195802, "rewards/margins": 4.388907210032145, "rewards/rejected": -2.833664576212565, "step": 15328 }, { "epoch": 0.8124983436249437, "grad_norm": 47.75, "kl": 0.4574708938598633, "learning_rate": 5e-07, "logits/chosen": 5021192.666666667, "logits/rejected": 15738700.8, "logps/chosen": -39.810567220052086, "logps/rejected": -263.061328125, "loss": 0.3299, "rewards/chosen": -0.055024534463882446, "rewards/margins": 1.958877557516098, "rewards/rejected": -2.0139020919799804, "step": 15329 }, { "epoch": 0.8125513476267459, "grad_norm": 49.0, "kl": 0.8473091125488281, "learning_rate": 5e-07, "logits/chosen": -5834059.5, "logits/rejected": -45615796.0, "logps/chosen": -89.24748992919922, "logps/rejected": -178.18963623046875, "loss": 0.3085, "rewards/chosen": 0.8618874549865723, "rewards/margins": 1.8738410472869873, "rewards/rejected": -1.011953592300415, "step": 15330 }, { "epoch": 0.8126043516285479, "grad_norm": 63.75, "kl": 0.39932727813720703, "learning_rate": 5e-07, "logits/chosen": -29358060.8, "logits/rejected": -7170590.666666667, "logps/chosen": -303.3749755859375, "logps/rejected": -518.8553059895834, "loss": 0.263, "rewards/chosen": 0.841594123840332, "rewards/margins": 2.84447914759318, "rewards/rejected": -2.002885023752848, "step": 15331 }, { "epoch": 0.8126573556303501, "grad_norm": 46.25, "kl": 3.178220748901367, "learning_rate": 5e-07, "logits/chosen": -46717813.333333336, "logits/rejected": 13916409.0, "logps/chosen": -430.4654541015625, "logps/rejected": -573.3975219726562, "loss": 0.3312, "rewards/chosen": 1.2255047957102458, "rewards/margins": 3.035857955614726, "rewards/rejected": -1.81035315990448, "step": 15332 }, { "epoch": 0.8127103596321522, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25949230.0, "logits/rejected": -18725154.666666668, "logps/chosen": -163.4283905029297, "logps/rejected": -235.3333740234375, "loss": 0.2472, "rewards/chosen": 0.18280182778835297, "rewards/margins": 2.1515216281016665, "rewards/rejected": -1.9687198003133137, "step": 15333 }, { "epoch": 0.8127633636339544, "grad_norm": 47.5, "kl": 1.9238395690917969, "learning_rate": 5e-07, "logits/chosen": -28877328.0, "logits/rejected": 27058932.0, "logps/chosen": -1010.0612182617188, "logps/rejected": -237.98635864257812, "loss": 0.1987, "rewards/chosen": 1.8951668739318848, "rewards/margins": 3.4102933406829834, "rewards/rejected": -1.5151264667510986, "step": 15334 }, { "epoch": 0.8128163676357565, "grad_norm": 32.5, "kl": 2.288209915161133, "learning_rate": 5e-07, "logits/chosen": -24757084.0, "logits/rejected": -7374851.0, "logps/chosen": -862.2537841796875, "logps/rejected": -222.07022094726562, "loss": 0.1929, "rewards/chosen": 1.654836654663086, "rewards/margins": 4.284569263458252, "rewards/rejected": -2.629732608795166, "step": 15335 }, { "epoch": 0.8128693716375587, "grad_norm": 51.75, "kl": 0.7168655395507812, "learning_rate": 5e-07, "logits/chosen": -79372768.0, "logits/rejected": -12169620.0, "logps/chosen": -409.4040832519531, "logps/rejected": -195.7132110595703, "loss": 0.224, "rewards/chosen": 0.6174396276473999, "rewards/margins": 3.5121535062789917, "rewards/rejected": -2.894713878631592, "step": 15336 }, { "epoch": 0.8129223756393608, "grad_norm": 45.75, "kl": 1.7371940612792969, "learning_rate": 5e-07, "logits/chosen": -79459864.0, "logits/rejected": -39624960.0, "logps/chosen": -362.64739990234375, "logps/rejected": -183.56399536132812, "loss": 0.2754, "rewards/chosen": 0.15814724564552307, "rewards/margins": 3.121592193841934, "rewards/rejected": -2.963444948196411, "step": 15337 }, { "epoch": 0.812975379641163, "grad_norm": 64.5, "kl": 9.17161750793457, "learning_rate": 5e-07, "logits/chosen": -68741000.0, "logits/rejected": -44673788.0, "logps/chosen": -731.0473022460938, "logps/rejected": -308.5355224609375, "loss": 0.2525, "rewards/chosen": 2.0249695777893066, "rewards/margins": 4.171449422836304, "rewards/rejected": -2.146479845046997, "step": 15338 }, { "epoch": 0.813028383642965, "grad_norm": 65.5, "kl": 5.544059753417969, "learning_rate": 5e-07, "logits/chosen": -29224960.0, "logits/rejected": -64974272.0, "logps/chosen": -313.14516194661456, "logps/rejected": -317.0965576171875, "loss": 0.3854, "rewards/chosen": 0.8123014767964681, "rewards/margins": 3.8402093251546225, "rewards/rejected": -3.0279078483581543, "step": 15339 }, { "epoch": 0.8130813876447672, "grad_norm": 47.75, "kl": 1.4100360870361328, "learning_rate": 5e-07, "logits/chosen": -5170728.5, "logits/rejected": -1437042.0, "logps/chosen": -52.0579719543457, "logps/rejected": -364.4823303222656, "loss": 0.2614, "rewards/chosen": 0.6082000732421875, "rewards/margins": 3.9805679321289062, "rewards/rejected": -3.3723678588867188, "step": 15340 }, { "epoch": 0.8131343916465693, "grad_norm": 47.0, "kl": 0.23742294311523438, "learning_rate": 5e-07, "logits/chosen": -52696152.0, "logits/rejected": -31094932.0, "logps/chosen": -362.87255859375, "logps/rejected": -405.7984313964844, "loss": 0.2616, "rewards/chosen": 0.2524421811103821, "rewards/margins": 3.8172557950019836, "rewards/rejected": -3.5648136138916016, "step": 15341 }, { "epoch": 0.8131873956483715, "grad_norm": 47.5, "kl": 1.233896255493164, "learning_rate": 5e-07, "logits/chosen": -54797395.2, "logits/rejected": -22824237.333333332, "logps/chosen": -358.786083984375, "logps/rejected": -349.3774007161458, "loss": 0.2257, "rewards/chosen": 0.8014233589172364, "rewards/margins": 4.358601538340251, "rewards/rejected": -3.557178179423014, "step": 15342 }, { "epoch": 0.8132403996501736, "grad_norm": 49.75, "kl": 4.040317535400391, "learning_rate": 5e-07, "logits/chosen": -19658716.0, "logits/rejected": -5686514.5, "logps/chosen": -359.9077555338542, "logps/rejected": -100.29061889648438, "loss": 0.4162, "rewards/chosen": 0.6626652081807455, "rewards/margins": 1.7139023145039878, "rewards/rejected": -1.0512371063232422, "step": 15343 }, { "epoch": 0.8132934036519758, "grad_norm": 30.875, "kl": 0.1800975799560547, "learning_rate": 5e-07, "logits/chosen": 5254553.0, "logits/rejected": -39350528.0, "logps/chosen": -4.910764694213867, "logps/rejected": -359.06107003348217, "loss": 0.1343, "rewards/chosen": 0.35178127884864807, "rewards/margins": 3.2258524000644684, "rewards/rejected": -2.8740711212158203, "step": 15344 }, { "epoch": 0.8133464076537779, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23970024.0, "logits/rejected": -45835484.0, "logps/chosen": -225.05926513671875, "logps/rejected": -448.9721374511719, "loss": 0.3395, "rewards/chosen": -0.5021628737449646, "rewards/margins": 2.4307175278663635, "rewards/rejected": -2.932880401611328, "step": 15345 }, { "epoch": 0.81339941165558, "grad_norm": 97.0, "kl": 7.5846099853515625, "learning_rate": 5e-07, "logits/chosen": -24378318.4, "logits/rejected": -4807829.0, "logps/chosen": -483.20048828125, "logps/rejected": -255.10677083333334, "loss": 0.3218, "rewards/chosen": 1.5069536209106444, "rewards/margins": 3.488411966959635, "rewards/rejected": -1.981458346048991, "step": 15346 }, { "epoch": 0.8134524156573821, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3234549.3333333335, "logits/rejected": -27969718.4, "logps/chosen": -244.2834269205729, "logps/rejected": -597.40634765625, "loss": 0.1411, "rewards/chosen": 1.5636749267578125, "rewards/margins": 4.523067474365234, "rewards/rejected": -2.959392547607422, "step": 15347 }, { "epoch": 0.8135054196591842, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23284452.0, "logits/rejected": -22321124.0, "logps/chosen": -307.8148193359375, "logps/rejected": -346.756103515625, "loss": 0.2364, "rewards/chosen": 0.5136180520057678, "rewards/margins": 3.351272404193878, "rewards/rejected": -2.8376543521881104, "step": 15348 }, { "epoch": 0.8135584236609864, "grad_norm": 39.75, "kl": 0.6325607299804688, "learning_rate": 5e-07, "logits/chosen": -57127816.0, "logits/rejected": -8776134.666666666, "logps/chosen": -228.94276428222656, "logps/rejected": -252.26472981770834, "loss": 0.2597, "rewards/chosen": -0.008252906613051891, "rewards/margins": 1.7984179180736344, "rewards/rejected": -1.8066708246866863, "step": 15349 }, { "epoch": 0.8136114276627885, "grad_norm": 37.75, "kl": 4.010059356689453, "learning_rate": 5e-07, "logits/chosen": -12117340.0, "logits/rejected": -15243618.0, "logps/chosen": -174.1626993815104, "logps/rejected": -331.4323425292969, "loss": 0.3772, "rewards/chosen": 0.6071415742238363, "rewards/margins": 3.4425633748372397, "rewards/rejected": -2.8354218006134033, "step": 15350 }, { "epoch": 0.8136644316645907, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109650848.0, "logits/rejected": -22481045.333333332, "logps/chosen": -253.1571807861328, "logps/rejected": -207.02665201822916, "loss": 0.1609, "rewards/chosen": 0.3590385317802429, "rewards/margins": 3.1506540179252625, "rewards/rejected": -2.7916154861450195, "step": 15351 }, { "epoch": 0.8137174356663928, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5483670.0, "logits/rejected": -18264166.85714286, "logps/chosen": -63.52571105957031, "logps/rejected": -304.42954799107144, "loss": 0.1817, "rewards/chosen": 0.04238853603601456, "rewards/margins": 2.537634060851165, "rewards/rejected": -2.4952455248151506, "step": 15352 }, { "epoch": 0.813770439668195, "grad_norm": 57.25, "kl": 0.3909263610839844, "learning_rate": 5e-07, "logits/chosen": -57515368.0, "logits/rejected": -26758162.0, "logps/chosen": -345.6063232421875, "logps/rejected": -294.7375793457031, "loss": 0.3187, "rewards/chosen": -0.04106006771326065, "rewards/margins": 2.3471878990530968, "rewards/rejected": -2.3882479667663574, "step": 15353 }, { "epoch": 0.813823443669997, "grad_norm": 53.5, "kl": 0.6203155517578125, "learning_rate": 5e-07, "logits/chosen": -514576.8, "logits/rejected": 15276734.666666666, "logps/chosen": -251.389013671875, "logps/rejected": -330.7964274088542, "loss": 0.3413, "rewards/chosen": 0.49811525344848634, "rewards/margins": 2.0984546661376955, "rewards/rejected": -1.600339412689209, "step": 15354 }, { "epoch": 0.8138764476717992, "grad_norm": 60.0, "kl": 0.7209644317626953, "learning_rate": 5e-07, "logits/chosen": -25066659.2, "logits/rejected": 11327734.666666666, "logps/chosen": -258.5780517578125, "logps/rejected": -431.2111002604167, "loss": 0.332, "rewards/chosen": 0.17400337457656861, "rewards/margins": 3.761403810977936, "rewards/rejected": -3.587400436401367, "step": 15355 }, { "epoch": 0.8139294516736013, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82278874.66666667, "logits/rejected": -23670704.0, "logps/chosen": -305.15354410807294, "logps/rejected": -400.3454833984375, "loss": 0.2347, "rewards/chosen": -0.14462247490882874, "rewards/margins": 2.6947303235530855, "rewards/rejected": -2.8393527984619142, "step": 15356 }, { "epoch": 0.8139824556754035, "grad_norm": 73.0, "kl": 2.2775750160217285, "learning_rate": 5e-07, "logits/chosen": 28977866.666666668, "logits/rejected": -33500192.0, "logps/chosen": -277.64453125, "logps/rejected": -397.279296875, "loss": 0.3167, "rewards/chosen": 0.954216480255127, "rewards/margins": 2.977832555770874, "rewards/rejected": -2.023616075515747, "step": 15357 }, { "epoch": 0.8140354596772056, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39010480.0, "logits/rejected": -25409648.0, "logps/chosen": -290.6565348307292, "logps/rejected": -364.860205078125, "loss": 0.2255, "rewards/chosen": 0.5107944011688232, "rewards/margins": 3.06336932182312, "rewards/rejected": -2.5525749206542967, "step": 15358 }, { "epoch": 0.8140884636790078, "grad_norm": 66.0, "kl": 7.005992889404297, "learning_rate": 5e-07, "logits/chosen": -25144556.0, "logps/chosen": -326.9806213378906, "loss": 0.3946, "rewards/chosen": 1.205355167388916, "step": 15359 }, { "epoch": 0.8141414676808099, "grad_norm": 46.25, "kl": 1.7041473388671875, "learning_rate": 5e-07, "logits/chosen": -9744604.0, "logits/rejected": -16527558.4, "logps/chosen": -299.3297932942708, "logps/rejected": -269.6935302734375, "loss": 0.275, "rewards/chosen": 0.3551162878672282, "rewards/margins": 2.2814542929331463, "rewards/rejected": -1.926338005065918, "step": 15360 }, { "epoch": 0.814194471682612, "grad_norm": 45.75, "kl": 1.0082931518554688, "learning_rate": 5e-07, "logits/chosen": -37405800.0, "logits/rejected": -15442853.0, "logps/chosen": -334.0950622558594, "logps/rejected": -293.8657531738281, "loss": 0.3191, "rewards/chosen": 0.08493904024362564, "rewards/margins": 2.838319815695286, "rewards/rejected": -2.75338077545166, "step": 15361 }, { "epoch": 0.8142474756844141, "grad_norm": 47.25, "kl": 1.135507583618164, "learning_rate": 5e-07, "logits/chosen": -40583075.2, "logits/rejected": -1801304.6666666667, "logps/chosen": -434.982275390625, "logps/rejected": -191.78387451171875, "loss": 0.2384, "rewards/chosen": 1.076404094696045, "rewards/margins": 3.9128270149230957, "rewards/rejected": -2.836422920227051, "step": 15362 }, { "epoch": 0.8143004796862163, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22296458.0, "logits/rejected": -27149466.666666668, "logps/chosen": -279.21270751953125, "logps/rejected": -300.8061930338542, "loss": 0.2172, "rewards/chosen": 1.5628811120986938, "rewards/margins": 3.6011657317479453, "rewards/rejected": -2.0382846196492515, "step": 15363 }, { "epoch": 0.8143534836880184, "grad_norm": 47.75, "kl": 2.8158607482910156, "learning_rate": 5e-07, "logits/chosen": -6821602.666666667, "logits/rejected": -28099724.8, "logps/chosen": -87.84290568033855, "logps/rejected": -292.7523681640625, "loss": 0.3156, "rewards/chosen": 1.0309176445007324, "rewards/margins": 2.3879608154296874, "rewards/rejected": -1.357043170928955, "step": 15364 }, { "epoch": 0.8144064876898206, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29579888.0, "logits/rejected": -718771.375, "logps/chosen": -246.3663126627604, "logps/rejected": -70.53952026367188, "loss": 0.4228, "rewards/chosen": -0.03543979922930399, "rewards/margins": 2.139764000972112, "rewards/rejected": -2.175203800201416, "step": 15365 }, { "epoch": 0.8144594916916227, "grad_norm": 41.0, "kl": 2.348177909851074, "learning_rate": 5e-07, "logits/chosen": -23421030.4, "logits/rejected": -23146528.0, "logps/chosen": -258.1615234375, "logps/rejected": -410.3719075520833, "loss": 0.2705, "rewards/chosen": 0.669273853302002, "rewards/margins": 4.143539905548096, "rewards/rejected": -3.4742660522460938, "step": 15366 }, { "epoch": 0.8145124956934249, "grad_norm": 41.25, "kl": 1.8811569213867188, "learning_rate": 5e-07, "logits/chosen": -11853174.0, "logits/rejected": 10957191.0, "logps/chosen": -135.7556915283203, "logps/rejected": -371.9256286621094, "loss": 0.2922, "rewards/chosen": 0.17929719388484955, "rewards/margins": 3.0316202491521835, "rewards/rejected": -2.852323055267334, "step": 15367 }, { "epoch": 0.814565499695227, "grad_norm": 60.25, "kl": 0.404083251953125, "learning_rate": 5e-07, "logits/chosen": -84035622.4, "logits/rejected": 32661989.333333332, "logps/chosen": -335.26357421875, "logps/rejected": -439.7903645833333, "loss": 0.3864, "rewards/chosen": 0.14065895080566407, "rewards/margins": 1.481227970123291, "rewards/rejected": -1.340569019317627, "step": 15368 }, { "epoch": 0.8146185036970292, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 28561214.0, "logits/rejected": -44339704.0, "logps/chosen": -257.5745849609375, "logps/rejected": -429.6148681640625, "loss": 0.2928, "rewards/chosen": 0.3167838454246521, "rewards/margins": 2.4063910841941833, "rewards/rejected": -2.0896072387695312, "step": 15369 }, { "epoch": 0.8146715076988312, "grad_norm": 68.0, "kl": 5.257723808288574, "learning_rate": 5e-07, "logits/chosen": -19484732.0, "logits/rejected": -27561746.0, "logps/chosen": -286.91766357421875, "logps/rejected": -350.64422607421875, "loss": 0.2472, "rewards/chosen": 1.2726373672485352, "rewards/margins": 3.4692347049713135, "rewards/rejected": -2.1965973377227783, "step": 15370 }, { "epoch": 0.8147245117006334, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15702058.0, "logits/rejected": -16739511.0, "logps/chosen": -429.10321044921875, "logps/rejected": -404.3584899902344, "loss": 0.2731, "rewards/chosen": 0.33609992265701294, "rewards/margins": 2.925904333591461, "rewards/rejected": -2.5898044109344482, "step": 15371 }, { "epoch": 0.8147775157024355, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40756130.666666664, "logits/rejected": -11617618.4, "logps/chosen": -327.54022216796875, "logps/rejected": -260.197705078125, "loss": 0.2442, "rewards/chosen": 0.29126179218292236, "rewards/margins": 2.821916699409485, "rewards/rejected": -2.5306549072265625, "step": 15372 }, { "epoch": 0.8148305197042377, "grad_norm": 56.0, "kl": 0.5690155029296875, "learning_rate": 5e-07, "logits/chosen": -13287360.0, "logits/rejected": -30441283.2, "logps/chosen": -117.55979410807292, "logps/rejected": -271.9593505859375, "loss": 0.2911, "rewards/chosen": -0.17447205384572348, "rewards/margins": 2.0578689495722453, "rewards/rejected": -2.232341003417969, "step": 15373 }, { "epoch": 0.8148835237060398, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50437028.0, "logits/rejected": 25598848.0, "logps/chosen": -206.62095642089844, "logps/rejected": -412.2137858072917, "loss": 0.2258, "rewards/chosen": 0.12714804708957672, "rewards/margins": 2.2327498346567154, "rewards/rejected": -2.1056017875671387, "step": 15374 }, { "epoch": 0.814936527707842, "grad_norm": 43.5, "kl": 1.0076637268066406, "learning_rate": 5e-07, "logits/chosen": -59335560.0, "logits/rejected": -6804369.5, "logps/chosen": -606.0606689453125, "logps/rejected": -162.4219970703125, "loss": 0.1957, "rewards/chosen": 1.3036484718322754, "rewards/margins": 3.3731746673583984, "rewards/rejected": -2.069526195526123, "step": 15375 }, { "epoch": 0.814989531709644, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24004160.0, "logits/rejected": 894772.1, "logps/chosen": -334.3429768880208, "logps/rejected": -193.42747802734374, "loss": 0.2727, "rewards/chosen": -0.05621133248011271, "rewards/margins": 2.2086106737454734, "rewards/rejected": -2.264822006225586, "step": 15376 }, { "epoch": 0.8150425357114462, "grad_norm": 38.25, "kl": 2.884197235107422, "learning_rate": 5e-07, "logits/chosen": 3990048.0, "logits/rejected": -24370626.0, "logps/chosen": -102.14848327636719, "logps/rejected": -244.01625061035156, "loss": 0.2419, "rewards/chosen": 0.9212013483047485, "rewards/margins": 3.351349949836731, "rewards/rejected": -2.4301486015319824, "step": 15377 }, { "epoch": 0.8150955397132483, "grad_norm": 57.5, "kl": 2.838611602783203, "learning_rate": 5e-07, "logits/chosen": -36716053.333333336, "logits/rejected": -28522234.0, "logps/chosen": -363.4621988932292, "logps/rejected": -185.91737365722656, "loss": 0.3748, "rewards/chosen": 0.565265417098999, "rewards/margins": 2.8262956142425537, "rewards/rejected": -2.2610301971435547, "step": 15378 }, { "epoch": 0.8151485437150505, "grad_norm": 43.0, "kl": 1.5899124145507812, "learning_rate": 5e-07, "logits/chosen": -34588896.0, "logits/rejected": -7272793.6, "logps/chosen": -341.6405029296875, "logps/rejected": -409.1832763671875, "loss": 0.2321, "rewards/chosen": 0.6033987204233805, "rewards/margins": 3.031116501490275, "rewards/rejected": -2.4277177810668946, "step": 15379 }, { "epoch": 0.8152015477168526, "grad_norm": 59.25, "kl": 1.0539894104003906, "learning_rate": 5e-07, "logits/chosen": 92355184.0, "logits/rejected": -28032106.0, "logps/chosen": -274.5434875488281, "logps/rejected": -213.38787841796875, "loss": 0.2693, "rewards/chosen": 0.2696676254272461, "rewards/margins": 3.328866481781006, "rewards/rejected": -3.0591988563537598, "step": 15380 }, { "epoch": 0.8152545517186548, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5864318.0, "logits/rejected": -4827818.5, "logps/chosen": -224.4844970703125, "logps/rejected": -136.58724975585938, "loss": 0.3235, "rewards/chosen": 0.20103955268859863, "rewards/margins": 2.206711769104004, "rewards/rejected": -2.0056722164154053, "step": 15381 }, { "epoch": 0.8153075557204569, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43690393.6, "logits/rejected": -7577129.333333333, "logps/chosen": -267.2351318359375, "logps/rejected": -306.28460693359375, "loss": 0.3546, "rewards/chosen": 0.044867289066314694, "rewards/margins": 2.150025777022044, "rewards/rejected": -2.105158487955729, "step": 15382 }, { "epoch": 0.8153605597222591, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 72858336.0, "logits/rejected": -52924152.0, "logps/chosen": -463.13427734375, "logps/rejected": -417.10211181640625, "loss": 0.2153, "rewards/chosen": 0.8680801391601562, "rewards/margins": 4.196251153945923, "rewards/rejected": -3.3281710147857666, "step": 15383 }, { "epoch": 0.8154135637240612, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25788842.666666668, "logits/rejected": -36660131.2, "logps/chosen": -312.71405029296875, "logps/rejected": -353.8232666015625, "loss": 0.2346, "rewards/chosen": 0.44065964221954346, "rewards/margins": 3.081129002571106, "rewards/rejected": -2.6404693603515623, "step": 15384 }, { "epoch": 0.8154665677258633, "grad_norm": 51.0, "kl": 2.2839012145996094, "learning_rate": 5e-07, "logits/chosen": 1635027.2, "logits/rejected": -20181689.333333332, "logps/chosen": -375.65615234375, "logps/rejected": -364.5221354166667, "loss": 0.2749, "rewards/chosen": 1.1484820365905761, "rewards/margins": 2.5860507011413576, "rewards/rejected": -1.4375686645507812, "step": 15385 }, { "epoch": 0.8155195717276654, "grad_norm": 61.75, "kl": 0.976348876953125, "learning_rate": 5e-07, "logits/chosen": -36961397.333333336, "logits/rejected": -18261788.0, "logps/chosen": -490.3234049479167, "logps/rejected": -213.13931274414062, "loss": 0.2703, "rewards/chosen": 0.8241976102193197, "rewards/margins": 4.083693345387776, "rewards/rejected": -3.259495735168457, "step": 15386 }, { "epoch": 0.8155725757294676, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2451713.5, "logits/rejected": 12949384.0, "logps/chosen": -157.60658264160156, "logps/rejected": -298.80584716796875, "loss": 0.2434, "rewards/chosen": 0.9532697796821594, "rewards/margins": 2.856265366077423, "rewards/rejected": -1.9029955863952637, "step": 15387 }, { "epoch": 0.8156255797312697, "grad_norm": 36.25, "kl": 5.503623962402344, "learning_rate": 5e-07, "logits/chosen": -27255552.0, "logits/rejected": -18908900.0, "logps/chosen": -225.5835205078125, "logps/rejected": -180.5230509440104, "loss": 0.3316, "rewards/chosen": 0.8840875625610352, "rewards/margins": 2.4217201868693037, "rewards/rejected": -1.5376326243082683, "step": 15388 }, { "epoch": 0.8156785837330719, "grad_norm": 54.5, "kl": 1.801987648010254, "learning_rate": 5e-07, "logits/chosen": -53001033.14285714, "logits/rejected": 2459913.5, "logps/chosen": -246.55892508370536, "logps/rejected": -130.98410034179688, "loss": 0.4233, "rewards/chosen": 0.2820075580051967, "rewards/margins": 4.481061799185617, "rewards/rejected": -4.19905424118042, "step": 15389 }, { "epoch": 0.815731587734874, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -20604260.0, "logps/rejected": -194.4270477294922, "loss": 0.1767, "rewards/rejected": -1.8413920402526855, "step": 15390 }, { "epoch": 0.8157845917366762, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46884698.666666664, "logits/rejected": -19401323.2, "logps/chosen": -317.30214436848956, "logps/rejected": -273.283349609375, "loss": 0.315, "rewards/chosen": 0.06026764710744222, "rewards/margins": 1.7134353717168171, "rewards/rejected": -1.653167724609375, "step": 15391 }, { "epoch": 0.8158375957384782, "grad_norm": 51.25, "kl": 1.6102828979492188, "learning_rate": 5e-07, "logits/chosen": 2310387.3333333335, "logits/rejected": -36781560.0, "logps/chosen": -463.9208170572917, "logps/rejected": -356.0094909667969, "loss": 0.2774, "rewards/chosen": 0.9470032850901285, "rewards/margins": 3.8477081457773843, "rewards/rejected": -2.900704860687256, "step": 15392 }, { "epoch": 0.8158905997402804, "grad_norm": 49.5, "kl": 0.06118583679199219, "learning_rate": 5e-07, "logits/chosen": -40208188.8, "logits/rejected": 3532252.3333333335, "logps/chosen": -254.3566162109375, "logps/rejected": -97.27909342447917, "loss": 0.2899, "rewards/chosen": 0.7462583541870117, "rewards/margins": 2.572068786621094, "rewards/rejected": -1.825810432434082, "step": 15393 }, { "epoch": 0.8159436037420825, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101686186.66666667, "logits/rejected": -55512480.0, "logps/chosen": -448.5788981119792, "logps/rejected": -318.4883544921875, "loss": 0.1738, "rewards/chosen": 1.094401200612386, "rewards/margins": 3.4785051663716633, "rewards/rejected": -2.3841039657592775, "step": 15394 }, { "epoch": 0.8159966077438847, "grad_norm": 40.25, "kl": 0.4821128845214844, "learning_rate": 5e-07, "logits/chosen": -15119130.666666666, "logits/rejected": -22148814.4, "logps/chosen": -156.36102294921875, "logps/rejected": -368.078173828125, "loss": 0.174, "rewards/chosen": 1.5035624504089355, "rewards/margins": 4.577431964874267, "rewards/rejected": -3.073869514465332, "step": 15395 }, { "epoch": 0.8160496117456868, "grad_norm": 96.0, "kl": 0.2346630096435547, "learning_rate": 5e-07, "logits/chosen": -20163284.8, "logits/rejected": -54051104.0, "logps/chosen": -309.4199951171875, "logps/rejected": -195.8708699544271, "loss": 0.3138, "rewards/chosen": 0.49149093627929685, "rewards/margins": 1.8759903271993, "rewards/rejected": -1.3844993909200032, "step": 15396 }, { "epoch": 0.816102615747489, "grad_norm": 38.25, "kl": 0.5598764419555664, "learning_rate": 5e-07, "logits/chosen": -18835344.0, "logits/rejected": -35603596.0, "logps/chosen": -243.912841796875, "logps/rejected": -261.6675720214844, "loss": 0.3233, "rewards/chosen": 0.2659430503845215, "rewards/margins": 2.728698253631592, "rewards/rejected": -2.4627552032470703, "step": 15397 }, { "epoch": 0.8161556197492911, "grad_norm": 34.5, "kl": 0.6795520782470703, "learning_rate": 5e-07, "logits/chosen": -34321780.0, "logits/rejected": -44010578.666666664, "logps/chosen": -509.0845031738281, "logps/rejected": -210.2869669596354, "loss": 0.1616, "rewards/chosen": 1.3319908380508423, "rewards/margins": 4.518924991289774, "rewards/rejected": -3.186934153238932, "step": 15398 }, { "epoch": 0.8162086237510932, "grad_norm": 42.0, "kl": 0.10762786865234375, "learning_rate": 5e-07, "logits/chosen": -23771546.666666668, "logits/rejected": -53567772.0, "logps/chosen": -220.100341796875, "logps/rejected": -372.1412658691406, "loss": 0.3163, "rewards/chosen": 0.5161614815394083, "rewards/margins": 3.100766936937968, "rewards/rejected": -2.5846054553985596, "step": 15399 }, { "epoch": 0.8162616277528953, "grad_norm": 45.5, "kl": 5.098052978515625, "learning_rate": 5e-07, "logits/chosen": -26133056.0, "logits/rejected": -44448472.0, "logps/chosen": -303.39251708984375, "logps/rejected": -420.4076232910156, "loss": 0.4635, "rewards/chosen": 0.2249067227045695, "rewards/margins": 1.8948663870493572, "rewards/rejected": -1.6699596643447876, "step": 15400 }, { "epoch": 0.8163146317546974, "grad_norm": 43.25, "kl": 1.603470802307129, "learning_rate": 5e-07, "logits/chosen": -52016517.333333336, "logits/rejected": -10134530.4, "logps/chosen": -380.0887858072917, "logps/rejected": -305.1478271484375, "loss": 0.2079, "rewards/chosen": 1.0901438395182292, "rewards/margins": 3.469774309794108, "rewards/rejected": -2.379630470275879, "step": 15401 }, { "epoch": 0.8163676357564996, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3659559.3333333335, "logits/rejected": -8887083.2, "logps/chosen": -221.1921590169271, "logps/rejected": -220.791748046875, "loss": 0.2257, "rewards/chosen": 0.6225908597310384, "rewards/margins": 3.020210870107015, "rewards/rejected": -2.3976200103759764, "step": 15402 }, { "epoch": 0.8164206397583017, "grad_norm": 49.25, "kl": 0.7044944763183594, "learning_rate": 5e-07, "logits/chosen": -30930213.333333332, "logits/rejected": -61479000.0, "logps/chosen": -296.35947672526044, "logps/rejected": -409.5262451171875, "loss": 0.3102, "rewards/chosen": 0.49979249636332196, "rewards/margins": 5.538023392359416, "rewards/rejected": -5.038230895996094, "step": 15403 }, { "epoch": 0.8164736437601039, "grad_norm": 37.5, "kl": 1.8797311782836914, "learning_rate": 5e-07, "logits/chosen": 3203231.75, "logits/rejected": -56383320.0, "logps/chosen": -184.68414306640625, "logps/rejected": -436.48382568359375, "loss": 0.2009, "rewards/chosen": 1.4007036685943604, "rewards/margins": 3.6163992881774902, "rewards/rejected": -2.21569561958313, "step": 15404 }, { "epoch": 0.816526647761906, "grad_norm": 45.75, "kl": 0.44831085205078125, "learning_rate": 5e-07, "logits/chosen": -30802565.333333332, "logits/rejected": -35188764.0, "logps/chosen": -233.36381022135416, "logps/rejected": -321.65301513671875, "loss": 0.2572, "rewards/chosen": 0.9627443154652914, "rewards/margins": 2.9517064889272056, "rewards/rejected": -1.988962173461914, "step": 15405 }, { "epoch": 0.8165796517637082, "grad_norm": 43.75, "kl": 2.466033935546875, "learning_rate": 5e-07, "logits/chosen": -7721533.6, "logits/rejected": -42479546.666666664, "logps/chosen": -300.721875, "logps/rejected": -403.9529622395833, "loss": 0.3277, "rewards/chosen": 0.37624759674072267, "rewards/margins": 2.8056820551554362, "rewards/rejected": -2.4294344584147134, "step": 15406 }, { "epoch": 0.8166326557655103, "grad_norm": 48.5, "kl": 0.7341022491455078, "learning_rate": 5e-07, "logits/chosen": -72661640.0, "logits/rejected": -25335566.0, "logps/chosen": -245.24649047851562, "logps/rejected": -306.2976989746094, "loss": 0.2728, "rewards/chosen": 0.9237858057022095, "rewards/margins": 2.522253632545471, "rewards/rejected": -1.5984678268432617, "step": 15407 }, { "epoch": 0.8166856597673124, "grad_norm": 48.0, "kl": 3.082111358642578, "learning_rate": 5e-07, "logits/chosen": -19859116.0, "logits/rejected": -11803221.0, "logps/chosen": -337.146728515625, "logps/rejected": -185.60910034179688, "loss": 0.2465, "rewards/chosen": 1.2808315753936768, "rewards/margins": 5.158707857131958, "rewards/rejected": -3.8778762817382812, "step": 15408 }, { "epoch": 0.8167386637691145, "grad_norm": 57.75, "kl": 1.0229668617248535, "learning_rate": 5e-07, "logits/chosen": -18153252.0, "logits/rejected": -6226910.5, "logps/chosen": -230.02676391601562, "logps/rejected": -165.70265197753906, "loss": 0.2726, "rewards/chosen": 0.4667931795120239, "rewards/margins": 3.1028276681900024, "rewards/rejected": -2.6360344886779785, "step": 15409 }, { "epoch": 0.8167916677709167, "grad_norm": 50.5, "kl": 0.5951042175292969, "learning_rate": 5e-07, "logits/chosen": -46750272.0, "logits/rejected": -5245518.5, "logps/chosen": -273.57318115234375, "logps/rejected": -280.1159973144531, "loss": 0.257, "rewards/chosen": 0.7078748941421509, "rewards/margins": 2.9292725324630737, "rewards/rejected": -2.221397638320923, "step": 15410 }, { "epoch": 0.8168446717727188, "grad_norm": 55.0, "kl": 6.074665069580078, "learning_rate": 5e-07, "logits/chosen": -15459660.8, "logits/rejected": 5114666.0, "logps/chosen": -140.39189453125, "logps/rejected": -316.88258870442706, "loss": 0.4117, "rewards/chosen": 0.6490986824035645, "rewards/margins": 1.8479954878489178, "rewards/rejected": -1.1988968054453533, "step": 15411 }, { "epoch": 0.816897675774521, "grad_norm": 39.0, "kl": 1.1642131805419922, "learning_rate": 5e-07, "logits/chosen": -23766987.2, "logits/rejected": -13873885.333333334, "logps/chosen": -203.7180908203125, "logps/rejected": -170.427734375, "loss": 0.2544, "rewards/chosen": 0.623801326751709, "rewards/margins": 3.648099676767985, "rewards/rejected": -3.024298350016276, "step": 15412 }, { "epoch": 0.8169506797763231, "grad_norm": 71.0, "kl": 1.3223228454589844, "learning_rate": 5e-07, "logits/chosen": -30359192.0, "logits/rejected": 18144339.2, "logps/chosen": -159.263427734375, "logps/rejected": -349.3828125, "loss": 0.2794, "rewards/chosen": 0.19138896465301514, "rewards/margins": 2.1278156995773316, "rewards/rejected": -1.9364267349243165, "step": 15413 }, { "epoch": 0.8170036837781253, "grad_norm": 40.5, "kl": 0.3924980163574219, "learning_rate": 5e-07, "logits/chosen": -17298997.333333332, "logits/rejected": -25903798.0, "logps/chosen": -222.28129069010416, "logps/rejected": -271.8656005859375, "loss": 0.2525, "rewards/chosen": 1.033631960550944, "rewards/margins": 4.24509064356486, "rewards/rejected": -3.211458683013916, "step": 15414 }, { "epoch": 0.8170566877799273, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40179656.0, "logits/rejected": -31895534.0, "logps/chosen": -292.4421691894531, "logps/rejected": -375.73345947265625, "loss": 0.1947, "rewards/chosen": 0.9483912587165833, "rewards/margins": 3.7691896557807922, "rewards/rejected": -2.820798397064209, "step": 15415 }, { "epoch": 0.8171096917817295, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34423040.0, "logits/rejected": -26185093.333333332, "logps/chosen": -221.81338500976562, "logps/rejected": -295.2420654296875, "loss": 0.1372, "rewards/chosen": 0.29802703857421875, "rewards/margins": 3.651121139526367, "rewards/rejected": -3.3530941009521484, "step": 15416 }, { "epoch": 0.8171626957835316, "grad_norm": 22.75, "kl": 0.2383403778076172, "learning_rate": 5e-07, "logits/chosen": 5316788.666666667, "logits/rejected": -17739094.4, "logps/chosen": -123.35426839192708, "logps/rejected": -217.454296875, "loss": 0.1813, "rewards/chosen": 0.9433535734812418, "rewards/margins": 3.714797512690226, "rewards/rejected": -2.7714439392089845, "step": 15417 }, { "epoch": 0.8172156997853338, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23646784.0, "logits/rejected": -42097608.0, "logps/chosen": -456.1304626464844, "logps/rejected": -242.21827697753906, "loss": 0.3051, "rewards/chosen": 0.024835828691720963, "rewards/margins": 2.2838981188833714, "rewards/rejected": -2.2590622901916504, "step": 15418 }, { "epoch": 0.8172687037871359, "grad_norm": 36.0, "kl": 0.9517011642456055, "learning_rate": 5e-07, "logits/chosen": -23388494.4, "logits/rejected": -12118552.0, "logps/chosen": -181.716015625, "logps/rejected": -339.38584391276044, "loss": 0.3045, "rewards/chosen": 0.2570731401443481, "rewards/margins": 4.195160603523254, "rewards/rejected": -3.9380874633789062, "step": 15419 }, { "epoch": 0.8173217077889381, "grad_norm": 39.75, "kl": 3.1890735626220703, "learning_rate": 5e-07, "logits/chosen": -24978552.0, "logits/rejected": -48587956.0, "logps/chosen": -391.6109313964844, "logps/rejected": -434.9644470214844, "loss": 0.1649, "rewards/chosen": 1.8007923364639282, "rewards/margins": 4.085175633430481, "rewards/rejected": -2.2843832969665527, "step": 15420 }, { "epoch": 0.8173747117907402, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23582640.0, "logits/rejected": -16891113.6, "logps/chosen": -414.0289306640625, "logps/rejected": -484.6361328125, "loss": 0.1431, "rewards/chosen": 1.3817963600158691, "rewards/margins": 4.969618320465088, "rewards/rejected": -3.5878219604492188, "step": 15421 }, { "epoch": 0.8174277157925424, "grad_norm": 48.0, "kl": 1.821685791015625, "learning_rate": 5e-07, "logits/chosen": -35189510.4, "logits/rejected": -56962752.0, "logps/chosen": -293.7910888671875, "logps/rejected": -501.2319742838542, "loss": 0.2459, "rewards/chosen": 0.9210579872131348, "rewards/margins": 4.02416893641154, "rewards/rejected": -3.103110949198405, "step": 15422 }, { "epoch": 0.8174807197943444, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7556263.2, "logits/rejected": -13809933.333333334, "logps/chosen": -82.40419921875, "logps/rejected": -180.43937174479166, "loss": 0.2758, "rewards/chosen": 0.4532649993896484, "rewards/margins": 3.60387274424235, "rewards/rejected": -3.1506077448527017, "step": 15423 }, { "epoch": 0.8175337237961466, "grad_norm": 39.75, "kl": 3.1760025024414062, "learning_rate": 5e-07, "logits/chosen": -13338633.333333334, "logits/rejected": -65847948.8, "logps/chosen": -239.34476725260416, "logps/rejected": -507.40927734375, "loss": 0.1855, "rewards/chosen": 1.0542141596476238, "rewards/margins": 3.6341465632120773, "rewards/rejected": -2.5799324035644533, "step": 15424 }, { "epoch": 0.8175867277979487, "grad_norm": 35.25, "kl": 0.9014301300048828, "learning_rate": 5e-07, "logits/chosen": -27309078.0, "logits/rejected": -6103330.0, "logps/chosen": -274.58270263671875, "logps/rejected": -84.26651000976562, "loss": 0.2686, "rewards/chosen": 0.4588926434516907, "rewards/margins": 3.141079008579254, "rewards/rejected": -2.6821863651275635, "step": 15425 }, { "epoch": 0.8176397317997509, "grad_norm": 41.75, "kl": 0.1048431396484375, "learning_rate": 5e-07, "logits/chosen": -28754750.0, "logits/rejected": -16413672.0, "logps/chosen": -359.6229248046875, "logps/rejected": -288.97021484375, "loss": 0.1792, "rewards/chosen": 0.9563863277435303, "rewards/margins": 3.878779411315918, "rewards/rejected": -2.9223930835723877, "step": 15426 }, { "epoch": 0.817692735801553, "grad_norm": 91.5, "kl": 15.466361999511719, "learning_rate": 5e-07, "logits/chosen": -21189300.57142857, "logits/rejected": -19850590.0, "logps/chosen": -580.9296177455357, "logps/rejected": -383.9752502441406, "loss": 0.3896, "rewards/chosen": 1.8931704929896764, "rewards/margins": 3.8001681736537387, "rewards/rejected": -1.9069976806640625, "step": 15427 }, { "epoch": 0.8177457398033552, "grad_norm": 32.5, "kl": 0.7948055267333984, "learning_rate": 5e-07, "logits/chosen": -33203892.0, "logits/rejected": -39618013.333333336, "logps/chosen": -309.644775390625, "logps/rejected": -275.97349039713544, "loss": 0.1474, "rewards/chosen": 0.9910209774971008, "rewards/margins": 3.881711781024933, "rewards/rejected": -2.890690803527832, "step": 15428 }, { "epoch": 0.8177987438051573, "grad_norm": 74.5, "kl": 1.8607254028320312, "learning_rate": 5e-07, "logits/chosen": -8050696.0, "logits/rejected": -31429618.0, "logps/chosen": -598.7353515625, "logps/rejected": -530.6131591796875, "loss": 0.2965, "rewards/chosen": 0.8126471042633057, "rewards/margins": 4.750420331954956, "rewards/rejected": -3.9377732276916504, "step": 15429 }, { "epoch": 0.8178517478069595, "grad_norm": 54.0, "kl": 1.4942989349365234, "learning_rate": 5e-07, "logits/chosen": 5006156.333333333, "logits/rejected": -26015753.6, "logps/chosen": -240.3525594075521, "logps/rejected": -239.880419921875, "loss": 0.3097, "rewards/chosen": -0.28480937083562213, "rewards/margins": 2.642455796400706, "rewards/rejected": -2.927265167236328, "step": 15430 }, { "epoch": 0.8179047518087615, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9136680.0, "logits/rejected": 8985814.0, "logps/chosen": -551.180078125, "logps/rejected": -423.6719156901042, "loss": 0.2574, "rewards/chosen": 1.0866779327392577, "rewards/margins": 4.120514933268229, "rewards/rejected": -3.033837000528971, "step": 15431 }, { "epoch": 0.8179577558105637, "grad_norm": 69.5, "kl": 2.381063461303711, "learning_rate": 5e-07, "logits/chosen": -47048426.666666664, "logits/rejected": -9156792.0, "logps/chosen": -493.6444498697917, "logps/rejected": -347.056884765625, "loss": 0.2812, "rewards/chosen": 1.1943897406260173, "rewards/margins": 3.012300531069438, "rewards/rejected": -1.8179107904434204, "step": 15432 }, { "epoch": 0.8180107598123658, "grad_norm": 44.0, "kl": 0.96673583984375, "learning_rate": 5e-07, "logits/chosen": -13910664.0, "logits/rejected": -14742950.4, "logps/chosen": -231.36238606770834, "logps/rejected": -152.7769287109375, "loss": 0.2104, "rewards/chosen": 1.1101824442545574, "rewards/margins": 3.7143835703531902, "rewards/rejected": -2.6042011260986326, "step": 15433 }, { "epoch": 0.818063763814168, "grad_norm": 39.5, "kl": 1.6702957153320312, "learning_rate": 5e-07, "logits/chosen": -22890734.4, "logits/rejected": -40457653.333333336, "logps/chosen": -269.100341796875, "logps/rejected": -334.3597005208333, "loss": 0.2721, "rewards/chosen": 0.8404941558837891, "rewards/margins": 4.0460751851399746, "rewards/rejected": -3.205581029256185, "step": 15434 }, { "epoch": 0.8181167678159701, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35209749.333333336, "logits/rejected": -10842072.0, "logps/chosen": -242.60306803385416, "logps/rejected": -274.5155944824219, "loss": 0.3444, "rewards/chosen": 0.5411084493001302, "rewards/margins": 2.6044119199117026, "rewards/rejected": -2.0633034706115723, "step": 15435 }, { "epoch": 0.8181697718177723, "grad_norm": 45.75, "kl": 2.5378098487854004, "learning_rate": 5e-07, "logits/chosen": -7635681.333333333, "logits/rejected": -5639054.0, "logps/chosen": -351.6167805989583, "logps/rejected": -172.76324462890625, "loss": 0.3332, "rewards/chosen": 1.0041667620340984, "rewards/margins": 2.7299533287684126, "rewards/rejected": -1.725786566734314, "step": 15436 }, { "epoch": 0.8182227758195744, "grad_norm": 49.75, "kl": 0.296142578125, "learning_rate": 5e-07, "logits/chosen": -13751545.333333334, "logits/rejected": -43775859.2, "logps/chosen": -465.824462890625, "logps/rejected": -237.977197265625, "loss": 0.2231, "rewards/chosen": 1.1641438802083333, "rewards/margins": 3.0478530248006184, "rewards/rejected": -1.8837091445922851, "step": 15437 }, { "epoch": 0.8182757798213766, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9002326.0, "logits/rejected": -24187736.0, "logps/chosen": -84.23721313476562, "logps/rejected": -533.6813557942709, "loss": 0.1654, "rewards/chosen": -0.4016788601875305, "rewards/margins": 3.947651406129201, "rewards/rejected": -4.3493302663167315, "step": 15438 }, { "epoch": 0.8183287838231786, "grad_norm": 57.25, "kl": 1.05322265625, "learning_rate": 5e-07, "logits/chosen": -43756344.0, "logits/rejected": -19055974.0, "logps/chosen": -275.313720703125, "logps/rejected": -293.164794921875, "loss": 0.2644, "rewards/chosen": 0.28232118487358093, "rewards/margins": 3.321128338575363, "rewards/rejected": -3.0388071537017822, "step": 15439 }, { "epoch": 0.8183817878249808, "grad_norm": 42.0, "kl": 0.8254146575927734, "learning_rate": 5e-07, "logits/chosen": -11360838.666666666, "logits/rejected": -82139494.4, "logps/chosen": -166.22088623046875, "logps/rejected": -363.913671875, "loss": 0.2136, "rewards/chosen": 0.7378900051116943, "rewards/margins": 3.533019208908081, "rewards/rejected": -2.7951292037963866, "step": 15440 }, { "epoch": 0.8184347918267829, "grad_norm": 46.25, "kl": 0.8892621994018555, "learning_rate": 5e-07, "logits/chosen": -3751711.0, "logits/rejected": -40277796.0, "logps/chosen": -111.62515258789062, "logps/rejected": -329.19830322265625, "loss": 0.2716, "rewards/chosen": 0.4589025378227234, "rewards/margins": 2.4082347750663757, "rewards/rejected": -1.9493322372436523, "step": 15441 }, { "epoch": 0.8184877958285851, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49811984.0, "logits/rejected": -7064273.0, "logps/chosen": -219.1983846028646, "logps/rejected": -318.12799072265625, "loss": 0.3396, "rewards/chosen": 0.3460467259089152, "rewards/margins": 2.8375385204950967, "rewards/rejected": -2.4914917945861816, "step": 15442 }, { "epoch": 0.8185407998303872, "grad_norm": 49.5, "kl": 0.4085197448730469, "learning_rate": 5e-07, "logits/chosen": -49007797.333333336, "logits/rejected": -39018508.8, "logps/chosen": -359.6957194010417, "logps/rejected": -388.7015380859375, "loss": 0.2138, "rewards/chosen": 0.6098368167877197, "rewards/margins": 3.0256006717681885, "rewards/rejected": -2.4157638549804688, "step": 15443 }, { "epoch": 0.8185938038321894, "grad_norm": 50.25, "kl": 0.6970291137695312, "learning_rate": 5e-07, "logits/chosen": -12321716.0, "logits/rejected": -6194940.666666667, "logps/chosen": -168.051806640625, "logps/rejected": -203.74774169921875, "loss": 0.3394, "rewards/chosen": 0.2939716339111328, "rewards/margins": 1.8953077952067057, "rewards/rejected": -1.601336161295573, "step": 15444 }, { "epoch": 0.8186468078339915, "grad_norm": 45.75, "kl": 0.09652233123779297, "learning_rate": 5e-07, "logits/chosen": -3447403.2, "logits/rejected": -103820608.0, "logps/chosen": -178.3873779296875, "logps/rejected": -588.9946695963541, "loss": 0.3039, "rewards/chosen": 0.5821196556091308, "rewards/margins": 2.6565917650858557, "rewards/rejected": -2.074472109476725, "step": 15445 }, { "epoch": 0.8186998118357937, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 24424474.666666668, "logits/rejected": -22322419.2, "logps/chosen": -151.4427693684896, "logps/rejected": -106.63004150390626, "loss": 0.3217, "rewards/chosen": 0.06942888100941975, "rewards/margins": 1.9969278891881306, "rewards/rejected": -1.927499008178711, "step": 15446 }, { "epoch": 0.8187528158375957, "grad_norm": 38.0, "kl": 1.3825302124023438, "learning_rate": 5e-07, "logits/chosen": -9255886.0, "logits/rejected": -43086808.0, "logps/chosen": -167.515625, "logps/rejected": -376.2217102050781, "loss": 0.2956, "rewards/chosen": 0.24377498030662537, "rewards/margins": 2.809192508459091, "rewards/rejected": -2.565417528152466, "step": 15447 }, { "epoch": 0.8188058198393978, "grad_norm": 46.5, "kl": 2.2099790573120117, "learning_rate": 5e-07, "logits/chosen": -29439350.0, "logits/rejected": -52246616.0, "logps/chosen": -188.31121826171875, "logps/rejected": -239.24244689941406, "loss": 0.2982, "rewards/chosen": 0.7821472883224487, "rewards/margins": 2.369672179222107, "rewards/rejected": -1.5875248908996582, "step": 15448 }, { "epoch": 0.8188588238412, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6169441.5, "logits/rejected": -12329013.333333334, "logps/chosen": -49.50689697265625, "logps/rejected": -177.32234700520834, "loss": 0.2593, "rewards/chosen": -0.05139121413230896, "rewards/margins": 2.2697082261244454, "rewards/rejected": -2.3210994402567544, "step": 15449 }, { "epoch": 0.8189118278430021, "grad_norm": 52.25, "kl": 1.7091522216796875, "learning_rate": 5e-07, "logits/chosen": -9859588.0, "logits/rejected": -21468305.6, "logps/chosen": -189.3748575846354, "logps/rejected": -230.186328125, "loss": 0.2372, "rewards/chosen": 0.7039565245310465, "rewards/margins": 3.905928341547648, "rewards/rejected": -3.2019718170166014, "step": 15450 }, { "epoch": 0.8189648318448043, "grad_norm": 46.0, "kl": 4.725957870483398, "learning_rate": 5e-07, "logits/chosen": -43212528.0, "logits/rejected": -55973824.0, "logps/chosen": -399.83447265625, "logps/rejected": -505.9612121582031, "loss": 0.4017, "rewards/chosen": 0.31964266300201416, "rewards/margins": 2.9651366472244263, "rewards/rejected": -2.645493984222412, "step": 15451 }, { "epoch": 0.8190178358466064, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38909865.6, "logits/rejected": -26462402.666666668, "logps/chosen": -211.33017578125, "logps/rejected": -211.2077840169271, "loss": 0.3278, "rewards/chosen": 0.5948060035705567, "rewards/margins": 1.7931916395823162, "rewards/rejected": -1.1983856360117595, "step": 15452 }, { "epoch": 0.8190708398484086, "grad_norm": 38.75, "kl": 2.6536483764648438, "learning_rate": 5e-07, "logits/chosen": 9035989.333333334, "logits/rejected": -23823419.2, "logps/chosen": -88.6800028483073, "logps/rejected": -177.04764404296876, "loss": 0.3247, "rewards/chosen": 0.156752347946167, "rewards/margins": 2.576774549484253, "rewards/rejected": -2.420022201538086, "step": 15453 }, { "epoch": 0.8191238438502106, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55385593.6, "logits/rejected": -9060236.0, "logps/chosen": -490.09228515625, "logps/rejected": -272.9122721354167, "loss": 0.2313, "rewards/chosen": 0.8571100234985352, "rewards/margins": 3.564451217651367, "rewards/rejected": -2.707341194152832, "step": 15454 }, { "epoch": 0.8191768478520128, "grad_norm": 35.0, "kl": 1.9606914520263672, "learning_rate": 5e-07, "logits/chosen": -30370852.57142857, "logits/rejected": -18611284.0, "logps/chosen": -186.17919921875, "logps/rejected": -398.92034912109375, "loss": 0.3286, "rewards/chosen": 0.7960409436907087, "rewards/margins": 3.702699831553868, "rewards/rejected": -2.906658887863159, "step": 15455 }, { "epoch": 0.8192298518538149, "grad_norm": 82.0, "kl": 12.789815902709961, "learning_rate": 5e-07, "logits/chosen": -24661804.8, "logits/rejected": -75591402.66666667, "logps/chosen": -634.539599609375, "logps/rejected": -420.10009765625, "loss": 0.3382, "rewards/chosen": 1.8289596557617187, "rewards/margins": 5.149562390645345, "rewards/rejected": -3.3206027348836265, "step": 15456 }, { "epoch": 0.8192828558556171, "grad_norm": 49.25, "kl": 1.8263120651245117, "learning_rate": 5e-07, "logits/chosen": 7275986.666666667, "logits/rejected": -24079766.4, "logps/chosen": -32.6824696858724, "logps/rejected": -202.4130859375, "loss": 0.265, "rewards/chosen": 1.071028470993042, "rewards/margins": 3.0614430904388428, "rewards/rejected": -1.9904146194458008, "step": 15457 }, { "epoch": 0.8193358598574192, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3869653.3333333335, "logits/rejected": 2988783.6, "logps/chosen": -216.2038777669271, "logps/rejected": -244.2616455078125, "loss": 0.2301, "rewards/chosen": 0.2977849245071411, "rewards/margins": 3.3201836824417112, "rewards/rejected": -3.02239875793457, "step": 15458 }, { "epoch": 0.8193888638592214, "grad_norm": 69.0, "kl": 2.93918514251709, "learning_rate": 5e-07, "logits/chosen": 105532704.0, "logits/rejected": -56276360.0, "logps/chosen": -290.3840738932292, "logps/rejected": -398.67596435546875, "loss": 0.3511, "rewards/chosen": 0.38223199049631756, "rewards/margins": 3.845372478167216, "rewards/rejected": -3.4631404876708984, "step": 15459 }, { "epoch": 0.8194418678610235, "grad_norm": 47.5, "kl": 1.5422744750976562, "learning_rate": 5e-07, "logits/chosen": -69697840.0, "logits/rejected": -22719254.4, "logps/chosen": -441.4713541666667, "logps/rejected": -434.484521484375, "loss": 0.2389, "rewards/chosen": 0.62689208984375, "rewards/margins": 2.657253456115723, "rewards/rejected": -2.030361366271973, "step": 15460 }, { "epoch": 0.8194948718628257, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -144535488.0, "logits/rejected": -25229704.0, "logps/chosen": -325.4150390625, "logps/rejected": -195.89096069335938, "loss": 0.2535, "rewards/chosen": 0.4449354410171509, "rewards/margins": 3.0773180723190308, "rewards/rejected": -2.63238263130188, "step": 15461 }, { "epoch": 0.8195478758646277, "grad_norm": 52.25, "kl": 3.9873695373535156, "learning_rate": 5e-07, "logits/chosen": -4010933.5, "logits/rejected": -21125578.0, "logps/chosen": -249.0028533935547, "logps/rejected": -264.0901184082031, "loss": 0.3793, "rewards/chosen": -0.05923919379711151, "rewards/margins": 3.2985209971666336, "rewards/rejected": -3.357760190963745, "step": 15462 }, { "epoch": 0.8196008798664299, "grad_norm": 54.25, "kl": 1.4546918869018555, "learning_rate": 5e-07, "logits/chosen": -29827352.0, "logits/rejected": 22279570.0, "logps/chosen": -218.25992838541666, "logps/rejected": -97.87198638916016, "loss": 0.4728, "rewards/chosen": 0.18748913208643594, "rewards/margins": 0.6932451526323954, "rewards/rejected": -0.5057560205459595, "step": 15463 }, { "epoch": 0.819653883868232, "grad_norm": 29.75, "kl": 2.3790931701660156, "learning_rate": 5e-07, "logits/chosen": 2307445.5, "logits/rejected": -32279220.0, "logps/chosen": -56.866844177246094, "logps/rejected": -507.6776428222656, "loss": 0.2852, "rewards/chosen": 0.6484203934669495, "rewards/margins": 3.600342333316803, "rewards/rejected": -2.9519219398498535, "step": 15464 }, { "epoch": 0.8197068878700342, "grad_norm": 56.5, "kl": 3.6901168823242188, "learning_rate": 5e-07, "logits/chosen": -81350418.28571428, "logits/rejected": 887014.9375, "logps/chosen": -323.00537109375, "logps/rejected": -71.15632629394531, "loss": 0.3901, "rewards/chosen": 0.8707968848092216, "rewards/margins": 1.8782508032662528, "rewards/rejected": -1.0074539184570312, "step": 15465 }, { "epoch": 0.8197598918718363, "grad_norm": 51.0, "kl": 1.1169958114624023, "learning_rate": 5e-07, "logits/chosen": 3218918.25, "logits/rejected": 7475298.0, "logps/chosen": -105.80094146728516, "logps/rejected": -443.04278564453125, "loss": 0.3784, "rewards/chosen": -0.06641393899917603, "rewards/margins": 2.3333296179771423, "rewards/rejected": -2.3997435569763184, "step": 15466 }, { "epoch": 0.8198128958736385, "grad_norm": 50.75, "kl": 1.9117460250854492, "learning_rate": 5e-07, "logits/chosen": 5129290.0, "logits/rejected": -37681664.0, "logps/chosen": -172.4176483154297, "logps/rejected": -298.12493896484375, "loss": 0.3106, "rewards/chosen": 0.1457769274711609, "rewards/margins": 2.2283975481987, "rewards/rejected": -2.082620620727539, "step": 15467 }, { "epoch": 0.8198658998754406, "grad_norm": 45.25, "kl": 3.5679636001586914, "learning_rate": 5e-07, "logits/chosen": -41234451.2, "logits/rejected": -33041693.333333332, "logps/chosen": -300.3082275390625, "logps/rejected": -421.9735514322917, "loss": 0.3877, "rewards/chosen": 0.5414727210998536, "rewards/margins": 1.9707758903503418, "rewards/rejected": -1.4293031692504883, "step": 15468 }, { "epoch": 0.8199189038772428, "grad_norm": 50.25, "kl": 3.2842178344726562, "learning_rate": 5e-07, "logits/chosen": -26762160.0, "logits/rejected": -20391706.0, "logps/chosen": -379.39529854910717, "logps/rejected": -128.69369506835938, "loss": 0.3302, "rewards/chosen": 1.0564115388052804, "rewards/margins": 2.9434309346335272, "rewards/rejected": -1.887019395828247, "step": 15469 }, { "epoch": 0.8199719078790448, "grad_norm": 66.0, "kl": 2.933837890625, "learning_rate": 5e-07, "logits/chosen": -40767348.0, "logits/rejected": -29528126.0, "logps/chosen": -794.43896484375, "logps/rejected": -148.3563690185547, "loss": 0.3029, "rewards/chosen": 0.8032630681991577, "rewards/margins": 2.2837573289871216, "rewards/rejected": -1.4804942607879639, "step": 15470 }, { "epoch": 0.820024911880847, "grad_norm": 49.75, "kl": 3.9821033477783203, "learning_rate": 5e-07, "logits/chosen": -29606600.0, "logits/rejected": -20833180.0, "logps/chosen": -460.708984375, "logps/rejected": -260.2497253417969, "loss": 0.335, "rewards/chosen": 0.8132445812225342, "rewards/margins": 2.2216579914093018, "rewards/rejected": -1.4084134101867676, "step": 15471 }, { "epoch": 0.8200779158826491, "grad_norm": 76.5, "kl": 3.830972671508789, "learning_rate": 5e-07, "logits/chosen": -757549.7142857143, "logits/rejected": -66066576.0, "logps/chosen": -603.3352399553571, "logps/rejected": -425.461181640625, "loss": 0.3811, "rewards/chosen": 0.7452961376735142, "rewards/margins": 4.061437027794974, "rewards/rejected": -3.31614089012146, "step": 15472 }, { "epoch": 0.8201309198844513, "grad_norm": 42.75, "kl": 0.0810546875, "learning_rate": 5e-07, "logits/chosen": -5295040.0, "logits/rejected": -46249244.8, "logps/chosen": -379.929443359375, "logps/rejected": -545.851123046875, "loss": 0.135, "rewards/chosen": 1.2599059740702312, "rewards/margins": 4.9756210009257, "rewards/rejected": -3.715715026855469, "step": 15473 }, { "epoch": 0.8201839238862534, "grad_norm": 51.0, "kl": 1.3812274932861328, "learning_rate": 5e-07, "logits/chosen": -6038827.2, "logits/rejected": -36926728.0, "logps/chosen": -430.76923828125, "logps/rejected": -283.40635172526044, "loss": 0.2144, "rewards/chosen": 1.3968351364135743, "rewards/margins": 4.306388664245605, "rewards/rejected": -2.9095535278320312, "step": 15474 }, { "epoch": 0.8202369278880556, "grad_norm": 64.5, "kl": 1.3487377166748047, "learning_rate": 5e-07, "logits/chosen": -35966954.666666664, "logits/rejected": 1067131.6, "logps/chosen": -175.62638346354166, "logps/rejected": -300.0322509765625, "loss": 0.3015, "rewards/chosen": 0.22905717293421426, "rewards/margins": 1.873851255575816, "rewards/rejected": -1.6447940826416017, "step": 15475 }, { "epoch": 0.8202899318898577, "grad_norm": 35.0, "kl": 1.6339874267578125, "learning_rate": 5e-07, "logits/chosen": 1646203.0, "logits/rejected": -17047252.0, "logps/chosen": -58.4072151184082, "logps/rejected": -474.69921875, "loss": 0.2574, "rewards/chosen": 0.444802463054657, "rewards/margins": 2.9988731741905212, "rewards/rejected": -2.5540707111358643, "step": 15476 }, { "epoch": 0.8203429358916599, "grad_norm": 62.0, "kl": 1.4473953247070312, "learning_rate": 5e-07, "logits/chosen": -8742464.0, "logits/rejected": -16756393.0, "logps/chosen": -281.4692687988281, "logps/rejected": -295.54595947265625, "loss": 0.311, "rewards/chosen": 0.06172609701752663, "rewards/margins": 3.081340793520212, "rewards/rejected": -3.0196146965026855, "step": 15477 }, { "epoch": 0.8203959398934619, "grad_norm": 52.0, "kl": 1.4016294479370117, "learning_rate": 5e-07, "logits/chosen": -24375976.0, "logits/rejected": -29418586.666666668, "logps/chosen": -162.5451416015625, "logps/rejected": -335.377197265625, "loss": 0.3793, "rewards/chosen": 0.2597994327545166, "rewards/margins": 2.785936784744263, "rewards/rejected": -2.526137351989746, "step": 15478 }, { "epoch": 0.8204489438952641, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30962670.0, "logits/rejected": 4063730.0, "logps/chosen": -423.8594665527344, "logps/rejected": -403.2837320963542, "loss": 0.1672, "rewards/chosen": 0.9599275588989258, "rewards/margins": 3.608715534210205, "rewards/rejected": -2.6487879753112793, "step": 15479 }, { "epoch": 0.8205019478970662, "grad_norm": 57.75, "kl": 0.6333560943603516, "learning_rate": 5e-07, "logits/chosen": -4365332.0, "logits/rejected": -54320968.0, "logps/chosen": -369.0985514322917, "logps/rejected": -345.78033447265625, "loss": 0.2835, "rewards/chosen": 0.7546947797139486, "rewards/margins": 3.266839345296224, "rewards/rejected": -2.5121445655822754, "step": 15480 }, { "epoch": 0.8205549518988684, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 666688.6875, "logits/rejected": -27718442.666666668, "logps/chosen": -49.68505096435547, "logps/rejected": -405.5465087890625, "loss": 0.2651, "rewards/chosen": -0.41504746675491333, "rewards/margins": 1.4493781924247742, "rewards/rejected": -1.8644256591796875, "step": 15481 }, { "epoch": 0.8206079559006705, "grad_norm": 48.5, "kl": 0.16881561279296875, "learning_rate": 5e-07, "logits/chosen": -75500761.6, "logits/rejected": 362169.625, "logps/chosen": -351.1075927734375, "logps/rejected": -100.8766581217448, "loss": 0.313, "rewards/chosen": 0.2564715385437012, "rewards/margins": 3.2229107856750487, "rewards/rejected": -2.9664392471313477, "step": 15482 }, { "epoch": 0.8206609599024727, "grad_norm": 51.75, "kl": 3.913328170776367, "learning_rate": 5e-07, "logits/chosen": 4217147.333333333, "logits/rejected": -82613395.2, "logps/chosen": -288.27634684244794, "logps/rejected": -241.148095703125, "loss": 0.3455, "rewards/chosen": 0.5637508630752563, "rewards/margins": 2.11186420917511, "rewards/rejected": -1.5481133460998535, "step": 15483 }, { "epoch": 0.8207139639042748, "grad_norm": 55.0, "kl": 0.3620567321777344, "learning_rate": 5e-07, "logits/chosen": -63135168.0, "logits/rejected": -12481986.0, "logps/chosen": -622.275634765625, "logps/rejected": -577.9126586914062, "loss": 0.3696, "rewards/chosen": 0.3939476013183594, "rewards/margins": 3.418825626373291, "rewards/rejected": -3.0248780250549316, "step": 15484 }, { "epoch": 0.820766967906077, "grad_norm": 42.25, "kl": 1.2424163818359375, "learning_rate": 5e-07, "logits/chosen": -31406348.8, "logits/rejected": -15141526.666666666, "logps/chosen": -414.49697265625, "logps/rejected": -385.5049641927083, "loss": 0.3097, "rewards/chosen": 0.7687496185302735, "rewards/margins": 3.1906764348347982, "rewards/rejected": -2.421926816304525, "step": 15485 }, { "epoch": 0.820819971907879, "grad_norm": 43.25, "kl": 2.204784393310547, "learning_rate": 5e-07, "logits/chosen": -1393791.2, "logits/rejected": -45508394.666666664, "logps/chosen": -172.10367431640626, "logps/rejected": -164.08596801757812, "loss": 0.3892, "rewards/chosen": 0.25192444324493407, "rewards/margins": 2.336364674568176, "rewards/rejected": -2.084440231323242, "step": 15486 }, { "epoch": 0.8208729759096812, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33517968.0, "logits/rejected": -55419052.0, "logps/chosen": -505.5857849121094, "logps/rejected": -460.79083251953125, "loss": 0.2478, "rewards/chosen": 0.4066459834575653, "rewards/margins": 3.147720545530319, "rewards/rejected": -2.741074562072754, "step": 15487 }, { "epoch": 0.8209259799114833, "grad_norm": 39.5, "kl": 0.7061386108398438, "learning_rate": 5e-07, "logits/chosen": -17645784.0, "logits/rejected": -30976648.0, "logps/chosen": -153.29481506347656, "logps/rejected": -302.15869140625, "loss": 0.3753, "rewards/chosen": -0.06660914421081543, "rewards/margins": 2.0230374336242676, "rewards/rejected": -2.089646577835083, "step": 15488 }, { "epoch": 0.8209789839132855, "grad_norm": 36.75, "kl": 3.9355850219726562, "learning_rate": 5e-07, "logits/chosen": -34065282.666666664, "logits/rejected": -9038022.4, "logps/chosen": -411.0758463541667, "logps/rejected": -613.830810546875, "loss": 0.2461, "rewards/chosen": 0.3952142000198364, "rewards/margins": 4.517054486274719, "rewards/rejected": -4.121840286254883, "step": 15489 }, { "epoch": 0.8210319879150876, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38946282.666666664, "logits/rejected": -25802196.8, "logps/chosen": -387.5492350260417, "logps/rejected": -354.9484130859375, "loss": 0.2552, "rewards/chosen": 0.27429811159769696, "rewards/margins": 2.2692997137705486, "rewards/rejected": -1.9950016021728516, "step": 15490 }, { "epoch": 0.8210849919168898, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 412541.25, "logits/rejected": -7604158.0, "logps/chosen": -47.582786560058594, "logps/rejected": -187.3485310872396, "loss": 0.2279, "rewards/chosen": 0.4659197926521301, "rewards/margins": 2.2211457689603167, "rewards/rejected": -1.7552259763081868, "step": 15491 }, { "epoch": 0.8211379959186919, "grad_norm": 34.0, "kl": 2.6133193969726562, "learning_rate": 5e-07, "logits/chosen": -9253688.0, "logits/rejected": -27764837.333333332, "logps/chosen": -248.6123779296875, "logps/rejected": -265.5697428385417, "loss": 0.2853, "rewards/chosen": 0.671895170211792, "rewards/margins": 5.632105016708374, "rewards/rejected": -4.960209846496582, "step": 15492 }, { "epoch": 0.821190999920494, "grad_norm": 47.25, "kl": 0.8854827880859375, "learning_rate": 5e-07, "logits/chosen": -7954024.0, "logits/rejected": -15337166.666666666, "logps/chosen": -189.2969970703125, "logps/rejected": -248.7311808268229, "loss": 0.133, "rewards/chosen": 1.5362420082092285, "rewards/margins": 3.86201810836792, "rewards/rejected": -2.3257761001586914, "step": 15493 }, { "epoch": 0.8212440039222961, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18923464.0, "logits/rejected": 8791187.2, "logps/chosen": -279.7050374348958, "logps/rejected": -382.102978515625, "loss": 0.2492, "rewards/chosen": 0.8592077891031901, "rewards/margins": 2.5283783594767253, "rewards/rejected": -1.6691705703735351, "step": 15494 }, { "epoch": 0.8212970079240983, "grad_norm": 56.0, "kl": 2.328899383544922, "learning_rate": 5e-07, "logits/chosen": -36138560.0, "logits/rejected": -14842767.0, "logps/chosen": -384.7548421223958, "logps/rejected": -130.91021728515625, "loss": 0.342, "rewards/chosen": 0.8702574570973715, "rewards/margins": 2.8435256083806357, "rewards/rejected": -1.9732681512832642, "step": 15495 }, { "epoch": 0.8213500119259004, "grad_norm": 36.0, "kl": 2.896177291870117, "learning_rate": 5e-07, "logits/chosen": -30325932.8, "logits/rejected": -30302634.666666668, "logps/chosen": -313.7985595703125, "logps/rejected": -359.9225667317708, "loss": 0.2525, "rewards/chosen": 1.1055425643920898, "rewards/margins": 5.0163520812988285, "rewards/rejected": -3.9108095169067383, "step": 15496 }, { "epoch": 0.8214030159277026, "grad_norm": 69.0, "kl": 0.42580223083496094, "learning_rate": 5e-07, "logits/chosen": 220630.0, "logits/rejected": 6327531.5, "logps/chosen": -457.9103088378906, "logps/rejected": -121.86592864990234, "loss": 0.32, "rewards/chosen": 0.14051860570907593, "rewards/margins": 1.7468034625053406, "rewards/rejected": -1.6062848567962646, "step": 15497 }, { "epoch": 0.8214560199295047, "grad_norm": 32.25, "kl": 1.1873245239257812, "learning_rate": 5e-07, "logits/chosen": -25078192.0, "logits/rejected": -66649164.8, "logps/chosen": -162.3428955078125, "logps/rejected": -319.8306396484375, "loss": 0.1469, "rewards/chosen": 1.4920352300008137, "rewards/margins": 4.354545529683431, "rewards/rejected": -2.8625102996826173, "step": 15498 }, { "epoch": 0.8215090239313068, "grad_norm": 65.0, "kl": 2.0344390869140625, "learning_rate": 5e-07, "logits/chosen": -14422944.0, "logits/rejected": -94625120.0, "logps/chosen": -383.7938755580357, "logps/rejected": -554.975341796875, "loss": 0.3866, "rewards/chosen": 0.6106748580932617, "rewards/margins": 2.778424024581909, "rewards/rejected": -2.1677491664886475, "step": 15499 }, { "epoch": 0.821562027933109, "grad_norm": 39.25, "kl": 1.9308547973632812, "learning_rate": 5e-07, "logits/chosen": -16654414.4, "logits/rejected": -19310040.0, "logps/chosen": -238.5566162109375, "logps/rejected": -646.630126953125, "loss": 0.2789, "rewards/chosen": 0.7813501358032227, "rewards/margins": 4.268287976582846, "rewards/rejected": -3.4869378407796225, "step": 15500 }, { "epoch": 0.821615031934911, "grad_norm": 55.5, "kl": 4.0360283851623535, "learning_rate": 5e-07, "logits/chosen": -4058254.6666666665, "logits/rejected": -13144363.0, "logps/chosen": -213.29901123046875, "logps/rejected": -107.46245574951172, "loss": 0.4187, "rewards/chosen": 0.46014352639516193, "rewards/margins": 2.021103580792745, "rewards/rejected": -1.560960054397583, "step": 15501 }, { "epoch": 0.8216680359367132, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66796000.0, "logits/rejected": -23853430.4, "logps/chosen": -663.5231526692709, "logps/rejected": -261.464306640625, "loss": 0.1645, "rewards/chosen": 1.6647440592447917, "rewards/margins": 4.36299196879069, "rewards/rejected": -2.6982479095458984, "step": 15502 }, { "epoch": 0.8217210399385153, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13154153.0, "logits/rejected": -38574088.0, "logps/chosen": -293.173828125, "logps/rejected": -167.47842407226562, "loss": 0.2789, "rewards/chosen": 0.4484604001045227, "rewards/margins": 3.3477302193641663, "rewards/rejected": -2.8992698192596436, "step": 15503 }, { "epoch": 0.8217740439403175, "grad_norm": 46.0, "kl": 0.4579648971557617, "learning_rate": 5e-07, "logits/chosen": -22248766.0, "logits/rejected": -9932544.0, "logps/chosen": -559.18408203125, "logps/rejected": -124.52102661132812, "loss": 0.2789, "rewards/chosen": 1.2819560766220093, "rewards/margins": 2.5261021852493286, "rewards/rejected": -1.2441461086273193, "step": 15504 }, { "epoch": 0.8218270479421196, "grad_norm": 36.75, "kl": 1.4176025390625, "learning_rate": 5e-07, "logits/chosen": -38390754.666666664, "logits/rejected": 15253824.0, "logps/chosen": -347.8270670572917, "logps/rejected": -222.30615234375, "loss": 0.2569, "rewards/chosen": 0.4938880999883016, "rewards/margins": 2.6049991687138876, "rewards/rejected": -2.111111068725586, "step": 15505 }, { "epoch": 0.8218800519439218, "grad_norm": 22.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4239430.0, "logits/rejected": -27900296.0, "logps/chosen": -25.012447357177734, "logps/rejected": -416.1946614583333, "loss": 0.1774, "rewards/chosen": -0.22342853248119354, "rewards/margins": 3.048507884144783, "rewards/rejected": -3.2719364166259766, "step": 15506 }, { "epoch": 0.8219330559457239, "grad_norm": 48.5, "kl": 0.9855155944824219, "learning_rate": 5e-07, "logits/chosen": 3589332.5, "logits/rejected": -18563466.666666668, "logps/chosen": -45.55574035644531, "logps/rejected": -200.97306315104166, "loss": 0.2063, "rewards/chosen": 0.9386615753173828, "rewards/margins": 3.1626113255818686, "rewards/rejected": -2.223949750264486, "step": 15507 }, { "epoch": 0.821986059947526, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59540723.2, "logits/rejected": -29714741.333333332, "logps/chosen": -656.631298828125, "logps/rejected": -281.8459879557292, "loss": 0.2858, "rewards/chosen": 0.6774276733398438, "rewards/margins": 3.6978569666544594, "rewards/rejected": -3.0204292933146157, "step": 15508 }, { "epoch": 0.8220390639493281, "grad_norm": 48.5, "kl": 4.021581649780273, "learning_rate": 5e-07, "logits/chosen": -4593679.0, "logits/rejected": -36338464.0, "logps/chosen": -244.93629455566406, "logps/rejected": -334.2427062988281, "loss": 0.3433, "rewards/chosen": 0.42521393299102783, "rewards/margins": 2.364365577697754, "rewards/rejected": -1.939151644706726, "step": 15509 }, { "epoch": 0.8220920679511303, "grad_norm": 37.25, "kl": 0.7620201110839844, "learning_rate": 5e-07, "logits/chosen": -13406941.333333334, "logits/rejected": -23932491.2, "logps/chosen": -159.82232666015625, "logps/rejected": -372.5003173828125, "loss": 0.2591, "rewards/chosen": -0.04466897249221802, "rewards/margins": 2.883527362346649, "rewards/rejected": -2.928196334838867, "step": 15510 }, { "epoch": 0.8221450719529324, "grad_norm": 50.0, "kl": 1.3211755752563477, "learning_rate": 5e-07, "logits/chosen": 5321320.0, "logits/rejected": 14550877.0, "logps/chosen": -84.85398864746094, "logps/rejected": -449.01995849609375, "loss": 0.3323, "rewards/chosen": 0.35802358388900757, "rewards/margins": 2.356118857860565, "rewards/rejected": -1.9980952739715576, "step": 15511 }, { "epoch": 0.8221980759547346, "grad_norm": 44.75, "kl": 0.16875839233398438, "learning_rate": 5e-07, "logits/chosen": -56570997.333333336, "logits/rejected": -30131177.6, "logps/chosen": -434.4556477864583, "logps/rejected": -579.07529296875, "loss": 0.1778, "rewards/chosen": 0.7064351240793864, "rewards/margins": 3.4882277647654214, "rewards/rejected": -2.781792640686035, "step": 15512 }, { "epoch": 0.8222510799565367, "grad_norm": 48.0, "kl": 0.37666893005371094, "learning_rate": 5e-07, "logits/chosen": -30592664.0, "logits/rejected": -23666710.4, "logps/chosen": -537.5504557291666, "logps/rejected": -200.762744140625, "loss": 0.1623, "rewards/chosen": 1.4625641504923503, "rewards/margins": 3.960789171854655, "rewards/rejected": -2.4982250213623045, "step": 15513 }, { "epoch": 0.8223040839583389, "grad_norm": 38.75, "kl": 1.6192283630371094, "learning_rate": 5e-07, "logits/chosen": -30123208.0, "logits/rejected": -70351392.0, "logps/chosen": -287.6912841796875, "logps/rejected": -450.5835876464844, "loss": 0.1941, "rewards/chosen": 0.8733859062194824, "rewards/margins": 3.885986328125, "rewards/rejected": -3.0126004219055176, "step": 15514 }, { "epoch": 0.822357087960141, "grad_norm": 48.5, "kl": 1.90399169921875, "learning_rate": 5e-07, "logits/chosen": -22803596.0, "logits/rejected": -27825792.0, "logps/chosen": -179.34042358398438, "logps/rejected": -473.41455078125, "loss": 0.2241, "rewards/chosen": 0.7632545232772827, "rewards/margins": 4.2346426248550415, "rewards/rejected": -3.471388101577759, "step": 15515 }, { "epoch": 0.8224100919619431, "grad_norm": 43.25, "kl": 1.0498924255371094, "learning_rate": 5e-07, "logits/chosen": -2677991.5, "logits/rejected": -48917712.0, "logps/chosen": -49.689414978027344, "logps/rejected": -358.1429443359375, "loss": 0.2008, "rewards/chosen": 0.5309074521064758, "rewards/margins": 2.8968016107877097, "rewards/rejected": -2.365894158681234, "step": 15516 }, { "epoch": 0.8224630959637452, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30307540.0, "logits/rejected": -24014312.0, "logps/chosen": -368.8493957519531, "logps/rejected": -287.33929443359375, "loss": 0.2052, "rewards/chosen": 0.6110122799873352, "rewards/margins": 2.4431007504463196, "rewards/rejected": -1.8320884704589844, "step": 15517 }, { "epoch": 0.8225160999655474, "grad_norm": 55.0, "kl": 0.14743995666503906, "learning_rate": 5e-07, "logits/chosen": -10238191.2, "logits/rejected": 52225621.333333336, "logps/chosen": -439.018896484375, "logps/rejected": -853.6040852864584, "loss": 0.257, "rewards/chosen": 0.6173959255218506, "rewards/margins": 4.892497905095418, "rewards/rejected": -4.275101979573567, "step": 15518 }, { "epoch": 0.8225691039673495, "grad_norm": 99.5, "kl": 1.4386253356933594, "learning_rate": 5e-07, "logits/chosen": 53070099.2, "logits/rejected": -42220730.666666664, "logps/chosen": -552.9607421875, "logps/rejected": -472.4951171875, "loss": 0.3766, "rewards/chosen": -0.11618163585662841, "rewards/margins": 2.5149873654047648, "rewards/rejected": -2.631169001261393, "step": 15519 }, { "epoch": 0.8226221079691517, "grad_norm": 57.25, "kl": 0.08702850341796875, "learning_rate": 5e-07, "logits/chosen": -17152352.0, "logits/rejected": -21920032.0, "logps/chosen": -394.8678955078125, "logps/rejected": -283.96608479817706, "loss": 0.3481, "rewards/chosen": 0.31370885372161866, "rewards/margins": 1.9690631945927937, "rewards/rejected": -1.655354340871175, "step": 15520 }, { "epoch": 0.8226751119709538, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 44440568.0, "logits/rejected": -24334285.714285713, "logps/chosen": -421.4114685058594, "logps/rejected": -329.56839425223217, "loss": 0.1751, "rewards/chosen": 0.14287720620632172, "rewards/margins": 2.440996614950044, "rewards/rejected": -2.2981194087437222, "step": 15521 }, { "epoch": 0.822728115972756, "grad_norm": 37.25, "kl": 1.0451154708862305, "learning_rate": 5e-07, "logits/chosen": -24850422.0, "logits/rejected": -10702165.0, "logps/chosen": -90.15784454345703, "logps/rejected": -478.77587890625, "loss": 0.3419, "rewards/chosen": 0.28217026591300964, "rewards/margins": 3.2382712066173553, "rewards/rejected": -2.9561009407043457, "step": 15522 }, { "epoch": 0.822781119974558, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53277048.0, "logits/rejected": -23144509.714285713, "logps/chosen": -612.157958984375, "logps/rejected": -387.9707728794643, "loss": 0.1263, "rewards/chosen": 1.076684594154358, "rewards/margins": 3.5785990272249495, "rewards/rejected": -2.5019144330705916, "step": 15523 }, { "epoch": 0.8228341239763602, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66411608.0, "logits/rejected": -37012944.0, "logps/chosen": -339.6116027832031, "logps/rejected": -377.4019775390625, "loss": 0.2293, "rewards/chosen": 0.03013153001666069, "rewards/margins": 2.1683213226497173, "rewards/rejected": -2.1381897926330566, "step": 15524 }, { "epoch": 0.8228871279781623, "grad_norm": 51.25, "kl": 1.0499763488769531, "learning_rate": 5e-07, "logits/chosen": -41930758.4, "logits/rejected": -13327910.666666666, "logps/chosen": -394.3208984375, "logps/rejected": -98.11182657877605, "loss": 0.3481, "rewards/chosen": 0.42659602165222166, "rewards/margins": 3.126273806889852, "rewards/rejected": -2.6996777852376304, "step": 15525 }, { "epoch": 0.8229401319799645, "grad_norm": 56.25, "kl": 2.1951141357421875, "learning_rate": 5e-07, "logits/chosen": -38110064.0, "logits/rejected": -10770080.8, "logps/chosen": -329.22930908203125, "logps/rejected": -265.799609375, "loss": 0.2255, "rewards/chosen": 1.0906886259714763, "rewards/margins": 2.7333237806955974, "rewards/rejected": -1.6426351547241211, "step": 15526 }, { "epoch": 0.8229931359817666, "grad_norm": 56.75, "kl": 2.3138883113861084, "learning_rate": 5e-07, "logits/chosen": -22444022.4, "logits/rejected": -10204453.333333334, "logps/chosen": -341.2813232421875, "logps/rejected": -267.9019368489583, "loss": 0.356, "rewards/chosen": 0.7658823966979981, "rewards/margins": 1.3769986470540365, "rewards/rejected": -0.6111162503560384, "step": 15527 }, { "epoch": 0.8230461399835688, "grad_norm": 39.75, "kl": 0.5595703125, "learning_rate": 5e-07, "logits/chosen": -40497536.0, "logits/rejected": -166555.0, "logps/chosen": -211.489794921875, "logps/rejected": -84.19423929850261, "loss": 0.2621, "rewards/chosen": 0.5977613925933838, "rewards/margins": 3.815608580907186, "rewards/rejected": -3.2178471883138022, "step": 15528 }, { "epoch": 0.8230991439853709, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42964133.333333336, "logits/rejected": -37818312.0, "logps/chosen": -402.1702473958333, "logps/rejected": -404.35467529296875, "loss": 0.3615, "rewards/chosen": 0.20317025979359946, "rewards/margins": 2.985329588254293, "rewards/rejected": -2.7821593284606934, "step": 15529 }, { "epoch": 0.8231521479871731, "grad_norm": 44.0, "kl": 2.2770652770996094, "learning_rate": 5e-07, "logits/chosen": 1694995.8, "logits/rejected": -17929001.333333332, "logps/chosen": -141.837158203125, "logps/rejected": -357.9535725911458, "loss": 0.4594, "rewards/chosen": -0.15320886373519899, "rewards/margins": 1.9428992549578348, "rewards/rejected": -2.0961081186930337, "step": 15530 }, { "epoch": 0.8232051519889751, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30604422.4, "logits/rejected": -26566570.666666668, "logps/chosen": -288.24619140625, "logps/rejected": -358.7659505208333, "loss": 0.2422, "rewards/chosen": 1.3100003242492675, "rewards/margins": 3.38567533493042, "rewards/rejected": -2.0756750106811523, "step": 15531 }, { "epoch": 0.8232581559907773, "grad_norm": 41.0, "kl": 1.4778194427490234, "learning_rate": 5e-07, "logits/chosen": 4049649.25, "logits/rejected": -31288092.0, "logps/chosen": -266.64764404296875, "logps/rejected": -226.66615295410156, "loss": 0.2327, "rewards/chosen": 1.052564024925232, "rewards/margins": 3.2278531789779663, "rewards/rejected": -2.1752891540527344, "step": 15532 }, { "epoch": 0.8233111599925794, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -98714432.0, "logits/rejected": -21746873.6, "logps/chosen": -254.34627278645834, "logps/rejected": -267.7369140625, "loss": 0.3149, "rewards/chosen": -0.284869909286499, "rewards/margins": 2.2507309436798097, "rewards/rejected": -2.5356008529663088, "step": 15533 }, { "epoch": 0.8233641639943816, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15198424.0, "logits/rejected": -4095026.6666666665, "logps/chosen": -137.41111755371094, "logps/rejected": -194.5970662434896, "loss": 0.1773, "rewards/chosen": -0.025469303131103516, "rewards/margins": 3.3103782335917153, "rewards/rejected": -3.335847536722819, "step": 15534 }, { "epoch": 0.8234171679961837, "grad_norm": 62.5, "kl": 0.8983306884765625, "learning_rate": 5e-07, "logits/chosen": -93653683.2, "logits/rejected": 7916545.333333333, "logps/chosen": -698.590234375, "logps/rejected": -395.3575846354167, "loss": 0.2774, "rewards/chosen": 1.086521053314209, "rewards/margins": 2.9532704989115395, "rewards/rejected": -1.8667494455973308, "step": 15535 }, { "epoch": 0.8234701719979859, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14276596.8, "logits/rejected": -65588618.666666664, "logps/chosen": -489.84462890625, "logps/rejected": -501.3206787109375, "loss": 0.2193, "rewards/chosen": 1.0891427993774414, "rewards/margins": 3.4274179140726724, "rewards/rejected": -2.338275114695231, "step": 15536 }, { "epoch": 0.823523175999788, "grad_norm": 34.75, "kl": 2.0411033630371094, "learning_rate": 5e-07, "logits/chosen": 2875391.3333333335, "logits/rejected": -39653523.2, "logps/chosen": -124.78811645507812, "logps/rejected": -329.46728515625, "loss": 0.2116, "rewards/chosen": 0.6350677410761515, "rewards/margins": 3.126758567492167, "rewards/rejected": -2.491690826416016, "step": 15537 }, { "epoch": 0.8235761800015902, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78549408.0, "logits/rejected": -33341114.666666668, "logps/chosen": -431.6510314941406, "logps/rejected": -352.642333984375, "loss": 0.0816, "rewards/chosen": 1.6384261846542358, "rewards/margins": 5.11613659063975, "rewards/rejected": -3.477710405985514, "step": 15538 }, { "epoch": 0.8236291840033922, "grad_norm": 32.25, "kl": 0.9811697006225586, "learning_rate": 5e-07, "logits/chosen": -1748531.0, "logits/rejected": -4388019.2, "logps/chosen": -285.0802001953125, "logps/rejected": -198.667041015625, "loss": 0.1577, "rewards/chosen": 1.5212621688842773, "rewards/margins": 4.203643798828125, "rewards/rejected": -2.6823816299438477, "step": 15539 }, { "epoch": 0.8236821880051944, "grad_norm": 52.5, "kl": 5.359138488769531, "learning_rate": 5e-07, "logits/chosen": -14626227.2, "logits/rejected": -34741928.0, "logps/chosen": -715.812255859375, "logps/rejected": -382.0049235026042, "loss": 0.2892, "rewards/chosen": 1.532040786743164, "rewards/margins": 4.549487622578939, "rewards/rejected": -3.017446835835775, "step": 15540 }, { "epoch": 0.8237351920069965, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45226176.0, "logits/rejected": -36178502.85714286, "logps/chosen": -202.1046600341797, "logps/rejected": -518.2948172433036, "loss": 0.155, "rewards/chosen": 0.01715087890625, "rewards/margins": 4.17608152117048, "rewards/rejected": -4.15893064226423, "step": 15541 }, { "epoch": 0.8237881960087987, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48759338.666666664, "logits/rejected": -48815273.6, "logps/chosen": -268.6586100260417, "logps/rejected": -380.9316162109375, "loss": 0.2331, "rewards/chosen": 0.6225438912709554, "rewards/margins": 2.5556194146474205, "rewards/rejected": -1.933075523376465, "step": 15542 }, { "epoch": 0.8238412000106008, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40355536.0, "logits/rejected": -27832438.4, "logps/chosen": -209.0677490234375, "logps/rejected": -372.197509765625, "loss": 0.2458, "rewards/chosen": -0.264462947845459, "rewards/margins": 3.1966835975646974, "rewards/rejected": -3.4611465454101564, "step": 15543 }, { "epoch": 0.823894204012403, "grad_norm": 35.75, "kl": 5.488382339477539, "learning_rate": 5e-07, "logits/chosen": -20486581.333333332, "logits/rejected": 20881675.2, "logps/chosen": -354.9814860026042, "logps/rejected": -216.432666015625, "loss": 0.2163, "rewards/chosen": 1.47880220413208, "rewards/margins": 3.1897276878356933, "rewards/rejected": -1.7109254837036132, "step": 15544 }, { "epoch": 0.8239472080142051, "grad_norm": 56.75, "kl": 1.5126953125, "learning_rate": 5e-07, "logits/chosen": -52845397.333333336, "logits/rejected": -3085055.0, "logps/chosen": -296.0694580078125, "logps/rejected": -477.84765625, "loss": 0.4237, "rewards/chosen": 0.2938477198282878, "rewards/margins": 1.6687852541605632, "rewards/rejected": -1.3749375343322754, "step": 15545 }, { "epoch": 0.8240002120160073, "grad_norm": 51.25, "kl": 1.806375503540039, "learning_rate": 5e-07, "logits/chosen": -43751824.0, "logits/rejected": -36846632.0, "logps/chosen": -614.689208984375, "logps/rejected": -385.9237060546875, "loss": 0.1885, "rewards/chosen": 1.509270429611206, "rewards/margins": 4.463618993759155, "rewards/rejected": -2.954348564147949, "step": 15546 }, { "epoch": 0.8240532160178093, "grad_norm": 30.25, "kl": 0.2119913101196289, "learning_rate": 5e-07, "logits/chosen": -82821936.0, "logits/rejected": -22515800.0, "logps/chosen": -739.9676513671875, "logps/rejected": -290.3065999348958, "loss": 0.0935, "rewards/chosen": 2.1561827659606934, "rewards/margins": 5.116532484690348, "rewards/rejected": -2.960349718729655, "step": 15547 }, { "epoch": 0.8241062200196115, "grad_norm": 74.5, "kl": 1.1015167236328125, "learning_rate": 5e-07, "logits/chosen": -34744968.0, "logits/rejected": -9239945.6, "logps/chosen": -463.33837890625, "logps/rejected": -73.0431884765625, "loss": 0.3036, "rewards/chosen": 0.27879029512405396, "rewards/margins": 2.25348778963089, "rewards/rejected": -1.9746974945068358, "step": 15548 }, { "epoch": 0.8241592240214136, "grad_norm": 43.0, "kl": 0.07170295715332031, "learning_rate": 5e-07, "logits/chosen": -52639528.0, "logits/rejected": 8411781.0, "logps/chosen": -370.26953125, "logps/rejected": -453.1059265136719, "loss": 0.1974, "rewards/chosen": 0.5807262659072876, "rewards/margins": 4.900607705116272, "rewards/rejected": -4.319881439208984, "step": 15549 }, { "epoch": 0.8242122280232157, "grad_norm": 54.75, "kl": 0.5743179321289062, "learning_rate": 5e-07, "logits/chosen": -9067902.0, "logits/rejected": -21740242.0, "logps/chosen": -185.02626037597656, "logps/rejected": -430.5816345214844, "loss": 0.3027, "rewards/chosen": 0.6290565729141235, "rewards/margins": 2.173526406288147, "rewards/rejected": -1.5444698333740234, "step": 15550 }, { "epoch": 0.8242652320250179, "grad_norm": 48.75, "kl": 0.8951606750488281, "learning_rate": 5e-07, "logits/chosen": -300308.75, "logits/rejected": 1170846.0, "logps/chosen": -205.63191731770834, "logps/rejected": -65.04689025878906, "loss": 0.3333, "rewards/chosen": 0.6476102670033773, "rewards/margins": 2.2340511878331504, "rewards/rejected": -1.586440920829773, "step": 15551 }, { "epoch": 0.82431823602682, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60811896.0, "logits/rejected": -13952573.333333334, "logps/chosen": -341.93682861328125, "logps/rejected": -245.76151529947916, "loss": 0.228, "rewards/chosen": -0.9706344604492188, "rewards/margins": 2.5015360514322915, "rewards/rejected": -3.4721705118815103, "step": 15552 }, { "epoch": 0.8243712400286222, "grad_norm": 42.5, "kl": 3.766946792602539, "learning_rate": 5e-07, "logits/chosen": -1206778.4, "logits/rejected": -9289029.333333334, "logps/chosen": -143.23616943359374, "logps/rejected": -282.7873942057292, "loss": 0.3179, "rewards/chosen": 0.3880438804626465, "rewards/margins": 2.865039825439453, "rewards/rejected": -2.4769959449768066, "step": 15553 }, { "epoch": 0.8244242440304242, "grad_norm": 37.25, "kl": 0.7418899536132812, "learning_rate": 5e-07, "logits/chosen": -1789224.0, "logits/rejected": -22303074.0, "logps/chosen": -213.24961853027344, "logps/rejected": -281.417724609375, "loss": 0.2531, "rewards/chosen": 0.5898841619491577, "rewards/margins": 2.7335222959518433, "rewards/rejected": -2.1436381340026855, "step": 15554 }, { "epoch": 0.8244772480322264, "grad_norm": 57.0, "kl": 2.825641632080078, "learning_rate": 5e-07, "logits/chosen": -39723456.0, "logits/rejected": -24697618.0, "logps/chosen": -616.011962890625, "logps/rejected": -210.70741271972656, "loss": 0.226, "rewards/chosen": 1.2494821548461914, "rewards/margins": 3.77419114112854, "rewards/rejected": -2.5247089862823486, "step": 15555 }, { "epoch": 0.8245302520340285, "grad_norm": 56.75, "kl": 0.8814277648925781, "learning_rate": 5e-07, "logits/chosen": -10785756.0, "logits/rejected": -38753136.0, "logps/chosen": -277.46612548828125, "logps/rejected": -296.98614501953125, "loss": 0.2493, "rewards/chosen": 0.6239324808120728, "rewards/margins": 2.922732710838318, "rewards/rejected": -2.298800230026245, "step": 15556 }, { "epoch": 0.8245832560358307, "grad_norm": 55.5, "kl": 1.0505313873291016, "learning_rate": 5e-07, "logits/chosen": -15343745.333333334, "logits/rejected": -27912510.0, "logps/chosen": -351.0560302734375, "logps/rejected": -359.5152587890625, "loss": 0.2309, "rewards/chosen": 1.3586862881978352, "rewards/margins": 3.2645784219106035, "rewards/rejected": -1.9058921337127686, "step": 15557 }, { "epoch": 0.8246362600376328, "grad_norm": 44.5, "kl": 4.084650039672852, "learning_rate": 5e-07, "logits/chosen": -28410361.6, "logits/rejected": -53356272.0, "logps/chosen": -298.88046875, "logps/rejected": -518.0250651041666, "loss": 0.2317, "rewards/chosen": 1.1184255599975585, "rewards/margins": 4.438335609436035, "rewards/rejected": -3.3199100494384766, "step": 15558 }, { "epoch": 0.824689264039435, "grad_norm": 33.75, "kl": 2.0202064514160156, "learning_rate": 5e-07, "logits/chosen": -67467002.66666667, "logits/rejected": -10236541.6, "logps/chosen": -222.62646484375, "logps/rejected": -154.84521484375, "loss": 0.2191, "rewards/chosen": 0.4196027119954427, "rewards/margins": 3.603249486287435, "rewards/rejected": -3.1836467742919923, "step": 15559 }, { "epoch": 0.8247422680412371, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12138474.0, "logits/rejected": -18589568.0, "logps/chosen": -302.58819580078125, "logps/rejected": -147.29071044921875, "loss": 0.3123, "rewards/chosen": 0.9851819276809692, "rewards/margins": 2.0475118160247803, "rewards/rejected": -1.062329888343811, "step": 15560 }, { "epoch": 0.8247952720430393, "grad_norm": 43.5, "kl": 3.5009536743164062, "learning_rate": 5e-07, "logits/chosen": -77894864.0, "logits/rejected": -34306168.0, "logps/chosen": -385.9355773925781, "logps/rejected": -440.6962890625, "loss": 0.3536, "rewards/chosen": 0.7923860549926758, "rewards/margins": 2.7703813314437866, "rewards/rejected": -1.9779952764511108, "step": 15561 }, { "epoch": 0.8248482760448413, "grad_norm": 27.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4902591.0, "logits/rejected": 10437722.666666666, "logps/chosen": -105.35983276367188, "logps/rejected": -153.70803833007812, "loss": 0.1135, "rewards/chosen": 0.7292877435684204, "rewards/margins": 4.422529180844625, "rewards/rejected": -3.6932414372762046, "step": 15562 }, { "epoch": 0.8249012800466435, "grad_norm": 33.5, "kl": 2.679279327392578, "learning_rate": 5e-07, "logits/chosen": -3359980.8, "logits/rejected": -36435261.333333336, "logps/chosen": -208.3881591796875, "logps/rejected": -283.8127848307292, "loss": 0.2588, "rewards/chosen": 1.1361049652099608, "rewards/margins": 2.7673372268676757, "rewards/rejected": -1.6312322616577148, "step": 15563 }, { "epoch": 0.8249542840484456, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24395802.666666668, "logits/rejected": -2561990.4, "logps/chosen": -183.29827880859375, "logps/rejected": -150.262353515625, "loss": 0.2759, "rewards/chosen": 0.09231135249137878, "rewards/margins": 2.1516068160533903, "rewards/rejected": -2.0592954635620115, "step": 15564 }, { "epoch": 0.8250072880502478, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58380656.0, "logits/rejected": -48655032.0, "logps/chosen": -365.4726257324219, "logps/rejected": -542.4615478515625, "loss": 0.3174, "rewards/chosen": 0.0018379203975200653, "rewards/margins": 2.31457943841815, "rewards/rejected": -2.31274151802063, "step": 15565 }, { "epoch": 0.8250602920520499, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4043452.5, "logits/rejected": -34049760.0, "logps/chosen": -231.6162109375, "logps/rejected": -389.1510009765625, "loss": 0.2601, "rewards/chosen": 0.03673344850540161, "rewards/margins": 4.08112508058548, "rewards/rejected": -4.044391632080078, "step": 15566 }, { "epoch": 0.8251132960538521, "grad_norm": 46.75, "kl": 6.230352401733398, "learning_rate": 5e-07, "logits/chosen": -8711536.0, "logits/rejected": -11422714.666666666, "logps/chosen": -237.4541748046875, "logps/rejected": -471.5113932291667, "loss": 0.3775, "rewards/chosen": 0.30175256729125977, "rewards/margins": 2.8654653231302896, "rewards/rejected": -2.56371275583903, "step": 15567 }, { "epoch": 0.8251663000556542, "grad_norm": 32.5, "kl": 0.20934677124023438, "learning_rate": 5e-07, "logits/chosen": -16144008.0, "logits/rejected": -43847776.0, "logps/chosen": -290.24017333984375, "logps/rejected": -177.46217346191406, "loss": 0.1595, "rewards/chosen": 1.4918218851089478, "rewards/margins": 3.6332398653030396, "rewards/rejected": -2.141417980194092, "step": 15568 }, { "epoch": 0.8252193040574564, "grad_norm": 44.5, "kl": 0.06255340576171875, "learning_rate": 5e-07, "logits/chosen": -52658816.0, "logits/rejected": -13049095.2, "logps/chosen": -435.9713541666667, "logps/rejected": -245.5520751953125, "loss": 0.2021, "rewards/chosen": 1.1898484230041504, "rewards/margins": 3.5415190696716308, "rewards/rejected": -2.3516706466674804, "step": 15569 }, { "epoch": 0.8252723080592584, "grad_norm": 52.25, "kl": 2.5209836959838867, "learning_rate": 5e-07, "logits/chosen": -14526242.666666666, "logits/rejected": -21665028.0, "logps/chosen": -214.1337890625, "logps/rejected": -584.6840209960938, "loss": 0.3346, "rewards/chosen": 0.544544498125712, "rewards/margins": 4.646092693010966, "rewards/rejected": -4.101548194885254, "step": 15570 }, { "epoch": 0.8253253120610606, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45501632.0, "logits/rejected": -19520882.0, "logps/chosen": -262.03289794921875, "logps/rejected": -399.410400390625, "loss": 0.3183, "rewards/chosen": 0.08139696717262268, "rewards/margins": 2.5620197355747223, "rewards/rejected": -2.4806227684020996, "step": 15571 }, { "epoch": 0.8253783160628627, "grad_norm": 59.5, "kl": 0.25227928161621094, "learning_rate": 5e-07, "logits/chosen": -18542924.8, "logits/rejected": -37055280.0, "logps/chosen": -255.166650390625, "logps/rejected": -490.0359700520833, "loss": 0.2923, "rewards/chosen": 0.28328964710235593, "rewards/margins": 3.291153963406881, "rewards/rejected": -3.007864316304525, "step": 15572 }, { "epoch": 0.8254313200646649, "grad_norm": 58.75, "kl": 2.8090381622314453, "learning_rate": 5e-07, "logits/chosen": -49496531.2, "logits/rejected": 1481314.3333333333, "logps/chosen": -248.9303955078125, "logps/rejected": -101.22747802734375, "loss": 0.3117, "rewards/chosen": 0.7667824745178222, "rewards/margins": 3.4348469098409016, "rewards/rejected": -2.6680644353230796, "step": 15573 }, { "epoch": 0.825484324066467, "grad_norm": 56.75, "kl": 0.8063793182373047, "learning_rate": 5e-07, "logits/chosen": -44262448.0, "logps/chosen": -449.8772888183594, "loss": 0.3785, "rewards/chosen": 0.6908536553382874, "step": 15574 }, { "epoch": 0.8255373280682692, "grad_norm": 53.25, "kl": 2.8406906127929688, "learning_rate": 5e-07, "logits/chosen": 3305284.0, "logits/rejected": -3773804.0, "logps/chosen": -424.1914876302083, "logps/rejected": -83.62545166015624, "loss": 0.1958, "rewards/chosen": 0.8528258800506592, "rewards/margins": 3.7757405757904055, "rewards/rejected": -2.9229146957397463, "step": 15575 }, { "epoch": 0.8255903320700713, "grad_norm": 37.5, "kl": 4.897656440734863, "learning_rate": 5e-07, "logits/chosen": -43875814.4, "logits/rejected": -48438064.0, "logps/chosen": -288.9351318359375, "logps/rejected": -341.4072672526042, "loss": 0.4098, "rewards/chosen": -0.1209288477897644, "rewards/margins": 2.7299123565355936, "rewards/rejected": -2.850841204325358, "step": 15576 }, { "epoch": 0.8256433360718735, "grad_norm": 42.25, "kl": 0.037708282470703125, "learning_rate": 5e-07, "logits/chosen": -54561612.0, "logits/rejected": 29784390.0, "logps/chosen": -418.7315979003906, "logps/rejected": -571.0171508789062, "loss": 0.1651, "rewards/chosen": 1.050602674484253, "rewards/margins": 5.093982458114624, "rewards/rejected": -4.043379783630371, "step": 15577 }, { "epoch": 0.8256963400736755, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26682888.0, "logits/rejected": -25745108.8, "logps/chosen": -420.84130859375, "logps/rejected": -200.92425537109375, "loss": 0.2464, "rewards/chosen": 1.4155913988749187, "rewards/margins": 2.5937247912089028, "rewards/rejected": -1.1781333923339843, "step": 15578 }, { "epoch": 0.8257493440754777, "grad_norm": 31.0, "kl": 0.5423812866210938, "learning_rate": 5e-07, "logits/chosen": -26161712.0, "logits/rejected": -40237491.2, "logps/chosen": -224.99735514322916, "logps/rejected": -464.98447265625, "loss": 0.1745, "rewards/chosen": 0.882732073465983, "rewards/margins": 3.6944052378336587, "rewards/rejected": -2.811673164367676, "step": 15579 }, { "epoch": 0.8258023480772798, "grad_norm": 66.0, "kl": 6.412859916687012, "learning_rate": 5e-07, "logits/chosen": -41909264.0, "logits/rejected": -22908332.0, "logps/chosen": -298.53354899088544, "logps/rejected": -218.700439453125, "loss": 0.3746, "rewards/chosen": 1.0601590474446614, "rewards/margins": 2.4637692769368487, "rewards/rejected": -1.4036102294921875, "step": 15580 }, { "epoch": 0.825855352079082, "grad_norm": 43.5, "kl": 1.26995849609375, "learning_rate": 5e-07, "logits/chosen": -58341424.0, "logits/rejected": -29939763.2, "logps/chosen": -464.1463216145833, "logps/rejected": -330.3669921875, "loss": 0.1966, "rewards/chosen": 0.7578721841176351, "rewards/margins": 3.728702720006307, "rewards/rejected": -2.970830535888672, "step": 15581 }, { "epoch": 0.8259083560808841, "grad_norm": 37.75, "kl": 1.9806041717529297, "learning_rate": 5e-07, "logits/chosen": -6329207.6, "logits/rejected": -32384282.666666668, "logps/chosen": -219.6385009765625, "logps/rejected": -299.24167887369794, "loss": 0.3188, "rewards/chosen": 0.5195311069488525, "rewards/margins": 3.429593578974406, "rewards/rejected": -2.9100624720255532, "step": 15582 }, { "epoch": 0.8259613600826863, "grad_norm": 46.5, "kl": 0.2317371368408203, "learning_rate": 5e-07, "logits/chosen": -34299312.0, "logits/rejected": 1046099.0, "logps/chosen": -633.47509765625, "logps/rejected": -103.24737548828125, "loss": 0.2028, "rewards/chosen": 1.2225430806477864, "rewards/margins": 5.765359242757161, "rewards/rejected": -4.542816162109375, "step": 15583 }, { "epoch": 0.8260143640844884, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 67825968.0, "logits/rejected": -18238408.0, "logps/chosen": -513.4445190429688, "logps/rejected": -336.094482421875, "loss": 0.2092, "rewards/chosen": 0.6511199474334717, "rewards/margins": 4.154973030090332, "rewards/rejected": -3.5038530826568604, "step": 15584 }, { "epoch": 0.8260673680862906, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50233356.0, "logits/rejected": -3206565.0, "logps/chosen": -284.3365478515625, "logps/rejected": -613.3090209960938, "loss": 0.2714, "rewards/chosen": 0.4327239990234375, "rewards/margins": 3.8578133583068848, "rewards/rejected": -3.4250893592834473, "step": 15585 }, { "epoch": 0.8261203720880926, "grad_norm": 55.5, "kl": 0.4213142395019531, "learning_rate": 5e-07, "logits/chosen": -37390738.666666664, "logits/rejected": -34846828.8, "logps/chosen": -449.7236328125, "logps/rejected": -348.76923828125, "loss": 0.253, "rewards/chosen": 0.3571360111236572, "rewards/margins": 3.015102243423462, "rewards/rejected": -2.6579662322998048, "step": 15586 }, { "epoch": 0.8261733760898948, "grad_norm": 49.75, "kl": 2.5827713012695312, "learning_rate": 5e-07, "logits/chosen": -30944994.0, "logits/rejected": -845609.4375, "logps/chosen": -337.8321228027344, "logps/rejected": -236.61044311523438, "loss": 0.3323, "rewards/chosen": 0.43363580107688904, "rewards/margins": 3.2329581677913666, "rewards/rejected": -2.7993223667144775, "step": 15587 }, { "epoch": 0.8262263800916969, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19442072.0, "logits/rejected": -12826058.666666666, "logps/chosen": -378.10052490234375, "logps/rejected": -176.5147908528646, "loss": 0.16, "rewards/chosen": 1.968651533126831, "rewards/margins": 4.652188062667847, "rewards/rejected": -2.6835365295410156, "step": 15588 }, { "epoch": 0.8262793840934991, "grad_norm": 47.25, "kl": 0.02692413330078125, "learning_rate": 5e-07, "logits/chosen": -3320201.6666666665, "logits/rejected": -42115824.0, "logps/chosen": -173.1666463216146, "logps/rejected": -600.0948486328125, "loss": 0.3767, "rewards/chosen": 0.16485286752382913, "rewards/margins": 2.5528426269690194, "rewards/rejected": -2.3879897594451904, "step": 15589 }, { "epoch": 0.8263323880953012, "grad_norm": 58.75, "kl": 2.478271484375, "learning_rate": 5e-07, "logits/chosen": -6405597.333333333, "logits/rejected": 6043555.0, "logps/chosen": -673.330078125, "logps/rejected": -179.63108825683594, "loss": 0.3344, "rewards/chosen": 1.1393925348917644, "rewards/margins": 2.0777603785196943, "rewards/rejected": -0.9383678436279297, "step": 15590 }, { "epoch": 0.8263853920971034, "grad_norm": 47.75, "kl": 1.3696117401123047, "learning_rate": 5e-07, "logits/chosen": -46473840.0, "logits/rejected": -576747.25, "logps/chosen": -260.2445068359375, "logps/rejected": -119.83158874511719, "loss": 0.4199, "rewards/chosen": 0.006199886401494344, "rewards/margins": 4.2176104088624315, "rewards/rejected": -4.2114105224609375, "step": 15591 }, { "epoch": 0.8264383960989055, "grad_norm": 52.25, "kl": 0.7588043212890625, "learning_rate": 5e-07, "logits/chosen": -31370982.4, "logits/rejected": -10786013.333333334, "logps/chosen": -284.718798828125, "logps/rejected": -195.40873209635416, "loss": 0.2586, "rewards/chosen": 0.950322151184082, "rewards/margins": 3.4347426096598306, "rewards/rejected": -2.4844204584757485, "step": 15592 }, { "epoch": 0.8264914001007077, "grad_norm": 57.5, "kl": 4.163885116577148, "learning_rate": 5e-07, "logits/chosen": -39292296.0, "logits/rejected": 2699073.0, "logps/chosen": -324.4186604817708, "logps/rejected": -131.54067993164062, "loss": 0.2391, "rewards/chosen": 1.40891695022583, "rewards/margins": 5.283439636230469, "rewards/rejected": -3.8745226860046387, "step": 15593 }, { "epoch": 0.8265444041025097, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25497745.6, "logits/rejected": -16549026.666666666, "logps/chosen": -377.921142578125, "logps/rejected": -528.3013102213541, "loss": 0.2607, "rewards/chosen": 0.6378401756286621, "rewards/margins": 3.438986047108968, "rewards/rejected": -2.801145871480306, "step": 15594 }, { "epoch": 0.8265974081043119, "grad_norm": 45.0, "kl": 2.408611297607422, "learning_rate": 5e-07, "logits/chosen": -66574592.0, "logits/rejected": -38744893.333333336, "logps/chosen": -546.233642578125, "logps/rejected": -241.26497395833334, "loss": 0.289, "rewards/chosen": 1.2878301620483399, "rewards/margins": 3.2052918752034505, "rewards/rejected": -1.9174617131551106, "step": 15595 }, { "epoch": 0.826650412106114, "grad_norm": 41.0, "kl": 1.4339370727539062, "learning_rate": 5e-07, "logits/chosen": -17823920.0, "logits/rejected": -28177274.666666668, "logps/chosen": -131.4035888671875, "logps/rejected": -258.4568684895833, "loss": 0.4038, "rewards/chosen": -0.15600111484527587, "rewards/margins": 2.1571478605270387, "rewards/rejected": -2.3131489753723145, "step": 15596 }, { "epoch": 0.8267034161079162, "grad_norm": 120.5, "kl": 1.74273681640625, "learning_rate": 5e-07, "logits/chosen": -15921097.142857144, "logits/rejected": -14288036.0, "logps/chosen": -279.490966796875, "logps/rejected": -550.974365234375, "loss": 0.4274, "rewards/chosen": 0.4197100911821638, "rewards/margins": 1.8501544509615218, "rewards/rejected": -1.430444359779358, "step": 15597 }, { "epoch": 0.8267564201097183, "grad_norm": 63.0, "kl": 1.793243408203125, "learning_rate": 5e-07, "logits/chosen": -25053384.0, "logits/rejected": -19197434.666666668, "logps/chosen": -267.02880859375, "logps/rejected": -416.3473307291667, "loss": 0.1437, "rewards/chosen": 0.6519868969917297, "rewards/margins": 3.620108743508657, "rewards/rejected": -2.9681218465169272, "step": 15598 }, { "epoch": 0.8268094241115205, "grad_norm": 38.5, "kl": 0.8800506591796875, "learning_rate": 5e-07, "logits/chosen": -18420320.0, "logits/rejected": -17922064.0, "logps/chosen": -308.4147216796875, "logps/rejected": -280.88323974609375, "loss": 0.2885, "rewards/chosen": 0.9182041168212891, "rewards/margins": 3.0804164886474608, "rewards/rejected": -2.162212371826172, "step": 15599 }, { "epoch": 0.8268624281133226, "grad_norm": 29.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22358280.0, "logits/rejected": -28345768.0, "logps/chosen": -204.01531982421875, "logps/rejected": -576.0552978515625, "loss": 0.1766, "rewards/chosen": 0.8912172913551331, "rewards/margins": 4.502247393131256, "rewards/rejected": -3.611030101776123, "step": 15600 }, { "epoch": 0.8269154321151246, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31455416.0, "logits/rejected": -13935229.333333334, "logps/chosen": -272.9642028808594, "logps/rejected": -256.75604248046875, "loss": 0.1839, "rewards/chosen": 0.8306694030761719, "rewards/margins": 3.4643362363179526, "rewards/rejected": -2.6336668332417807, "step": 15601 }, { "epoch": 0.8269684361169268, "grad_norm": 40.25, "kl": 0.47727203369140625, "learning_rate": 5e-07, "logits/chosen": -6028466.0, "logits/rejected": -29395714.666666668, "logps/chosen": -231.28896484375, "logps/rejected": -326.73699951171875, "loss": 0.2402, "rewards/chosen": 1.1128275871276856, "rewards/margins": 3.2201421737670897, "rewards/rejected": -2.1073145866394043, "step": 15602 }, { "epoch": 0.8270214401187289, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59625696.0, "logits/rejected": -51306042.666666664, "logps/chosen": -406.20390625, "logps/rejected": -795.7701822916666, "loss": 0.3275, "rewards/chosen": 0.3675396919250488, "rewards/margins": 3.451862939198812, "rewards/rejected": -3.084323247273763, "step": 15603 }, { "epoch": 0.8270744441205311, "grad_norm": 54.0, "kl": 1.445150375366211, "learning_rate": 5e-07, "logits/chosen": -17801418.0, "logits/rejected": 66126108.0, "logps/chosen": -202.94430541992188, "logps/rejected": -226.44607543945312, "loss": 0.4053, "rewards/chosen": 0.13191060721874237, "rewards/margins": 0.8130700439214706, "rewards/rejected": -0.6811594367027283, "step": 15604 }, { "epoch": 0.8271274481223332, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33003573.333333332, "logits/rejected": -35295488.0, "logps/chosen": -210.667724609375, "logps/rejected": -373.9247314453125, "loss": 0.2098, "rewards/chosen": 0.5543375809987386, "rewards/margins": 3.409599192937215, "rewards/rejected": -2.8552616119384764, "step": 15605 }, { "epoch": 0.8271804521241354, "grad_norm": 37.75, "kl": 2.1526527404785156, "learning_rate": 5e-07, "logits/chosen": 2210930.75, "logits/rejected": -38653992.0, "logps/chosen": -48.76283264160156, "logps/rejected": -479.61669921875, "loss": 0.2638, "rewards/chosen": 0.46861815452575684, "rewards/margins": 3.8398184776306152, "rewards/rejected": -3.3712003231048584, "step": 15606 }, { "epoch": 0.8272334561259375, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24125554.666666668, "logits/rejected": -824233.5, "logps/chosen": -146.1451619466146, "logps/rejected": -208.85755920410156, "loss": 0.3867, "rewards/chosen": 0.14635437726974487, "rewards/margins": 2.2815369963645935, "rewards/rejected": -2.1351826190948486, "step": 15607 }, { "epoch": 0.8272864601277397, "grad_norm": 41.5, "kl": 2.491318702697754, "learning_rate": 5e-07, "logits/chosen": 352806.25, "logits/rejected": -87074136.0, "logps/chosen": -49.24042892456055, "logps/rejected": -307.4944152832031, "loss": 0.3004, "rewards/chosen": 0.18092602491378784, "rewards/margins": 2.5766634345054626, "rewards/rejected": -2.395737409591675, "step": 15608 }, { "epoch": 0.8273394641295417, "grad_norm": 48.5, "kl": 0.21077919006347656, "learning_rate": 5e-07, "logits/chosen": 961461.0, "logits/rejected": -17218449.6, "logps/chosen": -246.59871419270834, "logps/rejected": -212.592919921875, "loss": 0.2547, "rewards/chosen": 0.8951303958892822, "rewards/margins": 2.469859170913696, "rewards/rejected": -1.574728775024414, "step": 15609 }, { "epoch": 0.8273924681313439, "grad_norm": 80.0, "kl": 0.014252662658691406, "learning_rate": 5e-07, "logits/chosen": 1674535.3333333333, "logits/rejected": 3172690.0, "logps/chosen": -491.8516438802083, "logps/rejected": -357.01611328125, "loss": 0.3501, "rewards/chosen": 0.4224471648534139, "rewards/margins": 2.466716011365255, "rewards/rejected": -2.044268846511841, "step": 15610 }, { "epoch": 0.827445472133146, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -109715952.0, "logits/rejected": -9940324.0, "logps/chosen": -379.71978759765625, "logps/rejected": -207.486083984375, "loss": 0.1658, "rewards/chosen": 0.00410156324505806, "rewards/margins": 3.3266942985355854, "rewards/rejected": -3.3225927352905273, "step": 15611 }, { "epoch": 0.8274984761349482, "grad_norm": 44.75, "kl": 1.2452964782714844, "learning_rate": 5e-07, "logits/chosen": -4370318.4, "logits/rejected": -10333672.0, "logps/chosen": -234.91689453125, "logps/rejected": -380.200439453125, "loss": 0.273, "rewards/chosen": 0.9302180290222168, "rewards/margins": 2.572356192270915, "rewards/rejected": -1.642138163248698, "step": 15612 }, { "epoch": 0.8275514801367503, "grad_norm": 44.5, "kl": 1.2990341186523438, "learning_rate": 5e-07, "logits/chosen": -28020448.0, "logits/rejected": -49183752.0, "logps/chosen": -425.85870361328125, "logps/rejected": -520.5169677734375, "loss": 0.1717, "rewards/chosen": 1.2482857704162598, "rewards/margins": 3.270636558532715, "rewards/rejected": -2.022350788116455, "step": 15613 }, { "epoch": 0.8276044841385525, "grad_norm": 43.75, "kl": 1.7497711181640625, "learning_rate": 5e-07, "logits/chosen": -29306580.0, "logits/rejected": -28153606.0, "logps/chosen": -560.2368774414062, "logps/rejected": -136.4513397216797, "loss": 0.1673, "rewards/chosen": 1.2110075950622559, "rewards/margins": 4.051405429840088, "rewards/rejected": -2.840397834777832, "step": 15614 }, { "epoch": 0.8276574881403546, "grad_norm": 50.5, "kl": 2.9171953201293945, "learning_rate": 5e-07, "logits/chosen": -47484936.0, "logits/rejected": -8855526.285714285, "logps/chosen": -523.52978515625, "logps/rejected": -129.23448835100447, "loss": 0.3066, "rewards/chosen": 0.16153565049171448, "rewards/margins": 1.2143645754882268, "rewards/rejected": -1.0528289249965124, "step": 15615 }, { "epoch": 0.8277104921421568, "grad_norm": 65.5, "kl": 2.5322647094726562, "learning_rate": 5e-07, "logits/chosen": -26819858.0, "logits/rejected": -116858016.0, "logps/chosen": -340.37213134765625, "logps/rejected": -254.42245483398438, "loss": 0.3175, "rewards/chosen": 0.825936496257782, "rewards/margins": 2.398299753665924, "rewards/rejected": -1.572363257408142, "step": 15616 }, { "epoch": 0.8277634961439588, "grad_norm": 38.75, "kl": 0.5612640380859375, "learning_rate": 5e-07, "logits/chosen": -16482349.333333334, "logits/rejected": -10590404.0, "logps/chosen": -198.30367024739584, "logps/rejected": -210.357373046875, "loss": 0.2098, "rewards/chosen": 1.0027024745941162, "rewards/margins": 2.8299769878387453, "rewards/rejected": -1.8272745132446289, "step": 15617 }, { "epoch": 0.827816500145761, "grad_norm": 61.5, "kl": 2.1099777221679688, "learning_rate": 5e-07, "logits/chosen": -31211526.4, "logits/rejected": -17968198.666666668, "logps/chosen": -725.696142578125, "logps/rejected": -192.23213704427084, "loss": 0.2485, "rewards/chosen": 1.4004939079284668, "rewards/margins": 2.906464862823486, "rewards/rejected": -1.5059709548950195, "step": 15618 }, { "epoch": 0.8278695041475631, "grad_norm": 91.5, "kl": 2.6960325241088867, "learning_rate": 5e-07, "logits/chosen": 1561530.75, "logits/rejected": -2327295.75, "logps/chosen": -405.88970947265625, "logps/rejected": -392.8740234375, "loss": 0.2441, "rewards/chosen": 1.1857361793518066, "rewards/margins": 2.8304250240325928, "rewards/rejected": -1.6446888446807861, "step": 15619 }, { "epoch": 0.8279225081493653, "grad_norm": 77.0, "kl": 1.0840492248535156, "learning_rate": 5e-07, "logits/chosen": -19638724.57142857, "logits/rejected": -71131072.0, "logps/chosen": -596.3934849330357, "logps/rejected": -348.02484130859375, "loss": 0.4422, "rewards/chosen": 0.5834620339529855, "rewards/margins": 1.0784907383578164, "rewards/rejected": -0.49502870440483093, "step": 15620 }, { "epoch": 0.8279755121511674, "grad_norm": 48.5, "kl": 0.4658355712890625, "learning_rate": 5e-07, "logits/chosen": -58056133.333333336, "logits/rejected": -31145539.2, "logps/chosen": -545.0218912760416, "logps/rejected": -296.1080078125, "loss": 0.1658, "rewards/chosen": 1.1712036927541096, "rewards/margins": 3.703789981206258, "rewards/rejected": -2.5325862884521486, "step": 15621 }, { "epoch": 0.8280285161529696, "grad_norm": 50.25, "kl": 5.756336212158203, "learning_rate": 5e-07, "logits/chosen": -69311648.0, "logits/rejected": -6815426.5, "logps/chosen": -438.8056233723958, "logps/rejected": -335.64727783203125, "loss": 0.3189, "rewards/chosen": 1.2077486515045166, "rewards/margins": 4.03217887878418, "rewards/rejected": -2.824430227279663, "step": 15622 }, { "epoch": 0.8280815201547717, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51674904.0, "logits/rejected": -3705231.3333333335, "logps/chosen": -421.1094665527344, "logps/rejected": -338.54266357421875, "loss": 0.193, "rewards/chosen": 0.079803466796875, "rewards/margins": 2.7224365870157876, "rewards/rejected": -2.6426331202189126, "step": 15623 }, { "epoch": 0.8281345241565738, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12214576.0, "logits/rejected": 6222292.666666667, "logps/chosen": -414.268701171875, "logps/rejected": -121.64707438151042, "loss": 0.3475, "rewards/chosen": 0.2592581272125244, "rewards/margins": 1.9075050512949625, "rewards/rejected": -1.6482469240824382, "step": 15624 }, { "epoch": 0.8281875281583759, "grad_norm": 33.25, "kl": 5.431652069091797, "learning_rate": 5e-07, "logits/chosen": -8189036.666666667, "logits/rejected": -56885544.0, "logps/chosen": -133.84297688802084, "logps/rejected": -316.6321105957031, "loss": 0.3747, "rewards/chosen": 0.8391855557759603, "rewards/margins": 3.0560503800710044, "rewards/rejected": -2.216864824295044, "step": 15625 }, { "epoch": 0.8282405321601781, "grad_norm": 27.5, "kl": 0.2754335403442383, "learning_rate": 5e-07, "logits/chosen": -28844570.666666668, "logits/rejected": -3719773.6, "logps/chosen": -277.49310302734375, "logps/rejected": -177.8760986328125, "loss": 0.1794, "rewards/chosen": 0.8345498243967692, "rewards/margins": 3.993002430597941, "rewards/rejected": -3.158452606201172, "step": 15626 }, { "epoch": 0.8282935361619802, "grad_norm": 47.5, "kl": 0.05675315856933594, "learning_rate": 5e-07, "logits/chosen": -13164028.0, "logits/rejected": -11046036.0, "logps/chosen": -304.26173909505206, "logps/rejected": -196.60626220703125, "loss": 0.2522, "rewards/chosen": 0.6219824552536011, "rewards/margins": 2.7208651304244995, "rewards/rejected": -2.0988826751708984, "step": 15627 }, { "epoch": 0.8283465401637824, "grad_norm": 38.0, "kl": 1.6765022277832031, "learning_rate": 5e-07, "logits/chosen": -27503208.0, "logits/rejected": -40529337.6, "logps/chosen": -94.10076904296875, "logps/rejected": -473.3390625, "loss": 0.2606, "rewards/chosen": 0.1823935111363729, "rewards/margins": 3.6904326041539512, "rewards/rejected": -3.508039093017578, "step": 15628 }, { "epoch": 0.8283995441655845, "grad_norm": 56.5, "kl": 0.9356422424316406, "learning_rate": 5e-07, "logits/chosen": -40356003.2, "logits/rejected": -67190346.66666667, "logps/chosen": -381.9878173828125, "logps/rejected": -785.5206705729166, "loss": 0.2528, "rewards/chosen": 0.6819867134094239, "rewards/margins": 5.699086729685466, "rewards/rejected": -5.017100016276042, "step": 15629 }, { "epoch": 0.8284525481673867, "grad_norm": 25.5, "kl": 0.7388191223144531, "learning_rate": 5e-07, "logits/chosen": -19977650.0, "logits/rejected": -31005365.333333332, "logps/chosen": -321.8471374511719, "logps/rejected": -376.1442057291667, "loss": 0.1282, "rewards/chosen": 1.953392744064331, "rewards/margins": 5.6989905834198, "rewards/rejected": -3.7455978393554688, "step": 15630 }, { "epoch": 0.8285055521691888, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1040251.625, "logits/rejected": -42818572.0, "logps/chosen": -146.01593017578125, "logps/rejected": -350.1484069824219, "loss": 0.2325, "rewards/chosen": 0.9845498204231262, "rewards/margins": 3.092858612537384, "rewards/rejected": -2.108308792114258, "step": 15631 }, { "epoch": 0.828558556170991, "grad_norm": 65.5, "kl": 0.7993888854980469, "learning_rate": 5e-07, "logits/chosen": 2118098.6666666665, "logits/rejected": -26595064.0, "logps/chosen": -367.4263509114583, "logps/rejected": -312.7400817871094, "loss": 0.4239, "rewards/chosen": -0.0196173960963885, "rewards/margins": 1.9408127491672833, "rewards/rejected": -1.9604301452636719, "step": 15632 }, { "epoch": 0.828611560172793, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15349232.0, "logits/rejected": -2191375.5, "logps/chosen": -330.73822021484375, "logps/rejected": -314.2650451660156, "loss": 0.2845, "rewards/chosen": 0.8680830001831055, "rewards/margins": 2.205461263656616, "rewards/rejected": -1.3373782634735107, "step": 15633 }, { "epoch": 0.8286645641745952, "grad_norm": 46.75, "kl": 8.054615020751953, "learning_rate": 5e-07, "logits/chosen": -29913757.714285713, "logits/rejected": -11326664.0, "logps/chosen": -518.8495047433036, "logps/rejected": -931.3041381835938, "loss": 0.3515, "rewards/chosen": 1.771393094744001, "rewards/margins": 6.326343332018171, "rewards/rejected": -4.55495023727417, "step": 15634 }, { "epoch": 0.8287175681763973, "grad_norm": 36.75, "kl": 1.0249137878417969, "learning_rate": 5e-07, "logits/chosen": -13904056.0, "logits/rejected": -14368660.8, "logps/chosen": -101.46155802408855, "logps/rejected": -168.32340087890626, "loss": 0.223, "rewards/chosen": 0.7830366293589274, "rewards/margins": 2.61144544283549, "rewards/rejected": -1.8284088134765626, "step": 15635 }, { "epoch": 0.8287705721781995, "grad_norm": 47.5, "kl": 1.4496440887451172, "learning_rate": 5e-07, "logits/chosen": -37103216.0, "logits/rejected": -30045162.0, "logps/chosen": -246.74398803710938, "logps/rejected": -463.59539794921875, "loss": 0.3641, "rewards/chosen": 0.48952656984329224, "rewards/margins": 2.010928452014923, "rewards/rejected": -1.5214018821716309, "step": 15636 }, { "epoch": 0.8288235761800016, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72160528.0, "logits/rejected": -3826390.0, "logps/chosen": -396.623779296875, "logps/rejected": -428.25233677455356, "loss": 0.129, "rewards/chosen": 0.7203918695449829, "rewards/margins": 4.131420595305307, "rewards/rejected": -3.411028725760324, "step": 15637 }, { "epoch": 0.8288765801818038, "grad_norm": 50.25, "kl": 0.5055770874023438, "learning_rate": 5e-07, "logits/chosen": -27021040.0, "logits/rejected": -24237456.0, "logps/chosen": -293.3881591796875, "logps/rejected": -343.6780192057292, "loss": 0.3381, "rewards/chosen": 0.18187168836593628, "rewards/margins": 2.284747080008189, "rewards/rejected": -2.1028753916422525, "step": 15638 }, { "epoch": 0.8289295841836058, "grad_norm": 53.0, "kl": 1.906759262084961, "learning_rate": 5e-07, "logits/chosen": -90278485.33333333, "logits/rejected": -19544948.0, "logps/chosen": -425.78662109375, "logps/rejected": -293.9527282714844, "loss": 0.3528, "rewards/chosen": 0.9656709035237631, "rewards/margins": 2.1275902589162192, "rewards/rejected": -1.161919355392456, "step": 15639 }, { "epoch": 0.828982588185408, "grad_norm": 51.25, "kl": 6.839506149291992, "learning_rate": 5e-07, "logits/chosen": -59061435.428571425, "logits/rejected": -1087405.625, "logps/chosen": -175.74204799107142, "logps/rejected": -134.2139434814453, "loss": 0.5249, "rewards/chosen": 0.1646556854248047, "rewards/margins": 0.9797939658164978, "rewards/rejected": -0.8151382803916931, "step": 15640 }, { "epoch": 0.8290355921872101, "grad_norm": 56.25, "kl": 1.469503402709961, "learning_rate": 5e-07, "logits/chosen": -30262678.0, "logits/rejected": -30939412.0, "logps/chosen": -209.507080078125, "logps/rejected": -407.1026611328125, "loss": 0.253, "rewards/chosen": 0.688736081123352, "rewards/margins": 3.756776452064514, "rewards/rejected": -3.068040370941162, "step": 15641 }, { "epoch": 0.8290885961890123, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34349864.0, "logits/rejected": -11566529.0, "logps/chosen": -351.31793212890625, "logps/rejected": -189.07315063476562, "loss": 0.2267, "rewards/chosen": 0.4287261962890625, "rewards/margins": 4.133463382720947, "rewards/rejected": -3.7047371864318848, "step": 15642 }, { "epoch": 0.8291416001908144, "grad_norm": 58.0, "kl": 1.7882461547851562, "learning_rate": 5e-07, "logits/chosen": -22911293.714285713, "logits/rejected": -30144792.0, "logps/chosen": -304.75662667410717, "logps/rejected": -208.46743774414062, "loss": 0.3356, "rewards/chosen": 0.8265087945120675, "rewards/margins": 3.555763040270124, "rewards/rejected": -2.7292542457580566, "step": 15643 }, { "epoch": 0.8291946041926166, "grad_norm": 54.0, "kl": 4.486026763916016, "learning_rate": 5e-07, "logits/chosen": 527974.6666666666, "logits/rejected": -20954022.4, "logps/chosen": -553.987548828125, "logps/rejected": -456.3181640625, "loss": 0.1612, "rewards/chosen": 1.2298589547475178, "rewards/margins": 3.8838746865590412, "rewards/rejected": -2.6540157318115236, "step": 15644 }, { "epoch": 0.8292476081944187, "grad_norm": 43.5, "kl": 0.33728599548339844, "learning_rate": 5e-07, "logits/chosen": -2839853.2, "logits/rejected": -1347020.5, "logps/chosen": -249.373876953125, "logps/rejected": -105.28314208984375, "loss": 0.2577, "rewards/chosen": 1.4501195907592774, "rewards/margins": 4.1699936548868815, "rewards/rejected": -2.719874064127604, "step": 15645 }, { "epoch": 0.8293006121962209, "grad_norm": 55.75, "kl": 2.3339309692382812, "learning_rate": 5e-07, "logits/chosen": -2979774.6, "logits/rejected": -70609557.33333333, "logps/chosen": -327.4654541015625, "logps/rejected": -270.2899576822917, "loss": 0.3681, "rewards/chosen": 0.1315520167350769, "rewards/margins": 2.118023192882538, "rewards/rejected": -1.986471176147461, "step": 15646 }, { "epoch": 0.829353616198023, "grad_norm": 57.5, "kl": 0.2493114471435547, "learning_rate": 5e-07, "logits/chosen": -73990696.0, "logits/rejected": -1432591.0, "logps/chosen": -211.4937744140625, "logps/rejected": -156.17499651227678, "loss": 0.2572, "rewards/chosen": 0.7699005007743835, "rewards/margins": 2.3830310702323914, "rewards/rejected": -1.6131305694580078, "step": 15647 }, { "epoch": 0.8294066201998251, "grad_norm": 54.5, "kl": 1.3944149017333984, "learning_rate": 5e-07, "logits/chosen": -40499520.0, "logits/rejected": -11122876.0, "logps/chosen": -490.2125549316406, "logps/rejected": -196.80072021484375, "loss": 0.2462, "rewards/chosen": 1.1642646789550781, "rewards/margins": 2.9427443742752075, "rewards/rejected": -1.7784796953201294, "step": 15648 }, { "epoch": 0.8294596242016272, "grad_norm": 57.75, "kl": 0.5398311614990234, "learning_rate": 5e-07, "logits/chosen": -4531624.0, "logits/rejected": -12123505.6, "logps/chosen": -175.29229736328125, "logps/rejected": -147.13465576171876, "loss": 0.2851, "rewards/chosen": 0.5498871008555094, "rewards/margins": 2.079242245356242, "rewards/rejected": -1.5293551445007325, "step": 15649 }, { "epoch": 0.8295126282034294, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17628220.0, "logits/rejected": -33917040.0, "logps/chosen": -351.4967447916667, "logps/rejected": -439.63427734375, "loss": 0.224, "rewards/chosen": 0.44886549313863117, "rewards/margins": 3.7983245690663656, "rewards/rejected": -3.3494590759277343, "step": 15650 }, { "epoch": 0.8295656322052315, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -940885.0, "logits/rejected": -5230793.0, "logps/chosen": -359.310791015625, "logps/rejected": -239.9530029296875, "loss": 0.2253, "rewards/chosen": 0.6332647204399109, "rewards/margins": 2.51897797981898, "rewards/rejected": -1.885713259379069, "step": 15651 }, { "epoch": 0.8296186362070336, "grad_norm": 43.0, "kl": 5.213233947753906, "learning_rate": 5e-07, "logits/chosen": -25713248.0, "logits/rejected": -56355701.333333336, "logps/chosen": -305.69755859375, "logps/rejected": -742.1053059895834, "loss": 0.3081, "rewards/chosen": 0.46162190437316897, "rewards/margins": 4.618000014623006, "rewards/rejected": -4.156378110249837, "step": 15652 }, { "epoch": 0.8296716402088358, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17851730.0, "logits/rejected": -15626061.0, "logps/chosen": -352.966796875, "logps/rejected": -368.87762451171875, "loss": 0.2325, "rewards/chosen": 0.6605364680290222, "rewards/margins": 3.216014802455902, "rewards/rejected": -2.55547833442688, "step": 15653 }, { "epoch": 0.8297246442106379, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80490677.33333333, "logits/rejected": -12565622.4, "logps/chosen": -581.205810546875, "logps/rejected": -223.36640625, "loss": 0.3428, "rewards/chosen": -0.26100871960322064, "rewards/margins": 1.5277676781018574, "rewards/rejected": -1.7887763977050781, "step": 15654 }, { "epoch": 0.82977764821244, "grad_norm": 51.75, "kl": 0.437896728515625, "learning_rate": 5e-07, "logits/chosen": -25068669.333333332, "logits/rejected": -26363812.0, "logps/chosen": -224.02213541666666, "logps/rejected": -203.19723510742188, "loss": 0.289, "rewards/chosen": 0.7498664061228434, "rewards/margins": 2.9588026205698648, "rewards/rejected": -2.2089362144470215, "step": 15655 }, { "epoch": 0.8298306522142421, "grad_norm": 42.5, "kl": 1.2039337158203125, "learning_rate": 5e-07, "logits/chosen": -24680140.0, "logits/rejected": -5356771.0, "logps/chosen": -336.83746337890625, "logps/rejected": -359.1194763183594, "loss": 0.2251, "rewards/chosen": 0.805682897567749, "rewards/margins": 3.468003034591675, "rewards/rejected": -2.662320137023926, "step": 15656 }, { "epoch": 0.8298836562160443, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14404356.8, "logits/rejected": -2125906.6666666665, "logps/chosen": -221.1293212890625, "logps/rejected": -381.0547688802083, "loss": 0.213, "rewards/chosen": 0.8909591674804688, "rewards/margins": 4.15371265411377, "rewards/rejected": -3.262753486633301, "step": 15657 }, { "epoch": 0.8299366602178464, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6608309.333333333, "logits/rejected": 47466316.8, "logps/chosen": -271.9637044270833, "logps/rejected": -313.6821044921875, "loss": 0.279, "rewards/chosen": -0.12609743078549704, "rewards/margins": 2.0691956142584482, "rewards/rejected": -2.1952930450439454, "step": 15658 }, { "epoch": 0.8299896642196486, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44675356.0, "logits/rejected": -21730549.333333332, "logps/chosen": -236.4715576171875, "logps/rejected": -465.922119140625, "loss": 0.2644, "rewards/chosen": 0.16324463486671448, "rewards/margins": 2.6390794813632965, "rewards/rejected": -2.475834846496582, "step": 15659 }, { "epoch": 0.8300426682214507, "grad_norm": 62.75, "kl": 1.0039291381835938, "learning_rate": 5e-07, "logits/chosen": -38381280.0, "logits/rejected": -64275576.0, "logps/chosen": -273.6805725097656, "logps/rejected": -695.1316528320312, "loss": 0.2181, "rewards/chosen": 0.5371782779693604, "rewards/margins": 4.094801664352417, "rewards/rejected": -3.5576233863830566, "step": 15660 }, { "epoch": 0.8300956722232529, "grad_norm": 51.75, "kl": 4.762203216552734, "learning_rate": 5e-07, "logits/chosen": -11085876.8, "logits/rejected": -40914018.666666664, "logps/chosen": -424.6177734375, "logps/rejected": -380.8846842447917, "loss": 0.2891, "rewards/chosen": 0.7809664726257324, "rewards/margins": 3.3025586128234865, "rewards/rejected": -2.521592140197754, "step": 15661 }, { "epoch": 0.830148676225055, "grad_norm": 59.0, "kl": 2.9256820678710938, "learning_rate": 5e-07, "logits/chosen": -15583886.4, "logits/rejected": -106697301.33333333, "logps/chosen": -389.9255859375, "logps/rejected": -416.7893880208333, "loss": 0.198, "rewards/chosen": 1.3773737907409669, "rewards/margins": 4.135600821177165, "rewards/rejected": -2.7582270304361978, "step": 15662 }, { "epoch": 0.8302016802268571, "grad_norm": 46.25, "kl": 1.8139801025390625, "learning_rate": 5e-07, "logits/chosen": -19862405.333333332, "logits/rejected": -28563129.6, "logps/chosen": -240.00909423828125, "logps/rejected": -213.0564453125, "loss": 0.3435, "rewards/chosen": -0.04634641110897064, "rewards/margins": 1.7308123260736465, "rewards/rejected": -1.7771587371826172, "step": 15663 }, { "epoch": 0.8302546842286592, "grad_norm": 35.0, "kl": 0.040058135986328125, "learning_rate": 5e-07, "logits/chosen": 3344368.5, "logits/rejected": -15803299.0, "logps/chosen": -149.76168823242188, "logps/rejected": -330.39691162109375, "loss": 0.319, "rewards/chosen": 0.12771014869213104, "rewards/margins": 2.114094063639641, "rewards/rejected": -1.9863839149475098, "step": 15664 }, { "epoch": 0.8303076882304614, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88641136.0, "logits/rejected": -13043574.666666666, "logps/chosen": -395.55316162109375, "logps/rejected": -344.4950358072917, "loss": 0.1759, "rewards/chosen": 0.21965789794921875, "rewards/margins": 2.920869509379069, "rewards/rejected": -2.70121161142985, "step": 15665 }, { "epoch": 0.8303606922322635, "grad_norm": 88.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9108737.0, "logits/rejected": -10681782.0, "logps/chosen": -202.3699493408203, "logps/rejected": -432.0213317871094, "loss": 0.361, "rewards/chosen": -0.14193668961524963, "rewards/margins": 1.6536453664302826, "rewards/rejected": -1.7955820560455322, "step": 15666 }, { "epoch": 0.8304136962340657, "grad_norm": 51.0, "kl": 0.1352081298828125, "learning_rate": 5e-07, "logits/chosen": -28305140.0, "logits/rejected": -17368704.0, "logps/chosen": -162.18759155273438, "logps/rejected": -291.01800537109375, "loss": 0.263, "rewards/chosen": 0.49163612723350525, "rewards/margins": 2.6103366315364838, "rewards/rejected": -2.1187005043029785, "step": 15667 }, { "epoch": 0.8304667002358678, "grad_norm": 38.75, "kl": 3.8367462158203125, "learning_rate": 5e-07, "logits/chosen": -39112232.0, "logits/rejected": -21051040.0, "logps/chosen": -224.89129638671875, "logps/rejected": -394.8367004394531, "loss": 0.2938, "rewards/chosen": 0.7593958377838135, "rewards/margins": 3.4148714542388916, "rewards/rejected": -2.655475616455078, "step": 15668 }, { "epoch": 0.83051970423767, "grad_norm": 69.0, "kl": 2.0585098266601562, "learning_rate": 5e-07, "logits/chosen": -17682104.0, "logits/rejected": -37288488.0, "logps/chosen": -421.42919921875, "logps/rejected": -406.686767578125, "loss": 0.3225, "rewards/chosen": 0.7042493025461832, "rewards/margins": 3.539271513621012, "rewards/rejected": -2.835022211074829, "step": 15669 }, { "epoch": 0.830572708239472, "grad_norm": 42.75, "kl": 0.7434921264648438, "learning_rate": 5e-07, "logits/chosen": -2758860.1666666665, "logits/rejected": -22040315.2, "logps/chosen": -281.5496012369792, "logps/rejected": -226.1237548828125, "loss": 0.1973, "rewards/chosen": 1.5606406529744465, "rewards/margins": 3.029260285695394, "rewards/rejected": -1.4686196327209473, "step": 15670 }, { "epoch": 0.8306257122412742, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5931300.0, "logits/rejected": -44869792.0, "logps/chosen": -348.9603678385417, "logps/rejected": -410.96328125, "loss": 0.2545, "rewards/chosen": 0.24355258544286093, "rewards/margins": 2.561608692010244, "rewards/rejected": -2.318056106567383, "step": 15671 }, { "epoch": 0.8306787162430763, "grad_norm": 34.0, "kl": 2.020130157470703, "learning_rate": 5e-07, "logits/chosen": -2306419.0, "logits/rejected": -10821284.0, "logps/chosen": -162.4215850830078, "logps/rejected": -135.7539265950521, "loss": 0.1834, "rewards/chosen": 1.0976027250289917, "rewards/margins": 3.657112161318461, "rewards/rejected": -2.5595094362894693, "step": 15672 }, { "epoch": 0.8307317202448785, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12121206.0, "logits/rejected": -15323738.0, "logps/chosen": -536.3273315429688, "logps/rejected": -566.8265380859375, "loss": 0.3087, "rewards/chosen": 0.19475364685058594, "rewards/margins": 2.934678792953491, "rewards/rejected": -2.7399251461029053, "step": 15673 }, { "epoch": 0.8307847242466806, "grad_norm": 67.5, "kl": 0.015273094177246094, "learning_rate": 5e-07, "logits/chosen": -22997550.4, "logits/rejected": -12640752.0, "logps/chosen": -433.96083984375, "logps/rejected": -246.0066935221354, "loss": 0.3979, "rewards/chosen": 0.06455474495887756, "rewards/margins": 1.27475648522377, "rewards/rejected": -1.2102017402648926, "step": 15674 }, { "epoch": 0.8308377282484828, "grad_norm": 45.75, "kl": 1.5030746459960938, "learning_rate": 5e-07, "logits/chosen": 52286088.0, "logits/rejected": -66631140.0, "logps/chosen": -197.7010040283203, "logps/rejected": -547.4927978515625, "loss": 0.3133, "rewards/chosen": 0.08142537623643875, "rewards/margins": 3.1812011674046516, "rewards/rejected": -3.099775791168213, "step": 15675 }, { "epoch": 0.8308907322502849, "grad_norm": 34.25, "kl": 1.2018470764160156, "learning_rate": 5e-07, "logits/chosen": 1700996.0, "logits/rejected": 7599912.8, "logps/chosen": -291.5673828125, "logps/rejected": -398.801171875, "loss": 0.213, "rewards/chosen": 0.4664998451868693, "rewards/margins": 3.0288518349329627, "rewards/rejected": -2.5623519897460936, "step": 15676 }, { "epoch": 0.8309437362520871, "grad_norm": 34.75, "kl": 3.1445693969726562, "learning_rate": 5e-07, "logits/chosen": -18509518.0, "logits/rejected": -23458890.0, "logps/chosen": -262.007080078125, "logps/rejected": -381.846923828125, "loss": 0.297, "rewards/chosen": 0.9904926419258118, "rewards/margins": 2.8065791726112366, "rewards/rejected": -1.8160865306854248, "step": 15677 }, { "epoch": 0.8309967402538891, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43899692.0, "logits/rejected": -52058408.0, "logps/chosen": -163.93377685546875, "logps/rejected": -315.0211486816406, "loss": 0.2705, "rewards/chosen": 0.13225960731506348, "rewards/margins": 2.981487512588501, "rewards/rejected": -2.8492279052734375, "step": 15678 }, { "epoch": 0.8310497442556913, "grad_norm": 55.0, "kl": 2.9820480346679688, "learning_rate": 5e-07, "logits/chosen": -25590080.0, "logits/rejected": -35115320.0, "logps/chosen": -268.1562906901042, "logps/rejected": -389.0339050292969, "loss": 0.3375, "rewards/chosen": 0.747116486231486, "rewards/margins": 5.109551350275676, "rewards/rejected": -4.3624348640441895, "step": 15679 }, { "epoch": 0.8311027482574934, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17640572.0, "logits/rejected": -13766235.42857143, "logps/chosen": -544.3616943359375, "logps/rejected": -264.24647739955356, "loss": 0.1598, "rewards/chosen": 0.4664367735385895, "rewards/margins": 2.854312466723578, "rewards/rejected": -2.3878756931849887, "step": 15680 }, { "epoch": 0.8311557522592956, "grad_norm": 51.25, "kl": 3.687915802001953, "learning_rate": 5e-07, "logits/chosen": -31817216.0, "logits/rejected": -8618706.666666666, "logps/chosen": -447.374267578125, "logps/rejected": -288.00510660807294, "loss": 0.3157, "rewards/chosen": 0.7787259101867676, "rewards/margins": 3.5458354632059734, "rewards/rejected": -2.7671095530192056, "step": 15681 }, { "epoch": 0.8312087562610977, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15740346.0, "logits/rejected": -29961874.0, "logps/chosen": -253.8094940185547, "logps/rejected": -365.1165771484375, "loss": 0.266, "rewards/chosen": 0.015447620302438736, "rewards/margins": 3.4324021376669407, "rewards/rejected": -3.416954517364502, "step": 15682 }, { "epoch": 0.8312617602628999, "grad_norm": 44.25, "kl": 3.6569013595581055, "learning_rate": 5e-07, "logits/chosen": 238012.0, "logits/rejected": -26319162.0, "logps/chosen": -171.4447021484375, "logps/rejected": -252.35867309570312, "loss": 0.4255, "rewards/chosen": 0.2849733829498291, "rewards/margins": 3.4044086933135986, "rewards/rejected": -3.1194353103637695, "step": 15683 }, { "epoch": 0.831314764264702, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12197927.2, "logits/rejected": -26090848.0, "logps/chosen": -147.460986328125, "logps/rejected": -320.759033203125, "loss": 0.3217, "rewards/chosen": 0.35456366539001466, "rewards/margins": 2.235183032353719, "rewards/rejected": -1.8806193669637044, "step": 15684 }, { "epoch": 0.8313677682665042, "grad_norm": 38.0, "kl": 5.689731597900391, "learning_rate": 5e-07, "logits/chosen": -7607504.0, "logits/rejected": 5701619.5, "logps/chosen": -238.73200334821428, "logps/rejected": -35.206844329833984, "loss": 0.4718, "rewards/chosen": 0.8379977090018136, "rewards/margins": 1.47346350976399, "rewards/rejected": -0.6354658007621765, "step": 15685 }, { "epoch": 0.8314207722683062, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42164371.2, "logits/rejected": -15602356.0, "logps/chosen": -239.539794921875, "logps/rejected": -117.05047607421875, "loss": 0.2855, "rewards/chosen": 0.48121018409729005, "rewards/margins": 3.2538993676503503, "rewards/rejected": -2.77268918355306, "step": 15686 }, { "epoch": 0.8314737762701084, "grad_norm": 47.0, "kl": 1.4205036163330078, "learning_rate": 5e-07, "logits/chosen": 23803100.0, "logits/rejected": -36259426.666666664, "logps/chosen": -161.14817810058594, "logps/rejected": -403.1886393229167, "loss": 0.2171, "rewards/chosen": 0.5153539776802063, "rewards/margins": 2.3026709357897444, "rewards/rejected": -1.7873169581095378, "step": 15687 }, { "epoch": 0.8315267802719105, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13261690.0, "logits/rejected": -18564289.14285714, "logps/chosen": -229.68161010742188, "logps/rejected": -457.0142299107143, "loss": 0.1598, "rewards/chosen": -0.20991821587085724, "rewards/margins": 2.539628652589662, "rewards/rejected": -2.749546868460519, "step": 15688 }, { "epoch": 0.8315797842737127, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20068749.333333332, "logits/rejected": 20150803.2, "logps/chosen": -164.14090983072916, "logps/rejected": -379.730322265625, "loss": 0.2427, "rewards/chosen": 0.9972894986470541, "rewards/margins": 2.510642655690511, "rewards/rejected": -1.513353157043457, "step": 15689 }, { "epoch": 0.8316327882755148, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -79919738.66666667, "logits/rejected": -7141034.4, "logps/chosen": -252.59562174479166, "logps/rejected": -272.280615234375, "loss": 0.1809, "rewards/chosen": 1.0045560201009114, "rewards/margins": 3.722803815205892, "rewards/rejected": -2.7182477951049804, "step": 15690 }, { "epoch": 0.831685792277317, "grad_norm": 47.0, "kl": 4.631452560424805, "learning_rate": 5e-07, "logits/chosen": -55939984.0, "logits/rejected": -59278628.0, "logps/chosen": -784.1742553710938, "logps/rejected": -554.9041137695312, "loss": 0.2113, "rewards/chosen": 1.4379100799560547, "rewards/margins": 4.355166435241699, "rewards/rejected": -2.9172563552856445, "step": 15691 }, { "epoch": 0.8317387962791191, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21238908.0, "logits/rejected": -18787061.333333332, "logps/chosen": -743.8287353515625, "logps/rejected": -193.842529296875, "loss": 0.168, "rewards/chosen": 2.614367723464966, "rewards/margins": 4.683753887812296, "rewards/rejected": -2.0693861643473306, "step": 15692 }, { "epoch": 0.8317918002809213, "grad_norm": 52.75, "kl": 2.1996002197265625, "learning_rate": 5e-07, "logits/chosen": -33266350.0, "logits/rejected": -31752252.0, "logps/chosen": -683.1090087890625, "logps/rejected": -466.870849609375, "loss": 0.2825, "rewards/chosen": 0.5068192481994629, "rewards/margins": 3.2245235443115234, "rewards/rejected": -2.7177042961120605, "step": 15693 }, { "epoch": 0.8318448042827233, "grad_norm": 59.5, "kl": 2.6327171325683594, "learning_rate": 5e-07, "logits/chosen": 7197163.0, "logits/rejected": -50049776.0, "logps/chosen": -266.17840576171875, "logps/rejected": -352.0061340332031, "loss": 0.3305, "rewards/chosen": 0.9014867544174194, "rewards/margins": 2.603182315826416, "rewards/rejected": -1.7016955614089966, "step": 15694 }, { "epoch": 0.8318978082845255, "grad_norm": 37.0, "kl": 1.1837940216064453, "learning_rate": 5e-07, "logits/chosen": -13042474.0, "logits/rejected": -25619738.0, "logps/chosen": -139.3411865234375, "logps/rejected": -263.91973876953125, "loss": 0.2195, "rewards/chosen": 0.7777871489524841, "rewards/margins": 3.7677785754203796, "rewards/rejected": -2.9899914264678955, "step": 15695 }, { "epoch": 0.8319508122863276, "grad_norm": 56.25, "kl": 4.870657920837402, "learning_rate": 5e-07, "logits/chosen": -59842500.0, "logits/rejected": -19824996.0, "logps/chosen": -669.157958984375, "logps/rejected": -300.2599792480469, "loss": 0.2832, "rewards/chosen": 1.3388363122940063, "rewards/margins": 2.3898273706436157, "rewards/rejected": -1.0509910583496094, "step": 15696 }, { "epoch": 0.8320038162881298, "grad_norm": 46.0, "kl": 4.104511260986328, "learning_rate": 5e-07, "logits/chosen": -23266820.0, "logits/rejected": -19124962.0, "logps/chosen": -326.257080078125, "logps/rejected": -572.191650390625, "loss": 0.3596, "rewards/chosen": 0.4720601439476013, "rewards/margins": 2.4215356707572937, "rewards/rejected": -1.9494755268096924, "step": 15697 }, { "epoch": 0.8320568202899319, "grad_norm": 46.0, "kl": 0.4861602783203125, "learning_rate": 5e-07, "logits/chosen": -44095952.0, "logits/rejected": -26653235.2, "logps/chosen": -376.3736979166667, "logps/rejected": -354.6509765625, "loss": 0.2053, "rewards/chosen": 0.32059045632680255, "rewards/margins": 3.438449533780416, "rewards/rejected": -3.1178590774536135, "step": 15698 }, { "epoch": 0.8321098242917341, "grad_norm": 65.0, "kl": 2.0415515899658203, "learning_rate": 5e-07, "logits/chosen": -39041400.0, "logits/rejected": -29400646.0, "logps/chosen": -391.0386962890625, "logps/rejected": -376.9651794433594, "loss": 0.3557, "rewards/chosen": 0.5775043567021688, "rewards/margins": 2.748944958051046, "rewards/rejected": -2.171440601348877, "step": 15699 }, { "epoch": 0.8321628282935362, "grad_norm": 157.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56237280.0, "logits/rejected": -16155790.0, "logps/chosen": -284.8157958984375, "logps/rejected": -165.96469116210938, "loss": 0.1585, "rewards/chosen": 0.9123298525810242, "rewards/margins": 5.557027518749237, "rewards/rejected": -4.644697666168213, "step": 15700 }, { "epoch": 0.8322158322953382, "grad_norm": 55.75, "kl": 4.067906379699707, "learning_rate": 5e-07, "logits/chosen": -30665244.8, "logits/rejected": -13641938.666666666, "logps/chosen": -571.062109375, "logps/rejected": -170.29815673828125, "loss": 0.3363, "rewards/chosen": 1.0960816383361816, "rewards/margins": 1.8409357706705727, "rewards/rejected": -0.7448541323343912, "step": 15701 }, { "epoch": 0.8322688362971404, "grad_norm": 40.75, "kl": 2.421884536743164, "learning_rate": 5e-07, "logits/chosen": -124247000.0, "logits/rejected": -41382234.666666664, "logps/chosen": -386.87713623046875, "logps/rejected": -425.0305989583333, "loss": 0.1492, "rewards/chosen": 1.0328311920166016, "rewards/margins": 3.6610209147135415, "rewards/rejected": -2.62818972269694, "step": 15702 }, { "epoch": 0.8323218402989425, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23442517.333333332, "logits/rejected": -47366524.8, "logps/chosen": -343.78759765625, "logps/rejected": -274.015380859375, "loss": 0.2552, "rewards/chosen": 0.5070684353510538, "rewards/margins": 2.769802657763163, "rewards/rejected": -2.262734222412109, "step": 15703 }, { "epoch": 0.8323748443007447, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66229946.666666664, "logits/rejected": -29550710.4, "logps/chosen": -319.07887776692706, "logps/rejected": -361.47021484375, "loss": 0.2943, "rewards/chosen": 0.058260599772135414, "rewards/margins": 1.9507609049479167, "rewards/rejected": -1.8925003051757812, "step": 15704 }, { "epoch": 0.8324278483025468, "grad_norm": 52.75, "kl": 0.48720550537109375, "learning_rate": 5e-07, "logits/chosen": -34442534.4, "logits/rejected": -23462989.333333332, "logps/chosen": -430.045849609375, "logps/rejected": -293.5860595703125, "loss": 0.2414, "rewards/chosen": 0.7264986038208008, "rewards/margins": 5.105323473612468, "rewards/rejected": -4.378824869791667, "step": 15705 }, { "epoch": 0.832480852304349, "grad_norm": 53.0, "kl": 7.750823974609375, "learning_rate": 5e-07, "logits/chosen": -29156037.333333332, "logits/rejected": -19810150.0, "logps/chosen": -271.5239664713542, "logps/rejected": -221.73110961914062, "loss": 0.2759, "rewards/chosen": 1.3354026476542156, "rewards/margins": 2.781410614649455, "rewards/rejected": -1.4460079669952393, "step": 15706 }, { "epoch": 0.8325338563061511, "grad_norm": 41.25, "kl": 0.9056930541992188, "learning_rate": 5e-07, "logits/chosen": -25815689.6, "logits/rejected": -23981194.666666668, "logps/chosen": -325.2415283203125, "logps/rejected": -394.6474202473958, "loss": 0.2466, "rewards/chosen": 1.1349575996398926, "rewards/margins": 4.169245052337646, "rewards/rejected": -3.034287452697754, "step": 15707 }, { "epoch": 0.8325868603079533, "grad_norm": 48.75, "kl": 0.5176162719726562, "learning_rate": 5e-07, "logits/chosen": -9598372.0, "logits/rejected": -9332742.4, "logps/chosen": -135.1692097981771, "logps/rejected": -304.673876953125, "loss": 0.3301, "rewards/chosen": -0.01126130297780037, "rewards/margins": 1.3760951690375804, "rewards/rejected": -1.3873564720153808, "step": 15708 }, { "epoch": 0.8326398643097553, "grad_norm": 46.5, "kl": 2.9396018981933594, "learning_rate": 5e-07, "logits/chosen": -14913104.0, "logits/rejected": -23343010.0, "logps/chosen": -194.04617309570312, "logps/rejected": -228.64288330078125, "loss": 0.3239, "rewards/chosen": 0.77592533826828, "rewards/margins": 1.995379626750946, "rewards/rejected": -1.219454288482666, "step": 15709 }, { "epoch": 0.8326928683115575, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8887810.0, "logits/rejected": -12300944.0, "logps/chosen": -297.6014099121094, "logps/rejected": -255.97310384114584, "loss": 0.2377, "rewards/chosen": -0.14056548476219177, "rewards/margins": 2.1294289330641427, "rewards/rejected": -2.2699944178263345, "step": 15710 }, { "epoch": 0.8327458723133596, "grad_norm": 54.5, "kl": 0.26869964599609375, "learning_rate": 5e-07, "logits/chosen": -59138901.333333336, "logits/rejected": -34265318.4, "logps/chosen": -383.6884765625, "logps/rejected": -354.9631103515625, "loss": 0.2234, "rewards/chosen": 0.2559626301129659, "rewards/margins": 2.7399668415387475, "rewards/rejected": -2.4840042114257814, "step": 15711 }, { "epoch": 0.8327988763151618, "grad_norm": 55.5, "kl": 1.0934677124023438, "learning_rate": 5e-07, "logits/chosen": -12165355.2, "logits/rejected": -137759392.0, "logps/chosen": -344.914599609375, "logps/rejected": -472.255615234375, "loss": 0.3863, "rewards/chosen": -0.15846084356307982, "rewards/margins": 3.4013485232988994, "rewards/rejected": -3.559809366861979, "step": 15712 }, { "epoch": 0.8328518803169639, "grad_norm": 55.75, "kl": 1.6770362854003906, "learning_rate": 5e-07, "logits/chosen": -41605052.8, "logits/rejected": -70618602.66666667, "logps/chosen": -328.474560546875, "logps/rejected": -397.4795735677083, "loss": 0.2333, "rewards/chosen": 1.2680093765258789, "rewards/margins": 3.6475046793619788, "rewards/rejected": -2.3794953028361, "step": 15713 }, { "epoch": 0.8329048843187661, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33623821.333333336, "logits/rejected": 376825.7, "logps/chosen": -110.46519978841145, "logps/rejected": -295.154248046875, "loss": 0.2873, "rewards/chosen": -0.066148375471433, "rewards/margins": 2.6884201059738793, "rewards/rejected": -2.7545684814453124, "step": 15714 }, { "epoch": 0.8329578883205682, "grad_norm": 44.0, "kl": 0.17017745971679688, "learning_rate": 5e-07, "logits/chosen": -20306043.2, "logits/rejected": -24373821.333333332, "logps/chosen": -545.32626953125, "logps/rejected": -178.9693806966146, "loss": 0.3182, "rewards/chosen": 0.9227212905883789, "rewards/margins": 2.1612494627634686, "rewards/rejected": -1.2385281721750896, "step": 15715 }, { "epoch": 0.8330108923223704, "grad_norm": 56.5, "kl": 0.9634742736816406, "learning_rate": 5e-07, "logits/chosen": -3125153.714285714, "logits/rejected": -20277614.0, "logps/chosen": -327.380615234375, "logps/rejected": -190.24462890625, "loss": 0.4395, "rewards/chosen": 0.14227781125477382, "rewards/margins": 3.906747673239027, "rewards/rejected": -3.764469861984253, "step": 15716 }, { "epoch": 0.8330638963241724, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 32153264.0, "logits/rejected": -10731203.2, "logps/chosen": -315.9398600260417, "logps/rejected": -143.3472412109375, "loss": 0.1876, "rewards/chosen": 0.9669616222381592, "rewards/margins": 3.3482469081878663, "rewards/rejected": -2.381285285949707, "step": 15717 }, { "epoch": 0.8331169003259746, "grad_norm": 44.75, "kl": 0.027067184448242188, "learning_rate": 5e-07, "logits/chosen": 4623242.0, "logits/rejected": 2910641.6, "logps/chosen": -153.2898966471354, "logps/rejected": -279.9540283203125, "loss": 0.2487, "rewards/chosen": 0.08890610933303833, "rewards/margins": 2.3906498074531557, "rewards/rejected": -2.3017436981201174, "step": 15718 }, { "epoch": 0.8331699043277767, "grad_norm": 166.0, "kl": 5.7731428146362305, "learning_rate": 5e-07, "logits/chosen": -15483412.8, "logits/rejected": -24728576.0, "logps/chosen": -220.304296875, "logps/rejected": -758.4429524739584, "loss": 0.2697, "rewards/chosen": 0.9811566352844239, "rewards/margins": 4.833647060394287, "rewards/rejected": -3.8524904251098633, "step": 15719 }, { "epoch": 0.8332229083295789, "grad_norm": 47.75, "kl": 2.251708984375, "learning_rate": 5e-07, "logits/chosen": -37199308.0, "logits/rejected": -39977960.0, "logps/chosen": -724.7625732421875, "logps/rejected": -554.1796264648438, "loss": 0.149, "rewards/chosen": 1.5482200384140015, "rewards/margins": 5.2791184186935425, "rewards/rejected": -3.730898380279541, "step": 15720 }, { "epoch": 0.833275912331381, "grad_norm": 58.0, "kl": 1.5044593811035156, "learning_rate": 5e-07, "logits/chosen": -25157601.6, "logits/rejected": -3418924.6666666665, "logps/chosen": -246.3332763671875, "logps/rejected": -207.70147705078125, "loss": 0.3522, "rewards/chosen": 0.18663361072540283, "rewards/margins": 2.9158870617548622, "rewards/rejected": -2.7292534510294595, "step": 15721 }, { "epoch": 0.8333289163331832, "grad_norm": 49.0, "kl": 0.730010986328125, "learning_rate": 5e-07, "logits/chosen": -80279664.0, "logits/rejected": -32932678.0, "logps/chosen": -346.0540771484375, "logps/rejected": -252.30291748046875, "loss": 0.2843, "rewards/chosen": 0.6905742883682251, "rewards/margins": 2.3966516256332397, "rewards/rejected": -1.7060773372650146, "step": 15722 }, { "epoch": 0.8333819203349853, "grad_norm": 53.25, "kl": 0.3209877014160156, "learning_rate": 5e-07, "logits/chosen": -30440708.57142857, "logits/rejected": -1920317.0, "logps/chosen": -276.71133858816967, "logps/rejected": -108.16438293457031, "loss": 0.4285, "rewards/chosen": 0.12474157980510167, "rewards/margins": 2.65390350988933, "rewards/rejected": -2.5291619300842285, "step": 15723 }, { "epoch": 0.8334349243367875, "grad_norm": 38.75, "kl": 3.7336082458496094, "learning_rate": 5e-07, "logits/chosen": -20505724.0, "logits/rejected": -71529096.0, "logps/chosen": -261.7659606933594, "logps/rejected": -444.65179443359375, "loss": 0.3277, "rewards/chosen": 0.05076754093170166, "rewards/margins": 3.1955257654190063, "rewards/rejected": -3.1447582244873047, "step": 15724 }, { "epoch": 0.8334879283385895, "grad_norm": 31.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6250445.0, "logits/rejected": -15562680.0, "logps/chosen": -224.56927490234375, "logps/rejected": -248.954833984375, "loss": 0.1165, "rewards/chosen": 1.0818313360214233, "rewards/margins": 3.9041529099146524, "rewards/rejected": -2.822321573893229, "step": 15725 }, { "epoch": 0.8335409323403917, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29440182.0, "logits/rejected": -26224360.0, "logps/chosen": -192.86575317382812, "logps/rejected": -397.503173828125, "loss": 0.2595, "rewards/chosen": 0.13543719053268433, "rewards/margins": 3.5370797514915466, "rewards/rejected": -3.4016425609588623, "step": 15726 }, { "epoch": 0.8335939363421938, "grad_norm": 45.25, "kl": 0.2895965576171875, "learning_rate": 5e-07, "logits/chosen": -2200617.6666666665, "logits/rejected": -12196720.0, "logps/chosen": -206.16292317708334, "logps/rejected": -457.4153137207031, "loss": 0.3165, "rewards/chosen": 0.4905889828999837, "rewards/margins": 3.378605683644613, "rewards/rejected": -2.888016700744629, "step": 15727 }, { "epoch": 0.833646940343996, "grad_norm": 44.25, "kl": 1.2951850891113281, "learning_rate": 5e-07, "logits/chosen": -33796848.0, "logits/rejected": 8835717.0, "logps/chosen": -427.8564453125, "logps/rejected": -338.0567626953125, "loss": 0.184, "rewards/chosen": 1.0401703119277954, "rewards/margins": 3.031143307685852, "rewards/rejected": -1.9909729957580566, "step": 15728 }, { "epoch": 0.8336999443457981, "grad_norm": 49.75, "kl": 1.21893310546875, "learning_rate": 5e-07, "logits/chosen": -25680350.0, "logits/rejected": -19023876.0, "logps/chosen": -263.1388854980469, "logps/rejected": -270.218994140625, "loss": 0.2711, "rewards/chosen": 0.7366416454315186, "rewards/margins": 2.3808802366256714, "rewards/rejected": -1.6442385911941528, "step": 15729 }, { "epoch": 0.8337529483476003, "grad_norm": 45.5, "kl": 0.2678041458129883, "learning_rate": 5e-07, "logits/chosen": 1769873.2, "logits/rejected": -22393240.0, "logps/chosen": -164.8376708984375, "logps/rejected": -326.7305501302083, "loss": 0.3112, "rewards/chosen": 0.015998983383178712, "rewards/margins": 3.432461881637573, "rewards/rejected": -3.4164628982543945, "step": 15730 }, { "epoch": 0.8338059523494024, "grad_norm": 110.5, "kl": 4.200137138366699, "learning_rate": 5e-07, "logits/chosen": -36420236.8, "logits/rejected": 843561.0, "logps/chosen": -449.87744140625, "logps/rejected": -419.8501383463542, "loss": 0.3694, "rewards/chosen": 0.40855841636657714, "rewards/margins": 1.5091022332509358, "rewards/rejected": -1.1005438168843586, "step": 15731 }, { "epoch": 0.8338589563512045, "grad_norm": 65.0, "kl": 0.3783226013183594, "learning_rate": 5e-07, "logits/chosen": -42565658.666666664, "logits/rejected": -30025612.0, "logps/chosen": -346.7823893229167, "logps/rejected": -170.4278564453125, "loss": 0.4003, "rewards/chosen": 0.21083438396453857, "rewards/margins": 1.5039103031158447, "rewards/rejected": -1.2930759191513062, "step": 15732 }, { "epoch": 0.8339119603530066, "grad_norm": 50.5, "kl": 1.9472408294677734, "learning_rate": 5e-07, "logits/chosen": -46071605.333333336, "logits/rejected": 2741947.75, "logps/chosen": -227.0312703450521, "logps/rejected": -299.6492614746094, "loss": 0.4161, "rewards/chosen": 0.07919476429621379, "rewards/margins": 3.3152663509051004, "rewards/rejected": -3.2360715866088867, "step": 15733 }, { "epoch": 0.8339649643548088, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 56875264.0, "logits/rejected": -7677028.8, "logps/chosen": -479.914306640625, "logps/rejected": -214.566650390625, "loss": 0.2364, "rewards/chosen": 0.5239573319753011, "rewards/margins": 2.9292677720387776, "rewards/rejected": -2.4053104400634764, "step": 15734 }, { "epoch": 0.8340179683566109, "grad_norm": 70.5, "kl": 1.6919174194335938, "learning_rate": 5e-07, "logits/chosen": -16669897.333333334, "logits/rejected": -56985964.0, "logps/chosen": -452.4864095052083, "logps/rejected": -409.19952392578125, "loss": 0.3204, "rewards/chosen": 0.7268694241841634, "rewards/margins": 3.5684622128804526, "rewards/rejected": -2.841592788696289, "step": 15735 }, { "epoch": 0.8340709723584131, "grad_norm": 50.25, "kl": 0.7989311218261719, "learning_rate": 5e-07, "logits/chosen": -14679746.0, "logits/rejected": -17652602.0, "logps/chosen": -494.9152526855469, "logps/rejected": -218.59010314941406, "loss": 0.2571, "rewards/chosen": 1.0231724977493286, "rewards/margins": 3.124623656272888, "rewards/rejected": -2.1014511585235596, "step": 15736 }, { "epoch": 0.8341239763602152, "grad_norm": 52.25, "kl": 0.473876953125, "learning_rate": 5e-07, "logits/chosen": -20541974.0, "logits/rejected": -8051279.5, "logps/chosen": -270.24462890625, "logps/rejected": -239.85513305664062, "loss": 0.2833, "rewards/chosen": 0.6770012378692627, "rewards/margins": 2.3848283290863037, "rewards/rejected": -1.707827091217041, "step": 15737 }, { "epoch": 0.8341769803620174, "grad_norm": 48.5, "kl": 2.8924636840820312, "learning_rate": 5e-07, "logits/chosen": -30035236.57142857, "logits/rejected": -283892.875, "logps/chosen": -306.56570870535717, "logps/rejected": -103.74420928955078, "loss": 0.4299, "rewards/chosen": 0.32853470529828754, "rewards/margins": 4.933309657233102, "rewards/rejected": -4.6047749519348145, "step": 15738 }, { "epoch": 0.8342299843638195, "grad_norm": 38.75, "kl": 1.980743408203125, "learning_rate": 5e-07, "logits/chosen": -38138885.333333336, "logits/rejected": -68636928.0, "logps/chosen": -288.78000895182294, "logps/rejected": -255.50380859375, "loss": 0.2077, "rewards/chosen": 1.3204350471496582, "rewards/margins": 3.3029362678527834, "rewards/rejected": -1.982501220703125, "step": 15739 }, { "epoch": 0.8342829883656216, "grad_norm": 38.0, "kl": 1.0719547271728516, "learning_rate": 5e-07, "logits/chosen": -17816589.333333332, "logits/rejected": -22047488.0, "logps/chosen": -288.02099609375, "logps/rejected": -275.37333984375, "loss": 0.1924, "rewards/chosen": 0.891771157582601, "rewards/margins": 3.4081518491109213, "rewards/rejected": -2.5163806915283202, "step": 15740 }, { "epoch": 0.8343359923674237, "grad_norm": 38.5, "kl": 0.19601058959960938, "learning_rate": 5e-07, "logits/chosen": -68155936.0, "logits/rejected": -27790861.333333332, "logps/chosen": -320.1666259765625, "logps/rejected": -260.493408203125, "loss": 0.2293, "rewards/chosen": -0.07028350979089737, "rewards/margins": 2.599214298029741, "rewards/rejected": -2.669497807820638, "step": 15741 }, { "epoch": 0.8343889963692259, "grad_norm": 37.75, "kl": 1.4682693481445312, "learning_rate": 5e-07, "logits/chosen": -17477448.0, "logits/rejected": 22239310.0, "logps/chosen": -167.60450744628906, "logps/rejected": -233.4652099609375, "loss": 0.3055, "rewards/chosen": 0.06009932979941368, "rewards/margins": 3.046850886195898, "rewards/rejected": -2.9867515563964844, "step": 15742 }, { "epoch": 0.834442000371028, "grad_norm": 49.0, "kl": 2.341714859008789, "learning_rate": 5e-07, "logits/chosen": -19813098.0, "logits/rejected": -19035082.0, "logps/chosen": -518.1907958984375, "logps/rejected": -206.94886779785156, "loss": 0.377, "rewards/chosen": 0.6533933877944946, "rewards/margins": 3.382989287376404, "rewards/rejected": -2.729595899581909, "step": 15743 }, { "epoch": 0.8344950043728302, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38068088.0, "logits/rejected": 54288026.666666664, "logps/chosen": -146.50018310546875, "logps/rejected": -365.2045491536458, "loss": 0.2342, "rewards/chosen": -0.19518548250198364, "rewards/margins": 2.079539159933726, "rewards/rejected": -2.2747246424357095, "step": 15744 }, { "epoch": 0.8345480083746323, "grad_norm": 28.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11732968.0, "logits/rejected": -11220330.285714285, "logps/chosen": -201.12918090820312, "logps/rejected": -194.89222935267858, "loss": 0.1115, "rewards/chosen": 1.312048316001892, "rewards/margins": 4.189954025404794, "rewards/rejected": -2.877905709402902, "step": 15745 }, { "epoch": 0.8346010123764345, "grad_norm": 77.5, "kl": 1.7104454040527344, "learning_rate": 5e-07, "logits/chosen": -31751888.0, "logits/rejected": -36642909.333333336, "logps/chosen": -243.191650390625, "logps/rejected": -611.6046956380209, "loss": 0.3123, "rewards/chosen": 0.45754289627075195, "rewards/margins": 3.1102099418640137, "rewards/rejected": -2.6526670455932617, "step": 15746 }, { "epoch": 0.8346540163782366, "grad_norm": 74.0, "kl": 0.854914665222168, "learning_rate": 5e-07, "logits/chosen": -6365284.0, "logits/rejected": -44883264.0, "logps/chosen": -118.70567830403645, "logps/rejected": -526.509765625, "loss": 0.3983, "rewards/chosen": 0.04032710442940394, "rewards/margins": 3.3084423864881196, "rewards/rejected": -3.268115282058716, "step": 15747 }, { "epoch": 0.8347070203800387, "grad_norm": 109.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7917770.666666667, "logits/rejected": -30949331.2, "logps/chosen": -271.5402425130208, "logps/rejected": -375.2512939453125, "loss": 0.2136, "rewards/chosen": 0.6646026770273844, "rewards/margins": 3.243778053919474, "rewards/rejected": -2.5791753768920898, "step": 15748 }, { "epoch": 0.8347600243818408, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15976444.0, "logits/rejected": -19931341.333333332, "logps/chosen": -412.49188232421875, "logps/rejected": -269.80617268880206, "loss": 0.2777, "rewards/chosen": 0.44827425479888916, "rewards/margins": 1.8676363229751587, "rewards/rejected": -1.4193620681762695, "step": 15749 }, { "epoch": 0.834813028383643, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10473554.0, "logits/rejected": -36675056.0, "logps/chosen": -847.4361979166666, "logps/rejected": -273.887255859375, "loss": 0.1251, "rewards/chosen": 2.106240749359131, "rewards/margins": 5.230317783355713, "rewards/rejected": -3.124077033996582, "step": 15750 }, { "epoch": 0.8348660323854451, "grad_norm": 47.0, "kl": 2.655231475830078, "learning_rate": 5e-07, "logits/chosen": -31880172.8, "logits/rejected": -14769592.0, "logps/chosen": -705.782275390625, "logps/rejected": -260.9801025390625, "loss": 0.2387, "rewards/chosen": 1.1681068420410157, "rewards/margins": 5.247369575500488, "rewards/rejected": -4.079262733459473, "step": 15751 }, { "epoch": 0.8349190363872472, "grad_norm": 58.25, "kl": 0.5380268096923828, "learning_rate": 5e-07, "logits/chosen": -20792353.333333332, "logits/rejected": 1484093.25, "logps/chosen": -347.0760091145833, "logps/rejected": -109.1563720703125, "loss": 0.3745, "rewards/chosen": 0.7510824998219808, "rewards/margins": 1.0301810105641684, "rewards/rejected": -0.2790985107421875, "step": 15752 }, { "epoch": 0.8349720403890494, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12052276.0, "logits/rejected": -9931535.0, "logps/chosen": -221.04046630859375, "logps/rejected": -228.52395629882812, "loss": 0.2108, "rewards/chosen": 0.9922350645065308, "rewards/margins": 4.825297236442566, "rewards/rejected": -3.833062171936035, "step": 15753 }, { "epoch": 0.8350250443908515, "grad_norm": 46.0, "kl": 2.2970008850097656, "learning_rate": 5e-07, "logits/chosen": -52651956.0, "logits/rejected": -7909307.0, "logps/chosen": -263.6687316894531, "logps/rejected": -272.54248046875, "loss": 0.3128, "rewards/chosen": 0.7085907459259033, "rewards/margins": 2.455244779586792, "rewards/rejected": -1.7466540336608887, "step": 15754 }, { "epoch": 0.8350780483926536, "grad_norm": 57.25, "kl": 0.0302734375, "learning_rate": 5e-07, "logits/chosen": -29189008.0, "logits/rejected": 694984.25, "logps/chosen": -426.5188802083333, "logps/rejected": -102.36320495605469, "loss": 0.3461, "rewards/chosen": 0.4677962859471639, "rewards/margins": 1.7780444224675496, "rewards/rejected": -1.3102481365203857, "step": 15755 }, { "epoch": 0.8351310523944557, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30764565.333333332, "logits/rejected": -24264707.2, "logps/chosen": -257.7738444010417, "logps/rejected": -431.16806640625, "loss": 0.241, "rewards/chosen": -0.20834298928578696, "rewards/margins": 2.695742551485697, "rewards/rejected": -2.9040855407714843, "step": 15756 }, { "epoch": 0.8351840563962579, "grad_norm": 36.75, "kl": 0.5450963973999023, "learning_rate": 5e-07, "logits/chosen": -11828792.0, "logits/rejected": -32018937.6, "logps/chosen": -143.03511555989584, "logps/rejected": -408.4626708984375, "loss": 0.2382, "rewards/chosen": 0.6332202752431234, "rewards/margins": 3.0374667962392174, "rewards/rejected": -2.404246520996094, "step": 15757 }, { "epoch": 0.83523706039806, "grad_norm": 41.5, "kl": 4.433195114135742, "learning_rate": 5e-07, "logits/chosen": -24533502.0, "logits/rejected": -27236872.0, "logps/chosen": -148.15048217773438, "logps/rejected": -236.06353759765625, "loss": 0.4227, "rewards/chosen": 0.22641515731811523, "rewards/margins": 2.0259947776794434, "rewards/rejected": -1.7995796203613281, "step": 15758 }, { "epoch": 0.8352900643998622, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54515302.4, "logits/rejected": -4306681.333333333, "logps/chosen": -394.5005126953125, "logps/rejected": -116.4609375, "loss": 0.2463, "rewards/chosen": 0.7007336616516113, "rewards/margins": 3.9419636726379395, "rewards/rejected": -3.241230010986328, "step": 15759 }, { "epoch": 0.8353430684016643, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36730707.2, "logits/rejected": -12904.0, "logps/chosen": -291.662744140625, "logps/rejected": -258.53688557942706, "loss": 0.4022, "rewards/chosen": 0.22450838088989258, "rewards/margins": 1.2583171685536703, "rewards/rejected": -1.0338087876637776, "step": 15760 }, { "epoch": 0.8353960724034665, "grad_norm": 45.0, "kl": 1.673797607421875, "learning_rate": 5e-07, "logits/chosen": -35424320.0, "logits/rejected": -35485288.0, "logps/chosen": -159.249755859375, "logps/rejected": -134.99530029296875, "loss": 0.3085, "rewards/chosen": 0.6460222601890564, "rewards/margins": 1.8616501688957214, "rewards/rejected": -1.215627908706665, "step": 15761 }, { "epoch": 0.8354490764052686, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80944741.33333333, "logits/rejected": -25206988.8, "logps/chosen": -814.4070638020834, "logps/rejected": -327.122998046875, "loss": 0.1475, "rewards/chosen": 1.4702407519022624, "rewards/margins": 4.524522558848063, "rewards/rejected": -3.054281806945801, "step": 15762 }, { "epoch": 0.8355020804070707, "grad_norm": 41.5, "kl": 2.881014823913574, "learning_rate": 5e-07, "logits/chosen": -10988640.666666666, "logits/rejected": -37213996.0, "logps/chosen": -386.3398030598958, "logps/rejected": -136.1326904296875, "loss": 0.383, "rewards/chosen": 0.8797070185343424, "rewards/margins": 1.9202740589777627, "rewards/rejected": -1.0405670404434204, "step": 15763 }, { "epoch": 0.8355550844088728, "grad_norm": 38.5, "kl": 2.1146745681762695, "learning_rate": 5e-07, "logits/chosen": -26552414.0, "logits/rejected": 1399546.75, "logps/chosen": -404.33270263671875, "logps/rejected": -298.6131591796875, "loss": 0.206, "rewards/chosen": 1.394820213317871, "rewards/margins": 5.1155030727386475, "rewards/rejected": -3.7206828594207764, "step": 15764 }, { "epoch": 0.835608088410675, "grad_norm": 68.5, "kl": 1.1273117065429688, "learning_rate": 5e-07, "logits/chosen": -4529454.0, "logits/rejected": -48227565.71428572, "logps/chosen": -356.1661071777344, "logps/rejected": -272.59713309151783, "loss": 0.199, "rewards/chosen": 0.7667083740234375, "rewards/margins": 2.45138304574149, "rewards/rejected": -1.6846746717180525, "step": 15765 }, { "epoch": 0.8356610924124771, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4150986.5, "logits/rejected": -2474298.75, "logps/chosen": -184.23410034179688, "logps/rejected": -391.4823303222656, "loss": 0.2413, "rewards/chosen": 0.39007148146629333, "rewards/margins": 4.424178212881088, "rewards/rejected": -4.034106731414795, "step": 15766 }, { "epoch": 0.8357140964142793, "grad_norm": 53.5, "kl": 1.6383171081542969, "learning_rate": 5e-07, "logits/chosen": -61816656.0, "logits/rejected": -4978261.0, "logps/chosen": -326.07582600911456, "logps/rejected": -267.5732727050781, "loss": 0.352, "rewards/chosen": 0.5235358874003092, "rewards/margins": 2.521077791849772, "rewards/rejected": -1.997541904449463, "step": 15767 }, { "epoch": 0.8357671004160814, "grad_norm": 50.0, "kl": 0.6147537231445312, "learning_rate": 5e-07, "logits/chosen": -37259508.0, "logits/rejected": -23933724.0, "logps/chosen": -339.2220764160156, "logps/rejected": -205.7466583251953, "loss": 0.285, "rewards/chosen": 0.24681313335895538, "rewards/margins": 2.6453889161348343, "rewards/rejected": -2.398575782775879, "step": 15768 }, { "epoch": 0.8358201044178836, "grad_norm": 41.0, "kl": 1.1390113830566406, "learning_rate": 5e-07, "logits/chosen": -7799568.0, "logits/rejected": -28572464.0, "logps/chosen": -142.2234375, "logps/rejected": -339.91815185546875, "loss": 0.2468, "rewards/chosen": 0.8721293449401856, "rewards/margins": 3.465230846405029, "rewards/rejected": -2.5931015014648438, "step": 15769 }, { "epoch": 0.8358731084196857, "grad_norm": 38.5, "kl": 0.04057121276855469, "learning_rate": 5e-07, "logits/chosen": -18860165.333333332, "logits/rejected": -19255971.2, "logps/chosen": -231.51407877604166, "logps/rejected": -293.704541015625, "loss": 0.1966, "rewards/chosen": 0.5219411055246989, "rewards/margins": 3.36493026415507, "rewards/rejected": -2.8429891586303713, "step": 15770 }, { "epoch": 0.8359261124214878, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7751763.0, "logits/rejected": -15870608.0, "logps/chosen": -227.44195556640625, "logps/rejected": -195.28659057617188, "loss": 0.2921, "rewards/chosen": 0.6001960039138794, "rewards/margins": 1.9655085802078247, "rewards/rejected": -1.3653125762939453, "step": 15771 }, { "epoch": 0.8359791164232899, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72661264.0, "logits/rejected": 7779142.666666667, "logps/chosen": -716.7205200195312, "logps/rejected": -206.3088582356771, "loss": 0.1306, "rewards/chosen": 1.6068848371505737, "rewards/margins": 4.280291438102722, "rewards/rejected": -2.6734066009521484, "step": 15772 }, { "epoch": 0.8360321204250921, "grad_norm": 74.0, "kl": 5.280338287353516, "learning_rate": 5e-07, "logits/chosen": -15339185.333333334, "logits/rejected": -71178800.0, "logps/chosen": -741.83251953125, "logps/rejected": -568.6004638671875, "loss": 0.2275, "rewards/chosen": 1.5645208358764648, "rewards/margins": 3.7159674167633057, "rewards/rejected": -2.151446580886841, "step": 15773 }, { "epoch": 0.8360851244268942, "grad_norm": 48.25, "kl": 3.2848854064941406, "learning_rate": 5e-07, "logits/chosen": -31686709.333333332, "logits/rejected": -9902480.0, "logps/chosen": -201.34488932291666, "logps/rejected": -1096.5172119140625, "loss": 0.41, "rewards/chosen": 0.12459208567937215, "rewards/margins": 3.270484467347463, "rewards/rejected": -3.145892381668091, "step": 15774 }, { "epoch": 0.8361381284286964, "grad_norm": 40.75, "kl": 1.9486160278320312, "learning_rate": 5e-07, "logits/chosen": -26526291.2, "logits/rejected": -24505216.0, "logps/chosen": -206.9791015625, "logps/rejected": -310.5430501302083, "loss": 0.3813, "rewards/chosen": 0.16614784002304078, "rewards/margins": 2.2431889891624452, "rewards/rejected": -2.0770411491394043, "step": 15775 }, { "epoch": 0.8361911324304985, "grad_norm": 58.75, "kl": 4.625960350036621, "learning_rate": 5e-07, "logits/chosen": -37062272.0, "logits/rejected": -42079152.0, "logps/chosen": -336.81912667410717, "logps/rejected": -166.67001342773438, "loss": 0.3561, "rewards/chosen": 1.1180787086486816, "rewards/margins": 4.3460633754730225, "rewards/rejected": -3.227984666824341, "step": 15776 }, { "epoch": 0.8362441364323007, "grad_norm": 49.25, "kl": 0.4560432434082031, "learning_rate": 5e-07, "logits/chosen": -19483016.0, "logits/rejected": -26719794.666666668, "logps/chosen": -293.5291748046875, "logps/rejected": -382.7459716796875, "loss": 0.188, "rewards/chosen": 0.5634124875068665, "rewards/margins": 2.495188693205516, "rewards/rejected": -1.9317762056986492, "step": 15777 }, { "epoch": 0.8362971404341027, "grad_norm": 35.5, "kl": 0.909454345703125, "learning_rate": 5e-07, "logits/chosen": -19330278.666666668, "logits/rejected": -7643029.6, "logps/chosen": -127.70578002929688, "logps/rejected": -277.5874267578125, "loss": 0.2878, "rewards/chosen": -0.25258394082387287, "rewards/margins": 2.0823995033899942, "rewards/rejected": -2.3349834442138673, "step": 15778 }, { "epoch": 0.8363501444359049, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53406256.0, "logits/rejected": -14608991.0, "logps/chosen": -472.0816650390625, "logps/rejected": -235.1511688232422, "loss": 0.1647, "rewards/chosen": 0.9193475842475891, "rewards/margins": 4.940437138080597, "rewards/rejected": -4.021089553833008, "step": 15779 }, { "epoch": 0.836403148437707, "grad_norm": 58.25, "kl": 1.5893278121948242, "learning_rate": 5e-07, "logits/chosen": 4716441.0, "logits/rejected": -23183378.666666668, "logps/chosen": -31.754777908325195, "logps/rejected": -315.00209554036456, "loss": 0.2622, "rewards/chosen": 0.26381003856658936, "rewards/margins": 1.7992021640141804, "rewards/rejected": -1.535392125447591, "step": 15780 }, { "epoch": 0.8364561524395092, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12952706.0, "logits/rejected": -9291958.0, "logps/chosen": -203.76129150390625, "logps/rejected": -172.82173665364584, "loss": 0.1512, "rewards/chosen": 0.2689899504184723, "rewards/margins": 3.5251668989658356, "rewards/rejected": -3.2561769485473633, "step": 15781 }, { "epoch": 0.8365091564413113, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22885554.666666668, "logits/rejected": -12342868.8, "logps/chosen": -506.7575276692708, "logps/rejected": -330.4579833984375, "loss": 0.2755, "rewards/chosen": -0.24810282389322916, "rewards/margins": 2.526566823323568, "rewards/rejected": -2.774669647216797, "step": 15782 }, { "epoch": 0.8365621604431135, "grad_norm": 25.625, "kl": 0.0439448356628418, "learning_rate": 5e-07, "logits/chosen": -34550733.333333336, "logits/rejected": -18209139.2, "logps/chosen": -513.11669921875, "logps/rejected": -353.913330078125, "loss": 0.1846, "rewards/chosen": 1.6007394790649414, "rewards/margins": 4.3817853927612305, "rewards/rejected": -2.781045913696289, "step": 15783 }, { "epoch": 0.8366151644449156, "grad_norm": 33.75, "kl": 0.3618755340576172, "learning_rate": 5e-07, "logits/chosen": -9536579.0, "logits/rejected": -1604246.5, "logps/chosen": -288.7747497558594, "logps/rejected": -95.2023417154948, "loss": 0.3083, "rewards/chosen": 0.8140010833740234, "rewards/margins": 2.3255988756815595, "rewards/rejected": -1.5115977923075359, "step": 15784 }, { "epoch": 0.8366681684467178, "grad_norm": 68.0, "kl": 0.0055866241455078125, "learning_rate": 5e-07, "logits/chosen": -14394260.0, "logits/rejected": -3353711.5, "logps/chosen": -718.289794921875, "logps/rejected": -415.4919738769531, "loss": 0.4207, "rewards/chosen": -0.024456024169921875, "rewards/margins": 2.3287243843078613, "rewards/rejected": -2.353180408477783, "step": 15785 }, { "epoch": 0.8367211724485198, "grad_norm": 57.25, "kl": 3.6945533752441406, "learning_rate": 5e-07, "logits/chosen": -48325273.6, "logits/rejected": -22471906.666666668, "logps/chosen": -309.4440185546875, "logps/rejected": -379.4077555338542, "loss": 0.4222, "rewards/chosen": 0.0889462947845459, "rewards/margins": 2.8655137538909914, "rewards/rejected": -2.7765674591064453, "step": 15786 }, { "epoch": 0.836774176450322, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60028176.0, "logits/rejected": -36872309.333333336, "logps/chosen": -663.9699096679688, "logps/rejected": -384.0071614583333, "loss": 0.1987, "rewards/chosen": 0.11623230576515198, "rewards/margins": 3.0133327543735504, "rewards/rejected": -2.8971004486083984, "step": 15787 }, { "epoch": 0.8368271804521241, "grad_norm": 77.5, "kl": 5.767581939697266, "learning_rate": 5e-07, "logits/chosen": -11606404.0, "logits/rejected": -37784210.666666664, "logps/chosen": -320.3975830078125, "logps/rejected": -518.3013509114584, "loss": 0.226, "rewards/chosen": 1.360441017150879, "rewards/margins": 5.406307665506999, "rewards/rejected": -4.04586664835612, "step": 15788 }, { "epoch": 0.8368801844539263, "grad_norm": 52.75, "kl": 1.057546615600586, "learning_rate": 5e-07, "logits/chosen": -16090147.2, "logits/rejected": -20441177.333333332, "logps/chosen": -202.663720703125, "logps/rejected": -492.3314208984375, "loss": 0.3504, "rewards/chosen": 0.24995787143707277, "rewards/margins": 4.017862470944722, "rewards/rejected": -3.76790459950765, "step": 15789 }, { "epoch": 0.8369331884557284, "grad_norm": 44.5, "kl": 5.2523298263549805, "learning_rate": 5e-07, "logits/chosen": -11117976.0, "logits/rejected": -11155565.333333334, "logps/chosen": -195.41011962890624, "logps/rejected": -316.2977294921875, "loss": 0.2961, "rewards/chosen": 1.0923954010009767, "rewards/margins": 2.9043290774027506, "rewards/rejected": -1.8119336764017742, "step": 15790 }, { "epoch": 0.8369861924575306, "grad_norm": 44.75, "kl": 5.411251068115234, "learning_rate": 5e-07, "logits/chosen": -14858462.4, "logits/rejected": -15579462.666666666, "logps/chosen": -354.3888671875, "logps/rejected": -347.8931477864583, "loss": 0.3412, "rewards/chosen": 0.7333593368530273, "rewards/margins": 3.2147062619527182, "rewards/rejected": -2.481346925099691, "step": 15791 }, { "epoch": 0.8370391964593327, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29080646.4, "logits/rejected": -19153029.333333332, "logps/chosen": -460.81240234375, "logps/rejected": -334.22145589192706, "loss": 0.3303, "rewards/chosen": 0.09665390253067016, "rewards/margins": 2.9462871193885802, "rewards/rejected": -2.84963321685791, "step": 15792 }, { "epoch": 0.8370922004611349, "grad_norm": 35.5, "kl": 0.9000473022460938, "learning_rate": 5e-07, "logits/chosen": -29958624.0, "logits/rejected": -14833175.0, "logps/chosen": -167.90782165527344, "logps/rejected": -211.9222412109375, "loss": 0.2776, "rewards/chosen": 0.6036729216575623, "rewards/margins": 2.7877292037010193, "rewards/rejected": -2.184056282043457, "step": 15793 }, { "epoch": 0.8371452044629369, "grad_norm": 59.5, "kl": 2.2982025146484375, "learning_rate": 5e-07, "logits/chosen": -68899848.0, "logits/rejected": -16247784.0, "logps/chosen": -304.2810974121094, "logps/rejected": -175.28599548339844, "loss": 0.3458, "rewards/chosen": 1.161366581916809, "rewards/margins": 1.9803967475891113, "rewards/rejected": -0.8190301656723022, "step": 15794 }, { "epoch": 0.8371982084647391, "grad_norm": 26.0, "kl": 2.9556708335876465, "learning_rate": 5e-07, "logits/chosen": -12800433.0, "logits/rejected": -93491776.0, "logps/chosen": -154.35140991210938, "logps/rejected": -543.7216186523438, "loss": 0.1673, "rewards/chosen": 1.7847861051559448, "rewards/margins": 4.948291182518005, "rewards/rejected": -3.1635050773620605, "step": 15795 }, { "epoch": 0.8372512124665412, "grad_norm": 39.75, "kl": 4.39460563659668, "learning_rate": 5e-07, "logits/chosen": -4788650.0, "logits/rejected": -26228888.0, "logps/chosen": -196.11685180664062, "logps/rejected": -441.598876953125, "loss": 0.2911, "rewards/chosen": 0.7410439252853394, "rewards/margins": 3.7805429697036743, "rewards/rejected": -3.039499044418335, "step": 15796 }, { "epoch": 0.8373042164683434, "grad_norm": 86.0, "kl": 0.19820785522460938, "learning_rate": 5e-07, "logits/chosen": -38354596.571428575, "logits/rejected": -12435001.0, "logps/chosen": -345.9713657924107, "logps/rejected": -139.83206176757812, "loss": 0.3381, "rewards/chosen": 0.5340876919882638, "rewards/margins": 4.635705028261457, "rewards/rejected": -4.101617336273193, "step": 15797 }, { "epoch": 0.8373572204701455, "grad_norm": 57.5, "kl": 4.484999656677246, "learning_rate": 5e-07, "logits/chosen": -21471284.0, "logits/rejected": -36525448.0, "logps/chosen": -256.7289123535156, "logps/rejected": -322.58941650390625, "loss": 0.2135, "rewards/chosen": 1.114222764968872, "rewards/margins": 3.9342784881591797, "rewards/rejected": -2.8200557231903076, "step": 15798 }, { "epoch": 0.8374102244719477, "grad_norm": 39.25, "kl": 0.116546630859375, "learning_rate": 5e-07, "logits/chosen": -18007190.0, "logits/rejected": -35172128.0, "logps/chosen": -678.4959716796875, "logps/rejected": -277.2573547363281, "loss": 0.1592, "rewards/chosen": 1.584144115447998, "rewards/margins": 4.609554290771484, "rewards/rejected": -3.0254101753234863, "step": 15799 }, { "epoch": 0.8374632284737498, "grad_norm": 53.0, "kl": 2.018096923828125, "learning_rate": 5e-07, "logits/chosen": -74317904.0, "logits/rejected": -94537578.66666667, "logps/chosen": -907.0555419921875, "logps/rejected": -394.9901123046875, "loss": 0.1694, "rewards/chosen": 1.792598009109497, "rewards/margins": 4.0145011742909755, "rewards/rejected": -2.221903165181478, "step": 15800 }, { "epoch": 0.837516232475552, "grad_norm": 74.5, "kl": 2.975675582885742, "learning_rate": 5e-07, "logits/chosen": -647576.5, "logits/rejected": 14586668.0, "logps/chosen": -147.19215393066406, "logps/rejected": -440.8580322265625, "loss": 0.4122, "rewards/chosen": 0.4419407248497009, "rewards/margins": 1.7967450022697449, "rewards/rejected": -1.354804277420044, "step": 15801 }, { "epoch": 0.837569236477354, "grad_norm": 60.25, "kl": 1.0859193801879883, "learning_rate": 5e-07, "logits/chosen": -29673344.0, "logits/rejected": -18279810.0, "logps/chosen": -192.80007934570312, "logps/rejected": -234.90887451171875, "loss": 0.3357, "rewards/chosen": -0.03422146290540695, "rewards/margins": 2.166020341217518, "rewards/rejected": -2.200241804122925, "step": 15802 }, { "epoch": 0.8376222404791561, "grad_norm": 45.75, "kl": 3.883915901184082, "learning_rate": 5e-07, "logits/chosen": 4578964.666666667, "logits/rejected": -31036729.6, "logps/chosen": -180.88250732421875, "logps/rejected": -222.872265625, "loss": 0.26, "rewards/chosen": 1.160299301147461, "rewards/margins": 2.8180891036987306, "rewards/rejected": -1.6577898025512696, "step": 15803 }, { "epoch": 0.8376752444809583, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36977717.333333336, "logits/rejected": -11599709.6, "logps/chosen": -290.84063720703125, "logps/rejected": -117.1912841796875, "loss": 0.2508, "rewards/chosen": 0.6442520221074423, "rewards/margins": 2.312129600842794, "rewards/rejected": -1.6678775787353515, "step": 15804 }, { "epoch": 0.8377282484827604, "grad_norm": 31.875, "kl": 2.158599853515625, "learning_rate": 5e-07, "logits/chosen": -8902012.0, "logits/rejected": -38641292.8, "logps/chosen": -157.3996378580729, "logps/rejected": -197.4468994140625, "loss": 0.2317, "rewards/chosen": 0.49967479705810547, "rewards/margins": 2.7689468383789064, "rewards/rejected": -2.269272041320801, "step": 15805 }, { "epoch": 0.8377812524845626, "grad_norm": 58.0, "kl": 0.042633056640625, "learning_rate": 5e-07, "logits/chosen": -20973106.666666668, "logits/rejected": -26764144.0, "logps/chosen": -593.8116861979166, "logps/rejected": -334.4264892578125, "loss": 0.1482, "rewards/chosen": 1.562302589416504, "rewards/margins": 3.930308151245117, "rewards/rejected": -2.368005561828613, "step": 15806 }, { "epoch": 0.8378342564863647, "grad_norm": 41.75, "kl": 1.9278678894042969, "learning_rate": 5e-07, "logits/chosen": -9974809.333333334, "logits/rejected": -91694968.0, "logps/chosen": -149.69420369466147, "logps/rejected": -545.3363647460938, "loss": 0.4106, "rewards/chosen": 0.09455155332883199, "rewards/margins": 3.0702199836572013, "rewards/rejected": -2.975668430328369, "step": 15807 }, { "epoch": 0.8378872604881669, "grad_norm": 51.0, "kl": 0.5311555862426758, "learning_rate": 5e-07, "logits/chosen": -39481445.333333336, "logits/rejected": -48528.1875, "logps/chosen": -335.69873046875, "logps/rejected": -263.89959716796875, "loss": 0.3166, "rewards/chosen": 0.7211062908172607, "rewards/margins": 2.933107376098633, "rewards/rejected": -2.212001085281372, "step": 15808 }, { "epoch": 0.837940264489969, "grad_norm": 48.5, "kl": 2.725484848022461, "learning_rate": 5e-07, "logits/chosen": 2423659.5, "logits/rejected": -10624980.0, "logps/chosen": -30.57305908203125, "logps/rejected": -244.07559204101562, "loss": 0.2492, "rewards/chosen": 0.4079076051712036, "rewards/margins": 4.013776659965515, "rewards/rejected": -3.6058690547943115, "step": 15809 }, { "epoch": 0.8379932684917711, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11708010.4, "logits/rejected": -5077865.333333333, "logps/chosen": -141.7701904296875, "logps/rejected": -133.566162109375, "loss": 0.2905, "rewards/chosen": 0.7724565505981446, "rewards/margins": 2.3038171768188476, "rewards/rejected": -1.5313606262207031, "step": 15810 }, { "epoch": 0.8380462724935732, "grad_norm": 56.0, "kl": 3.273649215698242, "learning_rate": 5e-07, "logits/chosen": -45878246.4, "logits/rejected": -32163693.333333332, "logps/chosen": -471.67255859375, "logps/rejected": -330.94374593098956, "loss": 0.3368, "rewards/chosen": 0.9856062889099121, "rewards/margins": 2.156199550628662, "rewards/rejected": -1.17059326171875, "step": 15811 }, { "epoch": 0.8380992764953754, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14295578.0, "logits/rejected": -14578221.333333334, "logps/chosen": -511.4471130371094, "logps/rejected": -220.5229695638021, "loss": 0.1606, "rewards/chosen": 0.5766311883926392, "rewards/margins": 3.5854965448379517, "rewards/rejected": -3.0088653564453125, "step": 15812 }, { "epoch": 0.8381522804971775, "grad_norm": 30.0, "kl": 1.8236732482910156, "learning_rate": 5e-07, "logits/chosen": -12352649.333333334, "logits/rejected": -56041254.4, "logps/chosen": -845.2943522135416, "logps/rejected": -555.409130859375, "loss": 0.2527, "rewards/chosen": 1.33441162109375, "rewards/margins": 3.8758434295654296, "rewards/rejected": -2.5414318084716796, "step": 15813 }, { "epoch": 0.8382052844989797, "grad_norm": 58.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18801452.0, "logits/rejected": -27423619.2, "logps/chosen": -195.9269002278646, "logps/rejected": -562.58857421875, "loss": 0.2992, "rewards/chosen": -0.17026845614115396, "rewards/margins": 2.676815207799276, "rewards/rejected": -2.8470836639404298, "step": 15814 }, { "epoch": 0.8382582885007818, "grad_norm": 47.0, "kl": 1.0915374755859375, "learning_rate": 5e-07, "logits/chosen": -25347481.6, "logits/rejected": -881534.6666666666, "logps/chosen": -390.88037109375, "logps/rejected": -86.09806315104167, "loss": 0.2777, "rewards/chosen": 0.6164525032043457, "rewards/margins": 3.460960992177328, "rewards/rejected": -2.844508488972982, "step": 15815 }, { "epoch": 0.838311292502584, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 40948140.0, "logits/rejected": -9926946.285714285, "logps/chosen": -243.26210021972656, "logps/rejected": -252.56480189732142, "loss": 0.2071, "rewards/chosen": -0.6652283072471619, "rewards/margins": 1.5644099286624362, "rewards/rejected": -2.229638235909598, "step": 15816 }, { "epoch": 0.838364296504386, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27050425.6, "logits/rejected": -29902410.666666668, "logps/chosen": -254.97646484375, "logps/rejected": -328.68943277994794, "loss": 0.292, "rewards/chosen": 0.373430871963501, "rewards/margins": 3.1017064889272055, "rewards/rejected": -2.7282756169637046, "step": 15817 }, { "epoch": 0.8384173005061882, "grad_norm": 50.5, "kl": 0.7655220031738281, "learning_rate": 5e-07, "logits/chosen": -23310844.8, "logits/rejected": -57507712.0, "logps/chosen": -293.607666015625, "logps/rejected": -367.4261067708333, "loss": 0.3689, "rewards/chosen": -0.15302231311798095, "rewards/margins": 2.570138748486837, "rewards/rejected": -2.723161061604818, "step": 15818 }, { "epoch": 0.8384703045079903, "grad_norm": 74.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65655392.0, "logits/rejected": 3022354.5, "logps/chosen": -408.97314453125, "logps/rejected": -149.75491333007812, "loss": 0.346, "rewards/chosen": 0.5774987936019897, "rewards/margins": 2.9866949319839478, "rewards/rejected": -2.409196138381958, "step": 15819 }, { "epoch": 0.8385233085097925, "grad_norm": 40.75, "kl": 1.1051654815673828, "learning_rate": 5e-07, "logits/chosen": -71825461.33333333, "logits/rejected": -46393814.4, "logps/chosen": -205.53694661458334, "logps/rejected": -162.0826416015625, "loss": 0.224, "rewards/chosen": 0.8697144190470377, "rewards/margins": 3.175839106241862, "rewards/rejected": -2.306124687194824, "step": 15820 }, { "epoch": 0.8385763125115946, "grad_norm": 43.5, "kl": 0.5533046722412109, "learning_rate": 5e-07, "logits/chosen": -7987827.0, "logits/rejected": -15125086.0, "logps/chosen": -132.06109619140625, "logps/rejected": -258.29351806640625, "loss": 0.3507, "rewards/chosen": -0.12119180709123611, "rewards/margins": 1.8621663376688957, "rewards/rejected": -1.9833581447601318, "step": 15821 }, { "epoch": 0.8386293165133968, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71059338.66666667, "logits/rejected": -22179371.2, "logps/chosen": -497.3113606770833, "logps/rejected": -323.9167724609375, "loss": 0.2091, "rewards/chosen": 0.3644063472747803, "rewards/margins": 3.554284429550171, "rewards/rejected": -3.1898780822753907, "step": 15822 }, { "epoch": 0.8386823205151989, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59864128.0, "logits/rejected": -69528723.2, "logps/chosen": -570.97265625, "logps/rejected": -305.717578125, "loss": 0.2388, "rewards/chosen": 0.6521749099095663, "rewards/margins": 3.4808850844701134, "rewards/rejected": -2.828710174560547, "step": 15823 }, { "epoch": 0.8387353245170011, "grad_norm": 55.25, "kl": 0.09253311157226562, "learning_rate": 5e-07, "logits/chosen": 3554376.25, "logits/rejected": -65841776.0, "logps/chosen": -200.15077209472656, "logps/rejected": -292.8219299316406, "loss": 0.2585, "rewards/chosen": 0.4142981767654419, "rewards/margins": 2.9080108404159546, "rewards/rejected": -2.4937126636505127, "step": 15824 }, { "epoch": 0.8387883285188031, "grad_norm": 43.0, "kl": 0.8590526580810547, "learning_rate": 5e-07, "logits/chosen": -37585356.8, "logits/rejected": -13329594.666666666, "logps/chosen": -386.782763671875, "logps/rejected": -217.18524169921875, "loss": 0.3118, "rewards/chosen": 0.7676497459411621, "rewards/margins": 3.3466179529825846, "rewards/rejected": -2.5789682070414224, "step": 15825 }, { "epoch": 0.8388413325206053, "grad_norm": 37.25, "kl": 0.7027778625488281, "learning_rate": 5e-07, "logits/chosen": -16127182.666666666, "logits/rejected": -21662720.0, "logps/chosen": -190.0079549153646, "logps/rejected": -176.6713623046875, "loss": 0.2787, "rewards/chosen": 0.2238017717997233, "rewards/margins": 2.405675665537516, "rewards/rejected": -2.181873893737793, "step": 15826 }, { "epoch": 0.8388943365224074, "grad_norm": 36.75, "kl": 1.632791519165039, "learning_rate": 5e-07, "logits/chosen": -10156470.0, "logits/rejected": -29574934.0, "logps/chosen": -378.1385498046875, "logps/rejected": -439.40350341796875, "loss": 0.1725, "rewards/chosen": 1.0832023620605469, "rewards/margins": 4.557203531265259, "rewards/rejected": -3.474001169204712, "step": 15827 }, { "epoch": 0.8389473405242096, "grad_norm": 42.75, "kl": 3.8889942169189453, "learning_rate": 5e-07, "logits/chosen": -6949485.6, "logits/rejected": -8642796.666666666, "logps/chosen": -388.3697509765625, "logps/rejected": -115.6018575032552, "loss": 0.3068, "rewards/chosen": 0.9003332138061524, "rewards/margins": 1.8895028114318848, "rewards/rejected": -0.9891695976257324, "step": 15828 }, { "epoch": 0.8390003445260117, "grad_norm": 57.75, "kl": 0.14810943603515625, "learning_rate": 5e-07, "logits/chosen": -31156816.0, "logits/rejected": -38532576.0, "logps/chosen": -302.680078125, "logps/rejected": -251.00211588541666, "loss": 0.2879, "rewards/chosen": 0.5981347560882568, "rewards/margins": 2.5543218453725176, "rewards/rejected": -1.956187089284261, "step": 15829 }, { "epoch": 0.8390533485278139, "grad_norm": 62.75, "kl": 1.3415107727050781, "learning_rate": 5e-07, "logits/chosen": -48059637.333333336, "logits/rejected": -4611465.0, "logps/chosen": -259.8804931640625, "logps/rejected": -111.21586608886719, "loss": 0.3798, "rewards/chosen": 0.23219374815622965, "rewards/margins": 4.191450874010722, "rewards/rejected": -3.959257125854492, "step": 15830 }, { "epoch": 0.839106352529616, "grad_norm": 50.5, "kl": 0.11376571655273438, "learning_rate": 5e-07, "logits/chosen": -45719141.333333336, "logits/rejected": -5753780.0, "logps/chosen": -685.1189778645834, "logps/rejected": -228.1978759765625, "loss": 0.2428, "rewards/chosen": 1.1316711902618408, "rewards/margins": 3.2151089191436766, "rewards/rejected": -2.0834377288818358, "step": 15831 }, { "epoch": 0.8391593565314182, "grad_norm": 31.625, "kl": 0.19049835205078125, "learning_rate": 5e-07, "logits/chosen": 1109375.375, "logits/rejected": -11691632.0, "logps/chosen": -42.759498596191406, "logps/rejected": -258.5979410807292, "loss": 0.1591, "rewards/chosen": 0.79095458984375, "rewards/margins": 3.438014348347982, "rewards/rejected": -2.647059758504232, "step": 15832 }, { "epoch": 0.8392123605332202, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4315749.333333333, "logits/rejected": -70146880.0, "logps/chosen": -324.2115071614583, "logps/rejected": -268.40537109375, "loss": 0.2504, "rewards/chosen": 0.15846101442972818, "rewards/margins": 2.7134933630625406, "rewards/rejected": -2.5550323486328126, "step": 15833 }, { "epoch": 0.8392653645350224, "grad_norm": 37.75, "kl": 4.378589630126953, "learning_rate": 5e-07, "logits/chosen": -17455188.8, "logits/rejected": -82204288.0, "logps/chosen": -209.908544921875, "logps/rejected": -320.9377034505208, "loss": 0.3335, "rewards/chosen": 0.6718592643737793, "rewards/margins": 2.971879482269287, "rewards/rejected": -2.300020217895508, "step": 15834 }, { "epoch": 0.8393183685368245, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68761928.0, "logits/rejected": 4568091.0, "logps/chosen": -310.60821533203125, "logps/rejected": -224.81668090820312, "loss": 0.3266, "rewards/chosen": 0.06946180760860443, "rewards/margins": 2.346496567130089, "rewards/rejected": -2.2770347595214844, "step": 15835 }, { "epoch": 0.8393713725386267, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30641356.0, "logits/rejected": -50237866.666666664, "logps/chosen": -354.02166748046875, "logps/rejected": -372.9539794921875, "loss": 0.1839, "rewards/chosen": 0.8221038579940796, "rewards/margins": 2.9185956716537476, "rewards/rejected": -2.096491813659668, "step": 15836 }, { "epoch": 0.8394243765404288, "grad_norm": 66.5, "kl": 2.0807571411132812, "learning_rate": 5e-07, "logits/chosen": -1867840.6666666667, "logits/rejected": -53926368.0, "logps/chosen": -151.16619873046875, "logps/rejected": -562.2183837890625, "loss": 0.3468, "rewards/chosen": 0.49422558148701984, "rewards/margins": 3.5440197785695395, "rewards/rejected": -3.0497941970825195, "step": 15837 }, { "epoch": 0.839477380542231, "grad_norm": 51.0, "kl": 4.668874740600586, "learning_rate": 5e-07, "logits/chosen": -18428155.2, "logits/rejected": -17996560.0, "logps/chosen": -308.584375, "logps/rejected": -414.1910807291667, "loss": 0.2564, "rewards/chosen": 1.2079697608947755, "rewards/margins": 4.11928981145223, "rewards/rejected": -2.9113200505574546, "step": 15838 }, { "epoch": 0.8395303845440331, "grad_norm": 55.0, "kl": 0.4279136657714844, "learning_rate": 5e-07, "logits/chosen": 3329460.4, "logits/rejected": -56935914.666666664, "logps/chosen": -170.33671875, "logps/rejected": -347.9292399088542, "loss": 0.3956, "rewards/chosen": -0.22231512069702147, "rewards/margins": 2.0373020490010583, "rewards/rejected": -2.2596171696980796, "step": 15839 }, { "epoch": 0.8395833885458353, "grad_norm": 49.25, "kl": 1.582387924194336, "learning_rate": 5e-07, "logits/chosen": -36112019.2, "logits/rejected": -31266237.333333332, "logps/chosen": -316.5003173828125, "logps/rejected": -381.7184244791667, "loss": 0.283, "rewards/chosen": 0.5062331199645996, "rewards/margins": 3.5828389167785644, "rewards/rejected": -3.076605796813965, "step": 15840 }, { "epoch": 0.8396363925476373, "grad_norm": 66.0, "kl": 1.335702896118164, "learning_rate": 5e-07, "logits/chosen": -38069216.0, "logits/rejected": -46059480.0, "logps/chosen": -493.5244140625, "logps/rejected": -410.3149719238281, "loss": 0.2375, "rewards/chosen": 1.1416079998016357, "rewards/margins": 2.999018907546997, "rewards/rejected": -1.8574109077453613, "step": 15841 }, { "epoch": 0.8396893965494395, "grad_norm": 51.0, "kl": 1.8490514755249023, "learning_rate": 5e-07, "logits/chosen": -15388808.0, "logits/rejected": -54743428.0, "logps/chosen": -314.5466613769531, "logps/rejected": -289.6306457519531, "loss": 0.302, "rewards/chosen": 0.4744205176830292, "rewards/margins": 2.475898951292038, "rewards/rejected": -2.001478433609009, "step": 15842 }, { "epoch": 0.8397424005512416, "grad_norm": 33.75, "kl": 0.07245826721191406, "learning_rate": 5e-07, "logits/chosen": -9529122.0, "logits/rejected": -10689438.0, "logps/chosen": -167.66432189941406, "logps/rejected": -695.7740478515625, "loss": 0.2388, "rewards/chosen": 0.2525116801261902, "rewards/margins": 3.9049922823905945, "rewards/rejected": -3.6524806022644043, "step": 15843 }, { "epoch": 0.8397954045530438, "grad_norm": 62.5, "kl": 1.616063117980957, "learning_rate": 5e-07, "logits/chosen": -23593528.0, "logits/rejected": -15831522.0, "logps/chosen": -171.35595703125, "logps/rejected": -551.3267822265625, "loss": 0.3215, "rewards/chosen": 0.9351797103881836, "rewards/margins": 1.9329411387443542, "rewards/rejected": -0.9977614283561707, "step": 15844 }, { "epoch": 0.8398484085548459, "grad_norm": 61.25, "kl": 3.570150375366211, "learning_rate": 5e-07, "logits/chosen": -5222579.5, "logits/rejected": -18230100.0, "logps/chosen": -310.9544372558594, "logps/rejected": -312.1995849609375, "loss": 0.2027, "rewards/chosen": 1.334700584411621, "rewards/margins": 4.108652114868164, "rewards/rejected": -2.773951530456543, "step": 15845 }, { "epoch": 0.8399014125566481, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21135096.0, "logits/rejected": -21063555.2, "logps/chosen": -661.9581298828125, "logps/rejected": -357.810791015625, "loss": 0.1984, "rewards/chosen": 0.2756337722142537, "rewards/margins": 3.4504809935887657, "rewards/rejected": -3.174847221374512, "step": 15846 }, { "epoch": 0.8399544165584502, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19399266.666666668, "logits/rejected": -16663267.2, "logps/chosen": -225.3426513671875, "logps/rejected": -278.3076171875, "loss": 0.2765, "rewards/chosen": 0.4534439245859782, "rewards/margins": 2.328356568018595, "rewards/rejected": -1.874912643432617, "step": 15847 }, { "epoch": 0.8400074205602523, "grad_norm": 41.75, "kl": 3.8890819549560547, "learning_rate": 5e-07, "logits/chosen": -20128883.2, "logits/rejected": -22804392.0, "logps/chosen": -273.9158447265625, "logps/rejected": -249.2762451171875, "loss": 0.2593, "rewards/chosen": 1.0530882835388184, "rewards/margins": 2.4924086888631187, "rewards/rejected": -1.4393204053243, "step": 15848 }, { "epoch": 0.8400604245620544, "grad_norm": 37.75, "kl": 2.6677513122558594, "learning_rate": 5e-07, "logits/chosen": -24467556.8, "logits/rejected": -18081174.666666668, "logps/chosen": -328.527587890625, "logps/rejected": -168.9056193033854, "loss": 0.2073, "rewards/chosen": 1.186898136138916, "rewards/margins": 4.4768043200174965, "rewards/rejected": -3.2899061838785806, "step": 15849 }, { "epoch": 0.8401134285638566, "grad_norm": 40.25, "kl": 1.6889171600341797, "learning_rate": 5e-07, "logits/chosen": -30129372.0, "logits/rejected": -4625428.0, "logps/chosen": -963.047607421875, "logps/rejected": -96.54679870605469, "loss": 0.1893, "rewards/chosen": 1.8271836042404175, "rewards/margins": 4.485775113105774, "rewards/rejected": -2.6585915088653564, "step": 15850 }, { "epoch": 0.8401664325656587, "grad_norm": 38.5, "kl": 1.5814189910888672, "learning_rate": 5e-07, "logits/chosen": -27881474.666666668, "logits/rejected": -4463040.0, "logps/chosen": -139.85550944010416, "logps/rejected": -238.84852600097656, "loss": 0.3511, "rewards/chosen": 0.7617129484812418, "rewards/margins": 2.28731099764506, "rewards/rejected": -1.5255980491638184, "step": 15851 }, { "epoch": 0.8402194365674609, "grad_norm": 43.75, "kl": 0.2321014404296875, "learning_rate": 5e-07, "logits/chosen": -26151907.2, "logits/rejected": -26407520.0, "logps/chosen": -354.016455078125, "logps/rejected": -203.1274617513021, "loss": 0.2479, "rewards/chosen": 0.8417248725891113, "rewards/margins": 3.0691049893697104, "rewards/rejected": -2.227380116780599, "step": 15852 }, { "epoch": 0.840272440569263, "grad_norm": 48.5, "kl": 1.2005538940429688, "learning_rate": 5e-07, "logits/chosen": -31514776.0, "logits/rejected": -56737860.0, "logps/chosen": -387.9439697265625, "logps/rejected": -417.94232177734375, "loss": 0.2197, "rewards/chosen": 0.9618831872940063, "rewards/margins": 3.1994162797927856, "rewards/rejected": -2.2375330924987793, "step": 15853 }, { "epoch": 0.8403254445710651, "grad_norm": 57.75, "kl": 1.2671051025390625, "learning_rate": 5e-07, "logits/chosen": -14311850.285714285, "logits/rejected": -6190934.0, "logps/chosen": -261.383544921875, "logps/rejected": -52.78693389892578, "loss": 0.5084, "rewards/chosen": -0.12391630240849086, "rewards/margins": 1.8749618189675468, "rewards/rejected": -1.9988781213760376, "step": 15854 }, { "epoch": 0.8403784485728673, "grad_norm": 54.25, "kl": 1.260141372680664, "learning_rate": 5e-07, "logits/chosen": -28674297.6, "logits/rejected": -22262860.0, "logps/chosen": -330.538232421875, "logps/rejected": -357.6403401692708, "loss": 0.2853, "rewards/chosen": 0.626074743270874, "rewards/margins": 3.983414284388224, "rewards/rejected": -3.35733954111735, "step": 15855 }, { "epoch": 0.8404314525746693, "grad_norm": 44.25, "kl": 1.8497085571289062, "learning_rate": 5e-07, "logits/chosen": -36457334.4, "logits/rejected": -28133688.0, "logps/chosen": -495.120751953125, "logps/rejected": -509.433349609375, "loss": 0.2055, "rewards/chosen": 1.0052205085754395, "rewards/margins": 5.267375087738037, "rewards/rejected": -4.262154579162598, "step": 15856 }, { "epoch": 0.8404844565764715, "grad_norm": 39.75, "kl": 2.2006759643554688, "learning_rate": 5e-07, "logits/chosen": -35984688.0, "logits/rejected": -6272640.0, "logps/chosen": -262.5433349609375, "logps/rejected": -338.0434265136719, "loss": 0.1888, "rewards/chosen": 1.0159550905227661, "rewards/margins": 3.6753865480422974, "rewards/rejected": -2.6594314575195312, "step": 15857 }, { "epoch": 0.8405374605782736, "grad_norm": 28.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4010119.0, "logits/rejected": 32792662.85714286, "logps/chosen": -24.44390296936035, "logps/rejected": -518.8078264508929, "loss": 0.1112, "rewards/chosen": 0.6248857378959656, "rewards/margins": 4.410279589039939, "rewards/rejected": -3.785393851143973, "step": 15858 }, { "epoch": 0.8405904645800758, "grad_norm": 47.5, "kl": 2.802762985229492, "learning_rate": 5e-07, "logits/chosen": -26522212.0, "logits/rejected": -47498316.0, "logps/chosen": -293.7940673828125, "logps/rejected": -363.8300476074219, "loss": 0.2301, "rewards/chosen": 0.7172739505767822, "rewards/margins": 4.022169589996338, "rewards/rejected": -3.3048956394195557, "step": 15859 }, { "epoch": 0.8406434685818779, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84996784.0, "logits/rejected": -15208667.2, "logps/chosen": -512.0439046223959, "logps/rejected": -234.8357421875, "loss": 0.1441, "rewards/chosen": 1.1058197021484375, "rewards/margins": 4.0015153884887695, "rewards/rejected": -2.895695686340332, "step": 15860 }, { "epoch": 0.8406964725836801, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36209948.0, "logits/rejected": -25860458.0, "logps/chosen": -424.772216796875, "logps/rejected": -205.23712158203125, "loss": 0.1975, "rewards/chosen": 1.4367691278457642, "rewards/margins": 3.3952510356903076, "rewards/rejected": -1.9584819078445435, "step": 15861 }, { "epoch": 0.8407494765854822, "grad_norm": 45.0, "kl": 2.504007339477539, "learning_rate": 5e-07, "logits/chosen": -44046163.2, "logits/rejected": -35685338.666666664, "logps/chosen": -367.9970458984375, "logps/rejected": -343.8230794270833, "loss": 0.2687, "rewards/chosen": 1.0146597862243651, "rewards/margins": 3.426842435201009, "rewards/rejected": -2.412182648976644, "step": 15862 }, { "epoch": 0.8408024805872844, "grad_norm": 55.0, "kl": 2.01947021484375, "learning_rate": 5e-07, "logits/chosen": 13534130.0, "logits/rejected": -25703387.42857143, "logps/chosen": -41.42133331298828, "logps/rejected": -357.38033621651783, "loss": 0.215, "rewards/chosen": 0.4882850646972656, "rewards/margins": 2.4499974931989397, "rewards/rejected": -1.961712428501674, "step": 15863 }, { "epoch": 0.8408554845890864, "grad_norm": 39.0, "kl": 0.5462570190429688, "learning_rate": 5e-07, "logits/chosen": -81357496.0, "logits/rejected": 1960268.0, "logps/chosen": -273.26202392578125, "logps/rejected": -962.548583984375, "loss": 0.2529, "rewards/chosen": 0.29766225814819336, "rewards/margins": 6.5316338539123535, "rewards/rejected": -6.23397159576416, "step": 15864 }, { "epoch": 0.8409084885908886, "grad_norm": 29.5, "kl": 2.350865364074707, "learning_rate": 5e-07, "logits/chosen": -11168030.4, "logits/rejected": -29550093.333333332, "logps/chosen": -169.53184814453124, "logps/rejected": -469.9886881510417, "loss": 0.2563, "rewards/chosen": 0.7007457733154296, "rewards/margins": 3.940539232889811, "rewards/rejected": -3.2397934595743814, "step": 15865 }, { "epoch": 0.8409614925926907, "grad_norm": 59.5, "kl": 2.7634429931640625, "learning_rate": 5e-07, "logits/chosen": -35827648.0, "logits/rejected": -80568064.0, "logps/chosen": -358.060791015625, "logps/rejected": -320.43255615234375, "loss": 0.3694, "rewards/chosen": 0.8709213733673096, "rewards/margins": 2.177734851837158, "rewards/rejected": -1.3068134784698486, "step": 15866 }, { "epoch": 0.8410144965944929, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30409464.0, "logits/rejected": -10864404.0, "logps/chosen": -348.80413818359375, "logps/rejected": -247.54769897460938, "loss": 0.2391, "rewards/chosen": 0.6743087768554688, "rewards/margins": 3.746164321899414, "rewards/rejected": -3.0718555450439453, "step": 15867 }, { "epoch": 0.841067500596295, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40329240.0, "logits/rejected": -39445757.333333336, "logps/chosen": -175.17092895507812, "logps/rejected": -516.7418212890625, "loss": 0.2844, "rewards/chosen": -0.41270941495895386, "rewards/margins": 1.8232974807421365, "rewards/rejected": -2.2360068957010903, "step": 15868 }, { "epoch": 0.8411205045980972, "grad_norm": 31.125, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -40109188.0, "logps/rejected": -315.84759521484375, "loss": 0.1147, "rewards/rejected": -2.577216863632202, "step": 15869 }, { "epoch": 0.8411735085998993, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 117086592.0, "logits/rejected": -18049090.285714287, "logps/chosen": -453.43560791015625, "logps/rejected": -241.43150111607142, "loss": 0.1688, "rewards/chosen": -0.11179199069738388, "rewards/margins": 2.522624317024435, "rewards/rejected": -2.634416307721819, "step": 15870 }, { "epoch": 0.8412265126017014, "grad_norm": 64.5, "kl": 7.751173973083496, "learning_rate": 5e-07, "logits/chosen": -24945504.0, "logits/rejected": -28912821.333333332, "logps/chosen": -771.783203125, "logps/rejected": -434.322998046875, "loss": 0.2605, "rewards/chosen": 1.4708555221557618, "rewards/margins": 4.8586016972859705, "rewards/rejected": -3.3877461751302085, "step": 15871 }, { "epoch": 0.8412795166035035, "grad_norm": 43.25, "kl": 2.3429956436157227, "learning_rate": 5e-07, "logits/chosen": -33896035.2, "logits/rejected": -31622794.666666668, "logps/chosen": -248.8859130859375, "logps/rejected": -274.50396728515625, "loss": 0.2996, "rewards/chosen": 0.7532369136810303, "rewards/margins": 2.9290162563323974, "rewards/rejected": -2.175779342651367, "step": 15872 }, { "epoch": 0.8413325206053057, "grad_norm": 60.25, "kl": 3.9942264556884766, "learning_rate": 5e-07, "logits/chosen": -15716983.0, "logits/rejected": -20180936.0, "logps/chosen": -319.3026123046875, "logps/rejected": -173.1356201171875, "loss": 0.34, "rewards/chosen": 1.1935542821884155, "rewards/margins": 2.326287865638733, "rewards/rejected": -1.1327335834503174, "step": 15873 }, { "epoch": 0.8413855246071078, "grad_norm": 55.75, "kl": 3.365558624267578, "learning_rate": 5e-07, "logits/chosen": -2746281.6, "logits/rejected": -53313653.333333336, "logps/chosen": -206.314892578125, "logps/rejected": -239.28460693359375, "loss": 0.2836, "rewards/chosen": 1.072867488861084, "rewards/margins": 3.468492921193441, "rewards/rejected": -2.395625432332357, "step": 15874 }, { "epoch": 0.84143852860891, "grad_norm": 47.5, "kl": 0.06849288940429688, "learning_rate": 5e-07, "logits/chosen": -59393080.0, "logits/rejected": -32711792.0, "logps/chosen": -385.0071716308594, "logps/rejected": -403.7413736979167, "loss": 0.1452, "rewards/chosen": 1.3420441150665283, "rewards/margins": 3.6546765168507895, "rewards/rejected": -2.312632401784261, "step": 15875 }, { "epoch": 0.8414915326107121, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -38246048.0, "logps/rejected": -469.0233154296875, "loss": 0.1106, "rewards/rejected": -2.6846961975097656, "step": 15876 }, { "epoch": 0.8415445366125143, "grad_norm": 57.25, "kl": 2.2003097534179688, "learning_rate": 5e-07, "logits/chosen": -21227156.0, "logits/rejected": -13664078.0, "logps/chosen": -289.0596516927083, "logps/rejected": -642.3134765625, "loss": 0.3524, "rewards/chosen": 0.6115406354268392, "rewards/margins": 2.653670152028402, "rewards/rejected": -2.0421295166015625, "step": 15877 }, { "epoch": 0.8415975406143164, "grad_norm": 23.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19322670.666666668, "logits/rejected": -56097772.8, "logps/chosen": -201.31193033854166, "logps/rejected": -357.796484375, "loss": 0.184, "rewards/chosen": 1.3654006322224934, "rewards/margins": 3.83367265065511, "rewards/rejected": -2.468272018432617, "step": 15878 }, { "epoch": 0.8416505446161185, "grad_norm": 36.5, "kl": 0.1930255889892578, "learning_rate": 5e-07, "logits/chosen": 756350.0, "logits/rejected": -33598992.0, "logps/chosen": -151.56906127929688, "logps/rejected": -350.5097351074219, "loss": 0.1907, "rewards/chosen": 0.8468096852302551, "rewards/margins": 4.485579311847687, "rewards/rejected": -3.6387696266174316, "step": 15879 }, { "epoch": 0.8417035486179206, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72080019.2, "logits/rejected": -41598317.333333336, "logps/chosen": -505.659521484375, "logps/rejected": -405.9332275390625, "loss": 0.2421, "rewards/chosen": 0.7177990913391114, "rewards/margins": 3.2413710594177245, "rewards/rejected": -2.5235719680786133, "step": 15880 }, { "epoch": 0.8417565526197228, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9214950.666666666, "logits/rejected": -2165536.8, "logps/chosen": -262.9544270833333, "logps/rejected": -136.20040283203124, "loss": 0.2881, "rewards/chosen": 0.6308557987213135, "rewards/margins": 2.527307939529419, "rewards/rejected": -1.8964521408081054, "step": 15881 }, { "epoch": 0.8418095566215249, "grad_norm": 46.25, "kl": 0.1925201416015625, "learning_rate": 5e-07, "logits/chosen": -34727888.0, "logits/rejected": -7961126.0, "logps/chosen": -267.50283203125, "logps/rejected": -294.64186604817706, "loss": 0.243, "rewards/chosen": 0.7735621929168701, "rewards/margins": 3.1242600282033286, "rewards/rejected": -2.3506978352864585, "step": 15882 }, { "epoch": 0.8418625606233271, "grad_norm": 35.5, "kl": 0.8589401245117188, "learning_rate": 5e-07, "logits/chosen": -18469313.6, "logits/rejected": -39138154.666666664, "logps/chosen": -93.01406860351562, "logps/rejected": -761.2303873697916, "loss": 0.2646, "rewards/chosen": 0.583327341079712, "rewards/margins": 3.2935521284739178, "rewards/rejected": -2.7102247873942056, "step": 15883 }, { "epoch": 0.8419155646251292, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42947808.0, "logits/rejected": -26135440.0, "logps/chosen": -799.9659016927084, "logps/rejected": -360.008740234375, "loss": 0.1709, "rewards/chosen": 1.7003766695658367, "rewards/margins": 4.346332518259684, "rewards/rejected": -2.6459558486938475, "step": 15884 }, { "epoch": 0.8419685686269314, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46521013.333333336, "logits/rejected": -12256362.4, "logps/chosen": -501.6267903645833, "logps/rejected": -151.5052490234375, "loss": 0.2397, "rewards/chosen": 0.36743776003519696, "rewards/margins": 2.9321638266245524, "rewards/rejected": -2.5647260665893556, "step": 15885 }, { "epoch": 0.8420215726287335, "grad_norm": 76.5, "kl": 1.4622259140014648, "learning_rate": 5e-07, "logits/chosen": -6703685.0, "logits/rejected": 7445717.0, "logps/chosen": -331.83563232421875, "logps/rejected": -200.93394470214844, "loss": 0.3441, "rewards/chosen": 0.27743974328041077, "rewards/margins": 2.106614738702774, "rewards/rejected": -1.8291749954223633, "step": 15886 }, { "epoch": 0.8420745766305356, "grad_norm": 46.25, "kl": 1.5239448547363281, "learning_rate": 5e-07, "logits/chosen": -96199392.0, "logits/rejected": -23950268.0, "logps/chosen": -275.5934143066406, "logps/rejected": -223.81629943847656, "loss": 0.3305, "rewards/chosen": 0.5726590752601624, "rewards/margins": 1.7277609705924988, "rewards/rejected": -1.1551018953323364, "step": 15887 }, { "epoch": 0.8421275806323377, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42823813.333333336, "logits/rejected": -12609069.6, "logps/chosen": -240.4244588216146, "logps/rejected": -336.328076171875, "loss": 0.2686, "rewards/chosen": 0.4078691005706787, "rewards/margins": 1.9692952632904053, "rewards/rejected": -1.5614261627197266, "step": 15888 }, { "epoch": 0.8421805846341399, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41862954.666666664, "logits/rejected": -28337468.8, "logps/chosen": -256.37107340494794, "logps/rejected": -337.888623046875, "loss": 0.231, "rewards/chosen": 0.2486058473587036, "rewards/margins": 2.9077689409255982, "rewards/rejected": -2.6591630935668946, "step": 15889 }, { "epoch": 0.842233588635942, "grad_norm": 39.25, "kl": 7.887051582336426, "learning_rate": 5e-07, "logits/chosen": -51325209.6, "logits/rejected": -6958314.0, "logps/chosen": -622.97646484375, "logps/rejected": -234.1067911783854, "loss": 0.1393, "rewards/chosen": 2.7310367584228517, "rewards/margins": 5.687456385294596, "rewards/rejected": -2.9564196268717446, "step": 15890 }, { "epoch": 0.8422865926377442, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17728824.0, "logits/rejected": -45714380.8, "logps/chosen": -255.93465169270834, "logps/rejected": -247.6005126953125, "loss": 0.2085, "rewards/chosen": 1.353508472442627, "rewards/margins": 2.9825133323669433, "rewards/rejected": -1.6290048599243163, "step": 15891 }, { "epoch": 0.8423395966395463, "grad_norm": 42.75, "kl": 1.7617511749267578, "learning_rate": 5e-07, "logits/chosen": -11322800.0, "logits/rejected": -1498629.0, "logps/chosen": -198.2669189453125, "logps/rejected": -225.85150146484375, "loss": 0.3676, "rewards/chosen": 0.20201659202575684, "rewards/margins": 2.6129122575124106, "rewards/rejected": -2.410895665486654, "step": 15892 }, { "epoch": 0.8423926006413485, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45491616.0, "logits/rejected": -26095076.0, "logps/chosen": -419.8956604003906, "logps/rejected": -347.647705078125, "loss": 0.364, "rewards/chosen": 0.20311659574508667, "rewards/margins": 1.336975872516632, "rewards/rejected": -1.1338592767715454, "step": 15893 }, { "epoch": 0.8424456046431505, "grad_norm": 56.5, "kl": 0.884613037109375, "learning_rate": 5e-07, "logits/chosen": -33893170.28571428, "logits/rejected": -33410926.0, "logps/chosen": -359.66469029017856, "logps/rejected": -579.549072265625, "loss": 0.3557, "rewards/chosen": 0.5893214770725795, "rewards/margins": 3.3002163001469205, "rewards/rejected": -2.710894823074341, "step": 15894 }, { "epoch": 0.8424986086449527, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48901792.0, "logits/rejected": -45688789.333333336, "logps/chosen": -392.85345458984375, "logps/rejected": -572.2944742838541, "loss": 0.1992, "rewards/chosen": 0.188140869140625, "rewards/margins": 2.6143115361531577, "rewards/rejected": -2.4261706670125327, "step": 15895 }, { "epoch": 0.8425516126467548, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8736072.0, "logits/rejected": -24370801.6, "logps/chosen": -346.052001953125, "logps/rejected": -342.8171875, "loss": 0.2634, "rewards/chosen": -0.1043558915456136, "rewards/margins": 2.863619915644328, "rewards/rejected": -2.9679758071899416, "step": 15896 }, { "epoch": 0.842604616648557, "grad_norm": 62.5, "kl": 0.018232345581054688, "learning_rate": 5e-07, "logits/chosen": -41428900.571428575, "logits/rejected": -128777112.0, "logps/chosen": -340.92435128348217, "logps/rejected": -61.780120849609375, "loss": 0.3751, "rewards/chosen": 0.4637704576764788, "rewards/margins": 1.5971887792859758, "rewards/rejected": -1.133418321609497, "step": 15897 }, { "epoch": 0.8426576206503591, "grad_norm": 52.0, "kl": 5.137228012084961, "learning_rate": 5e-07, "logits/chosen": -20045576.0, "logits/rejected": -104223504.0, "logps/chosen": -170.12276204427084, "logps/rejected": -605.0337524414062, "loss": 0.3542, "rewards/chosen": 0.6904261112213135, "rewards/margins": 4.0578553676605225, "rewards/rejected": -3.367429256439209, "step": 15898 }, { "epoch": 0.8427106246521613, "grad_norm": 37.0, "kl": 1.7941856384277344, "learning_rate": 5e-07, "logits/chosen": -18493906.0, "logits/rejected": -35619320.0, "logps/chosen": -204.21475219726562, "logps/rejected": -235.9665069580078, "loss": 0.2598, "rewards/chosen": 1.0604610443115234, "rewards/margins": 3.2504372596740723, "rewards/rejected": -2.189976215362549, "step": 15899 }, { "epoch": 0.8427636286539634, "grad_norm": 51.75, "kl": 0.8835792541503906, "learning_rate": 5e-07, "logits/chosen": -101692672.0, "logits/rejected": -13046987.0, "logps/chosen": -400.9114583333333, "logps/rejected": -766.737548828125, "loss": 0.2988, "rewards/chosen": 0.6830562750498453, "rewards/margins": 4.26510492960612, "rewards/rejected": -3.5820486545562744, "step": 15900 }, { "epoch": 0.8428166326557656, "grad_norm": 47.0, "kl": 0.5157051086425781, "learning_rate": 5e-07, "logits/chosen": -30916396.0, "logits/rejected": -37314852.0, "logps/chosen": -263.80950927734375, "logps/rejected": -201.63458251953125, "loss": 0.3422, "rewards/chosen": 0.06848078221082687, "rewards/margins": 2.1131830736994743, "rewards/rejected": -2.0447022914886475, "step": 15901 }, { "epoch": 0.8428696366575676, "grad_norm": 53.25, "kl": 1.730672836303711, "learning_rate": 5e-07, "logits/chosen": -22616480.0, "logits/rejected": -144568720.0, "logps/chosen": -267.4845493861607, "logps/rejected": -654.272216796875, "loss": 0.3809, "rewards/chosen": 0.5288117953709194, "rewards/margins": 4.935025078909738, "rewards/rejected": -4.406213283538818, "step": 15902 }, { "epoch": 0.8429226406593698, "grad_norm": 65.5, "kl": 2.3254547119140625, "learning_rate": 5e-07, "logits/chosen": -38637990.4, "logits/rejected": -47767488.0, "logps/chosen": -725.56513671875, "logps/rejected": -451.0636393229167, "loss": 0.1935, "rewards/chosen": 1.2720008850097657, "rewards/margins": 4.16501833597819, "rewards/rejected": -2.8930174509684243, "step": 15903 }, { "epoch": 0.8429756446611719, "grad_norm": 49.0, "kl": 1.6529426574707031, "learning_rate": 5e-07, "logits/chosen": -26369981.333333332, "logits/rejected": 362277.5, "logps/chosen": -214.78702799479166, "logps/rejected": -121.37577056884766, "loss": 0.3697, "rewards/chosen": 0.3373719056447347, "rewards/margins": 3.625903924306234, "rewards/rejected": -3.288532018661499, "step": 15904 }, { "epoch": 0.843028648662974, "grad_norm": 48.0, "kl": 0.0661163330078125, "learning_rate": 5e-07, "logits/chosen": -61397370.666666664, "logits/rejected": -45450440.0, "logps/chosen": -560.6864420572916, "logps/rejected": -222.81051635742188, "loss": 0.3355, "rewards/chosen": 0.9538223743438721, "rewards/margins": 2.0928932428359985, "rewards/rejected": -1.1390708684921265, "step": 15905 }, { "epoch": 0.8430816526647762, "grad_norm": 37.0, "kl": 3.0483241081237793, "learning_rate": 5e-07, "logits/chosen": -96379944.0, "logits/rejected": -11095329.142857144, "logps/chosen": -1223.4453125, "logps/rejected": -197.09685407366072, "loss": 0.217, "rewards/chosen": 3.4709229469299316, "rewards/margins": 4.52726834160941, "rewards/rejected": -1.0563453946794783, "step": 15906 }, { "epoch": 0.8431346566665783, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27656458.0, "logits/rejected": -31164132.0, "logps/chosen": -500.949462890625, "logps/rejected": -280.3172912597656, "loss": 0.2561, "rewards/chosen": 0.7124935388565063, "rewards/margins": 2.508929133415222, "rewards/rejected": -1.7964355945587158, "step": 15907 }, { "epoch": 0.8431876606683805, "grad_norm": 41.75, "kl": 5.1789703369140625, "learning_rate": 5e-07, "logits/chosen": -14687305.0, "logits/rejected": -52123840.0, "logps/chosen": -170.22756958007812, "logps/rejected": -424.7778625488281, "loss": 0.3251, "rewards/chosen": 0.6178300380706787, "rewards/margins": 4.059697389602661, "rewards/rejected": -3.4418673515319824, "step": 15908 }, { "epoch": 0.8432406646701825, "grad_norm": 38.25, "kl": 0.06744861602783203, "learning_rate": 5e-07, "logits/chosen": -51080376.0, "logits/rejected": -38605200.0, "logps/chosen": -322.5005798339844, "logps/rejected": -200.06434631347656, "loss": 0.22, "rewards/chosen": 1.1010321378707886, "rewards/margins": 3.133090376853943, "rewards/rejected": -2.0320582389831543, "step": 15909 }, { "epoch": 0.8432936686719847, "grad_norm": 46.0, "kl": 1.9901762008666992, "learning_rate": 5e-07, "logits/chosen": -10173644.8, "logits/rejected": -38254133.333333336, "logps/chosen": -242.3812255859375, "logps/rejected": -375.8288981119792, "loss": 0.3103, "rewards/chosen": 0.6939625263214111, "rewards/margins": 3.001678578058878, "rewards/rejected": -2.3077160517374673, "step": 15910 }, { "epoch": 0.8433466726737868, "grad_norm": 56.0, "kl": 0.16308307647705078, "learning_rate": 5e-07, "logits/chosen": -16383338.0, "logits/rejected": -18836040.0, "logps/chosen": -351.428466796875, "logps/rejected": -246.1061553955078, "loss": 0.3427, "rewards/chosen": 0.017118915915489197, "rewards/margins": 1.6368467658758163, "rewards/rejected": -1.6197278499603271, "step": 15911 }, { "epoch": 0.843399676675589, "grad_norm": 54.5, "kl": 1.444284439086914, "learning_rate": 5e-07, "logits/chosen": -63320484.0, "logits/rejected": -5407137.0, "logps/chosen": -498.2530517578125, "logps/rejected": -89.72059631347656, "loss": 0.3258, "rewards/chosen": 0.6102302670478821, "rewards/margins": 2.3388128876686096, "rewards/rejected": -1.7285826206207275, "step": 15912 }, { "epoch": 0.8434526806773911, "grad_norm": 54.25, "kl": 1.6265640258789062, "learning_rate": 5e-07, "logits/chosen": -42743920.0, "logits/rejected": -16906521.6, "logps/chosen": -452.91748046875, "logps/rejected": -152.61243896484376, "loss": 0.3033, "rewards/chosen": 0.10872091849644978, "rewards/margins": 1.9305301149686176, "rewards/rejected": -1.8218091964721679, "step": 15913 }, { "epoch": 0.8435056846791933, "grad_norm": 44.0, "kl": 2.208108901977539, "learning_rate": 5e-07, "logits/chosen": -19470402.0, "logits/rejected": -45744560.0, "logps/chosen": -158.38735961914062, "logps/rejected": -311.45892333984375, "loss": 0.3008, "rewards/chosen": 0.3805151879787445, "rewards/margins": 2.449711173772812, "rewards/rejected": -2.0691959857940674, "step": 15914 }, { "epoch": 0.8435586886809954, "grad_norm": 59.75, "kl": 0.21414947509765625, "learning_rate": 5e-07, "logits/chosen": 22277452.0, "logits/rejected": -39708240.0, "logps/chosen": -358.2496744791667, "logps/rejected": -491.559375, "loss": 0.2228, "rewards/chosen": 1.015454928080241, "rewards/margins": 3.076939455668132, "rewards/rejected": -2.061484527587891, "step": 15915 }, { "epoch": 0.8436116926827976, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29365734.0, "logits/rejected": -32038154.0, "logps/chosen": -382.48699951171875, "logps/rejected": -310.2713623046875, "loss": 0.2045, "rewards/chosen": 0.694951057434082, "rewards/margins": 4.343856334686279, "rewards/rejected": -3.6489052772521973, "step": 15916 }, { "epoch": 0.8436646966845996, "grad_norm": 56.75, "kl": 2.1524906158447266, "learning_rate": 5e-07, "logits/chosen": 12092984.8, "logits/rejected": -12586424.0, "logps/chosen": -378.556884765625, "logps/rejected": -125.42612711588542, "loss": 0.2768, "rewards/chosen": 1.4257192611694336, "rewards/margins": 2.663652181625366, "rewards/rejected": -1.2379329204559326, "step": 15917 }, { "epoch": 0.8437177006864018, "grad_norm": 42.5, "kl": 2.206157684326172, "learning_rate": 5e-07, "logits/chosen": -24376221.333333332, "logits/rejected": 14404934.4, "logps/chosen": -251.5794474283854, "logps/rejected": -308.884521484375, "loss": 0.1911, "rewards/chosen": 1.2030781904856365, "rewards/margins": 3.7949307600657143, "rewards/rejected": -2.591852569580078, "step": 15918 }, { "epoch": 0.8437707046882039, "grad_norm": 45.75, "kl": 1.931121826171875, "learning_rate": 5e-07, "logits/chosen": -32895043.2, "logits/rejected": -32905194.666666668, "logps/chosen": -359.642578125, "logps/rejected": -402.9506022135417, "loss": 0.2836, "rewards/chosen": 0.8211150169372559, "rewards/margins": 2.963066577911377, "rewards/rejected": -2.141951560974121, "step": 15919 }, { "epoch": 0.8438237086900061, "grad_norm": 55.25, "kl": 0.05558013916015625, "learning_rate": 5e-07, "logits/chosen": -108858304.0, "logits/rejected": -38406144.0, "logps/chosen": -326.2774963378906, "logps/rejected": -413.801513671875, "loss": 0.2215, "rewards/chosen": 0.13533172011375427, "rewards/margins": 2.755345433950424, "rewards/rejected": -2.62001371383667, "step": 15920 }, { "epoch": 0.8438767126918082, "grad_norm": 38.5, "kl": 0.9125785827636719, "learning_rate": 5e-07, "logits/chosen": 9951305.6, "logits/rejected": -9501385.333333334, "logps/chosen": -125.495361328125, "logps/rejected": -115.8670654296875, "loss": 0.3315, "rewards/chosen": -0.029499459266662597, "rewards/margins": 3.027305833498637, "rewards/rejected": -3.0568052927652993, "step": 15921 }, { "epoch": 0.8439297166936104, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29521232.0, "logits/rejected": -23538245.333333332, "logps/chosen": -438.9105224609375, "logps/rejected": -299.0877685546875, "loss": 0.15, "rewards/chosen": 0.6767135858535767, "rewards/margins": 3.302655816078186, "rewards/rejected": -2.6259422302246094, "step": 15922 }, { "epoch": 0.8439827206954125, "grad_norm": 51.75, "kl": 0.1468639373779297, "learning_rate": 5e-07, "logits/chosen": 5318379.0, "logits/rejected": 99725638.4, "logps/chosen": -52.0741221110026, "logps/rejected": -336.3665771484375, "loss": 0.1484, "rewards/chosen": 1.1424511273701985, "rewards/margins": 4.279954369862875, "rewards/rejected": -3.137503242492676, "step": 15923 }, { "epoch": 0.8440357246972147, "grad_norm": 41.25, "kl": 0.9153785705566406, "learning_rate": 5e-07, "logits/chosen": -15554647.0, "logits/rejected": -11591018.0, "logps/chosen": -85.06098175048828, "logps/rejected": -139.8500518798828, "loss": 0.3261, "rewards/chosen": 0.0793248638510704, "rewards/margins": 2.3229795917868614, "rewards/rejected": -2.243654727935791, "step": 15924 }, { "epoch": 0.8440887286990167, "grad_norm": 42.0, "kl": 5.455226898193359, "learning_rate": 5e-07, "logits/chosen": -3390616.0, "logits/rejected": -8612574.0, "logps/chosen": -395.7677001953125, "logps/rejected": -76.52959442138672, "loss": 0.2341, "rewards/chosen": 1.85558287302653, "rewards/margins": 5.352365652720134, "rewards/rejected": -3.4967827796936035, "step": 15925 }, { "epoch": 0.8441417327008189, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -42038128.0, "logps/rejected": -301.39605712890625, "loss": 0.1835, "rewards/rejected": -2.061411142349243, "step": 15926 }, { "epoch": 0.844194736702621, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 28451861.333333332, "logits/rejected": -21651006.4, "logps/chosen": -182.2526652018229, "logps/rejected": -515.490087890625, "loss": 0.1852, "rewards/chosen": 0.6664695739746094, "rewards/margins": 3.8674060821533205, "rewards/rejected": -3.200936508178711, "step": 15927 }, { "epoch": 0.8442477407044232, "grad_norm": 26.25, "kl": 1.6343717575073242, "learning_rate": 5e-07, "logits/chosen": -7718165.6, "logits/rejected": -6617942.666666667, "logps/chosen": -101.50157470703125, "logps/rejected": -599.3265787760416, "loss": 0.2761, "rewards/chosen": 0.5412148475646973, "rewards/margins": 4.766926352183025, "rewards/rejected": -4.225711504618327, "step": 15928 }, { "epoch": 0.8443007447062253, "grad_norm": 54.75, "kl": 2.2151565551757812, "learning_rate": 5e-07, "logits/chosen": -35838112.0, "logits/rejected": -8508670.666666666, "logps/chosen": -489.397265625, "logps/rejected": -210.71905517578125, "loss": 0.2801, "rewards/chosen": 1.1085325241088868, "rewards/margins": 2.822212791442871, "rewards/rejected": -1.7136802673339844, "step": 15929 }, { "epoch": 0.8443537487080275, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64656869.333333336, "logits/rejected": 14602097.6, "logps/chosen": -326.203125, "logps/rejected": -321.725146484375, "loss": 0.2844, "rewards/chosen": 0.1337962051232656, "rewards/margins": 2.3901493926843007, "rewards/rejected": -2.256353187561035, "step": 15930 }, { "epoch": 0.8444067527098296, "grad_norm": 57.25, "kl": 3.2139530181884766, "learning_rate": 5e-07, "logits/chosen": -13119928.0, "logits/rejected": -41572464.0, "logps/chosen": -384.7259928385417, "logps/rejected": -138.7393798828125, "loss": 0.3932, "rewards/chosen": 0.6341212590535482, "rewards/margins": 1.7160807450612388, "rewards/rejected": -1.0819594860076904, "step": 15931 }, { "epoch": 0.8444597567116318, "grad_norm": 36.5, "kl": 1.7894363403320312, "learning_rate": 5e-07, "logits/chosen": -63527556.0, "logits/rejected": -18789330.666666668, "logps/chosen": -750.134033203125, "logps/rejected": -189.5888671875, "loss": 0.1293, "rewards/chosen": 1.8894531726837158, "rewards/margins": 4.691624879837036, "rewards/rejected": -2.8021717071533203, "step": 15932 }, { "epoch": 0.8445127607134338, "grad_norm": 45.5, "kl": 0.29717159271240234, "learning_rate": 5e-07, "logits/chosen": -41648496.0, "logits/rejected": 29985866.666666668, "logps/chosen": -268.94873046875, "logps/rejected": -269.7230631510417, "loss": 0.3424, "rewards/chosen": 0.4406151294708252, "rewards/margins": 1.6299190044403076, "rewards/rejected": -1.1893038749694824, "step": 15933 }, { "epoch": 0.844565764715236, "grad_norm": 72.0, "kl": 5.183866500854492, "learning_rate": 5e-07, "logits/chosen": -41677693.333333336, "logits/rejected": -17762094.0, "logps/chosen": -387.6476643880208, "logps/rejected": -64.64439392089844, "loss": 0.383, "rewards/chosen": 1.1770151456197102, "rewards/margins": 1.1323375602563222, "rewards/rejected": 0.04467758536338806, "step": 15934 }, { "epoch": 0.8446187687170381, "grad_norm": 73.0, "kl": 4.691242218017578, "learning_rate": 5e-07, "logits/chosen": -11343953.6, "logits/rejected": -114020533.33333333, "logps/chosen": -520.694580078125, "logps/rejected": -205.93497721354166, "loss": 0.2243, "rewards/chosen": 1.30607328414917, "rewards/margins": 3.579306697845459, "rewards/rejected": -2.273233413696289, "step": 15935 }, { "epoch": 0.8446717727188403, "grad_norm": 54.25, "kl": 2.096968650817871, "learning_rate": 5e-07, "logits/chosen": -11467200.0, "logits/rejected": -23753936.0, "logps/chosen": -329.7678629557292, "logps/rejected": -569.760986328125, "loss": 0.3333, "rewards/chosen": 0.5550380945205688, "rewards/margins": 5.566262364387512, "rewards/rejected": -5.011224269866943, "step": 15936 }, { "epoch": 0.8447247767206424, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57136294.4, "logits/rejected": -135239.33333333334, "logps/chosen": -414.451025390625, "logps/rejected": -721.4855143229166, "loss": 0.2858, "rewards/chosen": 0.6013959884643555, "rewards/margins": 2.901843452453613, "rewards/rejected": -2.300447463989258, "step": 15937 }, { "epoch": 0.8447777807224446, "grad_norm": 44.0, "kl": 5.1096086502075195, "learning_rate": 5e-07, "logits/chosen": -42928972.8, "logits/rejected": -21308984.0, "logps/chosen": -331.913330078125, "logps/rejected": -382.5680745442708, "loss": 0.2879, "rewards/chosen": 0.9665372848510743, "rewards/margins": 2.789291826883952, "rewards/rejected": -1.8227545420328777, "step": 15938 }, { "epoch": 0.8448307847242467, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39959316.0, "logits/rejected": -54312346.666666664, "logps/chosen": -323.31890869140625, "logps/rejected": -378.2188720703125, "loss": 0.1668, "rewards/chosen": 0.5663391351699829, "rewards/margins": 3.164799968401591, "rewards/rejected": -2.598460833231608, "step": 15939 }, { "epoch": 0.8448837887260489, "grad_norm": 36.25, "kl": 1.4892921447753906, "learning_rate": 5e-07, "logits/chosen": 7046445.0, "logits/rejected": -19925722.0, "logps/chosen": -101.65157318115234, "logps/rejected": -204.5509490966797, "loss": 0.1687, "rewards/chosen": 1.6822394132614136, "rewards/margins": 4.3114906549453735, "rewards/rejected": -2.62925124168396, "step": 15940 }, { "epoch": 0.8449367927278509, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39230232.0, "logits/rejected": -26188532.57142857, "logps/chosen": -128.6651611328125, "logps/rejected": -407.6101771763393, "loss": 0.1852, "rewards/chosen": 0.22633972764015198, "rewards/margins": 2.4464458525180817, "rewards/rejected": -2.2201061248779297, "step": 15941 }, { "epoch": 0.8449897967296531, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2733620.0, "logits/rejected": 95765.02083333333, "logps/chosen": -196.62074279785156, "logps/rejected": -120.28946940104167, "loss": 0.1943, "rewards/chosen": 1.0750815868377686, "rewards/margins": 3.8052149613698325, "rewards/rejected": -2.730133374532064, "step": 15942 }, { "epoch": 0.8450428007314552, "grad_norm": 41.0, "kl": 2.4965457916259766, "learning_rate": 5e-07, "logits/chosen": -11198736.0, "logits/rejected": -30604040.0, "logps/chosen": -463.5494384765625, "logps/rejected": -292.209228515625, "loss": 0.3067, "rewards/chosen": 1.1308870315551758, "rewards/margins": 3.2682900428771973, "rewards/rejected": -2.1374030113220215, "step": 15943 }, { "epoch": 0.8450958047332574, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11903932.0, "logits/rejected": -30570520.0, "logps/chosen": -218.2928924560547, "logps/rejected": -305.1751708984375, "loss": 0.313, "rewards/chosen": 0.14645680785179138, "rewards/margins": 2.5977316796779633, "rewards/rejected": -2.451274871826172, "step": 15944 }, { "epoch": 0.8451488087350595, "grad_norm": 43.75, "kl": 0.35739994049072266, "learning_rate": 5e-07, "logits/chosen": 6164256.0, "logits/rejected": -18117280.0, "logps/chosen": -51.27447509765625, "logps/rejected": -349.7496337890625, "loss": 0.2509, "rewards/chosen": 0.37682509422302246, "rewards/margins": 2.3067777156829834, "rewards/rejected": -1.929952621459961, "step": 15945 }, { "epoch": 0.8452018127368617, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11222222.0, "logits/rejected": -12593326.0, "logps/chosen": -198.62579345703125, "logps/rejected": -106.08860778808594, "loss": 0.2991, "rewards/chosen": 0.21343690156936646, "rewards/margins": 2.696700394153595, "rewards/rejected": -2.4832634925842285, "step": 15946 }, { "epoch": 0.8452548167386638, "grad_norm": 52.75, "kl": 0.7579574584960938, "learning_rate": 5e-07, "logits/chosen": -33445066.0, "logits/rejected": -30961144.0, "logps/chosen": -470.81182861328125, "logps/rejected": -407.5349426269531, "loss": 0.2503, "rewards/chosen": 0.5048359036445618, "rewards/margins": 3.966584265232086, "rewards/rejected": -3.4617483615875244, "step": 15947 }, { "epoch": 0.845307820740466, "grad_norm": 43.75, "kl": 6.392799377441406, "learning_rate": 5e-07, "logits/chosen": -6934598.666666667, "logits/rejected": -32730632.0, "logps/chosen": -629.9814453125, "logps/rejected": -237.3765869140625, "loss": 0.4142, "rewards/chosen": 1.152916431427002, "rewards/margins": 2.6059141159057617, "rewards/rejected": -1.4529976844787598, "step": 15948 }, { "epoch": 0.845360824742268, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26857090.0, "logits/rejected": -17190892.0, "logps/chosen": -491.6893310546875, "logps/rejected": -644.830322265625, "loss": 0.1953, "rewards/chosen": 0.9194652438163757, "rewards/margins": 5.5921027064323425, "rewards/rejected": -4.672637462615967, "step": 15949 }, { "epoch": 0.8454138287440702, "grad_norm": 53.0, "kl": 4.3238677978515625, "learning_rate": 5e-07, "logits/chosen": -9827280.0, "logits/rejected": 57039272.0, "logps/chosen": -273.0803920200893, "logps/rejected": -37.823204040527344, "loss": 0.3409, "rewards/chosen": 1.312776701790946, "rewards/margins": 1.613802867276328, "rewards/rejected": -0.3010261654853821, "step": 15950 }, { "epoch": 0.8454668327458723, "grad_norm": 69.0, "kl": 0.7154731750488281, "learning_rate": 5e-07, "logits/chosen": 5521262.666666667, "logits/rejected": -29945878.4, "logps/chosen": -765.8076985677084, "logps/rejected": -218.058544921875, "loss": 0.2497, "rewards/chosen": 1.3131197293599446, "rewards/margins": 2.608797295888265, "rewards/rejected": -1.2956775665283202, "step": 15951 }, { "epoch": 0.8455198367476745, "grad_norm": 70.0, "kl": 2.100282669067383, "learning_rate": 5e-07, "logits/chosen": -88656376.0, "logits/rejected": 16312321.0, "logps/chosen": -403.7667236328125, "logps/rejected": -346.7044677734375, "loss": 0.3038, "rewards/chosen": 0.7146949768066406, "rewards/margins": 1.9688116312026978, "rewards/rejected": -1.2541166543960571, "step": 15952 }, { "epoch": 0.8455728407494766, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47828554.666666664, "logits/rejected": -8601507.2, "logps/chosen": -339.8039957682292, "logps/rejected": -552.222705078125, "loss": 0.2106, "rewards/chosen": 0.6572906176249186, "rewards/margins": 3.4446656862894693, "rewards/rejected": -2.787375068664551, "step": 15953 }, { "epoch": 0.8456258447512788, "grad_norm": 72.0, "kl": 3.0508766174316406, "learning_rate": 5e-07, "logits/chosen": -43776130.666666664, "logits/rejected": -89906240.0, "logps/chosen": -483.5301106770833, "logps/rejected": -673.64892578125, "loss": 0.3425, "rewards/chosen": 0.660681406656901, "rewards/margins": 3.5537768999735513, "rewards/rejected": -2.8930954933166504, "step": 15954 }, { "epoch": 0.8456788487530809, "grad_norm": 42.75, "kl": 2.0345840454101562, "learning_rate": 5e-07, "logits/chosen": -39571272.0, "logits/rejected": -85213113.6, "logps/chosen": -330.57354736328125, "logps/rejected": -431.073828125, "loss": 0.2089, "rewards/chosen": 1.2866663138071697, "rewards/margins": 3.6726442495981857, "rewards/rejected": -2.385977935791016, "step": 15955 }, { "epoch": 0.8457318527548829, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1502224.5, "logits/rejected": -23177477.333333332, "logps/chosen": -35.149803161621094, "logps/rejected": -432.0142008463542, "loss": 0.1796, "rewards/chosen": 0.5207393765449524, "rewards/margins": 3.401759604612986, "rewards/rejected": -2.8810202280680337, "step": 15956 }, { "epoch": 0.8457848567566851, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10484931.0, "logits/rejected": -37678092.0, "logps/chosen": -123.9439468383789, "logps/rejected": -257.9482421875, "loss": 0.3855, "rewards/chosen": -0.2877442240715027, "rewards/margins": 1.3254719376564026, "rewards/rejected": -1.6132161617279053, "step": 15957 }, { "epoch": 0.8458378607584872, "grad_norm": 30.25, "kl": 0.9651165008544922, "learning_rate": 5e-07, "logits/chosen": -8625360.0, "logits/rejected": -26820612.0, "logps/chosen": -134.47952270507812, "logps/rejected": -524.4443359375, "loss": 0.2617, "rewards/chosen": 0.4729558229446411, "rewards/margins": 3.8071800470352173, "rewards/rejected": -3.334224224090576, "step": 15958 }, { "epoch": 0.8458908647602894, "grad_norm": 44.25, "kl": 1.006317138671875, "learning_rate": 5e-07, "logits/chosen": -26795112.0, "logits/rejected": -56638588.0, "logps/chosen": -155.88690185546875, "logps/rejected": -354.2242431640625, "loss": 0.171, "rewards/chosen": 1.106143832206726, "rewards/margins": 3.9241641759872437, "rewards/rejected": -2.8180203437805176, "step": 15959 }, { "epoch": 0.8459438687620915, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12703143.0, "logits/rejected": -7519756.0, "logps/chosen": -209.96673583984375, "logps/rejected": -194.85030110677084, "loss": 0.1644, "rewards/chosen": 1.234229326248169, "rewards/margins": 3.6060074965159097, "rewards/rejected": -2.3717781702677407, "step": 15960 }, { "epoch": 0.8459968727638937, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60823093.333333336, "logits/rejected": -18040440.0, "logps/chosen": -218.60713704427084, "logps/rejected": -282.7255615234375, "loss": 0.2751, "rewards/chosen": -0.11664136250813802, "rewards/margins": 1.8960299173990884, "rewards/rejected": -2.0126712799072264, "step": 15961 }, { "epoch": 0.8460498767656958, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56141044.0, "logits/rejected": -42342124.0, "logps/chosen": -451.4697265625, "logps/rejected": -452.6129150390625, "loss": 0.2444, "rewards/chosen": 0.8886989951133728, "rewards/margins": 3.7010547518730164, "rewards/rejected": -2.8123557567596436, "step": 15962 }, { "epoch": 0.846102880767498, "grad_norm": 54.0, "kl": 0.00742340087890625, "learning_rate": 5e-07, "logits/chosen": -41272120.0, "logits/rejected": -24795860.0, "logps/chosen": -747.9357299804688, "logps/rejected": -428.57470703125, "loss": 0.2227, "rewards/chosen": 0.5134811401367188, "rewards/margins": 4.560116291046143, "rewards/rejected": -4.046635150909424, "step": 15963 }, { "epoch": 0.8461558847693, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29303453.333333332, "logits/rejected": -15841224.0, "logps/chosen": -207.32889811197916, "logps/rejected": -259.731787109375, "loss": 0.1892, "rewards/chosen": 1.5676941871643066, "rewards/margins": 3.5468358039855956, "rewards/rejected": -1.979141616821289, "step": 15964 }, { "epoch": 0.8462088887711022, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35428312.0, "logits/rejected": -24002774.0, "logps/chosen": -288.41729736328125, "logps/rejected": -193.45474243164062, "loss": 0.3024, "rewards/chosen": 0.14434757828712463, "rewards/margins": 2.1286506950855255, "rewards/rejected": -1.9843031167984009, "step": 15965 }, { "epoch": 0.8462618927729043, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24233832.0, "logits/rejected": -45594517.333333336, "logps/chosen": -188.82736206054688, "logps/rejected": -265.3534342447917, "loss": 0.1568, "rewards/chosen": 0.22254866361618042, "rewards/margins": 3.564384639263153, "rewards/rejected": -3.3418359756469727, "step": 15966 }, { "epoch": 0.8463148967747065, "grad_norm": 38.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 33884.666666666664, "logits/rejected": -21668528.0, "logps/chosen": -155.55716959635416, "logps/rejected": -447.248046875, "loss": 0.2078, "rewards/chosen": 0.888953447341919, "rewards/margins": 3.0730404376983644, "rewards/rejected": -2.1840869903564455, "step": 15967 }, { "epoch": 0.8463679007765086, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57020249.6, "logits/rejected": -67152389.33333333, "logps/chosen": -477.655029296875, "logps/rejected": -250.6009521484375, "loss": 0.3388, "rewards/chosen": 0.4730255126953125, "rewards/margins": 1.7750024318695068, "rewards/rejected": -1.3019769191741943, "step": 15968 }, { "epoch": 0.8464209047783108, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37058896.0, "logits/rejected": -44607426.666666664, "logps/chosen": -430.7902526855469, "logps/rejected": -304.90814208984375, "loss": 0.2185, "rewards/chosen": 0.9695900082588196, "rewards/margins": 3.1007054448127747, "rewards/rejected": -2.131115436553955, "step": 15969 }, { "epoch": 0.8464739087801129, "grad_norm": 53.75, "kl": 5.833453178405762, "learning_rate": 5e-07, "logits/chosen": -17337656.0, "logits/rejected": 485401.6666666667, "logps/chosen": -383.1895263671875, "logps/rejected": -77.16843668619792, "loss": 0.3352, "rewards/chosen": 1.6365116119384766, "rewards/margins": 1.8317484537760418, "rewards/rejected": -0.1952368418375651, "step": 15970 }, { "epoch": 0.846526912781915, "grad_norm": 60.0, "kl": 4.376045227050781, "learning_rate": 5e-07, "logits/chosen": -39020061.333333336, "logits/rejected": -6674939.2, "logps/chosen": -1045.701171875, "logps/rejected": -239.4362548828125, "loss": 0.1158, "rewards/chosen": 2.744370460510254, "rewards/margins": 5.94008617401123, "rewards/rejected": -3.1957157135009764, "step": 15971 }, { "epoch": 0.8465799167837171, "grad_norm": 35.75, "kl": 1.0138320922851562, "learning_rate": 5e-07, "logits/chosen": -4366990.0, "logits/rejected": -52013676.8, "logps/chosen": -177.21417236328125, "logps/rejected": -412.17021484375, "loss": 0.2068, "rewards/chosen": 0.49469153086344403, "rewards/margins": 3.3533707300821938, "rewards/rejected": -2.85867919921875, "step": 15972 }, { "epoch": 0.8466329207855193, "grad_norm": 46.5, "kl": 0.14414215087890625, "learning_rate": 5e-07, "logits/chosen": -92182600.0, "logits/rejected": -27352976.0, "logps/chosen": -228.80609130859375, "logps/rejected": -304.8350524902344, "loss": 0.205, "rewards/chosen": 1.1871708631515503, "rewards/margins": 2.9766346216201782, "rewards/rejected": -1.789463758468628, "step": 15973 }, { "epoch": 0.8466859247873214, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13933954.0, "logits/rejected": -45924368.0, "logps/chosen": -206.35409545898438, "logps/rejected": -348.59075927734375, "loss": 0.1769, "rewards/chosen": 1.2406468391418457, "rewards/margins": 4.1414947509765625, "rewards/rejected": -2.900847911834717, "step": 15974 }, { "epoch": 0.8467389287891236, "grad_norm": 62.5, "kl": 4.3624114990234375, "learning_rate": 5e-07, "logits/chosen": -11390151.2, "logits/rejected": -12482464.0, "logps/chosen": -153.13194580078124, "logps/rejected": -251.3484090169271, "loss": 0.3694, "rewards/chosen": 0.6362168788909912, "rewards/margins": 2.0746945540110273, "rewards/rejected": -1.4384776751200359, "step": 15975 }, { "epoch": 0.8467919327909257, "grad_norm": 50.0, "kl": 1.4046382904052734, "learning_rate": 5e-07, "logits/chosen": -45175008.0, "logits/rejected": -76567536.0, "logps/chosen": -269.1864990234375, "logps/rejected": -618.7646484375, "loss": 0.3572, "rewards/chosen": 0.0058567821979522705, "rewards/margins": 3.373384108146032, "rewards/rejected": -3.3675273259480796, "step": 15976 }, { "epoch": 0.8468449367927279, "grad_norm": 56.0, "kl": 1.3159332275390625, "learning_rate": 5e-07, "logits/chosen": -38870237.333333336, "logits/rejected": -88142912.0, "logps/chosen": -330.9979248046875, "logps/rejected": -561.0813598632812, "loss": 0.3047, "rewards/chosen": 0.571306308110555, "rewards/margins": 3.825632174809774, "rewards/rejected": -3.2543258666992188, "step": 15977 }, { "epoch": 0.84689794079453, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19178810.0, "logits/rejected": -63170048.0, "logps/chosen": -241.64047241210938, "logps/rejected": -397.9101257324219, "loss": 0.3001, "rewards/chosen": 0.1835928112268448, "rewards/margins": 2.1908388286828995, "rewards/rejected": -2.0072460174560547, "step": 15978 }, { "epoch": 0.8469509447963322, "grad_norm": 53.0, "kl": 2.2538509368896484, "learning_rate": 5e-07, "logits/chosen": -32852918.0, "logits/rejected": 4609941.0, "logps/chosen": -243.12294006347656, "logps/rejected": -404.24267578125, "loss": 0.2731, "rewards/chosen": 0.6965140104293823, "rewards/margins": 2.5001074075698853, "rewards/rejected": -1.803593397140503, "step": 15979 }, { "epoch": 0.8470039487981342, "grad_norm": 37.5, "kl": 2.8739242553710938, "learning_rate": 5e-07, "logits/chosen": -13408192.0, "logits/rejected": -22524457.6, "logps/chosen": -487.609130859375, "logps/rejected": -404.968408203125, "loss": 0.1629, "rewards/chosen": 1.4478871027628581, "rewards/margins": 5.249475924173991, "rewards/rejected": -3.8015888214111326, "step": 15980 }, { "epoch": 0.8470569527999364, "grad_norm": 47.25, "kl": 1.3895492553710938, "learning_rate": 5e-07, "logits/chosen": -23533118.0, "logits/rejected": -21163444.0, "logps/chosen": -415.677734375, "logps/rejected": -791.767333984375, "loss": 0.3095, "rewards/chosen": -0.010967962443828583, "rewards/margins": 4.418338067829609, "rewards/rejected": -4.4293060302734375, "step": 15981 }, { "epoch": 0.8471099568017385, "grad_norm": 49.5, "kl": 0.2993793487548828, "learning_rate": 5e-07, "logits/chosen": -39072105.6, "logits/rejected": -106657973.33333333, "logps/chosen": -283.0115478515625, "logps/rejected": -333.9604899088542, "loss": 0.2874, "rewards/chosen": 0.4545924186706543, "rewards/margins": 3.6261895815531413, "rewards/rejected": -3.171597162882487, "step": 15982 }, { "epoch": 0.8471629608035407, "grad_norm": 45.5, "kl": 3.2018165588378906, "learning_rate": 5e-07, "logits/chosen": -47979808.0, "logits/rejected": -18890784.0, "logps/chosen": -323.7069091796875, "logps/rejected": -354.33294677734375, "loss": 0.324, "rewards/chosen": 0.26686519384384155, "rewards/margins": 2.431783378124237, "rewards/rejected": -2.1649181842803955, "step": 15983 }, { "epoch": 0.8472159648053428, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13021619.0, "logits/rejected": -17124852.0, "logps/chosen": -431.42236328125, "logps/rejected": -216.11578369140625, "loss": 0.1865, "rewards/chosen": 1.1994941234588623, "rewards/margins": 4.533733129501343, "rewards/rejected": -3.3342390060424805, "step": 15984 }, { "epoch": 0.847268968807145, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67344970.66666667, "logits/rejected": -15342252.8, "logps/chosen": -257.61643473307294, "logps/rejected": -236.8360107421875, "loss": 0.2273, "rewards/chosen": -0.0018369754155476887, "rewards/margins": 3.309483138720194, "rewards/rejected": -3.311320114135742, "step": 15985 }, { "epoch": 0.847321972808947, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15533294.4, "logits/rejected": -29030229.333333332, "logps/chosen": -336.4308837890625, "logps/rejected": -338.26780192057294, "loss": 0.2154, "rewards/chosen": 0.9842979431152343, "rewards/margins": 3.3553290685017902, "rewards/rejected": -2.371031125386556, "step": 15986 }, { "epoch": 0.8473749768107492, "grad_norm": 136.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9508698.666666666, "logits/rejected": 12532588.8, "logps/chosen": -625.6424967447916, "logps/rejected": -292.662841796875, "loss": 0.3795, "rewards/chosen": -0.30833741029103595, "rewards/margins": 0.8239310185114543, "rewards/rejected": -1.1322684288024902, "step": 15987 }, { "epoch": 0.8474279808125513, "grad_norm": 45.75, "kl": 5.006924629211426, "learning_rate": 5e-07, "logits/chosen": -14969302.666666666, "logits/rejected": -7115269.0, "logps/chosen": -502.0403645833333, "logps/rejected": -300.4833679199219, "loss": 0.2941, "rewards/chosen": 1.537051518758138, "rewards/margins": 4.114764293034871, "rewards/rejected": -2.5777127742767334, "step": 15988 }, { "epoch": 0.8474809848143535, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19115536.0, "logits/rejected": -18725154.0, "logps/chosen": -213.7189178466797, "logps/rejected": -262.3348388671875, "loss": 0.3103, "rewards/chosen": 0.24864214658737183, "rewards/margins": 2.408125340938568, "rewards/rejected": -2.1594831943511963, "step": 15989 }, { "epoch": 0.8475339888161556, "grad_norm": 38.25, "kl": 0.09190082550048828, "learning_rate": 5e-07, "logits/chosen": -14925236.0, "logits/rejected": -29678742.0, "logps/chosen": -259.73944091796875, "logps/rejected": -244.80056762695312, "loss": 0.2328, "rewards/chosen": 0.5965598225593567, "rewards/margins": 3.413863956928253, "rewards/rejected": -2.8173041343688965, "step": 15990 }, { "epoch": 0.8475869928179578, "grad_norm": 54.0, "kl": 0.23710250854492188, "learning_rate": 5e-07, "logits/chosen": -21189849.333333332, "logits/rejected": 8294163.2, "logps/chosen": -210.26236979166666, "logps/rejected": -140.727880859375, "loss": 0.3146, "rewards/chosen": 0.41986846923828125, "rewards/margins": 1.5104072570800782, "rewards/rejected": -1.090538787841797, "step": 15991 }, { "epoch": 0.8476399968197599, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3118468.0, "logits/rejected": -17973891.2, "logps/chosen": -411.8428548177083, "logps/rejected": -89.9019287109375, "loss": 0.2801, "rewards/chosen": 0.32941486438115436, "rewards/margins": 2.527542038758596, "rewards/rejected": -2.1981271743774413, "step": 15992 }, { "epoch": 0.8476930008215621, "grad_norm": 46.75, "kl": 3.4793949127197266, "learning_rate": 5e-07, "logits/chosen": -3346581.3333333335, "logits/rejected": -190612592.0, "logps/chosen": -273.23333740234375, "logps/rejected": -399.551025390625, "loss": 0.3672, "rewards/chosen": 0.5256469249725342, "rewards/margins": 4.576379060745239, "rewards/rejected": -4.050732135772705, "step": 15993 }, { "epoch": 0.8477460048233642, "grad_norm": 64.5, "kl": 3.12076473236084, "learning_rate": 5e-07, "logits/chosen": -54082704.0, "logits/rejected": 3242200.5, "logps/chosen": -197.6908976236979, "logps/rejected": -140.23321533203125, "loss": 0.4341, "rewards/chosen": 0.2920754353205363, "rewards/margins": 0.8683892289797466, "rewards/rejected": -0.5763137936592102, "step": 15994 }, { "epoch": 0.8477990088251663, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77176048.0, "logits/rejected": -32477058.666666668, "logps/chosen": -646.1318359375, "logps/rejected": -372.7412516276042, "loss": 0.213, "rewards/chosen": 0.7747436761856079, "rewards/margins": 3.1802765130996704, "rewards/rejected": -2.4055328369140625, "step": 15995 }, { "epoch": 0.8478520128269684, "grad_norm": 28.375, "kl": 1.3698530197143555, "learning_rate": 5e-07, "logits/chosen": -13366508.0, "logits/rejected": -11805878.0, "logps/chosen": -150.58242797851562, "logps/rejected": -381.62884521484375, "loss": 0.2787, "rewards/chosen": 0.16925688087940216, "rewards/margins": 3.716617777943611, "rewards/rejected": -3.547360897064209, "step": 15996 }, { "epoch": 0.8479050168287706, "grad_norm": 68.0, "kl": 0.1626434326171875, "learning_rate": 5e-07, "logits/chosen": -86177000.0, "logits/rejected": -63403424.0, "logps/chosen": -107.6126480102539, "logps/rejected": -385.650146484375, "loss": 0.2451, "rewards/chosen": 0.3313213586807251, "rewards/margins": 2.0422076781590777, "rewards/rejected": -1.7108863194783528, "step": 15997 }, { "epoch": 0.8479580208305727, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78249408.0, "logits/rejected": -6124052.666666667, "logps/chosen": -355.6746520996094, "logps/rejected": -235.42301432291666, "loss": 0.221, "rewards/chosen": -0.41414642333984375, "rewards/margins": 3.009341557820638, "rewards/rejected": -3.423487981160482, "step": 15998 }, { "epoch": 0.8480110248323749, "grad_norm": 47.5, "kl": 2.449472427368164, "learning_rate": 5e-07, "logits/chosen": -33393475.2, "logits/rejected": -31006008.0, "logps/chosen": -224.30087890625, "logps/rejected": -283.91188557942706, "loss": 0.3628, "rewards/chosen": 0.13273407220840455, "rewards/margins": 2.7978530844052636, "rewards/rejected": -2.665119012196859, "step": 15999 }, { "epoch": 0.848064028834177, "grad_norm": 43.25, "kl": 0.5184555053710938, "learning_rate": 5e-07, "logits/chosen": -38840140.8, "logits/rejected": -7022924.0, "logps/chosen": -585.129296875, "logps/rejected": -321.85052490234375, "loss": 0.2456, "rewards/chosen": 1.0035218238830566, "rewards/margins": 4.814413611094157, "rewards/rejected": -3.8108917872111, "step": 16000 }, { "epoch": 0.8481170328359792, "grad_norm": 62.25, "kl": 3.434459686279297, "learning_rate": 5e-07, "logits/chosen": -17602750.4, "logits/rejected": -3914460.3333333335, "logps/chosen": -337.76064453125, "logps/rejected": -134.53754679361978, "loss": 0.2428, "rewards/chosen": 1.3953365325927733, "rewards/margins": 3.1185986836751303, "rewards/rejected": -1.7232621510823567, "step": 16001 }, { "epoch": 0.8481700368377812, "grad_norm": 39.5, "kl": 1.9232616424560547, "learning_rate": 5e-07, "logits/chosen": -39014240.0, "logits/rejected": -5934672.0, "logps/chosen": -738.717041015625, "logps/rejected": -183.38124084472656, "loss": 0.2062, "rewards/chosen": 1.1516993045806885, "rewards/margins": 4.797261953353882, "rewards/rejected": -3.6455626487731934, "step": 16002 }, { "epoch": 0.8482230408395834, "grad_norm": 50.0, "kl": 0.9280891418457031, "learning_rate": 5e-07, "logits/chosen": -13239178.666666666, "logits/rejected": 134403296.0, "logps/chosen": -454.8807779947917, "logps/rejected": -273.1015625, "loss": 0.2723, "rewards/chosen": 1.0450913111368816, "rewards/margins": 2.775786677996318, "rewards/rejected": -1.730695366859436, "step": 16003 }, { "epoch": 0.8482760448413855, "grad_norm": 29.875, "kl": 5.625938415527344, "learning_rate": 5e-07, "logits/chosen": -10084709.0, "logits/rejected": -44468808.0, "logps/chosen": -157.2095489501953, "logps/rejected": -221.38052368164062, "loss": 0.2495, "rewards/chosen": 0.961472749710083, "rewards/margins": 3.2400574684143066, "rewards/rejected": -2.2785847187042236, "step": 16004 }, { "epoch": 0.8483290488431876, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54387609.6, "logits/rejected": 7190145.333333333, "logps/chosen": -310.035888671875, "logps/rejected": -442.4952392578125, "loss": 0.3186, "rewards/chosen": 0.18476732969284057, "rewards/margins": 2.5420412262280783, "rewards/rejected": -2.357273896535238, "step": 16005 }, { "epoch": 0.8483820528449898, "grad_norm": 30.625, "kl": 3.74639892578125, "learning_rate": 5e-07, "logits/chosen": -4988145.2, "logits/rejected": -46150346.666666664, "logps/chosen": -157.1064208984375, "logps/rejected": -199.77970377604166, "loss": 0.3631, "rewards/chosen": 0.35239391326904296, "rewards/margins": 2.7345674514770506, "rewards/rejected": -2.382173538208008, "step": 16006 }, { "epoch": 0.8484350568467919, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2245490.0, "logits/rejected": -7004960.0, "logps/chosen": -166.3297119140625, "logps/rejected": -282.3246663411458, "loss": 0.1551, "rewards/chosen": 1.253268837928772, "rewards/margins": 3.5156465768814087, "rewards/rejected": -2.2623777389526367, "step": 16007 }, { "epoch": 0.8484880608485941, "grad_norm": 44.75, "kl": 0.7758636474609375, "learning_rate": 5e-07, "logits/chosen": -34118357.333333336, "logits/rejected": -78726745.6, "logps/chosen": -330.2305908203125, "logps/rejected": -397.74716796875, "loss": 0.2149, "rewards/chosen": 0.533234159151713, "rewards/margins": 3.8007523934046428, "rewards/rejected": -3.26751823425293, "step": 16008 }, { "epoch": 0.8485410648503962, "grad_norm": 23.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34790760.0, "logits/rejected": -20869842.285714287, "logps/chosen": -352.2217102050781, "logps/rejected": -251.02978515625, "loss": 0.0616, "rewards/chosen": 2.553964376449585, "rewards/margins": 5.481595482145037, "rewards/rejected": -2.927631105695452, "step": 16009 }, { "epoch": 0.8485940688521983, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2358474.6666666665, "logits/rejected": -97008876.8, "logps/chosen": -41.83516947428385, "logps/rejected": -466.28037109375, "loss": 0.3047, "rewards/chosen": -0.34458422660827637, "rewards/margins": 1.713171625137329, "rewards/rejected": -2.0577558517456054, "step": 16010 }, { "epoch": 0.8486470728540004, "grad_norm": 96.5, "kl": 0.5943031311035156, "learning_rate": 5e-07, "logits/chosen": -21617702.666666668, "logits/rejected": -15925379.2, "logps/chosen": -230.5687052408854, "logps/rejected": -460.361376953125, "loss": 0.2795, "rewards/chosen": 0.6153644720713297, "rewards/margins": 2.4419492880503335, "rewards/rejected": -1.826584815979004, "step": 16011 }, { "epoch": 0.8487000768558026, "grad_norm": 49.5, "kl": 0.23989105224609375, "learning_rate": 5e-07, "logits/chosen": -25578509.333333332, "logits/rejected": -63307699.2, "logps/chosen": -205.68599446614584, "logps/rejected": -142.65185546875, "loss": 0.3645, "rewards/chosen": 0.437572439511617, "rewards/margins": 1.301484735806783, "rewards/rejected": -0.863912296295166, "step": 16012 }, { "epoch": 0.8487530808576047, "grad_norm": 41.75, "kl": 2.2001304626464844, "learning_rate": 5e-07, "logits/chosen": -29520780.8, "logits/rejected": -14242509.333333334, "logps/chosen": -596.894287109375, "logps/rejected": -329.5791015625, "loss": 0.3142, "rewards/chosen": 0.8114119529724121, "rewards/margins": 4.1394371350606285, "rewards/rejected": -3.3280251820882163, "step": 16013 }, { "epoch": 0.8488060848594069, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66104565.333333336, "logits/rejected": -56171539.2, "logps/chosen": -312.68931070963544, "logps/rejected": -499.9302734375, "loss": 0.2583, "rewards/chosen": -0.34162290891011554, "rewards/margins": 2.7226513067881264, "rewards/rejected": -3.064274215698242, "step": 16014 }, { "epoch": 0.848859088861209, "grad_norm": 41.25, "kl": 1.0316495895385742, "learning_rate": 5e-07, "logits/chosen": -25494589.333333332, "logits/rejected": -1354380.0, "logps/chosen": -229.50956217447916, "logps/rejected": -168.92886352539062, "loss": 0.2196, "rewards/chosen": 1.096823771794637, "rewards/margins": 5.502852996190389, "rewards/rejected": -4.406029224395752, "step": 16015 }, { "epoch": 0.8489120928630112, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25348208.0, "logits/rejected": -27050421.333333332, "logps/chosen": -323.7712097167969, "logps/rejected": -248.3582560221354, "loss": 0.1408, "rewards/chosen": 0.7890545129776001, "rewards/margins": 3.557799299558004, "rewards/rejected": -2.768744786580404, "step": 16016 }, { "epoch": 0.8489650968648133, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80806570.66666667, "logits/rejected": -11580176.0, "logps/chosen": -370.0547688802083, "logps/rejected": -160.81583251953126, "loss": 0.2617, "rewards/chosen": 0.03720754384994507, "rewards/margins": 2.9137019515037537, "rewards/rejected": -2.8764944076538086, "step": 16017 }, { "epoch": 0.8490181008666154, "grad_norm": 43.0, "kl": 0.7088241577148438, "learning_rate": 5e-07, "logits/chosen": -29126868.0, "logits/rejected": 185350.5, "logps/chosen": -271.5531005859375, "logps/rejected": -292.3408203125, "loss": 0.1993, "rewards/chosen": 1.2525994777679443, "rewards/margins": 4.03995943069458, "rewards/rejected": -2.7873599529266357, "step": 16018 }, { "epoch": 0.8490711048684175, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45074374.4, "logits/rejected": 15475296.0, "logps/chosen": -126.972412109375, "logps/rejected": -199.1073201497396, "loss": 0.4141, "rewards/chosen": 0.08729865550994872, "rewards/margins": 1.08437127272288, "rewards/rejected": -0.9970726172129313, "step": 16019 }, { "epoch": 0.8491241088702197, "grad_norm": 37.75, "kl": 1.7287235260009766, "learning_rate": 5e-07, "logits/chosen": -25122334.0, "logits/rejected": -21495468.0, "logps/chosen": -259.5417785644531, "logps/rejected": -341.3494466145833, "loss": 0.1711, "rewards/chosen": 0.32159414887428284, "rewards/margins": 3.321329345305761, "rewards/rejected": -2.999735196431478, "step": 16020 }, { "epoch": 0.8491771128720218, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31222469.333333332, "logits/rejected": 15205134.4, "logps/chosen": -167.45491536458334, "logps/rejected": -476.5732421875, "loss": 0.2702, "rewards/chosen": 0.31781208515167236, "rewards/margins": 3.152999234199524, "rewards/rejected": -2.8351871490478517, "step": 16021 }, { "epoch": 0.849230116873824, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -12056748.0, "logps/rejected": -329.0405578613281, "loss": 0.1524, "rewards/rejected": -2.482788562774658, "step": 16022 }, { "epoch": 0.8492831208756261, "grad_norm": 39.5, "kl": 3.2808876037597656, "learning_rate": 5e-07, "logits/chosen": -47730886.4, "logits/rejected": -38339306.666666664, "logps/chosen": -231.591748046875, "logps/rejected": -470.2923990885417, "loss": 0.308, "rewards/chosen": 0.40183157920837403, "rewards/margins": 4.297423315048218, "rewards/rejected": -3.8955917358398438, "step": 16023 }, { "epoch": 0.8493361248774283, "grad_norm": 53.25, "kl": 3.6282615661621094, "learning_rate": 5e-07, "logits/chosen": -35252704.0, "logits/rejected": -31634254.0, "logps/chosen": -473.4516906738281, "logps/rejected": -330.382568359375, "loss": 0.2304, "rewards/chosen": 1.3444328308105469, "rewards/margins": 3.730307102203369, "rewards/rejected": -2.3858742713928223, "step": 16024 }, { "epoch": 0.8493891288792303, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14912089.6, "logits/rejected": -22403562.666666668, "logps/chosen": -406.706689453125, "logps/rejected": -384.2902425130208, "loss": 0.1826, "rewards/chosen": 1.3257256507873536, "rewards/margins": 3.6947249730428062, "rewards/rejected": -2.3689993222554526, "step": 16025 }, { "epoch": 0.8494421328810325, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58651096.0, "logits/rejected": -8298398.666666667, "logps/chosen": -219.7301025390625, "logps/rejected": -99.40256754557292, "loss": 0.176, "rewards/chosen": 0.4405220150947571, "rewards/margins": 2.736911118030548, "rewards/rejected": -2.296389102935791, "step": 16026 }, { "epoch": 0.8494951368828346, "grad_norm": 47.5, "kl": 2.7413463592529297, "learning_rate": 5e-07, "logits/chosen": -53324796.8, "logits/rejected": -1259121.3333333333, "logps/chosen": -268.767822265625, "logps/rejected": -286.12518310546875, "loss": 0.3533, "rewards/chosen": 0.4210216999053955, "rewards/margins": 3.0331202348073325, "rewards/rejected": -2.612098534901937, "step": 16027 }, { "epoch": 0.8495481408846368, "grad_norm": 47.0, "kl": 1.820547103881836, "learning_rate": 5e-07, "logits/chosen": -32906732.8, "logits/rejected": -12581808.0, "logps/chosen": -464.79736328125, "logps/rejected": -317.3840738932292, "loss": 0.2529, "rewards/chosen": 0.9171030044555664, "rewards/margins": 3.5185358047485353, "rewards/rejected": -2.6014328002929688, "step": 16028 }, { "epoch": 0.8496011448864389, "grad_norm": 48.5, "kl": 1.2699518203735352, "learning_rate": 5e-07, "logits/chosen": -19344756.57142857, "logits/rejected": 1381769.25, "logps/chosen": -250.45371791294642, "logps/rejected": -169.31814575195312, "loss": 0.2744, "rewards/chosen": 1.1235406058175224, "rewards/margins": 7.520472254071917, "rewards/rejected": -6.3969316482543945, "step": 16029 }, { "epoch": 0.8496541488882411, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2630952.0, "logits/rejected": -10692566.0, "logps/chosen": -256.606201171875, "logps/rejected": -426.5416259765625, "loss": 0.2417, "rewards/chosen": 1.0965049266815186, "rewards/margins": 2.865945339202881, "rewards/rejected": -1.7694404125213623, "step": 16030 }, { "epoch": 0.8497071528900432, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -97115216.0, "logits/rejected": -9470244.0, "logps/chosen": -404.27984619140625, "logps/rejected": -329.05792236328125, "loss": 0.1671, "rewards/chosen": 0.26761019229888916, "rewards/margins": 3.3687413136164346, "rewards/rejected": -3.1011311213175454, "step": 16031 }, { "epoch": 0.8497601568918454, "grad_norm": 41.0, "kl": 4.264852523803711, "learning_rate": 5e-07, "logits/chosen": -23327881.6, "logits/rejected": -13890341.333333334, "logps/chosen": -343.53310546875, "logps/rejected": -197.3748779296875, "loss": 0.3434, "rewards/chosen": 0.6749428749084473, "rewards/margins": 3.2325926462809242, "rewards/rejected": -2.557649771372477, "step": 16032 }, { "epoch": 0.8498131608936474, "grad_norm": 71.5, "kl": 2.8400630950927734, "learning_rate": 5e-07, "logits/chosen": -5620851.428571428, "logits/rejected": 49431688.0, "logps/chosen": -415.04275948660717, "logps/rejected": -372.82135009765625, "loss": 0.4529, "rewards/chosen": 0.4149975436074393, "rewards/margins": 1.0515331881386893, "rewards/rejected": -0.63653564453125, "step": 16033 }, { "epoch": 0.8498661648954496, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 315481984.0, "logits/rejected": 4502964.0, "logps/chosen": -395.9217529296875, "logps/rejected": -291.05495198567706, "loss": 0.2469, "rewards/chosen": 0.11265411227941513, "rewards/margins": 1.9617109919587772, "rewards/rejected": -1.849056879679362, "step": 16034 }, { "epoch": 0.8499191688972517, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91374112.0, "logits/rejected": -33020040.0, "logps/chosen": -232.1513671875, "logps/rejected": -207.658447265625, "loss": 0.1846, "rewards/chosen": 0.5113365054130554, "rewards/margins": 3.0056904355684915, "rewards/rejected": -2.494353930155436, "step": 16035 }, { "epoch": 0.8499721728990539, "grad_norm": 43.75, "kl": 1.5114269256591797, "learning_rate": 5e-07, "logits/chosen": -19583884.0, "logits/rejected": 4461983.0, "logps/chosen": -424.36016845703125, "logps/rejected": -176.6611328125, "loss": 0.23, "rewards/chosen": 0.8019814491271973, "rewards/margins": 3.6189780235290527, "rewards/rejected": -2.8169965744018555, "step": 16036 }, { "epoch": 0.850025176900856, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4580509.0, "logits/rejected": -23274077.333333332, "logps/chosen": -133.7534942626953, "logps/rejected": -246.24906412760416, "loss": 0.1965, "rewards/chosen": -0.03970155864953995, "rewards/margins": 3.4812678322196007, "rewards/rejected": -3.5209693908691406, "step": 16037 }, { "epoch": 0.8500781809026582, "grad_norm": 49.25, "kl": 0.7916412353515625, "learning_rate": 5e-07, "logits/chosen": -15102217.6, "logits/rejected": -24911482.666666668, "logps/chosen": -256.3869873046875, "logps/rejected": -326.26552327473956, "loss": 0.2559, "rewards/chosen": 0.7896450519561767, "rewards/margins": 2.5758396943410236, "rewards/rejected": -1.786194642384847, "step": 16038 }, { "epoch": 0.8501311849044603, "grad_norm": 45.75, "kl": 1.8067913055419922, "learning_rate": 5e-07, "logits/chosen": -44428826.666666664, "logits/rejected": -48544566.4, "logps/chosen": -263.9696451822917, "logps/rejected": -498.8033203125, "loss": 0.2103, "rewards/chosen": 0.8778461615244547, "rewards/margins": 3.4539014975229896, "rewards/rejected": -2.576055335998535, "step": 16039 }, { "epoch": 0.8501841889062625, "grad_norm": 40.0, "kl": 2.282665252685547, "learning_rate": 5e-07, "logits/chosen": -62654648.0, "logits/rejected": -20812294.0, "logps/chosen": -442.477783203125, "logps/rejected": -308.3774719238281, "loss": 0.2879, "rewards/chosen": 0.4305027425289154, "rewards/margins": 3.324223607778549, "rewards/rejected": -2.893720865249634, "step": 16040 }, { "epoch": 0.8502371929080645, "grad_norm": 50.25, "kl": 3.706662178039551, "learning_rate": 5e-07, "logits/chosen": -27335850.0, "logits/rejected": -7253487.0, "logps/chosen": -375.2051696777344, "logps/rejected": -105.27001190185547, "loss": 0.2815, "rewards/chosen": 0.7244033217430115, "rewards/margins": 3.6605936884880066, "rewards/rejected": -2.936190366744995, "step": 16041 }, { "epoch": 0.8502901969098667, "grad_norm": 46.5, "kl": 0.07276153564453125, "learning_rate": 5e-07, "logits/chosen": -5887077.5, "logits/rejected": -12397384.0, "logps/chosen": -201.43133544921875, "logps/rejected": -272.19390869140625, "loss": 0.3711, "rewards/chosen": 0.061622053384780884, "rewards/margins": 1.54172483086586, "rewards/rejected": -1.480102777481079, "step": 16042 }, { "epoch": 0.8503432009116688, "grad_norm": 54.25, "kl": 0.8384475708007812, "learning_rate": 5e-07, "logits/chosen": -39457651.2, "logits/rejected": 9388650.666666666, "logps/chosen": -466.80966796875, "logps/rejected": -353.9028727213542, "loss": 0.2983, "rewards/chosen": 1.124175453186035, "rewards/margins": 2.1246275742848715, "rewards/rejected": -1.0004521210988362, "step": 16043 }, { "epoch": 0.850396204913471, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34023996.0, "logits/rejected": -4779950.666666667, "logps/chosen": -508.05328369140625, "logps/rejected": -250.4629109700521, "loss": 0.2061, "rewards/chosen": 0.3928283751010895, "rewards/margins": 2.441903978586197, "rewards/rejected": -2.0490756034851074, "step": 16044 }, { "epoch": 0.8504492089152731, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": 6895402.0, "logps/rejected": -337.1968994140625, "loss": 0.1448, "rewards/rejected": -2.5227131843566895, "step": 16045 }, { "epoch": 0.8505022129170753, "grad_norm": 37.25, "kl": 0.2749156951904297, "learning_rate": 5e-07, "logits/chosen": -6350287.333333333, "logits/rejected": -16060470.4, "logps/chosen": -189.77840169270834, "logps/rejected": -244.3536865234375, "loss": 0.1243, "rewards/chosen": 1.529484748840332, "rewards/margins": 4.5252632141113285, "rewards/rejected": -2.995778465270996, "step": 16046 }, { "epoch": 0.8505552169188774, "grad_norm": 36.5, "kl": 2.8844833374023438, "learning_rate": 5e-07, "logits/chosen": -2995267.6, "logits/rejected": -23980858.666666668, "logps/chosen": -159.76416015625, "logps/rejected": -280.9791666666667, "loss": 0.3188, "rewards/chosen": 0.6400012969970703, "rewards/margins": 3.87812074025472, "rewards/rejected": -3.23811944325765, "step": 16047 }, { "epoch": 0.8506082209206796, "grad_norm": 52.75, "kl": 0.816227912902832, "learning_rate": 5e-07, "logits/chosen": -18300700.0, "logits/rejected": -10036411.0, "logps/chosen": -345.1056315104167, "logps/rejected": -490.3829345703125, "loss": 0.33, "rewards/chosen": 0.5190020402272543, "rewards/margins": 3.4340670903523765, "rewards/rejected": -2.915065050125122, "step": 16048 }, { "epoch": 0.8506612249224816, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20786457.6, "logits/rejected": -79739317.33333333, "logps/chosen": -338.831689453125, "logps/rejected": -401.127197265625, "loss": 0.2843, "rewards/chosen": 0.3054342746734619, "rewards/margins": 3.976992336908976, "rewards/rejected": -3.671558062235514, "step": 16049 }, { "epoch": 0.8507142289242838, "grad_norm": 67.0, "kl": 0.46954345703125, "learning_rate": 5e-07, "logits/chosen": -32013080.0, "logits/rejected": -17204036.8, "logps/chosen": -176.9452921549479, "logps/rejected": -232.816162109375, "loss": 0.3162, "rewards/chosen": -0.1447214682896932, "rewards/margins": 1.4901828209559123, "rewards/rejected": -1.6349042892456054, "step": 16050 }, { "epoch": 0.8507672329260859, "grad_norm": 39.5, "kl": 2.290485382080078, "learning_rate": 5e-07, "logits/chosen": -3630351.75, "logits/rejected": -16338633.0, "logps/chosen": -243.17872619628906, "logps/rejected": -279.1894226074219, "loss": 0.1689, "rewards/chosen": 1.495084285736084, "rewards/margins": 4.940630197525024, "rewards/rejected": -3.4455459117889404, "step": 16051 }, { "epoch": 0.8508202369278881, "grad_norm": 63.25, "kl": 0.7504367828369141, "learning_rate": 5e-07, "logits/chosen": -57509965.71428572, "logits/rejected": -68690192.0, "logps/chosen": -384.0496303013393, "logps/rejected": -580.7761840820312, "loss": 0.3548, "rewards/chosen": 0.5674583911895752, "rewards/margins": 2.8442223072052, "rewards/rejected": -2.276763916015625, "step": 16052 }, { "epoch": 0.8508732409296902, "grad_norm": 37.0, "kl": 1.923858642578125, "learning_rate": 5e-07, "logits/chosen": 2828446.6666666665, "logits/rejected": -43839654.4, "logps/chosen": -183.5280965169271, "logps/rejected": -504.192724609375, "loss": 0.2441, "rewards/chosen": 0.3967250982920329, "rewards/margins": 2.9087886969248453, "rewards/rejected": -2.5120635986328126, "step": 16053 }, { "epoch": 0.8509262449314924, "grad_norm": 32.25, "kl": 3.558152198791504, "learning_rate": 5e-07, "logits/chosen": -756610.75, "logits/rejected": -1569248.75, "logps/chosen": -74.56029256184895, "logps/rejected": -107.73519897460938, "loss": 0.4934, "rewards/chosen": -0.06117909153302511, "rewards/margins": 2.961216519276301, "rewards/rejected": -3.022395610809326, "step": 16054 }, { "epoch": 0.8509792489332945, "grad_norm": 40.75, "kl": 1.8300533294677734, "learning_rate": 5e-07, "logits/chosen": -3041617.2, "logits/rejected": -41696784.0, "logps/chosen": -180.704345703125, "logps/rejected": -379.4654541015625, "loss": 0.2773, "rewards/chosen": 0.5026580333709717, "rewards/margins": 4.1849811712900795, "rewards/rejected": -3.682323137919108, "step": 16055 }, { "epoch": 0.8510322529350965, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71235416.0, "logits/rejected": -24896660.0, "logps/chosen": -569.97119140625, "logps/rejected": -318.87823486328125, "loss": 0.2586, "rewards/chosen": 0.20685654878616333, "rewards/margins": 2.9830275177955627, "rewards/rejected": -2.7761709690093994, "step": 16056 }, { "epoch": 0.8510852569368987, "grad_norm": 41.5, "kl": 1.2568244934082031, "learning_rate": 5e-07, "logits/chosen": -26900044.0, "logits/rejected": -2864652.5, "logps/chosen": -261.54669189453125, "logps/rejected": -146.44017028808594, "loss": 0.2705, "rewards/chosen": 0.415877103805542, "rewards/margins": 2.449753522872925, "rewards/rejected": -2.033876419067383, "step": 16057 }, { "epoch": 0.8511382609387008, "grad_norm": 42.75, "kl": 1.21734619140625, "learning_rate": 5e-07, "logits/chosen": -75198544.0, "logits/rejected": -42653276.0, "logps/chosen": -193.44200134277344, "logps/rejected": -276.491455078125, "loss": 0.3021, "rewards/chosen": 0.22603580355644226, "rewards/margins": 2.173708885908127, "rewards/rejected": -1.9476730823516846, "step": 16058 }, { "epoch": 0.851191264940503, "grad_norm": 45.5, "kl": 0.3584098815917969, "learning_rate": 5e-07, "logits/chosen": -62697082.666666664, "logits/rejected": -39309075.2, "logps/chosen": -210.1806640625, "logps/rejected": -388.2446533203125, "loss": 0.2114, "rewards/chosen": 0.7596460978190104, "rewards/margins": 3.1023859659830726, "rewards/rejected": -2.3427398681640623, "step": 16059 }, { "epoch": 0.8512442689423051, "grad_norm": 43.75, "kl": 3.7853355407714844, "learning_rate": 5e-07, "logits/chosen": -4630389.666666667, "logits/rejected": 3702612.25, "logps/chosen": -325.1880696614583, "logps/rejected": -376.7754821777344, "loss": 0.3498, "rewards/chosen": 1.1780703862508137, "rewards/margins": 2.8908909161885576, "rewards/rejected": -1.7128205299377441, "step": 16060 }, { "epoch": 0.8512972729441073, "grad_norm": 40.25, "kl": 1.5890522003173828, "learning_rate": 5e-07, "logits/chosen": -13227186.666666666, "logits/rejected": -19173046.0, "logps/chosen": -201.31062825520834, "logps/rejected": -247.8561248779297, "loss": 0.3014, "rewards/chosen": 0.8190109729766846, "rewards/margins": 3.0969603061676025, "rewards/rejected": -2.277949333190918, "step": 16061 }, { "epoch": 0.8513502769459094, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8471148.0, "logits/rejected": -6231307.428571428, "logps/chosen": -349.8720703125, "logps/rejected": -203.11851283482142, "loss": 0.2033, "rewards/chosen": 1.0815887451171875, "rewards/margins": 2.656336511884417, "rewards/rejected": -1.5747477667672294, "step": 16062 }, { "epoch": 0.8514032809477116, "grad_norm": 46.5, "kl": 0.4761810302734375, "learning_rate": 5e-07, "logits/chosen": -13541708.0, "logits/rejected": -3271685.2, "logps/chosen": -122.39407348632812, "logps/rejected": -164.33985595703126, "loss": 0.3687, "rewards/chosen": 0.3247673511505127, "rewards/margins": 1.1574245929718017, "rewards/rejected": -0.8326572418212891, "step": 16063 }, { "epoch": 0.8514562849495136, "grad_norm": 52.75, "kl": 0.9250659942626953, "learning_rate": 5e-07, "logits/chosen": -28294226.0, "logits/rejected": -33595818.666666664, "logps/chosen": -269.74609375, "logps/rejected": -406.2145182291667, "loss": 0.1803, "rewards/chosen": 0.768856406211853, "rewards/margins": 3.2530681689580283, "rewards/rejected": -2.4842117627461753, "step": 16064 }, { "epoch": 0.8515092889513158, "grad_norm": 45.0, "kl": 0.8507041931152344, "learning_rate": 5e-07, "logits/chosen": 6640189.5, "logits/rejected": -19674365.333333332, "logps/chosen": -104.3289566040039, "logps/rejected": -277.7921142578125, "loss": 0.2013, "rewards/chosen": 0.6430861949920654, "rewards/margins": 3.1120256582895913, "rewards/rejected": -2.468939463297526, "step": 16065 }, { "epoch": 0.8515622929531179, "grad_norm": 47.0, "kl": 3.536336898803711, "learning_rate": 5e-07, "logits/chosen": -22994608.0, "logits/rejected": -37886816.0, "logps/chosen": -211.0741984049479, "logps/rejected": -244.1667938232422, "loss": 0.3497, "rewards/chosen": 0.49386099974314374, "rewards/margins": 3.8927836815516152, "rewards/rejected": -3.3989226818084717, "step": 16066 }, { "epoch": 0.8516152969549201, "grad_norm": 37.5, "kl": 0.44296836853027344, "learning_rate": 5e-07, "logits/chosen": -3110294.5, "logits/rejected": -5738669.0, "logps/chosen": -102.9111328125, "logps/rejected": -151.10508728027344, "loss": 0.2401, "rewards/chosen": 1.4034228324890137, "rewards/margins": 3.1389408111572266, "rewards/rejected": -1.735517978668213, "step": 16067 }, { "epoch": 0.8516683009567222, "grad_norm": 48.0, "kl": 1.6388673782348633, "learning_rate": 5e-07, "logits/chosen": -44252922.666666664, "logits/rejected": -29712036.0, "logps/chosen": -432.18896484375, "logps/rejected": -117.90104675292969, "loss": 0.3635, "rewards/chosen": 0.5550454060236613, "rewards/margins": 2.5871413151423135, "rewards/rejected": -2.0320959091186523, "step": 16068 }, { "epoch": 0.8517213049585244, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35911229.333333336, "logits/rejected": -37850585.6, "logps/chosen": -454.655517578125, "logps/rejected": -570.03134765625, "loss": 0.1902, "rewards/chosen": 1.0678746700286865, "rewards/margins": 3.71493182182312, "rewards/rejected": -2.6470571517944337, "step": 16069 }, { "epoch": 0.8517743089603265, "grad_norm": 32.25, "kl": 1.9181594848632812, "learning_rate": 5e-07, "logits/chosen": 13788870.0, "logits/rejected": -13171502.666666666, "logps/chosen": -133.66445922851562, "logps/rejected": -288.7569173177083, "loss": 0.1972, "rewards/chosen": -0.04364929348230362, "rewards/margins": 2.9970242803295455, "rewards/rejected": -3.040673573811849, "step": 16070 }, { "epoch": 0.8518273129621287, "grad_norm": 67.0, "kl": 1.0261306762695312, "learning_rate": 5e-07, "logits/chosen": -45168572.8, "logits/rejected": -22038312.0, "logps/chosen": -289.2473388671875, "logps/rejected": -381.3168131510417, "loss": 0.3277, "rewards/chosen": 0.44741272926330566, "rewards/margins": 2.824713945388794, "rewards/rejected": -2.3773012161254883, "step": 16071 }, { "epoch": 0.8518803169639307, "grad_norm": 50.75, "kl": 0.5052404403686523, "learning_rate": 5e-07, "logits/chosen": -34405194.666666664, "logits/rejected": -5797003.5, "logps/chosen": -282.34613037109375, "logps/rejected": -212.94842529296875, "loss": 0.3825, "rewards/chosen": 0.08727871378262837, "rewards/margins": 2.849393238623937, "rewards/rejected": -2.7621145248413086, "step": 16072 }, { "epoch": 0.8519333209657329, "grad_norm": 53.5, "kl": 4.967885971069336, "learning_rate": 5e-07, "logits/chosen": -5060182.5, "logits/rejected": -41855152.0, "logps/chosen": -320.991455078125, "logps/rejected": -481.1699523925781, "loss": 0.1724, "rewards/chosen": 1.8001816272735596, "rewards/margins": 4.312717914581299, "rewards/rejected": -2.5125362873077393, "step": 16073 }, { "epoch": 0.851986324967535, "grad_norm": 52.75, "kl": 0.18292236328125, "learning_rate": 5e-07, "logits/chosen": -27709600.0, "logits/rejected": -4121527.3333333335, "logps/chosen": -271.6119384765625, "logps/rejected": -215.7799072265625, "loss": 0.3063, "rewards/chosen": 0.35920712947845457, "rewards/margins": 3.0063271284103394, "rewards/rejected": -2.6471199989318848, "step": 16074 }, { "epoch": 0.8520393289693372, "grad_norm": 43.75, "kl": 1.613316535949707, "learning_rate": 5e-07, "logits/chosen": 3100233.2, "logits/rejected": -3184041.3333333335, "logps/chosen": -136.6904052734375, "logps/rejected": -81.65679931640625, "loss": 0.247, "rewards/chosen": 1.0566818237304687, "rewards/margins": 3.8277860005696613, "rewards/rejected": -2.771104176839193, "step": 16075 }, { "epoch": 0.8520923329711393, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21686729.6, "logits/rejected": -788086.0833333334, "logps/chosen": -363.844384765625, "logps/rejected": -85.15218098958333, "loss": 0.329, "rewards/chosen": 0.339874267578125, "rewards/margins": 3.0522537231445312, "rewards/rejected": -2.7123794555664062, "step": 16076 }, { "epoch": 0.8521453369729415, "grad_norm": 49.75, "kl": 0.34381866455078125, "learning_rate": 5e-07, "logits/chosen": -2892951.3333333335, "logits/rejected": -49263648.0, "logps/chosen": -362.4102783203125, "logps/rejected": -313.1036376953125, "loss": 0.2762, "rewards/chosen": 0.742995023727417, "rewards/margins": 2.2912621974945067, "rewards/rejected": -1.5482671737670899, "step": 16077 }, { "epoch": 0.8521983409747436, "grad_norm": 46.5, "kl": 4.754987716674805, "learning_rate": 5e-07, "logits/chosen": -23336790.4, "logits/rejected": -12589896.0, "logps/chosen": -340.6408203125, "logps/rejected": -498.0941162109375, "loss": 0.2226, "rewards/chosen": 1.560226058959961, "rewards/margins": 3.7374009450276695, "rewards/rejected": -2.1771748860677085, "step": 16078 }, { "epoch": 0.8522513449765458, "grad_norm": 50.75, "kl": 0.3188896179199219, "learning_rate": 5e-07, "logits/chosen": -41926092.8, "logits/rejected": -27263197.333333332, "logps/chosen": -270.453173828125, "logps/rejected": -206.78668212890625, "loss": 0.2751, "rewards/chosen": 1.0196971893310547, "rewards/margins": 2.572721004486084, "rewards/rejected": -1.5530238151550293, "step": 16079 }, { "epoch": 0.8523043489783478, "grad_norm": 20.5, "kl": 3.3234691619873047, "learning_rate": 5e-07, "logits/chosen": -8832400.0, "logits/rejected": -11397757.333333334, "logps/chosen": -274.785546875, "logps/rejected": -788.3946126302084, "loss": 0.3051, "rewards/chosen": 0.7129866123199463, "rewards/margins": 6.914375289281209, "rewards/rejected": -6.201388676961263, "step": 16080 }, { "epoch": 0.85235735298015, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37335052.0, "logits/rejected": 2160428.8333333335, "logps/chosen": -482.22052001953125, "logps/rejected": -144.37540690104166, "loss": 0.1187, "rewards/chosen": 1.4087738990783691, "rewards/margins": 4.502557277679443, "rewards/rejected": -3.093783378601074, "step": 16081 }, { "epoch": 0.8524103569819521, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21790906.0, "logits/rejected": -17512770.0, "logps/chosen": -318.5244140625, "logps/rejected": -456.2902526855469, "loss": 0.2883, "rewards/chosen": 0.24485912919044495, "rewards/margins": 2.4777096807956696, "rewards/rejected": -2.2328505516052246, "step": 16082 }, { "epoch": 0.8524633609837543, "grad_norm": 38.5, "kl": 3.8606605529785156, "learning_rate": 5e-07, "logits/chosen": -28870787.2, "logits/rejected": -10271597.333333334, "logps/chosen": -352.926611328125, "logps/rejected": -90.62396240234375, "loss": 0.3165, "rewards/chosen": 0.7653673171997071, "rewards/margins": 2.623172092437744, "rewards/rejected": -1.857804775238037, "step": 16083 }, { "epoch": 0.8525163649855564, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15974296.0, "logits/rejected": 1914337.7142857143, "logps/chosen": -313.8165283203125, "logps/rejected": -375.75523158482144, "loss": 0.1715, "rewards/chosen": -0.02373046986758709, "rewards/margins": 2.7021805343351195, "rewards/rejected": -2.7259110042027066, "step": 16084 }, { "epoch": 0.8525693689873586, "grad_norm": 49.5, "kl": 1.2004814147949219, "learning_rate": 5e-07, "logits/chosen": -37533376.0, "logits/rejected": -17773784.0, "logps/chosen": -255.96884765625, "logps/rejected": -159.49857584635416, "loss": 0.3972, "rewards/chosen": 0.1606041669845581, "rewards/margins": 1.5665273745854695, "rewards/rejected": -1.4059232076009114, "step": 16085 }, { "epoch": 0.8526223729891607, "grad_norm": 50.5, "kl": 3.0703125, "learning_rate": 5e-07, "logits/chosen": -27010781.333333332, "logits/rejected": -47021732.0, "logps/chosen": -284.1941324869792, "logps/rejected": -338.8589172363281, "loss": 0.3816, "rewards/chosen": 0.5064668655395508, "rewards/margins": 2.1432974338531494, "rewards/rejected": -1.6368305683135986, "step": 16086 }, { "epoch": 0.8526753769909629, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32682888.0, "logits/rejected": -18282880.0, "logps/chosen": -328.7659912109375, "logps/rejected": -437.374755859375, "loss": 0.3209, "rewards/chosen": -0.11307287216186523, "rewards/margins": 2.7874350547790527, "rewards/rejected": -2.900507926940918, "step": 16087 }, { "epoch": 0.8527283809927649, "grad_norm": 63.5, "kl": 1.583169937133789, "learning_rate": 5e-07, "logits/chosen": -73530297.6, "logits/rejected": -21080718.666666668, "logps/chosen": -417.25517578125, "logps/rejected": -559.171630859375, "loss": 0.2774, "rewards/chosen": 0.6562588691711426, "rewards/margins": 4.770113595326741, "rewards/rejected": -4.113854726155599, "step": 16088 }, { "epoch": 0.8527813849945671, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81628074.66666667, "logits/rejected": -48439276.8, "logps/chosen": -349.439453125, "logps/rejected": -456.23662109375, "loss": 0.1945, "rewards/chosen": 0.8531906604766846, "rewards/margins": 3.9173679828643797, "rewards/rejected": -3.064177322387695, "step": 16089 }, { "epoch": 0.8528343889963692, "grad_norm": 33.0, "kl": 0.9779214859008789, "learning_rate": 5e-07, "logits/chosen": -18845065.333333332, "logits/rejected": -34981731.2, "logps/chosen": -220.02132161458334, "logps/rejected": -385.720849609375, "loss": 0.2381, "rewards/chosen": 0.4930071433385213, "rewards/margins": 3.3199846824010213, "rewards/rejected": -2.8269775390625, "step": 16090 }, { "epoch": 0.8528873929981714, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5610478.8, "logits/rejected": 9377534.666666666, "logps/chosen": -359.9224609375, "logps/rejected": -144.09956868489584, "loss": 0.2698, "rewards/chosen": 0.9409263610839844, "rewards/margins": 2.855847962697347, "rewards/rejected": -1.9149216016133626, "step": 16091 }, { "epoch": 0.8529403969999735, "grad_norm": 48.0, "kl": 1.1024932861328125, "learning_rate": 5e-07, "logits/chosen": -86463850.66666667, "logits/rejected": -16681718.4, "logps/chosen": -215.67012532552084, "logps/rejected": -375.03505859375, "loss": 0.3217, "rewards/chosen": 0.022219985723495483, "rewards/margins": 1.9780827939510346, "rewards/rejected": -1.955862808227539, "step": 16092 }, { "epoch": 0.8529934010017757, "grad_norm": 54.75, "kl": 1.6623382568359375, "learning_rate": 5e-07, "logits/chosen": -24855789.333333332, "logits/rejected": -26100662.0, "logps/chosen": -305.5415445963542, "logps/rejected": -459.3771667480469, "loss": 0.441, "rewards/chosen": -0.0937118927637736, "rewards/margins": 3.5028168757756553, "rewards/rejected": -3.5965287685394287, "step": 16093 }, { "epoch": 0.8530464050035778, "grad_norm": 52.5, "kl": 2.0299739837646484, "learning_rate": 5e-07, "logits/chosen": -35134178.666666664, "logits/rejected": -24275304.0, "logps/chosen": -284.28912353515625, "logps/rejected": -567.027587890625, "loss": 0.4516, "rewards/chosen": -0.27062873045603436, "rewards/margins": 4.629761894543965, "rewards/rejected": -4.900390625, "step": 16094 }, { "epoch": 0.85309940900538, "grad_norm": 24.0, "kl": 2.4771413803100586, "learning_rate": 5e-07, "logits/chosen": 18089258.0, "logits/rejected": -7890609.714285715, "logps/chosen": -16.185543060302734, "logps/rejected": -273.33455984933033, "loss": 0.0896, "rewards/chosen": 1.6634358167648315, "rewards/margins": 5.027680550302778, "rewards/rejected": -3.3642447335379466, "step": 16095 }, { "epoch": 0.853152413007182, "grad_norm": 64.5, "kl": 0.10695648193359375, "learning_rate": 5e-07, "logits/chosen": -51491257.6, "logits/rejected": 5429593.333333333, "logps/chosen": -383.1625244140625, "logps/rejected": -278.84796142578125, "loss": 0.3026, "rewards/chosen": 0.30711028575897215, "rewards/margins": 3.3069087425867716, "rewards/rejected": -2.9997984568277993, "step": 16096 }, { "epoch": 0.8532054170089842, "grad_norm": 23.875, "kl": 1.2069969177246094, "learning_rate": 5e-07, "logits/chosen": -118227424.0, "logits/rejected": -17631646.666666668, "logps/chosen": -910.6250610351562, "logps/rejected": -181.1287841796875, "loss": 0.091, "rewards/chosen": 2.0889604091644287, "rewards/margins": 5.121952136357626, "rewards/rejected": -3.0329917271931968, "step": 16097 }, { "epoch": 0.8532584210107863, "grad_norm": 70.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 43613628.8, "logits/rejected": -21815613.333333332, "logps/chosen": -485.21748046875, "logps/rejected": -521.905029296875, "loss": 0.3204, "rewards/chosen": 0.3364540100097656, "rewards/margins": 4.585958735148112, "rewards/rejected": -4.249504725138347, "step": 16098 }, { "epoch": 0.8533114250125885, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11438365.333333334, "logits/rejected": -23322592.0, "logps/chosen": -466.5730794270833, "logps/rejected": -577.3587036132812, "loss": 0.3105, "rewards/chosen": 0.8237340450286865, "rewards/margins": 3.1678550243377686, "rewards/rejected": -2.344120979309082, "step": 16099 }, { "epoch": 0.8533644290143906, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21040888.0, "logits/rejected": -56012826.666666664, "logps/chosen": -345.105712890625, "logps/rejected": -368.7923583984375, "loss": 0.1771, "rewards/chosen": 0.230865478515625, "rewards/margins": 2.8238890965779624, "rewards/rejected": -2.5930236180623374, "step": 16100 }, { "epoch": 0.8534174330161928, "grad_norm": 42.0, "kl": 0.09253692626953125, "learning_rate": 5e-07, "logits/chosen": -16685272.0, "logits/rejected": -47674732.8, "logps/chosen": -755.7797037760416, "logps/rejected": -664.223828125, "loss": 0.189, "rewards/chosen": 1.5300380388895671, "rewards/margins": 4.584694449106852, "rewards/rejected": -3.054656410217285, "step": 16101 }, { "epoch": 0.8534704370179949, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35263077.333333336, "logits/rejected": -25793006.4, "logps/chosen": -186.23282877604166, "logps/rejected": -312.481005859375, "loss": 0.2285, "rewards/chosen": -0.18019193410873413, "rewards/margins": 3.1952749848365785, "rewards/rejected": -3.3754669189453126, "step": 16102 }, { "epoch": 0.853523441019797, "grad_norm": 56.25, "kl": 0.13240814208984375, "learning_rate": 5e-07, "logits/chosen": 1109492.25, "logits/rejected": 48581880.0, "logps/chosen": -216.5994873046875, "logps/rejected": -300.6045227050781, "loss": 0.3228, "rewards/chosen": 0.05812130868434906, "rewards/margins": 2.069247826933861, "rewards/rejected": -2.0111265182495117, "step": 16103 }, { "epoch": 0.8535764450215991, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37974144.0, "logits/rejected": -39988564.0, "logps/chosen": -200.84620666503906, "logps/rejected": -376.8669738769531, "loss": 0.2481, "rewards/chosen": 0.2114848643541336, "rewards/margins": 3.3600916415452957, "rewards/rejected": -3.148606777191162, "step": 16104 }, { "epoch": 0.8536294490234013, "grad_norm": 29.5, "kl": 0.5829296112060547, "learning_rate": 5e-07, "logits/chosen": 3293487.6666666665, "logits/rejected": -29102371.2, "logps/chosen": -64.83172607421875, "logps/rejected": -232.1796630859375, "loss": 0.217, "rewards/chosen": 0.6309281587600708, "rewards/margins": 2.8984355211257933, "rewards/rejected": -2.2675073623657225, "step": 16105 }, { "epoch": 0.8536824530252034, "grad_norm": 39.75, "kl": 0.37714099884033203, "learning_rate": 5e-07, "logits/chosen": -26797690.0, "logits/rejected": -64341724.0, "logps/chosen": -236.9537353515625, "logps/rejected": -356.3908386230469, "loss": 0.2962, "rewards/chosen": 0.35142040252685547, "rewards/margins": 2.2415748834609985, "rewards/rejected": -1.890154480934143, "step": 16106 }, { "epoch": 0.8537354570270055, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28101926.4, "logits/rejected": -5814120.0, "logps/chosen": -410.211669921875, "logps/rejected": -118.80902099609375, "loss": 0.221, "rewards/chosen": 0.8223804473876953, "rewards/margins": 4.408310000101725, "rewards/rejected": -3.58592955271403, "step": 16107 }, { "epoch": 0.8537884610288077, "grad_norm": 53.0, "kl": 0.6095161437988281, "learning_rate": 5e-07, "logits/chosen": -12508637.333333334, "logits/rejected": 194558288.0, "logps/chosen": -261.7956136067708, "logps/rejected": -1010.9788818359375, "loss": 0.3921, "rewards/chosen": 0.10962448517481486, "rewards/margins": 2.614540914694468, "rewards/rejected": -2.5049164295196533, "step": 16108 }, { "epoch": 0.8538414650306098, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69697648.0, "logits/rejected": -45753152.0, "logps/chosen": -213.9388427734375, "logps/rejected": -490.97552490234375, "loss": 0.2596, "rewards/chosen": 0.1976739913225174, "rewards/margins": 3.3998229056596756, "rewards/rejected": -3.202148914337158, "step": 16109 }, { "epoch": 0.853894469032412, "grad_norm": 39.0, "kl": 2.738719940185547, "learning_rate": 5e-07, "logits/chosen": -33093020.0, "logits/rejected": -29525846.0, "logps/chosen": -245.5376434326172, "logps/rejected": -341.1665344238281, "loss": 0.2724, "rewards/chosen": 0.7730148434638977, "rewards/margins": 3.1905086636543274, "rewards/rejected": -2.4174938201904297, "step": 16110 }, { "epoch": 0.853947473034214, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4588302.0, "logits/rejected": -27577005.333333332, "logps/chosen": -17.215442657470703, "logps/rejected": -236.64453125, "loss": 0.2116, "rewards/chosen": -0.11916275322437286, "rewards/margins": 2.2555493960777917, "rewards/rejected": -2.3747121493021646, "step": 16111 }, { "epoch": 0.8540004770360162, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12303720.0, "logits/rejected": -27889548.8, "logps/chosen": -325.9173583984375, "logps/rejected": -384.056201171875, "loss": 0.3055, "rewards/chosen": 0.05822982390721639, "rewards/margins": 2.128495403130849, "rewards/rejected": -2.0702655792236326, "step": 16112 }, { "epoch": 0.8540534810378183, "grad_norm": 49.25, "kl": 4.198085784912109, "learning_rate": 5e-07, "logits/chosen": -27488704.0, "logits/rejected": 7902368.5, "logps/chosen": -316.26767985026044, "logps/rejected": -327.876708984375, "loss": 0.2889, "rewards/chosen": 0.950413703918457, "rewards/margins": 2.993396282196045, "rewards/rejected": -2.042982578277588, "step": 16113 }, { "epoch": 0.8541064850396205, "grad_norm": 90.5, "kl": 4.781695365905762, "learning_rate": 5e-07, "logits/chosen": -57930026.666666664, "logits/rejected": 1749621.75, "logps/chosen": -554.0301920572916, "logps/rejected": -394.18438720703125, "loss": 0.2432, "rewards/chosen": 1.5323699315388997, "rewards/margins": 5.389649947484334, "rewards/rejected": -3.8572800159454346, "step": 16114 }, { "epoch": 0.8541594890414226, "grad_norm": 35.5, "kl": 0.9027585983276367, "learning_rate": 5e-07, "logits/chosen": -14734156.0, "logits/rejected": -71025936.0, "logps/chosen": -212.15555826822916, "logps/rejected": -868.6029052734375, "loss": 0.2532, "rewards/chosen": 1.0126598676045735, "rewards/margins": 4.670048077901204, "rewards/rejected": -3.657388210296631, "step": 16115 }, { "epoch": 0.8542124930432248, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19708444.0, "logits/rejected": -13135968.0, "logps/chosen": -147.71981811523438, "logps/rejected": -264.08966064453125, "loss": 0.3635, "rewards/chosen": -0.08986816555261612, "rewards/margins": 1.4443158134818077, "rewards/rejected": -1.5341839790344238, "step": 16116 }, { "epoch": 0.8542654970450269, "grad_norm": 47.0, "kl": 1.8404121398925781, "learning_rate": 5e-07, "logits/chosen": -71210368.0, "logits/rejected": -21314070.0, "logps/chosen": -538.2784423828125, "logps/rejected": -426.8275451660156, "loss": 0.2041, "rewards/chosen": 1.0358299016952515, "rewards/margins": 3.4130364656448364, "rewards/rejected": -2.377206563949585, "step": 16117 }, { "epoch": 0.854318501046829, "grad_norm": 29.0, "kl": 1.5728378295898438, "learning_rate": 5e-07, "logits/chosen": -35192148.0, "logits/rejected": -44715893.333333336, "logps/chosen": -230.12442016601562, "logps/rejected": -328.4269612630208, "loss": 0.1684, "rewards/chosen": 1.2832971811294556, "rewards/margins": 3.7261937061945596, "rewards/rejected": -2.442896525065104, "step": 16118 }, { "epoch": 0.8543715050486311, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11451667.0, "logits/rejected": -28675926.0, "logps/chosen": -505.35150146484375, "logps/rejected": -464.8816223144531, "loss": 0.2576, "rewards/chosen": 0.16974641382694244, "rewards/margins": 4.09563972055912, "rewards/rejected": -3.9258933067321777, "step": 16119 }, { "epoch": 0.8544245090504333, "grad_norm": 133.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -176197397.33333334, "logits/rejected": -20571804.8, "logps/chosen": -902.9165852864584, "logps/rejected": -194.34603271484374, "loss": 0.2542, "rewards/chosen": 0.224639892578125, "rewards/margins": 2.8816129684448244, "rewards/rejected": -2.6569730758666994, "step": 16120 }, { "epoch": 0.8544775130522354, "grad_norm": 55.5, "kl": 0.19481849670410156, "learning_rate": 5e-07, "logits/chosen": -16012258.0, "logits/rejected": 32677398.0, "logps/chosen": -236.30780029296875, "logps/rejected": -124.54163360595703, "loss": 0.2768, "rewards/chosen": 1.2807689905166626, "rewards/margins": 2.125406563282013, "rewards/rejected": -0.8446375727653503, "step": 16121 }, { "epoch": 0.8545305170540376, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4292182.0, "logits/rejected": -41610372.0, "logps/chosen": -171.21664428710938, "logps/rejected": -460.14886474609375, "loss": 0.258, "rewards/chosen": 0.0982895940542221, "rewards/margins": 3.477542981505394, "rewards/rejected": -3.379253387451172, "step": 16122 }, { "epoch": 0.8545835210558397, "grad_norm": 39.0, "kl": 2.1469593048095703, "learning_rate": 5e-07, "logits/chosen": -7264345.6, "logits/rejected": -31804650.666666668, "logps/chosen": -234.5587890625, "logps/rejected": -338.3624267578125, "loss": 0.2765, "rewards/chosen": 0.919522476196289, "rewards/margins": 3.6715681076049806, "rewards/rejected": -2.7520456314086914, "step": 16123 }, { "epoch": 0.8546365250576419, "grad_norm": 50.25, "kl": 1.6416034698486328, "learning_rate": 5e-07, "logits/chosen": -24087750.4, "logits/rejected": -16467042.666666666, "logps/chosen": -455.16923828125, "logps/rejected": -199.3687947591146, "loss": 0.3202, "rewards/chosen": 0.6863987922668457, "rewards/margins": 1.9702702045440674, "rewards/rejected": -1.2838714122772217, "step": 16124 }, { "epoch": 0.854689529059444, "grad_norm": 44.75, "kl": 3.055866241455078, "learning_rate": 5e-07, "logits/chosen": -22346753.333333332, "logits/rejected": -13512454.0, "logps/chosen": -266.34340413411456, "logps/rejected": -169.43637084960938, "loss": 0.3732, "rewards/chosen": 0.6607711712519327, "rewards/margins": 2.2179847160975137, "rewards/rejected": -1.557213544845581, "step": 16125 }, { "epoch": 0.8547425330612461, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30386884.0, "logits/rejected": -20417902.666666668, "logps/chosen": -1560.5205078125, "logps/rejected": -268.0024007161458, "loss": 0.1485, "rewards/chosen": 1.6713058948516846, "rewards/margins": 3.9518484274546304, "rewards/rejected": -2.280542532602946, "step": 16126 }, { "epoch": 0.8547955370630482, "grad_norm": 48.75, "kl": 0.6845512390136719, "learning_rate": 5e-07, "logits/chosen": -23248267.2, "logits/rejected": -3066441.6666666665, "logps/chosen": -286.13642578125, "logps/rejected": -278.96449788411456, "loss": 0.2335, "rewards/chosen": 0.9618298530578613, "rewards/margins": 3.623834959665934, "rewards/rejected": -2.6620051066080728, "step": 16127 }, { "epoch": 0.8548485410648504, "grad_norm": 60.0, "kl": 1.3773536682128906, "learning_rate": 5e-07, "logits/chosen": 6210797.0, "logits/rejected": -35696453.333333336, "logps/chosen": -159.71368408203125, "logps/rejected": -253.56500244140625, "loss": 0.2213, "rewards/chosen": 0.4037986695766449, "rewards/margins": 2.5994719763596854, "rewards/rejected": -2.1956733067830405, "step": 16128 }, { "epoch": 0.8549015450666525, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 58125.0, "logits/rejected": -21542754.0, "logps/chosen": -156.60975646972656, "logps/rejected": -273.5068664550781, "loss": 0.3406, "rewards/chosen": -0.41892415285110474, "rewards/margins": 3.2549802660942078, "rewards/rejected": -3.6739044189453125, "step": 16129 }, { "epoch": 0.8549545490684547, "grad_norm": 64.5, "kl": 7.49406623840332, "learning_rate": 5e-07, "logits/chosen": -18152814.4, "logits/rejected": 5712225.333333333, "logps/chosen": -546.13818359375, "logps/rejected": -449.6399739583333, "loss": 0.2694, "rewards/chosen": 1.6986408233642578, "rewards/margins": 3.5080211957295733, "rewards/rejected": -1.8093803723653157, "step": 16130 }, { "epoch": 0.8550075530702568, "grad_norm": 47.0, "kl": 0.38751220703125, "learning_rate": 5e-07, "logits/chosen": -52880282.666666664, "logits/rejected": -15656844.8, "logps/chosen": -456.478515625, "logps/rejected": -153.6752197265625, "loss": 0.2679, "rewards/chosen": 0.5691823164621989, "rewards/margins": 2.4435080687204995, "rewards/rejected": -1.8743257522583008, "step": 16131 }, { "epoch": 0.855060557072059, "grad_norm": 110.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11948249.333333334, "logits/rejected": -3328895.0, "logps/chosen": -404.736328125, "logps/rejected": -324.9884765625, "loss": 0.252, "rewards/chosen": 0.5436091423034668, "rewards/margins": 2.6298567771911623, "rewards/rejected": -2.0862476348876955, "step": 16132 }, { "epoch": 0.855113561073861, "grad_norm": 36.75, "kl": 2.2984704971313477, "learning_rate": 5e-07, "logits/chosen": -14108188.0, "logits/rejected": 9053054.0, "logps/chosen": -99.1611836751302, "logps/rejected": -582.8363037109375, "loss": 0.2997, "rewards/chosen": 0.7148675918579102, "rewards/margins": 4.5850794315338135, "rewards/rejected": -3.8702118396759033, "step": 16133 }, { "epoch": 0.8551665650756632, "grad_norm": 66.0, "kl": 1.6407928466796875, "learning_rate": 5e-07, "logits/chosen": -21257834.666666668, "logits/rejected": -75001888.0, "logps/chosen": -352.464111328125, "logps/rejected": -359.85064697265625, "loss": 0.4228, "rewards/chosen": 0.10595280925432841, "rewards/margins": 2.1935344437758126, "rewards/rejected": -2.0875816345214844, "step": 16134 }, { "epoch": 0.8552195690774653, "grad_norm": 42.5, "kl": 1.1261444091796875, "learning_rate": 5e-07, "logits/chosen": -36423754.666666664, "logits/rejected": -43991532.8, "logps/chosen": -168.90653483072916, "logps/rejected": -376.50380859375, "loss": 0.2397, "rewards/chosen": 0.19055755933125815, "rewards/margins": 2.9593346436818444, "rewards/rejected": -2.768777084350586, "step": 16135 }, { "epoch": 0.8552725730792675, "grad_norm": 41.75, "kl": 0.32662391662597656, "learning_rate": 5e-07, "logits/chosen": -20166024.0, "logits/rejected": -17778568.0, "logps/chosen": -145.53004455566406, "logps/rejected": -316.65472412109375, "loss": 0.3599, "rewards/chosen": 0.05645785853266716, "rewards/margins": 1.9033195786178112, "rewards/rejected": -1.846861720085144, "step": 16136 }, { "epoch": 0.8553255770810696, "grad_norm": 45.75, "kl": 0.79339599609375, "learning_rate": 5e-07, "logits/chosen": -17994800.0, "logits/rejected": -8677768.0, "logps/chosen": -349.6028137207031, "logps/rejected": -298.58331298828125, "loss": 0.2273, "rewards/chosen": 0.6246414184570312, "rewards/margins": 3.8510212898254395, "rewards/rejected": -3.226379871368408, "step": 16137 }, { "epoch": 0.8553785810828718, "grad_norm": 29.375, "kl": 3.86846923828125, "learning_rate": 5e-07, "logits/chosen": -4311709.6, "logits/rejected": -2508886.1666666665, "logps/chosen": -98.10946044921874, "logps/rejected": -134.0805867513021, "loss": 0.3593, "rewards/chosen": 0.6816303730010986, "rewards/margins": 2.6878084341684976, "rewards/rejected": -2.006178061167399, "step": 16138 }, { "epoch": 0.8554315850846739, "grad_norm": 44.5, "kl": 0.28975677490234375, "learning_rate": 5e-07, "logits/chosen": -2869153.0, "logits/rejected": -21017320.0, "logps/chosen": -186.2644500732422, "logps/rejected": -156.42373657226562, "loss": 0.2064, "rewards/chosen": 0.5494856238365173, "rewards/margins": 3.945527732372284, "rewards/rejected": -3.3960421085357666, "step": 16139 }, { "epoch": 0.8554845890864761, "grad_norm": 70.5, "kl": 6.357707977294922, "learning_rate": 5e-07, "logits/chosen": -28972165.333333332, "logits/rejected": -23403404.0, "logps/chosen": -645.1106363932291, "logps/rejected": -230.89035034179688, "loss": 0.2592, "rewards/chosen": 1.4863627751668294, "rewards/margins": 4.840365727742513, "rewards/rejected": -3.3540029525756836, "step": 16140 }, { "epoch": 0.8555375930882781, "grad_norm": 38.0, "kl": 0.12764739990234375, "learning_rate": 5e-07, "logits/chosen": -40818938.666666664, "logits/rejected": -32522694.4, "logps/chosen": -182.39400227864584, "logps/rejected": -422.0978515625, "loss": 0.2208, "rewards/chosen": 0.5368606646855673, "rewards/margins": 2.957914741834005, "rewards/rejected": -2.4210540771484377, "step": 16141 }, { "epoch": 0.8555905970900803, "grad_norm": 53.5, "kl": 1.8070087432861328, "learning_rate": 5e-07, "logits/chosen": -16461321.333333334, "logits/rejected": -5010998.0, "logps/chosen": -94.9879150390625, "logps/rejected": -365.8861389160156, "loss": 0.3614, "rewards/chosen": 0.41183483600616455, "rewards/margins": 3.072691559791565, "rewards/rejected": -2.6608567237854004, "step": 16142 }, { "epoch": 0.8556436010918824, "grad_norm": 57.25, "kl": 0.5195522308349609, "learning_rate": 5e-07, "logits/chosen": -23733304.0, "logits/rejected": 2384330.1666666665, "logps/chosen": -176.5938232421875, "logps/rejected": -150.42415364583334, "loss": 0.4473, "rewards/chosen": -0.06259781122207642, "rewards/margins": 1.2438436547915142, "rewards/rejected": -1.3064414660135906, "step": 16143 }, { "epoch": 0.8556966050936846, "grad_norm": 44.25, "kl": 0.5683813095092773, "learning_rate": 5e-07, "logits/chosen": -10153770.4, "logits/rejected": -17088473.333333332, "logps/chosen": -206.026171875, "logps/rejected": -487.38916015625, "loss": 0.2572, "rewards/chosen": 0.63624587059021, "rewards/margins": 4.406708860397339, "rewards/rejected": -3.770462989807129, "step": 16144 }, { "epoch": 0.8557496090954867, "grad_norm": 61.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27450246.4, "logits/rejected": -12394468.0, "logps/chosen": -201.1138427734375, "logps/rejected": -218.45414225260416, "loss": 0.3709, "rewards/chosen": -0.023440003395080566, "rewards/margins": 2.7427695989608765, "rewards/rejected": -2.766209602355957, "step": 16145 }, { "epoch": 0.8558026130972889, "grad_norm": 58.75, "kl": 0.5745182037353516, "learning_rate": 5e-07, "logits/chosen": -23525605.333333332, "logits/rejected": -29411817.6, "logps/chosen": -343.3684488932292, "logps/rejected": -194.88533935546874, "loss": 0.2219, "rewards/chosen": 0.7591567039489746, "rewards/margins": 2.777589130401611, "rewards/rejected": -2.0184324264526365, "step": 16146 }, { "epoch": 0.855855617099091, "grad_norm": 41.25, "kl": 0.6589794158935547, "learning_rate": 5e-07, "logits/chosen": -36413513.6, "logits/rejected": -44556592.0, "logps/chosen": -255.7327392578125, "logps/rejected": -279.30010986328125, "loss": 0.3403, "rewards/chosen": 0.0684281826019287, "rewards/margins": 2.460859282811483, "rewards/rejected": -2.392431100209554, "step": 16147 }, { "epoch": 0.8559086211008932, "grad_norm": 88.5, "kl": 4.540865898132324, "learning_rate": 5e-07, "logits/chosen": -3215231.6, "logits/rejected": 121678240.0, "logps/chosen": -330.562890625, "logps/rejected": -193.92342122395834, "loss": 0.3307, "rewards/chosen": 1.3242681503295899, "rewards/margins": 2.224677054087321, "rewards/rejected": -0.9004089037577311, "step": 16148 }, { "epoch": 0.8559616251026952, "grad_norm": 62.5, "kl": 0.3498420715332031, "learning_rate": 5e-07, "logits/chosen": -42166496.0, "logits/rejected": 19970028.0, "logps/chosen": -284.2171936035156, "logps/rejected": -176.2025146484375, "loss": 0.2935, "rewards/chosen": 0.8659189343452454, "rewards/margins": 2.2156508564949036, "rewards/rejected": -1.3497319221496582, "step": 16149 }, { "epoch": 0.8560146291044974, "grad_norm": 44.0, "kl": 1.0179996490478516, "learning_rate": 5e-07, "logits/chosen": -43543572.0, "logits/rejected": -16849869.333333332, "logps/chosen": -274.1488342285156, "logps/rejected": -294.83538818359375, "loss": 0.162, "rewards/chosen": 1.2186229228973389, "rewards/margins": 3.7815873622894287, "rewards/rejected": -2.56296443939209, "step": 16150 }, { "epoch": 0.8560676331062995, "grad_norm": 51.75, "kl": 1.1523609161376953, "learning_rate": 5e-07, "logits/chosen": -14582628.0, "logits/rejected": -28678048.0, "logps/chosen": -115.0136210123698, "logps/rejected": -238.491162109375, "loss": 0.316, "rewards/chosen": -0.7161475817362467, "rewards/margins": 1.4117828687032064, "rewards/rejected": -2.127930450439453, "step": 16151 }, { "epoch": 0.8561206371081017, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 240455168.0, "logits/rejected": -38719980.8, "logps/chosen": -2045.31884765625, "logps/rejected": -445.0083984375, "loss": 0.118, "rewards/chosen": 1.918403148651123, "rewards/margins": 4.720449352264405, "rewards/rejected": -2.802046203613281, "step": 16152 }, { "epoch": 0.8561736411099038, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40175362.666666664, "logits/rejected": -49693212.8, "logps/chosen": -202.38431803385416, "logps/rejected": -308.894091796875, "loss": 0.2145, "rewards/chosen": 0.6758336226145426, "rewards/margins": 2.969393269220988, "rewards/rejected": -2.2935596466064454, "step": 16153 }, { "epoch": 0.856226645111706, "grad_norm": 51.0, "kl": 4.989285469055176, "learning_rate": 5e-07, "logits/chosen": -37004992.0, "logits/rejected": -21890760.0, "logps/chosen": -466.6599426269531, "logps/rejected": -294.7126770019531, "loss": 0.2831, "rewards/chosen": 0.9477346539497375, "rewards/margins": 4.232826054096222, "rewards/rejected": -3.2850914001464844, "step": 16154 }, { "epoch": 0.8562796491135081, "grad_norm": 38.5, "kl": 0.89984130859375, "learning_rate": 5e-07, "logits/chosen": -19474544.0, "logits/rejected": 24488.166666666668, "logps/chosen": -588.8645629882812, "logps/rejected": -162.09847005208334, "loss": 0.2035, "rewards/chosen": 1.7514790296554565, "rewards/margins": 3.629547953605652, "rewards/rejected": -1.8780689239501953, "step": 16155 }, { "epoch": 0.8563326531153103, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5446799.0, "logits/rejected": -23097832.0, "logps/chosen": -811.5005493164062, "logps/rejected": -409.6363932291667, "loss": 0.0882, "rewards/chosen": 2.3560791015625, "rewards/margins": 5.211941401163736, "rewards/rejected": -2.855862299601237, "step": 16156 }, { "epoch": 0.8563856571171123, "grad_norm": 43.75, "kl": 0.9260425567626953, "learning_rate": 5e-07, "logits/chosen": 5325558.666666667, "logits/rejected": -22544800.0, "logps/chosen": -295.19122314453125, "logps/rejected": -109.1830810546875, "loss": 0.2687, "rewards/chosen": 1.041235129038493, "rewards/margins": 2.588629976908366, "rewards/rejected": -1.5473948478698731, "step": 16157 }, { "epoch": 0.8564386611189144, "grad_norm": 57.0, "kl": 2.3573970794677734, "learning_rate": 5e-07, "logits/chosen": -42014420.0, "logits/rejected": -38376104.0, "logps/chosen": -283.4507751464844, "logps/rejected": -221.40802001953125, "loss": 0.3735, "rewards/chosen": 0.5501993894577026, "rewards/margins": 1.555984377861023, "rewards/rejected": -1.0057849884033203, "step": 16158 }, { "epoch": 0.8564916651207166, "grad_norm": 44.25, "kl": 4.919927597045898, "learning_rate": 5e-07, "logits/chosen": -682779.25, "logits/rejected": -27185260.0, "logps/chosen": -397.1151123046875, "logps/rejected": -611.212646484375, "loss": 0.3275, "rewards/chosen": 0.37271690368652344, "rewards/margins": 3.6743133068084717, "rewards/rejected": -3.3015964031219482, "step": 16159 }, { "epoch": 0.8565446691225187, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35185189.333333336, "logits/rejected": -38277318.4, "logps/chosen": -172.28092447916666, "logps/rejected": -536.934619140625, "loss": 0.2203, "rewards/chosen": 0.16776021321614584, "rewards/margins": 2.853545125325521, "rewards/rejected": -2.685784912109375, "step": 16160 }, { "epoch": 0.8565976731243209, "grad_norm": 49.75, "kl": 1.4031562805175781, "learning_rate": 5e-07, "logits/chosen": 10366137.0, "logits/rejected": -1362579.375, "logps/chosen": -113.66192626953125, "logps/rejected": -83.82343292236328, "loss": 0.3324, "rewards/chosen": 0.12720417976379395, "rewards/margins": 2.842625379562378, "rewards/rejected": -2.715421199798584, "step": 16161 }, { "epoch": 0.856650677126123, "grad_norm": 75.5, "kl": 0.7395763397216797, "learning_rate": 5e-07, "logits/chosen": 78390416.0, "logits/rejected": -25244366.0, "logps/chosen": -332.7275695800781, "logps/rejected": -379.68804931640625, "loss": 0.3158, "rewards/chosen": 0.24371075630187988, "rewards/margins": 2.230727791786194, "rewards/rejected": -1.987017035484314, "step": 16162 }, { "epoch": 0.8567036811279252, "grad_norm": 50.5, "kl": 0.8708419799804688, "learning_rate": 5e-07, "logits/chosen": -58818400.0, "logits/rejected": -21979048.0, "logps/chosen": -327.38250732421875, "logps/rejected": -253.19766235351562, "loss": 0.2523, "rewards/chosen": 1.0309388637542725, "rewards/margins": 3.013804793357849, "rewards/rejected": -1.9828659296035767, "step": 16163 }, { "epoch": 0.8567566851297272, "grad_norm": 80.0, "kl": 1.892801284790039, "learning_rate": 5e-07, "logits/chosen": -42047888.0, "logits/rejected": -18044778.0, "logps/chosen": -275.61053466796875, "logps/rejected": -338.4451904296875, "loss": 0.3421, "rewards/chosen": 0.108360156416893, "rewards/margins": 2.180883750319481, "rewards/rejected": -2.072523593902588, "step": 16164 }, { "epoch": 0.8568096891315294, "grad_norm": 38.25, "kl": 1.8049001693725586, "learning_rate": 5e-07, "logits/chosen": -19663582.4, "logits/rejected": -4573598.666666667, "logps/chosen": -78.6820068359375, "logps/rejected": -141.56379191080728, "loss": 0.2704, "rewards/chosen": 0.6000580787658691, "rewards/margins": 3.588743050893148, "rewards/rejected": -2.988684972127279, "step": 16165 }, { "epoch": 0.8568626931333315, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -116584298.66666667, "logits/rejected": -43233369.6, "logps/chosen": -207.28959147135416, "logps/rejected": -537.63125, "loss": 0.3428, "rewards/chosen": -0.3601338863372803, "rewards/margins": 2.0391804218292235, "rewards/rejected": -2.399314308166504, "step": 16166 }, { "epoch": 0.8569156971351337, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44332616.0, "logits/rejected": -12186478.0, "logps/chosen": -368.100830078125, "logps/rejected": -227.59945678710938, "loss": 0.3012, "rewards/chosen": 0.10280761867761612, "rewards/margins": 2.6607895866036415, "rewards/rejected": -2.5579819679260254, "step": 16167 }, { "epoch": 0.8569687011369358, "grad_norm": 39.25, "kl": 5.418659210205078, "learning_rate": 5e-07, "logits/chosen": -21397405.333333332, "logits/rejected": -29632616.0, "logps/chosen": -499.6669514973958, "logps/rejected": -263.5934753417969, "loss": 0.411, "rewards/chosen": 0.7844319343566895, "rewards/margins": 3.7592453956604004, "rewards/rejected": -2.974813461303711, "step": 16168 }, { "epoch": 0.857021705138738, "grad_norm": 52.25, "kl": 0.5424118041992188, "learning_rate": 5e-07, "logits/chosen": -42441224.0, "logits/rejected": -27608460.8, "logps/chosen": -602.6569010416666, "logps/rejected": -415.236669921875, "loss": 0.2186, "rewards/chosen": 0.2251352866490682, "rewards/margins": 3.765654238065084, "rewards/rejected": -3.540518951416016, "step": 16169 }, { "epoch": 0.8570747091405401, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1168965.25, "logits/rejected": -35776920.0, "logps/chosen": -434.45965576171875, "logps/rejected": -399.4656575520833, "loss": 0.1562, "rewards/chosen": 0.2005462646484375, "rewards/margins": 3.542311668395996, "rewards/rejected": -3.3417654037475586, "step": 16170 }, { "epoch": 0.8571277131423423, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4390557.0, "logits/rejected": 381724.8, "logps/chosen": -637.20849609375, "logps/rejected": -311.70224609375, "loss": 0.2242, "rewards/chosen": 1.3418172200520833, "rewards/margins": 3.36863587697347, "rewards/rejected": -2.0268186569213866, "step": 16171 }, { "epoch": 0.8571807171441443, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13525576.0, "logits/rejected": -30777625.6, "logps/chosen": -375.9104817708333, "logps/rejected": -381.7941162109375, "loss": 0.2756, "rewards/chosen": -0.04100646575291952, "rewards/margins": 2.7432348291079203, "rewards/rejected": -2.78424129486084, "step": 16172 }, { "epoch": 0.8572337211459465, "grad_norm": 43.0, "kl": 2.6393051147460938, "learning_rate": 5e-07, "logits/chosen": -10748495.2, "logits/rejected": -3466991.0, "logps/chosen": -287.961083984375, "logps/rejected": -150.30199178059897, "loss": 0.3267, "rewards/chosen": 1.0360494613647462, "rewards/margins": 3.0110984802246095, "rewards/rejected": -1.9750490188598633, "step": 16173 }, { "epoch": 0.8572867251477486, "grad_norm": 39.75, "kl": 5.18775749206543, "learning_rate": 5e-07, "logits/chosen": -7973632.0, "logits/rejected": -25983316.0, "logps/chosen": -83.18829345703125, "logps/rejected": -209.1834716796875, "loss": 0.3332, "rewards/chosen": 0.6909230947494507, "rewards/margins": 2.243960976600647, "rewards/rejected": -1.5530378818511963, "step": 16174 }, { "epoch": 0.8573397291495508, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25122382.0, "logits/rejected": -17104520.0, "logps/chosen": -444.12078857421875, "logps/rejected": -160.5939483642578, "loss": 0.1985, "rewards/chosen": 0.9540695548057556, "rewards/margins": 3.3561238646507263, "rewards/rejected": -2.4020543098449707, "step": 16175 }, { "epoch": 0.8573927331513529, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9515905.333333334, "logits/rejected": -28651219.2, "logps/chosen": -294.8380940755208, "logps/rejected": -262.4308837890625, "loss": 0.1672, "rewards/chosen": 1.664778709411621, "rewards/margins": 3.7939661026000975, "rewards/rejected": -2.1291873931884764, "step": 16176 }, { "epoch": 0.8574457371531551, "grad_norm": 31.375, "kl": 2.479785919189453, "learning_rate": 5e-07, "logits/chosen": 13053464.0, "logits/rejected": -8401050.0, "logps/chosen": -35.56740951538086, "logps/rejected": -167.16141764322916, "loss": 0.2415, "rewards/chosen": 0.49149179458618164, "rewards/margins": 2.8786279360453286, "rewards/rejected": -2.387136141459147, "step": 16177 }, { "epoch": 0.8574987411549572, "grad_norm": 36.0, "kl": 0.42870330810546875, "learning_rate": 5e-07, "logits/chosen": -18462452.0, "logits/rejected": -15618081.6, "logps/chosen": -468.6231282552083, "logps/rejected": -316.377197265625, "loss": 0.2197, "rewards/chosen": 0.9309073289235433, "rewards/margins": 3.42037189801534, "rewards/rejected": -2.4894645690917967, "step": 16178 }, { "epoch": 0.8575517451567594, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75441504.0, "logits/rejected": -17584569.333333332, "logps/chosen": -236.3253631591797, "logps/rejected": -459.7003173828125, "loss": 0.2099, "rewards/chosen": -0.1784542053937912, "rewards/margins": 3.1508759210507074, "rewards/rejected": -3.3293301264444985, "step": 16179 }, { "epoch": 0.8576047491585614, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16709517.333333334, "logits/rejected": -16636116.8, "logps/chosen": -343.561767578125, "logps/rejected": -207.3087890625, "loss": 0.2984, "rewards/chosen": -0.08535818258921306, "rewards/margins": 2.1226954062779746, "rewards/rejected": -2.2080535888671875, "step": 16180 }, { "epoch": 0.8576577531603636, "grad_norm": 35.5, "kl": 1.5018806457519531, "learning_rate": 5e-07, "logits/chosen": -17485161.333333332, "logits/rejected": -35077190.4, "logps/chosen": -195.4766845703125, "logps/rejected": -403.97041015625, "loss": 0.2369, "rewards/chosen": 0.23953564961751303, "rewards/margins": 2.6036477406819665, "rewards/rejected": -2.3641120910644533, "step": 16181 }, { "epoch": 0.8577107571621657, "grad_norm": 39.75, "kl": 4.421290397644043, "learning_rate": 5e-07, "logits/chosen": -6502347.2, "logits/rejected": -6773350.0, "logps/chosen": -191.715966796875, "logps/rejected": -126.15140787760417, "loss": 0.3208, "rewards/chosen": 0.7442038536071778, "rewards/margins": 2.5340617179870604, "rewards/rejected": -1.7898578643798828, "step": 16182 }, { "epoch": 0.8577637611639679, "grad_norm": 43.0, "kl": 3.0109691619873047, "learning_rate": 5e-07, "logits/chosen": -18499568.0, "logits/rejected": -23568568.0, "logps/chosen": -170.53602600097656, "logps/rejected": -442.83807373046875, "loss": 0.2775, "rewards/chosen": 0.5229359269142151, "rewards/margins": 2.460973083972931, "rewards/rejected": -1.9380371570587158, "step": 16183 }, { "epoch": 0.85781676516577, "grad_norm": 30.875, "kl": 3.2402095794677734, "learning_rate": 5e-07, "logits/chosen": -30103756.8, "logits/rejected": -25220040.0, "logps/chosen": -783.219140625, "logps/rejected": -258.2366129557292, "loss": 0.2318, "rewards/chosen": 1.5894247055053712, "rewards/margins": 4.846040026346842, "rewards/rejected": -3.256615320841471, "step": 16184 }, { "epoch": 0.8578697691675722, "grad_norm": 46.25, "kl": 2.1170578002929688, "learning_rate": 5e-07, "logits/chosen": -5219032.0, "logits/rejected": -15354666.666666666, "logps/chosen": -261.42474365234375, "logps/rejected": -252.0916748046875, "loss": 0.2155, "rewards/chosen": 1.18105149269104, "rewards/margins": 3.495797554651896, "rewards/rejected": -2.314746061960856, "step": 16185 }, { "epoch": 0.8579227731693743, "grad_norm": 60.0, "kl": 0.3908538818359375, "learning_rate": 5e-07, "logits/chosen": -44832837.333333336, "logits/rejected": 40746040.0, "logps/chosen": -292.0179036458333, "logps/rejected": -660.5081787109375, "loss": 0.3662, "rewards/chosen": 0.18410762151082358, "rewards/margins": 3.3739941914876304, "rewards/rejected": -3.1898865699768066, "step": 16186 }, { "epoch": 0.8579757771711765, "grad_norm": 50.0, "kl": 1.666778564453125, "learning_rate": 5e-07, "logits/chosen": -32563450.0, "logits/rejected": -23143016.0, "logps/chosen": -304.31488037109375, "logps/rejected": -251.5802001953125, "loss": 0.1742, "rewards/chosen": 1.881423830986023, "rewards/margins": 3.907258152961731, "rewards/rejected": -2.025834321975708, "step": 16187 }, { "epoch": 0.8580287811729785, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28544329.6, "logits/rejected": -40476962.666666664, "logps/chosen": -257.416650390625, "logps/rejected": -645.06884765625, "loss": 0.3896, "rewards/chosen": -0.3244572639465332, "rewards/margins": 2.224458281199137, "rewards/rejected": -2.5489155451456704, "step": 16188 }, { "epoch": 0.8580817851747807, "grad_norm": 36.25, "kl": 1.4521255493164062, "learning_rate": 5e-07, "logits/chosen": 3578912.6666666665, "logits/rejected": -19192032.0, "logps/chosen": -138.20381673177084, "logps/rejected": -388.003125, "loss": 0.2372, "rewards/chosen": 0.7179948488871256, "rewards/margins": 2.7415132204691566, "rewards/rejected": -2.023518371582031, "step": 16189 }, { "epoch": 0.8581347891765828, "grad_norm": 65.0, "kl": 0.4431743621826172, "learning_rate": 5e-07, "logits/chosen": 59771800.0, "logits/rejected": 7336364.0, "logps/chosen": -412.3244323730469, "logps/rejected": -169.74771118164062, "loss": 0.2782, "rewards/chosen": 0.7999774813652039, "rewards/margins": 2.756404459476471, "rewards/rejected": -1.956426978111267, "step": 16190 }, { "epoch": 0.858187793178385, "grad_norm": 54.75, "kl": 1.6639175415039062, "learning_rate": 5e-07, "logits/chosen": -51156972.8, "logits/rejected": -37622600.0, "logps/chosen": -353.3751708984375, "logps/rejected": -723.1171061197916, "loss": 0.2475, "rewards/chosen": 1.0109211921691894, "rewards/margins": 3.6648550351460774, "rewards/rejected": -2.653933842976888, "step": 16191 }, { "epoch": 0.8582407971801871, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29831540.0, "logits/rejected": -35403296.0, "logps/chosen": -258.1361389160156, "logps/rejected": -346.1884765625, "loss": 0.2461, "rewards/chosen": 0.4392440617084503, "rewards/margins": 3.5719185173511505, "rewards/rejected": -3.1326744556427, "step": 16192 }, { "epoch": 0.8582938011819893, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40609240.0, "logits/rejected": -23456310.0, "logps/chosen": -178.30845642089844, "logps/rejected": -156.79837036132812, "loss": 0.2967, "rewards/chosen": 0.20616474747657776, "rewards/margins": 2.403032213449478, "rewards/rejected": -2.1968674659729004, "step": 16193 }, { "epoch": 0.8583468051837914, "grad_norm": 45.75, "kl": 0.4080848693847656, "learning_rate": 5e-07, "logits/chosen": -43893910.4, "logits/rejected": -52218186.666666664, "logps/chosen": -343.683984375, "logps/rejected": -519.0204264322916, "loss": 0.2271, "rewards/chosen": 0.8906347274780273, "rewards/margins": 3.7866661071777346, "rewards/rejected": -2.896031379699707, "step": 16194 }, { "epoch": 0.8583998091855936, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43946813.333333336, "logits/rejected": -4268001.6, "logps/chosen": -287.12184651692706, "logps/rejected": -139.087939453125, "loss": 0.2493, "rewards/chosen": -0.08965249856313069, "rewards/margins": 2.7494645516077676, "rewards/rejected": -2.8391170501708984, "step": 16195 }, { "epoch": 0.8584528131873956, "grad_norm": 46.75, "kl": 0.3202362060546875, "learning_rate": 5e-07, "logits/chosen": -113921280.0, "logits/rejected": -16621793.0, "logps/chosen": -287.54803466796875, "logps/rejected": -328.463134765625, "loss": 0.3114, "rewards/chosen": 0.10755559056997299, "rewards/margins": 2.108142577111721, "rewards/rejected": -2.000586986541748, "step": 16196 }, { "epoch": 0.8585058171891978, "grad_norm": 62.75, "kl": 0.578155517578125, "learning_rate": 5e-07, "logits/chosen": -38749382.4, "logits/rejected": -57512869.333333336, "logps/chosen": -334.3744140625, "logps/rejected": -435.731201171875, "loss": 0.2987, "rewards/chosen": 0.3701120376586914, "rewards/margins": 2.914620018005371, "rewards/rejected": -2.5445079803466797, "step": 16197 }, { "epoch": 0.8585588211909999, "grad_norm": 55.0, "kl": 0.3272538185119629, "learning_rate": 5e-07, "logits/chosen": -27370246.4, "logits/rejected": 13556345.333333334, "logps/chosen": -256.276220703125, "logps/rejected": -236.78645833333334, "loss": 0.4005, "rewards/chosen": 0.22440311908721924, "rewards/margins": 1.4822639385859173, "rewards/rejected": -1.257860819498698, "step": 16198 }, { "epoch": 0.8586118251928021, "grad_norm": 30.875, "kl": 4.47724723815918, "learning_rate": 5e-07, "logits/chosen": 4432048.333333333, "logits/rejected": -9581480.0, "logps/chosen": -72.14455159505208, "logps/rejected": -227.470458984375, "loss": 0.3985, "rewards/chosen": 0.5683130423227946, "rewards/margins": 2.0770458380381265, "rewards/rejected": -1.508732795715332, "step": 16199 }, { "epoch": 0.8586648291946042, "grad_norm": 49.25, "kl": 1.583024024963379, "learning_rate": 5e-07, "logits/chosen": -19769592.0, "logits/rejected": -36074850.666666664, "logps/chosen": -85.67713012695313, "logps/rejected": -143.87372843424478, "loss": 0.3115, "rewards/chosen": 0.49880352020263674, "rewards/margins": 2.6894514401753744, "rewards/rejected": -2.190647919972738, "step": 16200 }, { "epoch": 0.8587178331964064, "grad_norm": 50.25, "kl": 2.1332778930664062, "learning_rate": 5e-07, "logits/chosen": -26835259.2, "logits/rejected": -52777626.666666664, "logps/chosen": -303.0907958984375, "logps/rejected": -421.8279622395833, "loss": 0.2854, "rewards/chosen": 0.7515365600585937, "rewards/margins": 2.889964930216471, "rewards/rejected": -2.1384283701578775, "step": 16201 }, { "epoch": 0.8587708371982085, "grad_norm": 34.5, "kl": 0.6638450622558594, "learning_rate": 5e-07, "logits/chosen": -37757576.0, "logits/rejected": -49937956.571428575, "logps/chosen": -258.53369140625, "logps/rejected": -348.5257045200893, "loss": 0.1302, "rewards/chosen": 0.38148194551467896, "rewards/margins": 2.840823267187391, "rewards/rejected": -2.4593413216727122, "step": 16202 }, { "epoch": 0.8588238412000107, "grad_norm": 65.5, "kl": 4.141546249389648, "learning_rate": 5e-07, "logits/chosen": -933394.0, "logits/rejected": -1869867.0, "logps/chosen": -192.63259887695312, "logps/rejected": -252.08837890625, "loss": 0.3588, "rewards/chosen": 0.1494387686252594, "rewards/margins": 1.7145629227161407, "rewards/rejected": -1.5651241540908813, "step": 16203 }, { "epoch": 0.8588768452018127, "grad_norm": 39.25, "kl": 0.7153854370117188, "learning_rate": 5e-07, "logits/chosen": -1792597.0, "logits/rejected": -49817092.0, "logps/chosen": -313.78570556640625, "logps/rejected": -261.80267333984375, "loss": 0.2418, "rewards/chosen": 0.7173314094543457, "rewards/margins": 3.4356272220611572, "rewards/rejected": -2.7182958126068115, "step": 16204 }, { "epoch": 0.8589298492036149, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63610976.0, "logits/rejected": -20305178.0, "logps/chosen": -332.2386169433594, "logps/rejected": -364.2677001953125, "loss": 0.2459, "rewards/chosen": 0.39255523681640625, "rewards/margins": 3.3607230186462402, "rewards/rejected": -2.968167781829834, "step": 16205 }, { "epoch": 0.858982853205417, "grad_norm": 58.25, "kl": 4.641201019287109, "learning_rate": 5e-07, "logits/chosen": -641046.7142857143, "logits/rejected": -339055424.0, "logps/chosen": -205.92398507254464, "logps/rejected": -635.0189208984375, "loss": 0.475, "rewards/chosen": 0.45587158203125, "rewards/margins": 1.97369384765625, "rewards/rejected": -1.517822265625, "step": 16206 }, { "epoch": 0.8590358572072192, "grad_norm": 46.5, "kl": 2.2402210235595703, "learning_rate": 5e-07, "logits/chosen": -47623968.0, "logits/rejected": -51637178.666666664, "logps/chosen": -364.811083984375, "logps/rejected": -228.9363810221354, "loss": 0.2704, "rewards/chosen": 1.2823675155639649, "rewards/margins": 3.500256824493408, "rewards/rejected": -2.2178893089294434, "step": 16207 }, { "epoch": 0.8590888612090213, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38505548.0, "logits/rejected": 2809051.0, "logps/chosen": -284.4171142578125, "logps/rejected": -285.1717529296875, "loss": 0.2959, "rewards/chosen": 0.482385516166687, "rewards/margins": 2.047738552093506, "rewards/rejected": -1.5653530359268188, "step": 16208 }, { "epoch": 0.8591418652108234, "grad_norm": 79.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2558553.75, "logits/rejected": -19320758.85714286, "logps/chosen": -113.3232421875, "logps/rejected": -255.96934291294642, "loss": 0.208, "rewards/chosen": -0.21512527763843536, "rewards/margins": 1.821078379239355, "rewards/rejected": -2.0362036568777904, "step": 16209 }, { "epoch": 0.8591948692126256, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9491794.666666666, "logits/rejected": -11869672.8, "logps/chosen": -420.0313720703125, "logps/rejected": -359.502099609375, "loss": 0.2587, "rewards/chosen": 0.5310457944869995, "rewards/margins": 2.3164618253707885, "rewards/rejected": -1.785416030883789, "step": 16210 }, { "epoch": 0.8592478732144276, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13995606.666666666, "logits/rejected": -3581401.0, "logps/chosen": -223.6705525716146, "logps/rejected": -66.44866943359375, "loss": 0.3965, "rewards/chosen": 0.07675046722094218, "rewards/margins": 1.8748909930388133, "rewards/rejected": -1.798140525817871, "step": 16211 }, { "epoch": 0.8593008772162298, "grad_norm": 37.0, "kl": 1.0182075500488281, "learning_rate": 5e-07, "logits/chosen": 6212109.5, "logits/rejected": -37805853.333333336, "logps/chosen": -242.06309509277344, "logps/rejected": -210.25931803385416, "loss": 0.1991, "rewards/chosen": 1.106133222579956, "rewards/margins": 3.2713178793589273, "rewards/rejected": -2.165184656778971, "step": 16212 }, { "epoch": 0.8593538812180319, "grad_norm": 33.0, "kl": 2.701519012451172, "learning_rate": 5e-07, "logits/chosen": -77093344.0, "logits/rejected": -33280844.0, "logps/chosen": -221.9815673828125, "logps/rejected": -223.05746459960938, "loss": 0.2977, "rewards/chosen": 0.29895997047424316, "rewards/margins": 3.6001150608062744, "rewards/rejected": -3.3011550903320312, "step": 16213 }, { "epoch": 0.8594068852198341, "grad_norm": 28.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7119344.0, "logits/rejected": -9525664.57142857, "logps/chosen": -347.4454345703125, "logps/rejected": -235.66800362723214, "loss": 0.2012, "rewards/chosen": 1.65130615234375, "rewards/margins": 3.5974724633353095, "rewards/rejected": -1.9461663109915597, "step": 16214 }, { "epoch": 0.8594598892216362, "grad_norm": 30.5, "kl": 0.8519878387451172, "learning_rate": 5e-07, "logits/chosen": 15068844.0, "logits/rejected": -41617560.0, "logps/chosen": -142.51515197753906, "logps/rejected": -304.43756103515625, "loss": 0.1851, "rewards/chosen": 0.9669049382209778, "rewards/margins": 3.7563536763191223, "rewards/rejected": -2.7894487380981445, "step": 16215 }, { "epoch": 0.8595128932234384, "grad_norm": 43.5, "kl": 0.05420684814453125, "learning_rate": 5e-07, "logits/chosen": -30065604.0, "logits/rejected": 70317768.0, "logps/chosen": -312.71917724609375, "logps/rejected": -330.45867919921875, "loss": 0.2899, "rewards/chosen": -0.010890394449234009, "rewards/margins": 2.884591668844223, "rewards/rejected": -2.895482063293457, "step": 16216 }, { "epoch": 0.8595658972252405, "grad_norm": 47.5, "kl": 2.113694190979004, "learning_rate": 5e-07, "logits/chosen": -58505461.333333336, "logits/rejected": 2276679.0, "logps/chosen": -343.3826904296875, "logps/rejected": -64.01688385009766, "loss": 0.3685, "rewards/chosen": 0.6510217189788818, "rewards/margins": 2.253994584083557, "rewards/rejected": -1.6029728651046753, "step": 16217 }, { "epoch": 0.8596189012270427, "grad_norm": 27.625, "kl": 1.0266456604003906, "learning_rate": 5e-07, "logits/chosen": -15695679.0, "logits/rejected": -51529755.428571425, "logps/chosen": -331.6696472167969, "logps/rejected": -407.6719447544643, "loss": 0.0847, "rewards/chosen": 1.875848412513733, "rewards/margins": 4.681952255112784, "rewards/rejected": -2.806103842599051, "step": 16218 }, { "epoch": 0.8596719052288447, "grad_norm": 36.0, "kl": 1.279693603515625, "learning_rate": 5e-07, "logits/chosen": -9634428.0, "logits/rejected": -20811268.0, "logps/chosen": -267.0516052246094, "logps/rejected": -154.94424438476562, "loss": 0.3003, "rewards/chosen": 0.4127104580402374, "rewards/margins": 2.901769906282425, "rewards/rejected": -2.4890594482421875, "step": 16219 }, { "epoch": 0.8597249092306469, "grad_norm": 53.0, "kl": 0.25564002990722656, "learning_rate": 5e-07, "logits/chosen": -50821876.0, "logits/rejected": -12671736.0, "logps/chosen": -247.78652954101562, "logps/rejected": -305.75518798828125, "loss": 0.3197, "rewards/chosen": 0.18238124251365662, "rewards/margins": 2.0805860459804535, "rewards/rejected": -1.8982048034667969, "step": 16220 }, { "epoch": 0.859777913232449, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84618960.0, "logits/rejected": -50488885.333333336, "logps/chosen": -525.4193115234375, "logps/rejected": -315.86570231119794, "loss": 0.2065, "rewards/chosen": 0.4212280511856079, "rewards/margins": 2.50689963499705, "rewards/rejected": -2.085671583811442, "step": 16221 }, { "epoch": 0.8598309172342512, "grad_norm": 66.0, "kl": 0.356231689453125, "learning_rate": 5e-07, "logits/chosen": -9181132.0, "logits/rejected": -53761749.333333336, "logps/chosen": -359.073681640625, "logps/rejected": -351.6309407552083, "loss": 0.3433, "rewards/chosen": 0.47110629081726074, "rewards/margins": 1.8650639057159424, "rewards/rejected": -1.3939576148986816, "step": 16222 }, { "epoch": 0.8598839212360533, "grad_norm": 33.5, "kl": 4.567405700683594, "learning_rate": 5e-07, "logits/chosen": -19784451.2, "logits/rejected": -53114170.666666664, "logps/chosen": -304.58505859375, "logps/rejected": -138.50238037109375, "loss": 0.2755, "rewards/chosen": 1.2676753997802734, "rewards/margins": 3.9063830375671387, "rewards/rejected": -2.6387076377868652, "step": 16223 }, { "epoch": 0.8599369252378555, "grad_norm": 28.25, "kl": 3.346747398376465, "learning_rate": 5e-07, "logits/chosen": -24874990.4, "logits/rejected": -28432210.666666668, "logps/chosen": -264.5125732421875, "logps/rejected": -182.5732218424479, "loss": 0.3125, "rewards/chosen": 1.0728189468383789, "rewards/margins": 2.9653010686238606, "rewards/rejected": -1.8924821217854817, "step": 16224 }, { "epoch": 0.8599899292396576, "grad_norm": 42.25, "kl": 7.404399871826172, "learning_rate": 5e-07, "logits/chosen": -18119914.666666668, "logits/rejected": 54050912.0, "logps/chosen": -452.4302164713542, "logps/rejected": -173.35911560058594, "loss": 0.3445, "rewards/chosen": 1.8407845497131348, "rewards/margins": 3.0468852519989014, "rewards/rejected": -1.2061007022857666, "step": 16225 }, { "epoch": 0.8600429332414598, "grad_norm": 54.25, "kl": 0.10847854614257812, "learning_rate": 5e-07, "logits/chosen": 1635546.0, "logits/rejected": -10187862.0, "logps/chosen": -244.08004760742188, "logps/rejected": -165.34072875976562, "loss": 0.2668, "rewards/chosen": 0.8040348291397095, "rewards/margins": 2.723321318626404, "rewards/rejected": -1.9192864894866943, "step": 16226 }, { "epoch": 0.8600959372432618, "grad_norm": 58.75, "kl": 2.7525405883789062, "learning_rate": 5e-07, "logits/chosen": 1618723.3333333333, "logits/rejected": -24611682.0, "logps/chosen": -259.66636149088544, "logps/rejected": -313.5453186035156, "loss": 0.3844, "rewards/chosen": 0.41328481833140057, "rewards/margins": 4.109940807024638, "rewards/rejected": -3.6966559886932373, "step": 16227 }, { "epoch": 0.860148941245064, "grad_norm": 45.75, "kl": 0.10137557983398438, "learning_rate": 5e-07, "logits/chosen": -45999824.0, "logits/rejected": 3725789.0, "logps/chosen": -283.58355712890625, "logps/rejected": -420.968505859375, "loss": 0.2932, "rewards/chosen": 0.609697182973226, "rewards/margins": 3.128641923268636, "rewards/rejected": -2.51894474029541, "step": 16228 }, { "epoch": 0.8602019452468661, "grad_norm": 57.0, "kl": 0.2708930969238281, "learning_rate": 5e-07, "logits/chosen": -49328073.6, "logits/rejected": -39964920.0, "logps/chosen": -464.66328125, "logps/rejected": -209.5776163736979, "loss": 0.28, "rewards/chosen": 0.49830307960510256, "rewards/margins": 3.144110663731893, "rewards/rejected": -2.6458075841267905, "step": 16229 }, { "epoch": 0.8602549492486683, "grad_norm": 52.5, "kl": 1.5441093444824219, "learning_rate": 5e-07, "logits/chosen": -48826736.0, "logits/rejected": -32137842.0, "logps/chosen": -342.3451843261719, "logps/rejected": -314.12847900390625, "loss": 0.2141, "rewards/chosen": 1.091935396194458, "rewards/margins": 3.8727610111236572, "rewards/rejected": -2.780825614929199, "step": 16230 }, { "epoch": 0.8603079532504704, "grad_norm": 54.5, "kl": 1.3646965026855469, "learning_rate": 5e-07, "logits/chosen": -18537798.0, "logits/rejected": -38806040.0, "logps/chosen": -375.4853515625, "logps/rejected": -259.26983642578125, "loss": 0.202, "rewards/chosen": 1.3907026052474976, "rewards/margins": 3.8094810247421265, "rewards/rejected": -2.418778419494629, "step": 16231 }, { "epoch": 0.8603609572522726, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43203688.0, "logits/rejected": -10814658.0, "logps/chosen": -242.49761962890625, "logps/rejected": -428.74365234375, "loss": 0.1929, "rewards/chosen": -0.25123921036720276, "rewards/margins": 2.640796591838201, "rewards/rejected": -2.892035802205404, "step": 16232 }, { "epoch": 0.8604139612540747, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3899664.0, "logits/rejected": -20093188.0, "logps/chosen": -342.313818359375, "logps/rejected": -100.18304443359375, "loss": 0.3356, "rewards/chosen": 0.29326663017272947, "rewards/margins": 2.107438580195109, "rewards/rejected": -1.8141719500223796, "step": 16233 }, { "epoch": 0.8604669652558768, "grad_norm": 60.75, "kl": 4.758052825927734, "learning_rate": 5e-07, "logits/chosen": -44972857.6, "logits/rejected": -38761477.333333336, "logps/chosen": -666.9416015625, "logps/rejected": -335.5463460286458, "loss": 0.1996, "rewards/chosen": 1.6326990127563477, "rewards/margins": 5.037418047587076, "rewards/rejected": -3.404719034830729, "step": 16234 }, { "epoch": 0.8605199692576789, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -128315136.0, "logits/rejected": -19802356.8, "logps/chosen": -396.4903157552083, "logps/rejected": -300.32724609375, "loss": 0.2554, "rewards/chosen": 0.07636819283167522, "rewards/margins": 2.503631071249644, "rewards/rejected": -2.427262878417969, "step": 16235 }, { "epoch": 0.8605729732594811, "grad_norm": 49.75, "kl": 0.18530654907226562, "learning_rate": 5e-07, "logits/chosen": 221904.0, "logits/rejected": -10462414.4, "logps/chosen": -218.8585205078125, "logps/rejected": -160.37425537109374, "loss": 0.2345, "rewards/chosen": 0.27369487285614014, "rewards/margins": 3.1642863035202025, "rewards/rejected": -2.8905914306640623, "step": 16236 }, { "epoch": 0.8606259772612832, "grad_norm": 46.0, "kl": 3.592618942260742, "learning_rate": 5e-07, "logits/chosen": -44186547.2, "logits/rejected": -12691393.333333334, "logps/chosen": -232.325048828125, "logps/rejected": -359.8739013671875, "loss": 0.3094, "rewards/chosen": 0.6521357536315918, "rewards/margins": 2.585152848561605, "rewards/rejected": -1.933017094930013, "step": 16237 }, { "epoch": 0.8606789812630854, "grad_norm": 47.0, "kl": 1.3371829986572266, "learning_rate": 5e-07, "logits/chosen": -51461104.0, "logits/rejected": -9111772.0, "logps/chosen": -597.7347819010416, "logps/rejected": -81.33909301757812, "loss": 0.2033, "rewards/chosen": 1.175154685974121, "rewards/margins": 3.6205078125, "rewards/rejected": -2.445353126525879, "step": 16238 }, { "epoch": 0.8607319852648875, "grad_norm": 42.25, "kl": 0.093048095703125, "learning_rate": 5e-07, "logits/chosen": -15894737.0, "logits/rejected": -22498944.0, "logps/chosen": -227.18739318847656, "logps/rejected": -342.93377685546875, "loss": 0.2431, "rewards/chosen": 0.5357155203819275, "rewards/margins": 3.196814000606537, "rewards/rejected": -2.6610984802246094, "step": 16239 }, { "epoch": 0.8607849892666897, "grad_norm": 31.125, "kl": 0.028156280517578125, "learning_rate": 5e-07, "logits/chosen": -5593118.666666667, "logits/rejected": -20683376.0, "logps/chosen": -155.03704833984375, "logps/rejected": -159.82432861328124, "loss": 0.1836, "rewards/chosen": 0.9200553099314371, "rewards/margins": 3.4160136381785073, "rewards/rejected": -2.4959583282470703, "step": 16240 }, { "epoch": 0.8608379932684918, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26536880.0, "logits/rejected": -14785801.333333334, "logps/chosen": -443.5033874511719, "logps/rejected": -215.41837565104166, "loss": 0.2487, "rewards/chosen": 0.6274397373199463, "rewards/margins": 2.3190092245737715, "rewards/rejected": -1.691569487253825, "step": 16241 }, { "epoch": 0.860890997270294, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -17151476.0, "logps/rejected": -199.60244750976562, "loss": 0.099, "rewards/rejected": -3.0622830390930176, "step": 16242 }, { "epoch": 0.860944001272096, "grad_norm": 28.875, "kl": 2.1765575408935547, "learning_rate": 5e-07, "logits/chosen": -14314310.0, "logits/rejected": -45980480.0, "logps/chosen": -207.20712280273438, "logps/rejected": -547.353759765625, "loss": 0.1742, "rewards/chosen": 1.2326433658599854, "rewards/margins": 4.823485612869263, "rewards/rejected": -3.5908422470092773, "step": 16243 }, { "epoch": 0.8609970052738982, "grad_norm": 39.0, "kl": 2.5772132873535156, "learning_rate": 5e-07, "logits/chosen": -10290458.666666666, "logits/rejected": -12747408.0, "logps/chosen": -189.89776611328125, "logps/rejected": -440.61142578125, "loss": 0.2099, "rewards/chosen": 1.0009302298227947, "rewards/margins": 3.4203378836313885, "rewards/rejected": -2.4194076538085936, "step": 16244 }, { "epoch": 0.8610500092757003, "grad_norm": 30.125, "kl": 2.190157890319824, "learning_rate": 5e-07, "logits/chosen": -18031696.0, "logits/rejected": -49891136.0, "logps/chosen": -157.91172790527344, "logps/rejected": -416.252685546875, "loss": 0.2821, "rewards/chosen": 0.1652437299489975, "rewards/margins": 5.790802106261253, "rewards/rejected": -5.625558376312256, "step": 16245 }, { "epoch": 0.8611030132775025, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37909532.0, "logits/rejected": -5444166.0, "logps/chosen": -335.3919982910156, "logps/rejected": -201.18037923177084, "loss": 0.1193, "rewards/chosen": 1.329889178276062, "rewards/margins": 4.2011266152064, "rewards/rejected": -2.8712374369303384, "step": 16246 }, { "epoch": 0.8611560172793046, "grad_norm": 29.5, "kl": 4.159999847412109, "learning_rate": 5e-07, "logits/chosen": -10478942.0, "logits/rejected": -35264816.0, "logps/chosen": -554.7044677734375, "logps/rejected": -312.6203308105469, "loss": 0.1872, "rewards/chosen": 1.5951039791107178, "rewards/margins": 5.6092212200164795, "rewards/rejected": -4.014117240905762, "step": 16247 }, { "epoch": 0.8612090212811068, "grad_norm": 41.5, "kl": 0.08080291748046875, "learning_rate": 5e-07, "logits/chosen": -15524849.333333334, "logits/rejected": -37780460.8, "logps/chosen": -376.3247477213542, "logps/rejected": -432.593896484375, "loss": 0.1349, "rewards/chosen": 1.7190887133280437, "rewards/margins": 5.588614622751872, "rewards/rejected": -3.869525909423828, "step": 16248 }, { "epoch": 0.8612620252829088, "grad_norm": 39.0, "kl": 2.5295028686523438, "learning_rate": 5e-07, "logits/chosen": -7735374.0, "logits/rejected": -19933810.0, "logps/chosen": -186.22314453125, "logps/rejected": -340.45867919921875, "loss": 0.2704, "rewards/chosen": 0.6865966320037842, "rewards/margins": 3.761157274246216, "rewards/rejected": -3.0745606422424316, "step": 16249 }, { "epoch": 0.861315029284711, "grad_norm": 41.75, "kl": 0.8835182189941406, "learning_rate": 5e-07, "logits/chosen": -37962253.333333336, "logits/rejected": -23141660.0, "logps/chosen": -346.7005208333333, "logps/rejected": -614.676025390625, "loss": 0.3057, "rewards/chosen": 1.0139168898264568, "rewards/margins": 3.449482838312785, "rewards/rejected": -2.435565948486328, "step": 16250 }, { "epoch": 0.8613680332865131, "grad_norm": 44.0, "kl": 4.925810813903809, "learning_rate": 5e-07, "logits/chosen": -19855302.4, "logits/rejected": 5114160.0, "logps/chosen": -165.823583984375, "logps/rejected": -298.4307047526042, "loss": 0.3628, "rewards/chosen": 0.6230425834655762, "rewards/margins": 3.3512202898661294, "rewards/rejected": -2.7281777064005532, "step": 16251 }, { "epoch": 0.8614210372883153, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15912617.6, "logits/rejected": -6456116.0, "logps/chosen": -345.959765625, "logps/rejected": -502.289306640625, "loss": 0.1913, "rewards/chosen": 1.2184907913208007, "rewards/margins": 3.9879438400268556, "rewards/rejected": -2.7694530487060547, "step": 16252 }, { "epoch": 0.8614740412901174, "grad_norm": 38.25, "kl": 1.1653709411621094, "learning_rate": 5e-07, "logits/chosen": 6015182.5, "logits/rejected": -33091138.666666668, "logps/chosen": -94.54887390136719, "logps/rejected": -327.6155192057292, "loss": 0.1632, "rewards/chosen": 0.6853408813476562, "rewards/margins": 3.421330451965332, "rewards/rejected": -2.735989570617676, "step": 16253 }, { "epoch": 0.8615270452919196, "grad_norm": 35.5, "kl": 3.2705259323120117, "learning_rate": 5e-07, "logits/chosen": 3646946.6666666665, "logits/rejected": 5807940.8, "logps/chosen": -187.13753255208334, "logps/rejected": -271.6232177734375, "loss": 0.1889, "rewards/chosen": 1.3624327977498372, "rewards/margins": 3.884210522969564, "rewards/rejected": -2.5217777252197267, "step": 16254 }, { "epoch": 0.8615800492937217, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11233086.0, "logits/rejected": -19483966.666666668, "logps/chosen": -400.2802734375, "logps/rejected": -283.61358642578125, "loss": 0.2432, "rewards/chosen": 0.08046722412109375, "rewards/margins": 2.428140640258789, "rewards/rejected": -2.3476734161376953, "step": 16255 }, { "epoch": 0.8616330532955239, "grad_norm": 51.0, "kl": 5.526946067810059, "learning_rate": 5e-07, "logits/chosen": -20081088.0, "logits/rejected": -46710304.0, "logps/chosen": -457.9020182291667, "logps/rejected": -869.1798095703125, "loss": 0.3549, "rewards/chosen": 0.8377499580383301, "rewards/margins": 4.774017333984375, "rewards/rejected": -3.936267375946045, "step": 16256 }, { "epoch": 0.861686057297326, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17565460.0, "logits/rejected": -24982808.0, "logps/chosen": -256.1156412760417, "logps/rejected": -258.6103759765625, "loss": 0.1752, "rewards/chosen": 0.5605870485305786, "rewards/margins": 3.945580792427063, "rewards/rejected": -3.3849937438964846, "step": 16257 }, { "epoch": 0.8617390612991281, "grad_norm": 38.25, "kl": 1.133772850036621, "learning_rate": 5e-07, "logits/chosen": 1655934.8333333333, "logits/rejected": -42440409.6, "logps/chosen": -697.33984375, "logps/rejected": -441.282666015625, "loss": 0.1458, "rewards/chosen": 1.2553314367930095, "rewards/margins": 4.3987972418467205, "rewards/rejected": -3.1434658050537108, "step": 16258 }, { "epoch": 0.8617920653009302, "grad_norm": 36.0, "kl": 5.286365509033203, "learning_rate": 5e-07, "logits/chosen": 8514778.0, "logits/rejected": -9430902.0, "logps/chosen": -28.268569946289062, "logps/rejected": -125.51231384277344, "loss": 0.347, "rewards/chosen": 0.7074926495552063, "rewards/margins": 2.1950703263282776, "rewards/rejected": -1.4875776767730713, "step": 16259 }, { "epoch": 0.8618450693027323, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31552170.666666668, "logits/rejected": -20223408.0, "logps/chosen": -239.82938639322916, "logps/rejected": -273.877783203125, "loss": 0.2577, "rewards/chosen": 0.6177612940470377, "rewards/margins": 2.255503527323405, "rewards/rejected": -1.6377422332763671, "step": 16260 }, { "epoch": 0.8618980733045345, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52960416.0, "logits/rejected": -16537628.8, "logps/chosen": -336.8299967447917, "logps/rejected": -135.34697265625, "loss": 0.1367, "rewards/chosen": 0.8207271099090576, "rewards/margins": 4.983177518844604, "rewards/rejected": -4.1624504089355465, "step": 16261 }, { "epoch": 0.8619510773063366, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21196976.0, "logits/rejected": -6510140.5, "logps/chosen": -356.2854309082031, "logps/rejected": -424.99176025390625, "loss": 0.2376, "rewards/chosen": 0.637057900428772, "rewards/margins": 2.8569637537002563, "rewards/rejected": -2.2199058532714844, "step": 16262 }, { "epoch": 0.8620040813081388, "grad_norm": 44.25, "kl": 0.26967430114746094, "learning_rate": 5e-07, "logits/chosen": -16347552.0, "logits/rejected": -8082223.0, "logps/chosen": -317.9322509765625, "logps/rejected": -104.33381652832031, "loss": 0.2823, "rewards/chosen": 0.6785445213317871, "rewards/margins": 3.493100166320801, "rewards/rejected": -2.8145556449890137, "step": 16263 }, { "epoch": 0.8620570853099409, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -95937360.0, "logits/rejected": -36126651.428571425, "logps/chosen": -414.7840881347656, "logps/rejected": -368.7006138392857, "loss": 0.1514, "rewards/chosen": -0.32535097002983093, "rewards/margins": 2.669935426541737, "rewards/rejected": -2.995286396571568, "step": 16264 }, { "epoch": 0.862110089311743, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36546704.0, "logits/rejected": 6238797.5, "logps/chosen": -392.988525390625, "logps/rejected": -336.76849365234375, "loss": 0.258, "rewards/chosen": 0.9155013561248779, "rewards/margins": 3.3293955326080322, "rewards/rejected": -2.4138941764831543, "step": 16265 }, { "epoch": 0.8621630933135451, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -341051.3333333333, "logits/rejected": -19688529.6, "logps/chosen": -152.51819864908853, "logps/rejected": -248.851220703125, "loss": 0.1805, "rewards/chosen": 0.6663386027018229, "rewards/margins": 3.493139902750651, "rewards/rejected": -2.826801300048828, "step": 16266 }, { "epoch": 0.8622160973153473, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32674866.0, "logits/rejected": -19840502.0, "logps/chosen": -425.01788330078125, "logps/rejected": -324.2811584472656, "loss": 0.1889, "rewards/chosen": 0.8103870749473572, "rewards/margins": 3.679313600063324, "rewards/rejected": -2.868926525115967, "step": 16267 }, { "epoch": 0.8622691013171494, "grad_norm": 50.75, "kl": 2.3413238525390625, "learning_rate": 5e-07, "logits/chosen": -16458577.142857144, "logits/rejected": -802928.375, "logps/chosen": -253.92440359933036, "logps/rejected": -101.58749389648438, "loss": 0.3825, "rewards/chosen": 0.4874276433672224, "rewards/margins": 2.252001336642674, "rewards/rejected": -1.7645736932754517, "step": 16268 }, { "epoch": 0.8623221053189516, "grad_norm": 75.0, "kl": 7.992715835571289, "learning_rate": 5e-07, "logits/chosen": -12124993.142857144, "logits/rejected": -52064388.0, "logps/chosen": -315.8474818638393, "logps/rejected": -622.4937133789062, "loss": 0.4008, "rewards/chosen": 1.1333250999450684, "rewards/margins": 4.203442335128784, "rewards/rejected": -3.070117235183716, "step": 16269 }, { "epoch": 0.8623751093207537, "grad_norm": 39.5, "kl": 0.4253864288330078, "learning_rate": 5e-07, "logits/chosen": -58454956.0, "logits/rejected": -22356582.0, "logps/chosen": -433.0545654296875, "logps/rejected": -151.27792358398438, "loss": 0.265, "rewards/chosen": 0.9409672021865845, "rewards/margins": 3.9513577222824097, "rewards/rejected": -3.010390520095825, "step": 16270 }, { "epoch": 0.8624281133225559, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38432648.0, "logits/rejected": -51101756.8, "logps/chosen": -347.4390869140625, "logps/rejected": -223.0521484375, "loss": 0.23, "rewards/chosen": 0.2787853280703227, "rewards/margins": 3.677552036444346, "rewards/rejected": -3.3987667083740236, "step": 16271 }, { "epoch": 0.862481117324358, "grad_norm": 51.75, "kl": 3.857219696044922, "learning_rate": 5e-07, "logits/chosen": 2706735.6666666665, "logits/rejected": -4228304.0, "logps/chosen": -235.71256510416666, "logps/rejected": -164.095458984375, "loss": 0.4393, "rewards/chosen": 0.5801384846369425, "rewards/margins": 1.4509877363840737, "rewards/rejected": -0.8708492517471313, "step": 16272 }, { "epoch": 0.8625341213261601, "grad_norm": 43.25, "kl": 0.7609338760375977, "learning_rate": 5e-07, "logits/chosen": -1110670.0, "logits/rejected": -4365113.0, "logps/chosen": -187.7064208984375, "logps/rejected": -208.61056518554688, "loss": 0.3367, "rewards/chosen": 0.5241882801055908, "rewards/margins": 2.075246572494507, "rewards/rejected": -1.551058292388916, "step": 16273 }, { "epoch": 0.8625871253279622, "grad_norm": 53.5, "kl": 1.3580245971679688, "learning_rate": 5e-07, "logits/chosen": -28120332.8, "logits/rejected": 23999712.0, "logps/chosen": -266.528955078125, "logps/rejected": -405.3177083333333, "loss": 0.367, "rewards/chosen": 0.4019052982330322, "rewards/margins": 2.136155843734741, "rewards/rejected": -1.734250545501709, "step": 16274 }, { "epoch": 0.8626401293297644, "grad_norm": 58.25, "kl": 4.489255905151367, "learning_rate": 5e-07, "logits/chosen": -12320527.2, "logits/rejected": -30866800.0, "logps/chosen": -256.6801025390625, "logps/rejected": -453.44287109375, "loss": 0.292, "rewards/chosen": 0.9738140106201172, "rewards/margins": 3.9341166814168296, "rewards/rejected": -2.9603026707967124, "step": 16275 }, { "epoch": 0.8626931333315665, "grad_norm": 43.0, "kl": 1.9027252197265625, "learning_rate": 5e-07, "logits/chosen": -21053364.57142857, "logits/rejected": -70868304.0, "logps/chosen": -179.43352399553572, "logps/rejected": -372.2117919921875, "loss": 0.36, "rewards/chosen": 0.7855842454092843, "rewards/margins": 3.2157662255423407, "rewards/rejected": -2.4301819801330566, "step": 16276 }, { "epoch": 0.8627461373333687, "grad_norm": 45.25, "kl": 0.5587863922119141, "learning_rate": 5e-07, "logits/chosen": -17229760.0, "logits/rejected": -19942458.666666668, "logps/chosen": -251.4921142578125, "logps/rejected": -415.0181070963542, "loss": 0.257, "rewards/chosen": 0.6467844009399414, "rewards/margins": 3.6476970672607423, "rewards/rejected": -3.000912666320801, "step": 16277 }, { "epoch": 0.8627991413351708, "grad_norm": 42.5, "kl": 0.8338146209716797, "learning_rate": 5e-07, "logits/chosen": -35309092.0, "logits/rejected": -14191912.0, "logps/chosen": -773.439208984375, "logps/rejected": -168.44563802083334, "loss": 0.1657, "rewards/chosen": 1.2930771112442017, "rewards/margins": 3.5147300958633423, "rewards/rejected": -2.2216529846191406, "step": 16278 }, { "epoch": 0.862852145336973, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11194083.0, "logits/rejected": -53586372.571428575, "logps/chosen": -312.24224853515625, "logps/rejected": -292.10292271205356, "loss": 0.1281, "rewards/chosen": 2.177816867828369, "rewards/margins": 4.6826348304748535, "rewards/rejected": -2.5048179626464844, "step": 16279 }, { "epoch": 0.862905149338775, "grad_norm": 48.5, "kl": 1.8484992980957031, "learning_rate": 5e-07, "logits/chosen": -2071263.8, "logits/rejected": -4067262.3333333335, "logps/chosen": -237.398681640625, "logps/rejected": -357.15625, "loss": 0.3817, "rewards/chosen": 0.04240216016769409, "rewards/margins": 2.640186361471812, "rewards/rejected": -2.5977842013041177, "step": 16280 }, { "epoch": 0.8629581533405772, "grad_norm": 47.25, "kl": 2.046152114868164, "learning_rate": 5e-07, "logits/chosen": 14102025.0, "logits/rejected": -28003360.0, "logps/chosen": -30.29180335998535, "logps/rejected": -348.8956298828125, "loss": 0.1872, "rewards/chosen": 0.012085720896720886, "rewards/margins": 3.1591537445783615, "rewards/rejected": -3.1470680236816406, "step": 16281 }, { "epoch": 0.8630111573423793, "grad_norm": 74.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 174541024.0, "logits/rejected": -29067072.0, "logps/chosen": -394.59356689453125, "logps/rejected": -309.6166178385417, "loss": 0.2826, "rewards/chosen": -0.3138580322265625, "rewards/margins": 1.8668909072875977, "rewards/rejected": -2.18074893951416, "step": 16282 }, { "epoch": 0.8630641613441815, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -28248004.0, "logps/rejected": -203.67637634277344, "loss": 0.1862, "rewards/rejected": -2.259592056274414, "step": 16283 }, { "epoch": 0.8631171653459836, "grad_norm": 42.0, "kl": 0.9617996215820312, "learning_rate": 5e-07, "logits/chosen": -43044661.333333336, "logits/rejected": -25752739.2, "logps/chosen": -234.70084635416666, "logps/rejected": -280.218115234375, "loss": 0.2517, "rewards/chosen": 0.7616877555847168, "rewards/margins": 2.6732029914855957, "rewards/rejected": -1.911515235900879, "step": 16284 }, { "epoch": 0.8631701693477858, "grad_norm": 84.5, "kl": 1.0573539733886719, "learning_rate": 5e-07, "logits/chosen": 7719552.666666667, "logits/rejected": -58503072.0, "logps/chosen": -294.58270263671875, "logps/rejected": -838.4097290039062, "loss": 0.3725, "rewards/chosen": 0.20021355152130127, "rewards/margins": 5.6535035371780396, "rewards/rejected": -5.453289985656738, "step": 16285 }, { "epoch": 0.8632231733495879, "grad_norm": 42.75, "kl": 1.3234786987304688, "learning_rate": 5e-07, "logits/chosen": -48930648.0, "logits/rejected": -18483444.0, "logps/chosen": -384.81195068359375, "logps/rejected": -186.81356811523438, "loss": 0.294, "rewards/chosen": 0.06459073722362518, "rewards/margins": 2.8300567120313644, "rewards/rejected": -2.7654659748077393, "step": 16286 }, { "epoch": 0.8632761773513901, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43185308.0, "logits/rejected": -27817100.0, "logps/chosen": -312.07330322265625, "logps/rejected": -214.98524475097656, "loss": 0.2993, "rewards/chosen": 0.3314143419265747, "rewards/margins": 2.0875593423843384, "rewards/rejected": -1.7561450004577637, "step": 16287 }, { "epoch": 0.8633291813531921, "grad_norm": 48.75, "kl": 1.2053298950195312, "learning_rate": 5e-07, "logits/chosen": 8108828.0, "logits/rejected": -33834985.6, "logps/chosen": -32.924540201822914, "logps/rejected": -274.9456298828125, "loss": 0.2998, "rewards/chosen": 0.24738152821858725, "rewards/margins": 1.795742925008138, "rewards/rejected": -1.5483613967895509, "step": 16288 }, { "epoch": 0.8633821853549943, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3437276.0, "logits/rejected": -8587062.666666666, "logps/chosen": -198.659912109375, "logps/rejected": -149.4786173502604, "loss": 0.1319, "rewards/chosen": 0.9901493191719055, "rewards/margins": 4.996506830056508, "rewards/rejected": -4.006357510884603, "step": 16289 }, { "epoch": 0.8634351893567964, "grad_norm": 46.5, "kl": 1.597555160522461, "learning_rate": 5e-07, "logits/chosen": 3532632.0, "logits/rejected": -60629632.0, "logps/chosen": -334.5405578613281, "logps/rejected": -1055.13916015625, "loss": 0.1953, "rewards/chosen": 0.8249454498291016, "rewards/margins": 5.599727630615234, "rewards/rejected": -4.774782180786133, "step": 16290 }, { "epoch": 0.8634881933585986, "grad_norm": 49.75, "kl": 3.3361358642578125, "learning_rate": 5e-07, "logits/chosen": -36679624.0, "logits/rejected": -33910744.0, "logps/chosen": -357.4150390625, "logps/rejected": -373.5451965332031, "loss": 0.2274, "rewards/chosen": 0.9860062003135681, "rewards/margins": 3.1498369574546814, "rewards/rejected": -2.1638307571411133, "step": 16291 }, { "epoch": 0.8635411973604007, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2209637.5, "logits/rejected": -37305578.666666664, "logps/chosen": -231.87857055664062, "logps/rejected": -315.35719807942706, "loss": 0.2052, "rewards/chosen": 1.0104316473007202, "rewards/margins": 3.4384352763493857, "rewards/rejected": -2.4280036290486655, "step": 16292 }, { "epoch": 0.8635942013622029, "grad_norm": 53.5, "kl": 1.639078140258789, "learning_rate": 5e-07, "logits/chosen": -17813438.0, "logits/rejected": -37850104.0, "logps/chosen": -362.71844482421875, "logps/rejected": -458.9365539550781, "loss": 0.2338, "rewards/chosen": 0.6340469121932983, "rewards/margins": 3.5814071893692017, "rewards/rejected": -2.9473602771759033, "step": 16293 }, { "epoch": 0.863647205364005, "grad_norm": 42.25, "kl": 0.10527801513671875, "learning_rate": 5e-07, "logits/chosen": -41507040.0, "logits/rejected": -40535848.0, "logps/chosen": -647.4658203125, "logps/rejected": -348.3419189453125, "loss": 0.1973, "rewards/chosen": 1.1701939105987549, "rewards/margins": 3.9077534675598145, "rewards/rejected": -2.7375595569610596, "step": 16294 }, { "epoch": 0.8637002093658072, "grad_norm": 45.0, "kl": 1.4327373504638672, "learning_rate": 5e-07, "logits/chosen": -5678065.333333333, "logits/rejected": -5731146.5, "logps/chosen": -155.65054321289062, "logps/rejected": -130.8994903564453, "loss": 0.3769, "rewards/chosen": 0.5291207631429037, "rewards/margins": 1.685701092084249, "rewards/rejected": -1.1565803289413452, "step": 16295 }, { "epoch": 0.8637532133676092, "grad_norm": 44.75, "kl": 2.096111297607422, "learning_rate": 5e-07, "logits/chosen": -28472677.333333332, "logits/rejected": -33446152.0, "logps/chosen": -217.3290812174479, "logps/rejected": -391.291015625, "loss": 0.3855, "rewards/chosen": 0.5610675811767578, "rewards/margins": 3.325866937637329, "rewards/rejected": -2.7647993564605713, "step": 16296 }, { "epoch": 0.8638062173694114, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -107864816.0, "logits/rejected": -44653577.14285714, "logps/chosen": -673.4979248046875, "logps/rejected": -376.8881138392857, "loss": 0.0967, "rewards/chosen": 0.7241760492324829, "rewards/margins": 3.6138146775109425, "rewards/rejected": -2.8896386282784596, "step": 16297 }, { "epoch": 0.8638592213712135, "grad_norm": 72.5, "kl": 1.0725383758544922, "learning_rate": 5e-07, "logits/chosen": -6676279.5, "logits/rejected": -19634936.0, "logps/chosen": -205.44235229492188, "logps/rejected": -296.150146484375, "loss": 0.2282, "rewards/chosen": 0.39943885803222656, "rewards/margins": 2.301233132680257, "rewards/rejected": -1.9017942746480305, "step": 16298 }, { "epoch": 0.8639122253730157, "grad_norm": 30.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4247604.333333333, "logits/rejected": -14798963.2, "logps/chosen": -77.60994466145833, "logps/rejected": -158.40755615234374, "loss": 0.241, "rewards/chosen": 0.365971843401591, "rewards/margins": 2.8525072971979775, "rewards/rejected": -2.4865354537963866, "step": 16299 }, { "epoch": 0.8639652293748178, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103695744.0, "logits/rejected": -26113027.2, "logps/chosen": -360.8323567708333, "logps/rejected": -260.221337890625, "loss": 0.2663, "rewards/chosen": -0.041152959068616234, "rewards/margins": 2.215387720863024, "rewards/rejected": -2.2565406799316405, "step": 16300 }, { "epoch": 0.86401823337662, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35270505.6, "logits/rejected": -21110978.666666668, "logps/chosen": -301.9817138671875, "logps/rejected": -191.75960286458334, "loss": 0.2566, "rewards/chosen": 0.5734467506408691, "rewards/margins": 3.865956465403239, "rewards/rejected": -3.2925097147623696, "step": 16301 }, { "epoch": 0.8640712373784221, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7104624.0, "logits/rejected": -30584941.333333332, "logps/chosen": -395.46014404296875, "logps/rejected": -350.2024332682292, "loss": 0.1985, "rewards/chosen": -0.01371612399816513, "rewards/margins": 3.141761081914107, "rewards/rejected": -3.155477205912272, "step": 16302 }, { "epoch": 0.8641242413802243, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67158021.33333333, "logits/rejected": -33638419.2, "logps/chosen": -378.561279296875, "logps/rejected": -222.4648681640625, "loss": 0.2742, "rewards/chosen": -0.08998920520146687, "rewards/margins": 2.751945189634959, "rewards/rejected": -2.841934394836426, "step": 16303 }, { "epoch": 0.8641772453820263, "grad_norm": 40.75, "kl": 1.4818315505981445, "learning_rate": 5e-07, "logits/chosen": 22969042.0, "logits/rejected": -32373318.0, "logps/chosen": -263.02960205078125, "logps/rejected": -515.0792236328125, "loss": 0.2487, "rewards/chosen": 0.28044599294662476, "rewards/margins": 3.511229693889618, "rewards/rejected": -3.230783700942993, "step": 16304 }, { "epoch": 0.8642302493838285, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16153566.4, "logits/rejected": -84479168.0, "logps/chosen": -439.2208984375, "logps/rejected": -310.4191080729167, "loss": 0.304, "rewards/chosen": 0.5033256530761718, "rewards/margins": 2.906349849700928, "rewards/rejected": -2.403024196624756, "step": 16305 }, { "epoch": 0.8642832533856306, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28795644.0, "logits/rejected": -20175334.0, "logps/chosen": -217.12918090820312, "logps/rejected": -272.21685791015625, "loss": 0.2592, "rewards/chosen": 0.049556463956832886, "rewards/margins": 4.368808001279831, "rewards/rejected": -4.319251537322998, "step": 16306 }, { "epoch": 0.8643362573874328, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15124612.8, "logits/rejected": -52086517.333333336, "logps/chosen": -264.460302734375, "logps/rejected": -583.2780354817709, "loss": 0.272, "rewards/chosen": 0.5669459342956543, "rewards/margins": 3.340333525339762, "rewards/rejected": -2.773387591044108, "step": 16307 }, { "epoch": 0.8643892613892349, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1102165.75, "logits/rejected": -13273336.0, "logps/chosen": -32.52271270751953, "logps/rejected": -287.25852457682294, "loss": 0.2246, "rewards/chosen": -0.0803302749991417, "rewards/margins": 2.2840922052661576, "rewards/rejected": -2.3644224802652993, "step": 16308 }, { "epoch": 0.864442265391037, "grad_norm": 34.5, "kl": 1.582611083984375, "learning_rate": 5e-07, "logits/chosen": -18976832.0, "logits/rejected": -28401264.0, "logps/chosen": -247.4932861328125, "logps/rejected": -478.53994140625, "loss": 0.1924, "rewards/chosen": 0.4091421365737915, "rewards/margins": 3.846437668800354, "rewards/rejected": -3.4372955322265626, "step": 16309 }, { "epoch": 0.8644952693928392, "grad_norm": 37.75, "kl": 1.568079948425293, "learning_rate": 5e-07, "logits/chosen": -38060160.0, "logits/rejected": -28888688.0, "logps/chosen": -200.7599365234375, "logps/rejected": -278.5850830078125, "loss": 0.3032, "rewards/chosen": 0.5881887912750244, "rewards/margins": 2.766215626398722, "rewards/rejected": -2.1780268351236978, "step": 16310 }, { "epoch": 0.8645482733946412, "grad_norm": 39.5, "kl": 1.7754554748535156, "learning_rate": 5e-07, "logits/chosen": -22160760.0, "logits/rejected": -53711144.0, "logps/chosen": -200.86105346679688, "logps/rejected": -372.5902404785156, "loss": 0.3463, "rewards/chosen": -0.1914016604423523, "rewards/margins": 2.2790732979774475, "rewards/rejected": -2.4704749584198, "step": 16311 }, { "epoch": 0.8646012773964434, "grad_norm": 38.25, "kl": 1.5309677124023438, "learning_rate": 5e-07, "logits/chosen": -26778138.666666668, "logits/rejected": -32327948.8, "logps/chosen": -271.7716878255208, "logps/rejected": -427.824609375, "loss": 0.2189, "rewards/chosen": 0.5683349768320719, "rewards/margins": 2.992323319117228, "rewards/rejected": -2.4239883422851562, "step": 16312 }, { "epoch": 0.8646542813982455, "grad_norm": 63.0, "kl": 1.0983200073242188, "learning_rate": 5e-07, "logits/chosen": -50041109.333333336, "logits/rejected": -10841612.0, "logps/chosen": -411.9703776041667, "logps/rejected": -116.84013366699219, "loss": 0.3059, "rewards/chosen": 0.8492991129557291, "rewards/margins": 3.5982064406077066, "rewards/rejected": -2.7489073276519775, "step": 16313 }, { "epoch": 0.8647072854000477, "grad_norm": 62.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53295364.0, "logits/rejected": 95778080.0, "logps/chosen": -323.8362731933594, "logps/rejected": -425.4191487630208, "loss": 0.1589, "rewards/chosen": 0.9086792469024658, "rewards/margins": 3.647451639175415, "rewards/rejected": -2.738772392272949, "step": 16314 }, { "epoch": 0.8647602894018498, "grad_norm": 36.75, "kl": 5.114926338195801, "learning_rate": 5e-07, "logits/chosen": -11469631.0, "logits/rejected": -42905988.0, "logps/chosen": -304.32708740234375, "logps/rejected": -499.9060974121094, "loss": 0.2711, "rewards/chosen": 1.1221134662628174, "rewards/margins": 4.111004114151001, "rewards/rejected": -2.9888906478881836, "step": 16315 }, { "epoch": 0.864813293403652, "grad_norm": 40.0, "kl": 1.700333595275879, "learning_rate": 5e-07, "logits/chosen": -8323560.0, "logits/rejected": -24976992.0, "logps/chosen": -319.803125, "logps/rejected": -389.0294596354167, "loss": 0.36, "rewards/chosen": 0.9342342376708984, "rewards/margins": 2.090158716837565, "rewards/rejected": -1.1559244791666667, "step": 16316 }, { "epoch": 0.8648662974054541, "grad_norm": 45.0, "kl": 0.9863815307617188, "learning_rate": 5e-07, "logits/chosen": -28417174.4, "logits/rejected": -19237853.333333332, "logps/chosen": -329.589013671875, "logps/rejected": -374.0321858723958, "loss": 0.3698, "rewards/chosen": 0.4170022964477539, "rewards/margins": 2.568681812286377, "rewards/rejected": -2.151679515838623, "step": 16317 }, { "epoch": 0.8649193014072563, "grad_norm": 37.75, "kl": 1.3542242050170898, "learning_rate": 5e-07, "logits/chosen": -7476114.5, "logits/rejected": -3739675.0, "logps/chosen": -343.2960205078125, "logps/rejected": -620.9033813476562, "loss": 0.1552, "rewards/chosen": 1.1695761680603027, "rewards/margins": 5.049530982971191, "rewards/rejected": -3.8799548149108887, "step": 16318 }, { "epoch": 0.8649723054090583, "grad_norm": 60.75, "kl": 3.1157150268554688, "learning_rate": 5e-07, "logits/chosen": -55237434.666666664, "logits/rejected": 79454136.0, "logps/chosen": -856.1148274739584, "logps/rejected": -305.90924072265625, "loss": 0.3824, "rewards/chosen": 1.0461739699045818, "rewards/margins": 2.1618631283442182, "rewards/rejected": -1.1156891584396362, "step": 16319 }, { "epoch": 0.8650253094108605, "grad_norm": 48.25, "kl": 5.6266374588012695, "learning_rate": 5e-07, "logits/chosen": -17342906.666666668, "logits/rejected": -22346944.0, "logps/chosen": -121.4473164876302, "logps/rejected": -313.19769287109375, "loss": 0.3864, "rewards/chosen": 0.9130130608876547, "rewards/margins": 2.2059741814931235, "rewards/rejected": -1.2929611206054688, "step": 16320 }, { "epoch": 0.8650783134126626, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34537328.0, "logits/rejected": -44463544.0, "logps/chosen": -383.1123352050781, "logps/rejected": -324.42431640625, "loss": 0.35, "rewards/chosen": 0.12447871267795563, "rewards/margins": 2.356990233063698, "rewards/rejected": -2.232511520385742, "step": 16321 }, { "epoch": 0.8651313174144648, "grad_norm": 43.25, "kl": 0.802398681640625, "learning_rate": 5e-07, "logits/chosen": -3588307.75, "logits/rejected": -47188776.0, "logps/chosen": -212.5237579345703, "logps/rejected": -504.92938232421875, "loss": 0.2386, "rewards/chosen": 0.6543465852737427, "rewards/margins": 2.7378565073013306, "rewards/rejected": -2.083509922027588, "step": 16322 }, { "epoch": 0.8651843214162669, "grad_norm": 59.0, "kl": 1.2688484191894531, "learning_rate": 5e-07, "logits/chosen": -14530342.666666666, "logits/rejected": -27488761.6, "logps/chosen": -206.78287760416666, "logps/rejected": -282.87490234375, "loss": 0.2821, "rewards/chosen": -0.12870164712270102, "rewards/margins": 2.6974149147669473, "rewards/rejected": -2.8261165618896484, "step": 16323 }, { "epoch": 0.8652373254180691, "grad_norm": 30.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9749127.0, "logits/rejected": -29606509.333333332, "logps/chosen": -1124.1080322265625, "logps/rejected": -349.6962076822917, "loss": 0.092, "rewards/chosen": 2.842202663421631, "rewards/margins": 5.524220943450928, "rewards/rejected": -2.682018280029297, "step": 16324 }, { "epoch": 0.8652903294198712, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41781180.0, "logits/rejected": -8059239.5, "logps/chosen": -341.9801330566406, "logps/rejected": -352.9969787597656, "loss": 0.2545, "rewards/chosen": 0.5156806707382202, "rewards/margins": 3.8982561826705933, "rewards/rejected": -3.382575511932373, "step": 16325 }, { "epoch": 0.8653433334216734, "grad_norm": 44.5, "kl": 0.4687767028808594, "learning_rate": 5e-07, "logits/chosen": -65306586.666666664, "logits/rejected": -31467942.4, "logps/chosen": -397.0326334635417, "logps/rejected": -277.397119140625, "loss": 0.2067, "rewards/chosen": 1.0149463017781575, "rewards/margins": 3.849066480000814, "rewards/rejected": -2.834120178222656, "step": 16326 }, { "epoch": 0.8653963374234754, "grad_norm": 34.5, "kl": 0.5611190795898438, "learning_rate": 5e-07, "logits/chosen": -14808656.0, "logits/rejected": -40400857.6, "logps/chosen": -87.99538167317708, "logps/rejected": -364.530810546875, "loss": 0.2146, "rewards/chosen": 0.37432920932769775, "rewards/margins": 2.9926275491714476, "rewards/rejected": -2.61829833984375, "step": 16327 }, { "epoch": 0.8654493414252776, "grad_norm": 41.75, "kl": 1.9511070251464844, "learning_rate": 5e-07, "logits/chosen": 2701525.3333333335, "logits/rejected": -5903270.4, "logps/chosen": -145.29195149739584, "logps/rejected": -180.2322998046875, "loss": 0.3521, "rewards/chosen": 0.7884042263031006, "rewards/margins": 2.1879409313201905, "rewards/rejected": -1.39953670501709, "step": 16328 }, { "epoch": 0.8655023454270797, "grad_norm": 45.5, "kl": 2.410968780517578, "learning_rate": 5e-07, "logits/chosen": -70399104.0, "logits/rejected": -35725692.0, "logps/chosen": -680.7879028320312, "logps/rejected": -540.2633056640625, "loss": 0.1929, "rewards/chosen": 1.2170051336288452, "rewards/margins": 4.915147185325623, "rewards/rejected": -3.6981420516967773, "step": 16329 }, { "epoch": 0.8655553494288819, "grad_norm": 49.0, "kl": 6.4571027755737305, "learning_rate": 5e-07, "logits/chosen": -9833492.8, "logits/rejected": -21123540.0, "logps/chosen": -492.2201171875, "logps/rejected": -153.33781941731772, "loss": 0.4115, "rewards/chosen": 0.9935100555419922, "rewards/margins": 1.9388706366221111, "rewards/rejected": -0.9453605810801188, "step": 16330 }, { "epoch": 0.865608353430684, "grad_norm": 45.5, "kl": 2.1604843139648438, "learning_rate": 5e-07, "logits/chosen": -67183480.0, "logits/rejected": 2584994.0, "logps/chosen": -170.05958557128906, "logps/rejected": -364.7667236328125, "loss": 0.3289, "rewards/chosen": 0.3730792701244354, "rewards/margins": 2.6944579780101776, "rewards/rejected": -2.321378707885742, "step": 16331 }, { "epoch": 0.8656613574324862, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6472681.5, "logits/rejected": -22770725.333333332, "logps/chosen": -48.24686813354492, "logps/rejected": -322.16807047526044, "loss": 0.0974, "rewards/chosen": 0.9098313450813293, "rewards/margins": 4.991040845712026, "rewards/rejected": -4.081209500630696, "step": 16332 }, { "epoch": 0.8657143614342883, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48820568.0, "logits/rejected": 2982143.0, "logps/chosen": -259.38494873046875, "logps/rejected": -201.74409993489584, "loss": 0.162, "rewards/chosen": 0.37643927335739136, "rewards/margins": 3.609318276246389, "rewards/rejected": -3.2328790028889975, "step": 16333 }, { "epoch": 0.8657673654360905, "grad_norm": 53.25, "kl": 0.14068222045898438, "learning_rate": 5e-07, "logits/chosen": -7167042.0, "logits/rejected": -17014048.0, "logps/chosen": -240.39066569010416, "logps/rejected": -345.84892578125, "loss": 0.2296, "rewards/chosen": 0.4843536615371704, "rewards/margins": 2.8730274438858032, "rewards/rejected": -2.388673782348633, "step": 16334 }, { "epoch": 0.8658203694378925, "grad_norm": 50.75, "kl": 1.9445991516113281, "learning_rate": 5e-07, "logits/chosen": -59021644.8, "logits/rejected": -22727040.0, "logps/chosen": -326.41318359375, "logps/rejected": -193.60308837890625, "loss": 0.389, "rewards/chosen": 0.21326332092285155, "rewards/margins": 1.582139778137207, "rewards/rejected": -1.3688764572143555, "step": 16335 }, { "epoch": 0.8658733734396947, "grad_norm": 46.5, "kl": 0.06810951232910156, "learning_rate": 5e-07, "logits/chosen": -13762992.0, "logits/rejected": -21603356.0, "logps/chosen": -376.2729248046875, "logps/rejected": -290.73480224609375, "loss": 0.2662, "rewards/chosen": 0.49959731101989746, "rewards/margins": 3.252206246058146, "rewards/rejected": -2.7526089350382485, "step": 16336 }, { "epoch": 0.8659263774414968, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24773236.8, "logits/rejected": -23827490.666666668, "logps/chosen": -398.4349609375, "logps/rejected": -271.4388020833333, "loss": 0.2572, "rewards/chosen": 0.5495043754577636, "rewards/margins": 3.5092958768208824, "rewards/rejected": -2.9597915013631186, "step": 16337 }, { "epoch": 0.865979381443299, "grad_norm": 57.5, "kl": 0.08959197998046875, "learning_rate": 5e-07, "logits/chosen": -48054675.2, "logits/rejected": -24605160.0, "logps/chosen": -488.455859375, "logps/rejected": -305.7520345052083, "loss": 0.2499, "rewards/chosen": 1.061553382873535, "rewards/margins": 3.121842130025228, "rewards/rejected": -2.060288747151693, "step": 16338 }, { "epoch": 0.8660323854451011, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46882517.333333336, "logits/rejected": -33861308.0, "logps/chosen": -334.6536458333333, "logps/rejected": -460.28082275390625, "loss": 0.4419, "rewards/chosen": -0.2402478059132894, "rewards/margins": 2.230905453364054, "rewards/rejected": -2.4711532592773438, "step": 16339 }, { "epoch": 0.8660853894469033, "grad_norm": 56.75, "kl": 5.1781463623046875, "learning_rate": 5e-07, "logits/chosen": -5559525.0, "logits/rejected": -36882164.0, "logps/chosen": -285.971435546875, "logps/rejected": -312.017333984375, "loss": 0.3072, "rewards/chosen": 0.6509279012680054, "rewards/margins": 4.199146866798401, "rewards/rejected": -3.5482189655303955, "step": 16340 }, { "epoch": 0.8661383934487054, "grad_norm": 67.5, "kl": 1.588078498840332, "learning_rate": 5e-07, "logits/chosen": -39586221.71428572, "logits/rejected": -53786488.0, "logps/chosen": -527.1410784040179, "logps/rejected": -202.44497680664062, "loss": 0.4401, "rewards/chosen": 0.18637018544333323, "rewards/margins": 1.731151751109532, "rewards/rejected": -1.5447815656661987, "step": 16341 }, { "epoch": 0.8661913974505075, "grad_norm": 44.75, "kl": 1.1687736511230469, "learning_rate": 5e-07, "logits/chosen": -29977636.0, "logits/rejected": -73538832.0, "logps/chosen": -278.3028259277344, "logps/rejected": -271.2474060058594, "loss": 0.3623, "rewards/chosen": 0.32575803995132446, "rewards/margins": 1.7593565583229065, "rewards/rejected": -1.433598518371582, "step": 16342 }, { "epoch": 0.8662444014523096, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 155334688.0, "logits/rejected": -34029252.0, "logps/chosen": -429.40582275390625, "logps/rejected": -672.7005615234375, "loss": 0.2018, "rewards/chosen": 0.9123756289482117, "rewards/margins": 4.497252643108368, "rewards/rejected": -3.5848770141601562, "step": 16343 }, { "epoch": 0.8662974054541118, "grad_norm": 45.5, "kl": 0.9145984649658203, "learning_rate": 5e-07, "logits/chosen": -11318978.0, "logits/rejected": -26562120.0, "logps/chosen": -152.90139770507812, "logps/rejected": -422.995849609375, "loss": 0.2101, "rewards/chosen": 0.6929658651351929, "rewards/margins": 4.309280276298523, "rewards/rejected": -3.61631441116333, "step": 16344 }, { "epoch": 0.8663504094559139, "grad_norm": 44.75, "kl": 0.4569549560546875, "learning_rate": 5e-07, "logits/chosen": -16885916.8, "logits/rejected": -14063537.333333334, "logps/chosen": -165.0510498046875, "logps/rejected": -181.5175984700521, "loss": 0.3221, "rewards/chosen": 0.5052038669586182, "rewards/margins": 2.7692556222279867, "rewards/rejected": -2.2640517552693686, "step": 16345 }, { "epoch": 0.8664034134577161, "grad_norm": 46.75, "kl": 1.2113609313964844, "learning_rate": 5e-07, "logits/chosen": -44253728.0, "logits/rejected": -576523.5, "logps/chosen": -352.3861389160156, "logps/rejected": -160.22377014160156, "loss": 0.2452, "rewards/chosen": 0.5349025726318359, "rewards/margins": 3.119361162185669, "rewards/rejected": -2.584458589553833, "step": 16346 }, { "epoch": 0.8664564174595182, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 44132408.0, "logits/rejected": -15052334.666666666, "logps/chosen": -603.517578125, "logps/rejected": -269.0844319661458, "loss": 0.1795, "rewards/chosen": -0.25953367352485657, "rewards/margins": 2.9105786184469857, "rewards/rejected": -3.1701122919718423, "step": 16347 }, { "epoch": 0.8665094214613204, "grad_norm": 35.0, "kl": 2.6287074089050293, "learning_rate": 5e-07, "logits/chosen": -16723888.0, "logits/rejected": -30413126.0, "logps/chosen": -183.08601888020834, "logps/rejected": -124.3404769897461, "loss": 0.3864, "rewards/chosen": 0.4705970287322998, "rewards/margins": 3.0914852619171143, "rewards/rejected": -2.6208882331848145, "step": 16348 }, { "epoch": 0.8665624254631225, "grad_norm": 46.25, "kl": 1.4665699005126953, "learning_rate": 5e-07, "logits/chosen": -8314644.8, "logits/rejected": -1921650.1666666667, "logps/chosen": -164.86903076171876, "logps/rejected": -188.9311319986979, "loss": 0.3644, "rewards/chosen": 0.27033371925354005, "rewards/margins": 3.1522316455841066, "rewards/rejected": -2.8818979263305664, "step": 16349 }, { "epoch": 0.8666154294649246, "grad_norm": 46.5, "kl": 0.7741718292236328, "learning_rate": 5e-07, "logits/chosen": -50856920.0, "logits/rejected": -8178002.5, "logps/chosen": -291.83538818359375, "logps/rejected": -106.76332092285156, "loss": 0.3038, "rewards/chosen": 0.8260257244110107, "rewards/margins": 2.1524739265441895, "rewards/rejected": -1.3264482021331787, "step": 16350 }, { "epoch": 0.8666684334667267, "grad_norm": 47.0, "kl": 2.5425844192504883, "learning_rate": 5e-07, "logits/chosen": -72162762.66666667, "logits/rejected": -4041570.75, "logps/chosen": -459.725830078125, "logps/rejected": -122.97236633300781, "loss": 0.2555, "rewards/chosen": 1.5860843658447266, "rewards/margins": 3.41886830329895, "rewards/rejected": -1.8327839374542236, "step": 16351 }, { "epoch": 0.8667214374685289, "grad_norm": 41.25, "kl": 3.5579147338867188, "learning_rate": 5e-07, "logits/chosen": -18291562.0, "logits/rejected": -27817258.0, "logps/chosen": -283.51837158203125, "logps/rejected": -430.1124267578125, "loss": 0.2788, "rewards/chosen": 0.3799486756324768, "rewards/margins": 3.193390190601349, "rewards/rejected": -2.813441514968872, "step": 16352 }, { "epoch": 0.866774441470331, "grad_norm": 47.0, "kl": 2.704723358154297, "learning_rate": 5e-07, "logits/chosen": -29610649.6, "logits/rejected": -43439645.333333336, "logps/chosen": -307.92890625, "logps/rejected": -285.93756103515625, "loss": 0.2893, "rewards/chosen": 0.773246955871582, "rewards/margins": 3.716534169514974, "rewards/rejected": -2.943287213643392, "step": 16353 }, { "epoch": 0.8668274454721332, "grad_norm": 43.25, "kl": 4.685450553894043, "learning_rate": 5e-07, "logits/chosen": -7348784.0, "logits/rejected": -13185596.0, "logps/chosen": -700.6009521484375, "logps/rejected": -247.34420776367188, "loss": 0.1671, "rewards/chosen": 1.7759177684783936, "rewards/margins": 4.301833391189575, "rewards/rejected": -2.5259156227111816, "step": 16354 }, { "epoch": 0.8668804494739353, "grad_norm": 65.0, "kl": 2.1102304458618164, "learning_rate": 5e-07, "logits/chosen": 15891825.6, "logits/rejected": -5098099.333333333, "logps/chosen": -336.81494140625, "logps/rejected": -237.43206787109375, "loss": 0.3296, "rewards/chosen": 0.6665219783782959, "rewards/margins": 2.8362464427948, "rewards/rejected": -2.169724464416504, "step": 16355 }, { "epoch": 0.8669334534757375, "grad_norm": 51.5, "kl": 0.00682830810546875, "learning_rate": 5e-07, "logits/chosen": -34424384.0, "logits/rejected": -32924426.666666668, "logps/chosen": -298.99794921875, "logps/rejected": -434.9392903645833, "loss": 0.24, "rewards/chosen": 1.0219669342041016, "rewards/margins": 3.4084970156351724, "rewards/rejected": -2.386530081431071, "step": 16356 }, { "epoch": 0.8669864574775396, "grad_norm": 49.75, "kl": 2.9418792724609375, "learning_rate": 5e-07, "logits/chosen": -7502468.0, "logits/rejected": -8645343.0, "logps/chosen": -89.87650299072266, "logps/rejected": -196.85684204101562, "loss": 0.3541, "rewards/chosen": 0.15387475490570068, "rewards/margins": 1.462091326713562, "rewards/rejected": -1.3082165718078613, "step": 16357 }, { "epoch": 0.8670394614793417, "grad_norm": 51.0, "kl": 2.122929573059082, "learning_rate": 5e-07, "logits/chosen": -28105520.0, "logits/rejected": -10995638.0, "logps/chosen": -408.2851867675781, "logps/rejected": -226.47781372070312, "loss": 0.29, "rewards/chosen": 0.6423879861831665, "rewards/margins": 2.6335772275924683, "rewards/rejected": -1.9911892414093018, "step": 16358 }, { "epoch": 0.8670924654811438, "grad_norm": 52.5, "kl": 5.345085144042969, "learning_rate": 5e-07, "logits/chosen": -38903944.0, "logits/rejected": -43042152.0, "logps/chosen": -385.6035970052083, "logps/rejected": -658.5663452148438, "loss": 0.3549, "rewards/chosen": 0.888160785039266, "rewards/margins": 5.623916705449422, "rewards/rejected": -4.735755920410156, "step": 16359 }, { "epoch": 0.8671454694829459, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45738400.0, "logits/rejected": -12403154.666666666, "logps/chosen": -310.8941650390625, "logps/rejected": -300.6720377604167, "loss": 0.2238, "rewards/chosen": 0.02864229679107666, "rewards/margins": 3.5048935810724893, "rewards/rejected": -3.4762512842814126, "step": 16360 }, { "epoch": 0.8671984734847481, "grad_norm": 48.0, "kl": 0.2162160873413086, "learning_rate": 5e-07, "logits/chosen": -57539306.666666664, "logits/rejected": -18001825.6, "logps/chosen": -203.3677978515625, "logps/rejected": -264.009619140625, "loss": 0.203, "rewards/chosen": 0.8736608028411865, "rewards/margins": 2.716455030441284, "rewards/rejected": -1.8427942276000977, "step": 16361 }, { "epoch": 0.8672514774865502, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19408374.0, "logits/rejected": -43143692.0, "logps/chosen": -351.99066162109375, "logps/rejected": -606.7530517578125, "loss": 0.256, "rewards/chosen": 0.16134434938430786, "rewards/margins": 3.27754408121109, "rewards/rejected": -3.1161997318267822, "step": 16362 }, { "epoch": 0.8673044814883524, "grad_norm": 109.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57945888.0, "logits/rejected": -18944664.0, "logps/chosen": -414.8816324869792, "logps/rejected": -149.1945556640625, "loss": 0.253, "rewards/chosen": 0.5266708532969157, "rewards/margins": 2.7465080420176187, "rewards/rejected": -2.219837188720703, "step": 16363 }, { "epoch": 0.8673574854901545, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17422752.0, "logits/rejected": -16527021.0, "logps/chosen": -190.45516967773438, "logps/rejected": -472.4732360839844, "loss": 0.3219, "rewards/chosen": 0.12950526177883148, "rewards/margins": 2.731361970305443, "rewards/rejected": -2.6018567085266113, "step": 16364 }, { "epoch": 0.8674104894919566, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17603713.333333332, "logits/rejected": -16877395.2, "logps/chosen": -168.862060546875, "logps/rejected": -338.7525634765625, "loss": 0.2492, "rewards/chosen": 0.456182599067688, "rewards/margins": 2.7336129426956175, "rewards/rejected": -2.2774303436279295, "step": 16365 }, { "epoch": 0.8674634934937587, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18097606.0, "logits/rejected": 6216836.0, "logps/chosen": -250.0330810546875, "logps/rejected": -209.78591918945312, "loss": 0.2973, "rewards/chosen": 0.31186962127685547, "rewards/margins": 2.5870742797851562, "rewards/rejected": -2.275204658508301, "step": 16366 }, { "epoch": 0.8675164974955609, "grad_norm": 42.75, "kl": 0.9296588897705078, "learning_rate": 5e-07, "logits/chosen": -3997207.5, "logits/rejected": 13092564.0, "logps/chosen": -244.54571533203125, "logps/rejected": -494.4573974609375, "loss": 0.2059, "rewards/chosen": 0.8136129379272461, "rewards/margins": 3.569763660430908, "rewards/rejected": -2.756150722503662, "step": 16367 }, { "epoch": 0.867569501497363, "grad_norm": 48.5, "kl": 5.136983871459961, "learning_rate": 5e-07, "logits/chosen": -21789552.0, "logits/rejected": 43662640.0, "logps/chosen": -219.1144002278646, "logps/rejected": -602.4393310546875, "loss": 0.4135, "rewards/chosen": 0.5657206376393636, "rewards/margins": 3.070659955342611, "rewards/rejected": -2.504939317703247, "step": 16368 }, { "epoch": 0.8676225054991652, "grad_norm": 80.5, "kl": 1.8559455871582031, "learning_rate": 5e-07, "logits/chosen": -27000457.6, "logits/rejected": -75342426.66666667, "logps/chosen": -522.464453125, "logps/rejected": -376.5609537760417, "loss": 0.2659, "rewards/chosen": 0.8104165077209473, "rewards/margins": 3.2520284334818523, "rewards/rejected": -2.441611925760905, "step": 16369 }, { "epoch": 0.8676755095009673, "grad_norm": 56.75, "kl": 1.746108055114746, "learning_rate": 5e-07, "logits/chosen": -13622727.0, "logits/rejected": -56458484.0, "logps/chosen": -203.19793701171875, "logps/rejected": -283.9625549316406, "loss": 0.1971, "rewards/chosen": 0.8297362923622131, "rewards/margins": 3.5618582367897034, "rewards/rejected": -2.7321219444274902, "step": 16370 }, { "epoch": 0.8677285135027695, "grad_norm": 46.75, "kl": 0.4854755401611328, "learning_rate": 5e-07, "logits/chosen": -19461534.666666668, "logits/rejected": -34581644.8, "logps/chosen": -323.34686279296875, "logps/rejected": -465.9591796875, "loss": 0.1679, "rewards/chosen": 0.8159513473510742, "rewards/margins": 4.98043041229248, "rewards/rejected": -4.164479064941406, "step": 16371 }, { "epoch": 0.8677815175045716, "grad_norm": 44.75, "kl": 2.437833786010742, "learning_rate": 5e-07, "logits/chosen": -7833968.8, "logits/rejected": -44173296.0, "logps/chosen": -225.2141357421875, "logps/rejected": -424.0226643880208, "loss": 0.2036, "rewards/chosen": 1.6124397277832032, "rewards/margins": 4.252414480845133, "rewards/rejected": -2.63997475306193, "step": 16372 }, { "epoch": 0.8678345215063737, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47832890.666666664, "logits/rejected": -20840403.2, "logps/chosen": -399.0469156901042, "logps/rejected": -534.105029296875, "loss": 0.242, "rewards/chosen": -0.01401367038488388, "rewards/margins": 2.9306074157357216, "rewards/rejected": -2.9446210861206055, "step": 16373 }, { "epoch": 0.8678875255081758, "grad_norm": 49.0, "kl": 1.00604248046875, "learning_rate": 5e-07, "logits/chosen": -79129496.0, "logits/rejected": -43428584.0, "logps/chosen": -784.660888671875, "logps/rejected": -253.50234985351562, "loss": 0.1168, "rewards/chosen": 2.69035267829895, "rewards/margins": 5.091596364974976, "rewards/rejected": -2.4012436866760254, "step": 16374 }, { "epoch": 0.867940529509978, "grad_norm": 42.25, "kl": 3.606525421142578, "learning_rate": 5e-07, "logits/chosen": -32266652.8, "logits/rejected": -31899693.333333332, "logps/chosen": -329.70322265625, "logps/rejected": -229.29107666015625, "loss": 0.2359, "rewards/chosen": 1.1335624694824218, "rewards/margins": 3.2732366561889648, "rewards/rejected": -2.139674186706543, "step": 16375 }, { "epoch": 0.8679935335117801, "grad_norm": 31.75, "kl": 0.6977043151855469, "learning_rate": 5e-07, "logits/chosen": -4541526.5, "logits/rejected": -16177640.0, "logps/chosen": -294.07720947265625, "logps/rejected": -349.39190673828125, "loss": 0.2268, "rewards/chosen": 0.8173486590385437, "rewards/margins": 3.9688743948936462, "rewards/rejected": -3.1515257358551025, "step": 16376 }, { "epoch": 0.8680465375135823, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35624336.0, "logits/rejected": -28367209.6, "logps/chosen": -288.802734375, "logps/rejected": -338.52744140625, "loss": 0.1444, "rewards/chosen": 1.069361925125122, "rewards/margins": 3.945319986343384, "rewards/rejected": -2.875958061218262, "step": 16377 }, { "epoch": 0.8680995415153844, "grad_norm": 35.25, "kl": 3.39630126953125, "learning_rate": 5e-07, "logits/chosen": -65027236.0, "logits/rejected": -10026566.0, "logps/chosen": -337.9523620605469, "logps/rejected": -197.81712341308594, "loss": 0.1927, "rewards/chosen": 1.1713312864303589, "rewards/margins": 3.791481375694275, "rewards/rejected": -2.620150089263916, "step": 16378 }, { "epoch": 0.8681525455171866, "grad_norm": 31.625, "kl": 0.12352371215820312, "learning_rate": 5e-07, "logits/chosen": -2872334.0, "logits/rejected": 31451.0, "logps/chosen": -303.29559326171875, "logps/rejected": -531.0968831380209, "loss": 0.1296, "rewards/chosen": 0.33934250473976135, "rewards/margins": 4.171934515237808, "rewards/rejected": -3.832592010498047, "step": 16379 }, { "epoch": 0.8682055495189887, "grad_norm": 40.25, "kl": 0.9281044006347656, "learning_rate": 5e-07, "logits/chosen": -15396745.6, "logits/rejected": -3416334.3333333335, "logps/chosen": -258.27919921875, "logps/rejected": -117.14125569661458, "loss": 0.3086, "rewards/chosen": 0.2600802659988403, "rewards/margins": 4.539519556363423, "rewards/rejected": -4.279439290364583, "step": 16380 }, { "epoch": 0.8682585535207908, "grad_norm": 28.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24551706.666666668, "logits/rejected": -21017923.2, "logps/chosen": -754.9586588541666, "logps/rejected": -448.99921875, "loss": 0.2146, "rewards/chosen": 1.2348408699035645, "rewards/margins": 3.9290188789367675, "rewards/rejected": -2.694178009033203, "step": 16381 }, { "epoch": 0.8683115575225929, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -526260.3333333334, "logits/rejected": -9112551.2, "logps/chosen": -238.38863118489584, "logps/rejected": -176.489501953125, "loss": 0.159, "rewards/chosen": 1.672207196553548, "rewards/margins": 3.8358033498128252, "rewards/rejected": -2.1635961532592773, "step": 16382 }, { "epoch": 0.8683645615243951, "grad_norm": 45.5, "kl": 1.1947746276855469, "learning_rate": 5e-07, "logits/chosen": -14894160.0, "logits/rejected": -27124540.0, "logps/chosen": -325.9144287109375, "logps/rejected": -551.8296508789062, "loss": 0.2105, "rewards/chosen": 1.3574432134628296, "rewards/margins": 4.068234324455261, "rewards/rejected": -2.7107911109924316, "step": 16383 }, { "epoch": 0.8684175655261972, "grad_norm": 58.5, "kl": 1.3389091491699219, "learning_rate": 5e-07, "logits/chosen": -37285920.0, "logits/rejected": -24628624.0, "logps/chosen": -327.945068359375, "logps/rejected": -253.7328338623047, "loss": 0.2419, "rewards/chosen": 1.3792622089385986, "rewards/margins": 3.0252726078033447, "rewards/rejected": -1.646010398864746, "step": 16384 }, { "epoch": 0.8684705695279994, "grad_norm": 45.0, "kl": 1.7915153503417969, "learning_rate": 5e-07, "logits/chosen": -50068688.0, "logits/rejected": -1538298.75, "logps/chosen": -243.2633056640625, "logps/rejected": -119.87705993652344, "loss": 0.3379, "rewards/chosen": 0.7450041770935059, "rewards/margins": 2.6287795305252075, "rewards/rejected": -1.8837753534317017, "step": 16385 }, { "epoch": 0.8685235735298015, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22813232.0, "logits/rejected": -42381645.333333336, "logps/chosen": -252.26591796875, "logps/rejected": -285.4173583984375, "loss": 0.2567, "rewards/chosen": 0.5580019950866699, "rewards/margins": 3.772085666656494, "rewards/rejected": -3.214083671569824, "step": 16386 }, { "epoch": 0.8685765775316037, "grad_norm": 30.0, "kl": 1.6956253051757812, "learning_rate": 5e-07, "logits/chosen": 873535.5, "logits/rejected": -33948589.333333336, "logps/chosen": -100.1399154663086, "logps/rejected": -357.6604817708333, "loss": 0.2209, "rewards/chosen": 0.36090126633644104, "rewards/margins": 2.9765125016371408, "rewards/rejected": -2.6156112353006997, "step": 16387 }, { "epoch": 0.8686295815334057, "grad_norm": 35.75, "kl": 1.6002311706542969, "learning_rate": 5e-07, "logits/chosen": 9207086.666666666, "logits/rejected": -80055756.8, "logps/chosen": -42.63343302408854, "logps/rejected": -253.222607421875, "loss": 0.2877, "rewards/chosen": 0.1051007608572642, "rewards/margins": 2.0699569086233773, "rewards/rejected": -1.9648561477661133, "step": 16388 }, { "epoch": 0.8686825855352079, "grad_norm": 37.5, "kl": 3.634031295776367, "learning_rate": 5e-07, "logits/chosen": -35293104.0, "logits/rejected": -17472486.4, "logps/chosen": -365.7533365885417, "logps/rejected": -130.3046142578125, "loss": 0.257, "rewards/chosen": 0.6831965446472168, "rewards/margins": 3.619920253753662, "rewards/rejected": -2.9367237091064453, "step": 16389 }, { "epoch": 0.86873558953701, "grad_norm": 40.75, "kl": 1.2210922241210938, "learning_rate": 5e-07, "logits/chosen": -10533584.0, "logits/rejected": 3020826.5, "logps/chosen": -676.8097534179688, "logps/rejected": -197.64328002929688, "loss": 0.188, "rewards/chosen": 1.6781768798828125, "rewards/margins": 3.937312364578247, "rewards/rejected": -2.2591354846954346, "step": 16390 }, { "epoch": 0.8687885935388122, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 28997474.666666668, "logits/rejected": -9868708.0, "logps/chosen": -321.9255777994792, "logps/rejected": -256.6970703125, "loss": 0.2493, "rewards/chosen": 0.051894952853520714, "rewards/margins": 2.826683618625005, "rewards/rejected": -2.774788665771484, "step": 16391 }, { "epoch": 0.8688415975406143, "grad_norm": 50.75, "kl": 1.5596141815185547, "learning_rate": 5e-07, "logits/chosen": -26148814.0, "logits/rejected": -5618307.0, "logps/chosen": -131.7281951904297, "logps/rejected": -260.73614501953125, "loss": 0.3175, "rewards/chosen": -0.18531008064746857, "rewards/margins": 2.757740780711174, "rewards/rejected": -2.9430508613586426, "step": 16392 }, { "epoch": 0.8688946015424165, "grad_norm": 58.5, "kl": 0.5948553085327148, "learning_rate": 5e-07, "logits/chosen": -32724784.0, "logits/rejected": 54172250.666666664, "logps/chosen": -205.371337890625, "logps/rejected": -420.6079508463542, "loss": 0.41, "rewards/chosen": -0.25769853591918945, "rewards/margins": 2.052896022796631, "rewards/rejected": -2.3105945587158203, "step": 16393 }, { "epoch": 0.8689476055442186, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16474370.666666666, "logits/rejected": -7244382.4, "logps/chosen": -257.4397786458333, "logps/rejected": -130.229833984375, "loss": 0.1999, "rewards/chosen": 1.53567902247111, "rewards/margins": 3.5294349352518717, "rewards/rejected": -1.9937559127807618, "step": 16394 }, { "epoch": 0.8690006095460208, "grad_norm": 40.5, "kl": 1.5967330932617188, "learning_rate": 5e-07, "logits/chosen": -15143196.8, "logits/rejected": -37927192.0, "logps/chosen": -330.261669921875, "logps/rejected": -288.00994873046875, "loss": 0.3093, "rewards/chosen": 0.4745182037353516, "rewards/margins": 2.7551217714945473, "rewards/rejected": -2.280603567759196, "step": 16395 }, { "epoch": 0.8690536135478228, "grad_norm": 76.5, "kl": 2.702810287475586, "learning_rate": 5e-07, "logits/chosen": -55623955.2, "logits/rejected": -323503.4166666667, "logps/chosen": -392.6525634765625, "logps/rejected": -279.81451416015625, "loss": 0.3528, "rewards/chosen": 0.31894655227661134, "rewards/margins": 2.229798666636149, "rewards/rejected": -1.9108521143595378, "step": 16396 }, { "epoch": 0.869106617549625, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9062269.333333334, "logits/rejected": -51315398.4, "logps/chosen": -262.0155436197917, "logps/rejected": -245.9140869140625, "loss": 0.2263, "rewards/chosen": 0.5244242350260416, "rewards/margins": 3.0354794184366862, "rewards/rejected": -2.5110551834106447, "step": 16397 }, { "epoch": 0.8691596215514271, "grad_norm": 51.0, "kl": 2.770496368408203, "learning_rate": 5e-07, "logits/chosen": -23920689.6, "logits/rejected": -41170117.333333336, "logps/chosen": -346.440283203125, "logps/rejected": -265.87384033203125, "loss": 0.2115, "rewards/chosen": 1.1661669731140136, "rewards/margins": 4.742011419932047, "rewards/rejected": -3.5758444468180337, "step": 16398 }, { "epoch": 0.8692126255532293, "grad_norm": 43.25, "kl": 0.16482925415039062, "learning_rate": 5e-07, "logits/chosen": -78962602.66666667, "logits/rejected": -43224768.0, "logps/chosen": -132.3920694986979, "logps/rejected": -467.78896484375, "loss": 0.2155, "rewards/chosen": 0.3223714431126912, "rewards/margins": 3.9199067672093713, "rewards/rejected": -3.59753532409668, "step": 16399 }, { "epoch": 0.8692656295550314, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76965072.0, "logits/rejected": -28104938.666666668, "logps/chosen": -461.22589111328125, "logps/rejected": -297.56817626953125, "loss": 0.1788, "rewards/chosen": 0.8274681568145752, "rewards/margins": 3.968491474787394, "rewards/rejected": -3.141023317972819, "step": 16400 }, { "epoch": 0.8693186335568336, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53302496.0, "logits/rejected": -27117094.0, "logps/chosen": -360.2420654296875, "logps/rejected": -298.5788269042969, "loss": 0.3042, "rewards/chosen": 0.3730493187904358, "rewards/margins": 2.0344772934913635, "rewards/rejected": -1.6614279747009277, "step": 16401 }, { "epoch": 0.8693716375586357, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58076840.0, "logits/rejected": -27929204.0, "logps/chosen": -354.782470703125, "logps/rejected": -299.59246826171875, "loss": 0.2469, "rewards/chosen": 0.931835949420929, "rewards/margins": 2.7258909344673157, "rewards/rejected": -1.7940549850463867, "step": 16402 }, { "epoch": 0.8694246415604379, "grad_norm": 28.375, "kl": 3.463653564453125, "learning_rate": 5e-07, "logits/chosen": -10347578.0, "logits/rejected": -19652358.666666668, "logps/chosen": -308.2430419921875, "logps/rejected": -249.09419759114584, "loss": 0.1369, "rewards/chosen": 1.5292937755584717, "rewards/margins": 4.137951930363974, "rewards/rejected": -2.6086581548055015, "step": 16403 }, { "epoch": 0.8694776455622399, "grad_norm": 39.0, "kl": 2.3695068359375, "learning_rate": 5e-07, "logits/chosen": -19043916.8, "logits/rejected": -15838262.666666666, "logps/chosen": -598.171435546875, "logps/rejected": -358.0666910807292, "loss": 0.2931, "rewards/chosen": 1.1091434478759765, "rewards/margins": 3.0829962412516276, "rewards/rejected": -1.9738527933756511, "step": 16404 }, { "epoch": 0.8695306495640421, "grad_norm": 41.75, "kl": 0.04092597961425781, "learning_rate": 5e-07, "logits/chosen": -15413289.6, "logits/rejected": -7872428.666666667, "logps/chosen": -251.974365234375, "logps/rejected": -165.27654012044272, "loss": 0.2837, "rewards/chosen": 0.8840021133422852, "rewards/margins": 3.394861284891764, "rewards/rejected": -2.510859171549479, "step": 16405 }, { "epoch": 0.8695836535658442, "grad_norm": 33.25, "kl": 0.4320182800292969, "learning_rate": 5e-07, "logits/chosen": -68741232.0, "logits/rejected": -12588188.0, "logps/chosen": -771.2935791015625, "logps/rejected": -316.00726318359375, "loss": 0.1837, "rewards/chosen": 1.380037546157837, "rewards/margins": 5.1168129444122314, "rewards/rejected": -3.7367753982543945, "step": 16406 }, { "epoch": 0.8696366575676464, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61533556.0, "logits/rejected": -34102772.0, "logps/chosen": -234.22779846191406, "logps/rejected": -245.41989135742188, "loss": 0.2692, "rewards/chosen": 0.3560979962348938, "rewards/margins": 2.3810790181159973, "rewards/rejected": -2.0249810218811035, "step": 16407 }, { "epoch": 0.8696896615694485, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -102067413.33333333, "logits/rejected": -77681088.0, "logps/chosen": -393.5162760416667, "logps/rejected": -304.261669921875, "loss": 0.1968, "rewards/chosen": 0.25240174929300946, "rewards/margins": 3.4406555334726967, "rewards/rejected": -3.1882537841796874, "step": 16408 }, { "epoch": 0.8697426655712507, "grad_norm": 68.5, "kl": 0.17942428588867188, "learning_rate": 5e-07, "logits/chosen": -27659584.0, "logits/rejected": -4362382.4, "logps/chosen": -296.6617431640625, "logps/rejected": -470.62705078125, "loss": 0.2307, "rewards/chosen": 0.5924896001815796, "rewards/margins": 2.715495467185974, "rewards/rejected": -2.1230058670043945, "step": 16409 }, { "epoch": 0.8697956695730528, "grad_norm": 54.0, "kl": 3.9153575897216797, "learning_rate": 5e-07, "logits/chosen": -7029609.6, "logits/rejected": 26322576.0, "logps/chosen": -153.90457763671876, "logps/rejected": -148.6723429361979, "loss": 0.3374, "rewards/chosen": 0.5547631263732911, "rewards/margins": 3.527192401885986, "rewards/rejected": -2.9724292755126953, "step": 16410 }, { "epoch": 0.8698486735748548, "grad_norm": 40.5, "kl": 0.216217041015625, "learning_rate": 5e-07, "logits/chosen": 27697664.0, "logits/rejected": -31219302.4, "logps/chosen": -252.7027587890625, "logps/rejected": -560.9484375, "loss": 0.1663, "rewards/chosen": 1.293708324432373, "rewards/margins": 4.818397045135498, "rewards/rejected": -3.524688720703125, "step": 16411 }, { "epoch": 0.869901677576657, "grad_norm": 49.5, "kl": 0.2118682861328125, "learning_rate": 5e-07, "logits/chosen": -20478352.0, "logits/rejected": -17248972.0, "logps/chosen": -291.40570068359375, "logps/rejected": -438.48382568359375, "loss": 0.215, "rewards/chosen": 0.6034974455833435, "rewards/margins": 3.9908856749534607, "rewards/rejected": -3.387388229370117, "step": 16412 }, { "epoch": 0.8699546815784591, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37788576.0, "logits/rejected": -17891233.6, "logps/chosen": -280.38661702473956, "logps/rejected": -474.09853515625, "loss": 0.2494, "rewards/chosen": 1.1082928975423176, "rewards/margins": 2.5096233685811358, "rewards/rejected": -1.4013304710388184, "step": 16413 }, { "epoch": 0.8700076855802613, "grad_norm": 33.25, "kl": 0.09646224975585938, "learning_rate": 5e-07, "logits/chosen": -44783669.333333336, "logits/rejected": -29271747.2, "logps/chosen": -228.05928548177084, "logps/rejected": -418.401220703125, "loss": 0.1753, "rewards/chosen": 0.4237939914067586, "rewards/margins": 3.6948564608891807, "rewards/rejected": -3.271062469482422, "step": 16414 }, { "epoch": 0.8700606895820634, "grad_norm": 37.75, "kl": 3.2219676971435547, "learning_rate": 5e-07, "logits/chosen": 8402135.0, "logits/rejected": -53137032.0, "logps/chosen": -135.55819702148438, "logps/rejected": -364.7597961425781, "loss": 0.2928, "rewards/chosen": 0.1956121325492859, "rewards/margins": 3.27264803647995, "rewards/rejected": -3.077035903930664, "step": 16415 }, { "epoch": 0.8701136935838656, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26203216.0, "logits/rejected": -22702564.57142857, "logps/chosen": -282.21759033203125, "logps/rejected": -294.55433872767856, "loss": 0.1605, "rewards/chosen": -0.22662353515625, "rewards/margins": 2.4156742095947266, "rewards/rejected": -2.6422977447509766, "step": 16416 }, { "epoch": 0.8701666975856677, "grad_norm": 51.5, "kl": 1.2685575485229492, "learning_rate": 5e-07, "logits/chosen": -40554649.6, "logits/rejected": -20397602.666666668, "logps/chosen": -242.0638671875, "logps/rejected": -322.69557698567706, "loss": 0.2674, "rewards/chosen": 0.6782993793487548, "rewards/margins": 4.784163586298625, "rewards/rejected": -4.10586420694987, "step": 16417 }, { "epoch": 0.8702197015874699, "grad_norm": 53.5, "kl": 1.165802001953125, "learning_rate": 5e-07, "logits/chosen": -18696306.666666668, "logits/rejected": -25688256.0, "logps/chosen": -187.7098388671875, "logps/rejected": -375.76824951171875, "loss": 0.338, "rewards/chosen": 0.3473691940307617, "rewards/margins": 4.072452783584595, "rewards/rejected": -3.725083589553833, "step": 16418 }, { "epoch": 0.8702727055892719, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25378914.666666668, "logits/rejected": -54232780.8, "logps/chosen": -489.1280110677083, "logps/rejected": -359.641357421875, "loss": 0.1224, "rewards/chosen": 1.3405650456746419, "rewards/margins": 4.252798589070638, "rewards/rejected": -2.9122335433959963, "step": 16419 }, { "epoch": 0.8703257095910741, "grad_norm": 42.5, "kl": 1.4848995208740234, "learning_rate": 5e-07, "logits/chosen": -19305086.4, "logits/rejected": -38452525.333333336, "logps/chosen": -288.1542724609375, "logps/rejected": -240.020751953125, "loss": 0.3243, "rewards/chosen": 0.3848290920257568, "rewards/margins": 2.706156619389852, "rewards/rejected": -2.321327527364095, "step": 16420 }, { "epoch": 0.8703787135928762, "grad_norm": 57.0, "kl": 1.1907691955566406, "learning_rate": 5e-07, "logits/chosen": -30562928.0, "logits/rejected": -32174886.4, "logps/chosen": -196.53509521484375, "logps/rejected": -338.6721435546875, "loss": 0.3348, "rewards/chosen": -0.07013499736785889, "rewards/margins": 1.7646479845046996, "rewards/rejected": -1.8347829818725585, "step": 16421 }, { "epoch": 0.8704317175946784, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34941924.0, "logits/rejected": -80852912.0, "logps/chosen": -261.2412109375, "logps/rejected": -357.749755859375, "loss": 0.298, "rewards/chosen": 0.2462644726037979, "rewards/margins": 2.211854711174965, "rewards/rejected": -1.965590238571167, "step": 16422 }, { "epoch": 0.8704847215964805, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 308371.5625, "logits/rejected": -48263949.71428572, "logps/chosen": -35.41481018066406, "logps/rejected": -446.05259486607144, "loss": 0.1579, "rewards/chosen": 0.40296632051467896, "rewards/margins": 3.0082001941544667, "rewards/rejected": -2.6052338736397878, "step": 16423 }, { "epoch": 0.8705377255982827, "grad_norm": 49.0, "kl": 1.7950992584228516, "learning_rate": 5e-07, "logits/chosen": -12618042.285714285, "logits/rejected": 5086251.5, "logps/chosen": -240.59946986607142, "logps/rejected": -25.699460983276367, "loss": 0.4097, "rewards/chosen": 0.48417207172938753, "rewards/margins": 0.49391290119716097, "rewards/rejected": -0.009740829467773438, "step": 16424 }, { "epoch": 0.8705907296000848, "grad_norm": 27.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8921630.666666666, "logits/rejected": -66097190.4, "logps/chosen": -116.11619059244792, "logps/rejected": -411.97734375, "loss": 0.2324, "rewards/chosen": -0.17024765412012735, "rewards/margins": 3.0063247640927635, "rewards/rejected": -3.1765724182128907, "step": 16425 }, { "epoch": 0.870643733601887, "grad_norm": 55.0, "kl": 2.543086051940918, "learning_rate": 5e-07, "logits/chosen": -38968216.0, "logits/rejected": -36357456.0, "logps/chosen": -287.3401794433594, "logps/rejected": -561.0549926757812, "loss": 0.23, "rewards/chosen": 0.9189197421073914, "rewards/margins": 3.882706344127655, "rewards/rejected": -2.9637866020202637, "step": 16426 }, { "epoch": 0.870696737603689, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37842212.0, "logits/rejected": -21260176.0, "logps/chosen": -246.5701446533203, "logps/rejected": -434.2292175292969, "loss": 0.275, "rewards/chosen": 0.23270350694656372, "rewards/margins": 2.600055992603302, "rewards/rejected": -2.3673524856567383, "step": 16427 }, { "epoch": 0.8707497416054912, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27324024.0, "logits/rejected": -35001741.333333336, "logps/chosen": -249.06594848632812, "logps/rejected": -337.9896240234375, "loss": 0.2111, "rewards/chosen": 0.9069259762763977, "rewards/margins": 2.8669450879096985, "rewards/rejected": -1.9600191116333008, "step": 16428 }, { "epoch": 0.8708027456072933, "grad_norm": 48.75, "kl": 1.88604736328125, "learning_rate": 5e-07, "logits/chosen": -6092782.0, "logits/rejected": 88704416.0, "logps/chosen": -182.19387817382812, "logps/rejected": -564.9163818359375, "loss": 0.3405, "rewards/chosen": -0.10364624857902527, "rewards/margins": 2.2368490993976593, "rewards/rejected": -2.3404953479766846, "step": 16429 }, { "epoch": 0.8708557496090955, "grad_norm": 66.5, "kl": 1.2303657531738281, "learning_rate": 5e-07, "logits/chosen": -48680056.0, "logits/rejected": -5620664.0, "logps/chosen": -179.66558837890625, "logps/rejected": -294.2588195800781, "loss": 0.3376, "rewards/chosen": 0.24254103004932404, "rewards/margins": 2.0464381724596024, "rewards/rejected": -1.8038971424102783, "step": 16430 }, { "epoch": 0.8709087536108976, "grad_norm": 46.5, "kl": 1.2451038360595703, "learning_rate": 5e-07, "logits/chosen": -60749626.666666664, "logits/rejected": -12652179.2, "logps/chosen": -845.6373697916666, "logps/rejected": -316.762158203125, "loss": 0.217, "rewards/chosen": 1.510222593943278, "rewards/margins": 4.86323086420695, "rewards/rejected": -3.353008270263672, "step": 16431 }, { "epoch": 0.8709617576126998, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18442766.666666668, "logits/rejected": -46746720.0, "logps/chosen": -246.88541666666666, "logps/rejected": -472.86220703125, "loss": 0.2104, "rewards/chosen": 0.2844781478246053, "rewards/margins": 2.964715154965719, "rewards/rejected": -2.6802370071411135, "step": 16432 }, { "epoch": 0.8710147616145019, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42267920.0, "logits/rejected": -53547488.0, "logps/chosen": -375.3311767578125, "logps/rejected": -312.205810546875, "loss": 0.2158, "rewards/chosen": 0.647259533405304, "rewards/margins": 3.2253692746162415, "rewards/rejected": -2.5781097412109375, "step": 16433 }, { "epoch": 0.8710677656163041, "grad_norm": 33.0, "kl": 3.71478271484375, "learning_rate": 5e-07, "logits/chosen": 3283115.3333333335, "logits/rejected": -24748420.8, "logps/chosen": -180.17533365885416, "logps/rejected": -369.53291015625, "loss": 0.1866, "rewards/chosen": 1.2110598882039387, "rewards/margins": 3.499209912618001, "rewards/rejected": -2.2881500244140627, "step": 16434 }, { "epoch": 0.8711207696181061, "grad_norm": 51.25, "kl": 5.35153865814209, "learning_rate": 5e-07, "logits/chosen": -11555884.0, "logits/rejected": -69927400.0, "logps/chosen": -229.04888916015625, "logps/rejected": -124.82661437988281, "loss": 0.3822, "rewards/chosen": 0.9441129366556803, "rewards/margins": 3.404751459757487, "rewards/rejected": -2.4606385231018066, "step": 16435 }, { "epoch": 0.8711737736199083, "grad_norm": 59.0, "kl": 1.2376594543457031, "learning_rate": 5e-07, "logits/chosen": 13460715.2, "logits/rejected": -12069145.333333334, "logps/chosen": -320.1027099609375, "logps/rejected": -329.00986735026044, "loss": 0.3999, "rewards/chosen": 0.006544792652130127, "rewards/margins": 2.76171148220698, "rewards/rejected": -2.75516668955485, "step": 16436 }, { "epoch": 0.8712267776217104, "grad_norm": 55.75, "kl": 0.8070783615112305, "learning_rate": 5e-07, "logits/chosen": -21786923.2, "logits/rejected": 22671005.333333332, "logps/chosen": -182.460546875, "logps/rejected": -265.8868815104167, "loss": 0.4057, "rewards/chosen": -0.058875513076782224, "rewards/margins": 3.0172995408376058, "rewards/rejected": -3.076175053914388, "step": 16437 }, { "epoch": 0.8712797816235126, "grad_norm": 45.0, "kl": 2.6912498474121094, "learning_rate": 5e-07, "logits/chosen": -26074472.0, "logits/rejected": -18468344.0, "logps/chosen": -234.8394775390625, "logps/rejected": -211.2196044921875, "loss": 0.3154, "rewards/chosen": 0.770591139793396, "rewards/margins": 3.274229407310486, "rewards/rejected": -2.50363826751709, "step": 16438 }, { "epoch": 0.8713327856253147, "grad_norm": 45.0, "kl": 4.699296951293945, "learning_rate": 5e-07, "logits/chosen": -37070928.0, "logits/rejected": -21006118.0, "logps/chosen": -475.6997985839844, "logps/rejected": -301.3359375, "loss": 0.1756, "rewards/chosen": 1.5998355150222778, "rewards/margins": 4.614080786705017, "rewards/rejected": -3.0142452716827393, "step": 16439 }, { "epoch": 0.8713857896271169, "grad_norm": 37.75, "kl": 0.5998916625976562, "learning_rate": 5e-07, "logits/chosen": -18674380.0, "logits/rejected": -19198921.6, "logps/chosen": -134.7758585611979, "logps/rejected": -364.60419921875, "loss": 0.2147, "rewards/chosen": 0.34462324778238934, "rewards/margins": 3.2993193308512367, "rewards/rejected": -2.9546960830688476, "step": 16440 }, { "epoch": 0.871438793628919, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12926484.0, "logits/rejected": -31772188.0, "logps/chosen": -412.22808837890625, "logps/rejected": -602.7533569335938, "loss": 0.1809, "rewards/chosen": 0.7304809093475342, "rewards/margins": 4.974825620651245, "rewards/rejected": -4.244344711303711, "step": 16441 }, { "epoch": 0.8714917976307212, "grad_norm": 67.0, "kl": 1.79718017578125, "learning_rate": 5e-07, "logits/chosen": -16030665.6, "logits/rejected": -21270368.0, "logps/chosen": -282.9821533203125, "logps/rejected": -195.0073038736979, "loss": 0.2697, "rewards/chosen": 0.8485622406005859, "rewards/margins": 4.041074434916178, "rewards/rejected": -3.1925121943155923, "step": 16442 }, { "epoch": 0.8715448016325232, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59630828.0, "logits/rejected": -23600120.0, "logps/chosen": -262.807861328125, "logps/rejected": -213.99154663085938, "loss": 0.3691, "rewards/chosen": -0.12109354138374329, "rewards/margins": 1.3855759799480438, "rewards/rejected": -1.506669521331787, "step": 16443 }, { "epoch": 0.8715978056343254, "grad_norm": 48.25, "kl": 3.485443115234375, "learning_rate": 5e-07, "logits/chosen": -4661535.666666667, "logits/rejected": -6693587.0, "logps/chosen": -233.73150634765625, "logps/rejected": -253.126953125, "loss": 0.3293, "rewards/chosen": 0.7207614580790201, "rewards/margins": 3.5596343676249185, "rewards/rejected": -2.8388729095458984, "step": 16444 }, { "epoch": 0.8716508096361275, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26940300.0, "logits/rejected": -54453445.333333336, "logps/chosen": -451.9874572753906, "logps/rejected": -374.4742431640625, "loss": 0.1164, "rewards/chosen": 1.1963729858398438, "rewards/margins": 4.378814379374186, "rewards/rejected": -3.1824413935343423, "step": 16445 }, { "epoch": 0.8717038136379297, "grad_norm": 85.0, "kl": 3.9445724487304688, "learning_rate": 5e-07, "logits/chosen": -18652996.0, "logits/rejected": -22411208.0, "logps/chosen": -668.2663167317709, "logps/rejected": -281.8275451660156, "loss": 0.2798, "rewards/chosen": 1.4518632888793945, "rewards/margins": 3.333278179168701, "rewards/rejected": -1.8814148902893066, "step": 16446 }, { "epoch": 0.8717568176397318, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11496578.0, "logits/rejected": -47842496.0, "logps/chosen": -368.7969970703125, "logps/rejected": -578.56201171875, "loss": 0.1019, "rewards/chosen": 1.2068557739257812, "rewards/margins": 4.437903722127279, "rewards/rejected": -3.2310479482014975, "step": 16447 }, { "epoch": 0.871809821641534, "grad_norm": 50.25, "kl": 0.7118625640869141, "learning_rate": 5e-07, "logits/chosen": -4677719.6, "logits/rejected": -30876464.0, "logps/chosen": -367.80830078125, "logps/rejected": -418.9906412760417, "loss": 0.3039, "rewards/chosen": 0.5714225769042969, "rewards/margins": 3.1644411087036133, "rewards/rejected": -2.5930185317993164, "step": 16448 }, { "epoch": 0.8718628256433361, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -9745114.0, "logps/rejected": -272.69879150390625, "loss": 0.2415, "rewards/rejected": -1.3962475061416626, "step": 16449 }, { "epoch": 0.8719158296451383, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -279338.3333333333, "logits/rejected": -36634595.2, "logps/chosen": -408.6805419921875, "logps/rejected": -405.4535888671875, "loss": 0.1476, "rewards/chosen": 1.0249145825703938, "rewards/margins": 3.8057876904805497, "rewards/rejected": -2.780873107910156, "step": 16450 }, { "epoch": 0.8719688336469403, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20845220.0, "logits/rejected": -44382544.0, "logps/chosen": -257.92431640625, "logps/rejected": -430.4983215332031, "loss": 0.2005, "rewards/chosen": 1.3158098459243774, "rewards/margins": 3.3206034898757935, "rewards/rejected": -2.004793643951416, "step": 16451 }, { "epoch": 0.8720218376487425, "grad_norm": 36.25, "kl": 3.6501340866088867, "learning_rate": 5e-07, "logits/chosen": -27943996.8, "logits/rejected": 1554779.1666666667, "logps/chosen": -81.146044921875, "logps/rejected": -282.41302490234375, "loss": 0.4714, "rewards/chosen": -0.29855172634124755, "rewards/margins": 1.8944465239842734, "rewards/rejected": -2.192998250325521, "step": 16452 }, { "epoch": 0.8720748416505446, "grad_norm": 45.5, "kl": 2.7968711853027344, "learning_rate": 5e-07, "logits/chosen": -25971296.0, "logits/rejected": 15466205.0, "logps/chosen": -276.40625, "logps/rejected": -134.94369506835938, "loss": 0.3247, "rewards/chosen": 0.7849210103352865, "rewards/margins": 1.9363938172658286, "rewards/rejected": -1.151472806930542, "step": 16453 }, { "epoch": 0.8721278456523468, "grad_norm": 34.5, "kl": 0.20827484130859375, "learning_rate": 5e-07, "logits/chosen": -4821521.0, "logits/rejected": -40923456.0, "logps/chosen": -73.35304260253906, "logps/rejected": -499.2891438802083, "loss": 0.2144, "rewards/chosen": -0.04222717136144638, "rewards/margins": 2.806316949427128, "rewards/rejected": -2.848544120788574, "step": 16454 }, { "epoch": 0.8721808496541489, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48988949.333333336, "logits/rejected": -25205728.0, "logps/chosen": -376.6141764322917, "logps/rejected": -411.925, "loss": 0.2134, "rewards/chosen": 0.05863800644874573, "rewards/margins": 3.1287345945835114, "rewards/rejected": -3.0700965881347657, "step": 16455 }, { "epoch": 0.8722338536559511, "grad_norm": 58.25, "kl": 0.6995620727539062, "learning_rate": 5e-07, "logits/chosen": 6434293.0, "logits/rejected": -33505737.14285714, "logps/chosen": -9.08383560180664, "logps/rejected": -443.1878138950893, "loss": 0.2004, "rewards/chosen": 0.0530664436519146, "rewards/margins": 2.4227329519178187, "rewards/rejected": -2.369666508265904, "step": 16456 }, { "epoch": 0.8722868576577532, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19506310.666666668, "logits/rejected": -49942457.6, "logps/chosen": -98.7445576985677, "logps/rejected": -449.20654296875, "loss": 0.3143, "rewards/chosen": -0.28458817799886066, "rewards/margins": 1.750715510050456, "rewards/rejected": -2.0353036880493165, "step": 16457 }, { "epoch": 0.8723398616595553, "grad_norm": 36.25, "kl": 1.0136280059814453, "learning_rate": 5e-07, "logits/chosen": -64900256.0, "logits/rejected": -63178074.666666664, "logps/chosen": -265.242431640625, "logps/rejected": -225.58988444010416, "loss": 0.2259, "rewards/chosen": 0.29877591133117676, "rewards/margins": 2.698845624923706, "rewards/rejected": -2.4000697135925293, "step": 16458 }, { "epoch": 0.8723928656613574, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -114894672.0, "logits/rejected": -1498574.5714285714, "logps/chosen": -885.0562133789062, "logps/rejected": -229.339599609375, "loss": 0.1891, "rewards/chosen": 0.24739380180835724, "rewards/margins": 2.5645616288696016, "rewards/rejected": -2.3171678270612444, "step": 16459 }, { "epoch": 0.8724458696631596, "grad_norm": 54.5, "kl": 4.452587127685547, "learning_rate": 5e-07, "logits/chosen": -8067251.333333333, "logits/rejected": -21319578.0, "logps/chosen": -216.4526570638021, "logps/rejected": -439.7448425292969, "loss": 0.2677, "rewards/chosen": 1.068292538324992, "rewards/margins": 3.8855847517649336, "rewards/rejected": -2.8172922134399414, "step": 16460 }, { "epoch": 0.8724988736649617, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22356724.0, "logits/rejected": -28949192.0, "logps/chosen": -331.0129699707031, "logps/rejected": -403.11236572265625, "loss": 0.2682, "rewards/chosen": 0.15494443476200104, "rewards/margins": 3.0370986610651016, "rewards/rejected": -2.8821542263031006, "step": 16461 }, { "epoch": 0.8725518776667638, "grad_norm": 60.5, "kl": 7.965339660644531, "learning_rate": 5e-07, "logits/chosen": -12670104.0, "logits/rejected": -61740128.0, "logps/chosen": -561.405517578125, "logps/rejected": -483.4825439453125, "loss": 0.266, "rewards/chosen": 1.7929519653320312, "rewards/margins": 4.53202158610026, "rewards/rejected": -2.739069620768229, "step": 16462 }, { "epoch": 0.872604881668566, "grad_norm": 32.25, "kl": 2.5287551879882812, "learning_rate": 5e-07, "logits/chosen": -22866862.0, "logits/rejected": -30026834.666666668, "logps/chosen": -268.47247314453125, "logps/rejected": -316.996337890625, "loss": 0.1182, "rewards/chosen": 1.4060391187667847, "rewards/margins": 4.973176836967468, "rewards/rejected": -3.5671377182006836, "step": 16463 }, { "epoch": 0.8726578856703681, "grad_norm": 49.25, "kl": 0.17404937744140625, "learning_rate": 5e-07, "logits/chosen": -62344857.6, "logits/rejected": 1851041.3333333333, "logps/chosen": -507.977197265625, "logps/rejected": -380.0576171875, "loss": 0.2393, "rewards/chosen": 0.6316796779632569, "rewards/margins": 4.111068232854207, "rewards/rejected": -3.4793885548909507, "step": 16464 }, { "epoch": 0.8727108896721703, "grad_norm": 45.75, "kl": 2.446613311767578, "learning_rate": 5e-07, "logits/chosen": -33365532.0, "logits/rejected": 30120302.0, "logps/chosen": -547.9028930664062, "logps/rejected": -359.49481201171875, "loss": 0.1539, "rewards/chosen": 2.0165023803710938, "rewards/margins": 5.313112735748291, "rewards/rejected": -3.2966103553771973, "step": 16465 }, { "epoch": 0.8727638936739723, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54826624.0, "logits/rejected": -51884000.0, "logps/chosen": -671.5736490885416, "logps/rejected": -566.119921875, "loss": 0.1414, "rewards/chosen": 1.087623119354248, "rewards/margins": 5.13786039352417, "rewards/rejected": -4.0502372741699215, "step": 16466 }, { "epoch": 0.8728168976757745, "grad_norm": 63.5, "kl": 0.5660362243652344, "learning_rate": 5e-07, "logits/chosen": -7674170.0, "logits/rejected": -42556588.8, "logps/chosen": -306.2493896484375, "logps/rejected": -283.74130859375, "loss": 0.2531, "rewards/chosen": 0.33894455432891846, "rewards/margins": 2.532614064216614, "rewards/rejected": -2.1936695098876955, "step": 16467 }, { "epoch": 0.8728699016775766, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35953132.0, "logits/rejected": -25981472.0, "logps/chosen": -383.34381103515625, "logps/rejected": -247.9883015950521, "loss": 0.2237, "rewards/chosen": 0.6037552356719971, "rewards/margins": 2.7121764024098716, "rewards/rejected": -2.1084211667378745, "step": 16468 }, { "epoch": 0.8729229056793788, "grad_norm": 39.0, "kl": 4.913402557373047, "learning_rate": 5e-07, "logits/chosen": -10728315.0, "logits/rejected": -22092648.0, "logps/chosen": -170.90684509277344, "logps/rejected": -212.48707580566406, "loss": 0.275, "rewards/chosen": 0.7437632083892822, "rewards/margins": 4.082561731338501, "rewards/rejected": -3.3387985229492188, "step": 16469 }, { "epoch": 0.8729759096811809, "grad_norm": 38.25, "kl": 2.9485416412353516, "learning_rate": 5e-07, "logits/chosen": -3572177.3333333335, "logits/rejected": 273514.625, "logps/chosen": -317.35780843098956, "logps/rejected": -314.409619140625, "loss": 0.1942, "rewards/chosen": 1.1362552642822266, "rewards/margins": 3.6596599578857423, "rewards/rejected": -2.5234046936035157, "step": 16470 }, { "epoch": 0.8730289136829831, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94917200.0, "logits/rejected": -26656072.0, "logps/chosen": -472.05157470703125, "logps/rejected": -365.0406494140625, "loss": 0.1875, "rewards/chosen": 0.055102527141571045, "rewards/margins": 2.9163286089897156, "rewards/rejected": -2.8612260818481445, "step": 16471 }, { "epoch": 0.8730819176847852, "grad_norm": 57.5, "kl": 0.08723831176757812, "learning_rate": 5e-07, "logits/chosen": -79463720.0, "logits/rejected": -53824600.0, "logps/chosen": -571.20361328125, "logps/rejected": -329.7882385253906, "loss": 0.3271, "rewards/chosen": 0.36955493688583374, "rewards/margins": 1.8356983065605164, "rewards/rejected": -1.4661433696746826, "step": 16472 }, { "epoch": 0.8731349216865874, "grad_norm": 51.5, "kl": 1.2997055053710938, "learning_rate": 5e-07, "logits/chosen": -70588920.0, "logits/rejected": -17485582.0, "logps/chosen": -474.0844421386719, "logps/rejected": -343.4279479980469, "loss": 0.1773, "rewards/chosen": 1.1691441535949707, "rewards/margins": 4.473897933959961, "rewards/rejected": -3.3047537803649902, "step": 16473 }, { "epoch": 0.8731879256883894, "grad_norm": 40.5, "kl": 3.014312744140625, "learning_rate": 5e-07, "logits/chosen": -33608723.2, "logits/rejected": -5365557.333333333, "logps/chosen": -141.941650390625, "logps/rejected": -130.8842976888021, "loss": 0.384, "rewards/chosen": 0.3315009355545044, "rewards/margins": 2.018669374783834, "rewards/rejected": -1.6871684392293294, "step": 16474 }, { "epoch": 0.8732409296901916, "grad_norm": 65.5, "kl": 0.5050811767578125, "learning_rate": 5e-07, "logits/chosen": 18940126.666666668, "logits/rejected": -576883.875, "logps/chosen": -272.34568277994794, "logps/rejected": -68.28593444824219, "loss": 0.3508, "rewards/chosen": 0.43294334411621094, "rewards/margins": 3.1156373023986816, "rewards/rejected": -2.6826939582824707, "step": 16475 }, { "epoch": 0.8732939336919937, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30261372.8, "logits/rejected": -32942789.333333332, "logps/chosen": -441.990576171875, "logps/rejected": -220.326416015625, "loss": 0.3189, "rewards/chosen": 0.17138185501098632, "rewards/margins": 3.3376473426818847, "rewards/rejected": -3.1662654876708984, "step": 16476 }, { "epoch": 0.8733469376937959, "grad_norm": 56.25, "kl": 2.8829116821289062, "learning_rate": 5e-07, "logits/chosen": -15680645.333333334, "logits/rejected": -37472076.0, "logps/chosen": -231.252197265625, "logps/rejected": -152.0869140625, "loss": 0.4662, "rewards/chosen": -0.09584321578343709, "rewards/margins": 1.225466827551524, "rewards/rejected": -1.321310043334961, "step": 16477 }, { "epoch": 0.873399941695598, "grad_norm": 46.0, "kl": 0.6534357070922852, "learning_rate": 5e-07, "logits/chosen": -29989290.666666668, "logits/rejected": -26272433.6, "logps/chosen": -330.02235921223956, "logps/rejected": -273.23173828125, "loss": 0.187, "rewards/chosen": 1.1162903308868408, "rewards/margins": 3.6639973163604735, "rewards/rejected": -2.5477069854736327, "step": 16478 }, { "epoch": 0.8734529456974002, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26434874.666666668, "logits/rejected": -43001331.2, "logps/chosen": -353.1295166015625, "logps/rejected": -486.039306640625, "loss": 0.2449, "rewards/chosen": 0.3962137699127197, "rewards/margins": 2.6688747882843016, "rewards/rejected": -2.272661018371582, "step": 16479 }, { "epoch": 0.8735059496992023, "grad_norm": 49.75, "kl": 1.6406927108764648, "learning_rate": 5e-07, "logits/chosen": -28108154.0, "logits/rejected": -32961244.0, "logps/chosen": -374.6824951171875, "logps/rejected": -434.1798095703125, "loss": 0.2441, "rewards/chosen": 0.749398410320282, "rewards/margins": 2.8874844908714294, "rewards/rejected": -2.1380860805511475, "step": 16480 }, { "epoch": 0.8735589537010044, "grad_norm": 40.5, "kl": 3.8734970092773438, "learning_rate": 5e-07, "logits/chosen": 220203.2, "logits/rejected": -14765024.0, "logps/chosen": -154.178076171875, "logps/rejected": -236.4869181315104, "loss": 0.3729, "rewards/chosen": 0.2613541603088379, "rewards/margins": 3.310349178314209, "rewards/rejected": -3.048995018005371, "step": 16481 }, { "epoch": 0.8736119577028065, "grad_norm": 56.5, "kl": 4.140499114990234, "learning_rate": 5e-07, "logits/chosen": -4149503.2, "logits/rejected": -20509814.666666668, "logps/chosen": -297.7666748046875, "logps/rejected": -353.8497721354167, "loss": 0.2921, "rewards/chosen": 1.4206048965454101, "rewards/margins": 2.9567448616027834, "rewards/rejected": -1.536139965057373, "step": 16482 }, { "epoch": 0.8736649617046087, "grad_norm": 39.0, "kl": 0.9566879272460938, "learning_rate": 5e-07, "logits/chosen": -8613032.0, "logits/rejected": 32406592.0, "logps/chosen": -374.5942077636719, "logps/rejected": -191.37693786621094, "loss": 0.2064, "rewards/chosen": 0.7382729053497314, "rewards/margins": 4.447973966598511, "rewards/rejected": -3.7097010612487793, "step": 16483 }, { "epoch": 0.8737179657064108, "grad_norm": 40.25, "kl": 0.7725176811218262, "learning_rate": 5e-07, "logits/chosen": 28382940.0, "logits/rejected": -34390432.0, "logps/chosen": -223.77052307128906, "logps/rejected": -365.2872619628906, "loss": 0.2243, "rewards/chosen": 0.448912113904953, "rewards/margins": 4.342882603406906, "rewards/rejected": -3.893970489501953, "step": 16484 }, { "epoch": 0.873770969708213, "grad_norm": 53.0, "kl": 4.166290283203125, "learning_rate": 5e-07, "logits/chosen": -9880280.0, "logits/rejected": -29725808.0, "logps/chosen": -616.2278442382812, "logps/rejected": -307.0550231933594, "loss": 0.2238, "rewards/chosen": 1.1166785955429077, "rewards/margins": 4.397732138633728, "rewards/rejected": -3.2810535430908203, "step": 16485 }, { "epoch": 0.8738239737100151, "grad_norm": 41.0, "kl": 1.9166069030761719, "learning_rate": 5e-07, "logits/chosen": -31141673.6, "logits/rejected": -9380914.666666666, "logps/chosen": -516.407373046875, "logps/rejected": -396.9688313802083, "loss": 0.2294, "rewards/chosen": 1.5366870880126953, "rewards/margins": 3.3314719518025715, "rewards/rejected": -1.7947848637898762, "step": 16486 }, { "epoch": 0.8738769777118173, "grad_norm": 61.0, "kl": 0.8663539886474609, "learning_rate": 5e-07, "logits/chosen": -48822790.4, "logits/rejected": -19109398.666666668, "logps/chosen": -494.6826171875, "logps/rejected": -325.3701171875, "loss": 0.395, "rewards/chosen": 0.3022606372833252, "rewards/margins": 1.9218273957570393, "rewards/rejected": -1.6195667584737141, "step": 16487 }, { "epoch": 0.8739299817136194, "grad_norm": 54.25, "kl": 2.2160701751708984, "learning_rate": 5e-07, "logits/chosen": -4151676.8, "logits/rejected": -16988462.666666668, "logps/chosen": -238.3298828125, "logps/rejected": -294.56675211588544, "loss": 0.2488, "rewards/chosen": 0.9069684028625489, "rewards/margins": 4.649168618520101, "rewards/rejected": -3.7422002156575522, "step": 16488 }, { "epoch": 0.8739829857154215, "grad_norm": 49.0, "kl": 1.462385654449463, "learning_rate": 5e-07, "logits/chosen": 8126738.285714285, "logits/rejected": -2485778.0, "logps/chosen": -166.70321219308036, "logps/rejected": -92.73353576660156, "loss": 0.4913, "rewards/chosen": -0.02556685890470232, "rewards/margins": 0.40893785868372234, "rewards/rejected": -0.4345047175884247, "step": 16489 }, { "epoch": 0.8740359897172236, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60150165.333333336, "logits/rejected": -27455590.4, "logps/chosen": -381.8998616536458, "logps/rejected": -320.291943359375, "loss": 0.2529, "rewards/chosen": 0.15146687626838684, "rewards/margins": 2.3325015366077424, "rewards/rejected": -2.1810346603393556, "step": 16490 }, { "epoch": 0.8740889937190258, "grad_norm": 52.75, "kl": 9.163700103759766, "learning_rate": 5e-07, "logits/chosen": -4903758.0, "logits/rejected": -34789904.0, "logps/chosen": -542.8021240234375, "logps/rejected": -341.25079345703125, "loss": 0.3361, "rewards/chosen": 1.3894046147664387, "rewards/margins": 3.730197509129842, "rewards/rejected": -2.3407928943634033, "step": 16491 }, { "epoch": 0.8741419977208279, "grad_norm": 49.25, "kl": 2.1435232162475586, "learning_rate": 5e-07, "logits/chosen": -26924772.57142857, "logits/rejected": -64589296.0, "logps/chosen": -170.71250697544642, "logps/rejected": -743.5452880859375, "loss": 0.4236, "rewards/chosen": 0.35753655433654785, "rewards/margins": 2.9816393852233887, "rewards/rejected": -2.624102830886841, "step": 16492 }, { "epoch": 0.8741950017226301, "grad_norm": 42.0, "kl": 1.3464508056640625, "learning_rate": 5e-07, "logits/chosen": -23312442.0, "logits/rejected": -39079032.0, "logps/chosen": -367.6261901855469, "logps/rejected": -494.2351379394531, "loss": 0.1934, "rewards/chosen": 1.0492640733718872, "rewards/margins": 4.603591322898865, "rewards/rejected": -3.5543272495269775, "step": 16493 }, { "epoch": 0.8742480057244322, "grad_norm": 54.5, "kl": 0.042530059814453125, "learning_rate": 5e-07, "logits/chosen": -19542518.4, "logits/rejected": -29204453.333333332, "logps/chosen": -266.0640380859375, "logps/rejected": -234.33642578125, "loss": 0.3546, "rewards/chosen": 0.27392258644104006, "rewards/margins": 2.9044146378835043, "rewards/rejected": -2.6304920514424643, "step": 16494 }, { "epoch": 0.8743010097262344, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33042218.0, "logits/rejected": -15456240.0, "logps/chosen": -495.77337646484375, "logps/rejected": -341.3584681919643, "loss": 0.1675, "rewards/chosen": 0.35455322265625, "rewards/margins": 2.844794682094029, "rewards/rejected": -2.490241459437779, "step": 16495 }, { "epoch": 0.8743540137280364, "grad_norm": 50.25, "kl": 2.8266372680664062, "learning_rate": 5e-07, "logits/chosen": -6139416.0, "logits/rejected": -25260650.666666668, "logps/chosen": -115.09140625, "logps/rejected": -329.13002522786456, "loss": 0.2944, "rewards/chosen": 0.5527229309082031, "rewards/margins": 2.907424290974935, "rewards/rejected": -2.354701360066732, "step": 16496 }, { "epoch": 0.8744070177298386, "grad_norm": 43.5, "kl": 2.246187210083008, "learning_rate": 5e-07, "logits/chosen": -26002688.0, "logits/rejected": -32863762.0, "logps/chosen": -125.50041198730469, "logps/rejected": -156.32066345214844, "loss": 0.3913, "rewards/chosen": -0.09467676281929016, "rewards/margins": 1.4626187980175018, "rewards/rejected": -1.557295560836792, "step": 16497 }, { "epoch": 0.8744600217316407, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19966040.0, "logits/rejected": -9511041.0, "logps/chosen": -245.41989135742188, "logps/rejected": -340.3044738769531, "loss": 0.2671, "rewards/chosen": 0.3389306664466858, "rewards/margins": 2.6921046376228333, "rewards/rejected": -2.3531739711761475, "step": 16498 }, { "epoch": 0.8745130257334429, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48070474.666666664, "logits/rejected": -8729742.4, "logps/chosen": -429.6336263020833, "logps/rejected": -282.8612548828125, "loss": 0.1838, "rewards/chosen": 0.6328633626302084, "rewards/margins": 4.103248723347981, "rewards/rejected": -3.4703853607177733, "step": 16499 }, { "epoch": 0.874566029735245, "grad_norm": 44.0, "kl": 4.394630432128906, "learning_rate": 5e-07, "logits/chosen": -17051310.4, "logits/rejected": -100848810.66666667, "logps/chosen": -154.99462890625, "logps/rejected": -502.5382080078125, "loss": 0.3932, "rewards/chosen": 0.2776841163635254, "rewards/margins": 2.485421403249105, "rewards/rejected": -2.2077372868855796, "step": 16500 }, { "epoch": 0.8746190337370472, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26607909.333333332, "logits/rejected": -22403470.4, "logps/chosen": -233.89213053385416, "logps/rejected": -311.866064453125, "loss": 0.2408, "rewards/chosen": 0.3245040973027547, "rewards/margins": 2.89981480439504, "rewards/rejected": -2.575310707092285, "step": 16501 }, { "epoch": 0.8746720377388493, "grad_norm": 42.75, "kl": 0.6226043701171875, "learning_rate": 5e-07, "logits/chosen": -30855302.4, "logits/rejected": -34750469.333333336, "logps/chosen": -296.663671875, "logps/rejected": -367.732666015625, "loss": 0.2487, "rewards/chosen": 0.6272378921508789, "rewards/margins": 3.1059569676717125, "rewards/rejected": -2.4787190755208335, "step": 16502 }, { "epoch": 0.8747250417406515, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51652218.666666664, "logits/rejected": -31643849.6, "logps/chosen": -370.1136474609375, "logps/rejected": -421.146484375, "loss": 0.2, "rewards/chosen": 0.5940363009770712, "rewards/margins": 3.1249601443608603, "rewards/rejected": -2.530923843383789, "step": 16503 }, { "epoch": 0.8747780457424535, "grad_norm": 55.0, "kl": 0.5686664581298828, "learning_rate": 5e-07, "logits/chosen": -32342518.0, "logits/rejected": -16779702.0, "logps/chosen": -396.6876220703125, "logps/rejected": -208.11941528320312, "loss": 0.2863, "rewards/chosen": 0.028597451746463776, "rewards/margins": 2.9847262874245644, "rewards/rejected": -2.9561288356781006, "step": 16504 }, { "epoch": 0.8748310497442557, "grad_norm": 51.75, "kl": 3.7028160095214844, "learning_rate": 5e-07, "logits/chosen": -9835176.666666666, "logits/rejected": -17097776.0, "logps/chosen": -129.46675618489584, "logps/rejected": -227.898828125, "loss": 0.2455, "rewards/chosen": 1.2468788623809814, "rewards/margins": 2.6298765659332277, "rewards/rejected": -1.382997703552246, "step": 16505 }, { "epoch": 0.8748840537460578, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15747887.0, "logits/rejected": -21192800.0, "logps/chosen": -63.3302116394043, "logps/rejected": -306.11212158203125, "loss": 0.1506, "rewards/chosen": 1.5531277656555176, "rewards/margins": 3.696068604787191, "rewards/rejected": -2.1429408391316733, "step": 16506 }, { "epoch": 0.87493705774786, "grad_norm": 62.25, "kl": 3.712665557861328, "learning_rate": 5e-07, "logits/chosen": -46573504.0, "logits/rejected": 2097075.0, "logps/chosen": -461.3210856119792, "logps/rejected": -360.6644592285156, "loss": 0.3571, "rewards/chosen": 1.1033008893330891, "rewards/margins": 2.384821017583211, "rewards/rejected": -1.281520128250122, "step": 16507 }, { "epoch": 0.8749900617496621, "grad_norm": 58.25, "kl": 3.8101425170898438, "learning_rate": 5e-07, "logits/chosen": -28216738.285714287, "logits/rejected": -32335806.0, "logps/chosen": -486.70228794642856, "logps/rejected": -460.77117919921875, "loss": 0.336, "rewards/chosen": 1.0521140779767717, "rewards/margins": 4.623595544270106, "rewards/rejected": -3.571481466293335, "step": 16508 }, { "epoch": 0.8750430657514643, "grad_norm": 63.25, "kl": 3.7018089294433594, "learning_rate": 5e-07, "logits/chosen": -34721341.333333336, "logits/rejected": -520803.125, "logps/chosen": -301.7723388671875, "logps/rejected": -97.11811828613281, "loss": 0.4279, "rewards/chosen": 0.21305922667185465, "rewards/margins": 4.184875289599101, "rewards/rejected": -3.971816062927246, "step": 16509 }, { "epoch": 0.8750960697532664, "grad_norm": 59.0, "kl": 3.407914161682129, "learning_rate": 5e-07, "logits/chosen": -30945653.333333332, "logits/rejected": 6212250.0, "logps/chosen": -330.72410074869794, "logps/rejected": -125.0196304321289, "loss": 0.3867, "rewards/chosen": 0.8685778776804606, "rewards/margins": 1.5756335655848184, "rewards/rejected": -0.7070556879043579, "step": 16510 }, { "epoch": 0.8751490737550686, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10751642.666666666, "logits/rejected": -8829940.8, "logps/chosen": -214.86787923177084, "logps/rejected": -208.3297607421875, "loss": 0.3245, "rewards/chosen": -0.22502772013346353, "rewards/margins": 1.9577008565266925, "rewards/rejected": -2.182728576660156, "step": 16511 }, { "epoch": 0.8752020777568706, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36422621.333333336, "logits/rejected": -38707865.6, "logps/chosen": -301.8103434244792, "logps/rejected": -446.032568359375, "loss": 0.2622, "rewards/chosen": 0.4389867385228475, "rewards/margins": 2.677720602353414, "rewards/rejected": -2.2387338638305665, "step": 16512 }, { "epoch": 0.8752550817586727, "grad_norm": 47.0, "kl": 0.3466339111328125, "learning_rate": 5e-07, "logits/chosen": -82975888.0, "logits/rejected": -64279564.8, "logps/chosen": -646.7047119140625, "logps/rejected": -391.852880859375, "loss": 0.1773, "rewards/chosen": 0.3902221918106079, "rewards/margins": 3.8021984338760375, "rewards/rejected": -3.4119762420654296, "step": 16513 }, { "epoch": 0.8753080857604749, "grad_norm": 52.75, "kl": 0.2779884338378906, "learning_rate": 5e-07, "logits/chosen": -8622379.2, "logits/rejected": -36918856.0, "logps/chosen": -201.7052978515625, "logps/rejected": -441.5347493489583, "loss": 0.2997, "rewards/chosen": 0.3284849405288696, "rewards/margins": 3.047499426205953, "rewards/rejected": -2.7190144856770835, "step": 16514 }, { "epoch": 0.875361089762277, "grad_norm": 45.0, "kl": 0.35343503952026367, "learning_rate": 5e-07, "logits/chosen": -15073742.666666666, "logits/rejected": -36903776.0, "logps/chosen": -218.31683349609375, "logps/rejected": -627.337109375, "loss": 0.2768, "rewards/chosen": 0.0953792929649353, "rewards/margins": 4.097444570064544, "rewards/rejected": -4.002065277099609, "step": 16515 }, { "epoch": 0.8754140937640792, "grad_norm": 37.5, "kl": 0.9996051788330078, "learning_rate": 5e-07, "logits/chosen": -38860160.0, "logits/rejected": -3435840.6666666665, "logps/chosen": -77.45848083496094, "logps/rejected": -156.72380574544272, "loss": 0.2148, "rewards/chosen": 1.6330289840698242, "rewards/margins": 3.926990509033203, "rewards/rejected": -2.293961524963379, "step": 16516 }, { "epoch": 0.8754670977658813, "grad_norm": 62.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33346518.0, "logits/rejected": -12771772.0, "logps/chosen": -299.6543273925781, "logps/rejected": -322.0972595214844, "loss": 0.3732, "rewards/chosen": -0.3241748809814453, "rewards/margins": 1.9969053268432617, "rewards/rejected": -2.321080207824707, "step": 16517 }, { "epoch": 0.8755201017676835, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6269471.333333333, "logits/rejected": -34363686.4, "logps/chosen": -436.7838541666667, "logps/rejected": -362.3717529296875, "loss": 0.163, "rewards/chosen": 1.2603179613749187, "rewards/margins": 3.5839982668558754, "rewards/rejected": -2.323680305480957, "step": 16518 }, { "epoch": 0.8755731057694855, "grad_norm": 40.25, "kl": 0.6005764007568359, "learning_rate": 5e-07, "logits/chosen": -38842915.2, "logits/rejected": -26348797.333333332, "logps/chosen": -290.7779541015625, "logps/rejected": -147.9518839518229, "loss": 0.2914, "rewards/chosen": 0.5888995170593262, "rewards/margins": 3.003947798411051, "rewards/rejected": -2.415048281351725, "step": 16519 }, { "epoch": 0.8756261097712877, "grad_norm": 51.75, "kl": 2.4582901000976562, "learning_rate": 5e-07, "logits/chosen": -50189731.2, "logits/rejected": -16004216.0, "logps/chosen": -474.712158203125, "logps/rejected": -316.1797688802083, "loss": 0.2769, "rewards/chosen": 0.8250930786132813, "rewards/margins": 4.4145559946695965, "rewards/rejected": -3.589462916056315, "step": 16520 }, { "epoch": 0.8756791137730898, "grad_norm": 25.5, "kl": 1.3838272094726562, "learning_rate": 5e-07, "logits/chosen": 4326409.5, "logits/rejected": -20573726.0, "logps/chosen": -56.97713088989258, "logps/rejected": -185.37939453125, "loss": 0.2345, "rewards/chosen": 0.5403839349746704, "rewards/margins": 3.7549139261245728, "rewards/rejected": -3.2145299911499023, "step": 16521 }, { "epoch": 0.875732117774892, "grad_norm": 53.75, "kl": 3.768198013305664, "learning_rate": 5e-07, "logits/chosen": -10390074.0, "logits/rejected": -26391096.0, "logps/chosen": -129.9839630126953, "logps/rejected": -246.1564483642578, "loss": 0.2828, "rewards/chosen": 1.0292812585830688, "rewards/margins": 3.1551166772842407, "rewards/rejected": -2.125835418701172, "step": 16522 }, { "epoch": 0.8757851217766941, "grad_norm": 63.5, "kl": 1.4848995208740234, "learning_rate": 5e-07, "logits/chosen": -5169419.333333333, "logits/rejected": -26000494.0, "logps/chosen": -228.8773396809896, "logps/rejected": -368.6293640136719, "loss": 0.3622, "rewards/chosen": 0.3032507697741191, "rewards/margins": 4.283210734526317, "rewards/rejected": -3.9799599647521973, "step": 16523 }, { "epoch": 0.8758381257784963, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39228304.0, "logits/rejected": -9842894.4, "logps/chosen": -71.3474833170573, "logps/rejected": -435.250146484375, "loss": 0.3296, "rewards/chosen": -0.10357385873794556, "rewards/margins": 2.372148835659027, "rewards/rejected": -2.4757226943969726, "step": 16524 }, { "epoch": 0.8758911297802984, "grad_norm": 29.375, "kl": 1.0698127746582031, "learning_rate": 5e-07, "logits/chosen": -1018017.5, "logits/rejected": -39865020.0, "logps/chosen": -127.52438354492188, "logps/rejected": -339.54669189453125, "loss": 0.2588, "rewards/chosen": 0.689255952835083, "rewards/margins": 3.620042562484741, "rewards/rejected": -2.930786609649658, "step": 16525 }, { "epoch": 0.8759441337821006, "grad_norm": 35.75, "kl": 2.439380645751953, "learning_rate": 5e-07, "logits/chosen": -8831637.0, "logits/rejected": -25765338.0, "logps/chosen": -230.685546875, "logps/rejected": -263.84405517578125, "loss": 0.2244, "rewards/chosen": 0.8492381572723389, "rewards/margins": 4.560678720474243, "rewards/rejected": -3.7114405632019043, "step": 16526 }, { "epoch": 0.8759971377839026, "grad_norm": 80.5, "kl": 1.7879714965820312, "learning_rate": 5e-07, "logits/chosen": -26528486.0, "logits/rejected": 137689360.0, "logps/chosen": -347.2865295410156, "logps/rejected": -386.57373046875, "loss": 0.3061, "rewards/chosen": 0.7982150316238403, "rewards/margins": 2.378580093383789, "rewards/rejected": -1.5803650617599487, "step": 16527 }, { "epoch": 0.8760501417857048, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4303966.4, "logits/rejected": -14900560.0, "logps/chosen": -251.3335693359375, "logps/rejected": -735.94482421875, "loss": 0.1606, "rewards/chosen": 1.3509202957153321, "rewards/margins": 5.265013631184896, "rewards/rejected": -3.914093335469564, "step": 16528 }, { "epoch": 0.8761031457875069, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59343520.0, "logits/rejected": -45073176.0, "logps/chosen": -310.9978942871094, "logps/rejected": -406.735595703125, "loss": 0.303, "rewards/chosen": 0.04460330307483673, "rewards/margins": 2.2752847224473953, "rewards/rejected": -2.2306814193725586, "step": 16529 }, { "epoch": 0.8761561497893091, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7251952.0, "logits/rejected": -17140084.0, "logps/chosen": -101.81969451904297, "logps/rejected": -242.1250203450521, "loss": 0.1308, "rewards/chosen": 0.5154377222061157, "rewards/margins": 3.9662920236587524, "rewards/rejected": -3.4508543014526367, "step": 16530 }, { "epoch": 0.8762091537911112, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10611904.0, "logits/rejected": -44999989.333333336, "logps/chosen": -248.1466064453125, "logps/rejected": -335.586181640625, "loss": 0.2509, "rewards/chosen": 0.8124155044555664, "rewards/margins": 3.0427230834960937, "rewards/rejected": -2.2303075790405273, "step": 16531 }, { "epoch": 0.8762621577929134, "grad_norm": 43.5, "kl": 2.86456298828125, "learning_rate": 5e-07, "logits/chosen": -52171540.0, "logits/rejected": -35579896.0, "logps/chosen": -213.7620086669922, "logps/rejected": -417.5579833984375, "loss": 0.2254, "rewards/chosen": 1.0030182600021362, "rewards/margins": 4.245371460914612, "rewards/rejected": -3.2423532009124756, "step": 16532 }, { "epoch": 0.8763151617947155, "grad_norm": 50.5, "kl": 0.1827392578125, "learning_rate": 5e-07, "logits/chosen": -23419241.6, "logits/rejected": -53863562.666666664, "logps/chosen": -279.9423095703125, "logps/rejected": -367.6805826822917, "loss": 0.3279, "rewards/chosen": 0.2602132797241211, "rewards/margins": 2.4632378260294594, "rewards/rejected": -2.2030245463053384, "step": 16533 }, { "epoch": 0.8763681657965177, "grad_norm": 39.5, "kl": 0.960179328918457, "learning_rate": 5e-07, "logits/chosen": -7074504.8, "logits/rejected": -28576738.666666668, "logps/chosen": -198.82261962890624, "logps/rejected": -346.7749430338542, "loss": 0.3033, "rewards/chosen": 0.3125696897506714, "rewards/margins": 3.5990521828333537, "rewards/rejected": -3.286482493082682, "step": 16534 }, { "epoch": 0.8764211697983197, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39802170.666666664, "logits/rejected": -44290492.0, "logps/chosen": -311.7078450520833, "logps/rejected": -324.80938720703125, "loss": 0.2949, "rewards/chosen": 0.7507797876993815, "rewards/margins": 2.246273080507914, "rewards/rejected": -1.4954932928085327, "step": 16535 }, { "epoch": 0.8764741738001219, "grad_norm": 44.75, "kl": 2.600375175476074, "learning_rate": 5e-07, "logits/chosen": -25138235.2, "logits/rejected": -27441093.333333332, "logps/chosen": -260.9564453125, "logps/rejected": -223.3201700846354, "loss": 0.2139, "rewards/chosen": 1.3243581771850585, "rewards/margins": 4.389115842183431, "rewards/rejected": -3.0647576649983725, "step": 16536 }, { "epoch": 0.876527177801924, "grad_norm": 60.75, "kl": 3.716686248779297, "learning_rate": 5e-07, "logits/chosen": -59023542.85714286, "logits/rejected": 7727876.0, "logps/chosen": -740.6661551339286, "logps/rejected": -133.55618286132812, "loss": 0.4246, "rewards/chosen": 0.6589952877589634, "rewards/margins": 6.089705126626151, "rewards/rejected": -5.4307098388671875, "step": 16537 }, { "epoch": 0.8765801818037262, "grad_norm": 73.5, "kl": 5.577108383178711, "learning_rate": 5e-07, "logits/chosen": -46765432.0, "logps/chosen": -306.8536071777344, "loss": 0.4477, "rewards/chosen": 0.8016537427902222, "step": 16538 }, { "epoch": 0.8766331858055283, "grad_norm": 31.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28808472.0, "logits/rejected": -30774067.2, "logps/chosen": -112.33943684895833, "logps/rejected": -263.008447265625, "loss": 0.2581, "rewards/chosen": -0.13187103470166525, "rewards/margins": 2.4742629985014597, "rewards/rejected": -2.606134033203125, "step": 16539 }, { "epoch": 0.8766861898073305, "grad_norm": 39.5, "kl": 0.22034835815429688, "learning_rate": 5e-07, "logits/chosen": -8015674.5, "logits/rejected": -25834010.0, "logps/chosen": -219.53860473632812, "logps/rejected": -250.0350341796875, "loss": 0.2491, "rewards/chosen": 0.4093397259712219, "rewards/margins": 3.238477051258087, "rewards/rejected": -2.8291373252868652, "step": 16540 }, { "epoch": 0.8767391938091326, "grad_norm": 45.75, "kl": 2.8316001892089844, "learning_rate": 5e-07, "logits/chosen": -18241641.6, "logits/rejected": 1979044.3333333333, "logps/chosen": -236.876220703125, "logps/rejected": -192.00581868489584, "loss": 0.4683, "rewards/chosen": -0.10320839881896973, "rewards/margins": 1.5880366802215575, "rewards/rejected": -1.6912450790405273, "step": 16541 }, { "epoch": 0.8767921978109348, "grad_norm": 49.25, "kl": 1.5007190704345703, "learning_rate": 5e-07, "logits/chosen": -34541248.0, "logits/rejected": -32197450.666666668, "logps/chosen": -305.9006103515625, "logps/rejected": -499.82958984375, "loss": 0.414, "rewards/chosen": 0.1814626932144165, "rewards/margins": 2.2462012847264607, "rewards/rejected": -2.0647385915120444, "step": 16542 }, { "epoch": 0.8768452018127368, "grad_norm": 46.75, "kl": 0.8068370819091797, "learning_rate": 5e-07, "logits/chosen": -8711256.0, "logits/rejected": -32857629.333333332, "logps/chosen": -194.7156982421875, "logps/rejected": -438.6297607421875, "loss": 0.2766, "rewards/chosen": 0.6105332374572754, "rewards/margins": 3.333139260609945, "rewards/rejected": -2.7226060231526694, "step": 16543 }, { "epoch": 0.876898205814539, "grad_norm": 36.25, "kl": 2.4060258865356445, "learning_rate": 5e-07, "logits/chosen": -29382336.0, "logits/rejected": -7209983.0, "logps/chosen": -131.70114135742188, "logps/rejected": -263.8601989746094, "loss": 0.3254, "rewards/chosen": 0.3538413941860199, "rewards/margins": 2.561990350484848, "rewards/rejected": -2.208148956298828, "step": 16544 }, { "epoch": 0.8769512098163411, "grad_norm": 142.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50223824.0, "logits/rejected": -12604650.0, "logps/chosen": -255.1419677734375, "logps/rejected": -141.3379364013672, "loss": 0.2823, "rewards/chosen": 0.7634592056274414, "rewards/margins": 2.2444576025009155, "rewards/rejected": -1.4809983968734741, "step": 16545 }, { "epoch": 0.8770042138181433, "grad_norm": 61.0, "kl": 1.0291919708251953, "learning_rate": 5e-07, "logits/chosen": -31864732.0, "logits/rejected": -56722432.0, "logps/chosen": -500.94390869140625, "logps/rejected": -108.466796875, "loss": 0.292, "rewards/chosen": 0.6512912511825562, "rewards/margins": 3.0914772748947144, "rewards/rejected": -2.440186023712158, "step": 16546 }, { "epoch": 0.8770572178199454, "grad_norm": 25.125, "kl": 0.28514862060546875, "learning_rate": 5e-07, "logits/chosen": -34384928.0, "logits/rejected": -46986117.333333336, "logps/chosen": -569.8408203125, "logps/rejected": -362.736328125, "loss": 0.0887, "rewards/chosen": 2.0606629848480225, "rewards/margins": 5.104231913884481, "rewards/rejected": -3.0435689290364585, "step": 16547 }, { "epoch": 0.8771102218217476, "grad_norm": 56.25, "kl": 1.4076509475708008, "learning_rate": 5e-07, "logits/chosen": 6014602.666666667, "logits/rejected": -34972864.0, "logps/chosen": -94.18565877278645, "logps/rejected": -305.01983642578125, "loss": 0.3837, "rewards/chosen": 0.4156189759572347, "rewards/margins": 2.1269193490346274, "rewards/rejected": -1.7113003730773926, "step": 16548 }, { "epoch": 0.8771632258235497, "grad_norm": 40.5, "kl": 0.17856693267822266, "learning_rate": 5e-07, "logits/chosen": -1019660.5, "logits/rejected": -8940648.0, "logps/chosen": -45.24454116821289, "logps/rejected": -280.3490804036458, "loss": 0.2056, "rewards/chosen": 0.6942238807678223, "rewards/margins": 2.8705856005350747, "rewards/rejected": -2.1763617197672525, "step": 16549 }, { "epoch": 0.8772162298253519, "grad_norm": 46.25, "kl": 1.659195899963379, "learning_rate": 5e-07, "logits/chosen": -10761790.0, "logits/rejected": -15062110.0, "logps/chosen": -133.7496795654297, "logps/rejected": -167.39718627929688, "loss": 0.3365, "rewards/chosen": 0.08583469688892365, "rewards/margins": 1.901910737156868, "rewards/rejected": -1.8160760402679443, "step": 16550 }, { "epoch": 0.8772692338271539, "grad_norm": 27.125, "kl": 1.3557815551757812, "learning_rate": 5e-07, "logits/chosen": 6833549.333333333, "logits/rejected": -58664524.8, "logps/chosen": -89.4103291829427, "logps/rejected": -548.77275390625, "loss": 0.1599, "rewards/chosen": 0.843547503153483, "rewards/margins": 4.114916674296062, "rewards/rejected": -3.2713691711425783, "step": 16551 }, { "epoch": 0.8773222378289561, "grad_norm": 24.875, "kl": 0.8852519989013672, "learning_rate": 5e-07, "logits/chosen": -5380048.666666667, "logits/rejected": -15478691.2, "logps/chosen": -111.83673095703125, "logps/rejected": -298.0344970703125, "loss": 0.1714, "rewards/chosen": 0.6232577164967855, "rewards/margins": 4.521462519963582, "rewards/rejected": -3.898204803466797, "step": 16552 }, { "epoch": 0.8773752418307582, "grad_norm": 41.5, "kl": 0.7758293151855469, "learning_rate": 5e-07, "logits/chosen": -53264725.333333336, "logits/rejected": -24725336.0, "logps/chosen": -154.855224609375, "logps/rejected": -419.91201171875, "loss": 0.2441, "rewards/chosen": 0.05407822132110596, "rewards/margins": 3.115752339363098, "rewards/rejected": -3.061674118041992, "step": 16553 }, { "epoch": 0.8774282458325604, "grad_norm": 51.5, "kl": 0.33478546142578125, "learning_rate": 5e-07, "logits/chosen": -13113405.333333334, "logits/rejected": -20439428.0, "logps/chosen": -166.5185343424479, "logps/rejected": -157.31784057617188, "loss": 0.4159, "rewards/chosen": -0.03154356777667999, "rewards/margins": 3.322469875216484, "rewards/rejected": -3.354013442993164, "step": 16554 }, { "epoch": 0.8774812498343625, "grad_norm": 61.5, "kl": 1.3300666809082031, "learning_rate": 5e-07, "logits/chosen": -57302170.666666664, "logits/rejected": -22711072.0, "logps/chosen": -370.8351643880208, "logps/rejected": -316.048291015625, "loss": 0.2935, "rewards/chosen": -0.053077638149261475, "rewards/margins": 2.08092600107193, "rewards/rejected": -2.1340036392211914, "step": 16555 }, { "epoch": 0.8775342538361647, "grad_norm": 33.5, "kl": 3.72821044921875, "learning_rate": 5e-07, "logits/chosen": -34678394.666666664, "logits/rejected": 26427.2, "logps/chosen": -393.7937825520833, "logps/rejected": -447.9931640625, "loss": 0.2275, "rewards/chosen": 1.305846373240153, "rewards/margins": 4.157925955454509, "rewards/rejected": -2.8520795822143556, "step": 16556 }, { "epoch": 0.8775872578379668, "grad_norm": 43.5, "kl": 5.2961955070495605, "learning_rate": 5e-07, "logits/chosen": -51452411.428571425, "logits/rejected": -34763848.0, "logps/chosen": -366.62569754464283, "logps/rejected": -550.6708374023438, "loss": 0.4169, "rewards/chosen": 0.849433353969029, "rewards/margins": 5.266590527125767, "rewards/rejected": -4.417157173156738, "step": 16557 }, { "epoch": 0.877640261839769, "grad_norm": 46.25, "kl": 0.7048187255859375, "learning_rate": 5e-07, "logits/chosen": -14551488.0, "logits/rejected": -21734588.8, "logps/chosen": -314.73911539713544, "logps/rejected": -151.264453125, "loss": 0.2269, "rewards/chosen": 0.6794606844584147, "rewards/margins": 2.905237356821696, "rewards/rejected": -2.2257766723632812, "step": 16558 }, { "epoch": 0.877693265841571, "grad_norm": 54.5, "kl": 1.7556266784667969, "learning_rate": 5e-07, "logits/chosen": -57456038.4, "logits/rejected": 19380301.333333332, "logps/chosen": -743.140087890625, "logps/rejected": -551.638671875, "loss": 0.223, "rewards/chosen": 1.5034205436706543, "rewards/margins": 4.236755402882894, "rewards/rejected": -2.7333348592122397, "step": 16559 }, { "epoch": 0.8777462698433732, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25952970.0, "logits/rejected": -8995690.666666666, "logps/chosen": -210.7122802734375, "logps/rejected": -181.9366455078125, "loss": 0.2611, "rewards/chosen": -0.01352652907371521, "rewards/margins": 1.770551433165868, "rewards/rejected": -1.7840779622395833, "step": 16560 }, { "epoch": 0.8777992738451753, "grad_norm": 33.0, "kl": 0.9328546524047852, "learning_rate": 5e-07, "logits/chosen": -15165160.0, "logits/rejected": -6661123.6, "logps/chosen": -1102.0577799479167, "logps/rejected": -403.8423583984375, "loss": 0.126, "rewards/chosen": 2.487079461415609, "rewards/margins": 5.400386651357016, "rewards/rejected": -2.9133071899414062, "step": 16561 }, { "epoch": 0.8778522778469775, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54566040.0, "logits/rejected": -9875476.0, "logps/chosen": -524.32470703125, "logps/rejected": -144.68578084309897, "loss": 0.194, "rewards/chosen": 0.7948257923126221, "rewards/margins": 2.969639857610067, "rewards/rejected": -2.174814065297445, "step": 16562 }, { "epoch": 0.8779052818487796, "grad_norm": 44.25, "kl": 0.5586938858032227, "learning_rate": 5e-07, "logits/chosen": -20400208.0, "logits/rejected": -16618973.0, "logps/chosen": -163.59298706054688, "logps/rejected": -266.2591247558594, "loss": 0.2714, "rewards/chosen": 0.40830105543136597, "rewards/margins": 2.9119665026664734, "rewards/rejected": -2.5036654472351074, "step": 16563 }, { "epoch": 0.8779582858505817, "grad_norm": 51.5, "kl": 1.3059730529785156, "learning_rate": 5e-07, "logits/chosen": 1618836.25, "logits/rejected": -23975446.0, "logps/chosen": -59.817909240722656, "logps/rejected": -413.580810546875, "loss": 0.3129, "rewards/chosen": -0.12312917411327362, "rewards/margins": 3.878611281514168, "rewards/rejected": -4.001740455627441, "step": 16564 }, { "epoch": 0.8780112898523839, "grad_norm": 51.25, "kl": 1.1191062927246094, "learning_rate": 5e-07, "logits/chosen": 21636850.666666668, "logits/rejected": -24024715.2, "logps/chosen": -410.7913411458333, "logps/rejected": -479.83662109375, "loss": 0.1779, "rewards/chosen": 0.8247876962025961, "rewards/margins": 3.3285734017690025, "rewards/rejected": -2.5037857055664063, "step": 16565 }, { "epoch": 0.8780642938541859, "grad_norm": 41.25, "kl": 0.7961349487304688, "learning_rate": 5e-07, "logits/chosen": -57747992.0, "logits/rejected": -10718593.0, "logps/chosen": -240.68170166015625, "logps/rejected": -309.2747802734375, "loss": 0.2569, "rewards/chosen": 0.5452781915664673, "rewards/margins": 3.1732412576675415, "rewards/rejected": -2.627963066101074, "step": 16566 }, { "epoch": 0.8781172978559881, "grad_norm": 51.5, "kl": 1.2419497966766357, "learning_rate": 5e-07, "logits/chosen": -20883170.0, "logits/rejected": -21634468.0, "logps/chosen": -313.1064453125, "logps/rejected": -185.30355834960938, "loss": 0.2029, "rewards/chosen": 1.0252041816711426, "rewards/margins": 4.657140731811523, "rewards/rejected": -3.631936550140381, "step": 16567 }, { "epoch": 0.8781703018577902, "grad_norm": 40.0, "kl": 0.6489486694335938, "learning_rate": 5e-07, "logits/chosen": -5080421.333333333, "logits/rejected": -19756921.6, "logps/chosen": -342.0870768229167, "logps/rejected": -192.181591796875, "loss": 0.2129, "rewards/chosen": 1.1058100859324138, "rewards/margins": 3.839682499567668, "rewards/rejected": -2.733872413635254, "step": 16568 }, { "epoch": 0.8782233058595924, "grad_norm": 49.5, "kl": 1.4725732803344727, "learning_rate": 5e-07, "logits/chosen": -5819691.2, "logits/rejected": -43891520.0, "logps/chosen": -187.5691162109375, "logps/rejected": -376.8425699869792, "loss": 0.3339, "rewards/chosen": 0.2984838247299194, "rewards/margins": 2.339803989728292, "rewards/rejected": -2.0413201649983725, "step": 16569 }, { "epoch": 0.8782763098613945, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69177472.0, "logits/rejected": -16363684.57142857, "logps/chosen": -164.80300903320312, "logps/rejected": -331.8433314732143, "loss": 0.25, "rewards/chosen": -0.576129138469696, "rewards/margins": 1.6302905338151112, "rewards/rejected": -2.2064196722848073, "step": 16570 }, { "epoch": 0.8783293138631967, "grad_norm": 40.25, "kl": 1.4566850662231445, "learning_rate": 5e-07, "logits/chosen": -20972173.333333332, "logits/rejected": -13899961.6, "logps/chosen": -228.98771158854166, "logps/rejected": -192.267822265625, "loss": 0.2281, "rewards/chosen": 0.2619857390721639, "rewards/margins": 3.322781523068746, "rewards/rejected": -3.060795783996582, "step": 16571 }, { "epoch": 0.8783823178649988, "grad_norm": 64.0, "kl": 4.106662750244141, "learning_rate": 5e-07, "logits/chosen": -16445588.8, "logits/rejected": 55194160.0, "logps/chosen": -533.5271484375, "logps/rejected": -294.2818196614583, "loss": 0.2635, "rewards/chosen": 1.0442153930664062, "rewards/margins": 2.831635030110677, "rewards/rejected": -1.7874196370442708, "step": 16572 }, { "epoch": 0.878435321866801, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -28868444.0, "logps/rejected": -377.9117736816406, "loss": 0.1785, "rewards/rejected": -2.172471523284912, "step": 16573 }, { "epoch": 0.878488325868603, "grad_norm": 38.25, "kl": 0.4435892105102539, "learning_rate": 5e-07, "logits/chosen": -13014296.0, "logits/rejected": -15754163.0, "logps/chosen": -128.7867889404297, "logps/rejected": -216.95730590820312, "loss": 0.2769, "rewards/chosen": 0.5689423680305481, "rewards/margins": 2.4425496459007263, "rewards/rejected": -1.8736072778701782, "step": 16574 }, { "epoch": 0.8785413298704052, "grad_norm": 49.25, "kl": 1.4143352508544922, "learning_rate": 5e-07, "logits/chosen": -11028804.0, "logits/rejected": -43930936.0, "logps/chosen": -189.46165466308594, "logps/rejected": -252.68731689453125, "loss": 0.3352, "rewards/chosen": 0.38768458366394043, "rewards/margins": 2.1282901763916016, "rewards/rejected": -1.7406055927276611, "step": 16575 }, { "epoch": 0.8785943338722073, "grad_norm": 33.0, "kl": 4.648734092712402, "learning_rate": 5e-07, "logits/chosen": -8798206.4, "logits/rejected": -43860784.0, "logps/chosen": -525.776611328125, "logps/rejected": -136.4749552408854, "loss": 0.2926, "rewards/chosen": 1.4619593620300293, "rewards/margins": 3.5304436683654785, "rewards/rejected": -2.068484306335449, "step": 16576 }, { "epoch": 0.8786473378740095, "grad_norm": 43.25, "kl": 1.5526847839355469, "learning_rate": 5e-07, "logits/chosen": -12095615.2, "logits/rejected": -67323594.66666667, "logps/chosen": -232.44833984375, "logps/rejected": -642.4222412109375, "loss": 0.3185, "rewards/chosen": 0.2922669887542725, "rewards/margins": 3.220655584335327, "rewards/rejected": -2.9283885955810547, "step": 16577 }, { "epoch": 0.8787003418758116, "grad_norm": 39.5, "kl": 2.5606918334960938, "learning_rate": 5e-07, "logits/chosen": -16119198.4, "logits/rejected": -30352213.333333332, "logps/chosen": -417.78173828125, "logps/rejected": -366.62060546875, "loss": 0.2933, "rewards/chosen": 0.6871655464172364, "rewards/margins": 4.003162034352621, "rewards/rejected": -3.3159964879353843, "step": 16578 }, { "epoch": 0.8787533458776138, "grad_norm": 60.0, "kl": 4.3117876052856445, "learning_rate": 5e-07, "logits/chosen": -40624612.571428575, "logits/rejected": -26717568.0, "logps/chosen": -292.42562430245533, "logps/rejected": -616.517578125, "loss": 0.4106, "rewards/chosen": 0.6991355078560966, "rewards/margins": 2.6102195637566705, "rewards/rejected": -1.9110840559005737, "step": 16579 }, { "epoch": 0.8788063498794159, "grad_norm": 45.75, "kl": 3.394245147705078, "learning_rate": 5e-07, "logits/chosen": -61791648.0, "logits/rejected": -7044419.0, "logps/chosen": -304.9141540527344, "logps/rejected": -147.8126678466797, "loss": 0.3232, "rewards/chosen": 0.6616984605789185, "rewards/margins": 3.4090999364852905, "rewards/rejected": -2.747401475906372, "step": 16580 }, { "epoch": 0.878859353881218, "grad_norm": 66.0, "kl": 1.8478927612304688, "learning_rate": 5e-07, "logits/chosen": -52982730.666666664, "logits/rejected": -28645475.2, "logps/chosen": -495.80712890625, "logps/rejected": -439.309033203125, "loss": 0.2219, "rewards/chosen": 1.145659367243449, "rewards/margins": 3.926890675226847, "rewards/rejected": -2.7812313079833983, "step": 16581 }, { "epoch": 0.8789123578830201, "grad_norm": 47.0, "kl": 0.2387542724609375, "learning_rate": 5e-07, "logits/chosen": 6483876.0, "logits/rejected": -69641936.0, "logps/chosen": -96.89332580566406, "logps/rejected": -456.3962097167969, "loss": 0.2806, "rewards/chosen": 0.14308756589889526, "rewards/margins": 3.4037691950798035, "rewards/rejected": -3.260681629180908, "step": 16582 }, { "epoch": 0.8789653618848223, "grad_norm": 40.25, "kl": 2.4921255111694336, "learning_rate": 5e-07, "logits/chosen": -4758979.5, "logits/rejected": -4241095.5, "logps/chosen": -193.56211853027344, "logps/rejected": -209.0444793701172, "loss": 0.345, "rewards/chosen": 0.6947126388549805, "rewards/margins": 2.1490931510925293, "rewards/rejected": -1.4543805122375488, "step": 16583 }, { "epoch": 0.8790183658866244, "grad_norm": 76.5, "kl": 2.122142791748047, "learning_rate": 5e-07, "logits/chosen": 93054741.33333333, "logits/rejected": -66517072.0, "logps/chosen": -376.6695963541667, "logps/rejected": -549.168212890625, "loss": 0.454, "rewards/chosen": -0.02549851934115092, "rewards/margins": 2.886265148719152, "rewards/rejected": -2.9117636680603027, "step": 16584 }, { "epoch": 0.8790713698884266, "grad_norm": 27.75, "kl": 2.819122314453125, "learning_rate": 5e-07, "logits/chosen": -167154.5, "logits/rejected": -19533816.0, "logps/chosen": -46.23590596516927, "logps/rejected": -300.1348388671875, "loss": 0.2542, "rewards/chosen": 0.4789244333902995, "rewards/margins": 2.164488855997721, "rewards/rejected": -1.6855644226074218, "step": 16585 }, { "epoch": 0.8791243738902287, "grad_norm": 68.5, "kl": 4.945600509643555, "learning_rate": 5e-07, "logits/chosen": -33063024.0, "logits/rejected": -7609191.0, "logps/chosen": -373.9488525390625, "logps/rejected": -42.43975067138672, "loss": 0.371, "rewards/chosen": 1.0538612206776936, "rewards/margins": 2.0234170754750567, "rewards/rejected": -0.9695558547973633, "step": 16586 }, { "epoch": 0.8791773778920309, "grad_norm": 48.25, "kl": 1.1935467720031738, "learning_rate": 5e-07, "logits/chosen": -57757056.0, "logits/rejected": -49519656.0, "logps/chosen": -302.2220458984375, "logps/rejected": -181.58839416503906, "loss": 0.2995, "rewards/chosen": 0.36414089798927307, "rewards/margins": 2.134103924036026, "rewards/rejected": -1.769963026046753, "step": 16587 }, { "epoch": 0.879230381893833, "grad_norm": 45.75, "kl": 1.529550552368164, "learning_rate": 5e-07, "logits/chosen": -63953752.0, "logits/rejected": -16591516.0, "logps/chosen": -300.6279296875, "logps/rejected": -472.4260559082031, "loss": 0.1909, "rewards/chosen": 0.9973891973495483, "rewards/margins": 4.278039336204529, "rewards/rejected": -3.2806501388549805, "step": 16588 }, { "epoch": 0.8792833858956352, "grad_norm": 30.125, "kl": 0.2789430618286133, "learning_rate": 5e-07, "logits/chosen": -6214268.0, "logits/rejected": -24624154.666666668, "logps/chosen": -95.5008316040039, "logps/rejected": -336.699951171875, "loss": 0.1892, "rewards/chosen": 0.38306960463523865, "rewards/margins": 3.084582577149073, "rewards/rejected": -2.7015129725138345, "step": 16589 }, { "epoch": 0.8793363898974372, "grad_norm": 48.25, "kl": 1.4531745910644531, "learning_rate": 5e-07, "logits/chosen": -35727865.6, "logits/rejected": -7146066.666666667, "logps/chosen": -337.271240234375, "logps/rejected": -184.3320109049479, "loss": 0.2692, "rewards/chosen": 0.8042990684509277, "rewards/margins": 3.0739531834920246, "rewards/rejected": -2.269654115041097, "step": 16590 }, { "epoch": 0.8793893938992394, "grad_norm": 37.25, "kl": 0.4036121368408203, "learning_rate": 5e-07, "logits/chosen": -4293168.0, "logits/rejected": 2189756.0, "logps/chosen": -229.67971801757812, "logps/rejected": -471.5472819010417, "loss": 0.2045, "rewards/chosen": 0.8937149047851562, "rewards/margins": 3.0292040506998696, "rewards/rejected": -2.1354891459147134, "step": 16591 }, { "epoch": 0.8794423979010415, "grad_norm": 46.5, "kl": 2.345076560974121, "learning_rate": 5e-07, "logits/chosen": -46248556.0, "logits/rejected": -30757988.0, "logps/chosen": -305.27301025390625, "logps/rejected": -419.6919860839844, "loss": 0.3751, "rewards/chosen": 0.278270423412323, "rewards/margins": 2.370581805706024, "rewards/rejected": -2.092311382293701, "step": 16592 }, { "epoch": 0.8794954019028437, "grad_norm": 50.75, "kl": 2.2314281463623047, "learning_rate": 5e-07, "logits/chosen": 13202534.0, "logits/rejected": -85650272.0, "logps/chosen": -51.800811767578125, "logps/rejected": -440.43426513671875, "loss": 0.2494, "rewards/chosen": 0.7499233484268188, "rewards/margins": 3.159629702568054, "rewards/rejected": -2.4097063541412354, "step": 16593 }, { "epoch": 0.8795484059046458, "grad_norm": 47.5, "kl": 2.9807052612304688, "learning_rate": 5e-07, "logits/chosen": -31518744.0, "logits/rejected": 2980368.0, "logps/chosen": -1280.5423583984375, "logps/rejected": -470.9076334635417, "loss": 0.2065, "rewards/chosen": 2.6757049560546875, "rewards/margins": 4.846960385640463, "rewards/rejected": -2.171255429585775, "step": 16594 }, { "epoch": 0.879601409906448, "grad_norm": 59.0, "kl": 0.08945846557617188, "learning_rate": 5e-07, "logits/chosen": -15529423.0, "logits/rejected": -20569892.0, "logps/chosen": -416.273193359375, "logps/rejected": -256.958984375, "loss": 0.2585, "rewards/chosen": 0.3429931402206421, "rewards/margins": 3.2126392126083374, "rewards/rejected": -2.8696460723876953, "step": 16595 }, { "epoch": 0.87965441390825, "grad_norm": 40.75, "kl": 2.8074817657470703, "learning_rate": 5e-07, "logits/chosen": -26284242.666666668, "logits/rejected": -36684272.0, "logps/chosen": -242.71183268229166, "logps/rejected": -208.12884521484375, "loss": 0.3438, "rewards/chosen": 0.5646311044692993, "rewards/margins": 4.151598572731018, "rewards/rejected": -3.5869674682617188, "step": 16596 }, { "epoch": 0.8797074179100522, "grad_norm": 30.5, "kl": 1.0320158004760742, "learning_rate": 5e-07, "logits/chosen": 1674199.8333333333, "logits/rejected": -33783398.4, "logps/chosen": -22.697186787923176, "logps/rejected": -415.20703125, "loss": 0.2025, "rewards/chosen": 0.7653044064839681, "rewards/margins": 3.084005578358968, "rewards/rejected": -2.318701171875, "step": 16597 }, { "epoch": 0.8797604219118543, "grad_norm": 44.25, "kl": 6.255891799926758, "learning_rate": 5e-07, "logits/chosen": 4991426.666666667, "logits/rejected": -15692803.2, "logps/chosen": -26.330523173014324, "logps/rejected": -332.1119140625, "loss": 0.3278, "rewards/chosen": 0.5510265827178955, "rewards/margins": 3.1230270862579346, "rewards/rejected": -2.572000503540039, "step": 16598 }, { "epoch": 0.8798134259136565, "grad_norm": 51.5, "kl": 3.993865966796875, "learning_rate": 5e-07, "logits/chosen": -20758260.0, "logits/rejected": -19666102.0, "logps/chosen": -456.85986328125, "logps/rejected": -237.98565673828125, "loss": 0.3476, "rewards/chosen": 1.167751630147298, "rewards/margins": 2.5061155160268145, "rewards/rejected": -1.3383638858795166, "step": 16599 }, { "epoch": 0.8798664299154586, "grad_norm": 34.5, "kl": 0.14319229125976562, "learning_rate": 5e-07, "logits/chosen": -30550160.0, "logits/rejected": -26547888.0, "logps/chosen": -295.3295084635417, "logps/rejected": -335.167919921875, "loss": 0.2086, "rewards/chosen": 0.6694689591725668, "rewards/margins": 3.260622294743856, "rewards/rejected": -2.5911533355712892, "step": 16600 }, { "epoch": 0.8799194339172608, "grad_norm": 28.125, "kl": 0.9091911315917969, "learning_rate": 5e-07, "logits/chosen": -20853190.666666668, "logits/rejected": -22339937.6, "logps/chosen": -371.7422688802083, "logps/rejected": -262.5515869140625, "loss": 0.1683, "rewards/chosen": 1.4731208483378093, "rewards/margins": 4.612509123484294, "rewards/rejected": -3.1393882751464846, "step": 16601 }, { "epoch": 0.8799724379190629, "grad_norm": 46.75, "kl": 1.2512130737304688, "learning_rate": 5e-07, "logits/chosen": -26366052.0, "logits/rejected": -16339537.0, "logps/chosen": -241.67355346679688, "logps/rejected": -299.8454895019531, "loss": 0.3439, "rewards/chosen": 0.03202486038208008, "rewards/margins": 2.276510715484619, "rewards/rejected": -2.244485855102539, "step": 16602 }, { "epoch": 0.8800254419208651, "grad_norm": 71.0, "kl": 0.5410385131835938, "learning_rate": 5e-07, "logits/chosen": -53893728.0, "logits/rejected": -573265.75, "logps/chosen": -449.1701354980469, "logps/rejected": -87.15833282470703, "loss": 0.319, "rewards/chosen": 0.44964295625686646, "rewards/margins": 1.871157467365265, "rewards/rejected": -1.4215145111083984, "step": 16603 }, { "epoch": 0.8800784459226672, "grad_norm": 37.0, "kl": 1.5122356414794922, "learning_rate": 5e-07, "logits/chosen": -40945436.0, "logits/rejected": -2117854.0, "logps/chosen": -285.01898193359375, "logps/rejected": -123.9357681274414, "loss": 0.2938, "rewards/chosen": 0.21445578336715698, "rewards/margins": 3.0546788573265076, "rewards/rejected": -2.8402230739593506, "step": 16604 }, { "epoch": 0.8801314499244693, "grad_norm": 78.5, "kl": 3.6662845611572266, "learning_rate": 5e-07, "logits/chosen": -47032636.0, "logits/rejected": -13234712.0, "logps/chosen": -349.0800476074219, "logps/rejected": -249.07252502441406, "loss": 0.2619, "rewards/chosen": 0.8441287875175476, "rewards/margins": 4.376435935497284, "rewards/rejected": -3.5323071479797363, "step": 16605 }, { "epoch": 0.8801844539262714, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16132756.0, "logits/rejected": -25433510.4, "logps/chosen": -354.0327555338542, "logps/rejected": -129.81417236328124, "loss": 0.244, "rewards/chosen": 0.6714192231496176, "rewards/margins": 2.6999089082082115, "rewards/rejected": -2.028489685058594, "step": 16606 }, { "epoch": 0.8802374579280736, "grad_norm": 52.5, "kl": 4.01680850982666, "learning_rate": 5e-07, "logits/chosen": -2996935.0, "logits/rejected": -67335968.0, "logps/chosen": -235.96961975097656, "logps/rejected": -443.7508544921875, "loss": 0.3128, "rewards/chosen": 0.3704335689544678, "rewards/margins": 3.1793711185455322, "rewards/rejected": -2.8089375495910645, "step": 16607 }, { "epoch": 0.8802904619298757, "grad_norm": 59.5, "kl": 3.1253700256347656, "learning_rate": 5e-07, "logits/chosen": -36038227.2, "logits/rejected": -16032765.333333334, "logps/chosen": -345.2101806640625, "logps/rejected": -715.7200520833334, "loss": 0.281, "rewards/chosen": 0.6947341442108155, "rewards/margins": 3.824496793746948, "rewards/rejected": -3.129762649536133, "step": 16608 }, { "epoch": 0.8803434659316779, "grad_norm": 36.75, "kl": 1.6371440887451172, "learning_rate": 5e-07, "logits/chosen": 1923973.125, "logits/rejected": -40941504.0, "logps/chosen": -54.77248001098633, "logps/rejected": -278.05889892578125, "loss": 0.2969, "rewards/chosen": 0.44128671288490295, "rewards/margins": 2.483090788125992, "rewards/rejected": -2.041804075241089, "step": 16609 }, { "epoch": 0.88039646993348, "grad_norm": 81.0, "kl": 4.050692558288574, "learning_rate": 5e-07, "logits/chosen": -3101622.4, "logits/rejected": -20577040.0, "logps/chosen": -576.51474609375, "logps/rejected": -303.4964192708333, "loss": 0.2867, "rewards/chosen": 1.1579267501831054, "rewards/margins": 3.954911740620931, "rewards/rejected": -2.7969849904378257, "step": 16610 }, { "epoch": 0.8804494739352822, "grad_norm": 44.0, "kl": 4.1276140213012695, "learning_rate": 5e-07, "logits/chosen": -62097843.2, "logits/rejected": -50616256.0, "logps/chosen": -516.836181640625, "logps/rejected": -529.9534505208334, "loss": 0.2378, "rewards/chosen": 1.4928156852722168, "rewards/margins": 4.334191672007242, "rewards/rejected": -2.841375986735026, "step": 16611 }, { "epoch": 0.8805024779370842, "grad_norm": 62.25, "kl": 5.113895416259766, "learning_rate": 5e-07, "logits/chosen": -47461961.6, "logits/rejected": -5272805.666666667, "logps/chosen": -399.3914306640625, "logps/rejected": -136.64373779296875, "loss": 0.3858, "rewards/chosen": 0.4226843357086182, "rewards/margins": 4.065120999018351, "rewards/rejected": -3.642436663309733, "step": 16612 }, { "epoch": 0.8805554819388863, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13450094.666666666, "logits/rejected": 18761014.4, "logps/chosen": -276.1051432291667, "logps/rejected": -291.679345703125, "loss": 0.1935, "rewards/chosen": 1.1834345658620198, "rewards/margins": 3.7256792863210038, "rewards/rejected": -2.542244720458984, "step": 16613 }, { "epoch": 0.8806084859406885, "grad_norm": 66.5, "kl": 4.628008842468262, "learning_rate": 5e-07, "logits/chosen": -32280712.0, "logits/rejected": 1375738.625, "logps/chosen": -299.83591715494794, "logps/rejected": -91.27702331542969, "loss": 0.423, "rewards/chosen": 0.7481686274210612, "rewards/margins": 2.0336997906366983, "rewards/rejected": -1.2855311632156372, "step": 16614 }, { "epoch": 0.8806614899424906, "grad_norm": 44.75, "kl": 2.514667510986328, "learning_rate": 5e-07, "logits/chosen": -18191200.0, "logits/rejected": -37443512.0, "logps/chosen": -175.71954345703125, "logps/rejected": -418.505859375, "loss": 0.323, "rewards/chosen": 0.45232999324798584, "rewards/margins": 2.8214133977890015, "rewards/rejected": -2.3690834045410156, "step": 16615 }, { "epoch": 0.8807144939442928, "grad_norm": 36.0, "kl": 0.38629913330078125, "learning_rate": 5e-07, "logits/chosen": 5796829.0, "logits/rejected": -6357289.333333333, "logps/chosen": -126.26052856445312, "logps/rejected": -304.101318359375, "loss": 0.2473, "rewards/chosen": -0.42494338750839233, "rewards/margins": 2.3378240863482156, "rewards/rejected": -2.762767473856608, "step": 16616 }, { "epoch": 0.8807674979460949, "grad_norm": 60.5, "kl": 2.1019744873046875, "learning_rate": 5e-07, "logits/chosen": -34959862.4, "logits/rejected": 141990592.0, "logps/chosen": -205.54794921875, "logps/rejected": -541.69287109375, "loss": 0.3761, "rewards/chosen": 0.05118393301963806, "rewards/margins": 1.9957660774389903, "rewards/rejected": -1.9445821444193523, "step": 16617 }, { "epoch": 0.8808205019478971, "grad_norm": 55.75, "kl": 3.1589126586914062, "learning_rate": 5e-07, "logits/chosen": -43737763.2, "logits/rejected": -31984618.666666668, "logps/chosen": -591.53076171875, "logps/rejected": -205.78377278645834, "loss": 0.3394, "rewards/chosen": 1.1396828651428224, "rewards/margins": 2.564531262715658, "rewards/rejected": -1.4248483975728352, "step": 16618 }, { "epoch": 0.8808735059496992, "grad_norm": 65.0, "kl": 2.8699989318847656, "learning_rate": 5e-07, "logits/chosen": -43948314.666666664, "logits/rejected": -72589600.0, "logps/chosen": -610.1280110677084, "logps/rejected": -412.8068542480469, "loss": 0.2602, "rewards/chosen": 1.2133253415425618, "rewards/margins": 4.3519056638081866, "rewards/rejected": -3.138580322265625, "step": 16619 }, { "epoch": 0.8809265099515013, "grad_norm": 35.5, "kl": 2.161073684692383, "learning_rate": 5e-07, "logits/chosen": -21389994.0, "logits/rejected": -18124984.0, "logps/chosen": -158.1241455078125, "logps/rejected": -179.8166961669922, "loss": 0.3156, "rewards/chosen": 0.3765551745891571, "rewards/margins": 3.0014626681804657, "rewards/rejected": -2.6249074935913086, "step": 16620 }, { "epoch": 0.8809795139533034, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -36004216.0, "logps/rejected": -196.32659912109375, "loss": 0.1953, "rewards/rejected": -1.740520715713501, "step": 16621 }, { "epoch": 0.8810325179551056, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33029637.333333332, "logits/rejected": -19038737.6, "logps/chosen": -313.26983642578125, "logps/rejected": -372.30224609375, "loss": 0.2026, "rewards/chosen": 0.2924092809359233, "rewards/margins": 3.3041303197542824, "rewards/rejected": -3.0117210388183593, "step": 16622 }, { "epoch": 0.8810855219569077, "grad_norm": 33.75, "kl": 3.682201385498047, "learning_rate": 5e-07, "logits/chosen": -9558522.0, "logits/rejected": -17683876.8, "logps/chosen": -170.32232666015625, "logps/rejected": -356.5421875, "loss": 0.3096, "rewards/chosen": 0.43284404277801514, "rewards/margins": 2.5981762647628783, "rewards/rejected": -2.165332221984863, "step": 16623 }, { "epoch": 0.8811385259587099, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44763509.333333336, "logits/rejected": -21691252.8, "logps/chosen": -377.468017578125, "logps/rejected": -263.29208984375, "loss": 0.2625, "rewards/chosen": -0.22897537549336752, "rewards/margins": 2.381682697931925, "rewards/rejected": -2.610658073425293, "step": 16624 }, { "epoch": 0.881191529960512, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 23807672.0, "logits/rejected": -10607746.285714285, "logps/chosen": -404.90496826171875, "logps/rejected": -179.38612583705358, "loss": 0.2589, "rewards/chosen": 0.7790069580078125, "rewards/margins": 2.1374216079711914, "rewards/rejected": -1.358414649963379, "step": 16625 }, { "epoch": 0.8812445339623142, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26364912.0, "logits/rejected": -27345824.0, "logps/chosen": -280.6190185546875, "logps/rejected": -456.1330159505208, "loss": 0.1293, "rewards/chosen": 0.5226287841796875, "rewards/margins": 3.99794864654541, "rewards/rejected": -3.4753198623657227, "step": 16626 }, { "epoch": 0.8812975379641163, "grad_norm": 33.5, "kl": 5.044116973876953, "learning_rate": 5e-07, "logits/chosen": -20082684.0, "logits/rejected": -54570636.0, "logps/chosen": -229.7994842529297, "logps/rejected": -547.1380004882812, "loss": 0.3321, "rewards/chosen": -0.05060477554798126, "rewards/margins": 3.366352364420891, "rewards/rejected": -3.416957139968872, "step": 16627 }, { "epoch": 0.8813505419659184, "grad_norm": 42.0, "kl": 3.432803153991699, "learning_rate": 5e-07, "logits/chosen": -12750193.333333334, "logits/rejected": -27619340.8, "logps/chosen": -456.9672037760417, "logps/rejected": -314.0638671875, "loss": 0.2767, "rewards/chosen": 1.2063024838765461, "rewards/margins": 3.2799032529195147, "rewards/rejected": -2.0736007690429688, "step": 16628 }, { "epoch": 0.8814035459677205, "grad_norm": 53.5, "kl": 1.3381462097167969, "learning_rate": 5e-07, "logits/chosen": -4867511.0, "logits/rejected": -120014794.66666667, "logps/chosen": -127.15597534179688, "logps/rejected": -232.12874348958334, "loss": 0.1831, "rewards/chosen": 1.3694523572921753, "rewards/margins": 3.418371558189392, "rewards/rejected": -2.048919200897217, "step": 16629 }, { "epoch": 0.8814565499695227, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38361381.333333336, "logits/rejected": -19732259.2, "logps/chosen": -383.0625813802083, "logps/rejected": -210.9898681640625, "loss": 0.1623, "rewards/chosen": 0.85511581103007, "rewards/margins": 3.9312731901804603, "rewards/rejected": -3.0761573791503904, "step": 16630 }, { "epoch": 0.8815095539713248, "grad_norm": 51.0, "kl": 0.7442359924316406, "learning_rate": 5e-07, "logits/chosen": -19959728.0, "logits/rejected": -15062120.0, "logps/chosen": -282.62192789713544, "logps/rejected": -248.9494873046875, "loss": 0.3883, "rewards/chosen": -0.5298158725102743, "rewards/margins": 1.5609834591547647, "rewards/rejected": -2.090799331665039, "step": 16631 }, { "epoch": 0.881562557973127, "grad_norm": 71.5, "kl": 4.621776580810547, "learning_rate": 5e-07, "logits/chosen": -31357620.0, "logits/rejected": -30104266.0, "logps/chosen": -659.0379028320312, "logps/rejected": -276.52691650390625, "loss": 0.2831, "rewards/chosen": 1.3991860151290894, "rewards/margins": 2.6268779039382935, "rewards/rejected": -1.227691888809204, "step": 16632 }, { "epoch": 0.8816155619749291, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -106859160.0, "logits/rejected": -7027798.285714285, "logps/chosen": -323.8788757324219, "logps/rejected": -268.58231026785717, "loss": 0.1812, "rewards/chosen": -0.11741333454847336, "rewards/margins": 2.4106810933777263, "rewards/rejected": -2.5280944279261996, "step": 16633 }, { "epoch": 0.8816685659767313, "grad_norm": 48.75, "kl": 1.1856088638305664, "learning_rate": 5e-07, "logits/chosen": -45000522.666666664, "logits/rejected": -10924864.0, "logps/chosen": -217.4437255859375, "logps/rejected": -257.7509765625, "loss": 0.3673, "rewards/chosen": 0.31720378001530963, "rewards/margins": 3.652733107407888, "rewards/rejected": -3.335529327392578, "step": 16634 }, { "epoch": 0.8817215699785333, "grad_norm": 23.625, "kl": 3.1179561614990234, "learning_rate": 5e-07, "logits/chosen": 10526974.0, "logits/rejected": -47790240.0, "logps/chosen": -58.57505798339844, "logps/rejected": -522.7440185546875, "loss": 0.1377, "rewards/chosen": 0.8493908047676086, "rewards/margins": 4.5769003033638, "rewards/rejected": -3.7275094985961914, "step": 16635 }, { "epoch": 0.8817745739803355, "grad_norm": 51.75, "kl": 2.659097671508789, "learning_rate": 5e-07, "logits/chosen": -8965936.0, "logits/rejected": -14369189.333333334, "logps/chosen": -206.1257080078125, "logps/rejected": -266.8450113932292, "loss": 0.2815, "rewards/chosen": 0.9416544914245606, "rewards/margins": 3.4310164133707683, "rewards/rejected": -2.4893619219462075, "step": 16636 }, { "epoch": 0.8818275779821376, "grad_norm": 42.25, "kl": 0.2782611846923828, "learning_rate": 5e-07, "logits/chosen": -4407824.333333333, "logits/rejected": -8595560.0, "logps/chosen": -221.8273722330729, "logps/rejected": -231.000244140625, "loss": 0.2602, "rewards/chosen": 0.5910825332005819, "rewards/margins": 2.205124052365621, "rewards/rejected": -1.614041519165039, "step": 16637 }, { "epoch": 0.8818805819839398, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2432212.0, "logits/rejected": -3478139.3333333335, "logps/chosen": -1597.613037109375, "logps/rejected": -275.8448079427083, "loss": 0.1474, "rewards/chosen": 2.1226775646209717, "rewards/margins": 4.842608531316122, "rewards/rejected": -2.71993096669515, "step": 16638 }, { "epoch": 0.8819335859857419, "grad_norm": 46.75, "kl": 0.3211174011230469, "learning_rate": 5e-07, "logits/chosen": -66062408.0, "logits/rejected": -17571668.0, "logps/chosen": -472.9914245605469, "logps/rejected": -326.63897705078125, "loss": 0.1762, "rewards/chosen": 1.0214158296585083, "rewards/margins": 3.7722946405410767, "rewards/rejected": -2.7508788108825684, "step": 16639 }, { "epoch": 0.8819865899875441, "grad_norm": 55.25, "kl": 0.07172393798828125, "learning_rate": 5e-07, "logits/chosen": -102366698.66666667, "logits/rejected": -8229548.8, "logps/chosen": -614.0729573567709, "logps/rejected": -311.1986572265625, "loss": 0.1572, "rewards/chosen": 1.2181905110677083, "rewards/margins": 3.804408582051595, "rewards/rejected": -2.586218070983887, "step": 16640 }, { "epoch": 0.8820395939893462, "grad_norm": 43.0, "kl": 2.0624923706054688, "learning_rate": 5e-07, "logits/chosen": -30352564.0, "logits/rejected": -50216988.0, "logps/chosen": -189.09080505371094, "logps/rejected": -583.8690795898438, "loss": 0.2608, "rewards/chosen": 0.49719667434692383, "rewards/margins": 3.207460641860962, "rewards/rejected": -2.710263967514038, "step": 16641 }, { "epoch": 0.8820925979911484, "grad_norm": 50.0, "kl": 2.6151809692382812, "learning_rate": 5e-07, "logits/chosen": -17475040.0, "logits/rejected": -42713728.0, "logps/chosen": -229.89420572916666, "logps/rejected": -157.01869201660156, "loss": 0.3096, "rewards/chosen": 0.8418893019358317, "rewards/margins": 3.814750591913859, "rewards/rejected": -2.9728612899780273, "step": 16642 }, { "epoch": 0.8821456019929504, "grad_norm": 51.75, "kl": 2.779322624206543, "learning_rate": 5e-07, "logits/chosen": -26393000.0, "logits/rejected": 29963462.0, "logps/chosen": -119.01370239257812, "logps/rejected": -312.8463134765625, "loss": 0.4003, "rewards/chosen": -0.08543506264686584, "rewards/margins": 1.6830379664897919, "rewards/rejected": -1.7684730291366577, "step": 16643 }, { "epoch": 0.8821986059947526, "grad_norm": 120.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86970432.0, "logits/rejected": -15668492.8, "logps/chosen": -292.85093180338544, "logps/rejected": -329.131640625, "loss": 0.2445, "rewards/chosen": 0.03212737043698629, "rewards/margins": 2.676210774978002, "rewards/rejected": -2.6440834045410155, "step": 16644 }, { "epoch": 0.8822516099965547, "grad_norm": 54.0, "kl": 1.777653694152832, "learning_rate": 5e-07, "logits/chosen": -24714928.0, "logits/rejected": -30830262.0, "logps/chosen": -246.967041015625, "logps/rejected": -465.8223571777344, "loss": 0.2063, "rewards/chosen": 1.3335607051849365, "rewards/margins": 3.644444704055786, "rewards/rejected": -2.3108839988708496, "step": 16645 }, { "epoch": 0.8823046139983569, "grad_norm": 69.5, "kl": 0.3435859680175781, "learning_rate": 5e-07, "logits/chosen": 14604107.0, "logits/rejected": -15583245.0, "logps/chosen": -520.2786254882812, "logps/rejected": -188.72482299804688, "loss": 0.2751, "rewards/chosen": 0.5586057901382446, "rewards/margins": 2.306089997291565, "rewards/rejected": -1.7474842071533203, "step": 16646 }, { "epoch": 0.882357618000159, "grad_norm": 69.5, "kl": 0.8484725952148438, "learning_rate": 5e-07, "logits/chosen": 39440905.14285714, "logits/rejected": -47394888.0, "logps/chosen": -325.78470284598217, "logps/rejected": -287.7921142578125, "loss": 0.4595, "rewards/chosen": 0.07571443489619664, "rewards/margins": 2.894018973623003, "rewards/rejected": -2.8183045387268066, "step": 16647 }, { "epoch": 0.8824106220019612, "grad_norm": 76.5, "kl": 4.97410774230957, "learning_rate": 5e-07, "logits/chosen": -1650348.3333333333, "logits/rejected": -25110873.6, "logps/chosen": -667.912109375, "logps/rejected": -349.427197265625, "loss": 0.2641, "rewards/chosen": 1.2364858786265056, "rewards/margins": 3.12800866762797, "rewards/rejected": -1.8915227890014648, "step": 16648 }, { "epoch": 0.8824636260037633, "grad_norm": 38.75, "kl": 2.8811893463134766, "learning_rate": 5e-07, "logits/chosen": -13969932.8, "logits/rejected": -23677994.666666668, "logps/chosen": -134.19654541015626, "logps/rejected": -163.1265665690104, "loss": 0.4229, "rewards/chosen": -0.03084941506385803, "rewards/margins": 2.1409674108028414, "rewards/rejected": -2.171816825866699, "step": 16649 }, { "epoch": 0.8825166300055655, "grad_norm": 42.25, "kl": 2.5242862701416016, "learning_rate": 5e-07, "logits/chosen": -11750048.0, "logits/rejected": -15940298.666666666, "logps/chosen": -223.5294677734375, "logps/rejected": -454.2654622395833, "loss": 0.2268, "rewards/chosen": 1.1365272521972656, "rewards/margins": 5.20326665242513, "rewards/rejected": -4.066739400227864, "step": 16650 }, { "epoch": 0.8825696340073675, "grad_norm": 54.5, "kl": 2.505535125732422, "learning_rate": 5e-07, "logits/chosen": -33840152.0, "logits/rejected": -50011672.0, "logps/chosen": -536.43408203125, "logps/rejected": -512.92919921875, "loss": 0.243, "rewards/chosen": 1.6301865577697754, "rewards/margins": 4.004753589630127, "rewards/rejected": -2.3745670318603516, "step": 16651 }, { "epoch": 0.8826226380091697, "grad_norm": 65.5, "kl": 7.482906341552734, "learning_rate": 5e-07, "logits/chosen": -6423995.2, "logits/rejected": -7555030.666666667, "logps/chosen": -245.2541748046875, "logps/rejected": -205.34688313802084, "loss": 0.227, "rewards/chosen": 1.974158477783203, "rewards/margins": 4.968701871236165, "rewards/rejected": -2.9945433934529624, "step": 16652 }, { "epoch": 0.8826756420109718, "grad_norm": 47.5, "kl": 3.162679672241211, "learning_rate": 5e-07, "logits/chosen": -44770028.0, "logits/rejected": -14895872.0, "logps/chosen": -438.4910888671875, "logps/rejected": -138.67193603515625, "loss": 0.2475, "rewards/chosen": 1.4167221784591675, "rewards/margins": 3.8302167654037476, "rewards/rejected": -2.41349458694458, "step": 16653 }, { "epoch": 0.882728646012774, "grad_norm": 53.0, "kl": 4.8962202072143555, "learning_rate": 5e-07, "logits/chosen": -36267930.666666664, "logits/rejected": -32460012.0, "logps/chosen": -224.5844523111979, "logps/rejected": -372.33087158203125, "loss": 0.3478, "rewards/chosen": 0.8125375906626383, "rewards/margins": 3.810035149256388, "rewards/rejected": -2.99749755859375, "step": 16654 }, { "epoch": 0.8827816500145761, "grad_norm": 50.0, "kl": 5.73597526550293, "learning_rate": 5e-07, "logits/chosen": -19014876.8, "logits/rejected": -43807301.333333336, "logps/chosen": -260.5309326171875, "logps/rejected": -355.926513671875, "loss": 0.4801, "rewards/chosen": 0.17025245428085328, "rewards/margins": 1.7991240541140239, "rewards/rejected": -1.6288715998331706, "step": 16655 }, { "epoch": 0.8828346540163783, "grad_norm": 52.25, "kl": 1.5513935089111328, "learning_rate": 5e-07, "logits/chosen": -112211818.66666667, "logits/rejected": -4065721.6, "logps/chosen": -449.1668294270833, "logps/rejected": -285.8947265625, "loss": 0.2396, "rewards/chosen": 0.7280364831288656, "rewards/margins": 2.6206943353017174, "rewards/rejected": -1.8926578521728517, "step": 16656 }, { "epoch": 0.8828876580181804, "grad_norm": 40.25, "kl": 0.2627687454223633, "learning_rate": 5e-07, "logits/chosen": -1694047.625, "logits/rejected": -9377542.0, "logps/chosen": -8.58640193939209, "logps/rejected": -276.9837239583333, "loss": 0.2791, "rewards/chosen": -0.04957323148846626, "rewards/margins": 1.649475605537494, "rewards/rejected": -1.6990488370259602, "step": 16657 }, { "epoch": 0.8829406620199826, "grad_norm": 40.5, "kl": 5.354846954345703, "learning_rate": 5e-07, "logits/chosen": -26417798.0, "logits/rejected": -28764984.0, "logps/chosen": -572.2062377929688, "logps/rejected": -428.6179504394531, "loss": 0.2215, "rewards/chosen": 1.2322816848754883, "rewards/margins": 5.183816909790039, "rewards/rejected": -3.951535224914551, "step": 16658 }, { "epoch": 0.8829936660217846, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96969717.33333333, "logits/rejected": -29284828.8, "logps/chosen": -420.1138509114583, "logps/rejected": -378.70234375, "loss": 0.3036, "rewards/chosen": 0.09642207622528076, "rewards/margins": 2.034968447685242, "rewards/rejected": -1.938546371459961, "step": 16659 }, { "epoch": 0.8830466700235868, "grad_norm": 35.5, "kl": 1.2464532852172852, "learning_rate": 5e-07, "logits/chosen": 2552770.2, "logits/rejected": -31086898.666666668, "logps/chosen": -218.5771484375, "logps/rejected": -345.082763671875, "loss": 0.1824, "rewards/chosen": 1.4279289245605469, "rewards/margins": 3.84684419631958, "rewards/rejected": -2.418915271759033, "step": 16660 }, { "epoch": 0.8830996740253889, "grad_norm": 53.5, "kl": 1.638418197631836, "learning_rate": 5e-07, "logits/chosen": -27235037.333333332, "logits/rejected": -40014232.0, "logps/chosen": -298.11647542317706, "logps/rejected": -373.2649230957031, "loss": 0.3879, "rewards/chosen": 0.24726142485936484, "rewards/margins": 4.638435264428456, "rewards/rejected": -4.391173839569092, "step": 16661 }, { "epoch": 0.8831526780271911, "grad_norm": 97.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71133051.42857143, "logits/rejected": -10202786.0, "logps/chosen": -327.439208984375, "logps/rejected": -276.8881530761719, "loss": 0.3579, "rewards/chosen": 0.47117386545453754, "rewards/margins": 2.4726050921848843, "rewards/rejected": -2.0014312267303467, "step": 16662 }, { "epoch": 0.8832056820289932, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44175933.333333336, "logits/rejected": -34391302.4, "logps/chosen": -176.6646728515625, "logps/rejected": -285.7042236328125, "loss": 0.2573, "rewards/chosen": 0.40871278444925946, "rewards/margins": 2.574709908167521, "rewards/rejected": -2.165997123718262, "step": 16663 }, { "epoch": 0.8832586860307953, "grad_norm": 58.75, "kl": 1.6926136016845703, "learning_rate": 5e-07, "logits/chosen": 258238.33333333334, "logits/rejected": -35482588.0, "logps/chosen": -363.3629150390625, "logps/rejected": -405.5702209472656, "loss": 0.3541, "rewards/chosen": 0.32152803738911945, "rewards/margins": 3.3353708585103354, "rewards/rejected": -3.013842821121216, "step": 16664 }, { "epoch": 0.8833116900325975, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": 11450402.0, "logps/rejected": -346.7148742675781, "loss": 0.09, "rewards/rejected": -3.4738380908966064, "step": 16665 }, { "epoch": 0.8833646940343995, "grad_norm": 32.0, "kl": 1.2651538848876953, "learning_rate": 5e-07, "logits/chosen": -28581573.333333332, "logits/rejected": -11434132.8, "logps/chosen": -186.5005900065104, "logps/rejected": -235.526318359375, "loss": 0.2335, "rewards/chosen": 0.09543602665265401, "rewards/margins": 2.989821936686834, "rewards/rejected": -2.8943859100341798, "step": 16666 }, { "epoch": 0.8834176980362017, "grad_norm": 38.25, "kl": 3.5171022415161133, "learning_rate": 5e-07, "logits/chosen": -33588464.0, "logits/rejected": 27126048.0, "logps/chosen": -159.6494384765625, "logps/rejected": -308.3507893880208, "loss": 0.3902, "rewards/chosen": 0.09725867509841919, "rewards/margins": 3.5754516402880347, "rewards/rejected": -3.4781929651896157, "step": 16667 }, { "epoch": 0.8834707020380038, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16880746.666666668, "logits/rejected": 47933635.2, "logps/chosen": -321.6177164713542, "logps/rejected": -397.6268310546875, "loss": 0.2149, "rewards/chosen": 0.8019256591796875, "rewards/margins": 2.920453643798828, "rewards/rejected": -2.1185279846191407, "step": 16668 }, { "epoch": 0.883523706039806, "grad_norm": 45.75, "kl": 2.2775726318359375, "learning_rate": 5e-07, "logits/chosen": -1536768.0, "logits/rejected": -17815760.0, "logps/chosen": -234.07418823242188, "logps/rejected": -359.7729187011719, "loss": 0.3218, "rewards/chosen": 0.6631885170936584, "rewards/margins": 3.3378437161445618, "rewards/rejected": -2.6746551990509033, "step": 16669 }, { "epoch": 0.8835767100416081, "grad_norm": 41.5, "kl": 2.0118408203125, "learning_rate": 5e-07, "logits/chosen": -13276137.6, "logits/rejected": -47367882.666666664, "logps/chosen": -156.54910888671876, "logps/rejected": -626.1625569661459, "loss": 0.3755, "rewards/chosen": -0.05636330842971802, "rewards/margins": 3.7236539085706077, "rewards/rejected": -3.7800172170003257, "step": 16670 }, { "epoch": 0.8836297140434103, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16342320.0, "logits/rejected": -33641656.0, "logps/chosen": -115.2917251586914, "logps/rejected": -351.2142333984375, "loss": 0.2484, "rewards/chosen": 0.03321990743279457, "rewards/margins": 1.9868145920336246, "rewards/rejected": -1.95359468460083, "step": 16671 }, { "epoch": 0.8836827180452124, "grad_norm": 39.5, "kl": 0.9218177795410156, "learning_rate": 5e-07, "logits/chosen": -39350432.0, "logits/rejected": -30395344.0, "logps/chosen": -624.9672241210938, "logps/rejected": -382.3455403645833, "loss": 0.1152, "rewards/chosen": 1.867431640625, "rewards/margins": 5.254107157389322, "rewards/rejected": -3.3866755167643228, "step": 16672 }, { "epoch": 0.8837357220470146, "grad_norm": 49.5, "kl": 2.638540267944336, "learning_rate": 5e-07, "logits/chosen": -18209094.4, "logits/rejected": -12565018.666666666, "logps/chosen": -317.195166015625, "logps/rejected": -81.79323832194011, "loss": 0.355, "rewards/chosen": 0.30404815673828123, "rewards/margins": 3.135930951436361, "rewards/rejected": -2.8318827946980796, "step": 16673 }, { "epoch": 0.8837887260488166, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44214268.8, "logits/rejected": -37065056.0, "logps/chosen": -323.2489990234375, "logps/rejected": -339.2717692057292, "loss": 0.3531, "rewards/chosen": 0.39587161540985105, "rewards/margins": 1.8247697909673053, "rewards/rejected": -1.4288981755574544, "step": 16674 }, { "epoch": 0.8838417300506188, "grad_norm": 86.5, "kl": 4.792448043823242, "learning_rate": 5e-07, "logits/chosen": -26206514.285714287, "logits/rejected": -3972949.0, "logps/chosen": -395.55423409598217, "logps/rejected": -103.51604461669922, "loss": 0.4063, "rewards/chosen": 0.7013122694832938, "rewards/margins": 5.115152086530413, "rewards/rejected": -4.413839817047119, "step": 16675 }, { "epoch": 0.8838947340524209, "grad_norm": 71.0, "kl": 1.7047576904296875, "learning_rate": 5e-07, "logits/chosen": 436707200.0, "logits/rejected": -78975808.0, "logps/chosen": -922.0374755859375, "logps/rejected": -445.5398864746094, "loss": 0.2278, "rewards/chosen": 1.014936089515686, "rewards/margins": 3.586840271949768, "rewards/rejected": -2.571904182434082, "step": 16676 }, { "epoch": 0.8839477380542231, "grad_norm": 52.25, "kl": 5.246023178100586, "learning_rate": 5e-07, "logits/chosen": -6840496.0, "logits/rejected": -24580085.333333332, "logps/chosen": -252.25703125, "logps/rejected": -313.9518229166667, "loss": 0.426, "rewards/chosen": 0.3484335899353027, "rewards/margins": 1.6310088793436686, "rewards/rejected": -1.282575289408366, "step": 16677 }, { "epoch": 0.8840007420560252, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23865738.666666668, "logits/rejected": -8870160.0, "logps/chosen": -354.816162109375, "logps/rejected": -352.2263916015625, "loss": 0.2743, "rewards/chosen": 0.6137064297993978, "rewards/margins": 2.0584041913350424, "rewards/rejected": -1.4446977615356444, "step": 16678 }, { "epoch": 0.8840537460578274, "grad_norm": 43.5, "kl": 0.25684356689453125, "learning_rate": 5e-07, "logits/chosen": -23546078.4, "logits/rejected": -21636505.333333332, "logps/chosen": -251.001025390625, "logps/rejected": -229.64554850260416, "loss": 0.3314, "rewards/chosen": 0.04869309067726135, "rewards/margins": 3.2933506071567535, "rewards/rejected": -3.244657516479492, "step": 16679 }, { "epoch": 0.8841067500596295, "grad_norm": 39.5, "kl": 5.043877601623535, "learning_rate": 5e-07, "logits/chosen": -4596832.8, "logits/rejected": -2645516.6666666665, "logps/chosen": -809.1259765625, "logps/rejected": -261.94061279296875, "loss": 0.2312, "rewards/chosen": 1.6593687057495117, "rewards/margins": 6.498607190450032, "rewards/rejected": -4.8392384847005205, "step": 16680 }, { "epoch": 0.8841597540614317, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36400580.0, "logits/rejected": -18013736.0, "logps/chosen": -447.97503662109375, "logps/rejected": -396.53839111328125, "loss": 0.2708, "rewards/chosen": 0.6468612551689148, "rewards/margins": 2.685416877269745, "rewards/rejected": -2.03855562210083, "step": 16681 }, { "epoch": 0.8842127580632337, "grad_norm": 53.0, "kl": 1.8385372161865234, "learning_rate": 5e-07, "logits/chosen": -42882026.666666664, "logits/rejected": -9186279.0, "logps/chosen": -370.02392578125, "logps/rejected": -401.68389892578125, "loss": 0.3652, "rewards/chosen": 0.4736056327819824, "rewards/margins": 2.907837390899658, "rewards/rejected": -2.434231758117676, "step": 16682 }, { "epoch": 0.8842657620650359, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53173699.2, "logits/rejected": -3368353.3333333335, "logps/chosen": -353.5417236328125, "logps/rejected": -99.8910420735677, "loss": 0.241, "rewards/chosen": 0.7665026664733887, "rewards/margins": 3.4994954744974773, "rewards/rejected": -2.7329928080240884, "step": 16683 }, { "epoch": 0.884318766066838, "grad_norm": 57.5, "kl": 1.7290658950805664, "learning_rate": 5e-07, "logits/chosen": -8324748.8, "logits/rejected": -11306310.666666666, "logps/chosen": -278.070556640625, "logps/rejected": -283.0515950520833, "loss": 0.4113, "rewards/chosen": 0.45994720458984373, "rewards/margins": 1.3489805221557618, "rewards/rejected": -0.889033317565918, "step": 16684 }, { "epoch": 0.8843717700686402, "grad_norm": 64.5, "kl": 0.7794933319091797, "learning_rate": 5e-07, "logits/chosen": -28997565.333333332, "logits/rejected": -8367556.8, "logps/chosen": -325.80568440755206, "logps/rejected": -372.9385986328125, "loss": 0.3023, "rewards/chosen": 0.1534390648206075, "rewards/margins": 1.748879071076711, "rewards/rejected": -1.5954400062561036, "step": 16685 }, { "epoch": 0.8844247740704423, "grad_norm": 46.25, "kl": 4.02878475189209, "learning_rate": 5e-07, "logits/chosen": 2460371.5, "logits/rejected": 7766084.0, "logps/chosen": -148.9265899658203, "logps/rejected": -134.31826782226562, "loss": 0.3712, "rewards/chosen": 0.6490512490272522, "rewards/margins": 3.1293116211891174, "rewards/rejected": -2.4802603721618652, "step": 16686 }, { "epoch": 0.8844777780722445, "grad_norm": 54.75, "kl": 0.8543319702148438, "learning_rate": 5e-07, "logits/chosen": -41761792.0, "logits/rejected": -17063568.0, "logps/chosen": -393.831689453125, "logps/rejected": -283.76096598307294, "loss": 0.1988, "rewards/chosen": 1.0655834197998046, "rewards/margins": 4.190627606709798, "rewards/rejected": -3.1250441869099936, "step": 16687 }, { "epoch": 0.8845307820740466, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13671990.0, "logits/rejected": -6654190.666666667, "logps/chosen": -249.3903350830078, "logps/rejected": -361.1175944010417, "loss": 0.2053, "rewards/chosen": -0.29853782057762146, "rewards/margins": 2.53812383611997, "rewards/rejected": -2.8366616566975913, "step": 16688 }, { "epoch": 0.8845837860758488, "grad_norm": 43.0, "kl": 3.7318363189697266, "learning_rate": 5e-07, "logits/chosen": -65671056.0, "logits/rejected": -15950584.0, "logps/chosen": -1135.744140625, "logps/rejected": -195.58321707589286, "loss": 0.1583, "rewards/chosen": 4.09393310546875, "rewards/margins": 5.969931602478027, "rewards/rejected": -1.8759984970092773, "step": 16689 }, { "epoch": 0.8846367900776508, "grad_norm": 45.25, "kl": 3.408580780029297, "learning_rate": 5e-07, "logits/chosen": -30936841.6, "logits/rejected": -21993969.333333332, "logps/chosen": -354.0511474609375, "logps/rejected": -298.23948160807294, "loss": 0.2484, "rewards/chosen": 1.2214197158813476, "rewards/margins": 5.306829198201497, "rewards/rejected": -4.085409482320149, "step": 16690 }, { "epoch": 0.884689794079453, "grad_norm": 54.75, "kl": 0.3539237976074219, "learning_rate": 5e-07, "logits/chosen": 3181430.8, "logits/rejected": -21188274.666666668, "logps/chosen": -229.2062744140625, "logps/rejected": -277.3091227213542, "loss": 0.3031, "rewards/chosen": 0.17161407470703124, "rewards/margins": 4.550913874308269, "rewards/rejected": -4.379299799601237, "step": 16691 }, { "epoch": 0.8847427980812551, "grad_norm": 55.5, "kl": 2.104036331176758, "learning_rate": 5e-07, "logits/chosen": -10270279.333333334, "logits/rejected": -2960516.6, "logps/chosen": -100.0318603515625, "logps/rejected": -474.43349609375, "loss": 0.3193, "rewards/chosen": 0.16197866201400757, "rewards/margins": 2.483082139492035, "rewards/rejected": -2.3211034774780273, "step": 16692 }, { "epoch": 0.8847958020830573, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12349465.333333334, "logits/rejected": 22542808.0, "logps/chosen": -152.48100789388022, "logps/rejected": -342.4763671875, "loss": 0.2325, "rewards/chosen": 0.42873303095499676, "rewards/margins": 2.7698163668314613, "rewards/rejected": -2.3410833358764647, "step": 16693 }, { "epoch": 0.8848488060848594, "grad_norm": 54.25, "kl": 3.393526077270508, "learning_rate": 5e-07, "logits/chosen": -66011840.0, "logits/rejected": -12457536.0, "logps/chosen": -284.9642822265625, "logps/rejected": -331.75758870442706, "loss": 0.376, "rewards/chosen": 0.36895065307617186, "rewards/margins": 2.6829409281412757, "rewards/rejected": -2.313990275065104, "step": 16694 }, { "epoch": 0.8849018100866616, "grad_norm": 55.5, "kl": 6.524200439453125, "learning_rate": 5e-07, "logits/chosen": -8748132.0, "logits/rejected": -2236520.5, "logps/chosen": -306.8569641113281, "logps/rejected": -295.86651611328125, "loss": 0.2267, "rewards/chosen": 1.5887246131896973, "rewards/margins": 3.8253173828125, "rewards/rejected": -2.2365927696228027, "step": 16695 }, { "epoch": 0.8849548140884637, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69179637.33333333, "logits/rejected": -65038553.6, "logps/chosen": -428.161865234375, "logps/rejected": -218.945703125, "loss": 0.258, "rewards/chosen": 0.6936208407084147, "rewards/margins": 2.4157888094584146, "rewards/rejected": -1.72216796875, "step": 16696 }, { "epoch": 0.8850078180902659, "grad_norm": 59.25, "kl": 3.7856998443603516, "learning_rate": 5e-07, "logits/chosen": -15404989.333333334, "logits/rejected": -5243787.5, "logps/chosen": -192.7170206705729, "logps/rejected": -176.04437255859375, "loss": 0.3109, "rewards/chosen": 1.243383566538493, "rewards/margins": 2.9075833956400556, "rewards/rejected": -1.6641998291015625, "step": 16697 }, { "epoch": 0.8850608220920679, "grad_norm": 49.75, "kl": 2.417609214782715, "learning_rate": 5e-07, "logits/chosen": -70269541.33333333, "logits/rejected": 14238429.0, "logps/chosen": -560.819091796875, "logps/rejected": -269.7796325683594, "loss": 0.2914, "rewards/chosen": 0.9951692422231039, "rewards/margins": 3.3369372685750327, "rewards/rejected": -2.3417680263519287, "step": 16698 }, { "epoch": 0.8851138260938701, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40859664.0, "logits/rejected": -26378370.666666668, "logps/chosen": -356.9462890625, "logps/rejected": -193.69757080078125, "loss": 0.2605, "rewards/chosen": 0.6499356269836426, "rewards/margins": 3.7112759590148925, "rewards/rejected": -3.06134033203125, "step": 16699 }, { "epoch": 0.8851668300956722, "grad_norm": 34.25, "kl": 1.4478797912597656, "learning_rate": 5e-07, "logits/chosen": -32421922.0, "logits/rejected": -41657637.333333336, "logps/chosen": -193.94703674316406, "logps/rejected": -320.25730387369794, "loss": 0.1347, "rewards/chosen": 1.1076936721801758, "rewards/margins": 3.99039618174235, "rewards/rejected": -2.8827025095621743, "step": 16700 }, { "epoch": 0.8852198340974744, "grad_norm": 49.5, "kl": 1.4466066360473633, "learning_rate": 5e-07, "logits/chosen": -8051887.2, "logits/rejected": -18525958.666666668, "logps/chosen": -241.70244140625, "logps/rejected": -379.0817057291667, "loss": 0.3519, "rewards/chosen": 0.11138179302215576, "rewards/margins": 3.7211055199305214, "rewards/rejected": -3.6097237269083657, "step": 16701 }, { "epoch": 0.8852728380992765, "grad_norm": 43.5, "kl": 0.49275684356689453, "learning_rate": 5e-07, "logits/chosen": 5589451.0, "logits/rejected": -21811010.0, "logps/chosen": -144.0386962890625, "logps/rejected": -247.26971435546875, "loss": 0.2681, "rewards/chosen": 0.569979190826416, "rewards/margins": 2.9513044357299805, "rewards/rejected": -2.3813252449035645, "step": 16702 }, { "epoch": 0.8853258421010787, "grad_norm": 66.0, "kl": 4.057544708251953, "learning_rate": 5e-07, "logits/chosen": -41723874.666666664, "logits/rejected": -23870624.0, "logps/chosen": -292.6642252604167, "logps/rejected": -163.1002960205078, "loss": 0.3908, "rewards/chosen": 0.4690433740615845, "rewards/margins": 2.6562894582748413, "rewards/rejected": -2.187246084213257, "step": 16703 }, { "epoch": 0.8853788461028808, "grad_norm": 46.25, "kl": 1.963409423828125, "learning_rate": 5e-07, "logits/chosen": -31842589.333333332, "logits/rejected": -21544513.6, "logps/chosen": -483.6541748046875, "logps/rejected": -312.568212890625, "loss": 0.203, "rewards/chosen": 1.1448717912038167, "rewards/margins": 3.5622896989186605, "rewards/rejected": -2.4174179077148437, "step": 16704 }, { "epoch": 0.885431850104683, "grad_norm": 96.5, "kl": 0.0673828125, "learning_rate": 5e-07, "logits/chosen": -23457220.0, "logits/rejected": -30056884.57142857, "logps/chosen": -275.8736267089844, "logps/rejected": -269.58958217075894, "loss": 0.2185, "rewards/chosen": 0.39245912432670593, "rewards/margins": 2.4898443945816586, "rewards/rejected": -2.0973852702549527, "step": 16705 }, { "epoch": 0.885484854106485, "grad_norm": 71.5, "kl": 4.787191390991211, "learning_rate": 5e-07, "logits/chosen": 6435134.0, "logits/rejected": -28149690.0, "logps/chosen": -231.468994140625, "logps/rejected": -306.7732849121094, "loss": 0.4141, "rewards/chosen": 0.23940469324588776, "rewards/margins": 1.7816041857004166, "rewards/rejected": -1.5421994924545288, "step": 16706 }, { "epoch": 0.8855378581082872, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4875468.666666667, "logits/rejected": -17009632.0, "logps/chosen": -257.2037760416667, "logps/rejected": -254.5696044921875, "loss": 0.308, "rewards/chosen": 0.553570032119751, "rewards/margins": 1.8038588047027588, "rewards/rejected": -1.2502887725830079, "step": 16707 }, { "epoch": 0.8855908621100893, "grad_norm": 34.25, "kl": 0.2753734588623047, "learning_rate": 5e-07, "logits/chosen": -2004532.6666666667, "logits/rejected": -25522894.4, "logps/chosen": -176.37451171875, "logps/rejected": -156.39649658203126, "loss": 0.1687, "rewards/chosen": 1.565463383992513, "rewards/margins": 3.718073972066243, "rewards/rejected": -2.1526105880737303, "step": 16708 }, { "epoch": 0.8856438661118915, "grad_norm": 67.5, "kl": 2.0250701904296875, "learning_rate": 5e-07, "logits/chosen": -78913958.4, "logits/rejected": -31509301.333333332, "logps/chosen": -534.194140625, "logps/rejected": -826.9259440104166, "loss": 0.3054, "rewards/chosen": 0.7557787895202637, "rewards/margins": 2.87441078821818, "rewards/rejected": -2.1186319986979165, "step": 16709 }, { "epoch": 0.8856968701136936, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11079670.666666666, "logits/rejected": -10266172.8, "logps/chosen": -443.4715983072917, "logps/rejected": -332.6189208984375, "loss": 0.2503, "rewards/chosen": 0.5250426133473715, "rewards/margins": 2.184021838506063, "rewards/rejected": -1.6589792251586915, "step": 16710 }, { "epoch": 0.8857498741154958, "grad_norm": 57.0, "kl": 4.269989013671875, "learning_rate": 5e-07, "logits/chosen": -49278889.6, "logits/rejected": -38378432.0, "logps/chosen": -629.409228515625, "logps/rejected": -494.0904947916667, "loss": 0.3356, "rewards/chosen": 0.92344970703125, "rewards/margins": 3.206717046101888, "rewards/rejected": -2.283267339070638, "step": 16711 }, { "epoch": 0.8858028781172979, "grad_norm": 53.25, "kl": 2.493000030517578, "learning_rate": 5e-07, "logits/chosen": -25804181.333333332, "logits/rejected": -2702972.25, "logps/chosen": -284.3970947265625, "logps/rejected": -135.1024627685547, "loss": 0.3433, "rewards/chosen": 0.6804904937744141, "rewards/margins": 1.7028656005859375, "rewards/rejected": -1.0223751068115234, "step": 16712 }, { "epoch": 0.8858558821191, "grad_norm": 33.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23313024.0, "logits/rejected": -16047777.6, "logps/chosen": -188.5184326171875, "logps/rejected": -145.04322509765626, "loss": 0.3236, "rewards/chosen": -0.3042083779970805, "rewards/margins": 2.0458444555600486, "rewards/rejected": -2.350052833557129, "step": 16713 }, { "epoch": 0.8859088861209021, "grad_norm": 45.75, "kl": 1.3022842407226562, "learning_rate": 5e-07, "logits/chosen": -32242118.4, "logits/rejected": -50294453.333333336, "logps/chosen": -243.59306640625, "logps/rejected": -556.2787679036459, "loss": 0.3128, "rewards/chosen": 0.5107421875, "rewards/margins": 3.3476104736328125, "rewards/rejected": -2.8368682861328125, "step": 16714 }, { "epoch": 0.8859618901227042, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56086826.666666664, "logits/rejected": -32322598.4, "logps/chosen": -276.2454833984375, "logps/rejected": -497.650439453125, "loss": 0.304, "rewards/chosen": -0.050675079226493835, "rewards/margins": 3.002371719479561, "rewards/rejected": -3.0530467987060548, "step": 16715 }, { "epoch": 0.8860148941245064, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 271830.0, "logits/rejected": -6093144.666666667, "logps/chosen": -113.31640625, "logps/rejected": -317.2381184895833, "loss": 0.2288, "rewards/chosen": 0.33794403076171875, "rewards/margins": 2.34112548828125, "rewards/rejected": -2.0031814575195312, "step": 16716 }, { "epoch": 0.8860678981263085, "grad_norm": 42.25, "kl": 2.2605857849121094, "learning_rate": 5e-07, "logits/chosen": 34325416.0, "logits/rejected": -43986000.0, "logps/chosen": -223.72035217285156, "logps/rejected": -504.4112854003906, "loss": 0.2402, "rewards/chosen": 0.8534793853759766, "rewards/margins": 3.5318989753723145, "rewards/rejected": -2.678419589996338, "step": 16717 }, { "epoch": 0.8861209021281107, "grad_norm": 44.25, "kl": 0.6251401901245117, "learning_rate": 5e-07, "logits/chosen": -8839573.333333334, "logits/rejected": -1236594.4, "logps/chosen": -294.31797281901044, "logps/rejected": -143.1515380859375, "loss": 0.2819, "rewards/chosen": 1.2902544339497883, "rewards/margins": 2.324654038747152, "rewards/rejected": -1.0343996047973634, "step": 16718 }, { "epoch": 0.8861739061299128, "grad_norm": 54.25, "kl": 3.2379798889160156, "learning_rate": 5e-07, "logits/chosen": -42305728.0, "logits/rejected": -30377294.0, "logps/chosen": -450.2607421875, "logps/rejected": -595.890380859375, "loss": 0.2101, "rewards/chosen": 1.3859461545944214, "rewards/margins": 5.0050143003463745, "rewards/rejected": -3.619068145751953, "step": 16719 }, { "epoch": 0.886226910131715, "grad_norm": 34.5, "kl": 2.8867883682250977, "learning_rate": 5e-07, "logits/chosen": -21726222.4, "logits/rejected": -33132725.333333332, "logps/chosen": -198.7031982421875, "logps/rejected": -321.84670003255206, "loss": 0.3028, "rewards/chosen": 0.7041616439819336, "rewards/margins": 4.965013186136882, "rewards/rejected": -4.260851542154948, "step": 16720 }, { "epoch": 0.886279914133517, "grad_norm": 43.5, "kl": 1.14117431640625, "learning_rate": 5e-07, "logits/chosen": 5864398.0, "logits/rejected": -21653540.57142857, "logps/chosen": -42.1265754699707, "logps/rejected": -308.6590053013393, "loss": 0.1603, "rewards/chosen": 1.1192963123321533, "rewards/margins": 3.075310128075736, "rewards/rejected": -1.9560138157435827, "step": 16721 }, { "epoch": 0.8863329181353192, "grad_norm": 68.0, "kl": 1.299056053161621, "learning_rate": 5e-07, "logits/chosen": 5267149.5, "logits/rejected": -20600333.333333332, "logps/chosen": -45.002803802490234, "logps/rejected": -444.72802734375, "loss": 0.2388, "rewards/chosen": 0.4026113748550415, "rewards/margins": 2.5790684620539346, "rewards/rejected": -2.176457087198893, "step": 16722 }, { "epoch": 0.8863859221371213, "grad_norm": 43.75, "kl": 2.3292236328125, "learning_rate": 5e-07, "logits/chosen": -15111834.666666666, "logits/rejected": -35968732.8, "logps/chosen": -436.2408854166667, "logps/rejected": -320.5091552734375, "loss": 0.2131, "rewards/chosen": 1.2525035540262859, "rewards/margins": 3.690622679392497, "rewards/rejected": -2.438119125366211, "step": 16723 }, { "epoch": 0.8864389261389235, "grad_norm": 57.25, "kl": 1.7513980865478516, "learning_rate": 5e-07, "logits/chosen": -13830235.2, "logits/rejected": -54018480.0, "logps/chosen": -257.68369140625, "logps/rejected": -504.4711100260417, "loss": 0.3966, "rewards/chosen": 0.39562532901763914, "rewards/margins": 1.7439939737319947, "rewards/rejected": -1.3483686447143555, "step": 16724 }, { "epoch": 0.8864919301407256, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13862334.0, "logits/rejected": -19241712.0, "logps/chosen": -278.7518615722656, "logps/rejected": -424.7564392089844, "loss": 0.1542, "rewards/chosen": 1.3086133003234863, "rewards/margins": 4.242543458938599, "rewards/rejected": -2.9339301586151123, "step": 16725 }, { "epoch": 0.8865449341425278, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12192252.0, "logits/rejected": -12537140.0, "logps/chosen": -149.8833770751953, "logps/rejected": -300.9771728515625, "loss": 0.3099, "rewards/chosen": -0.012701809406280518, "rewards/margins": 1.367982566356659, "rewards/rejected": -1.3806843757629395, "step": 16726 }, { "epoch": 0.8865979381443299, "grad_norm": 54.75, "kl": 0.01717376708984375, "learning_rate": 5e-07, "logits/chosen": -90521600.0, "logits/rejected": -6896675.0, "logps/chosen": -552.5955200195312, "logps/rejected": -421.22576904296875, "loss": 0.2566, "rewards/chosen": 0.6541725397109985, "rewards/margins": 2.5518438816070557, "rewards/rejected": -1.8976713418960571, "step": 16727 }, { "epoch": 0.886650942146132, "grad_norm": 55.0, "kl": 2.9947452545166016, "learning_rate": 5e-07, "logits/chosen": -15533128.0, "logits/rejected": -28357322.666666668, "logps/chosen": -477.9189453125, "logps/rejected": -333.0917561848958, "loss": 0.1396, "rewards/chosen": 1.9068344831466675, "rewards/margins": 3.9249460299809775, "rewards/rejected": -2.01811154683431, "step": 16728 }, { "epoch": 0.8867039461479341, "grad_norm": 59.5, "kl": 1.7237167358398438, "learning_rate": 5e-07, "logits/chosen": -32796626.666666668, "logits/rejected": 16493763.0, "logps/chosen": -531.3324381510416, "logps/rejected": -131.21591186523438, "loss": 0.2388, "rewards/chosen": 1.344049612681071, "rewards/margins": 3.4609917799631758, "rewards/rejected": -2.1169421672821045, "step": 16729 }, { "epoch": 0.8867569501497363, "grad_norm": 64.0, "kl": 3.049041748046875, "learning_rate": 5e-07, "logits/chosen": -70171120.0, "logits/rejected": -30120696.0, "logps/chosen": -808.1748046875, "logps/rejected": -405.91302490234375, "loss": 0.199, "rewards/chosen": 1.5854675769805908, "rewards/margins": 4.924755811691284, "rewards/rejected": -3.3392882347106934, "step": 16730 }, { "epoch": 0.8868099541515384, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40709052.0, "logits/rejected": -44411528.0, "logps/chosen": -374.8951416015625, "logps/rejected": -604.1865234375, "loss": 0.2176, "rewards/chosen": 0.5045230984687805, "rewards/margins": 3.784955322742462, "rewards/rejected": -3.2804322242736816, "step": 16731 }, { "epoch": 0.8868629581533406, "grad_norm": 28.625, "kl": 1.48291015625, "learning_rate": 5e-07, "logits/chosen": -58234184.0, "logits/rejected": -28986992.0, "logps/chosen": -767.9176025390625, "logps/rejected": -377.9288736979167, "loss": 0.0843, "rewards/chosen": 2.1931777000427246, "rewards/margins": 5.386560599009195, "rewards/rejected": -3.193382898966471, "step": 16732 }, { "epoch": 0.8869159621551427, "grad_norm": 37.0, "kl": 0.9595699310302734, "learning_rate": 5e-07, "logits/chosen": -73384440.0, "logits/rejected": -22548672.0, "logps/chosen": -102.9074935913086, "logps/rejected": -215.52120971679688, "loss": 0.3141, "rewards/chosen": 0.36704614758491516, "rewards/margins": 2.4758446514606476, "rewards/rejected": -2.1087985038757324, "step": 16733 }, { "epoch": 0.8869689661569449, "grad_norm": 53.5, "kl": 2.5371618270874023, "learning_rate": 5e-07, "logits/chosen": -3682050.8, "logits/rejected": -32344224.0, "logps/chosen": -261.6488037109375, "logps/rejected": -278.8799235026042, "loss": 0.3214, "rewards/chosen": 0.8292581558227539, "rewards/margins": 2.487770589192708, "rewards/rejected": -1.6585124333699544, "step": 16734 }, { "epoch": 0.887021970158747, "grad_norm": 45.75, "kl": 1.4547719955444336, "learning_rate": 5e-07, "logits/chosen": -18463560.0, "logits/rejected": -17959126.0, "logps/chosen": -210.31631469726562, "logps/rejected": -154.8650665283203, "loss": 0.3277, "rewards/chosen": 0.3763292729854584, "rewards/margins": 1.950624793767929, "rewards/rejected": -1.5742955207824707, "step": 16735 }, { "epoch": 0.8870749741605491, "grad_norm": 42.25, "kl": 3.4877099990844727, "learning_rate": 5e-07, "logits/chosen": 4934300.0, "logits/rejected": -7354820.666666667, "logps/chosen": -117.740478515625, "logps/rejected": -177.27974446614584, "loss": 0.2902, "rewards/chosen": 0.6795441627502441, "rewards/margins": 4.039793999989827, "rewards/rejected": -3.3602498372395835, "step": 16736 }, { "epoch": 0.8871279781623512, "grad_norm": 42.5, "kl": 1.1722640991210938, "learning_rate": 5e-07, "logits/chosen": -48154762.666666664, "logits/rejected": -25259566.4, "logps/chosen": -511.1128336588542, "logps/rejected": -399.22685546875, "loss": 0.231, "rewards/chosen": 1.0399107933044434, "rewards/margins": 3.5235867500305176, "rewards/rejected": -2.483675956726074, "step": 16737 }, { "epoch": 0.8871809821641534, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31053252.0, "logits/rejected": -6128432.0, "logps/chosen": -236.018310546875, "logps/rejected": -156.48121643066406, "loss": 0.2902, "rewards/chosen": -0.09304356575012207, "rewards/margins": 2.9085288047790527, "rewards/rejected": -3.001572370529175, "step": 16738 }, { "epoch": 0.8872339861659555, "grad_norm": 60.5, "kl": 1.3962011337280273, "learning_rate": 5e-07, "logits/chosen": 28341971.2, "logits/rejected": -15724794.666666666, "logps/chosen": -113.7681396484375, "logps/rejected": -229.7768758138021, "loss": 0.287, "rewards/chosen": 0.6324009895324707, "rewards/margins": 2.656994660695394, "rewards/rejected": -2.0245936711629233, "step": 16739 }, { "epoch": 0.8872869901677577, "grad_norm": 36.0, "kl": 3.352834701538086, "learning_rate": 5e-07, "logits/chosen": -20205778.0, "logits/rejected": -32309098.0, "logps/chosen": -368.8904724121094, "logps/rejected": -397.029296875, "loss": 0.1937, "rewards/chosen": 1.7932422161102295, "rewards/margins": 4.4292151927948, "rewards/rejected": -2.6359729766845703, "step": 16740 }, { "epoch": 0.8873399941695598, "grad_norm": 61.75, "kl": 4.908260345458984, "learning_rate": 5e-07, "logits/chosen": -23273821.333333332, "logits/rejected": -11495127.0, "logps/chosen": -492.4617513020833, "logps/rejected": -120.72769927978516, "loss": 0.3115, "rewards/chosen": 1.2158602873484294, "rewards/margins": 4.045446316401164, "rewards/rejected": -2.8295860290527344, "step": 16741 }, { "epoch": 0.887392998171362, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -64725588.0, "logits/rejected": -43302778.666666664, "logps/chosen": -409.71856689453125, "logps/rejected": -453.5709228515625, "loss": 0.1143, "rewards/chosen": 0.7355697751045227, "rewards/margins": 4.976151287555695, "rewards/rejected": -4.240581512451172, "step": 16742 }, { "epoch": 0.887446002173164, "grad_norm": 43.5, "kl": 4.095550537109375, "learning_rate": 5e-07, "logits/chosen": -2158598.0, "logits/rejected": -60249397.333333336, "logps/chosen": -272.6811767578125, "logps/rejected": -489.6208089192708, "loss": 0.3004, "rewards/chosen": 0.8284190177917481, "rewards/margins": 3.5241835594177244, "rewards/rejected": -2.6957645416259766, "step": 16743 }, { "epoch": 0.8874990061749662, "grad_norm": 59.0, "kl": 5.743007659912109, "learning_rate": 5e-07, "logits/chosen": -25069984.0, "logits/rejected": -1951799.1666666667, "logps/chosen": -515.813671875, "logps/rejected": -220.6418660481771, "loss": 0.3593, "rewards/chosen": 1.0147822380065918, "rewards/margins": 3.4497170130411785, "rewards/rejected": -2.4349347750345864, "step": 16744 }, { "epoch": 0.8875520101767683, "grad_norm": 45.25, "kl": 0.14862632751464844, "learning_rate": 5e-07, "logits/chosen": -43339225.6, "logits/rejected": -26324600.0, "logps/chosen": -326.68779296875, "logps/rejected": -206.66251627604166, "loss": 0.2497, "rewards/chosen": 0.6995137214660645, "rewards/margins": 3.4978576978047693, "rewards/rejected": -2.7983439763387046, "step": 16745 }, { "epoch": 0.8876050141785705, "grad_norm": 48.5, "kl": 3.0251035690307617, "learning_rate": 5e-07, "logits/chosen": -28426090.0, "logits/rejected": -13242505.0, "logps/chosen": -294.5441589355469, "logps/rejected": -229.0287628173828, "loss": 0.2607, "rewards/chosen": 0.7825733423233032, "rewards/margins": 4.590366959571838, "rewards/rejected": -3.807793617248535, "step": 16746 }, { "epoch": 0.8876580181803726, "grad_norm": 54.0, "kl": 1.7821168899536133, "learning_rate": 5e-07, "logits/chosen": -15071664.0, "logits/rejected": -1129262.4, "logps/chosen": -216.1933797200521, "logps/rejected": -269.8277587890625, "loss": 0.3052, "rewards/chosen": 0.8736600875854492, "rewards/margins": 2.3499902725219726, "rewards/rejected": -1.4763301849365233, "step": 16747 }, { "epoch": 0.8877110221821748, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5944144.0, "logits/rejected": -11837222.666666666, "logps/chosen": -434.7987060546875, "logps/rejected": -178.75065104166666, "loss": 0.1716, "rewards/chosen": 1.2757354974746704, "rewards/margins": 3.134196400642395, "rewards/rejected": -1.8584609031677246, "step": 16748 }, { "epoch": 0.8877640261839769, "grad_norm": 50.75, "kl": 1.3732357025146484, "learning_rate": 5e-07, "logits/chosen": -40897541.333333336, "logits/rejected": -23572648.0, "logps/chosen": -450.1531168619792, "logps/rejected": -502.0740234375, "loss": 0.1871, "rewards/chosen": 0.767386277516683, "rewards/margins": 4.5101444562276205, "rewards/rejected": -3.7427581787109374, "step": 16749 }, { "epoch": 0.8878170301857791, "grad_norm": 38.75, "kl": 0.29271507263183594, "learning_rate": 5e-07, "logits/chosen": 5944166.0, "logits/rejected": -8139269.6, "logps/chosen": -21.1190668741862, "logps/rejected": -184.7218505859375, "loss": 0.25, "rewards/chosen": 0.34426383177439374, "rewards/margins": 2.507370559374491, "rewards/rejected": -2.1631067276000975, "step": 16750 }, { "epoch": 0.8878700341875811, "grad_norm": 49.25, "kl": 2.322683334350586, "learning_rate": 5e-07, "logits/chosen": -27221235.2, "logits/rejected": 27275749.333333332, "logps/chosen": -309.867626953125, "logps/rejected": -382.4114990234375, "loss": 0.3321, "rewards/chosen": 0.4174992561340332, "rewards/margins": 2.832030328114828, "rewards/rejected": -2.4145310719807944, "step": 16751 }, { "epoch": 0.8879230381893833, "grad_norm": 49.0, "kl": 1.2180938720703125, "learning_rate": 5e-07, "logits/chosen": -82842904.0, "logits/rejected": -11425137.0, "logps/chosen": -500.8434753417969, "logps/rejected": -218.16209411621094, "loss": 0.2727, "rewards/chosen": 0.9890366196632385, "rewards/margins": 3.2117376923561096, "rewards/rejected": -2.222701072692871, "step": 16752 }, { "epoch": 0.8879760421911854, "grad_norm": 25.875, "kl": 0.36707305908203125, "learning_rate": 5e-07, "logits/chosen": -17542904.0, "logits/rejected": -18537545.6, "logps/chosen": -270.63417561848956, "logps/rejected": -233.2835693359375, "loss": 0.0887, "rewards/chosen": 1.9099631309509277, "rewards/margins": 5.152637386322022, "rewards/rejected": -3.2426742553710937, "step": 16753 }, { "epoch": 0.8880290461929876, "grad_norm": 41.75, "kl": 4.983678817749023, "learning_rate": 5e-07, "logits/chosen": -71604240.0, "logits/rejected": 1086610.857142857, "logps/chosen": -2459.2685546875, "logps/rejected": -260.70741489955356, "loss": 0.17, "rewards/chosen": 4.393701076507568, "rewards/margins": 6.390687874385289, "rewards/rejected": -1.9969867978777205, "step": 16754 }, { "epoch": 0.8880820501947897, "grad_norm": 36.75, "kl": 4.398802757263184, "learning_rate": 5e-07, "logits/chosen": 2697145.75, "logits/rejected": -13137788.0, "logps/chosen": -82.9796371459961, "logps/rejected": -190.0780029296875, "loss": 0.2553, "rewards/chosen": 1.3630224466323853, "rewards/margins": 3.4174166917800903, "rewards/rejected": -2.054394245147705, "step": 16755 }, { "epoch": 0.8881350541965919, "grad_norm": 68.5, "kl": 3.409444808959961, "learning_rate": 5e-07, "logits/chosen": -41541352.0, "logits/rejected": -17515116.0, "logps/chosen": -194.06961059570312, "logps/rejected": -221.68450927734375, "loss": 0.3825, "rewards/chosen": 0.1569652557373047, "rewards/margins": 2.5464744567871094, "rewards/rejected": -2.3895092010498047, "step": 16756 }, { "epoch": 0.888188058198394, "grad_norm": 231.0, "kl": 3.8181190490722656, "learning_rate": 5e-07, "logits/chosen": -75549888.0, "logits/rejected": -27258218.666666668, "logps/chosen": -420.35927734375, "logps/rejected": -314.6129964192708, "loss": 0.5057, "rewards/chosen": 0.14856337308883666, "rewards/margins": 0.2446311891078949, "rewards/rejected": -0.09606781601905823, "step": 16757 }, { "epoch": 0.8882410622001962, "grad_norm": 40.5, "kl": 2.4405441284179688, "learning_rate": 5e-07, "logits/chosen": -5428069.0, "logits/rejected": -52664088.0, "logps/chosen": -180.4801025390625, "logps/rejected": -254.48809814453125, "loss": 0.2345, "rewards/chosen": 1.1015617847442627, "rewards/margins": 3.6289050579071045, "rewards/rejected": -2.527343273162842, "step": 16758 }, { "epoch": 0.8882940662019982, "grad_norm": 32.75, "kl": 1.1461563110351562, "learning_rate": 5e-07, "logits/chosen": -17156749.333333332, "logits/rejected": -32646016.0, "logps/chosen": -671.42041015625, "logps/rejected": -310.154541015625, "loss": 0.2383, "rewards/chosen": 0.8093124230702718, "rewards/margins": 4.178320296605428, "rewards/rejected": -3.369007873535156, "step": 16759 }, { "epoch": 0.8883470702038004, "grad_norm": 39.0, "kl": 0.2721710205078125, "learning_rate": 5e-07, "logits/chosen": -22476288.0, "logits/rejected": -3464437.5, "logps/chosen": -311.7344970703125, "logps/rejected": -223.9370880126953, "loss": 0.2524, "rewards/chosen": 1.15037202835083, "rewards/margins": 3.002069115638733, "rewards/rejected": -1.8516970872879028, "step": 16760 }, { "epoch": 0.8884000742056025, "grad_norm": 65.5, "kl": 1.2678489685058594, "learning_rate": 5e-07, "logits/chosen": -34610979.2, "logits/rejected": -12279145.333333334, "logps/chosen": -274.5056396484375, "logps/rejected": -247.45113118489584, "loss": 0.3826, "rewards/chosen": -0.15155243873596191, "rewards/margins": 2.5668415228525796, "rewards/rejected": -2.7183939615885415, "step": 16761 }, { "epoch": 0.8884530782074047, "grad_norm": 59.5, "kl": 0.7839250564575195, "learning_rate": 5e-07, "logits/chosen": -16968350.4, "logits/rejected": -61826080.0, "logps/chosen": -152.74105224609374, "logps/rejected": -352.2621256510417, "loss": 0.3931, "rewards/chosen": 0.11037732362747192, "rewards/margins": 1.5555712501207988, "rewards/rejected": -1.445193926493327, "step": 16762 }, { "epoch": 0.8885060822092068, "grad_norm": 47.5, "kl": 4.256294250488281, "learning_rate": 5e-07, "logits/chosen": -12873811.2, "logits/rejected": -39613866.666666664, "logps/chosen": -213.4718505859375, "logps/rejected": -366.21044921875, "loss": 0.3184, "rewards/chosen": 0.5431025505065918, "rewards/margins": 4.197438844045004, "rewards/rejected": -3.6543362935384116, "step": 16763 }, { "epoch": 0.888559086211009, "grad_norm": 35.5, "kl": 0.6274137496948242, "learning_rate": 5e-07, "logits/chosen": -46088192.0, "logits/rejected": -8597793.6, "logps/chosen": -299.3459065755208, "logps/rejected": -127.66915283203124, "loss": 0.2676, "rewards/chosen": 0.24100279808044434, "rewards/margins": 2.3025535106658936, "rewards/rejected": -2.0615507125854493, "step": 16764 }, { "epoch": 0.8886120902128111, "grad_norm": 46.0, "kl": 2.3691768646240234, "learning_rate": 5e-07, "logits/chosen": -10860049.6, "logits/rejected": -65955680.0, "logps/chosen": -204.62637939453126, "logps/rejected": -280.5144449869792, "loss": 0.3665, "rewards/chosen": 0.23353226184844972, "rewards/margins": 2.7034905672073366, "rewards/rejected": -2.4699583053588867, "step": 16765 }, { "epoch": 0.8886650942146131, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4215276.5, "logits/rejected": -43510020.571428575, "logps/chosen": -218.12200927734375, "logps/rejected": -295.67372349330356, "loss": 0.2387, "rewards/chosen": -0.8034942746162415, "rewards/margins": 1.2739397202219283, "rewards/rejected": -2.0774339948381697, "step": 16766 }, { "epoch": 0.8887180982164153, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16175992.0, "logits/rejected": -13328505.6, "logps/chosen": -393.2217610677083, "logps/rejected": -157.98704833984374, "loss": 0.2598, "rewards/chosen": 0.7005231380462646, "rewards/margins": 2.5205602169036867, "rewards/rejected": -1.8200370788574218, "step": 16767 }, { "epoch": 0.8887711022182174, "grad_norm": 43.5, "kl": 1.807154655456543, "learning_rate": 5e-07, "logits/chosen": -26483177.6, "logits/rejected": -251600.75, "logps/chosen": -148.26605224609375, "logps/rejected": -99.58575439453125, "loss": 0.2966, "rewards/chosen": 0.5303987503051758, "rewards/margins": 2.9734952290852865, "rewards/rejected": -2.443096478780111, "step": 16768 }, { "epoch": 0.8888241062200196, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62850613.333333336, "logits/rejected": -23529352.0, "logps/chosen": -328.05100504557294, "logps/rejected": -251.434423828125, "loss": 0.2817, "rewards/chosen": 0.24120736122131348, "rewards/margins": 2.3036362171173095, "rewards/rejected": -2.062428855895996, "step": 16769 }, { "epoch": 0.8888771102218217, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55824736.0, "logits/rejected": -18234931.2, "logps/chosen": -275.1338704427083, "logps/rejected": -447.732275390625, "loss": 0.2088, "rewards/chosen": 0.2412633498509725, "rewards/margins": 3.1036893447240197, "rewards/rejected": -2.862425994873047, "step": 16770 }, { "epoch": 0.8889301142236239, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15177650.666666666, "logits/rejected": -2099880.4, "logps/chosen": -595.9558919270834, "logps/rejected": -133.2822998046875, "loss": 0.0787, "rewards/chosen": 1.738593578338623, "rewards/margins": 5.698113918304443, "rewards/rejected": -3.9595203399658203, "step": 16771 }, { "epoch": 0.888983118225426, "grad_norm": 42.0, "kl": 3.323629379272461, "learning_rate": 5e-07, "logits/chosen": -17196820.57142857, "logits/rejected": -8015475.5, "logps/chosen": -476.3035365513393, "logps/rejected": -93.19818115234375, "loss": 0.421, "rewards/chosen": 0.6693813460213798, "rewards/margins": 3.891159500394549, "rewards/rejected": -3.221778154373169, "step": 16772 }, { "epoch": 0.8890361222272282, "grad_norm": 46.25, "kl": 2.7692604064941406, "learning_rate": 5e-07, "logits/chosen": -29871808.0, "logits/rejected": 469237.3333333333, "logps/chosen": -190.99892578125, "logps/rejected": -210.50728352864584, "loss": 0.2178, "rewards/chosen": 1.0354141235351562, "rewards/margins": 5.612937672932942, "rewards/rejected": -4.577523549397786, "step": 16773 }, { "epoch": 0.8890891262290302, "grad_norm": 35.25, "kl": 1.682154655456543, "learning_rate": 5e-07, "logits/chosen": -22870276.8, "logits/rejected": -9209715.333333334, "logps/chosen": -81.89371337890626, "logps/rejected": -230.29256184895834, "loss": 0.352, "rewards/chosen": 0.22048962116241455, "rewards/margins": 2.7473228375116983, "rewards/rejected": -2.5268332163492837, "step": 16774 }, { "epoch": 0.8891421302308324, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32833912.0, "logits/rejected": -42345868.8, "logps/chosen": -164.77989705403647, "logps/rejected": -421.5662109375, "loss": 0.2639, "rewards/chosen": -0.3647969166437785, "rewards/margins": 2.4194024165471397, "rewards/rejected": -2.784199333190918, "step": 16775 }, { "epoch": 0.8891951342326345, "grad_norm": 37.5, "kl": 1.7046031951904297, "learning_rate": 5e-07, "logits/chosen": -21579946.0, "logits/rejected": -20827560.0, "logps/chosen": -278.7523498535156, "logps/rejected": -202.49819946289062, "loss": 0.2489, "rewards/chosen": 0.8454355001449585, "rewards/margins": 2.9752177000045776, "rewards/rejected": -2.129782199859619, "step": 16776 }, { "epoch": 0.8892481382344367, "grad_norm": 47.5, "kl": 0.2813072204589844, "learning_rate": 5e-07, "logits/chosen": -9267324.0, "logits/rejected": -11421564.0, "logps/chosen": -550.6827392578125, "logps/rejected": -207.2774200439453, "loss": 0.2176, "rewards/chosen": 1.9152172803878784, "rewards/margins": 4.536012530326843, "rewards/rejected": -2.620795249938965, "step": 16777 }, { "epoch": 0.8893011422362388, "grad_norm": 75.0, "kl": 0.8158416748046875, "learning_rate": 5e-07, "logits/chosen": -68839744.0, "logits/rejected": -7085634.0, "logps/chosen": -517.30029296875, "logps/rejected": -210.60872395833334, "loss": 0.2595, "rewards/chosen": 0.11022034287452698, "rewards/margins": 2.5963497857252755, "rewards/rejected": -2.4861294428507485, "step": 16778 }, { "epoch": 0.889354146238041, "grad_norm": 38.5, "kl": 3.9894561767578125, "learning_rate": 5e-07, "logits/chosen": 24142208.0, "logits/rejected": -25808334.4, "logps/chosen": -650.5091145833334, "logps/rejected": -339.6160888671875, "loss": 0.1708, "rewards/chosen": 1.9229059219360352, "rewards/margins": 5.069973564147949, "rewards/rejected": -3.147067642211914, "step": 16779 }, { "epoch": 0.8894071502398431, "grad_norm": 81.0, "kl": 0.2140493392944336, "learning_rate": 5e-07, "logits/chosen": -36555360.0, "logits/rejected": -15498158.666666666, "logps/chosen": -424.04443359375, "logps/rejected": -221.2523193359375, "loss": 0.2792, "rewards/chosen": 0.4482396602630615, "rewards/margins": 3.8818642457326256, "rewards/rejected": -3.433624585469564, "step": 16780 }, { "epoch": 0.8894601542416453, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7367384.0, "logits/rejected": -31964416.0, "logps/chosen": -309.2618408203125, "logps/rejected": -399.0072937011719, "loss": 0.3186, "rewards/chosen": 0.7181900143623352, "rewards/margins": 2.210541784763336, "rewards/rejected": -1.492351770401001, "step": 16781 }, { "epoch": 0.8895131582434473, "grad_norm": 44.25, "kl": 1.3951292037963867, "learning_rate": 5e-07, "logits/chosen": -24468232.0, "logits/rejected": 10775473.0, "logps/chosen": -273.7186584472656, "logps/rejected": -159.35948181152344, "loss": 0.3865, "rewards/chosen": 0.22211475670337677, "rewards/margins": 1.4605834037065506, "rewards/rejected": -1.2384686470031738, "step": 16782 }, { "epoch": 0.8895661622452495, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46716168.0, "logits/rejected": -29614544.0, "logps/chosen": -511.4962463378906, "logps/rejected": -383.3198649088542, "loss": 0.1713, "rewards/chosen": 1.1507568359375, "rewards/margins": 3.321962356567383, "rewards/rejected": -2.171205520629883, "step": 16783 }, { "epoch": 0.8896191662470516, "grad_norm": 36.25, "kl": 5.5185089111328125, "learning_rate": 5e-07, "logits/chosen": -13546453.0, "logits/rejected": -26007216.0, "logps/chosen": -215.50181579589844, "logps/rejected": -299.3774719238281, "loss": 0.2641, "rewards/chosen": 1.0714274644851685, "rewards/margins": 3.7247010469436646, "rewards/rejected": -2.653273582458496, "step": 16784 }, { "epoch": 0.8896721702488538, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -86201704.0, "logits/rejected": -37626976.0, "logps/chosen": -176.38661193847656, "logps/rejected": -334.265869140625, "loss": 0.1833, "rewards/chosen": 0.4096149504184723, "rewards/margins": 3.429037799437841, "rewards/rejected": -3.0194228490193686, "step": 16785 }, { "epoch": 0.8897251742506559, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49031868.0, "logits/rejected": -6645171.0, "logps/chosen": -280.99005126953125, "logps/rejected": -290.01019287109375, "loss": 0.2704, "rewards/chosen": 0.18249469995498657, "rewards/margins": 3.1751909852027893, "rewards/rejected": -2.9926962852478027, "step": 16786 }, { "epoch": 0.8897781782524581, "grad_norm": 42.0, "kl": 1.2183303833007812, "learning_rate": 5e-07, "logits/chosen": -14612478.0, "logits/rejected": -34244596.0, "logps/chosen": -209.1282958984375, "logps/rejected": -491.2602844238281, "loss": 0.3887, "rewards/chosen": -0.6481138467788696, "rewards/margins": 2.11434543132782, "rewards/rejected": -2.7624592781066895, "step": 16787 }, { "epoch": 0.8898311822542602, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12115227.2, "logits/rejected": -4342474.666666667, "logps/chosen": -201.31776123046876, "logps/rejected": -125.84208170572917, "loss": 0.3984, "rewards/chosen": 0.23198082447052001, "rewards/margins": 1.1520611683527628, "rewards/rejected": -0.9200803438822428, "step": 16788 }, { "epoch": 0.8898841862560624, "grad_norm": 28.375, "kl": 0.3718109130859375, "learning_rate": 5e-07, "logits/chosen": -28236752.0, "logits/rejected": -34951205.333333336, "logps/chosen": -373.25775146484375, "logps/rejected": -406.6448567708333, "loss": 0.1444, "rewards/chosen": 0.6880416870117188, "rewards/margins": 3.671778996785482, "rewards/rejected": -2.983737309773763, "step": 16789 }, { "epoch": 0.8899371902578644, "grad_norm": 44.0, "kl": 0.46193361282348633, "learning_rate": 5e-07, "logits/chosen": -34046428.0, "logits/rejected": -11878799.0, "logps/chosen": -199.73684692382812, "logps/rejected": -246.74391174316406, "loss": 0.309, "rewards/chosen": 0.29741156101226807, "rewards/margins": 2.3200851678848267, "rewards/rejected": -2.0226736068725586, "step": 16790 }, { "epoch": 0.8899901942596666, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19303104.0, "logits/rejected": -808957.0, "logps/chosen": -280.16656494140625, "logps/rejected": -220.9793701171875, "loss": 0.1453, "rewards/chosen": 1.140649437904358, "rewards/margins": 3.781081795692444, "rewards/rejected": -2.640432357788086, "step": 16791 }, { "epoch": 0.8900431982614687, "grad_norm": 46.25, "kl": 0.29863739013671875, "learning_rate": 5e-07, "logits/chosen": -20898380.0, "logits/rejected": -19821190.4, "logps/chosen": -444.732421875, "logps/rejected": -463.528271484375, "loss": 0.2507, "rewards/chosen": -0.25380857785542804, "rewards/margins": 3.663182465235392, "rewards/rejected": -3.91699104309082, "step": 16792 }, { "epoch": 0.8900962022632709, "grad_norm": 61.0, "kl": 0.4678659439086914, "learning_rate": 5e-07, "logits/chosen": -4289784.8, "logits/rejected": -50180005.333333336, "logps/chosen": -239.021337890625, "logps/rejected": -170.4764200846354, "loss": 0.2829, "rewards/chosen": 0.5838749408721924, "rewards/margins": 3.129065752029419, "rewards/rejected": -2.5451908111572266, "step": 16793 }, { "epoch": 0.890149206265073, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6317626.0, "logits/rejected": -52592904.0, "logps/chosen": -469.3130187988281, "logps/rejected": -267.84716796875, "loss": 0.3409, "rewards/chosen": 0.6120338439941406, "rewards/margins": 1.6561470031738281, "rewards/rejected": -1.0441131591796875, "step": 16794 }, { "epoch": 0.8902022102668752, "grad_norm": 73.5, "kl": 3.2883872985839844, "learning_rate": 5e-07, "logits/chosen": -29580125.333333332, "logits/rejected": -14051068.8, "logps/chosen": -226.0711873372396, "logps/rejected": -380.502587890625, "loss": 0.1946, "rewards/chosen": 1.2158552010854085, "rewards/margins": 3.5629365762074787, "rewards/rejected": -2.3470813751220705, "step": 16795 }, { "epoch": 0.8902552142686773, "grad_norm": 38.5, "kl": 0.169342041015625, "learning_rate": 5e-07, "logits/chosen": -9709881.333333334, "logits/rejected": 10975256.0, "logps/chosen": -224.06416829427084, "logps/rejected": -231.87509765625, "loss": 0.2054, "rewards/chosen": 1.0843268235524495, "rewards/margins": 3.6619189103444416, "rewards/rejected": -2.5775920867919924, "step": 16796 }, { "epoch": 0.8903082182704795, "grad_norm": 57.25, "kl": 2.304147720336914, "learning_rate": 5e-07, "logits/chosen": -5215851.428571428, "logits/rejected": 522599.21875, "logps/chosen": -319.50816127232144, "logps/rejected": -60.71933364868164, "loss": 0.4392, "rewards/chosen": 0.43202665873936247, "rewards/margins": 1.3089298350470406, "rewards/rejected": -0.8769031763076782, "step": 16797 }, { "epoch": 0.8903612222722815, "grad_norm": 64.0, "kl": 0.28760814666748047, "learning_rate": 5e-07, "logits/chosen": -22265176.0, "logits/rejected": -295774.75, "logps/chosen": -275.6968078613281, "logps/rejected": -364.0661926269531, "loss": 0.2633, "rewards/chosen": 0.6195006370544434, "rewards/margins": 2.64530611038208, "rewards/rejected": -2.0258054733276367, "step": 16798 }, { "epoch": 0.8904142262740837, "grad_norm": 51.0, "kl": 0.6974143981933594, "learning_rate": 5e-07, "logits/chosen": -46048565.333333336, "logits/rejected": -42439316.0, "logps/chosen": -430.5706380208333, "logps/rejected": -268.8723449707031, "loss": 0.2916, "rewards/chosen": 0.7514455318450928, "rewards/margins": 3.2327122688293457, "rewards/rejected": -2.481266736984253, "step": 16799 }, { "epoch": 0.8904672302758858, "grad_norm": 62.5, "kl": 1.1232185363769531, "learning_rate": 5e-07, "logits/chosen": -43193446.4, "logits/rejected": -74195749.33333333, "logps/chosen": -369.8806640625, "logps/rejected": -404.10595703125, "loss": 0.3568, "rewards/chosen": 0.5276588439941406, "rewards/margins": 2.0858965555826825, "rewards/rejected": -1.5582377115885417, "step": 16800 }, { "epoch": 0.890520234277688, "grad_norm": 55.5, "kl": 3.2332496643066406, "learning_rate": 5e-07, "logits/chosen": -5608085.0, "logits/rejected": -49935194.666666664, "logps/chosen": -204.59878540039062, "logps/rejected": -436.9060872395833, "loss": 0.1873, "rewards/chosen": 1.5010193586349487, "rewards/margins": 3.9742589394251504, "rewards/rejected": -2.4732395807902017, "step": 16801 }, { "epoch": 0.8905732382794901, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6169349.5, "logits/rejected": -18857372.57142857, "logps/chosen": -143.84188842773438, "logps/rejected": -469.23141043526783, "loss": 0.1251, "rewards/chosen": 0.855633556842804, "rewards/margins": 4.707113768373217, "rewards/rejected": -3.8514802115304128, "step": 16802 }, { "epoch": 0.8906262422812923, "grad_norm": 86.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32604996.0, "logits/rejected": -32368864.0, "logps/chosen": -409.3710021972656, "logps/rejected": -377.651123046875, "loss": 0.2736, "rewards/chosen": 0.1053466796875, "rewards/margins": 1.9053200085957844, "rewards/rejected": -1.7999733289082844, "step": 16803 }, { "epoch": 0.8906792462830944, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72261928.0, "logits/rejected": -13350468.57142857, "logps/chosen": -188.1419677734375, "logps/rejected": -283.2802734375, "loss": 0.1665, "rewards/chosen": -0.19576263427734375, "rewards/margins": 2.2325867244175504, "rewards/rejected": -2.428349358694894, "step": 16804 }, { "epoch": 0.8907322502848966, "grad_norm": 65.0, "kl": 3.1313657760620117, "learning_rate": 5e-07, "logits/chosen": -43315762.666666664, "logits/rejected": -64943056.0, "logps/chosen": -294.5882161458333, "logps/rejected": -287.0308837890625, "loss": 0.3435, "rewards/chosen": 0.6386373043060303, "rewards/margins": 2.276979684829712, "rewards/rejected": -1.6383423805236816, "step": 16805 }, { "epoch": 0.8907852542866986, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -82917592.0, "logits/rejected": -10765968.0, "logps/chosen": -209.12632751464844, "logps/rejected": -221.58126395089286, "loss": 0.14, "rewards/chosen": -0.7725021243095398, "rewards/margins": 2.1669629897390092, "rewards/rejected": -2.939465114048549, "step": 16806 }, { "epoch": 0.8908382582885008, "grad_norm": 31.0, "kl": 1.9234161376953125, "learning_rate": 5e-07, "logits/chosen": -872025.9166666666, "logits/rejected": -32172934.4, "logps/chosen": -98.13455200195312, "logps/rejected": -140.9546142578125, "loss": 0.2311, "rewards/chosen": 0.4511461655298869, "rewards/margins": 2.806605760256449, "rewards/rejected": -2.3554595947265624, "step": 16807 }, { "epoch": 0.8908912622903029, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74691280.0, "logits/rejected": -15196162.666666666, "logps/chosen": -267.64178466796875, "logps/rejected": -218.39656575520834, "loss": 0.2984, "rewards/chosen": -0.31748658418655396, "rewards/margins": 1.5352436900138855, "rewards/rejected": -1.8527302742004395, "step": 16808 }, { "epoch": 0.8909442662921051, "grad_norm": 37.75, "kl": 0.23947525024414062, "learning_rate": 5e-07, "logits/chosen": -27434068.0, "logits/rejected": -15300081.142857144, "logps/chosen": -65.21905517578125, "logps/rejected": -303.3878696986607, "loss": 0.1424, "rewards/chosen": 0.8496689200401306, "rewards/margins": 3.748933119433267, "rewards/rejected": -2.8992641993931363, "step": 16809 }, { "epoch": 0.8909972702939072, "grad_norm": 41.75, "kl": 0.9956264495849609, "learning_rate": 5e-07, "logits/chosen": -15451228.8, "logits/rejected": 355418.1666666667, "logps/chosen": -466.38125, "logps/rejected": -67.8145243326823, "loss": 0.2215, "rewards/chosen": 1.1546059608459474, "rewards/margins": 4.755801932017008, "rewards/rejected": -3.601195971171061, "step": 16810 }, { "epoch": 0.8910502742957094, "grad_norm": 66.0, "kl": 0.9956040382385254, "learning_rate": 5e-07, "logits/chosen": 55587752.0, "logits/rejected": -11912720.0, "logps/chosen": -555.2811279296875, "logps/rejected": -171.32401529947916, "loss": 0.1764, "rewards/chosen": 0.4773808419704437, "rewards/margins": 3.544989357391993, "rewards/rejected": -3.0676085154215493, "step": 16811 }, { "epoch": 0.8911032782975115, "grad_norm": 39.25, "kl": 1.5495681762695312, "learning_rate": 5e-07, "logits/chosen": -15506317.0, "logits/rejected": -22688110.0, "logps/chosen": -95.83541107177734, "logps/rejected": -241.9244384765625, "loss": 0.2649, "rewards/chosen": 0.4332754909992218, "rewards/margins": 3.1840639412403107, "rewards/rejected": -2.750788450241089, "step": 16812 }, { "epoch": 0.8911562822993137, "grad_norm": 60.25, "kl": 4.081393241882324, "learning_rate": 5e-07, "logits/chosen": -19347725.333333332, "logits/rejected": -6344426.5, "logps/chosen": -301.95627848307294, "logps/rejected": -92.51600646972656, "loss": 0.4446, "rewards/chosen": 0.5232464075088501, "rewards/margins": 1.6973917484283447, "rewards/rejected": -1.1741453409194946, "step": 16813 }, { "epoch": 0.8912092863011157, "grad_norm": 61.25, "kl": 1.2907295227050781, "learning_rate": 5e-07, "logits/chosen": -43399897.6, "logits/rejected": -9748998.666666666, "logps/chosen": -381.074169921875, "logps/rejected": -223.5554402669271, "loss": 0.3481, "rewards/chosen": 0.6350091934204102, "rewards/margins": 2.4604456265767416, "rewards/rejected": -1.8254364331563313, "step": 16814 }, { "epoch": 0.8912622903029179, "grad_norm": 50.75, "kl": 2.746305465698242, "learning_rate": 5e-07, "logits/chosen": -13391294.4, "logits/rejected": -17695694.666666668, "logps/chosen": -217.3966796875, "logps/rejected": -99.34031168619792, "loss": 0.3522, "rewards/chosen": 0.7280265808105468, "rewards/margins": 2.466265074412028, "rewards/rejected": -1.7382384936014812, "step": 16815 }, { "epoch": 0.89131529430472, "grad_norm": 59.5, "kl": 0.09566116333007812, "learning_rate": 5e-07, "logits/chosen": 104944.66666666667, "logits/rejected": 10600460.8, "logps/chosen": -54.382720947265625, "logps/rejected": -286.8609619140625, "loss": 0.2905, "rewards/chosen": 0.337285319964091, "rewards/margins": 2.0839476505915324, "rewards/rejected": -1.7466623306274414, "step": 16816 }, { "epoch": 0.8913682983065221, "grad_norm": 37.25, "kl": 1.0249285697937012, "learning_rate": 5e-07, "logits/chosen": -13988873.0, "logits/rejected": -37600981.333333336, "logps/chosen": -196.0098114013672, "logps/rejected": -471.6551920572917, "loss": 0.137, "rewards/chosen": 0.9248726963996887, "rewards/margins": 4.334662735462189, "rewards/rejected": -3.4097900390625, "step": 16817 }, { "epoch": 0.8914213023083243, "grad_norm": 56.0, "kl": 1.7640562057495117, "learning_rate": 5e-07, "logits/chosen": -22690227.2, "logits/rejected": 71219.0, "logps/chosen": -394.7514892578125, "logps/rejected": -100.75748697916667, "loss": 0.3307, "rewards/chosen": 0.2828519344329834, "rewards/margins": 2.9158666133880615, "rewards/rejected": -2.633014678955078, "step": 16818 }, { "epoch": 0.8914743063101264, "grad_norm": 45.5, "kl": 0.049106597900390625, "learning_rate": 5e-07, "logits/chosen": -9914202.0, "logits/rejected": -37386768.0, "logps/chosen": -473.9846496582031, "logps/rejected": -437.52252197265625, "loss": 0.2277, "rewards/chosen": 0.7207423448562622, "rewards/margins": 3.9551843404769897, "rewards/rejected": -3.2344419956207275, "step": 16819 }, { "epoch": 0.8915273103119286, "grad_norm": 50.0, "kl": 8.316169738769531, "learning_rate": 5e-07, "logits/chosen": -51058011.428571425, "logits/rejected": -19545210.0, "logps/chosen": -366.07906668526783, "logps/rejected": -131.40655517578125, "loss": 0.4857, "rewards/chosen": 1.0060227939060755, "rewards/margins": 2.6448480912617276, "rewards/rejected": -1.6388252973556519, "step": 16820 }, { "epoch": 0.8915803143137306, "grad_norm": 70.0, "kl": 0.049861907958984375, "learning_rate": 5e-07, "logits/chosen": -30213283.2, "logits/rejected": -23064133.333333332, "logps/chosen": -260.525, "logps/rejected": -331.27280680338544, "loss": 0.3333, "rewards/chosen": 0.3665478706359863, "rewards/margins": 2.131929365793864, "rewards/rejected": -1.7653814951578777, "step": 16821 }, { "epoch": 0.8916333183155328, "grad_norm": 48.0, "kl": 5.304532051086426, "learning_rate": 5e-07, "logits/chosen": -3748191.2, "logits/rejected": -43902362.666666664, "logps/chosen": -157.242724609375, "logps/rejected": -439.3313802083333, "loss": 0.3042, "rewards/chosen": 1.2083913803100585, "rewards/margins": 3.612980715433756, "rewards/rejected": -2.4045893351236978, "step": 16822 }, { "epoch": 0.8916863223173349, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26103834.0, "logits/rejected": -27285444.0, "logps/chosen": -268.8826599121094, "logps/rejected": -522.0902099609375, "loss": 0.2476, "rewards/chosen": 0.32802850008010864, "rewards/margins": 4.883173286914825, "rewards/rejected": -4.555144786834717, "step": 16823 }, { "epoch": 0.8917393263191371, "grad_norm": 38.75, "kl": 1.8371973037719727, "learning_rate": 5e-07, "logits/chosen": -42825880.0, "logits/rejected": -40985264.0, "logps/chosen": -222.9214630126953, "logps/rejected": -221.1815948486328, "loss": 0.2391, "rewards/chosen": 0.6959893703460693, "rewards/margins": 3.9257652759552, "rewards/rejected": -3.229775905609131, "step": 16824 }, { "epoch": 0.8917923303209392, "grad_norm": 55.0, "kl": 6.179180145263672, "learning_rate": 5e-07, "logits/chosen": -65538841.6, "logits/rejected": -39744400.0, "logps/chosen": -481.06552734375, "logps/rejected": -221.2106730143229, "loss": 0.3171, "rewards/chosen": 1.4590076446533202, "rewards/margins": 3.444206460316976, "rewards/rejected": -1.9851988156636555, "step": 16825 }, { "epoch": 0.8918453343227414, "grad_norm": 51.75, "kl": 0.30536937713623047, "learning_rate": 5e-07, "logits/chosen": 4366606.0, "logits/rejected": -38124083.2, "logps/chosen": -29.688486735026043, "logps/rejected": -147.6467041015625, "loss": 0.2436, "rewards/chosen": 0.5922972361246744, "rewards/margins": 2.7456080118815103, "rewards/rejected": -2.153310775756836, "step": 16826 }, { "epoch": 0.8918983383245435, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22726338.666666668, "logits/rejected": -60629120.0, "logps/chosen": -293.06321207682294, "logps/rejected": -299.080712890625, "loss": 0.2301, "rewards/chosen": 0.2063774069150289, "rewards/margins": 2.8022909124692283, "rewards/rejected": -2.5959135055541993, "step": 16827 }, { "epoch": 0.8919513423263457, "grad_norm": 40.5, "kl": 0.7790813446044922, "learning_rate": 5e-07, "logits/chosen": -78370280.0, "logits/rejected": -53254548.0, "logps/chosen": -396.1575622558594, "logps/rejected": -463.3431091308594, "loss": 0.1611, "rewards/chosen": 1.471104383468628, "rewards/margins": 4.347995042800903, "rewards/rejected": -2.8768906593322754, "step": 16828 }, { "epoch": 0.8920043463281477, "grad_norm": 44.5, "kl": 2.1557273864746094, "learning_rate": 5e-07, "logits/chosen": -33864704.0, "logits/rejected": -17498552.0, "logps/chosen": -189.0652587890625, "logps/rejected": -306.4449869791667, "loss": 0.3326, "rewards/chosen": 0.7945276260375976, "rewards/margins": 2.4212159474690753, "rewards/rejected": -1.6266883214314778, "step": 16829 }, { "epoch": 0.8920573503299499, "grad_norm": 45.75, "kl": 1.0859432220458984, "learning_rate": 5e-07, "logits/chosen": -18789712.0, "logits/rejected": -23307248.0, "logps/chosen": -222.2654296875, "logps/rejected": -308.60056559244794, "loss": 0.272, "rewards/chosen": 0.7135940551757812, "rewards/margins": 3.215262826283773, "rewards/rejected": -2.5016687711079917, "step": 16830 }, { "epoch": 0.892110354331752, "grad_norm": 57.25, "kl": 1.4043731689453125, "learning_rate": 5e-07, "logits/chosen": -75949427.2, "logits/rejected": -2309022.8333333335, "logps/chosen": -516.45751953125, "logps/rejected": -223.94340006510416, "loss": 0.3208, "rewards/chosen": 0.21759705543518065, "rewards/margins": 4.334419298171997, "rewards/rejected": -4.116822242736816, "step": 16831 }, { "epoch": 0.8921633583335542, "grad_norm": 57.0, "kl": 3.7674427032470703, "learning_rate": 5e-07, "logits/chosen": -80848786.28571428, "logits/rejected": -13644820.0, "logps/chosen": -295.9312220982143, "logps/rejected": -389.19842529296875, "loss": 0.4441, "rewards/chosen": 0.45944622584751676, "rewards/margins": 3.2531841141836986, "rewards/rejected": -2.7937378883361816, "step": 16832 }, { "epoch": 0.8922163623353563, "grad_norm": 53.25, "kl": 0.4567575454711914, "learning_rate": 5e-07, "logits/chosen": -31104090.0, "logits/rejected": -24806272.0, "logps/chosen": -266.404296875, "logps/rejected": -395.5422668457031, "loss": 0.2352, "rewards/chosen": 0.6655881404876709, "rewards/margins": 3.106464147567749, "rewards/rejected": -2.440876007080078, "step": 16833 }, { "epoch": 0.8922693663371585, "grad_norm": 56.25, "kl": 0.3080635070800781, "learning_rate": 5e-07, "logits/chosen": -6512947.428571428, "logits/rejected": -47154084.0, "logps/chosen": -271.87789481026783, "logps/rejected": -753.451171875, "loss": 0.349, "rewards/chosen": 0.45003175735473633, "rewards/margins": 4.558491230010986, "rewards/rejected": -4.10845947265625, "step": 16834 }, { "epoch": 0.8923223703389606, "grad_norm": 46.75, "kl": 2.6654739379882812, "learning_rate": 5e-07, "logits/chosen": -38085462.4, "logits/rejected": 1696669.1666666667, "logps/chosen": -293.2597412109375, "logps/rejected": -107.34749348958333, "loss": 0.3802, "rewards/chosen": 0.4289896011352539, "rewards/margins": 2.691673310597738, "rewards/rejected": -2.262683709462484, "step": 16835 }, { "epoch": 0.8923753743407628, "grad_norm": 43.5, "kl": 1.0889396667480469, "learning_rate": 5e-07, "logits/chosen": -12969876.8, "logits/rejected": -15399140.0, "logps/chosen": -121.09947509765625, "logps/rejected": -235.09977213541666, "loss": 0.3664, "rewards/chosen": 0.22005877494812012, "rewards/margins": 2.303749259312948, "rewards/rejected": -2.0836904843648276, "step": 16836 }, { "epoch": 0.8924283783425648, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22542674.0, "logits/rejected": -14798880.0, "logps/chosen": -121.98580932617188, "logps/rejected": -248.07539876302084, "loss": 0.1732, "rewards/chosen": 0.8903442621231079, "rewards/margins": 3.197208364804586, "rewards/rejected": -2.306864102681478, "step": 16837 }, { "epoch": 0.892481382344367, "grad_norm": 44.5, "kl": 0.6911144256591797, "learning_rate": 5e-07, "logits/chosen": -20619545.6, "logits/rejected": -12541893.333333334, "logps/chosen": -130.5778564453125, "logps/rejected": -297.22821044921875, "loss": 0.2945, "rewards/chosen": 0.5345344543457031, "rewards/margins": 2.727597236633301, "rewards/rejected": -2.1930627822875977, "step": 16838 }, { "epoch": 0.8925343863461691, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26988802.0, "logits/rejected": 7720378.0, "logps/chosen": -304.125732421875, "logps/rejected": -178.38909912109375, "loss": 0.2592, "rewards/chosen": 0.8810750842094421, "rewards/margins": 3.042402446269989, "rewards/rejected": -2.161327362060547, "step": 16839 }, { "epoch": 0.8925873903479713, "grad_norm": 55.5, "kl": 0.926356315612793, "learning_rate": 5e-07, "logits/chosen": -52823286.4, "logits/rejected": -23506472.0, "logps/chosen": -257.76416015625, "logps/rejected": -290.0147705078125, "loss": 0.3213, "rewards/chosen": 0.5509446144104004, "rewards/margins": 2.2279255867004393, "rewards/rejected": -1.676980972290039, "step": 16840 }, { "epoch": 0.8926403943497734, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41368172.0, "logits/rejected": -47239016.0, "logps/chosen": -168.2633514404297, "logps/rejected": -540.1837158203125, "loss": 0.3021, "rewards/chosen": 0.02008165419101715, "rewards/margins": 2.5567065626382828, "rewards/rejected": -2.5366249084472656, "step": 16841 }, { "epoch": 0.8926933983515756, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33580840.0, "logits/rejected": -28742584.0, "logps/chosen": -266.9878845214844, "logps/rejected": -459.4202880859375, "loss": 0.2537, "rewards/chosen": 0.2796005308628082, "rewards/margins": 3.163095623254776, "rewards/rejected": -2.8834950923919678, "step": 16842 }, { "epoch": 0.8927464023533777, "grad_norm": 50.0, "kl": 3.1095657348632812, "learning_rate": 5e-07, "logits/chosen": -52064020.0, "logits/rejected": -9164281.0, "logps/chosen": -697.2262573242188, "logps/rejected": -226.7667236328125, "loss": 0.2012, "rewards/chosen": 1.2231054306030273, "rewards/margins": 4.487604856491089, "rewards/rejected": -3.2644994258880615, "step": 16843 }, { "epoch": 0.8927994063551798, "grad_norm": 41.0, "kl": 0.6019840240478516, "learning_rate": 5e-07, "logits/chosen": -31194556.0, "logits/rejected": -15230423.0, "logps/chosen": -431.9266052246094, "logps/rejected": -265.60784912109375, "loss": 0.2052, "rewards/chosen": 0.6568483710289001, "rewards/margins": 3.7148346304893494, "rewards/rejected": -3.057986259460449, "step": 16844 }, { "epoch": 0.8928524103569819, "grad_norm": 51.25, "kl": 0.6168174743652344, "learning_rate": 5e-07, "logits/chosen": -34850600.0, "logits/rejected": -21069896.0, "logps/chosen": -731.482421875, "logps/rejected": -263.2660827636719, "loss": 0.2255, "rewards/chosen": 0.8335720300674438, "rewards/margins": 3.9463948011398315, "rewards/rejected": -3.1128227710723877, "step": 16845 }, { "epoch": 0.8929054143587841, "grad_norm": 40.75, "kl": 2.5350513458251953, "learning_rate": 5e-07, "logits/chosen": 1088574.6666666667, "logits/rejected": -25922934.4, "logps/chosen": -227.28900146484375, "logps/rejected": -338.835400390625, "loss": 0.2019, "rewards/chosen": 0.7675283749898275, "rewards/margins": 3.8497019131978356, "rewards/rejected": -3.082173538208008, "step": 16846 }, { "epoch": 0.8929584183605862, "grad_norm": 49.25, "kl": 0.6187992095947266, "learning_rate": 5e-07, "logits/chosen": -41338121.6, "logits/rejected": -17099888.0, "logps/chosen": -304.3067626953125, "logps/rejected": -229.21636962890625, "loss": 0.3149, "rewards/chosen": 0.34398071765899657, "rewards/margins": 2.3871147712071736, "rewards/rejected": -2.0431340535481772, "step": 16847 }, { "epoch": 0.8930114223623884, "grad_norm": 45.0, "kl": 1.4331655502319336, "learning_rate": 5e-07, "logits/chosen": -38191376.0, "logits/rejected": -16587871.0, "logps/chosen": -182.56309509277344, "logps/rejected": -183.5872344970703, "loss": 0.374, "rewards/chosen": 0.03593038022518158, "rewards/margins": 1.3218146413564682, "rewards/rejected": -1.2858842611312866, "step": 16848 }, { "epoch": 0.8930644263641905, "grad_norm": 49.0, "kl": 2.277036666870117, "learning_rate": 5e-07, "logits/chosen": 11078297.333333334, "logits/rejected": -53485084.8, "logps/chosen": -277.1570231119792, "logps/rejected": -434.675341796875, "loss": 0.2943, "rewards/chosen": 0.4181700547536214, "rewards/margins": 2.7668114503224692, "rewards/rejected": -2.3486413955688477, "step": 16849 }, { "epoch": 0.8931174303659927, "grad_norm": 50.75, "kl": 1.3770825862884521, "learning_rate": 5e-07, "logits/chosen": -46612742.4, "logits/rejected": 6468298.0, "logps/chosen": -399.558642578125, "logps/rejected": -399.9830729166667, "loss": 0.2803, "rewards/chosen": 0.7398421287536621, "rewards/margins": 3.3861793200174963, "rewards/rejected": -2.6463371912638345, "step": 16850 }, { "epoch": 0.8931704343677948, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71389258.66666667, "logits/rejected": -19736339.2, "logps/chosen": -427.4759114583333, "logps/rejected": -436.620166015625, "loss": 0.1493, "rewards/chosen": 1.163257360458374, "rewards/margins": 3.7538613796234133, "rewards/rejected": -2.5906040191650392, "step": 16851 }, { "epoch": 0.893223438369597, "grad_norm": 43.0, "kl": 0.7361049652099609, "learning_rate": 5e-07, "logits/chosen": -27574202.666666668, "logits/rejected": -4507647.6, "logps/chosen": -480.5677897135417, "logps/rejected": -89.5195068359375, "loss": 0.2551, "rewards/chosen": 0.8166507085164388, "rewards/margins": 2.5247449239095054, "rewards/rejected": -1.7080942153930665, "step": 16852 }, { "epoch": 0.893276442371399, "grad_norm": 106.5, "kl": 1.8122482299804688, "learning_rate": 5e-07, "logits/chosen": -740342.8571428572, "logits/rejected": -195606976.0, "logps/chosen": -626.8452845982143, "logps/rejected": -1062.315185546875, "loss": 0.4038, "rewards/chosen": 0.40157014983040945, "rewards/margins": 5.68507845061166, "rewards/rejected": -5.28350830078125, "step": 16853 }, { "epoch": 0.8933294463732012, "grad_norm": 33.5, "kl": 2.3972721099853516, "learning_rate": 5e-07, "logits/chosen": -11430796.0, "logits/rejected": -33427635.2, "logps/chosen": -140.05797322591147, "logps/rejected": -315.4072509765625, "loss": 0.2459, "rewards/chosen": 0.803478479385376, "rewards/margins": 2.7813696384429933, "rewards/rejected": -1.9778911590576171, "step": 16854 }, { "epoch": 0.8933824503750033, "grad_norm": 49.75, "kl": 1.5672645568847656, "learning_rate": 5e-07, "logits/chosen": -56981184.0, "logits/rejected": -9450233.6, "logps/chosen": -166.79217529296875, "logps/rejected": -318.6130126953125, "loss": 0.2685, "rewards/chosen": 0.5323214133580526, "rewards/margins": 2.52477384408315, "rewards/rejected": -1.9924524307250977, "step": 16855 }, { "epoch": 0.8934354543768055, "grad_norm": 42.0, "kl": 1.9746894836425781, "learning_rate": 5e-07, "logits/chosen": 10794528.666666666, "logits/rejected": -27595865.6, "logps/chosen": -180.7669677734375, "logps/rejected": -205.8920654296875, "loss": 0.2317, "rewards/chosen": 0.543346643447876, "rewards/margins": 3.710255479812622, "rewards/rejected": -3.166908836364746, "step": 16856 }, { "epoch": 0.8934884583786076, "grad_norm": 28.875, "kl": 1.4556665420532227, "learning_rate": 5e-07, "logits/chosen": -1205248.0, "logits/rejected": -38566211.2, "logps/chosen": -135.45079549153647, "logps/rejected": -242.91689453125, "loss": 0.1447, "rewards/chosen": 1.6190190315246582, "rewards/margins": 4.549691104888916, "rewards/rejected": -2.9306720733642577, "step": 16857 }, { "epoch": 0.8935414623804098, "grad_norm": 57.5, "kl": 0.2261514663696289, "learning_rate": 5e-07, "logits/chosen": -43488448.0, "logits/rejected": -34363144.0, "logps/chosen": -272.94346110026044, "logps/rejected": -423.37567138671875, "loss": 0.3603, "rewards/chosen": 0.3599042495091756, "rewards/margins": 2.1411909659703574, "rewards/rejected": -1.7812867164611816, "step": 16858 }, { "epoch": 0.8935944663822118, "grad_norm": 43.25, "kl": 5.492525100708008, "learning_rate": 5e-07, "logits/chosen": -27841965.333333332, "logits/rejected": -24527924.0, "logps/chosen": -165.8351847330729, "logps/rejected": -596.6408081054688, "loss": 0.3758, "rewards/chosen": 0.5581439336140951, "rewards/margins": 4.21851380666097, "rewards/rejected": -3.660369873046875, "step": 16859 }, { "epoch": 0.893647470384014, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17024354.0, "logits/rejected": -19998312.0, "logps/chosen": -546.50048828125, "logps/rejected": -267.0563557942708, "loss": 0.168, "rewards/chosen": 0.3199264705181122, "rewards/margins": 3.45161172747612, "rewards/rejected": -3.131685256958008, "step": 16860 }, { "epoch": 0.8937004743858161, "grad_norm": 61.5, "kl": 3.5453720092773438, "learning_rate": 5e-07, "logits/chosen": -10257789.6, "logits/rejected": -6103365.333333333, "logps/chosen": -237.3067626953125, "logps/rejected": -253.9436238606771, "loss": 0.3411, "rewards/chosen": 0.820399284362793, "rewards/margins": 2.5987701416015625, "rewards/rejected": -1.7783708572387695, "step": 16861 }, { "epoch": 0.8937534783876183, "grad_norm": 43.5, "kl": 1.1690459251403809, "learning_rate": 5e-07, "logits/chosen": -38211750.4, "logits/rejected": -27948994.666666668, "logps/chosen": -233.963818359375, "logps/rejected": -191.26253255208334, "loss": 0.2956, "rewards/chosen": 0.9410189628601074, "rewards/margins": 2.065316899617513, "rewards/rejected": -1.1242979367574055, "step": 16862 }, { "epoch": 0.8938064823894204, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75302776.0, "logits/rejected": -33616258.666666664, "logps/chosen": -214.49703979492188, "logps/rejected": -346.4034423828125, "loss": 0.2232, "rewards/chosen": 0.9238224029541016, "rewards/margins": 2.584602991739909, "rewards/rejected": -1.6607805887858074, "step": 16863 }, { "epoch": 0.8938594863912226, "grad_norm": 49.0, "kl": 1.8387203216552734, "learning_rate": 5e-07, "logits/chosen": 3750628.4, "logits/rejected": -65712202.666666664, "logps/chosen": -158.064892578125, "logps/rejected": -531.9497477213541, "loss": 0.3329, "rewards/chosen": 0.48771233558654786, "rewards/margins": 3.0711342016855876, "rewards/rejected": -2.5834218660990396, "step": 16864 }, { "epoch": 0.8939124903930247, "grad_norm": 39.75, "kl": 4.688695907592773, "learning_rate": 5e-07, "logits/chosen": -26013090.0, "logits/rejected": -24805704.0, "logps/chosen": -661.2271728515625, "logps/rejected": -588.9113159179688, "loss": 0.2412, "rewards/chosen": 1.8094513416290283, "rewards/margins": 5.339004755020142, "rewards/rejected": -3.5295534133911133, "step": 16865 }, { "epoch": 0.8939654943948268, "grad_norm": 30.875, "kl": 3.2744140625, "learning_rate": 5e-07, "logits/chosen": 2762168.0, "logits/rejected": -72057928.0, "logps/chosen": -78.0763931274414, "logps/rejected": -301.26324462890625, "loss": 0.3268, "rewards/chosen": 0.3370163142681122, "rewards/margins": 2.9588501155376434, "rewards/rejected": -2.6218338012695312, "step": 16866 }, { "epoch": 0.894018498396629, "grad_norm": 63.25, "kl": 0.4143362045288086, "learning_rate": 5e-07, "logits/chosen": -78276518.4, "logits/rejected": -5339877.0, "logps/chosen": -299.400390625, "logps/rejected": -285.73760986328125, "loss": 0.3199, "rewards/chosen": 0.13685982227325438, "rewards/margins": 3.4481788555781043, "rewards/rejected": -3.31131903330485, "step": 16867 }, { "epoch": 0.894071502398431, "grad_norm": 70.0, "kl": 1.684051513671875, "learning_rate": 5e-07, "logits/chosen": 28913337.6, "logits/rejected": -9186074.666666666, "logps/chosen": -851.22119140625, "logps/rejected": -273.92982991536456, "loss": 0.3098, "rewards/chosen": 0.7031891345977783, "rewards/margins": 2.7407535711924234, "rewards/rejected": -2.037564436594645, "step": 16868 }, { "epoch": 0.8941245064002332, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52992384.0, "logits/rejected": -10841134.0, "logps/chosen": -333.495263671875, "logps/rejected": -147.1824747721354, "loss": 0.3717, "rewards/chosen": -0.07226959466934205, "rewards/margins": 2.4182047883669533, "rewards/rejected": -2.4904743830362954, "step": 16869 }, { "epoch": 0.8941775104020353, "grad_norm": 39.75, "kl": 1.4666976928710938, "learning_rate": 5e-07, "logits/chosen": -37070260.0, "logits/rejected": -50304220.0, "logps/chosen": -281.469482421875, "logps/rejected": -220.9208221435547, "loss": 0.1833, "rewards/chosen": 1.4005179405212402, "rewards/margins": 4.22250771522522, "rewards/rejected": -2.8219897747039795, "step": 16870 }, { "epoch": 0.8942305144038375, "grad_norm": 38.25, "kl": 3.792843818664551, "learning_rate": 5e-07, "logits/chosen": -15979541.333333334, "logits/rejected": -8411278.0, "logps/chosen": -192.82916259765625, "logps/rejected": -174.0361328125, "loss": 0.4597, "rewards/chosen": 0.30855417251586914, "rewards/margins": 1.3630928993225098, "rewards/rejected": -1.0545387268066406, "step": 16871 }, { "epoch": 0.8942835184056396, "grad_norm": 39.5, "kl": 0.33029937744140625, "learning_rate": 5e-07, "logits/chosen": -17189942.0, "logits/rejected": -29626562.0, "logps/chosen": -256.3563232421875, "logps/rejected": -434.8406982421875, "loss": 0.2905, "rewards/chosen": 0.6985880136489868, "rewards/margins": 2.486114740371704, "rewards/rejected": -1.7875267267227173, "step": 16872 }, { "epoch": 0.8943365224074418, "grad_norm": 35.5, "kl": 2.56626033782959, "learning_rate": 5e-07, "logits/chosen": -11850873.333333334, "logits/rejected": -45974904.0, "logps/chosen": -265.2621663411458, "logps/rejected": -249.5391387939453, "loss": 0.3411, "rewards/chosen": 0.9767325719197592, "rewards/margins": 2.8626304467519126, "rewards/rejected": -1.8858978748321533, "step": 16873 }, { "epoch": 0.8943895264092439, "grad_norm": 40.5, "kl": 1.4782238006591797, "learning_rate": 5e-07, "logits/chosen": -25233386.666666668, "logits/rejected": -10207517.6, "logps/chosen": -621.5674641927084, "logps/rejected": -318.988623046875, "loss": 0.1369, "rewards/chosen": 2.254070440928141, "rewards/margins": 5.795477453867594, "rewards/rejected": -3.541407012939453, "step": 16874 }, { "epoch": 0.894442530411046, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31633250.666666668, "logits/rejected": 3524076.0, "logps/chosen": -300.930908203125, "logps/rejected": -283.52119140625, "loss": 0.1914, "rewards/chosen": 0.41862742106119794, "rewards/margins": 3.6721117655436197, "rewards/rejected": -3.253484344482422, "step": 16875 }, { "epoch": 0.8944955344128481, "grad_norm": 53.75, "kl": 1.6702041625976562, "learning_rate": 5e-07, "logits/chosen": -34034584.0, "logits/rejected": -24873434.0, "logps/chosen": -497.8106689453125, "logps/rejected": -267.33856201171875, "loss": 0.2279, "rewards/chosen": 0.9745110273361206, "rewards/margins": 3.7388240098953247, "rewards/rejected": -2.764312982559204, "step": 16876 }, { "epoch": 0.8945485384146503, "grad_norm": 62.25, "kl": 1.2735671997070312, "learning_rate": 5e-07, "logits/chosen": -13701412.8, "logits/rejected": -17970333.333333332, "logps/chosen": -349.572119140625, "logps/rejected": -150.84187825520834, "loss": 0.4309, "rewards/chosen": 0.06054016351699829, "rewards/margins": 1.2767055471738178, "rewards/rejected": -1.2161653836568196, "step": 16877 }, { "epoch": 0.8946015424164524, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63964412.0, "logits/rejected": -46585792.0, "logps/chosen": -346.9880065917969, "logps/rejected": -517.8156127929688, "loss": 0.2263, "rewards/chosen": 0.556861162185669, "rewards/margins": 3.2492172718048096, "rewards/rejected": -2.6923561096191406, "step": 16878 }, { "epoch": 0.8946545464182546, "grad_norm": 29.875, "kl": 5.291618347167969, "learning_rate": 5e-07, "logits/chosen": -27453292.8, "logits/rejected": -8258500.0, "logps/chosen": -190.5846923828125, "logps/rejected": -213.09847005208334, "loss": 0.2531, "rewards/chosen": 1.3370250701904296, "rewards/margins": 4.4599257151285805, "rewards/rejected": -3.122900644938151, "step": 16879 }, { "epoch": 0.8947075504200567, "grad_norm": 55.75, "kl": 0.33663368225097656, "learning_rate": 5e-07, "logits/chosen": -67772458.66666667, "logits/rejected": -16599689.6, "logps/chosen": -743.4249674479166, "logps/rejected": -232.4072998046875, "loss": 0.1471, "rewards/chosen": 1.1806792418162029, "rewards/margins": 4.699613491694133, "rewards/rejected": -3.5189342498779297, "step": 16880 }, { "epoch": 0.8947605544218589, "grad_norm": 39.0, "kl": 0.9773712158203125, "learning_rate": 5e-07, "logits/chosen": -45561048.0, "logits/rejected": -28052008.0, "logps/chosen": -195.27633666992188, "logps/rejected": -671.9322509765625, "loss": 0.1763, "rewards/chosen": 0.022893333807587624, "rewards/margins": 3.08954308864971, "rewards/rejected": -3.0666497548421225, "step": 16881 }, { "epoch": 0.894813558423661, "grad_norm": 44.75, "kl": 0.162689208984375, "learning_rate": 5e-07, "logits/chosen": -94604904.0, "logits/rejected": -7338041.5, "logps/chosen": -550.8278198242188, "logps/rejected": -137.70651245117188, "loss": 0.263, "rewards/chosen": 0.5441978573799133, "rewards/margins": 2.814573347568512, "rewards/rejected": -2.2703754901885986, "step": 16882 }, { "epoch": 0.8948665624254631, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21983646.0, "logits/rejected": 8325783.0, "logps/chosen": -213.06118774414062, "logps/rejected": -394.62847900390625, "loss": 0.3307, "rewards/chosen": 0.20954680442810059, "rewards/margins": 2.046818256378174, "rewards/rejected": -1.8372714519500732, "step": 16883 }, { "epoch": 0.8949195664272652, "grad_norm": 24.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -8768726.0, "logps/rejected": -306.0606994628906, "loss": 0.0945, "rewards/rejected": -4.045302867889404, "step": 16884 }, { "epoch": 0.8949725704290674, "grad_norm": 67.5, "kl": 1.4116439819335938, "learning_rate": 5e-07, "logits/chosen": 7156295.333333333, "logits/rejected": -20161320.0, "logps/chosen": -423.2303466796875, "logps/rejected": -338.8875732421875, "loss": 0.4144, "rewards/chosen": 0.01186595360438029, "rewards/margins": 2.775319675604502, "rewards/rejected": -2.763453722000122, "step": 16885 }, { "epoch": 0.8950255744308695, "grad_norm": 49.5, "kl": 0.6573448181152344, "learning_rate": 5e-07, "logits/chosen": -41057688.0, "logits/rejected": -11660964.0, "logps/chosen": -357.2794189453125, "logps/rejected": -308.15203857421875, "loss": 0.2813, "rewards/chosen": 0.8068579832712809, "rewards/margins": 3.0758232275644937, "rewards/rejected": -2.268965244293213, "step": 16886 }, { "epoch": 0.8950785784326717, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -87992944.0, "logits/rejected": -61279488.0, "logps/chosen": -223.3368123372396, "logps/rejected": -278.998046875, "loss": 0.2233, "rewards/chosen": 0.5757014354070028, "rewards/margins": 3.473092563947042, "rewards/rejected": -2.897391128540039, "step": 16887 }, { "epoch": 0.8951315824344738, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35519432.0, "logits/rejected": -696593.85, "logps/chosen": -367.3902180989583, "logps/rejected": -150.6080078125, "loss": 0.2264, "rewards/chosen": 0.9536580244700114, "rewards/margins": 2.5350199858347575, "rewards/rejected": -1.581361961364746, "step": 16888 }, { "epoch": 0.895184586436276, "grad_norm": 64.5, "kl": 1.0809097290039062, "learning_rate": 5e-07, "logits/chosen": -24204166.0, "logits/rejected": -11090870.0, "logps/chosen": -475.217529296875, "logps/rejected": -264.3267517089844, "loss": 0.2618, "rewards/chosen": 0.6017491817474365, "rewards/margins": 3.468902111053467, "rewards/rejected": -2.8671529293060303, "step": 16889 }, { "epoch": 0.895237590438078, "grad_norm": 56.0, "kl": 0.8372640609741211, "learning_rate": 5e-07, "logits/chosen": -27380537.6, "logits/rejected": -11091593.333333334, "logps/chosen": -141.614306640625, "logps/rejected": -763.7020670572916, "loss": 0.3552, "rewards/chosen": 0.05803581476211548, "rewards/margins": 4.173726999759674, "rewards/rejected": -4.115691184997559, "step": 16890 }, { "epoch": 0.8952905944398802, "grad_norm": 45.75, "kl": 6.223911285400391, "learning_rate": 5e-07, "logits/chosen": -19434206.0, "logits/rejected": -42197160.0, "logps/chosen": -219.4789276123047, "logps/rejected": -308.9371337890625, "loss": 0.3632, "rewards/chosen": 0.7686874866485596, "rewards/margins": 3.6974289417266846, "rewards/rejected": -2.928741455078125, "step": 16891 }, { "epoch": 0.8953435984416823, "grad_norm": 51.25, "kl": 4.152961730957031, "learning_rate": 5e-07, "logits/chosen": -6625976.666666667, "logits/rejected": -10032811.2, "logps/chosen": -330.08628336588544, "logps/rejected": -159.527685546875, "loss": 0.2456, "rewards/chosen": 1.6815013885498047, "rewards/margins": 4.383383941650391, "rewards/rejected": -2.701882553100586, "step": 16892 }, { "epoch": 0.8953966024434845, "grad_norm": 62.0, "kl": 1.4082832336425781, "learning_rate": 5e-07, "logits/chosen": 32127342.0, "logits/rejected": -10012879.0, "logps/chosen": -426.2716064453125, "logps/rejected": -160.77870178222656, "loss": 0.3347, "rewards/chosen": -0.23798370361328125, "rewards/margins": 3.2579240798950195, "rewards/rejected": -3.495907783508301, "step": 16893 }, { "epoch": 0.8954496064452866, "grad_norm": 53.75, "kl": 1.3433685302734375, "learning_rate": 5e-07, "logits/chosen": -8500912.0, "logits/rejected": -94088584.0, "logps/chosen": -311.23992919921875, "logps/rejected": -409.2596130371094, "loss": 0.2285, "rewards/chosen": 1.123346209526062, "rewards/margins": 2.837774872779846, "rewards/rejected": -1.7144286632537842, "step": 16894 }, { "epoch": 0.8955026104470888, "grad_norm": 64.0, "kl": 2.509523391723633, "learning_rate": 5e-07, "logits/chosen": -13411450.666666666, "logits/rejected": -45981420.8, "logps/chosen": -472.8667805989583, "logps/rejected": -340.785400390625, "loss": 0.306, "rewards/chosen": 0.8013346195220947, "rewards/margins": 2.3000184535980224, "rewards/rejected": -1.4986838340759276, "step": 16895 }, { "epoch": 0.8955556144488909, "grad_norm": 52.75, "kl": 4.221067428588867, "learning_rate": 5e-07, "logits/chosen": -21321388.0, "logits/rejected": -40100852.0, "logps/chosen": -179.43853759765625, "logps/rejected": -558.8844604492188, "loss": 0.387, "rewards/chosen": -0.012277510948479176, "rewards/margins": 1.6294319359585643, "rewards/rejected": -1.6417094469070435, "step": 16896 }, { "epoch": 0.8956086184506931, "grad_norm": 65.0, "kl": 2.531282424926758, "learning_rate": 5e-07, "logits/chosen": -70969080.0, "logits/rejected": -15250220.0, "logps/chosen": -437.34503173828125, "logps/rejected": -190.7506103515625, "loss": 0.2472, "rewards/chosen": 1.128753662109375, "rewards/margins": 2.383286952972412, "rewards/rejected": -1.254533290863037, "step": 16897 }, { "epoch": 0.8956616224524951, "grad_norm": 49.75, "kl": 3.3554158210754395, "learning_rate": 5e-07, "logits/chosen": 4141960.4, "logits/rejected": -14152573.333333334, "logps/chosen": -255.5489013671875, "logps/rejected": -173.69950358072916, "loss": 0.3411, "rewards/chosen": 0.9649120330810547, "rewards/margins": 3.47947146097819, "rewards/rejected": -2.5145594278971353, "step": 16898 }, { "epoch": 0.8957146264542973, "grad_norm": 108.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67312424.0, "logits/rejected": -48436512.0, "logps/chosen": -353.0545959472656, "logps/rejected": -382.8497619628906, "loss": 0.3183, "rewards/chosen": 0.09773998707532883, "rewards/margins": 1.97536950558424, "rewards/rejected": -1.8776295185089111, "step": 16899 }, { "epoch": 0.8957676304560994, "grad_norm": 48.75, "kl": 0.5062837600708008, "learning_rate": 5e-07, "logits/chosen": -3992057.714285714, "logits/rejected": 1124226.5, "logps/chosen": -129.2821044921875, "logps/rejected": -480.47711181640625, "loss": 0.4075, "rewards/chosen": 0.2650259903499058, "rewards/margins": 4.1341849735804965, "rewards/rejected": -3.869158983230591, "step": 16900 }, { "epoch": 0.8958206344579016, "grad_norm": 30.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56517029.333333336, "logits/rejected": -55782201.6, "logps/chosen": -338.90854899088544, "logps/rejected": -433.95849609375, "loss": 0.0896, "rewards/chosen": 1.5540272394816081, "rewards/margins": 5.272519747416179, "rewards/rejected": -3.7184925079345703, "step": 16901 }, { "epoch": 0.8958736384597037, "grad_norm": 44.5, "kl": 1.3785343170166016, "learning_rate": 5e-07, "logits/chosen": 7261728.0, "logits/rejected": -46331765.333333336, "logps/chosen": -213.041796875, "logps/rejected": -324.1100260416667, "loss": 0.2727, "rewards/chosen": 1.083851432800293, "rewards/margins": 3.0814847310384117, "rewards/rejected": -1.9976332982381184, "step": 16902 }, { "epoch": 0.8959266424615059, "grad_norm": 41.25, "kl": 1.8849334716796875, "learning_rate": 5e-07, "logits/chosen": 6384638.0, "logits/rejected": -714528.3333333334, "logps/chosen": -211.2334228515625, "logps/rejected": -72.47214762369792, "loss": 0.3437, "rewards/chosen": 0.3425429821014404, "rewards/margins": 2.92953413327535, "rewards/rejected": -2.5869911511739097, "step": 16903 }, { "epoch": 0.895979646463308, "grad_norm": 38.25, "kl": 3.389993667602539, "learning_rate": 5e-07, "logits/chosen": -38670128.0, "logits/rejected": -37904761.6, "logps/chosen": -362.8721516927083, "logps/rejected": -400.781884765625, "loss": 0.1909, "rewards/chosen": 1.772974967956543, "rewards/margins": 4.879457664489746, "rewards/rejected": -3.1064826965332033, "step": 16904 }, { "epoch": 0.8960326504651102, "grad_norm": 59.5, "kl": 3.410299301147461, "learning_rate": 5e-07, "logits/chosen": -32825020.8, "logits/rejected": -14717113.333333334, "logps/chosen": -360.915576171875, "logps/rejected": -231.1006062825521, "loss": 0.3359, "rewards/chosen": 0.5194756507873535, "rewards/margins": 1.9979378700256347, "rewards/rejected": -1.4784622192382812, "step": 16905 }, { "epoch": 0.8960856544669122, "grad_norm": 39.75, "kl": 2.2917041778564453, "learning_rate": 5e-07, "logits/chosen": -10697733.333333334, "logits/rejected": -9436264.0, "logps/chosen": -260.1677652994792, "logps/rejected": -189.07530517578124, "loss": 0.263, "rewards/chosen": 0.9975339571634928, "rewards/margins": 2.842766539255778, "rewards/rejected": -1.8452325820922852, "step": 16906 }, { "epoch": 0.8961386584687144, "grad_norm": 43.0, "kl": 2.183032989501953, "learning_rate": 5e-07, "logits/chosen": 14498144.0, "logits/rejected": -24248912.0, "logps/chosen": -174.20159912109375, "logps/rejected": -182.427001953125, "loss": 0.2689, "rewards/chosen": 0.7549285888671875, "rewards/margins": 2.7555191040039064, "rewards/rejected": -2.000590515136719, "step": 16907 }, { "epoch": 0.8961916624705165, "grad_norm": 36.5, "kl": 2.569581985473633, "learning_rate": 5e-07, "logits/chosen": -32303453.333333332, "logits/rejected": -23988366.4, "logps/chosen": -192.37337239583334, "logps/rejected": -306.145654296875, "loss": 0.2197, "rewards/chosen": 0.6563475529352824, "rewards/margins": 3.4781134525934854, "rewards/rejected": -2.821765899658203, "step": 16908 }, { "epoch": 0.8962446664723187, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5525574.0, "logits/rejected": -25634237.333333332, "logps/chosen": -318.509765625, "logps/rejected": -188.33721923828125, "loss": 0.1892, "rewards/chosen": 0.9622940421104431, "rewards/margins": 3.869498352209727, "rewards/rejected": -2.9072043100992837, "step": 16909 }, { "epoch": 0.8962976704741208, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -103372684.8, "logits/rejected": -38094264.0, "logps/chosen": -401.7320556640625, "logps/rejected": -642.8155110677084, "loss": 0.269, "rewards/chosen": 0.49073061943054197, "rewards/margins": 4.8706051667531325, "rewards/rejected": -4.379874547322591, "step": 16910 }, { "epoch": 0.896350674475923, "grad_norm": 50.25, "kl": 1.8761940002441406, "learning_rate": 5e-07, "logits/chosen": -17140778.0, "logits/rejected": 1442028.75, "logps/chosen": -176.54885864257812, "logps/rejected": -319.81915283203125, "loss": 0.3423, "rewards/chosen": 0.48884084820747375, "rewards/margins": 1.940382033586502, "rewards/rejected": -1.4515411853790283, "step": 16911 }, { "epoch": 0.8964036784777251, "grad_norm": 39.0, "kl": 0.580564022064209, "learning_rate": 5e-07, "logits/chosen": -26616924.0, "logits/rejected": -18240628.0, "logps/chosen": -169.88095092773438, "logps/rejected": -211.9088592529297, "loss": 0.3521, "rewards/chosen": 0.17548398673534393, "rewards/margins": 1.9640697687864304, "rewards/rejected": -1.7885857820510864, "step": 16912 }, { "epoch": 0.8964566824795273, "grad_norm": 59.0, "kl": 1.119074821472168, "learning_rate": 5e-07, "logits/chosen": -21136120.0, "logits/rejected": -6524628.5, "logps/chosen": -216.7563934326172, "logps/rejected": -169.1671142578125, "loss": 0.402, "rewards/chosen": -0.0738280788064003, "rewards/margins": 0.9635238870978355, "rewards/rejected": -1.0373519659042358, "step": 16913 }, { "epoch": 0.8965096864813293, "grad_norm": 64.5, "kl": 0.6023726463317871, "learning_rate": 5e-07, "logits/chosen": -111778474.66666667, "logits/rejected": -21367208.0, "logps/chosen": -314.70790608723956, "logps/rejected": -243.212841796875, "loss": 0.3266, "rewards/chosen": 0.074005126953125, "rewards/margins": 2.1856189727783204, "rewards/rejected": -2.1116138458251954, "step": 16914 }, { "epoch": 0.8965626904831315, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8451871.0, "logits/rejected": -19219041.333333332, "logps/chosen": -119.38587188720703, "logps/rejected": -311.65533447265625, "loss": 0.3091, "rewards/chosen": -0.6619787216186523, "rewards/margins": 1.2993205388387044, "rewards/rejected": -1.9612992604573567, "step": 16915 }, { "epoch": 0.8966156944849336, "grad_norm": 40.5, "kl": 1.8713970184326172, "learning_rate": 5e-07, "logits/chosen": -105688266.66666667, "logits/rejected": -32993644.8, "logps/chosen": -206.7146199544271, "logps/rejected": -533.61396484375, "loss": 0.1979, "rewards/chosen": 0.61905304590861, "rewards/margins": 4.36064125696818, "rewards/rejected": -3.7415882110595704, "step": 16916 }, { "epoch": 0.8966686984867357, "grad_norm": 57.75, "kl": 0.7548675537109375, "learning_rate": 5e-07, "logits/chosen": -198640.0, "logits/rejected": -31504452.0, "logps/chosen": -513.5765380859375, "logps/rejected": -358.426025390625, "loss": 0.2759, "rewards/chosen": 0.8319187164306641, "rewards/margins": 2.15362286567688, "rewards/rejected": -1.3217041492462158, "step": 16917 }, { "epoch": 0.8967217024885379, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40476275.2, "logits/rejected": -43956384.0, "logps/chosen": -269.0862060546875, "logps/rejected": -442.9209798177083, "loss": 0.3151, "rewards/chosen": 0.4729288101196289, "rewards/margins": 2.3453861554463704, "rewards/rejected": -1.8724573453267415, "step": 16918 }, { "epoch": 0.89677470649034, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47888821.333333336, "logits/rejected": -27293254.4, "logps/chosen": -531.0885009765625, "logps/rejected": -247.557861328125, "loss": 0.2163, "rewards/chosen": 0.36002198855082196, "rewards/margins": 2.8317844549814857, "rewards/rejected": -2.471762466430664, "step": 16919 }, { "epoch": 0.8968277104921422, "grad_norm": 39.5, "kl": 1.5029258728027344, "learning_rate": 5e-07, "logits/chosen": -25438236.8, "logits/rejected": -27631541.333333332, "logps/chosen": -250.07568359375, "logps/rejected": -340.7948404947917, "loss": 0.2826, "rewards/chosen": 0.7565867900848389, "rewards/margins": 2.91765824953715, "rewards/rejected": -2.161071459452311, "step": 16920 }, { "epoch": 0.8968807144939442, "grad_norm": 42.5, "kl": 3.947768211364746, "learning_rate": 5e-07, "logits/chosen": -22960325.333333332, "logits/rejected": -24551816.0, "logps/chosen": -285.93556722005206, "logps/rejected": -598.1522827148438, "loss": 0.3958, "rewards/chosen": 0.5653352737426758, "rewards/margins": 3.1765687465667725, "rewards/rejected": -2.6112334728240967, "step": 16921 }, { "epoch": 0.8969337184957464, "grad_norm": 56.5, "kl": 0.5300827026367188, "learning_rate": 5e-07, "logits/chosen": -66829875.2, "logits/rejected": -10114144.666666666, "logps/chosen": -413.260302734375, "logps/rejected": -370.3933512369792, "loss": 0.3306, "rewards/chosen": 0.31257965564727785, "rewards/margins": 2.9228522221247353, "rewards/rejected": -2.6102725664774575, "step": 16922 }, { "epoch": 0.8969867224975485, "grad_norm": 28.75, "kl": 1.1450138092041016, "learning_rate": 5e-07, "logits/chosen": 5996955.333333333, "logits/rejected": -25217102.4, "logps/chosen": -157.30510457356772, "logps/rejected": -498.45927734375, "loss": 0.1341, "rewards/chosen": 1.1865007082621257, "rewards/margins": 4.262124411265056, "rewards/rejected": -3.07562370300293, "step": 16923 }, { "epoch": 0.8970397264993507, "grad_norm": 45.75, "kl": 0.45311737060546875, "learning_rate": 5e-07, "logits/chosen": -34879656.0, "logits/rejected": -21717533.333333332, "logps/chosen": -669.4800415039062, "logps/rejected": -479.5362141927083, "loss": 0.1168, "rewards/chosen": 1.6817917823791504, "rewards/margins": 5.009429454803467, "rewards/rejected": -3.3276376724243164, "step": 16924 }, { "epoch": 0.8970927305011528, "grad_norm": 61.0, "kl": 0.3143320083618164, "learning_rate": 5e-07, "logits/chosen": 1421572.0, "logits/rejected": -15188862.0, "logps/chosen": -318.07232666015625, "logps/rejected": -151.38217163085938, "loss": 0.274, "rewards/chosen": 0.630469560623169, "rewards/margins": 3.1064810752868652, "rewards/rejected": -2.4760115146636963, "step": 16925 }, { "epoch": 0.897145734502955, "grad_norm": 48.75, "kl": 0.6729717254638672, "learning_rate": 5e-07, "logits/chosen": -18164392.0, "logits/rejected": -6524753.6, "logps/chosen": -428.9684244791667, "logps/rejected": -277.7702880859375, "loss": 0.1309, "rewards/chosen": 1.6767635345458984, "rewards/margins": 4.036650466918945, "rewards/rejected": -2.3598869323730467, "step": 16926 }, { "epoch": 0.8971987385047571, "grad_norm": 46.75, "kl": 0.8782234191894531, "learning_rate": 5e-07, "logits/chosen": -43443108.0, "logits/rejected": -4651331.5, "logps/chosen": -288.7909240722656, "logps/rejected": -275.83660888671875, "loss": 0.3265, "rewards/chosen": -0.041544534265995026, "rewards/margins": 2.7191449627280235, "rewards/rejected": -2.7606894969940186, "step": 16927 }, { "epoch": 0.8972517425065593, "grad_norm": 44.5, "kl": 1.6028976440429688, "learning_rate": 5e-07, "logits/chosen": -3350722.0, "logits/rejected": -22549121.6, "logps/chosen": -99.9241943359375, "logps/rejected": -213.1283447265625, "loss": 0.2163, "rewards/chosen": 0.8788503805796305, "rewards/margins": 3.3284006277720133, "rewards/rejected": -2.449550247192383, "step": 16928 }, { "epoch": 0.8973047465083613, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 222617312.0, "logits/rejected": -25215245.714285713, "logps/chosen": -812.5533447265625, "logps/rejected": -246.0428466796875, "loss": 0.178, "rewards/chosen": -0.503887951374054, "rewards/margins": 2.442095935344696, "rewards/rejected": -2.94598388671875, "step": 16929 }, { "epoch": 0.8973577505101635, "grad_norm": 46.75, "kl": 0.5727224349975586, "learning_rate": 5e-07, "logits/chosen": -54125536.0, "logits/rejected": -24071070.0, "logps/chosen": -209.25277709960938, "logps/rejected": -392.61376953125, "loss": 0.3256, "rewards/chosen": -0.022419637069106102, "rewards/margins": 3.120379501953721, "rewards/rejected": -3.142799139022827, "step": 16930 }, { "epoch": 0.8974107545119656, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57700944.0, "logits/rejected": -28318616.0, "logps/chosen": -315.71868896484375, "logps/rejected": -408.0950113932292, "loss": 0.1963, "rewards/chosen": 0.35329437255859375, "rewards/margins": 2.794325033823649, "rewards/rejected": -2.441030661265055, "step": 16931 }, { "epoch": 0.8974637585137678, "grad_norm": 53.0, "kl": 5.342510223388672, "learning_rate": 5e-07, "logits/chosen": -71012966.4, "logits/rejected": -5197121.333333333, "logps/chosen": -619.48779296875, "logps/rejected": -161.922607421875, "loss": 0.3025, "rewards/chosen": 1.2516807556152343, "rewards/margins": 2.4384209156036376, "rewards/rejected": -1.1867401599884033, "step": 16932 }, { "epoch": 0.8975167625155699, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9220014.0, "logits/rejected": -13309902.0, "logps/chosen": -222.61004638671875, "logps/rejected": -427.8899841308594, "loss": 0.1857, "rewards/chosen": 0.758885383605957, "rewards/margins": 4.8316192626953125, "rewards/rejected": -4.0727338790893555, "step": 16933 }, { "epoch": 0.8975697665173721, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16369596.0, "logits/rejected": 2186006.8, "logps/chosen": -507.8973795572917, "logps/rejected": -176.194873046875, "loss": 0.2935, "rewards/chosen": 0.4314951499303182, "rewards/margins": 2.481301744778951, "rewards/rejected": -2.049806594848633, "step": 16934 }, { "epoch": 0.8976227705191742, "grad_norm": 29.375, "kl": 0.7639760971069336, "learning_rate": 5e-07, "logits/chosen": -12322962.666666666, "logits/rejected": -45519641.6, "logps/chosen": -117.91064453125, "logps/rejected": -494.369970703125, "loss": 0.1338, "rewards/chosen": 1.1446603139241536, "rewards/margins": 4.3148906071980795, "rewards/rejected": -3.1702302932739257, "step": 16935 }, { "epoch": 0.8976757745209764, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76240416.0, "logits/rejected": -33007232.0, "logps/chosen": -553.703857421875, "logps/rejected": -377.5168701171875, "loss": 0.2545, "rewards/chosen": 0.5989075104395548, "rewards/margins": 2.467012063662211, "rewards/rejected": -1.8681045532226563, "step": 16936 }, { "epoch": 0.8977287785227784, "grad_norm": 24.0, "kl": 1.3452510833740234, "learning_rate": 5e-07, "logits/chosen": 6829082.5, "logits/rejected": -28036208.0, "logps/chosen": -18.12399673461914, "logps/rejected": -340.9713134765625, "loss": 0.0972, "rewards/chosen": 1.3137567043304443, "rewards/margins": 4.404304265975952, "rewards/rejected": -3.090547561645508, "step": 16937 }, { "epoch": 0.8977817825245806, "grad_norm": 50.75, "kl": 5.362070083618164, "learning_rate": 5e-07, "logits/chosen": 2941300.6, "logits/rejected": -4191803.6666666665, "logps/chosen": -289.5862548828125, "logps/rejected": -112.2527567545573, "loss": 0.3275, "rewards/chosen": 1.439226245880127, "rewards/margins": 2.7148508389790855, "rewards/rejected": -1.2756245930989583, "step": 16938 }, { "epoch": 0.8978347865263827, "grad_norm": 44.5, "kl": 1.8871631622314453, "learning_rate": 5e-07, "logits/chosen": -46275702.4, "logits/rejected": -44689277.333333336, "logps/chosen": -420.13154296875, "logps/rejected": -425.7652994791667, "loss": 0.2837, "rewards/chosen": 0.642479419708252, "rewards/margins": 3.744210402170817, "rewards/rejected": -3.101730982462565, "step": 16939 }, { "epoch": 0.8978877905281849, "grad_norm": 46.75, "kl": 4.9562835693359375, "learning_rate": 5e-07, "logits/chosen": -9881952.0, "logits/rejected": 2203325.75, "logps/chosen": -183.828125, "logps/rejected": -254.00332641601562, "loss": 0.4209, "rewards/chosen": 0.6671051297869001, "rewards/margins": 3.9497741971697127, "rewards/rejected": -3.2826690673828125, "step": 16940 }, { "epoch": 0.897940794529987, "grad_norm": 37.25, "kl": 0.9308643341064453, "learning_rate": 5e-07, "logits/chosen": -54991396.0, "logits/rejected": -65542584.0, "logps/chosen": -259.7441101074219, "logps/rejected": -407.09716796875, "loss": 0.2338, "rewards/chosen": 0.47900155186653137, "rewards/margins": 3.2265868484973907, "rewards/rejected": -2.7475852966308594, "step": 16941 }, { "epoch": 0.8979937985317892, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39005037.333333336, "logits/rejected": -27752086.4, "logps/chosen": -225.20849609375, "logps/rejected": -339.4757080078125, "loss": 0.2011, "rewards/chosen": 0.6193905671437582, "rewards/margins": 3.3369213898976646, "rewards/rejected": -2.7175308227539063, "step": 16942 }, { "epoch": 0.8980468025335913, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33921820.8, "logits/rejected": -4715523.333333333, "logps/chosen": -323.9677734375, "logps/rejected": -283.72914632161456, "loss": 0.2843, "rewards/chosen": 0.77971510887146, "rewards/margins": 2.101767905553182, "rewards/rejected": -1.322052796681722, "step": 16943 }, { "epoch": 0.8980998065353935, "grad_norm": 47.75, "kl": 0.7618370056152344, "learning_rate": 5e-07, "logits/chosen": -14256451.2, "logits/rejected": -29609037.333333332, "logps/chosen": -335.86142578125, "logps/rejected": -296.06825764973956, "loss": 0.273, "rewards/chosen": 0.7066168308258056, "rewards/margins": 3.9063241799672443, "rewards/rejected": -3.199707349141439, "step": 16944 }, { "epoch": 0.8981528105371955, "grad_norm": 46.0, "kl": 0.6241073608398438, "learning_rate": 5e-07, "logits/chosen": -42689430.4, "logits/rejected": -26123682.666666668, "logps/chosen": -422.60302734375, "logps/rejected": -180.37003580729166, "loss": 0.2608, "rewards/chosen": 0.7207655429840087, "rewards/margins": 3.820141585667928, "rewards/rejected": -3.0993760426839194, "step": 16945 }, { "epoch": 0.8982058145389977, "grad_norm": 54.25, "kl": 0.7242517471313477, "learning_rate": 5e-07, "logits/chosen": -28766536.0, "logits/rejected": -28144588.8, "logps/chosen": -213.36995442708334, "logps/rejected": -175.40433349609376, "loss": 0.3914, "rewards/chosen": -0.35071420669555664, "rewards/margins": 0.8411808013916016, "rewards/rejected": -1.1918950080871582, "step": 16946 }, { "epoch": 0.8982588185407998, "grad_norm": 40.0, "kl": 0.1995086669921875, "learning_rate": 5e-07, "logits/chosen": -18754138.666666668, "logits/rejected": -23543918.4, "logps/chosen": -123.66133626302083, "logps/rejected": -306.876611328125, "loss": 0.2437, "rewards/chosen": 0.26803545157114667, "rewards/margins": 3.2610625664393105, "rewards/rejected": -2.993027114868164, "step": 16947 }, { "epoch": 0.898311822542602, "grad_norm": 19.875, "kl": 2.609005928039551, "learning_rate": 5e-07, "logits/chosen": -3543178.0, "logits/rejected": -72843027.2, "logps/chosen": -46.29766337076823, "logps/rejected": -529.29462890625, "loss": 0.2052, "rewards/chosen": 0.29898611704508465, "rewards/margins": 4.256514040629069, "rewards/rejected": -3.957527923583984, "step": 16948 }, { "epoch": 0.8983648265444041, "grad_norm": 56.0, "kl": 5.09065055847168, "learning_rate": 5e-07, "logits/chosen": -77121200.0, "logits/rejected": -76539072.0, "logps/chosen": -664.9520670572916, "logps/rejected": -719.025390625, "loss": 0.1972, "rewards/chosen": 0.9924280643463135, "rewards/margins": 4.267942285537719, "rewards/rejected": -3.2755142211914063, "step": 16949 }, { "epoch": 0.8984178305462063, "grad_norm": 37.75, "kl": 0.08905792236328125, "learning_rate": 5e-07, "logits/chosen": -81592496.0, "logits/rejected": -51076512.0, "logps/chosen": -726.6490478515625, "logps/rejected": -430.3720397949219, "loss": 0.2039, "rewards/chosen": 1.3238131999969482, "rewards/margins": 3.984929323196411, "rewards/rejected": -2.661116123199463, "step": 16950 }, { "epoch": 0.8984708345480084, "grad_norm": 49.25, "kl": 0.5827188491821289, "learning_rate": 5e-07, "logits/chosen": -34109816.0, "logits/rejected": -8004025.0, "logps/chosen": -393.4873453776042, "logps/rejected": -223.94476318359375, "loss": 0.3009, "rewards/chosen": 1.0683097839355469, "rewards/margins": 2.3944251537323, "rewards/rejected": -1.326115369796753, "step": 16951 }, { "epoch": 0.8985238385498105, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27232084.0, "logits/rejected": -15918928.0, "logps/chosen": -297.85614013671875, "logps/rejected": -370.9414978027344, "loss": 0.2549, "rewards/chosen": 0.47277146577835083, "rewards/margins": 3.1765089631080627, "rewards/rejected": -2.703737497329712, "step": 16952 }, { "epoch": 0.8985768425516126, "grad_norm": 30.5, "kl": 3.0848846435546875, "learning_rate": 5e-07, "logits/chosen": -12024070.0, "logits/rejected": -27313786.666666668, "logps/chosen": -379.1051025390625, "logps/rejected": -209.53922526041666, "loss": 0.123, "rewards/chosen": 2.3330750465393066, "rewards/margins": 5.732288837432861, "rewards/rejected": -3.3992137908935547, "step": 16953 }, { "epoch": 0.8986298465534148, "grad_norm": 46.5, "kl": 1.884347915649414, "learning_rate": 5e-07, "logits/chosen": -40169209.6, "logits/rejected": -8995742.666666666, "logps/chosen": -143.9257080078125, "logps/rejected": -322.38592529296875, "loss": 0.3616, "rewards/chosen": 0.35369443893432617, "rewards/margins": 2.4964054425557456, "rewards/rejected": -2.1427110036214194, "step": 16954 }, { "epoch": 0.8986828505552169, "grad_norm": 62.25, "kl": 1.8394174575805664, "learning_rate": 5e-07, "logits/chosen": -24909856.0, "logits/rejected": -6821562.5, "logps/chosen": -318.5489501953125, "logps/rejected": -197.42041015625, "loss": 0.3778, "rewards/chosen": 0.31787002086639404, "rewards/margins": 1.5587035417556763, "rewards/rejected": -1.2408335208892822, "step": 16955 }, { "epoch": 0.8987358545570191, "grad_norm": 38.75, "kl": 0.9585695266723633, "learning_rate": 5e-07, "logits/chosen": -40771440.0, "logits/rejected": -9129562.4, "logps/chosen": -234.52581787109375, "logps/rejected": -289.867333984375, "loss": 0.1751, "rewards/chosen": 0.9888060887654623, "rewards/margins": 3.9781405766805014, "rewards/rejected": -2.989334487915039, "step": 16956 }, { "epoch": 0.8987888585588212, "grad_norm": 43.0, "kl": 0.041423797607421875, "learning_rate": 5e-07, "logits/chosen": -58844000.0, "logits/rejected": -19284688.0, "logps/chosen": -207.40264892578125, "logps/rejected": -285.5767578125, "loss": 0.2464, "rewards/chosen": 0.435519536336263, "rewards/margins": 3.1283454259236656, "rewards/rejected": -2.6928258895874024, "step": 16957 }, { "epoch": 0.8988418625606234, "grad_norm": 159.0, "kl": 1.2376899719238281, "learning_rate": 5e-07, "logits/chosen": -10646204.8, "logits/rejected": -26431104.0, "logps/chosen": -493.73095703125, "logps/rejected": -211.2348429361979, "loss": 0.3125, "rewards/chosen": 0.8806549072265625, "rewards/margins": 2.5672651290893556, "rewards/rejected": -1.686610221862793, "step": 16958 }, { "epoch": 0.8988948665624255, "grad_norm": 29.375, "kl": 0.544102668762207, "learning_rate": 5e-07, "logits/chosen": -13204989.0, "logits/rejected": -25685960.0, "logps/chosen": -140.79513549804688, "logps/rejected": -159.06637573242188, "loss": 0.2522, "rewards/chosen": 0.6634011268615723, "rewards/margins": 2.818134069442749, "rewards/rejected": -2.1547329425811768, "step": 16959 }, { "epoch": 0.8989478705642276, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91689360.0, "logits/rejected": -17752606.85714286, "logps/chosen": -476.7005920410156, "logps/rejected": -389.3452845982143, "loss": 0.2295, "rewards/chosen": -0.4869934022426605, "rewards/margins": 1.4152442174298423, "rewards/rejected": -1.9022376196725028, "step": 16960 }, { "epoch": 0.8990008745660297, "grad_norm": 46.75, "kl": 1.9974746704101562, "learning_rate": 5e-07, "logits/chosen": -29394080.0, "logits/rejected": 5183522.0, "logps/chosen": -291.9581705729167, "logps/rejected": -721.12744140625, "loss": 0.4183, "rewards/chosen": 0.19849018255869547, "rewards/margins": 3.1127663056055703, "rewards/rejected": -2.914276123046875, "step": 16961 }, { "epoch": 0.8990538785678319, "grad_norm": 39.25, "kl": 3.460906982421875, "learning_rate": 5e-07, "logits/chosen": 3781855.0, "logits/rejected": -43887876.0, "logps/chosen": -187.680419921875, "logps/rejected": -616.8955078125, "loss": 0.3244, "rewards/chosen": 0.6099241971969604, "rewards/margins": 5.162732481956482, "rewards/rejected": -4.5528082847595215, "step": 16962 }, { "epoch": 0.899106882569634, "grad_norm": 39.5, "kl": 3.521416664123535, "learning_rate": 5e-07, "logits/chosen": -5921376.0, "logits/rejected": -37110485.333333336, "logps/chosen": -87.58607177734375, "logps/rejected": -404.4499918619792, "loss": 0.3414, "rewards/chosen": 0.34496047496795657, "rewards/margins": 3.5223044792811073, "rewards/rejected": -3.177344004313151, "step": 16963 }, { "epoch": 0.8991598865714362, "grad_norm": 61.75, "kl": 2.665834426879883, "learning_rate": 5e-07, "logits/chosen": -14243417.333333334, "logits/rejected": -11742847.0, "logps/chosen": -356.7327067057292, "logps/rejected": -114.52835083007812, "loss": 0.3275, "rewards/chosen": 0.6522977352142334, "rewards/margins": 3.146247148513794, "rewards/rejected": -2.4939494132995605, "step": 16964 }, { "epoch": 0.8992128905732383, "grad_norm": 75.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -98048256.0, "logits/rejected": -25108649.14285714, "logps/chosen": -191.1349639892578, "logps/rejected": -307.68687220982144, "loss": 0.2393, "rewards/chosen": 0.4254654049873352, "rewards/margins": 2.2333488038608005, "rewards/rejected": -1.8078833988734655, "step": 16965 }, { "epoch": 0.8992658945750405, "grad_norm": 48.25, "kl": 0.9998030662536621, "learning_rate": 5e-07, "logits/chosen": -21800028.8, "logits/rejected": -25500050.666666668, "logps/chosen": -292.07177734375, "logps/rejected": -967.1114908854166, "loss": 0.2462, "rewards/chosen": 0.987796688079834, "rewards/margins": 7.228190644582113, "rewards/rejected": -6.240393956502278, "step": 16966 }, { "epoch": 0.8993188985768426, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62013242.666666664, "logits/rejected": -14675520.0, "logps/chosen": -467.4514973958333, "logps/rejected": -259.17216796875, "loss": 0.2828, "rewards/chosen": 0.13988037904103598, "rewards/margins": 2.063250740369161, "rewards/rejected": -1.923370361328125, "step": 16967 }, { "epoch": 0.8993719025786446, "grad_norm": 60.0, "kl": 3.3653335571289062, "learning_rate": 5e-07, "logits/chosen": -49349428.0, "logits/rejected": -7329312.0, "logps/chosen": -241.06881713867188, "logps/rejected": -446.654052734375, "loss": 0.2635, "rewards/chosen": 0.8557672500610352, "rewards/margins": 3.8806657791137695, "rewards/rejected": -3.0248985290527344, "step": 16968 }, { "epoch": 0.8994249065804468, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28492376.0, "logits/rejected": -33126976.0, "logps/chosen": -398.6630452473958, "logps/rejected": -374.045947265625, "loss": 0.1289, "rewards/chosen": 1.367638111114502, "rewards/margins": 4.047745227813721, "rewards/rejected": -2.6801071166992188, "step": 16969 }, { "epoch": 0.8994779105822489, "grad_norm": 40.25, "kl": 1.5665464401245117, "learning_rate": 5e-07, "logits/chosen": -16552241.0, "logits/rejected": -12266238.0, "logps/chosen": -240.6900634765625, "logps/rejected": -218.5494842529297, "loss": 0.2378, "rewards/chosen": 0.9307640790939331, "rewards/margins": 2.681885242462158, "rewards/rejected": -1.751121163368225, "step": 16970 }, { "epoch": 0.8995309145840511, "grad_norm": 58.5, "kl": 2.793487548828125, "learning_rate": 5e-07, "logits/chosen": -13667310.4, "logits/rejected": -6571026.666666667, "logps/chosen": -844.698828125, "logps/rejected": -314.55592854817706, "loss": 0.401, "rewards/chosen": 0.5586165428161621, "rewards/margins": 2.2043511708577475, "rewards/rejected": -1.6457346280415852, "step": 16971 }, { "epoch": 0.8995839185858532, "grad_norm": 34.75, "kl": 0.7734642028808594, "learning_rate": 5e-07, "logits/chosen": -13250663.0, "logits/rejected": -37493120.0, "logps/chosen": -122.98365783691406, "logps/rejected": -347.9964599609375, "loss": 0.2884, "rewards/chosen": 0.14121311902999878, "rewards/margins": 2.953336179256439, "rewards/rejected": -2.8121230602264404, "step": 16972 }, { "epoch": 0.8996369225876554, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46844005.333333336, "logits/rejected": -2468739.25, "logps/chosen": -336.073486328125, "logps/rejected": -142.158935546875, "loss": 0.4493, "rewards/chosen": -0.1276664137840271, "rewards/margins": 1.2476266026496887, "rewards/rejected": -1.3752930164337158, "step": 16973 }, { "epoch": 0.8996899265894575, "grad_norm": 40.25, "kl": 1.6891021728515625, "learning_rate": 5e-07, "logits/chosen": -32036357.333333332, "logits/rejected": -55999584.0, "logps/chosen": -172.45452880859375, "logps/rejected": -355.5938720703125, "loss": 0.1897, "rewards/chosen": 0.9564207394917806, "rewards/margins": 3.254123147328695, "rewards/rejected": -2.297702407836914, "step": 16974 }, { "epoch": 0.8997429305912596, "grad_norm": 69.5, "kl": 1.6219482421875, "learning_rate": 5e-07, "logits/chosen": -19471258.0, "logits/rejected": -35450164.0, "logps/chosen": -714.0586547851562, "logps/rejected": -377.4759216308594, "loss": 0.2438, "rewards/chosen": 0.5145221948623657, "rewards/margins": 3.284918427467346, "rewards/rejected": -2.7703962326049805, "step": 16975 }, { "epoch": 0.8997959345930617, "grad_norm": 39.75, "kl": 1.072793960571289, "learning_rate": 5e-07, "logits/chosen": -81418968.0, "logits/rejected": -20886684.0, "logps/chosen": -254.3209228515625, "logps/rejected": -467.93951416015625, "loss": 0.2731, "rewards/chosen": -0.0036066919565200806, "rewards/margins": 4.44711409509182, "rewards/rejected": -4.45072078704834, "step": 16976 }, { "epoch": 0.8998489385948639, "grad_norm": 72.5, "kl": 1.792989730834961, "learning_rate": 5e-07, "logits/chosen": -2815965.5, "logits/rejected": -27485116.0, "logps/chosen": -155.05592346191406, "logps/rejected": -223.63787841796875, "loss": 0.2897, "rewards/chosen": 0.7303996086120605, "rewards/margins": 3.1900434494018555, "rewards/rejected": -2.459643840789795, "step": 16977 }, { "epoch": 0.899901942596666, "grad_norm": 38.25, "kl": 0.9999914169311523, "learning_rate": 5e-07, "logits/chosen": 1127116.875, "logits/rejected": 56155856.0, "logps/chosen": -175.90695190429688, "logps/rejected": -392.9647216796875, "loss": 0.2774, "rewards/chosen": 0.099392369389534, "rewards/margins": 2.9831451922655106, "rewards/rejected": -2.8837528228759766, "step": 16978 }, { "epoch": 0.8999549465984682, "grad_norm": 53.25, "kl": 2.2523860931396484, "learning_rate": 5e-07, "logits/chosen": -27180803.2, "logits/rejected": -15170174.666666666, "logps/chosen": -190.03472900390625, "logps/rejected": -199.17171223958334, "loss": 0.4217, "rewards/chosen": 0.5503677368164063, "rewards/margins": 1.5500357786814372, "rewards/rejected": -0.9996680418650309, "step": 16979 }, { "epoch": 0.9000079506002703, "grad_norm": 36.75, "kl": 0.278045654296875, "learning_rate": 5e-07, "logits/chosen": -23305318.4, "logits/rejected": -36461234.666666664, "logps/chosen": -289.26064453125, "logps/rejected": -599.0987955729166, "loss": 0.2017, "rewards/chosen": 1.110384178161621, "rewards/margins": 3.9841698328653967, "rewards/rejected": -2.873785654703776, "step": 16980 }, { "epoch": 0.9000609546020725, "grad_norm": 23.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -25470178.0, "logps/rejected": -344.9355163574219, "loss": 0.1136, "rewards/rejected": -3.1262545585632324, "step": 16981 }, { "epoch": 0.9001139586038746, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38603432.0, "logits/rejected": 5625542.857142857, "logps/chosen": -746.0452880859375, "logps/rejected": -484.91357421875, "loss": 0.0952, "rewards/chosen": 0.710711658000946, "rewards/margins": 3.7371695808001926, "rewards/rejected": -3.0264579227992465, "step": 16982 }, { "epoch": 0.9001669626056767, "grad_norm": 46.0, "kl": 0.029022216796875, "learning_rate": 5e-07, "logits/chosen": -7914021.333333333, "logits/rejected": -112692889.6, "logps/chosen": -135.71888224283853, "logps/rejected": -401.474169921875, "loss": 0.283, "rewards/chosen": 0.04158222675323486, "rewards/margins": 2.089691090583801, "rewards/rejected": -2.0481088638305662, "step": 16983 }, { "epoch": 0.9002199666074788, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51170192.0, "logits/rejected": 36366284.0, "logps/chosen": -338.33538818359375, "logps/rejected": -377.3375549316406, "loss": 0.2698, "rewards/chosen": 0.471945196390152, "rewards/margins": 2.71422877907753, "rewards/rejected": -2.242283582687378, "step": 16984 }, { "epoch": 0.900272970609281, "grad_norm": 37.0, "kl": 3.01605224609375, "learning_rate": 5e-07, "logits/chosen": -10691157.0, "logits/rejected": -9459371.0, "logps/chosen": -254.0213623046875, "logps/rejected": -212.8648681640625, "loss": 0.2404, "rewards/chosen": 1.1678762435913086, "rewards/margins": 3.5940637588500977, "rewards/rejected": -2.426187515258789, "step": 16985 }, { "epoch": 0.9003259746110831, "grad_norm": 41.75, "kl": 0.13858413696289062, "learning_rate": 5e-07, "logits/chosen": -26790288.0, "logits/rejected": -43640084.0, "logps/chosen": -320.3870849609375, "logps/rejected": -347.2723388671875, "loss": 0.2258, "rewards/chosen": 0.8450508713722229, "rewards/margins": 3.515215218067169, "rewards/rejected": -2.6701643466949463, "step": 16986 }, { "epoch": 0.9003789786128853, "grad_norm": 50.75, "kl": 2.9356117248535156, "learning_rate": 5e-07, "logits/chosen": -19001364.8, "logits/rejected": -6888128.0, "logps/chosen": -252.5610107421875, "logps/rejected": -476.4217122395833, "loss": 0.3196, "rewards/chosen": 0.9568233489990234, "rewards/margins": 3.2451780637105307, "rewards/rejected": -2.2883547147115073, "step": 16987 }, { "epoch": 0.9004319826146874, "grad_norm": 48.0, "kl": 1.01416015625, "learning_rate": 5e-07, "logits/chosen": -49107312.0, "logits/rejected": -31904803.2, "logps/chosen": -660.317138671875, "logps/rejected": -354.110791015625, "loss": 0.1678, "rewards/chosen": 1.3169007301330566, "rewards/margins": 3.7364996910095214, "rewards/rejected": -2.4195989608764648, "step": 16988 }, { "epoch": 0.9004849866164896, "grad_norm": 29.0, "kl": 1.484720230102539, "learning_rate": 5e-07, "logits/chosen": 9923084.0, "logits/rejected": -23728939.2, "logps/chosen": -25.356300354003906, "logps/rejected": -157.710009765625, "loss": 0.2942, "rewards/chosen": 0.6026680072148641, "rewards/margins": 2.2548559268315636, "rewards/rejected": -1.6521879196166993, "step": 16989 }, { "epoch": 0.9005379906182917, "grad_norm": 29.25, "kl": 1.525430679321289, "learning_rate": 5e-07, "logits/chosen": -5629735.5, "logits/rejected": -29535474.0, "logps/chosen": -208.27908325195312, "logps/rejected": -149.4695281982422, "loss": 0.2258, "rewards/chosen": 0.6449829936027527, "rewards/margins": 4.696635901927948, "rewards/rejected": -4.051652908325195, "step": 16990 }, { "epoch": 0.9005909946200938, "grad_norm": 50.75, "kl": 0.8315353393554688, "learning_rate": 5e-07, "logits/chosen": -16269148.0, "logits/rejected": -38856600.0, "logps/chosen": -241.95469665527344, "logps/rejected": -507.98468017578125, "loss": 0.2827, "rewards/chosen": 0.5082557797431946, "rewards/margins": 4.286814033985138, "rewards/rejected": -3.7785582542419434, "step": 16991 }, { "epoch": 0.9006439986218959, "grad_norm": 58.75, "kl": 1.9583854675292969, "learning_rate": 5e-07, "logits/chosen": -41484921.6, "logits/rejected": 132525813.33333333, "logps/chosen": -342.0512451171875, "logps/rejected": -852.0255533854166, "loss": 0.304, "rewards/chosen": 0.6213575839996338, "rewards/margins": 5.465199136734009, "rewards/rejected": -4.843841552734375, "step": 16992 }, { "epoch": 0.9006970026236981, "grad_norm": 40.5, "kl": 1.8388729095458984, "learning_rate": 5e-07, "logits/chosen": -11700996.8, "logits/rejected": -30361056.0, "logps/chosen": -212.8184326171875, "logps/rejected": -309.8518473307292, "loss": 0.2626, "rewards/chosen": 0.9082430839538574, "rewards/margins": 3.472648525238037, "rewards/rejected": -2.5644054412841797, "step": 16993 }, { "epoch": 0.9007500066255002, "grad_norm": 38.25, "kl": 1.2216377258300781, "learning_rate": 5e-07, "logits/chosen": -2034981.0, "logits/rejected": -21590235.42857143, "logps/chosen": -44.29469299316406, "logps/rejected": -319.72694614955356, "loss": 0.1324, "rewards/chosen": 1.2524757385253906, "rewards/margins": 4.672486441476004, "rewards/rejected": -3.4200107029506137, "step": 16994 }, { "epoch": 0.9008030106273024, "grad_norm": 49.0, "kl": 3.0526771545410156, "learning_rate": 5e-07, "logits/chosen": -3079344.0, "logits/rejected": -101856424.0, "logps/chosen": -185.4105428059896, "logps/rejected": -289.0369873046875, "loss": 0.4314, "rewards/chosen": 0.28116538127263385, "rewards/margins": 1.859018345673879, "rewards/rejected": -1.5778529644012451, "step": 16995 }, { "epoch": 0.9008560146291045, "grad_norm": 40.5, "kl": 0.5051612854003906, "learning_rate": 5e-07, "logits/chosen": -8100519.333333333, "logits/rejected": -3251735.6, "logps/chosen": -271.3076578776042, "logps/rejected": -232.1415771484375, "loss": 0.213, "rewards/chosen": 0.9126152992248535, "rewards/margins": 2.762104320526123, "rewards/rejected": -1.8494890213012696, "step": 16996 }, { "epoch": 0.9009090186309067, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19298210.666666668, "logits/rejected": -8780709.6, "logps/chosen": -438.3292236328125, "logps/rejected": -191.03359375, "loss": 0.1789, "rewards/chosen": 1.118651310602824, "rewards/margins": 3.842414013544719, "rewards/rejected": -2.7237627029418947, "step": 16997 }, { "epoch": 0.9009620226327087, "grad_norm": 47.0, "kl": 4.226751327514648, "learning_rate": 5e-07, "logits/chosen": -35927836.8, "logits/rejected": -22099218.666666668, "logps/chosen": -415.424951171875, "logps/rejected": -321.1086832682292, "loss": 0.2898, "rewards/chosen": 0.9107183456420899, "rewards/margins": 3.591342989603678, "rewards/rejected": -2.6806246439615884, "step": 16998 }, { "epoch": 0.9010150266345109, "grad_norm": 50.0, "kl": 2.0231761932373047, "learning_rate": 5e-07, "logits/chosen": -23172524.8, "logits/rejected": -37726002.666666664, "logps/chosen": -220.516650390625, "logps/rejected": -378.2838134765625, "loss": 0.3322, "rewards/chosen": 0.4064479827880859, "rewards/margins": 3.658893839518229, "rewards/rejected": -3.252445856730143, "step": 16999 }, { "epoch": 0.901068030636313, "grad_norm": 43.25, "kl": 0.12923526763916016, "learning_rate": 5e-07, "logits/chosen": -4022163.5, "logits/rejected": -26909037.333333332, "logps/chosen": -302.5679626464844, "logps/rejected": -331.3437093098958, "loss": 0.2125, "rewards/chosen": 0.799725353717804, "rewards/margins": 2.7351933916409807, "rewards/rejected": -1.935468037923177, "step": 17000 }, { "epoch": 0.9011210346381152, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14665574.4, "logits/rejected": -14121816.0, "logps/chosen": -183.8563720703125, "logps/rejected": -285.90639241536456, "loss": 0.3265, "rewards/chosen": 0.4677577972412109, "rewards/margins": 2.6954373677571617, "rewards/rejected": -2.2276795705159507, "step": 17001 }, { "epoch": 0.9011740386399173, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56381560.0, "logits/rejected": -14949926.857142856, "logps/chosen": -422.0609130859375, "logps/rejected": -244.90452357700892, "loss": 0.1934, "rewards/chosen": 0.087188720703125, "rewards/margins": 2.2822491782052174, "rewards/rejected": -2.1950604575020924, "step": 17002 }, { "epoch": 0.9012270426417195, "grad_norm": 49.5, "kl": 3.227590560913086, "learning_rate": 5e-07, "logits/chosen": 19279529.6, "logits/rejected": -42924576.0, "logps/chosen": -239.39189453125, "logps/rejected": -498.4984537760417, "loss": 0.3824, "rewards/chosen": 0.17262620925903321, "rewards/margins": 3.241365146636963, "rewards/rejected": -3.0687389373779297, "step": 17003 }, { "epoch": 0.9012800466435216, "grad_norm": 57.5, "kl": 0.29663991928100586, "learning_rate": 5e-07, "logits/chosen": 8625374.0, "logits/rejected": -37669568.0, "logps/chosen": -205.2854766845703, "logps/rejected": -448.3322448730469, "loss": 0.2543, "rewards/chosen": 0.31966710090637207, "rewards/margins": 3.116290330886841, "rewards/rejected": -2.7966232299804688, "step": 17004 }, { "epoch": 0.9013330506453238, "grad_norm": 56.75, "kl": 3.41506290435791, "learning_rate": 5e-07, "logits/chosen": -19039705.6, "logits/rejected": -16320344.0, "logps/chosen": -252.083544921875, "logps/rejected": -425.0166015625, "loss": 0.2658, "rewards/chosen": 0.9497398376464844, "rewards/margins": 3.90920041402181, "rewards/rejected": -2.9594605763753257, "step": 17005 }, { "epoch": 0.9013860546471258, "grad_norm": 64.5, "kl": 0.5946884155273438, "learning_rate": 5e-07, "logits/chosen": -22168228.0, "logits/rejected": 27721906.0, "logps/chosen": -472.23114013671875, "logps/rejected": -270.9272155761719, "loss": 0.2774, "rewards/chosen": 0.7992423176765442, "rewards/margins": 2.3950677514076233, "rewards/rejected": -1.595825433731079, "step": 17006 }, { "epoch": 0.901439058648928, "grad_norm": 43.75, "kl": 4.526239395141602, "learning_rate": 5e-07, "logits/chosen": -6042414.0, "logits/rejected": -52228464.0, "logps/chosen": -171.01443481445312, "logps/rejected": -403.0405578613281, "loss": 0.3581, "rewards/chosen": 0.4536358416080475, "rewards/margins": 2.8208116590976715, "rewards/rejected": -2.367175817489624, "step": 17007 }, { "epoch": 0.9014920626507301, "grad_norm": 51.0, "kl": 1.526310920715332, "learning_rate": 5e-07, "logits/chosen": -19338186.666666668, "logits/rejected": -9142056.0, "logps/chosen": -396.3426106770833, "logps/rejected": -225.64718627929688, "loss": 0.3148, "rewards/chosen": 0.694595734278361, "rewards/margins": 3.6130336920420327, "rewards/rejected": -2.918437957763672, "step": 17008 }, { "epoch": 0.9015450666525323, "grad_norm": 44.5, "kl": 2.8844528198242188, "learning_rate": 5e-07, "logits/chosen": -19950872.0, "logits/rejected": -34360644.0, "logps/chosen": -204.3646697998047, "logps/rejected": -524.5157470703125, "loss": 0.2694, "rewards/chosen": 0.28649991750717163, "rewards/margins": 3.8467277884483337, "rewards/rejected": -3.560227870941162, "step": 17009 }, { "epoch": 0.9015980706543344, "grad_norm": 55.25, "kl": 2.343709945678711, "learning_rate": 5e-07, "logits/chosen": 4025943.6666666665, "logits/rejected": -14073240.0, "logps/chosen": -220.8295694986979, "logps/rejected": -332.656005859375, "loss": 0.3008, "rewards/chosen": 0.7350122133890787, "rewards/margins": 3.199711004892985, "rewards/rejected": -2.4646987915039062, "step": 17010 }, { "epoch": 0.9016510746561366, "grad_norm": 43.5, "kl": 0.3143501281738281, "learning_rate": 5e-07, "logits/chosen": -31736904.0, "logits/rejected": -33922736.0, "logps/chosen": -347.4150390625, "logps/rejected": -305.79541015625, "loss": 0.2999, "rewards/chosen": 0.27492523193359375, "rewards/margins": 3.1335947513580322, "rewards/rejected": -2.8586695194244385, "step": 17011 }, { "epoch": 0.9017040786579387, "grad_norm": 31.625, "kl": 0.4886131286621094, "learning_rate": 5e-07, "logits/chosen": 6887459.0, "logits/rejected": -4991851.333333333, "logps/chosen": -37.031707763671875, "logps/rejected": -145.29606119791666, "loss": 0.2438, "rewards/chosen": 0.08746834099292755, "rewards/margins": 2.38549442589283, "rewards/rejected": -2.2980260848999023, "step": 17012 }, { "epoch": 0.9017570826597409, "grad_norm": 45.25, "kl": 0.5775918960571289, "learning_rate": 5e-07, "logits/chosen": -30590406.0, "logits/rejected": -41483872.0, "logps/chosen": -289.1620788574219, "logps/rejected": -233.19105529785156, "loss": 0.2853, "rewards/chosen": 0.24112339317798615, "rewards/margins": 2.4493882209062576, "rewards/rejected": -2.2082648277282715, "step": 17013 }, { "epoch": 0.9018100866615429, "grad_norm": 41.75, "kl": 5.250638961791992, "learning_rate": 5e-07, "logits/chosen": -16080782.0, "logits/rejected": -10371456.0, "logps/chosen": -483.0229187011719, "logps/rejected": -211.11712646484375, "loss": 0.2218, "rewards/chosen": 1.4214677810668945, "rewards/margins": 3.8057706356048584, "rewards/rejected": -2.384302854537964, "step": 17014 }, { "epoch": 0.9018630906633451, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5611796.666666667, "logits/rejected": -13071750.4, "logps/chosen": -255.56892903645834, "logps/rejected": -214.42509765625, "loss": 0.2517, "rewards/chosen": 0.9654986063639323, "rewards/margins": 2.44980951944987, "rewards/rejected": -1.4843109130859375, "step": 17015 }, { "epoch": 0.9019160946651472, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29210352.0, "logits/rejected": -28774310.4, "logps/chosen": -285.3009847005208, "logps/rejected": -206.08212890625, "loss": 0.1932, "rewards/chosen": 1.196264425913493, "rewards/margins": 3.4289752642313642, "rewards/rejected": -2.232710838317871, "step": 17016 }, { "epoch": 0.9019690986669494, "grad_norm": 82.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39881020.8, "logits/rejected": -32442930.666666668, "logps/chosen": -248.83486328125, "logps/rejected": -244.8104248046875, "loss": 0.3208, "rewards/chosen": 0.43191866874694823, "rewards/margins": 2.16810835202535, "rewards/rejected": -1.7361896832784016, "step": 17017 }, { "epoch": 0.9020221026687515, "grad_norm": 95.5, "kl": 3.805050849914551, "learning_rate": 5e-07, "logits/chosen": -12310431.0, "logps/chosen": -322.6252136230469, "loss": 0.5172, "rewards/chosen": 0.3132597804069519, "step": 17018 }, { "epoch": 0.9020751066705536, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50873880.0, "logits/rejected": -15522810.666666666, "logps/chosen": -454.0155334472656, "logps/rejected": -249.1680908203125, "loss": 0.136, "rewards/chosen": 0.9133041501045227, "rewards/margins": 3.976874808470408, "rewards/rejected": -3.0635706583658853, "step": 17019 }, { "epoch": 0.9021281106723558, "grad_norm": 55.5, "kl": 2.932432174682617, "learning_rate": 5e-07, "logits/chosen": -68190854.4, "logits/rejected": -15680393.333333334, "logps/chosen": -387.0326171875, "logps/rejected": -271.1181233723958, "loss": 0.2581, "rewards/chosen": 1.0493736267089844, "rewards/margins": 3.8654425938924155, "rewards/rejected": -2.816068967183431, "step": 17020 }, { "epoch": 0.9021811146741578, "grad_norm": 57.0, "kl": 0.4334449768066406, "learning_rate": 5e-07, "logits/chosen": -15040753.6, "logits/rejected": -36954114.666666664, "logps/chosen": -229.7310546875, "logps/rejected": -437.6590983072917, "loss": 0.2318, "rewards/chosen": 0.6467873096466065, "rewards/margins": 5.753870312372844, "rewards/rejected": -5.107083002726237, "step": 17021 }, { "epoch": 0.90223411867596, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81078976.0, "logits/rejected": -19309537.6, "logps/chosen": -366.4707438151042, "logps/rejected": -295.586376953125, "loss": 0.2421, "rewards/chosen": 0.21202749013900757, "rewards/margins": 3.1064805388450623, "rewards/rejected": -2.8944530487060547, "step": 17022 }, { "epoch": 0.9022871226777621, "grad_norm": 78.5, "kl": 6.866604804992676, "learning_rate": 5e-07, "logits/chosen": -16798950.4, "logits/rejected": -5767369.333333333, "logps/chosen": -374.056201171875, "logps/rejected": -160.95158894856772, "loss": 0.4158, "rewards/chosen": 0.8882091522216797, "rewards/margins": 2.0516432603200276, "rewards/rejected": -1.163434108098348, "step": 17023 }, { "epoch": 0.9023401266795643, "grad_norm": 106.5, "kl": 9.00323486328125, "learning_rate": 5e-07, "logits/chosen": -16734902.0, "logits/rejected": -46630373.333333336, "logps/chosen": -1801.393310546875, "logps/rejected": -299.84104410807294, "loss": 0.2157, "rewards/chosen": 4.06396484375, "rewards/margins": 6.038260459899902, "rewards/rejected": -1.9742956161499023, "step": 17024 }, { "epoch": 0.9023931306813664, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57394949.333333336, "logits/rejected": -35825004.8, "logps/chosen": -548.7228190104166, "logps/rejected": -518.7671875, "loss": 0.1679, "rewards/chosen": 0.8945150375366211, "rewards/margins": 3.9310298919677735, "rewards/rejected": -3.0365148544311524, "step": 17025 }, { "epoch": 0.9024461346831686, "grad_norm": 52.25, "kl": 1.5784759521484375, "learning_rate": 5e-07, "logits/chosen": -42373892.0, "logits/rejected": -32788068.0, "logps/chosen": -439.1145324707031, "logps/rejected": -371.809326171875, "loss": 0.239, "rewards/chosen": 0.8261345028877258, "rewards/margins": 3.3725393414497375, "rewards/rejected": -2.5464048385620117, "step": 17026 }, { "epoch": 0.9024991386849707, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 22680340.0, "logits/rejected": -3691686.3333333335, "logps/chosen": -153.14414978027344, "logps/rejected": -372.6465657552083, "loss": 0.1664, "rewards/chosen": 0.8512194156646729, "rewards/margins": 3.244241635004679, "rewards/rejected": -2.3930222193400064, "step": 17027 }, { "epoch": 0.9025521426867729, "grad_norm": 61.75, "kl": 4.41546630859375, "learning_rate": 5e-07, "logits/chosen": 18237920.0, "logits/rejected": -17655126.0, "logps/chosen": -595.7138671875, "logps/rejected": -383.13421630859375, "loss": 0.2756, "rewards/chosen": 0.7843408584594727, "rewards/margins": 3.208272695541382, "rewards/rejected": -2.423931837081909, "step": 17028 }, { "epoch": 0.9026051466885749, "grad_norm": 51.75, "kl": 1.1011276245117188, "learning_rate": 5e-07, "logits/chosen": 4713001.0, "logits/rejected": -25499930.0, "logps/chosen": -478.08966064453125, "logps/rejected": -409.78857421875, "loss": 0.2543, "rewards/chosen": 0.914851725101471, "rewards/margins": 3.031604588031769, "rewards/rejected": -2.116752862930298, "step": 17029 }, { "epoch": 0.9026581506903771, "grad_norm": 49.75, "kl": 0.4254436492919922, "learning_rate": 5e-07, "logits/chosen": -7195201.333333333, "logits/rejected": 9963932.0, "logps/chosen": -235.11942545572916, "logps/rejected": -586.9957275390625, "loss": 0.384, "rewards/chosen": 0.14782934387524924, "rewards/margins": 2.4553154011567435, "rewards/rejected": -2.307486057281494, "step": 17030 }, { "epoch": 0.9027111546921792, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25591861.333333332, "logits/rejected": -34403619.2, "logps/chosen": -265.54978434244794, "logps/rejected": -426.9791015625, "loss": 0.2627, "rewards/chosen": -0.10625714063644409, "rewards/margins": 2.3638386845588686, "rewards/rejected": -2.4700958251953127, "step": 17031 }, { "epoch": 0.9027641586939814, "grad_norm": 50.0, "kl": 2.550708770751953, "learning_rate": 5e-07, "logits/chosen": -16076190.666666666, "logits/rejected": -26262072.0, "logps/chosen": -239.9504597981771, "logps/rejected": -266.9932861328125, "loss": 0.3022, "rewards/chosen": 0.9596905708312988, "rewards/margins": 2.8917202949523926, "rewards/rejected": -1.9320297241210938, "step": 17032 }, { "epoch": 0.9028171626957835, "grad_norm": 38.25, "kl": 0.09456634521484375, "learning_rate": 5e-07, "logits/chosen": -25777154.666666668, "logits/rejected": 7887788.8, "logps/chosen": -246.44816080729166, "logps/rejected": -133.1644775390625, "loss": 0.1999, "rewards/chosen": 1.0433571338653564, "rewards/margins": 3.7493628025054933, "rewards/rejected": -2.706005668640137, "step": 17033 }, { "epoch": 0.9028701666975857, "grad_norm": 53.0, "kl": 2.0248165130615234, "learning_rate": 5e-07, "logits/chosen": -25746757.333333332, "logits/rejected": -39111492.0, "logps/chosen": -234.4532674153646, "logps/rejected": -540.9005126953125, "loss": 0.4222, "rewards/chosen": 0.14924190441767374, "rewards/margins": 2.2526583472887673, "rewards/rejected": -2.1034164428710938, "step": 17034 }, { "epoch": 0.9029231706993878, "grad_norm": 27.25, "kl": 0.8349418640136719, "learning_rate": 5e-07, "logits/chosen": -45969080.0, "logits/rejected": -19944182.666666668, "logps/chosen": -352.4725646972656, "logps/rejected": -392.7948404947917, "loss": 0.1388, "rewards/chosen": 0.7204257845878601, "rewards/margins": 3.7039488355318704, "rewards/rejected": -2.9835230509440103, "step": 17035 }, { "epoch": 0.90297617470119, "grad_norm": 55.25, "kl": 0.8495635986328125, "learning_rate": 5e-07, "logits/chosen": -62720403.2, "logits/rejected": -33557210.666666664, "logps/chosen": -482.36083984375, "logps/rejected": -236.54154459635416, "loss": 0.2782, "rewards/chosen": 0.7496429443359375, "rewards/margins": 2.7484257380167643, "rewards/rejected": -1.998782793680827, "step": 17036 }, { "epoch": 0.903029178702992, "grad_norm": 52.25, "kl": 4.9992828369140625, "learning_rate": 5e-07, "logits/chosen": -27682584.0, "logits/rejected": -644452.25, "logps/chosen": -356.9383138020833, "logps/rejected": -175.6656494140625, "loss": 0.2692, "rewards/chosen": 1.445009708404541, "rewards/margins": 4.871249675750732, "rewards/rejected": -3.4262399673461914, "step": 17037 }, { "epoch": 0.9030821827047942, "grad_norm": 69.5, "kl": 8.297077178955078, "learning_rate": 5e-07, "logits/chosen": -45001152.0, "logits/rejected": -156956640.0, "logps/chosen": -601.8896484375, "logps/rejected": -529.4696044921875, "loss": 0.4578, "rewards/chosen": 0.8916257449558803, "rewards/margins": 3.5545164176395962, "rewards/rejected": -2.662890672683716, "step": 17038 }, { "epoch": 0.9031351867065963, "grad_norm": 59.5, "kl": 4.788114547729492, "learning_rate": 5e-07, "logits/chosen": -1112621.8, "logits/rejected": -39957965.333333336, "logps/chosen": -235.64375, "logps/rejected": -60.75468953450521, "loss": 0.3208, "rewards/chosen": 0.9254663467407227, "rewards/margins": 3.1106368382771814, "rewards/rejected": -2.1851704915364585, "step": 17039 }, { "epoch": 0.9031881907083985, "grad_norm": 47.25, "kl": 0.5907754898071289, "learning_rate": 5e-07, "logits/chosen": -22617957.333333332, "logits/rejected": -10897424.0, "logps/chosen": -108.17945353190105, "logps/rejected": -285.520849609375, "loss": 0.2802, "rewards/chosen": 0.3971360921859741, "rewards/margins": 2.046201491355896, "rewards/rejected": -1.6490653991699218, "step": 17040 }, { "epoch": 0.9032411947102006, "grad_norm": 49.75, "kl": 4.656796455383301, "learning_rate": 5e-07, "logits/chosen": -39522185.6, "logits/rejected": -14580013.333333334, "logps/chosen": -369.4446533203125, "logps/rejected": -198.5459187825521, "loss": 0.3131, "rewards/chosen": 0.981495475769043, "rewards/margins": 2.192582543690999, "rewards/rejected": -1.2110870679219563, "step": 17041 }, { "epoch": 0.9032941987120028, "grad_norm": 35.25, "kl": 1.2952499389648438, "learning_rate": 5e-07, "logits/chosen": -4609861.333333333, "logits/rejected": 633260.5, "logps/chosen": -307.5302327473958, "logps/rejected": -271.38642578125, "loss": 0.2876, "rewards/chosen": 0.053788185119628906, "rewards/margins": 2.1808971405029296, "rewards/rejected": -2.1271089553833007, "step": 17042 }, { "epoch": 0.9033472027138049, "grad_norm": 48.0, "kl": 0.49933624267578125, "learning_rate": 5e-07, "logits/chosen": -16946724.0, "logits/rejected": -37635923.2, "logps/chosen": -298.13804117838544, "logps/rejected": -505.06845703125, "loss": 0.1945, "rewards/chosen": 0.5067688624064127, "rewards/margins": 3.88346373240153, "rewards/rejected": -3.3766948699951174, "step": 17043 }, { "epoch": 0.9034002067156071, "grad_norm": 48.25, "kl": 1.2054595947265625, "learning_rate": 5e-07, "logits/chosen": -23758740.0, "logits/rejected": -2192295.3333333335, "logps/chosen": -266.0335693359375, "logps/rejected": -252.81941731770834, "loss": 0.2357, "rewards/chosen": 0.8144237995147705, "rewards/margins": 3.582432985305786, "rewards/rejected": -2.7680091857910156, "step": 17044 }, { "epoch": 0.9034532107174091, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6928614.0, "logits/rejected": -22423298.666666668, "logps/chosen": -354.79046630859375, "logps/rejected": -374.7645670572917, "loss": 0.1683, "rewards/chosen": 1.2558151483535767, "rewards/margins": 3.61959977944692, "rewards/rejected": -2.3637846310933432, "step": 17045 }, { "epoch": 0.9035062147192113, "grad_norm": 75.0, "kl": 1.9537715911865234, "learning_rate": 5e-07, "logits/chosen": 1635370.7142857143, "logits/rejected": -36981340.0, "logps/chosen": -416.70064871651783, "logps/rejected": -587.939453125, "loss": 0.4264, "rewards/chosen": 0.3267072950090681, "rewards/margins": 5.365733078547886, "rewards/rejected": -5.039025783538818, "step": 17046 }, { "epoch": 0.9035592187210134, "grad_norm": 53.25, "kl": 3.0616302490234375, "learning_rate": 5e-07, "logits/chosen": -24041376.0, "logits/rejected": -10981843.0, "logps/chosen": -248.3733673095703, "logps/rejected": -360.9692077636719, "loss": 0.2781, "rewards/chosen": 0.939049243927002, "rewards/margins": 3.2525908946990967, "rewards/rejected": -2.3135416507720947, "step": 17047 }, { "epoch": 0.9036122227228156, "grad_norm": 71.0, "kl": 3.468242645263672, "learning_rate": 5e-07, "logits/chosen": 5348660.571428572, "logits/rejected": -223049648.0, "logps/chosen": -325.5176478794643, "logps/rejected": -479.7025146484375, "loss": 0.4737, "rewards/chosen": 0.32987628664289204, "rewards/margins": 2.115398747580392, "rewards/rejected": -1.7855224609375, "step": 17048 }, { "epoch": 0.9036652267246177, "grad_norm": 59.75, "kl": 0.6303367614746094, "learning_rate": 5e-07, "logits/chosen": -39206896.0, "logits/rejected": -8338216.666666667, "logps/chosen": -369.536328125, "logps/rejected": -197.6710205078125, "loss": 0.2979, "rewards/chosen": 0.9285201072692871, "rewards/margins": 1.9472451845804852, "rewards/rejected": -1.018725077311198, "step": 17049 }, { "epoch": 0.9037182307264199, "grad_norm": 60.25, "kl": 1.387603759765625, "learning_rate": 5e-07, "logits/chosen": 81758576.0, "logits/rejected": -20186864.0, "logps/chosen": -324.2589416503906, "logps/rejected": -267.665771484375, "loss": 0.2976, "rewards/chosen": 0.10232100635766983, "rewards/margins": 3.3409448638558388, "rewards/rejected": -3.238623857498169, "step": 17050 }, { "epoch": 0.903771234728222, "grad_norm": 53.0, "kl": 7.854684829711914, "learning_rate": 5e-07, "logits/chosen": -27778749.333333332, "logits/rejected": -6389724.0, "logps/chosen": -208.3385213216146, "logps/rejected": -521.1995849609375, "loss": 0.4507, "rewards/chosen": 0.6095174153645834, "rewards/margins": 2.844353278477987, "rewards/rejected": -2.2348358631134033, "step": 17051 }, { "epoch": 0.9038242387300242, "grad_norm": 44.0, "kl": 1.5015182495117188, "learning_rate": 5e-07, "logits/chosen": -28987632.0, "logits/rejected": -42464361.6, "logps/chosen": -424.7265625, "logps/rejected": -268.00478515625, "loss": 0.1964, "rewards/chosen": 0.9087951978047689, "rewards/margins": 4.011512025197347, "rewards/rejected": -3.102716827392578, "step": 17052 }, { "epoch": 0.9038772427318262, "grad_norm": 40.75, "kl": 1.5758304595947266, "learning_rate": 5e-07, "logits/chosen": -8521196.0, "logits/rejected": -12996773.6, "logps/chosen": -140.78030395507812, "logps/rejected": -208.316455078125, "loss": 0.3283, "rewards/chosen": -0.19010595480600992, "rewards/margins": 1.8125362952550252, "rewards/rejected": -2.0026422500610352, "step": 17053 }, { "epoch": 0.9039302467336284, "grad_norm": 51.75, "kl": 0.2739524841308594, "learning_rate": 5e-07, "logits/chosen": -3821119.0, "logits/rejected": -28391202.666666668, "logps/chosen": -61.7464599609375, "logps/rejected": -388.7794596354167, "loss": 0.2313, "rewards/chosen": -0.31707823276519775, "rewards/margins": 2.044141173362732, "rewards/rejected": -2.3612194061279297, "step": 17054 }, { "epoch": 0.9039832507354305, "grad_norm": 38.75, "kl": 2.265787124633789, "learning_rate": 5e-07, "logits/chosen": -25032036.0, "logits/rejected": -25066254.0, "logps/chosen": -254.71441650390625, "logps/rejected": -489.6067199707031, "loss": 0.1909, "rewards/chosen": 1.069894552230835, "rewards/margins": 4.417625665664673, "rewards/rejected": -3.347731113433838, "step": 17055 }, { "epoch": 0.9040362547372327, "grad_norm": 37.75, "kl": 1.1680011749267578, "learning_rate": 5e-07, "logits/chosen": -5356864.0, "logits/rejected": -32345208.0, "logps/chosen": -768.2276000976562, "logps/rejected": -223.65409342447916, "loss": 0.1468, "rewards/chosen": 2.1635804176330566, "rewards/margins": 4.287556966145834, "rewards/rejected": -2.123976548512777, "step": 17056 }, { "epoch": 0.9040892587390348, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7681137.0, "logits/rejected": -17096889.333333332, "logps/chosen": -19.305137634277344, "logps/rejected": -395.4849853515625, "loss": 0.2036, "rewards/chosen": -0.02056274563074112, "rewards/margins": 2.588117979466915, "rewards/rejected": -2.6086807250976562, "step": 17057 }, { "epoch": 0.904142262740837, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91678088.0, "logits/rejected": -32426550.85714286, "logps/chosen": -407.38677978515625, "logps/rejected": -328.31849888392856, "loss": 0.1947, "rewards/chosen": -0.8307861685752869, "rewards/margins": 1.426497995853424, "rewards/rejected": -2.257284164428711, "step": 17058 }, { "epoch": 0.9041952667426391, "grad_norm": 23.875, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -30850266.0, "logps/rejected": -213.31005859375, "loss": 0.1154, "rewards/rejected": -2.667786121368408, "step": 17059 }, { "epoch": 0.9042482707444413, "grad_norm": 32.5, "kl": 0.8781185150146484, "learning_rate": 5e-07, "logits/chosen": -30257165.333333332, "logits/rejected": -12714786.4, "logps/chosen": -218.91727701822916, "logps/rejected": -176.5082763671875, "loss": 0.1868, "rewards/chosen": 0.5106391906738281, "rewards/margins": 3.9236148834228515, "rewards/rejected": -3.4129756927490233, "step": 17060 }, { "epoch": 0.9043012747462433, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11759536.0, "logits/rejected": -19601705.6, "logps/chosen": -94.49293009440105, "logps/rejected": -263.162939453125, "loss": 0.2708, "rewards/chosen": -0.10902431607246399, "rewards/margins": 1.9687775790691378, "rewards/rejected": -2.0778018951416017, "step": 17061 }, { "epoch": 0.9043542787480455, "grad_norm": 50.75, "kl": 1.0238838195800781, "learning_rate": 5e-07, "logits/chosen": 17806232.0, "logits/rejected": -10168339.333333334, "logps/chosen": -245.035400390625, "logps/rejected": -104.34834798177083, "loss": 0.3437, "rewards/chosen": 0.44968881607055666, "rewards/margins": 2.360490385691325, "rewards/rejected": -1.9108015696207683, "step": 17062 }, { "epoch": 0.9044072827498476, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30331850.0, "logits/rejected": -13815658.666666666, "logps/chosen": -277.6663513183594, "logps/rejected": -163.72710164388022, "loss": 0.1969, "rewards/chosen": 0.8959980607032776, "rewards/margins": 3.417192359765371, "rewards/rejected": -2.5211942990620932, "step": 17063 }, { "epoch": 0.9044602867516498, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24481714.0, "logits/rejected": -9795802.666666666, "logps/chosen": -350.8475646972656, "logps/rejected": -182.53983561197916, "loss": 0.1972, "rewards/chosen": 0.18333816528320312, "rewards/margins": 3.498555819193522, "rewards/rejected": -3.315217653910319, "step": 17064 }, { "epoch": 0.9045132907534519, "grad_norm": 57.25, "kl": 3.4258689880371094, "learning_rate": 5e-07, "logits/chosen": -5496889.0, "logits/rejected": -7878370.0, "logps/chosen": -321.2542724609375, "logps/rejected": -170.19541931152344, "loss": 0.3387, "rewards/chosen": 0.8906011581420898, "rewards/margins": 3.0186238288879395, "rewards/rejected": -2.1280226707458496, "step": 17065 }, { "epoch": 0.9045662947552541, "grad_norm": 66.0, "kl": 1.7221479415893555, "learning_rate": 5e-07, "logits/chosen": -8428796.0, "logits/rejected": -31106467.2, "logps/chosen": -221.1541748046875, "logps/rejected": -257.433642578125, "loss": 0.3262, "rewards/chosen": 0.9236053625742594, "rewards/margins": 2.0056907812754314, "rewards/rejected": -1.082085418701172, "step": 17066 }, { "epoch": 0.9046192987570562, "grad_norm": 50.0, "kl": 1.9416170120239258, "learning_rate": 5e-07, "logits/chosen": -20474676.0, "logits/rejected": 5006089.0, "logps/chosen": -175.71464029947916, "logps/rejected": -45.37995910644531, "loss": 0.3935, "rewards/chosen": 0.699770450592041, "rewards/margins": 1.434477150440216, "rewards/rejected": -0.734706699848175, "step": 17067 }, { "epoch": 0.9046723027588583, "grad_norm": 48.25, "kl": 2.7067060470581055, "learning_rate": 5e-07, "logits/chosen": -33032760.0, "logits/rejected": -34101283.2, "logps/chosen": -247.85469563802084, "logps/rejected": -396.800634765625, "loss": 0.3194, "rewards/chosen": 0.027593294779459637, "rewards/margins": 3.0170571009318032, "rewards/rejected": -2.9894638061523438, "step": 17068 }, { "epoch": 0.9047253067606604, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7554065.0, "logits/rejected": -67392704.0, "logps/chosen": -307.0552062988281, "logps/rejected": -317.9808349609375, "loss": 0.2684, "rewards/chosen": 0.8019016981124878, "rewards/margins": 2.7219316959381104, "rewards/rejected": -1.9200299978256226, "step": 17069 }, { "epoch": 0.9047783107624625, "grad_norm": 64.5, "kl": 1.8886127471923828, "learning_rate": 5e-07, "logits/chosen": -10428809.0, "logits/rejected": -23128725.333333332, "logps/chosen": -256.2359619140625, "logps/rejected": -252.48712158203125, "loss": 0.2061, "rewards/chosen": 1.168640375137329, "rewards/margins": 3.116765896479289, "rewards/rejected": -1.9481255213419597, "step": 17070 }, { "epoch": 0.9048313147642647, "grad_norm": 66.5, "kl": 0.026645660400390625, "learning_rate": 5e-07, "logits/chosen": -19779820.0, "logits/rejected": -39583318.4, "logps/chosen": -332.23760986328125, "logps/rejected": -358.5828369140625, "loss": 0.2752, "rewards/chosen": 0.27157185475031537, "rewards/margins": 2.106062630812327, "rewards/rejected": -1.8344907760620117, "step": 17071 }, { "epoch": 0.9048843187660668, "grad_norm": 35.0, "kl": 0.9916782379150391, "learning_rate": 5e-07, "logits/chosen": -9902585.333333334, "logits/rejected": -23374121.6, "logps/chosen": -200.72847493489584, "logps/rejected": -147.57100830078124, "loss": 0.2336, "rewards/chosen": 1.0228254795074463, "rewards/margins": 3.143631601333618, "rewards/rejected": -2.120806121826172, "step": 17072 }, { "epoch": 0.904937322767869, "grad_norm": 43.5, "kl": 1.4321603775024414, "learning_rate": 5e-07, "logits/chosen": -1432863.25, "logits/rejected": -13741519.0, "logps/chosen": -175.74330139160156, "logps/rejected": -220.04159545898438, "loss": 0.3063, "rewards/chosen": 0.3871113955974579, "rewards/margins": 2.993719309568405, "rewards/rejected": -2.6066079139709473, "step": 17073 }, { "epoch": 0.9049903267696711, "grad_norm": 80.0, "kl": 1.458700180053711, "learning_rate": 5e-07, "logits/chosen": 69472486.4, "logits/rejected": -25517509.333333332, "logps/chosen": -407.185302734375, "logps/rejected": -303.6282958984375, "loss": 0.2986, "rewards/chosen": 0.4490629196166992, "rewards/margins": 3.2890122095743815, "rewards/rejected": -2.839949289957682, "step": 17074 }, { "epoch": 0.9050433307714733, "grad_norm": 39.5, "kl": 1.0719079971313477, "learning_rate": 5e-07, "logits/chosen": -56572864.0, "logits/rejected": -39568220.0, "logps/chosen": -238.00054931640625, "logps/rejected": -330.3956298828125, "loss": 0.255, "rewards/chosen": 1.3558727502822876, "rewards/margins": 2.8856714963912964, "rewards/rejected": -1.5297987461090088, "step": 17075 }, { "epoch": 0.9050963347732753, "grad_norm": 55.0, "kl": 0.776123046875, "learning_rate": 5e-07, "logits/chosen": -53461987.2, "logits/rejected": -27351810.666666668, "logps/chosen": -440.3474609375, "logps/rejected": -333.7856852213542, "loss": 0.2537, "rewards/chosen": 0.7720098972320557, "rewards/margins": 3.788210916519165, "rewards/rejected": -3.0162010192871094, "step": 17076 }, { "epoch": 0.9051493387750775, "grad_norm": 37.75, "kl": 0.6250381469726562, "learning_rate": 5e-07, "logits/chosen": -37338965.333333336, "logits/rejected": -43440678.4, "logps/chosen": -345.0723063151042, "logps/rejected": -414.963916015625, "loss": 0.1932, "rewards/chosen": 0.6019460757573446, "rewards/margins": 3.686379059155782, "rewards/rejected": -3.0844329833984374, "step": 17077 }, { "epoch": 0.9052023427768796, "grad_norm": 35.0, "kl": 3.9260082244873047, "learning_rate": 5e-07, "logits/chosen": 9053736.0, "logits/rejected": -38427506.666666664, "logps/chosen": -123.22244873046876, "logps/rejected": -151.76407877604166, "loss": 0.2866, "rewards/chosen": 0.7946411609649658, "rewards/margins": 3.435722017288208, "rewards/rejected": -2.641080856323242, "step": 17078 }, { "epoch": 0.9052553467786818, "grad_norm": 104.5, "kl": 3.574911594390869, "learning_rate": 5e-07, "logits/chosen": -589346.0, "logits/rejected": -19188520.0, "logps/chosen": -302.24713134765625, "logps/rejected": -424.00823974609375, "loss": 0.2452, "rewards/chosen": 1.2133318185806274, "rewards/margins": 4.143619656562805, "rewards/rejected": -2.9302878379821777, "step": 17079 }, { "epoch": 0.9053083507804839, "grad_norm": 23.25, "kl": 2.654529571533203, "learning_rate": 5e-07, "logits/chosen": 9975914.0, "logits/rejected": -32016205.714285713, "logps/chosen": -69.1147232055664, "logps/rejected": -344.23597935267856, "loss": 0.1395, "rewards/chosen": 0.3686058223247528, "rewards/margins": 3.1997730433940887, "rewards/rejected": -2.831167221069336, "step": 17080 }, { "epoch": 0.9053613547822861, "grad_norm": 45.5, "kl": 5.078453063964844, "learning_rate": 5e-07, "logits/chosen": -13402578.666666666, "logits/rejected": -16968774.4, "logps/chosen": -442.6931966145833, "logps/rejected": -304.232861328125, "loss": 0.3006, "rewards/chosen": 0.9314053853352865, "rewards/margins": 4.867239125569661, "rewards/rejected": -3.935833740234375, "step": 17081 }, { "epoch": 0.9054143587840882, "grad_norm": 47.5, "kl": 0.2660980224609375, "learning_rate": 5e-07, "logits/chosen": -21834888.0, "logits/rejected": -45635360.0, "logps/chosen": -361.8454284667969, "logps/rejected": -314.9071044921875, "loss": 0.2184, "rewards/chosen": 1.6022882461547852, "rewards/margins": 2.9577670097351074, "rewards/rejected": -1.3554787635803223, "step": 17082 }, { "epoch": 0.9054673627858904, "grad_norm": 39.5, "kl": 3.3472461700439453, "learning_rate": 5e-07, "logits/chosen": -8981175.2, "logits/rejected": -57194954.666666664, "logps/chosen": -194.08076171875, "logps/rejected": -294.50445556640625, "loss": 0.3565, "rewards/chosen": 0.8241567611694336, "rewards/margins": 2.6638612747192383, "rewards/rejected": -1.8397045135498047, "step": 17083 }, { "epoch": 0.9055203667876924, "grad_norm": 65.5, "kl": 2.4640254974365234, "learning_rate": 5e-07, "logits/chosen": -36230928.0, "logits/rejected": -114660320.0, "logps/chosen": -313.613720703125, "logps/rejected": -368.6265462239583, "loss": 0.3049, "rewards/chosen": 0.4749195098876953, "rewards/margins": 3.6846293131510417, "rewards/rejected": -3.209709803263346, "step": 17084 }, { "epoch": 0.9055733707894946, "grad_norm": 37.5, "kl": 1.6324043273925781, "learning_rate": 5e-07, "logits/chosen": -46817600.0, "logits/rejected": -18778918.4, "logps/chosen": -379.7869466145833, "logps/rejected": -226.34462890625, "loss": 0.2477, "rewards/chosen": 0.6057662963867188, "rewards/margins": 2.959922027587891, "rewards/rejected": -2.354155731201172, "step": 17085 }, { "epoch": 0.9056263747912967, "grad_norm": 38.25, "kl": 2.985154151916504, "learning_rate": 5e-07, "logits/chosen": 3337661.5, "logits/rejected": -22025488.0, "logps/chosen": -483.89337158203125, "logps/rejected": -305.8605041503906, "loss": 0.2232, "rewards/chosen": 1.2759714126586914, "rewards/margins": 4.16879677772522, "rewards/rejected": -2.8928253650665283, "step": 17086 }, { "epoch": 0.9056793787930989, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6357293.333333333, "logits/rejected": -59380966.4, "logps/chosen": -436.095458984375, "logps/rejected": -342.3283203125, "loss": 0.2023, "rewards/chosen": 0.7861944039662679, "rewards/margins": 3.07932718594869, "rewards/rejected": -2.293132781982422, "step": 17087 }, { "epoch": 0.905732382794901, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11512434.0, "logits/rejected": -35536100.571428575, "logps/chosen": -2691.26513671875, "logps/rejected": -232.62869698660714, "loss": 0.1269, "rewards/chosen": 4.611865520477295, "rewards/margins": 6.748609338487897, "rewards/rejected": -2.136743818010603, "step": 17088 }, { "epoch": 0.9057853867967032, "grad_norm": 56.75, "kl": 1.4699020385742188, "learning_rate": 5e-07, "logits/chosen": -18339400.0, "logits/rejected": -27288643.2, "logps/chosen": -143.39872233072916, "logps/rejected": -314.67900390625, "loss": 0.2653, "rewards/chosen": 0.15487696727116904, "rewards/margins": 2.192767401536306, "rewards/rejected": -2.0378904342651367, "step": 17089 }, { "epoch": 0.9058383907985053, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55238192.0, "logits/rejected": -13527970.0, "logps/chosen": -429.73046875, "logps/rejected": -422.3825988769531, "loss": 0.188, "rewards/chosen": 0.689629852771759, "rewards/margins": 4.571291983127594, "rewards/rejected": -3.881662130355835, "step": 17090 }, { "epoch": 0.9058913948003074, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3134403.6666666665, "logits/rejected": -62020057.6, "logps/chosen": -288.66424560546875, "logps/rejected": -566.38935546875, "loss": 0.2151, "rewards/chosen": 0.009268701076507568, "rewards/margins": 3.5376954436302186, "rewards/rejected": -3.528426742553711, "step": 17091 }, { "epoch": 0.9059443988021095, "grad_norm": 64.5, "kl": 0.8050079345703125, "learning_rate": 5e-07, "logits/chosen": 30724922.666666668, "logits/rejected": -34937384.0, "logps/chosen": -463.2413330078125, "logps/rejected": -400.4361267089844, "loss": 0.3425, "rewards/chosen": 0.7384943962097168, "rewards/margins": 2.2465792894363403, "rewards/rejected": -1.5080848932266235, "step": 17092 }, { "epoch": 0.9059974028039117, "grad_norm": 24.25, "kl": 0.9164524078369141, "learning_rate": 5e-07, "logits/chosen": 3493803.5, "logits/rejected": -13610713.142857144, "logps/chosen": -33.21966552734375, "logps/rejected": -203.60642787388392, "loss": 0.0893, "rewards/chosen": 1.1138397455215454, "rewards/margins": 4.459662488528661, "rewards/rejected": -3.345822743007115, "step": 17093 }, { "epoch": 0.9060504068057138, "grad_norm": 54.25, "kl": 2.19000244140625, "learning_rate": 5e-07, "logits/chosen": -38061581.333333336, "logits/rejected": -66765176.0, "logps/chosen": -331.6850179036458, "logps/rejected": -720.2353515625, "loss": 0.2921, "rewards/chosen": 0.9189112186431885, "rewards/margins": 3.5888421535491943, "rewards/rejected": -2.669930934906006, "step": 17094 }, { "epoch": 0.906103410807516, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -75605781.33333333, "logits/rejected": -47079459.2, "logps/chosen": -415.3904215494792, "logps/rejected": -351.24873046875, "loss": 0.2658, "rewards/chosen": 0.16529311736424765, "rewards/margins": 2.0202892263730368, "rewards/rejected": -1.854996109008789, "step": 17095 }, { "epoch": 0.9061564148093181, "grad_norm": 39.75, "kl": 2.414205551147461, "learning_rate": 5e-07, "logits/chosen": -10808854.4, "logits/rejected": -31935509.333333332, "logps/chosen": -326.9190673828125, "logps/rejected": -472.3326009114583, "loss": 0.2051, "rewards/chosen": 1.3524688720703124, "rewards/margins": 3.570810063680013, "rewards/rejected": -2.2183411916097007, "step": 17096 }, { "epoch": 0.9062094188111203, "grad_norm": 58.75, "kl": 0.12133407592773438, "learning_rate": 5e-07, "logits/chosen": -18927298.0, "logits/rejected": -17131324.0, "logps/chosen": -679.29443359375, "logps/rejected": -243.22247314453125, "loss": 0.272, "rewards/chosen": 0.9901237487792969, "rewards/margins": 3.0024616718292236, "rewards/rejected": -2.0123379230499268, "step": 17097 }, { "epoch": 0.9062624228129224, "grad_norm": 36.75, "kl": 0.35133886337280273, "learning_rate": 5e-07, "logits/chosen": -55902860.0, "logits/rejected": -13377944.0, "logps/chosen": -213.020751953125, "logps/rejected": -213.41192626953125, "loss": 0.2481, "rewards/chosen": 0.884795606136322, "rewards/margins": 2.4981275598208112, "rewards/rejected": -1.613331953684489, "step": 17098 }, { "epoch": 0.9063154268147245, "grad_norm": 48.0, "kl": 1.8211326599121094, "learning_rate": 5e-07, "logits/chosen": 5314784.0, "logits/rejected": -10821352.8, "logps/chosen": -163.79191080729166, "logps/rejected": -145.576171875, "loss": 0.3555, "rewards/chosen": 0.30661189556121826, "rewards/margins": 1.5811571836471559, "rewards/rejected": -1.2745452880859376, "step": 17099 }, { "epoch": 0.9063684308165266, "grad_norm": 47.25, "kl": 0.542266845703125, "learning_rate": 5e-07, "logits/chosen": -12705368.8, "logits/rejected": -868.6666666666666, "logps/chosen": -240.1251953125, "logps/rejected": -114.29315185546875, "loss": 0.3804, "rewards/chosen": 0.06688661575317383, "rewards/margins": 1.645333957672119, "rewards/rejected": -1.5784473419189453, "step": 17100 }, { "epoch": 0.9064214348183288, "grad_norm": 56.0, "kl": 1.4163932800292969, "learning_rate": 5e-07, "logits/chosen": -26670608.0, "logits/rejected": -5666186.5, "logps/chosen": -98.7376937866211, "logps/rejected": -187.88522338867188, "loss": 0.4223, "rewards/chosen": -0.3405795991420746, "rewards/margins": 1.5525435507297516, "rewards/rejected": -1.8931231498718262, "step": 17101 }, { "epoch": 0.9064744388201309, "grad_norm": 68.0, "kl": 0.7621536254882812, "learning_rate": 5e-07, "logits/chosen": -51395376.0, "logits/rejected": -36023068.8, "logps/chosen": -232.87786865234375, "logps/rejected": -520.64560546875, "loss": 0.3599, "rewards/chosen": -0.4445228576660156, "rewards/margins": 1.7498565673828126, "rewards/rejected": -2.194379425048828, "step": 17102 }, { "epoch": 0.9065274428219331, "grad_norm": 40.25, "kl": 2.264319896697998, "learning_rate": 5e-07, "logits/chosen": -12148422.4, "logits/rejected": -20973966.666666668, "logps/chosen": -154.96964111328126, "logps/rejected": -182.5522257486979, "loss": 0.3371, "rewards/chosen": 0.21740097999572755, "rewards/margins": 3.7025777022043864, "rewards/rejected": -3.4851767222086587, "step": 17103 }, { "epoch": 0.9065804468237352, "grad_norm": 48.75, "kl": 1.3971405029296875, "learning_rate": 5e-07, "logits/chosen": -56253173.333333336, "logits/rejected": -28274828.8, "logps/chosen": -571.1407063802084, "logps/rejected": -278.1944091796875, "loss": 0.2285, "rewards/chosen": 0.776207447052002, "rewards/margins": 3.5678879737854006, "rewards/rejected": -2.7916805267333986, "step": 17104 }, { "epoch": 0.9066334508255374, "grad_norm": 37.5, "kl": 0.3993721008300781, "learning_rate": 5e-07, "logits/chosen": -25666640.0, "logits/rejected": -7209489.5, "logps/chosen": -238.29605102539062, "logps/rejected": -130.10342407226562, "loss": 0.1973, "rewards/chosen": 0.7299448251724243, "rewards/margins": 4.3771597146987915, "rewards/rejected": -3.647214889526367, "step": 17105 }, { "epoch": 0.9066864548273394, "grad_norm": 42.5, "kl": 2.9257612228393555, "learning_rate": 5e-07, "logits/chosen": -26803641.6, "logits/rejected": -17998833.333333332, "logps/chosen": -267.7138916015625, "logps/rejected": -459.5690511067708, "loss": 0.3771, "rewards/chosen": 0.17279231548309326, "rewards/margins": 3.3953694105148315, "rewards/rejected": -3.2225770950317383, "step": 17106 }, { "epoch": 0.9067394588291416, "grad_norm": 68.5, "kl": 1.3880233764648438, "learning_rate": 5e-07, "logits/chosen": -3286448.8, "logits/rejected": -27359552.0, "logps/chosen": -273.0072509765625, "logps/rejected": -194.58536783854166, "loss": 0.3234, "rewards/chosen": 0.22520432472229004, "rewards/margins": 2.7193481286366783, "rewards/rejected": -2.494143803914388, "step": 17107 }, { "epoch": 0.9067924628309437, "grad_norm": 36.25, "kl": 0.640683650970459, "learning_rate": 5e-07, "logits/chosen": -3685510.75, "logits/rejected": -59964456.0, "logps/chosen": -151.11033630371094, "logps/rejected": -391.68292236328125, "loss": 0.2761, "rewards/chosen": 0.4877936840057373, "rewards/margins": 3.3177003860473633, "rewards/rejected": -2.829906702041626, "step": 17108 }, { "epoch": 0.9068454668327459, "grad_norm": 47.75, "kl": 0.49609851837158203, "learning_rate": 5e-07, "logits/chosen": -59659136.0, "logits/rejected": -22218248.0, "logps/chosen": -267.34844970703125, "logps/rejected": -258.8996276855469, "loss": 0.3084, "rewards/chosen": 0.39730527997016907, "rewards/margins": 2.2234592735767365, "rewards/rejected": -1.8261539936065674, "step": 17109 }, { "epoch": 0.906898470834548, "grad_norm": 47.0, "kl": 1.0209121704101562, "learning_rate": 5e-07, "logits/chosen": -20972046.666666668, "logits/rejected": -6663489.6, "logps/chosen": -180.2388916015625, "logps/rejected": -188.4609619140625, "loss": 0.2656, "rewards/chosen": 0.884785016377767, "rewards/margins": 2.5155850728352864, "rewards/rejected": -1.6308000564575196, "step": 17110 }, { "epoch": 0.9069514748363502, "grad_norm": 83.5, "kl": 9.560379028320312, "learning_rate": 5e-07, "logits/chosen": -59258997.333333336, "logits/rejected": -21575428.8, "logps/chosen": -648.3163248697916, "logps/rejected": -458.77548828125, "loss": 0.2509, "rewards/chosen": 2.8395630518595376, "rewards/margins": 5.81195265452067, "rewards/rejected": -2.9723896026611327, "step": 17111 }, { "epoch": 0.9070044788381523, "grad_norm": 56.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28605256.0, "logits/rejected": -14036307.2, "logps/chosen": -141.74642944335938, "logps/rejected": -255.0386474609375, "loss": 0.3435, "rewards/chosen": -0.21880346536636353, "rewards/margins": 2.1641738295555113, "rewards/rejected": -2.382977294921875, "step": 17112 }, { "epoch": 0.9070574828399545, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37208725.333333336, "logits/rejected": -7245425.6, "logps/chosen": -298.61484781901044, "logps/rejected": -345.3865234375, "loss": 0.1856, "rewards/chosen": 0.3282989462216695, "rewards/margins": 3.983567806084951, "rewards/rejected": -3.6552688598632814, "step": 17113 }, { "epoch": 0.9071104868417565, "grad_norm": 48.5, "kl": 0.5305938720703125, "learning_rate": 5e-07, "logits/chosen": -33398221.333333332, "logits/rejected": 9174570.0, "logps/chosen": -273.77773030598956, "logps/rejected": -208.6898956298828, "loss": 0.3587, "rewards/chosen": 0.5287884871164957, "rewards/margins": 2.0259905258814492, "rewards/rejected": -1.4972020387649536, "step": 17114 }, { "epoch": 0.9071634908435587, "grad_norm": 41.5, "kl": 3.709625244140625, "learning_rate": 5e-07, "logits/chosen": -10901977.6, "logits/rejected": -36228528.0, "logps/chosen": -184.89154052734375, "logps/rejected": -252.1212158203125, "loss": 0.3005, "rewards/chosen": 1.0386919975280762, "rewards/margins": 3.0580288569132485, "rewards/rejected": -2.0193368593851724, "step": 17115 }, { "epoch": 0.9072164948453608, "grad_norm": 51.0, "kl": 2.0393199920654297, "learning_rate": 5e-07, "logits/chosen": -42491657.6, "logits/rejected": -13064949.333333334, "logps/chosen": -225.05791015625, "logps/rejected": -295.34421793619794, "loss": 0.2693, "rewards/chosen": 0.8331034660339356, "rewards/margins": 3.104970900217692, "rewards/rejected": -2.2718674341837564, "step": 17116 }, { "epoch": 0.907269498847163, "grad_norm": 25.625, "kl": 2.397401809692383, "learning_rate": 5e-07, "logits/chosen": 1696152.0, "logits/rejected": -41914182.4, "logps/chosen": -151.79912312825522, "logps/rejected": -321.542041015625, "loss": 0.1547, "rewards/chosen": 1.028839111328125, "rewards/margins": 5.277568435668945, "rewards/rejected": -4.24872932434082, "step": 17117 }, { "epoch": 0.9073225028489651, "grad_norm": 48.75, "kl": 1.1344146728515625, "learning_rate": 5e-07, "logits/chosen": -56411872.0, "logits/rejected": -18335408.0, "logps/chosen": -563.09423828125, "logps/rejected": -304.792724609375, "loss": 0.165, "rewards/chosen": 0.7312530676523844, "rewards/margins": 5.746655289332073, "rewards/rejected": -5.015402221679688, "step": 17118 }, { "epoch": 0.9073755068507673, "grad_norm": 48.0, "kl": 2.821979522705078, "learning_rate": 5e-07, "logits/chosen": -59967624.0, "logits/rejected": -1547071.0, "logps/chosen": -644.131591796875, "logps/rejected": -142.22323608398438, "loss": 0.211, "rewards/chosen": 1.1575508117675781, "rewards/margins": 3.2015137672424316, "rewards/rejected": -2.0439629554748535, "step": 17119 }, { "epoch": 0.9074285108525694, "grad_norm": 44.0, "kl": 0.03261566162109375, "learning_rate": 5e-07, "logits/chosen": -46775712.0, "logits/rejected": -2779960.6, "logps/chosen": -492.4536946614583, "logps/rejected": -219.09638671875, "loss": 0.1945, "rewards/chosen": 0.5631449222564697, "rewards/margins": 2.9308565616607667, "rewards/rejected": -2.367711639404297, "step": 17120 }, { "epoch": 0.9074815148543715, "grad_norm": 48.75, "kl": 4.957326889038086, "learning_rate": 5e-07, "logits/chosen": -16496338.0, "logits/rejected": -15386298.0, "logps/chosen": -171.3608856201172, "logps/rejected": -387.5864562988281, "loss": 0.2453, "rewards/chosen": 1.0043425559997559, "rewards/margins": 3.4179930686950684, "rewards/rejected": -2.4136505126953125, "step": 17121 }, { "epoch": 0.9075345188561736, "grad_norm": 57.25, "kl": 2.685832977294922, "learning_rate": 5e-07, "logits/chosen": -25975318.4, "logits/rejected": -26729552.0, "logps/chosen": -335.516455078125, "logps/rejected": -406.5580240885417, "loss": 0.3744, "rewards/chosen": 0.21087322235107422, "rewards/margins": 2.5470783869425455, "rewards/rejected": -2.336205164591471, "step": 17122 }, { "epoch": 0.9075875228579757, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44930218.666666664, "logits/rejected": -14881139.2, "logps/chosen": -129.92498779296875, "logps/rejected": -206.526953125, "loss": 0.2592, "rewards/chosen": 0.3822309970855713, "rewards/margins": 2.612627649307251, "rewards/rejected": -2.2303966522216796, "step": 17123 }, { "epoch": 0.9076405268597779, "grad_norm": 36.0, "kl": 2.3299179077148438, "learning_rate": 5e-07, "logits/chosen": 3100891.5, "logits/rejected": -7979221.0, "logps/chosen": -203.45547485351562, "logps/rejected": -127.62867736816406, "loss": 0.241, "rewards/chosen": 1.26629638671875, "rewards/margins": 4.0772318840026855, "rewards/rejected": -2.8109354972839355, "step": 17124 }, { "epoch": 0.90769353086158, "grad_norm": 51.0, "kl": 3.9385318756103516, "learning_rate": 5e-07, "logits/chosen": -43526314.666666664, "logits/rejected": -176621.046875, "logps/chosen": -231.86580403645834, "logps/rejected": -267.61761474609375, "loss": 0.4334, "rewards/chosen": 0.38653830687205, "rewards/margins": 2.30764893690745, "rewards/rejected": -1.9211106300354004, "step": 17125 }, { "epoch": 0.9077465348633822, "grad_norm": 67.5, "kl": 1.6191959381103516, "learning_rate": 5e-07, "logits/chosen": -18134374.85714286, "logits/rejected": -9952714.0, "logps/chosen": -341.34005301339283, "logps/rejected": -540.5838623046875, "loss": 0.3076, "rewards/chosen": 0.9232000623430524, "rewards/margins": 4.773245981761387, "rewards/rejected": -3.850045919418335, "step": 17126 }, { "epoch": 0.9077995388651843, "grad_norm": 37.25, "kl": 0.6322193145751953, "learning_rate": 5e-07, "logits/chosen": -12631171.0, "logits/rejected": -27721622.0, "logps/chosen": -230.0384521484375, "logps/rejected": -253.72364807128906, "loss": 0.2448, "rewards/chosen": 0.6075682044029236, "rewards/margins": 3.2074000239372253, "rewards/rejected": -2.5998318195343018, "step": 17127 }, { "epoch": 0.9078525428669865, "grad_norm": 44.0, "kl": 1.312668800354004, "learning_rate": 5e-07, "logits/chosen": -21980073.333333332, "logits/rejected": -21662462.4, "logps/chosen": -130.5565388997396, "logps/rejected": -307.37587890625, "loss": 0.267, "rewards/chosen": 0.7435870170593262, "rewards/margins": 2.457919216156006, "rewards/rejected": -1.7143321990966798, "step": 17128 }, { "epoch": 0.9079055468687885, "grad_norm": 61.25, "kl": 0.58099365234375, "learning_rate": 5e-07, "logits/chosen": 18643776.0, "logits/rejected": 2870010.5, "logps/chosen": -363.9077555338542, "logps/rejected": -233.58645629882812, "loss": 0.3747, "rewards/chosen": 0.2702530026435852, "rewards/margins": 2.911839783191681, "rewards/rejected": -2.6415867805480957, "step": 17129 }, { "epoch": 0.9079585508705907, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59099860.0, "logits/rejected": -48327849.14285714, "logps/chosen": -221.91775512695312, "logps/rejected": -318.94102260044644, "loss": 0.1909, "rewards/chosen": 0.7591003775596619, "rewards/margins": 2.815402720655714, "rewards/rejected": -2.056302343096052, "step": 17130 }, { "epoch": 0.9080115548723928, "grad_norm": 43.25, "kl": 3.362499237060547, "learning_rate": 5e-07, "logits/chosen": -14654379.2, "logits/rejected": -24737077.333333332, "logps/chosen": -282.7143798828125, "logps/rejected": -359.6515299479167, "loss": 0.2155, "rewards/chosen": 1.3042367935180663, "rewards/margins": 4.5085904439290365, "rewards/rejected": -3.20435365041097, "step": 17131 }, { "epoch": 0.908064558874195, "grad_norm": 52.25, "kl": 0.2736949920654297, "learning_rate": 5e-07, "logits/chosen": -20787509.333333332, "logits/rejected": -1960108.0, "logps/chosen": -340.7848714192708, "logps/rejected": -252.3820068359375, "loss": 0.2655, "rewards/chosen": 0.5095444122950236, "rewards/margins": 2.697115365664164, "rewards/rejected": -2.1875709533691405, "step": 17132 }, { "epoch": 0.9081175628759971, "grad_norm": 36.0, "kl": 5.66878604888916, "learning_rate": 5e-07, "logits/chosen": -12198582.4, "logits/rejected": -124902869.33333333, "logps/chosen": -202.94212646484374, "logps/rejected": -595.3277180989584, "loss": 0.3517, "rewards/chosen": 0.607013750076294, "rewards/margins": 3.86975801785787, "rewards/rejected": -3.2627442677815757, "step": 17133 }, { "epoch": 0.9081705668777993, "grad_norm": 45.75, "kl": 1.3727569580078125, "learning_rate": 5e-07, "logits/chosen": -25443853.714285713, "logits/rejected": -31586850.0, "logps/chosen": -313.912841796875, "logps/rejected": -276.3612976074219, "loss": 0.3634, "rewards/chosen": 0.9586403710501534, "rewards/margins": 1.9302682025091988, "rewards/rejected": -0.9716278314590454, "step": 17134 }, { "epoch": 0.9082235708796014, "grad_norm": 63.5, "kl": 0.97479248046875, "learning_rate": 5e-07, "logits/chosen": -29976404.0, "logits/rejected": 2775534.0, "logps/chosen": -319.6185607910156, "logps/rejected": -411.0565185546875, "loss": 0.2652, "rewards/chosen": 0.40698662400245667, "rewards/margins": 2.7257355749607086, "rewards/rejected": -2.318748950958252, "step": 17135 }, { "epoch": 0.9082765748814036, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37752848.0, "logits/rejected": 12553072.0, "logps/chosen": -176.5665283203125, "logps/rejected": -213.5474609375, "loss": 0.3192, "rewards/chosen": 0.007010906934738159, "rewards/margins": 1.534729355573654, "rewards/rejected": -1.527718448638916, "step": 17136 }, { "epoch": 0.9083295788832056, "grad_norm": 44.0, "kl": 1.6609344482421875, "learning_rate": 5e-07, "logits/chosen": -19556016.0, "logits/rejected": -25961637.333333332, "logps/chosen": -180.08857421875, "logps/rejected": -135.4963582356771, "loss": 0.4145, "rewards/chosen": 0.15236225128173828, "rewards/margins": 1.4361709594726562, "rewards/rejected": -1.283808708190918, "step": 17137 }, { "epoch": 0.9083825828850078, "grad_norm": 51.5, "kl": 0.3316631317138672, "learning_rate": 5e-07, "logits/chosen": -19903292.0, "logits/rejected": -40141504.0, "logps/chosen": -148.7805938720703, "logps/rejected": -507.7004699707031, "loss": 0.318, "rewards/chosen": 0.03861858695745468, "rewards/margins": 2.3487146124243736, "rewards/rejected": -2.310096025466919, "step": 17138 }, { "epoch": 0.9084355868868099, "grad_norm": 46.0, "kl": 1.6401243209838867, "learning_rate": 5e-07, "logits/chosen": -66374694.4, "logits/rejected": -25299589.333333332, "logps/chosen": -239.9032958984375, "logps/rejected": -188.78389485677084, "loss": 0.2847, "rewards/chosen": 0.6056111812591553, "rewards/margins": 2.495904779434204, "rewards/rejected": -1.8902935981750488, "step": 17139 }, { "epoch": 0.9084885908886121, "grad_norm": 51.25, "kl": 3.48789119720459, "learning_rate": 5e-07, "logits/chosen": -58634144.0, "logits/rejected": -11504809.0, "logps/chosen": -329.3004557291667, "logps/rejected": -331.4498596191406, "loss": 0.3877, "rewards/chosen": 0.44950870672861737, "rewards/margins": 4.475594560305278, "rewards/rejected": -4.02608585357666, "step": 17140 }, { "epoch": 0.9085415948904142, "grad_norm": 118.5, "kl": 5.914731025695801, "learning_rate": 5e-07, "logits/chosen": -35212675.2, "logits/rejected": -1368692.0, "logps/chosen": -468.125634765625, "logps/rejected": -140.3054402669271, "loss": 0.2902, "rewards/chosen": 1.2831709861755372, "rewards/margins": 2.6504470189412435, "rewards/rejected": -1.3672760327657063, "step": 17141 }, { "epoch": 0.9085945988922164, "grad_norm": 51.25, "kl": 1.879408836364746, "learning_rate": 5e-07, "logits/chosen": 12142114.666666666, "logits/rejected": -9981250.4, "logps/chosen": -88.2537841796875, "logps/rejected": -223.37392578125, "loss": 0.3509, "rewards/chosen": 0.3373440106709798, "rewards/margins": 1.836263879140218, "rewards/rejected": -1.4989198684692382, "step": 17142 }, { "epoch": 0.9086476028940185, "grad_norm": 17.75, "kl": 0.8795661926269531, "learning_rate": 5e-07, "logits/chosen": 4529666.666666667, "logits/rejected": -42844348.8, "logps/chosen": -10.024096806844076, "logps/rejected": -550.256005859375, "loss": 0.2158, "rewards/chosen": 0.09643243749936421, "rewards/margins": 4.055254306395849, "rewards/rejected": -3.9588218688964845, "step": 17143 }, { "epoch": 0.9087006068958207, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37121256.0, "logits/rejected": -20968937.333333332, "logps/chosen": -384.7091064453125, "logps/rejected": -323.5117594401042, "loss": 0.1069, "rewards/chosen": 1.92699134349823, "rewards/margins": 4.424374779065451, "rewards/rejected": -2.49738343556722, "step": 17144 }, { "epoch": 0.9087536108976227, "grad_norm": 49.25, "kl": 1.4993362426757812, "learning_rate": 5e-07, "logits/chosen": -4621920.8, "logits/rejected": -13268061.333333334, "logps/chosen": -321.8666259765625, "logps/rejected": -126.72958374023438, "loss": 0.2611, "rewards/chosen": 1.1125308990478515, "rewards/margins": 3.30195525487264, "rewards/rejected": -2.1894243558247886, "step": 17145 }, { "epoch": 0.9088066148994249, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 66006540.0, "logits/rejected": -8684728.666666666, "logps/chosen": -72.11215209960938, "logps/rejected": -552.4278971354166, "loss": 0.1174, "rewards/chosen": 1.299431562423706, "rewards/margins": 4.73694109916687, "rewards/rejected": -3.437509536743164, "step": 17146 }, { "epoch": 0.908859618901227, "grad_norm": 48.0, "kl": 1.0091485977172852, "learning_rate": 5e-07, "logits/chosen": -7377221.6, "logits/rejected": -4057415.3333333335, "logps/chosen": -117.71788330078125, "logps/rejected": -168.9615681966146, "loss": 0.321, "rewards/chosen": 0.1627524971961975, "rewards/margins": 3.595047024885813, "rewards/rejected": -3.4322945276896157, "step": 17147 }, { "epoch": 0.9089126229030292, "grad_norm": 60.0, "kl": 0.5079221725463867, "learning_rate": 5e-07, "logits/chosen": -37065629.71428572, "logits/rejected": -40409960.0, "logps/chosen": -298.1383579799107, "logps/rejected": -54.60087966918945, "loss": 0.4186, "rewards/chosen": 0.3533158302307129, "rewards/margins": 1.341435730457306, "rewards/rejected": -0.988119900226593, "step": 17148 }, { "epoch": 0.9089656269048313, "grad_norm": 58.5, "kl": 5.378791809082031, "learning_rate": 5e-07, "logits/chosen": -22056049.6, "logits/rejected": -1145772.3333333333, "logps/chosen": -245.5755859375, "logps/rejected": -216.77642822265625, "loss": 0.2554, "rewards/chosen": 1.5317031860351562, "rewards/margins": 3.2921732584635417, "rewards/rejected": -1.7604700724283855, "step": 17149 }, { "epoch": 0.9090186309066335, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50892576.0, "logits/rejected": -32773866.0, "logps/chosen": -338.4186706542969, "logps/rejected": -513.3232421875, "loss": 0.281, "rewards/chosen": 0.23347321152687073, "rewards/margins": 2.588906913995743, "rewards/rejected": -2.355433702468872, "step": 17150 }, { "epoch": 0.9090716349084356, "grad_norm": 50.0, "kl": 3.850452423095703, "learning_rate": 5e-07, "logits/chosen": -42032758.4, "logits/rejected": -44109442.666666664, "logps/chosen": -462.444970703125, "logps/rejected": -660.720703125, "loss": 0.293, "rewards/chosen": 0.8485156059265136, "rewards/margins": 3.7799537340799967, "rewards/rejected": -2.931438128153483, "step": 17151 }, { "epoch": 0.9091246389102378, "grad_norm": 72.0, "kl": 2.0348968505859375, "learning_rate": 5e-07, "logits/chosen": -40637747.2, "logits/rejected": -37380773.333333336, "logps/chosen": -515.6904296875, "logps/rejected": -474.4388020833333, "loss": 0.3096, "rewards/chosen": 0.6136590957641601, "rewards/margins": 3.3100161870320637, "rewards/rejected": -2.696357091267904, "step": 17152 }, { "epoch": 0.9091776429120398, "grad_norm": 36.25, "kl": 2.291391372680664, "learning_rate": 5e-07, "logits/chosen": -51301716.0, "logits/rejected": -16419901.0, "logps/chosen": -366.4955749511719, "logps/rejected": -747.019287109375, "loss": 0.1855, "rewards/chosen": 1.371368408203125, "rewards/margins": 6.135654926300049, "rewards/rejected": -4.764286518096924, "step": 17153 }, { "epoch": 0.909230646913842, "grad_norm": 48.75, "kl": 1.5289888381958008, "learning_rate": 5e-07, "logits/chosen": -20873420.0, "logits/rejected": 5298841.5, "logps/chosen": -182.91482543945312, "logps/rejected": -291.2425231933594, "loss": 0.2553, "rewards/chosen": 0.6395207047462463, "rewards/margins": 3.4102302193641663, "rewards/rejected": -2.77070951461792, "step": 17154 }, { "epoch": 0.9092836509156441, "grad_norm": 48.0, "kl": 3.8162097930908203, "learning_rate": 5e-07, "logits/chosen": -36575507.2, "logits/rejected": -4084065.3333333335, "logps/chosen": -1049.3564453125, "logps/rejected": -361.2712809244792, "loss": 0.1983, "rewards/chosen": 1.931097412109375, "rewards/margins": 4.556012852986654, "rewards/rejected": -2.624915440877279, "step": 17155 }, { "epoch": 0.9093366549174463, "grad_norm": 67.5, "kl": 1.5074596405029297, "learning_rate": 5e-07, "logits/chosen": -41533432.0, "logits/rejected": -4013136.25, "logps/chosen": -189.10941569010416, "logps/rejected": -232.6051483154297, "loss": 0.3871, "rewards/chosen": 0.13062339027722678, "rewards/margins": 3.3895011444886527, "rewards/rejected": -3.258877754211426, "step": 17156 }, { "epoch": 0.9093896589192484, "grad_norm": 39.5, "kl": 0.0087890625, "learning_rate": 5e-07, "logits/chosen": -57392069.333333336, "logits/rejected": -37829740.8, "logps/chosen": -468.1879475911458, "logps/rejected": -411.9623046875, "loss": 0.1618, "rewards/chosen": 1.035844882329305, "rewards/margins": 4.378928073247273, "rewards/rejected": -3.3430831909179686, "step": 17157 }, { "epoch": 0.9094426629210506, "grad_norm": 52.0, "kl": 1.3591938018798828, "learning_rate": 5e-07, "logits/chosen": -10442153.333333334, "logits/rejected": -28757960.0, "logps/chosen": -189.08064778645834, "logps/rejected": -344.5614318847656, "loss": 0.3177, "rewards/chosen": 0.604543129603068, "rewards/margins": 2.802971522013346, "rewards/rejected": -2.1984283924102783, "step": 17158 }, { "epoch": 0.9094956669228527, "grad_norm": 65.5, "kl": 5.294137954711914, "learning_rate": 5e-07, "logits/chosen": -13280694.666666666, "logits/rejected": 1882076.375, "logps/chosen": -350.9910074869792, "logps/rejected": -65.35599517822266, "loss": 0.3614, "rewards/chosen": 1.1112078825632732, "rewards/margins": 3.646304289499919, "rewards/rejected": -2.5350964069366455, "step": 17159 }, { "epoch": 0.9095486709246549, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14725213.0, "logits/rejected": -62551156.0, "logps/chosen": -292.826171875, "logps/rejected": -588.0759887695312, "loss": 0.1882, "rewards/chosen": 1.0197830200195312, "rewards/margins": 5.179142475128174, "rewards/rejected": -4.159359455108643, "step": 17160 }, { "epoch": 0.9096016749264569, "grad_norm": 39.75, "kl": 6.654008865356445, "learning_rate": 5e-07, "logits/chosen": -30000452.57142857, "logits/rejected": -38636972.0, "logps/chosen": -354.43031529017856, "logps/rejected": -482.12689208984375, "loss": 0.4977, "rewards/chosen": 0.5793025834219796, "rewards/margins": 4.262920890535627, "rewards/rejected": -3.6836183071136475, "step": 17161 }, { "epoch": 0.9096546789282591, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38800244.0, "logits/rejected": -22009850.0, "logps/chosen": -320.3491516113281, "logps/rejected": -413.56475830078125, "loss": 0.1835, "rewards/chosen": 0.8481816053390503, "rewards/margins": 4.325067400932312, "rewards/rejected": -3.4768857955932617, "step": 17162 }, { "epoch": 0.9097076829300612, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -179994048.0, "logits/rejected": -44920038.4, "logps/chosen": -256.8187662760417, "logps/rejected": -426.85283203125, "loss": 0.2066, "rewards/chosen": 0.33359551429748535, "rewards/margins": 3.185742425918579, "rewards/rejected": -2.8521469116210936, "step": 17163 }, { "epoch": 0.9097606869318634, "grad_norm": 56.75, "kl": 0.5500278472900391, "learning_rate": 5e-07, "logits/chosen": -28261876.0, "logits/rejected": -16285415.0, "logps/chosen": -311.26556396484375, "logps/rejected": -195.01797485351562, "loss": 0.3074, "rewards/chosen": 0.3437347412109375, "rewards/margins": 2.0032622814178467, "rewards/rejected": -1.6595275402069092, "step": 17164 }, { "epoch": 0.9098136909336655, "grad_norm": 39.75, "kl": 1.1093597412109375, "learning_rate": 5e-07, "logits/chosen": -38683045.333333336, "logits/rejected": -8393123.2, "logps/chosen": -273.0359293619792, "logps/rejected": -593.293505859375, "loss": 0.2142, "rewards/chosen": 0.2345704436302185, "rewards/margins": 3.996985375881195, "rewards/rejected": -3.7624149322509766, "step": 17165 }, { "epoch": 0.9098666949354677, "grad_norm": 44.0, "kl": 0.3377962112426758, "learning_rate": 5e-07, "logits/chosen": -52142060.0, "logits/rejected": -29980600.0, "logps/chosen": -334.87982177734375, "logps/rejected": -433.779541015625, "loss": 0.2733, "rewards/chosen": 0.26490652561187744, "rewards/margins": 2.846582531929016, "rewards/rejected": -2.5816760063171387, "step": 17166 }, { "epoch": 0.9099196989372698, "grad_norm": 41.5, "kl": 1.377122402191162, "learning_rate": 5e-07, "logits/chosen": -3524594.8, "logits/rejected": -99194890.66666667, "logps/chosen": -247.7794677734375, "logps/rejected": -484.6105143229167, "loss": 0.3141, "rewards/chosen": 0.46040806770324705, "rewards/margins": 3.4251068592071534, "rewards/rejected": -2.9646987915039062, "step": 17167 }, { "epoch": 0.909972702939072, "grad_norm": 39.0, "kl": 1.5243034362792969, "learning_rate": 5e-07, "logits/chosen": -26437594.666666668, "logits/rejected": -40749952.0, "logps/chosen": -241.4418741861979, "logps/rejected": -428.38515625, "loss": 0.1772, "rewards/chosen": 1.5942966143290203, "rewards/margins": 4.527192465464275, "rewards/rejected": -2.932895851135254, "step": 17168 }, { "epoch": 0.910025706940874, "grad_norm": 35.75, "kl": 2.3465681076049805, "learning_rate": 5e-07, "logits/chosen": 1038098.375, "logits/rejected": -26266884.0, "logps/chosen": -109.13899993896484, "logps/rejected": -227.25750732421875, "loss": 0.3211, "rewards/chosen": 0.4612323045730591, "rewards/margins": 1.9146217107772827, "rewards/rejected": -1.4533894062042236, "step": 17169 }, { "epoch": 0.9100787109426761, "grad_norm": 49.25, "kl": 1.4295520782470703, "learning_rate": 5e-07, "logits/chosen": -34709688.0, "logits/rejected": -37863912.0, "logps/chosen": -173.74270629882812, "logps/rejected": -420.30828857421875, "loss": 0.3421, "rewards/chosen": -0.2371772825717926, "rewards/margins": 2.7245493829250336, "rewards/rejected": -2.961726665496826, "step": 17170 }, { "epoch": 0.9101317149444783, "grad_norm": 55.25, "kl": 0.40157318115234375, "learning_rate": 5e-07, "logits/chosen": -8200142.666666667, "logits/rejected": -43159680.0, "logps/chosen": -467.1427408854167, "logps/rejected": -446.67705078125, "loss": 0.25, "rewards/chosen": 1.018371820449829, "rewards/margins": 2.5407570362091065, "rewards/rejected": -1.5223852157592774, "step": 17171 }, { "epoch": 0.9101847189462804, "grad_norm": 60.5, "kl": 2.6993112564086914, "learning_rate": 5e-07, "logits/chosen": -20347125.333333332, "logits/rejected": -31865624.0, "logps/chosen": -316.5345052083333, "logps/rejected": -249.22674560546875, "loss": 0.401, "rewards/chosen": 0.2817935148874919, "rewards/margins": 3.496495167414347, "rewards/rejected": -3.2147016525268555, "step": 17172 }, { "epoch": 0.9102377229480826, "grad_norm": 39.25, "kl": 0.9966773986816406, "learning_rate": 5e-07, "logits/chosen": -16178550.4, "logits/rejected": -4255527.333333333, "logps/chosen": -180.208544921875, "logps/rejected": -161.26007080078125, "loss": 0.2247, "rewards/chosen": 0.9229421615600586, "rewards/margins": 3.4433037439982095, "rewards/rejected": -2.520361582438151, "step": 17173 }, { "epoch": 0.9102907269498847, "grad_norm": 38.0, "kl": 2.5843334197998047, "learning_rate": 5e-07, "logits/chosen": -2667964.5, "logits/rejected": -4404950.5, "logps/chosen": -287.41375732421875, "logps/rejected": -127.27647399902344, "loss": 0.1509, "rewards/chosen": 1.716017246246338, "rewards/margins": 4.1643335819244385, "rewards/rejected": -2.4483163356781006, "step": 17174 }, { "epoch": 0.9103437309516869, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9518765.0, "logits/rejected": -12650653.333333334, "logps/chosen": -220.49496459960938, "logps/rejected": -393.9945475260417, "loss": 0.0977, "rewards/chosen": 1.7344932556152344, "rewards/margins": 4.958545366923014, "rewards/rejected": -3.22405211130778, "step": 17175 }, { "epoch": 0.9103967349534889, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39217952.0, "logits/rejected": -29932970.666666668, "logps/chosen": -389.31083984375, "logps/rejected": -530.6449381510416, "loss": 0.3417, "rewards/chosen": 0.2825103759765625, "rewards/margins": 2.7335090001424156, "rewards/rejected": -2.450998624165853, "step": 17176 }, { "epoch": 0.9104497389552911, "grad_norm": 43.0, "kl": 2.1101083755493164, "learning_rate": 5e-07, "logits/chosen": -926383.75, "logits/rejected": -30853536.0, "logps/chosen": -149.64927673339844, "logps/rejected": -358.9544677734375, "loss": 0.3139, "rewards/chosen": 0.4684244990348816, "rewards/margins": 2.4316460490226746, "rewards/rejected": -1.963221549987793, "step": 17177 }, { "epoch": 0.9105027429570932, "grad_norm": 44.25, "kl": 1.3254594802856445, "learning_rate": 5e-07, "logits/chosen": -41986437.333333336, "logits/rejected": -86081081.6, "logps/chosen": -304.15484619140625, "logps/rejected": -411.852734375, "loss": 0.1512, "rewards/chosen": 1.501468340555827, "rewards/margins": 3.9419663111368815, "rewards/rejected": -2.4404979705810548, "step": 17178 }, { "epoch": 0.9105557469588954, "grad_norm": 59.5, "kl": 1.1740570068359375, "learning_rate": 5e-07, "logits/chosen": -24467482.0, "logits/rejected": -20151168.0, "logps/chosen": -524.2089233398438, "logps/rejected": -259.26763916015625, "loss": 0.2754, "rewards/chosen": 1.1899299621582031, "rewards/margins": 3.2317609786987305, "rewards/rejected": -2.0418310165405273, "step": 17179 }, { "epoch": 0.9106087509606975, "grad_norm": 32.75, "kl": 0.9482650756835938, "learning_rate": 5e-07, "logits/chosen": 12384889.0, "logits/rejected": -5528495.0, "logps/chosen": -259.44720458984375, "logps/rejected": -130.4700164794922, "loss": 0.1868, "rewards/chosen": 1.0981452465057373, "rewards/margins": 5.0837767124176025, "rewards/rejected": -3.9856314659118652, "step": 17180 }, { "epoch": 0.9106617549624997, "grad_norm": 39.5, "kl": 0.45911407470703125, "learning_rate": 5e-07, "logits/chosen": -13630332.0, "logits/rejected": -7552000.8, "logps/chosen": -204.6080525716146, "logps/rejected": -367.221826171875, "loss": 0.2125, "rewards/chosen": 0.36529362201690674, "rewards/margins": 3.9410420656204224, "rewards/rejected": -3.5757484436035156, "step": 17181 }, { "epoch": 0.9107147589643018, "grad_norm": 36.25, "kl": 0.40215396881103516, "learning_rate": 5e-07, "logits/chosen": -1338391.5, "logits/rejected": -17714998.0, "logps/chosen": -106.71078491210938, "logps/rejected": -363.4267578125, "loss": 0.3177, "rewards/chosen": 0.14918255805969238, "rewards/margins": 2.334378242492676, "rewards/rejected": -2.1851956844329834, "step": 17182 }, { "epoch": 0.910767762966104, "grad_norm": 50.75, "kl": 3.968226432800293, "learning_rate": 5e-07, "logits/chosen": 2667628.6, "logits/rejected": -3566380.6666666665, "logps/chosen": -130.4069580078125, "logps/rejected": -352.511474609375, "loss": 0.3514, "rewards/chosen": 0.5231478691101075, "rewards/margins": 3.9312664349873865, "rewards/rejected": -3.408118565877279, "step": 17183 }, { "epoch": 0.910820766967906, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55413724.0, "logits/rejected": -31596548.0, "logps/chosen": -599.4895629882812, "logps/rejected": -276.7366943359375, "loss": 0.2283, "rewards/chosen": 0.3907920718193054, "rewards/margins": 3.983940064907074, "rewards/rejected": -3.5931479930877686, "step": 17184 }, { "epoch": 0.9108737709697082, "grad_norm": 35.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67967850.66666667, "logits/rejected": -40983651.2, "logps/chosen": -332.19012451171875, "logps/rejected": -454.882080078125, "loss": 0.205, "rewards/chosen": 0.35705769062042236, "rewards/margins": 3.2153891801834105, "rewards/rejected": -2.858331489562988, "step": 17185 }, { "epoch": 0.9109267749715103, "grad_norm": 49.75, "kl": 1.4443511962890625, "learning_rate": 5e-07, "logits/chosen": -13662049.0, "logits/rejected": -20579490.0, "logps/chosen": -244.31527709960938, "logps/rejected": -313.4722900390625, "loss": 0.3577, "rewards/chosen": 0.28339076042175293, "rewards/margins": 2.0095702409744263, "rewards/rejected": -1.7261794805526733, "step": 17186 }, { "epoch": 0.9109797789733125, "grad_norm": 48.5, "kl": 5.958578109741211, "learning_rate": 5e-07, "logits/chosen": 1928152.142857143, "logits/rejected": -55858716.0, "logps/chosen": -195.49979073660714, "logps/rejected": -491.56109619140625, "loss": 0.4753, "rewards/chosen": 0.4946554047720773, "rewards/margins": 4.330849204744611, "rewards/rejected": -3.836193799972534, "step": 17187 }, { "epoch": 0.9110327829751146, "grad_norm": 35.0, "kl": 5.381982803344727, "learning_rate": 5e-07, "logits/chosen": 14540679.0, "logits/rejected": -33034896.0, "logps/chosen": -146.3302459716797, "logps/rejected": -391.6788635253906, "loss": 0.309, "rewards/chosen": 0.6556830406188965, "rewards/margins": 3.3363547325134277, "rewards/rejected": -2.6806716918945312, "step": 17188 }, { "epoch": 0.9110857869769168, "grad_norm": 72.0, "kl": 2.724040985107422, "learning_rate": 5e-07, "logits/chosen": -8517834.666666666, "logits/rejected": 48341.4375, "logps/chosen": -257.54302978515625, "logps/rejected": -127.32048034667969, "loss": 0.2625, "rewards/chosen": 1.1298032601674397, "rewards/margins": 5.6108260949452715, "rewards/rejected": -4.481022834777832, "step": 17189 }, { "epoch": 0.9111387909787189, "grad_norm": 48.0, "kl": 1.7952508926391602, "learning_rate": 5e-07, "logits/chosen": -9159971.0, "logits/rejected": -10242091.0, "logps/chosen": -253.44461059570312, "logps/rejected": -216.3398895263672, "loss": 0.3114, "rewards/chosen": 0.6221884489059448, "rewards/margins": 2.1951721906661987, "rewards/rejected": -1.572983741760254, "step": 17190 }, { "epoch": 0.911191794980521, "grad_norm": 34.75, "kl": 4.3923845291137695, "learning_rate": 5e-07, "logits/chosen": -225720.39583333334, "logits/rejected": -99650624.0, "logps/chosen": -406.6173502604167, "logps/rejected": -386.317236328125, "loss": 0.0961, "rewards/chosen": 2.4380032221476235, "rewards/margins": 5.487410036722819, "rewards/rejected": -3.049406814575195, "step": 17191 }, { "epoch": 0.9112447989823231, "grad_norm": 42.0, "kl": 2.632434844970703, "learning_rate": 5e-07, "logits/chosen": -11700524.0, "logits/rejected": -4589842.0, "logps/chosen": -179.96422119140624, "logps/rejected": -137.80345662434897, "loss": 0.3613, "rewards/chosen": 0.5872002601623535, "rewards/margins": 2.869993782043457, "rewards/rejected": -2.2827935218811035, "step": 17192 }, { "epoch": 0.9112978029841253, "grad_norm": 56.25, "kl": 1.3728885650634766, "learning_rate": 5e-07, "logits/chosen": 13891016.0, "logits/rejected": -26776190.0, "logps/chosen": -187.65193684895834, "logps/rejected": -211.80039978027344, "loss": 0.3887, "rewards/chosen": 0.19812635580698648, "rewards/margins": 2.8909440437952676, "rewards/rejected": -2.6928176879882812, "step": 17193 }, { "epoch": 0.9113508069859274, "grad_norm": 44.0, "kl": 0.2052173614501953, "learning_rate": 5e-07, "logits/chosen": -25240592.0, "logits/rejected": -13373576.0, "logps/chosen": -327.30519612630206, "logps/rejected": -362.476220703125, "loss": 0.1928, "rewards/chosen": 1.4775190353393555, "rewards/margins": 4.386570167541504, "rewards/rejected": -2.9090511322021486, "step": 17194 }, { "epoch": 0.9114038109877296, "grad_norm": 62.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19450630.666666668, "logits/rejected": -15762732.8, "logps/chosen": -286.55413818359375, "logps/rejected": -279.2439453125, "loss": 0.2331, "rewards/chosen": 1.3026153246561687, "rewards/margins": 2.925442091623942, "rewards/rejected": -1.6228267669677734, "step": 17195 }, { "epoch": 0.9114568149895317, "grad_norm": 46.5, "kl": 1.1847763061523438, "learning_rate": 5e-07, "logits/chosen": -26406008.0, "logits/rejected": -36229240.0, "logps/chosen": -275.9319091796875, "logps/rejected": -215.98262532552084, "loss": 0.2935, "rewards/chosen": 0.6149942398071289, "rewards/margins": 2.8286792755126955, "rewards/rejected": -2.2136850357055664, "step": 17196 }, { "epoch": 0.9115098189913339, "grad_norm": 65.0, "kl": 6.311661243438721, "learning_rate": 5e-07, "logits/chosen": -9733941.714285715, "logits/rejected": -32544032.0, "logps/chosen": -220.5235595703125, "logps/rejected": -889.5215454101562, "loss": 0.4451, "rewards/chosen": 0.7696666717529297, "rewards/margins": 2.5875133275985718, "rewards/rejected": -1.817846655845642, "step": 17197 }, { "epoch": 0.911562822993136, "grad_norm": 42.5, "kl": 2.8642444610595703, "learning_rate": 5e-07, "logits/chosen": -10394492.0, "logits/rejected": -91658858.66666667, "logps/chosen": -208.06865234375, "logps/rejected": -807.5548502604166, "loss": 0.3042, "rewards/chosen": 0.8361824989318848, "rewards/margins": 5.343175474802654, "rewards/rejected": -4.5069929758707685, "step": 17198 }, { "epoch": 0.9116158269949381, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22560400.0, "logits/rejected": -13974032.0, "logps/chosen": -417.56121826171875, "logps/rejected": -348.5888671875, "loss": 0.144, "rewards/chosen": 1.3264756202697754, "rewards/margins": 3.822247346242269, "rewards/rejected": -2.4957717259724936, "step": 17199 }, { "epoch": 0.9116688309967402, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 39532856.0, "logits/rejected": -7173506.0, "logps/chosen": -405.66925048828125, "logps/rejected": -382.6754150390625, "loss": 0.2132, "rewards/chosen": 0.7719955444335938, "rewards/margins": 3.9899790287017822, "rewards/rejected": -3.2179834842681885, "step": 17200 }, { "epoch": 0.9117218349985424, "grad_norm": 64.5, "kl": 2.711237907409668, "learning_rate": 5e-07, "logits/chosen": 10621388.666666666, "logits/rejected": -30038208.0, "logps/chosen": -260.9844563802083, "logps/rejected": -309.337353515625, "loss": 0.2582, "rewards/chosen": 0.7599380811055502, "rewards/margins": 2.856022103627523, "rewards/rejected": -2.0960840225219726, "step": 17201 }, { "epoch": 0.9117748390003445, "grad_norm": 54.0, "kl": 5.658472061157227, "learning_rate": 5e-07, "logits/chosen": -15356700.57142857, "logits/rejected": -74066072.0, "logps/chosen": -411.83475167410717, "logps/rejected": -1244.89111328125, "loss": 0.3298, "rewards/chosen": 1.355987821306501, "rewards/margins": 5.099347148622785, "rewards/rejected": -3.743359327316284, "step": 17202 }, { "epoch": 0.9118278430021467, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3105924.5, "logits/rejected": -1837273.2857142857, "logps/chosen": -159.53341674804688, "logps/rejected": -308.6773158482143, "loss": 0.1564, "rewards/chosen": 0.24300232529640198, "rewards/margins": 2.7315200865268707, "rewards/rejected": -2.4885177612304688, "step": 17203 }, { "epoch": 0.9118808470039488, "grad_norm": 44.25, "kl": 1.1664485931396484, "learning_rate": 5e-07, "logits/chosen": -39210348.8, "logits/rejected": -5550646.0, "logps/chosen": -302.04814453125, "logps/rejected": -103.80818684895833, "loss": 0.3439, "rewards/chosen": 0.14940601587295532, "rewards/margins": 3.559213856856028, "rewards/rejected": -3.4098078409830728, "step": 17204 }, { "epoch": 0.911933851005751, "grad_norm": 50.5, "kl": 0.21118545532226562, "learning_rate": 5e-07, "logits/chosen": 2083440.6666666667, "logits/rejected": -23536260.0, "logps/chosen": -144.9625040690104, "logps/rejected": -378.0213928222656, "loss": 0.4246, "rewards/chosen": -0.0765979786713918, "rewards/margins": 1.951440046230952, "rewards/rejected": -2.0280380249023438, "step": 17205 }, { "epoch": 0.911986855007553, "grad_norm": 41.75, "kl": 0.8467655181884766, "learning_rate": 5e-07, "logits/chosen": -26950248.0, "logits/rejected": -13207709.6, "logps/chosen": -213.5056355794271, "logps/rejected": -345.269921875, "loss": 0.2279, "rewards/chosen": 0.7705599466959635, "rewards/margins": 3.108569780985514, "rewards/rejected": -2.338009834289551, "step": 17206 }, { "epoch": 0.9120398590093552, "grad_norm": 63.25, "kl": 0.8643827438354492, "learning_rate": 5e-07, "logits/chosen": -70979446.85714285, "logits/rejected": 2481854.0, "logps/chosen": -294.98939732142856, "logps/rejected": -33.89361572265625, "loss": 0.3977, "rewards/chosen": 0.34951952525547575, "rewards/margins": 1.8337265593664986, "rewards/rejected": -1.484207034111023, "step": 17207 }, { "epoch": 0.9120928630111573, "grad_norm": 43.25, "kl": 0.12058258056640625, "learning_rate": 5e-07, "logits/chosen": 359923.75, "logits/rejected": -22199001.333333332, "logps/chosen": -269.1864013671875, "logps/rejected": -364.3534342447917, "loss": 0.2192, "rewards/chosen": 1.037427544593811, "rewards/margins": 3.505574345588684, "rewards/rejected": -2.468146800994873, "step": 17208 }, { "epoch": 0.9121458670129595, "grad_norm": 61.0, "kl": 4.829446792602539, "learning_rate": 5e-07, "logits/chosen": -42546194.666666664, "logits/rejected": -33435352.0, "logps/chosen": -228.33182779947916, "logps/rejected": -207.458740234375, "loss": 0.3514, "rewards/chosen": 0.8789774576822916, "rewards/margins": 3.2728096644083657, "rewards/rejected": -2.393832206726074, "step": 17209 }, { "epoch": 0.9121988710147616, "grad_norm": 49.25, "kl": 3.633291244506836, "learning_rate": 5e-07, "logits/chosen": 7314722.666666667, "logits/rejected": 14611097.0, "logps/chosen": -56.48597208658854, "logps/rejected": -203.88900756835938, "loss": 0.4091, "rewards/chosen": 0.6717816988627116, "rewards/margins": 1.7658167282740274, "rewards/rejected": -1.094035029411316, "step": 17210 }, { "epoch": 0.9122518750165638, "grad_norm": 65.0, "kl": 0.12111949920654297, "learning_rate": 5e-07, "logits/chosen": -15821916.0, "logits/rejected": -30774592.0, "logps/chosen": -181.4034423828125, "logps/rejected": -245.8645477294922, "loss": 0.3294, "rewards/chosen": -0.0450192466378212, "rewards/margins": 2.101563833653927, "rewards/rejected": -2.146583080291748, "step": 17211 }, { "epoch": 0.9123048790183659, "grad_norm": 48.25, "kl": 2.8570261001586914, "learning_rate": 5e-07, "logits/chosen": -34044984.0, "logits/rejected": -30049626.0, "logps/chosen": -222.39083862304688, "logps/rejected": -385.26336669921875, "loss": 0.3367, "rewards/chosen": 0.6591711044311523, "rewards/margins": 2.8615059852600098, "rewards/rejected": -2.2023348808288574, "step": 17212 }, { "epoch": 0.9123578830201681, "grad_norm": 61.5, "kl": 5.927103042602539, "learning_rate": 5e-07, "logits/chosen": -12107916.0, "logits/rejected": -10132342.0, "logps/chosen": -313.9184163411458, "logps/rejected": -123.92977905273438, "loss": 0.3706, "rewards/chosen": 0.9941377639770508, "rewards/margins": 3.5557754039764404, "rewards/rejected": -2.5616376399993896, "step": 17213 }, { "epoch": 0.9124108870219702, "grad_norm": 37.5, "kl": 0.9721431732177734, "learning_rate": 5e-07, "logits/chosen": -16240090.0, "logits/rejected": -21940164.0, "logps/chosen": -235.33839416503906, "logps/rejected": -429.4786376953125, "loss": 0.313, "rewards/chosen": -0.125595360994339, "rewards/margins": 2.6768469512462616, "rewards/rejected": -2.8024423122406006, "step": 17214 }, { "epoch": 0.9124638910237723, "grad_norm": 37.5, "kl": 1.2350921630859375, "learning_rate": 5e-07, "logits/chosen": -5168608.8, "logits/rejected": -21148976.0, "logps/chosen": -175.5818115234375, "logps/rejected": -176.56292724609375, "loss": 0.2656, "rewards/chosen": 0.9229002952575683, "rewards/margins": 4.343445237477621, "rewards/rejected": -3.4205449422200522, "step": 17215 }, { "epoch": 0.9125168950255744, "grad_norm": 35.75, "kl": 0.9537029266357422, "learning_rate": 5e-07, "logits/chosen": 3544172.5, "logits/rejected": -41948788.0, "logps/chosen": -244.49627685546875, "logps/rejected": -397.3049011230469, "loss": 0.2258, "rewards/chosen": 0.6207119822502136, "rewards/margins": 3.8849775195121765, "rewards/rejected": -3.264265537261963, "step": 17216 }, { "epoch": 0.9125698990273766, "grad_norm": 50.5, "kl": 4.329261779785156, "learning_rate": 5e-07, "logits/chosen": -13493941.714285715, "logits/rejected": -1025221.3125, "logps/chosen": -380.96316964285717, "logps/rejected": -134.77415466308594, "loss": 0.4019, "rewards/chosen": 0.9957730429513114, "rewards/margins": 2.574393953595843, "rewards/rejected": -1.5786209106445312, "step": 17217 }, { "epoch": 0.9126229030291787, "grad_norm": 47.25, "kl": 1.6426935195922852, "learning_rate": 5e-07, "logits/chosen": -37695868.0, "logits/rejected": -17504982.0, "logps/chosen": -338.06024169921875, "logps/rejected": -312.77520751953125, "loss": 0.2793, "rewards/chosen": 0.5670952200889587, "rewards/margins": 2.4568561911582947, "rewards/rejected": -1.889760971069336, "step": 17218 }, { "epoch": 0.9126759070309809, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18288461.333333332, "logits/rejected": -10309740.8, "logps/chosen": -204.23893229166666, "logps/rejected": -268.681005859375, "loss": 0.2396, "rewards/chosen": -0.057171568274497986, "rewards/margins": 2.656939759850502, "rewards/rejected": -2.714111328125, "step": 17219 }, { "epoch": 0.912728911032783, "grad_norm": 44.75, "kl": 0.8804206848144531, "learning_rate": 5e-07, "logits/chosen": -9814779.333333334, "logits/rejected": -20129505.6, "logps/chosen": -346.185302734375, "logps/rejected": -123.03565673828125, "loss": 0.1857, "rewards/chosen": 1.2342243194580078, "rewards/margins": 3.851862335205078, "rewards/rejected": -2.6176380157470702, "step": 17220 }, { "epoch": 0.9127819150345851, "grad_norm": 56.0, "kl": 3.066946029663086, "learning_rate": 5e-07, "logits/chosen": -27139136.0, "logits/rejected": -11538400.0, "logps/chosen": -193.23201497395834, "logps/rejected": -166.14553833007812, "loss": 0.4431, "rewards/chosen": 0.49484193325042725, "rewards/margins": 1.4173593521118164, "rewards/rejected": -0.9225174188613892, "step": 17221 }, { "epoch": 0.9128349190363872, "grad_norm": 43.0, "kl": 0.7516803741455078, "learning_rate": 5e-07, "logits/chosen": -47399656.0, "logits/rejected": -6730980.0, "logps/chosen": -209.43450927734375, "logps/rejected": -294.114013671875, "loss": 0.2496, "rewards/chosen": 0.6619858741760254, "rewards/margins": 3.3131518363952637, "rewards/rejected": -2.6511659622192383, "step": 17222 }, { "epoch": 0.9128879230381893, "grad_norm": 46.75, "kl": 2.203166961669922, "learning_rate": 5e-07, "logits/chosen": -9715072.8, "logits/rejected": -12274865.333333334, "logps/chosen": -276.119921875, "logps/rejected": -375.1911214192708, "loss": 0.3635, "rewards/chosen": 0.48198494911193845, "rewards/margins": 2.657947079340617, "rewards/rejected": -2.1759621302286782, "step": 17223 }, { "epoch": 0.9129409270399915, "grad_norm": 47.25, "kl": 3.1353683471679688, "learning_rate": 5e-07, "logits/chosen": 17716394.666666668, "logits/rejected": 20872120.0, "logps/chosen": -278.83091227213544, "logps/rejected": -289.40594482421875, "loss": 0.3088, "rewards/chosen": 1.2186903953552246, "rewards/margins": 2.7679229974746704, "rewards/rejected": -1.5492326021194458, "step": 17224 }, { "epoch": 0.9129939310417936, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34682640.0, "logits/rejected": -45442932.0, "logps/chosen": -272.43475341796875, "logps/rejected": -194.880126953125, "loss": 0.2542, "rewards/chosen": 0.5390308499336243, "rewards/margins": 2.7837522625923157, "rewards/rejected": -2.2447214126586914, "step": 17225 }, { "epoch": 0.9130469350435958, "grad_norm": 64.0, "kl": 1.8054990768432617, "learning_rate": 5e-07, "logits/chosen": -14542455.0, "logits/rejected": -50933004.0, "logps/chosen": -396.998291015625, "logps/rejected": -425.35504150390625, "loss": 0.2675, "rewards/chosen": 1.1652872562408447, "rewards/margins": 3.3369219303131104, "rewards/rejected": -2.1716346740722656, "step": 17226 }, { "epoch": 0.9130999390453979, "grad_norm": 68.0, "kl": 2.589620590209961, "learning_rate": 5e-07, "logits/chosen": -15335486.666666666, "logits/rejected": -18953609.6, "logps/chosen": -69.04601033528645, "logps/rejected": -405.008544921875, "loss": 0.2531, "rewards/chosen": 0.38235751787821454, "rewards/margins": 2.815141216913859, "rewards/rejected": -2.4327836990356446, "step": 17227 }, { "epoch": 0.9131529430472001, "grad_norm": 56.75, "kl": 5.570772171020508, "learning_rate": 5e-07, "logits/chosen": -8951076.0, "logits/rejected": -55738948.0, "logps/chosen": -191.01220703125, "logps/rejected": -179.27830505371094, "loss": 0.3448, "rewards/chosen": 0.9090620676676432, "rewards/margins": 3.399470011393229, "rewards/rejected": -2.490407943725586, "step": 17228 }, { "epoch": 0.9132059470490022, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15012394.0, "logits/rejected": -79797200.0, "logps/chosen": -208.98915100097656, "logps/rejected": -208.8375447591146, "loss": 0.211, "rewards/chosen": 0.5681266784667969, "rewards/margins": 2.5683509508768716, "rewards/rejected": -2.0002242724100747, "step": 17229 }, { "epoch": 0.9132589510508043, "grad_norm": 61.5, "kl": 3.197721481323242, "learning_rate": 5e-07, "logits/chosen": -13701868.8, "logits/rejected": -15597626.666666666, "logps/chosen": -400.38173828125, "logps/rejected": -273.3797200520833, "loss": 0.3207, "rewards/chosen": 0.8073447227478028, "rewards/margins": 2.5787137031555174, "rewards/rejected": -1.7713689804077148, "step": 17230 }, { "epoch": 0.9133119550526064, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91414056.0, "logits/rejected": -33128802.285714287, "logps/chosen": -695.804443359375, "logps/rejected": -253.15053013392858, "loss": 0.1497, "rewards/chosen": 0.694750964641571, "rewards/margins": 2.941147438117436, "rewards/rejected": -2.246396473475865, "step": 17231 }, { "epoch": 0.9133649590544086, "grad_norm": 45.75, "kl": 1.0807933807373047, "learning_rate": 5e-07, "logits/chosen": 1295104.0, "logits/rejected": -7472691.5, "logps/chosen": -166.33721923828125, "logps/rejected": -169.4306182861328, "loss": 0.2847, "rewards/chosen": 0.9767497777938843, "rewards/margins": 2.584499716758728, "rewards/rejected": -1.6077499389648438, "step": 17232 }, { "epoch": 0.9134179630562107, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38903824.0, "logits/rejected": -13646214.4, "logps/chosen": -384.6339518229167, "logps/rejected": -461.676806640625, "loss": 0.1826, "rewards/chosen": 0.5048792759577433, "rewards/margins": 3.87813290754954, "rewards/rejected": -3.3732536315917967, "step": 17233 }, { "epoch": 0.9134709670580129, "grad_norm": 39.0, "kl": 2.288421630859375, "learning_rate": 5e-07, "logits/chosen": 14434061.333333334, "logits/rejected": -14766315.2, "logps/chosen": -738.7447102864584, "logps/rejected": -235.7156005859375, "loss": 0.1855, "rewards/chosen": 1.4451643625895183, "rewards/margins": 4.1071413675944015, "rewards/rejected": -2.661977005004883, "step": 17234 }, { "epoch": 0.913523971059815, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59670124.8, "logits/rejected": 9868576.0, "logps/chosen": -321.8966796875, "logps/rejected": -299.5040690104167, "loss": 0.3605, "rewards/chosen": 0.12951799631118774, "rewards/margins": 1.7122726082801818, "rewards/rejected": -1.5827546119689941, "step": 17235 }, { "epoch": 0.9135769750616172, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60245011.2, "logits/rejected": -76323760.0, "logps/chosen": -341.03828125, "logps/rejected": -435.8506266276042, "loss": 0.345, "rewards/chosen": 0.05850006341934204, "rewards/margins": 2.254337243239085, "rewards/rejected": -2.1958371798197427, "step": 17236 }, { "epoch": 0.9136299790634193, "grad_norm": 31.375, "kl": 2.6261157989501953, "learning_rate": 5e-07, "logits/chosen": -327940.0, "logits/rejected": -28462732.8, "logps/chosen": -63.47579447428385, "logps/rejected": -225.3819580078125, "loss": 0.2901, "rewards/chosen": 0.19236111640930176, "rewards/margins": 2.300318956375122, "rewards/rejected": -2.1079578399658203, "step": 17237 }, { "epoch": 0.9136829830652214, "grad_norm": 57.0, "kl": 1.307851791381836, "learning_rate": 5e-07, "logits/chosen": -23397110.4, "logits/rejected": -6310163.333333333, "logps/chosen": -341.5020751953125, "logps/rejected": -179.92769368489584, "loss": 0.3171, "rewards/chosen": 0.6720906257629394, "rewards/margins": 2.219709809621175, "rewards/rejected": -1.5476191838582356, "step": 17238 }, { "epoch": 0.9137359870670235, "grad_norm": 51.75, "kl": 0.2165679931640625, "learning_rate": 5e-07, "logits/chosen": 7501351.0, "logits/rejected": 53703021.71428572, "logps/chosen": -86.98390197753906, "logps/rejected": -318.89027622767856, "loss": 0.2511, "rewards/chosen": -0.33726197481155396, "rewards/margins": 1.3044746007238115, "rewards/rejected": -1.6417365755353654, "step": 17239 }, { "epoch": 0.9137889910688257, "grad_norm": 79.5, "kl": 1.8035697937011719, "learning_rate": 5e-07, "logits/chosen": -64710218.666666664, "logits/rejected": -16485916.8, "logps/chosen": -701.0069173177084, "logps/rejected": -385.68076171875, "loss": 0.1664, "rewards/chosen": 1.527083396911621, "rewards/margins": 4.009393882751465, "rewards/rejected": -2.482310485839844, "step": 17240 }, { "epoch": 0.9138419950706278, "grad_norm": 52.75, "kl": 1.1316261291503906, "learning_rate": 5e-07, "logits/chosen": -44975590.4, "logits/rejected": 34007632.0, "logps/chosen": -297.581591796875, "logps/rejected": -914.53955078125, "loss": 0.2308, "rewards/chosen": 1.0336234092712402, "rewards/margins": 5.489230823516846, "rewards/rejected": -4.4556074142456055, "step": 17241 }, { "epoch": 0.91389499907243, "grad_norm": 54.5, "kl": 1.479644775390625, "learning_rate": 5e-07, "logits/chosen": 1146615.3333333333, "logits/rejected": 2688617.6, "logps/chosen": -146.13174438476562, "logps/rejected": -156.8022216796875, "loss": 0.3237, "rewards/chosen": 0.5003273884455363, "rewards/margins": 1.9292499462763466, "rewards/rejected": -1.4289225578308105, "step": 17242 }, { "epoch": 0.9139480030742321, "grad_norm": 36.0, "kl": 2.9629135131835938, "learning_rate": 5e-07, "logits/chosen": -16650423.0, "logits/rejected": -14480073.0, "logps/chosen": -765.0921630859375, "logps/rejected": -464.6461181640625, "loss": 0.1911, "rewards/chosen": 1.2949304580688477, "rewards/margins": 3.6741440296173096, "rewards/rejected": -2.379213571548462, "step": 17243 }, { "epoch": 0.9140010070760343, "grad_norm": 34.5, "kl": 2.4575796127319336, "learning_rate": 5e-07, "logits/chosen": 9700938.0, "logits/rejected": -30347509.333333332, "logps/chosen": -15.406694412231445, "logps/rejected": -364.8704833984375, "loss": 0.1814, "rewards/chosen": 0.6338072419166565, "rewards/margins": 3.366011997063955, "rewards/rejected": -2.7322047551472983, "step": 17244 }, { "epoch": 0.9140540110778363, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -81101749.33333333, "logits/rejected": -41005177.6, "logps/chosen": -394.8635660807292, "logps/rejected": -403.05068359375, "loss": 0.2136, "rewards/chosen": 0.22805583477020264, "rewards/margins": 3.171125292778015, "rewards/rejected": -2.9430694580078125, "step": 17245 }, { "epoch": 0.9141070150796385, "grad_norm": 34.5, "kl": 0.7221870422363281, "learning_rate": 5e-07, "logits/chosen": -9681061.333333334, "logits/rejected": -26861843.2, "logps/chosen": -176.9283447265625, "logps/rejected": -202.1918212890625, "loss": 0.3155, "rewards/chosen": 0.11900806427001953, "rewards/margins": 1.9521760940551758, "rewards/rejected": -1.8331680297851562, "step": 17246 }, { "epoch": 0.9141600190814406, "grad_norm": 79.5, "kl": 2.2265052795410156, "learning_rate": 5e-07, "logits/chosen": -18342216.0, "logits/rejected": 8199068.8, "logps/chosen": -131.33333333333334, "logps/rejected": -320.2641845703125, "loss": 0.2742, "rewards/chosen": 0.5103503465652466, "rewards/margins": 3.4658915758132935, "rewards/rejected": -2.955541229248047, "step": 17247 }, { "epoch": 0.9142130230832428, "grad_norm": 43.75, "kl": 2.420316696166992, "learning_rate": 5e-07, "logits/chosen": -8726468.8, "logits/rejected": -62135968.0, "logps/chosen": -252.9576416015625, "logps/rejected": -288.4282633463542, "loss": 0.3139, "rewards/chosen": 0.7086430549621582, "rewards/margins": 2.4067491849263507, "rewards/rejected": -1.6981061299641926, "step": 17248 }, { "epoch": 0.9142660270850449, "grad_norm": 69.0, "kl": 1.1752853393554688, "learning_rate": 5e-07, "logits/chosen": -14715856.0, "logits/rejected": -29030021.333333332, "logps/chosen": -560.377001953125, "logps/rejected": -279.50146484375, "loss": 0.2759, "rewards/chosen": 0.7859833240509033, "rewards/margins": 3.538789987564087, "rewards/rejected": -2.7528066635131836, "step": 17249 }, { "epoch": 0.9143190310868471, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38833664.0, "logits/rejected": -11185818.0, "logps/chosen": -412.54608154296875, "logps/rejected": -489.5605773925781, "loss": 0.1398, "rewards/chosen": 1.1455711126327515, "rewards/margins": 5.121532320976257, "rewards/rejected": -3.975961208343506, "step": 17250 }, { "epoch": 0.9143720350886492, "grad_norm": 49.25, "kl": 2.63433837890625, "learning_rate": 5e-07, "logits/chosen": -48345968.0, "logits/rejected": -31850160.0, "logps/chosen": -439.98614501953125, "logps/rejected": -187.43385314941406, "loss": 0.2502, "rewards/chosen": 0.8558780550956726, "rewards/margins": 4.794234931468964, "rewards/rejected": -3.938356876373291, "step": 17251 }, { "epoch": 0.9144250390904514, "grad_norm": 28.125, "kl": 3.41815185546875, "learning_rate": 5e-07, "logits/chosen": -5966355.333333333, "logits/rejected": -32949891.2, "logps/chosen": -190.04632568359375, "logps/rejected": -449.6359375, "loss": 0.2006, "rewards/chosen": 0.9481664498647054, "rewards/margins": 6.439076026280721, "rewards/rejected": -5.490909576416016, "step": 17252 }, { "epoch": 0.9144780430922534, "grad_norm": 29.5, "kl": 1.6056556701660156, "learning_rate": 5e-07, "logits/chosen": -28027270.0, "logits/rejected": -37631392.0, "logps/chosen": -193.40614318847656, "logps/rejected": -434.1748046875, "loss": 0.1856, "rewards/chosen": -0.350245863199234, "rewards/margins": 3.4070952037970224, "rewards/rejected": -3.7573410669962564, "step": 17253 }, { "epoch": 0.9145310470940556, "grad_norm": 29.875, "kl": 4.867527961730957, "learning_rate": 5e-07, "logits/chosen": -16367051.0, "logits/rejected": -71785960.0, "logps/chosen": -572.8169555664062, "logps/rejected": -351.666015625, "loss": 0.2265, "rewards/chosen": 1.5326706171035767, "rewards/margins": 3.969601273536682, "rewards/rejected": -2.4369306564331055, "step": 17254 }, { "epoch": 0.9145840510958577, "grad_norm": 89.5, "kl": 0.2962217330932617, "learning_rate": 5e-07, "logits/chosen": -59048068.0, "logits/rejected": 20329100.0, "logps/chosen": -347.6173400878906, "logps/rejected": -282.1574300130208, "loss": 0.3204, "rewards/chosen": 0.4909569025039673, "rewards/margins": 1.6017247438430786, "rewards/rejected": -1.1107678413391113, "step": 17255 }, { "epoch": 0.9146370550976599, "grad_norm": 108.0, "kl": 10.200944900512695, "learning_rate": 5e-07, "logits/chosen": -14121091.2, "logits/rejected": -15504769.333333334, "logps/chosen": -590.206982421875, "logps/rejected": -599.6651204427084, "loss": 0.2984, "rewards/chosen": 2.1473731994628906, "rewards/margins": 5.126372973124186, "rewards/rejected": -2.9789997736612954, "step": 17256 }, { "epoch": 0.914690059099462, "grad_norm": 33.75, "kl": 3.1318321228027344, "learning_rate": 5e-07, "logits/chosen": 7265083.0, "logits/rejected": -117409168.0, "logps/chosen": -306.760009765625, "logps/rejected": -426.3890075683594, "loss": 0.2197, "rewards/chosen": 1.2221401929855347, "rewards/margins": 4.244190335273743, "rewards/rejected": -3.022050142288208, "step": 17257 }, { "epoch": 0.9147430631012642, "grad_norm": 31.0, "kl": 1.9720563888549805, "learning_rate": 5e-07, "logits/chosen": -4317473.0, "logits/rejected": -33449816.0, "logps/chosen": -145.05914306640625, "logps/rejected": -535.499267578125, "loss": 0.3052, "rewards/chosen": 0.690531333287557, "rewards/margins": 4.130486567815145, "rewards/rejected": -3.439955234527588, "step": 17258 }, { "epoch": 0.9147960671030663, "grad_norm": 40.75, "kl": 1.6048059463500977, "learning_rate": 5e-07, "logits/chosen": -39344741.333333336, "logits/rejected": -507446.625, "logps/chosen": -208.1100056966146, "logps/rejected": -228.54583740234375, "loss": 0.4224, "rewards/chosen": 0.0081634521484375, "rewards/margins": 3.262995719909668, "rewards/rejected": -3.2548322677612305, "step": 17259 }, { "epoch": 0.9148490711048685, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36719762.666666664, "logits/rejected": -34411641.6, "logps/chosen": -131.88259887695312, "logps/rejected": -371.8375, "loss": 0.2528, "rewards/chosen": 0.025932669639587402, "rewards/margins": 2.695117545127869, "rewards/rejected": -2.6691848754882814, "step": 17260 }, { "epoch": 0.9149020751066705, "grad_norm": 50.0, "kl": 2.8923521041870117, "learning_rate": 5e-07, "logits/chosen": -3992860.4, "logits/rejected": -1650917.3333333333, "logps/chosen": -114.69312744140625, "logps/rejected": -153.19558715820312, "loss": 0.2785, "rewards/chosen": 0.84982328414917, "rewards/margins": 4.274257946014404, "rewards/rejected": -3.4244346618652344, "step": 17261 }, { "epoch": 0.9149550791084727, "grad_norm": 43.0, "kl": 3.2544307708740234, "learning_rate": 5e-07, "logits/chosen": -934518.0, "logits/rejected": 1137280.8, "logps/chosen": -275.8433837890625, "logps/rejected": -193.83643798828126, "loss": 0.3514, "rewards/chosen": 0.21075695753097534, "rewards/margins": 2.6174952387809753, "rewards/rejected": -2.40673828125, "step": 17262 }, { "epoch": 0.9150080831102748, "grad_norm": 50.75, "kl": 5.765439987182617, "learning_rate": 5e-07, "logits/chosen": 2589693.0, "logits/rejected": -59883578.666666664, "logps/chosen": -327.621923828125, "logps/rejected": -411.848876953125, "loss": 0.4539, "rewards/chosen": 0.16887073516845702, "rewards/margins": 1.933458137512207, "rewards/rejected": -1.76458740234375, "step": 17263 }, { "epoch": 0.915061087112077, "grad_norm": 49.5, "kl": 2.7711410522460938, "learning_rate": 5e-07, "logits/chosen": -20549018.0, "logits/rejected": -10199932.0, "logps/chosen": -305.10498046875, "logps/rejected": -219.28651428222656, "loss": 0.2821, "rewards/chosen": 1.18082594871521, "rewards/margins": 3.301889657974243, "rewards/rejected": -2.121063709259033, "step": 17264 }, { "epoch": 0.9151140911138791, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48386224.0, "logits/rejected": -16873040.0, "logps/chosen": -468.5908203125, "logps/rejected": -369.468994140625, "loss": 0.2336, "rewards/chosen": 0.6577325344085694, "rewards/margins": 3.8658411184946697, "rewards/rejected": -3.2081085840861, "step": 17265 }, { "epoch": 0.9151670951156813, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40924932.0, "logits/rejected": -17105256.0, "logps/chosen": -250.22451782226562, "logps/rejected": -250.45445251464844, "loss": 0.2912, "rewards/chosen": 0.2240116000175476, "rewards/margins": 3.2390145659446716, "rewards/rejected": -3.015002965927124, "step": 17266 }, { "epoch": 0.9152200991174834, "grad_norm": 76.0, "kl": 2.0944957733154297, "learning_rate": 5e-07, "logits/chosen": -3884890.6666666665, "logits/rejected": -9485795.0, "logps/chosen": -258.0589599609375, "logps/rejected": -168.1549835205078, "loss": 0.3864, "rewards/chosen": 0.5573559204737345, "rewards/margins": 1.7230307261149087, "rewards/rejected": -1.1656748056411743, "step": 17267 }, { "epoch": 0.9152731031192856, "grad_norm": 39.25, "kl": 0.4658222198486328, "learning_rate": 5e-07, "logits/chosen": -38308373.333333336, "logits/rejected": -15552276.8, "logps/chosen": -374.4764404296875, "logps/rejected": -165.44476318359375, "loss": 0.1902, "rewards/chosen": 0.8501750628153483, "rewards/margins": 3.357185331980387, "rewards/rejected": -2.507010269165039, "step": 17268 }, { "epoch": 0.9153261071210876, "grad_norm": 54.0, "kl": 3.710805892944336, "learning_rate": 5e-07, "logits/chosen": -20657829.333333332, "logits/rejected": -24487544.0, "logps/chosen": -285.8147786458333, "logps/rejected": -572.5181274414062, "loss": 0.4602, "rewards/chosen": 0.10254019498825073, "rewards/margins": 3.747990071773529, "rewards/rejected": -3.6454498767852783, "step": 17269 }, { "epoch": 0.9153791111228898, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -7006214.5, "logps/rejected": -243.49473571777344, "loss": 0.1165, "rewards/rejected": -3.2682156562805176, "step": 17270 }, { "epoch": 0.9154321151246919, "grad_norm": 55.5, "kl": 0.9322948455810547, "learning_rate": 5e-07, "logits/chosen": -41619257.6, "logits/rejected": -68520298.66666667, "logps/chosen": -636.090087890625, "logps/rejected": -502.914794921875, "loss": 0.2002, "rewards/chosen": 1.4682253837585448, "rewards/margins": 4.618924363454183, "rewards/rejected": -3.150698979695638, "step": 17271 }, { "epoch": 0.915485119126494, "grad_norm": 59.25, "kl": 0.671058177947998, "learning_rate": 5e-07, "logits/chosen": -14510806.666666666, "logits/rejected": -10714221.6, "logps/chosen": -419.8968505859375, "logps/rejected": -257.693701171875, "loss": 0.3022, "rewards/chosen": 0.267767071723938, "rewards/margins": 1.9296855211257935, "rewards/rejected": -1.6619184494018555, "step": 17272 }, { "epoch": 0.9155381231282962, "grad_norm": 56.75, "kl": 3.355736255645752, "learning_rate": 5e-07, "logits/chosen": -64660896.0, "logits/rejected": -32122348.8, "logps/chosen": -374.1968180338542, "logps/rejected": -158.996923828125, "loss": 0.2461, "rewards/chosen": 1.0035231908162434, "rewards/margins": 2.6998488744099935, "rewards/rejected": -1.69632568359375, "step": 17273 }, { "epoch": 0.9155911271300983, "grad_norm": 39.75, "kl": 0.7331695556640625, "learning_rate": 5e-07, "logits/chosen": -25463560.0, "logits/rejected": -13054720.0, "logps/chosen": -208.00455729166666, "logps/rejected": -495.70048828125, "loss": 0.2741, "rewards/chosen": 0.06405182679494222, "rewards/margins": 2.2573698123296104, "rewards/rejected": -2.193317985534668, "step": 17274 }, { "epoch": 0.9156441311319005, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41434204.0, "logits/rejected": -23067460.0, "logps/chosen": -176.61305236816406, "logps/rejected": -250.13296508789062, "loss": 0.2942, "rewards/chosen": -0.09365683794021606, "rewards/margins": 2.895807206630707, "rewards/rejected": -2.989464044570923, "step": 17275 }, { "epoch": 0.9156971351337025, "grad_norm": 85.5, "kl": 7.56346321105957, "learning_rate": 5e-07, "logits/chosen": -58368314.666666664, "logits/rejected": 6112369.0, "logps/chosen": -526.4408365885416, "logps/rejected": -497.5534362792969, "loss": 0.3849, "rewards/chosen": 1.0521787007649739, "rewards/margins": 2.3387815554936724, "rewards/rejected": -1.2866028547286987, "step": 17276 }, { "epoch": 0.9157501391355047, "grad_norm": 38.5, "kl": 1.936479091644287, "learning_rate": 5e-07, "logits/chosen": -7432630.666666667, "logits/rejected": -13832033.6, "logps/chosen": -184.85050455729166, "logps/rejected": -372.1558349609375, "loss": 0.2894, "rewards/chosen": 0.6750209331512451, "rewards/margins": 3.54725661277771, "rewards/rejected": -2.8722356796264648, "step": 17277 }, { "epoch": 0.9158031431373068, "grad_norm": 42.75, "kl": 1.4485273361206055, "learning_rate": 5e-07, "logits/chosen": -23913768.0, "logits/rejected": -1219726.8, "logps/chosen": -268.3555908203125, "logps/rejected": -146.15478515625, "loss": 0.263, "rewards/chosen": 0.7647515137990316, "rewards/margins": 2.5088343461354574, "rewards/rejected": -1.7440828323364257, "step": 17278 }, { "epoch": 0.915856147139109, "grad_norm": 29.0, "kl": 1.0877418518066406, "learning_rate": 5e-07, "logits/chosen": -10029571.0, "logits/rejected": -29436869.333333332, "logps/chosen": -71.6806869506836, "logps/rejected": -404.183837890625, "loss": 0.2264, "rewards/chosen": 0.03708134591579437, "rewards/margins": 2.8374947955211005, "rewards/rejected": -2.800413449605306, "step": 17279 }, { "epoch": 0.9159091511409111, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71313448.0, "logits/rejected": -50279648.0, "logps/chosen": -394.81756591796875, "logps/rejected": -383.014892578125, "loss": 0.2976, "rewards/chosen": 0.49573975801467896, "rewards/margins": 2.3872844576835632, "rewards/rejected": -1.8915446996688843, "step": 17280 }, { "epoch": 0.9159621551427133, "grad_norm": 56.0, "kl": 0.28147125244140625, "learning_rate": 5e-07, "logits/chosen": -30209481.14285714, "logits/rejected": -4777455.5, "logps/chosen": -267.3828125, "logps/rejected": -573.04296875, "loss": 0.4537, "rewards/chosen": -0.03727918863296509, "rewards/margins": 4.0677685141563416, "rewards/rejected": -4.105047702789307, "step": 17281 }, { "epoch": 0.9160151591445154, "grad_norm": 29.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 207946.0, "logits/rejected": -50300772.571428575, "logps/chosen": -27.20569610595703, "logps/rejected": -360.9998256138393, "loss": 0.1278, "rewards/chosen": -0.04730186611413956, "rewards/margins": 3.4149805871503696, "rewards/rejected": -3.462282453264509, "step": 17282 }, { "epoch": 0.9160681631463176, "grad_norm": 39.75, "kl": 7.980210304260254, "learning_rate": 5e-07, "logits/chosen": -17458426.666666668, "logits/rejected": -32630660.0, "logps/chosen": -255.1995646158854, "logps/rejected": -439.152099609375, "loss": 0.4133, "rewards/chosen": 0.9724457263946533, "rewards/margins": 2.521155595779419, "rewards/rejected": -1.5487098693847656, "step": 17283 }, { "epoch": 0.9161211671481196, "grad_norm": 49.5, "kl": 2.980560302734375, "learning_rate": 5e-07, "logits/chosen": -16323904.0, "logits/rejected": -56492224.0, "logps/chosen": -349.379345703125, "logps/rejected": -224.55816650390625, "loss": 0.306, "rewards/chosen": 1.0809549331665038, "rewards/margins": 2.4681171417236327, "rewards/rejected": -1.387162208557129, "step": 17284 }, { "epoch": 0.9161741711499218, "grad_norm": 67.0, "kl": 1.3447160720825195, "learning_rate": 5e-07, "logits/chosen": -28867939.2, "logits/rejected": -18493344.0, "logps/chosen": -582.97109375, "logps/rejected": -275.9361572265625, "loss": 0.3321, "rewards/chosen": 0.3254962682723999, "rewards/margins": 2.2619124174118044, "rewards/rejected": -1.9364161491394043, "step": 17285 }, { "epoch": 0.9162271751517239, "grad_norm": 56.0, "kl": 4.281181335449219, "learning_rate": 5e-07, "logits/chosen": -76790656.0, "logits/rejected": -5089218.0, "logps/chosen": -548.963671875, "logps/rejected": -292.9462890625, "loss": 0.3072, "rewards/chosen": 1.2617077827453613, "rewards/margins": 3.5532145500183105, "rewards/rejected": -2.291506767272949, "step": 17286 }, { "epoch": 0.9162801791535261, "grad_norm": 60.25, "kl": 6.100618362426758, "learning_rate": 5e-07, "logits/chosen": -18513275.42857143, "logits/rejected": -17996344.0, "logps/chosen": -255.17817034040178, "logps/rejected": -184.50711059570312, "loss": 0.3935, "rewards/chosen": 0.8896528652736119, "rewards/margins": 5.631831305367606, "rewards/rejected": -4.742178440093994, "step": 17287 }, { "epoch": 0.9163331831553282, "grad_norm": 47.25, "kl": 2.30499267578125, "learning_rate": 5e-07, "logits/chosen": 4632623.0, "logits/rejected": -51451541.333333336, "logps/chosen": -48.473052978515625, "logps/rejected": -306.49814860026044, "loss": 0.2543, "rewards/chosen": -0.224897101521492, "rewards/margins": 1.9184401482343674, "rewards/rejected": -2.1433372497558594, "step": 17288 }, { "epoch": 0.9163861871571304, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58136332.0, "logits/rejected": -15940752.0, "logps/chosen": -391.72735595703125, "logps/rejected": -326.9615173339844, "loss": 0.2663, "rewards/chosen": 0.051971420645713806, "rewards/margins": 3.951789364218712, "rewards/rejected": -3.899817943572998, "step": 17289 }, { "epoch": 0.9164391911589325, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61009157.333333336, "logits/rejected": -87938240.0, "logps/chosen": -201.23994954427084, "logps/rejected": -381.179833984375, "loss": 0.209, "rewards/chosen": 1.0236562887827556, "rewards/margins": 3.3277309576670326, "rewards/rejected": -2.3040746688842773, "step": 17290 }, { "epoch": 0.9164921951607347, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17018994.0, "logits/rejected": -30788690.285714287, "logps/chosen": -54.45271301269531, "logps/rejected": -397.74818638392856, "loss": 0.1137, "rewards/chosen": 0.6641807556152344, "rewards/margins": 3.7660421643938338, "rewards/rejected": -3.1018614087785994, "step": 17291 }, { "epoch": 0.9165451991625367, "grad_norm": 46.0, "kl": 1.6187496185302734, "learning_rate": 5e-07, "logits/chosen": -36765232.0, "logits/rejected": -20976860.0, "logps/chosen": -304.1015625, "logps/rejected": -298.769775390625, "loss": 0.3699, "rewards/chosen": -0.2081097513437271, "rewards/margins": 2.5288072675466537, "rewards/rejected": -2.736917018890381, "step": 17292 }, { "epoch": 0.9165982031643389, "grad_norm": 36.25, "kl": 0.7485427856445312, "learning_rate": 5e-07, "logits/chosen": 3763356.75, "logits/rejected": -27145701.333333332, "logps/chosen": -42.056087493896484, "logps/rejected": -239.77608235677084, "loss": 0.1674, "rewards/chosen": 0.9679110050201416, "rewards/margins": 3.906708002090454, "rewards/rejected": -2.9387969970703125, "step": 17293 }, { "epoch": 0.916651207166141, "grad_norm": 113.0, "kl": 8.629429817199707, "learning_rate": 5e-07, "logits/chosen": -21055056.0, "logits/rejected": 1065887.1666666667, "logps/chosen": -1059.497265625, "logps/rejected": -86.81776936848958, "loss": 0.3306, "rewards/chosen": 1.805013084411621, "rewards/margins": 3.6304039001464843, "rewards/rejected": -1.8253908157348633, "step": 17294 }, { "epoch": 0.9167042111679432, "grad_norm": 30.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60784864.0, "logits/rejected": -24033584.0, "logps/chosen": -196.58306884765625, "logps/rejected": -305.874267578125, "loss": 0.1662, "rewards/chosen": 0.9599915742874146, "rewards/margins": 4.815874218940735, "rewards/rejected": -3.8558826446533203, "step": 17295 }, { "epoch": 0.9167572151697453, "grad_norm": 51.25, "kl": 0.5623302459716797, "learning_rate": 5e-07, "logits/chosen": -45688784.0, "logits/rejected": -11946830.0, "logps/chosen": -185.36947631835938, "logps/rejected": -345.15277099609375, "loss": 0.3319, "rewards/chosen": 0.0693790391087532, "rewards/margins": 2.231536813080311, "rewards/rejected": -2.1621577739715576, "step": 17296 }, { "epoch": 0.9168102191715475, "grad_norm": 43.75, "kl": 2.0677318572998047, "learning_rate": 5e-07, "logits/chosen": -25029541.333333332, "logits/rejected": -56084168.0, "logps/chosen": -250.8094482421875, "logps/rejected": -500.3311767578125, "loss": 0.3767, "rewards/chosen": 0.1353284219900767, "rewards/margins": 3.821099648873011, "rewards/rejected": -3.6857712268829346, "step": 17297 }, { "epoch": 0.9168632231733496, "grad_norm": 44.0, "kl": 1.137864112854004, "learning_rate": 5e-07, "logits/chosen": -9866556.0, "logits/rejected": -29123512.0, "logps/chosen": -235.98809814453125, "logps/rejected": -148.0592498779297, "loss": 0.3167, "rewards/chosen": 0.018864430487155914, "rewards/margins": 2.0806260481476784, "rewards/rejected": -2.0617616176605225, "step": 17298 }, { "epoch": 0.9169162271751518, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37403360.0, "logits/rejected": -19726472.0, "logps/chosen": -344.79315185546875, "logps/rejected": -364.02520751953125, "loss": 0.222, "rewards/chosen": 0.565416693687439, "rewards/margins": 4.078651309013367, "rewards/rejected": -3.5132346153259277, "step": 17299 }, { "epoch": 0.9169692311769538, "grad_norm": 48.25, "kl": 2.8567142486572266, "learning_rate": 5e-07, "logits/chosen": -23929209.6, "logits/rejected": -130261130.66666667, "logps/chosen": -223.0212158203125, "logps/rejected": -304.6761067708333, "loss": 0.2831, "rewards/chosen": 0.7732704162597657, "rewards/margins": 4.44478619893392, "rewards/rejected": -3.671515782674154, "step": 17300 }, { "epoch": 0.917022235178756, "grad_norm": 45.25, "kl": 6.630409240722656, "learning_rate": 5e-07, "logits/chosen": -42447468.8, "logits/rejected": -16706957.333333334, "logps/chosen": -613.370361328125, "logps/rejected": -176.3068644205729, "loss": 0.2965, "rewards/chosen": 1.3495588302612305, "rewards/margins": 3.9432493845621743, "rewards/rejected": -2.593690554300944, "step": 17301 }, { "epoch": 0.9170752391805581, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9877354.0, "logits/rejected": -31810498.0, "logps/chosen": -386.43548583984375, "logps/rejected": -376.3580017089844, "loss": 0.1029, "rewards/chosen": 1.9397279024124146, "rewards/margins": 5.698452115058899, "rewards/rejected": -3.7587242126464844, "step": 17302 }, { "epoch": 0.9171282431823603, "grad_norm": 64.0, "kl": 1.1103973388671875, "learning_rate": 5e-07, "logits/chosen": -41835248.0, "logits/rejected": -7942100.0, "logps/chosen": -370.2624206542969, "logps/rejected": -193.0281524658203, "loss": 0.3083, "rewards/chosen": 0.8321266174316406, "rewards/margins": 2.658736824989319, "rewards/rejected": -1.8266102075576782, "step": 17303 }, { "epoch": 0.9171812471841624, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2803030.0, "logits/rejected": -9080259.333333334, "logps/chosen": -371.5527038574219, "logps/rejected": -258.022216796875, "loss": 0.1615, "rewards/chosen": 0.39779892563819885, "rewards/margins": 3.117462545633316, "rewards/rejected": -2.719663619995117, "step": 17304 }, { "epoch": 0.9172342511859646, "grad_norm": 28.125, "kl": 0.9401273727416992, "learning_rate": 5e-07, "logits/chosen": -9929185.0, "logits/rejected": -19969442.0, "logps/chosen": -455.24774169921875, "logps/rejected": -267.8351135253906, "loss": 0.1767, "rewards/chosen": 1.4509434700012207, "rewards/margins": 4.798499345779419, "rewards/rejected": -3.3475558757781982, "step": 17305 }, { "epoch": 0.9172872551877667, "grad_norm": 44.75, "kl": 4.393184661865234, "learning_rate": 5e-07, "logits/chosen": -47574492.8, "logits/rejected": -23028637.333333332, "logps/chosen": -801.52509765625, "logps/rejected": -224.44193522135416, "loss": 0.228, "rewards/chosen": 1.7102964401245118, "rewards/margins": 4.308375803629557, "rewards/rejected": -2.5980793635050454, "step": 17306 }, { "epoch": 0.9173402591895689, "grad_norm": 46.0, "kl": 0.7411098480224609, "learning_rate": 5e-07, "logits/chosen": -60293348.0, "logits/rejected": 5677952.0, "logps/chosen": -278.9922790527344, "logps/rejected": -595.7344970703125, "loss": 0.2881, "rewards/chosen": 0.08443747460842133, "rewards/margins": 5.069285497069359, "rewards/rejected": -4.9848480224609375, "step": 17307 }, { "epoch": 0.9173932631913709, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14590040.0, "logits/rejected": -20958684.0, "logps/chosen": -145.01800537109375, "logps/rejected": -456.50439453125, "loss": 0.3003, "rewards/chosen": -0.1464877724647522, "rewards/margins": 2.6860219836235046, "rewards/rejected": -2.832509756088257, "step": 17308 }, { "epoch": 0.9174462671931731, "grad_norm": 30.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 26733236.0, "logits/rejected": -40347259.428571425, "logps/chosen": -187.23452758789062, "logps/rejected": -233.94911411830358, "loss": 0.1514, "rewards/chosen": -0.013073730282485485, "rewards/margins": 3.200568935035595, "rewards/rejected": -3.2136426653180803, "step": 17309 }, { "epoch": 0.9174992711949752, "grad_norm": 33.5, "kl": 3.2255778312683105, "learning_rate": 5e-07, "logits/chosen": -9807552.0, "logits/rejected": -6268966.5, "logps/chosen": -237.68289184570312, "logps/rejected": -454.0861511230469, "loss": 0.2201, "rewards/chosen": 1.2609590291976929, "rewards/margins": 6.432807803153992, "rewards/rejected": -5.171848773956299, "step": 17310 }, { "epoch": 0.9175522751967774, "grad_norm": 52.0, "kl": 3.768235206604004, "learning_rate": 5e-07, "logits/chosen": 1007002.4, "logits/rejected": -6535355.333333333, "logps/chosen": -460.39443359375, "logps/rejected": -240.5422566731771, "loss": 0.2962, "rewards/chosen": 1.2923762321472168, "rewards/margins": 3.2467956860860188, "rewards/rejected": -1.954419453938802, "step": 17311 }, { "epoch": 0.9176052791985795, "grad_norm": 36.5, "kl": 4.522855758666992, "learning_rate": 5e-07, "logits/chosen": -7845134.0, "logits/rejected": -23093042.0, "logps/chosen": -396.3306884765625, "logps/rejected": -419.98809814453125, "loss": 0.1838, "rewards/chosen": 1.5092277526855469, "rewards/margins": 3.9375438690185547, "rewards/rejected": -2.428316116333008, "step": 17312 }, { "epoch": 0.9176582832003817, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33236442.666666668, "logits/rejected": -22511459.2, "logps/chosen": -215.4928995768229, "logps/rejected": -233.6543701171875, "loss": 0.2619, "rewards/chosen": 0.010392248630523682, "rewards/margins": 2.55806165933609, "rewards/rejected": -2.5476694107055664, "step": 17313 }, { "epoch": 0.9177112872021838, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47306880.0, "logits/rejected": -22264180.8, "logps/chosen": -290.94517008463544, "logps/rejected": -342.27119140625, "loss": 0.2426, "rewards/chosen": 0.22125192483266196, "rewards/margins": 2.873178537686666, "rewards/rejected": -2.651926612854004, "step": 17314 }, { "epoch": 0.917764291203986, "grad_norm": 46.0, "kl": 6.994617462158203, "learning_rate": 5e-07, "logits/chosen": -67154592.0, "logits/rejected": -22053818.0, "logps/chosen": -726.37158203125, "logps/rejected": -418.0788269042969, "loss": 0.2712, "rewards/chosen": 2.0076324939727783, "rewards/margins": 4.134833097457886, "rewards/rejected": -2.1272006034851074, "step": 17315 }, { "epoch": 0.917817295205788, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43719501.333333336, "logits/rejected": -26024795.2, "logps/chosen": -422.17919921875, "logps/rejected": -302.5384033203125, "loss": 0.1522, "rewards/chosen": 1.5072967211405437, "rewards/margins": 3.8473697344462074, "rewards/rejected": -2.340073013305664, "step": 17316 }, { "epoch": 0.9178702992075902, "grad_norm": 33.0, "kl": 1.8470478057861328, "learning_rate": 5e-07, "logits/chosen": -77474344.0, "logits/rejected": -50785914.666666664, "logps/chosen": -1390.6134033203125, "logps/rejected": -244.01338704427084, "loss": 0.0983, "rewards/chosen": 2.752096652984619, "rewards/margins": 5.6551642417907715, "rewards/rejected": -2.9030675888061523, "step": 17317 }, { "epoch": 0.9179233032093923, "grad_norm": 33.5, "kl": 0.3708791732788086, "learning_rate": 5e-07, "logits/chosen": -40448364.0, "logits/rejected": -50572468.0, "logps/chosen": -973.262939453125, "logps/rejected": -550.6475830078125, "loss": 0.1678, "rewards/chosen": 1.3777658939361572, "rewards/margins": 6.037611246109009, "rewards/rejected": -4.659845352172852, "step": 17318 }, { "epoch": 0.9179763072111945, "grad_norm": 53.5, "kl": 3.4422664642333984, "learning_rate": 5e-07, "logits/chosen": -6048241.5, "logits/rejected": -6171150.0, "logps/chosen": -179.78451538085938, "logps/rejected": -282.82684326171875, "loss": 0.233, "rewards/chosen": 1.3988081216812134, "rewards/margins": 3.528586983680725, "rewards/rejected": -2.1297788619995117, "step": 17319 }, { "epoch": 0.9180293112129966, "grad_norm": 68.0, "kl": 3.5979537963867188, "learning_rate": 5e-07, "logits/chosen": 1187178.6666666667, "logits/rejected": 136299353.6, "logps/chosen": -412.1625162760417, "logps/rejected": -346.9350830078125, "loss": 0.2694, "rewards/chosen": 0.7628633975982666, "rewards/margins": 3.989696741104126, "rewards/rejected": -3.2268333435058594, "step": 17320 }, { "epoch": 0.9180823152147988, "grad_norm": 31.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60066200.0, "logits/rejected": -26282458.666666668, "logps/chosen": -465.1650695800781, "logps/rejected": -480.8419596354167, "loss": 0.0986, "rewards/chosen": 0.876507580280304, "rewards/margins": 4.5268667340278625, "rewards/rejected": -3.6503591537475586, "step": 17321 }, { "epoch": 0.9181353192166009, "grad_norm": 55.0, "kl": 2.513355255126953, "learning_rate": 5e-07, "logits/chosen": -26567024.0, "logits/rejected": 443903.5, "logps/chosen": -237.77825927734375, "logps/rejected": -229.8661651611328, "loss": 0.3876, "rewards/chosen": 0.7924389839172363, "rewards/margins": 1.3734472393989563, "rewards/rejected": -0.58100825548172, "step": 17322 }, { "epoch": 0.9181883232184029, "grad_norm": 33.75, "kl": 1.3854999542236328, "learning_rate": 5e-07, "logits/chosen": -18233200.0, "logits/rejected": -47415072.0, "logps/chosen": -213.1885782877604, "logps/rejected": -342.344970703125, "loss": 0.2443, "rewards/chosen": 0.605643113454183, "rewards/margins": 2.444308694203695, "rewards/rejected": -1.8386655807495118, "step": 17323 }, { "epoch": 0.9182413272202051, "grad_norm": 63.0, "kl": 1.7190513610839844, "learning_rate": 5e-07, "logits/chosen": -71290202.66666667, "logits/rejected": -57090792.0, "logps/chosen": -454.6786702473958, "logps/rejected": -351.66314697265625, "loss": 0.3799, "rewards/chosen": 0.3640607198079427, "rewards/margins": 2.6082051595052085, "rewards/rejected": -2.2441444396972656, "step": 17324 }, { "epoch": 0.9182943312220072, "grad_norm": 55.75, "kl": 0.8976650238037109, "learning_rate": 5e-07, "logits/chosen": -32294291.2, "logits/rejected": -29866090.666666668, "logps/chosen": -199.53746337890624, "logps/rejected": -243.65606689453125, "loss": 0.4131, "rewards/chosen": -0.16876113414764404, "rewards/margins": 1.648669997851054, "rewards/rejected": -1.817431131998698, "step": 17325 }, { "epoch": 0.9183473352238094, "grad_norm": 50.75, "kl": 4.6746625900268555, "learning_rate": 5e-07, "logits/chosen": -17421610.666666668, "logits/rejected": -25893420.0, "logps/chosen": -325.9558512369792, "logps/rejected": -405.884033203125, "loss": 0.2843, "rewards/chosen": 1.1165038744608562, "rewards/margins": 5.284239451090495, "rewards/rejected": -4.167735576629639, "step": 17326 }, { "epoch": 0.9184003392256115, "grad_norm": 49.0, "kl": 1.865513801574707, "learning_rate": 5e-07, "logits/chosen": -11374750.0, "logits/rejected": -37311388.0, "logps/chosen": -126.15924072265625, "logps/rejected": -281.97607421875, "loss": 0.3544, "rewards/chosen": 0.17734476923942566, "rewards/margins": 2.181519716978073, "rewards/rejected": -2.0041749477386475, "step": 17327 }, { "epoch": 0.9184533432274137, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26199944.0, "logits/rejected": -27222246.4, "logps/chosen": -201.28299967447916, "logps/rejected": -167.27315673828124, "loss": 0.3111, "rewards/chosen": -0.2790030837059021, "rewards/margins": 1.7026100754737854, "rewards/rejected": -1.9816131591796875, "step": 17328 }, { "epoch": 0.9185063472292158, "grad_norm": 47.75, "kl": 2.3665761947631836, "learning_rate": 5e-07, "logits/chosen": -12031639.0, "logits/rejected": -72648184.0, "logps/chosen": -147.43435668945312, "logps/rejected": -338.7916259765625, "loss": 0.2961, "rewards/chosen": 0.5938087701797485, "rewards/margins": 2.5573986768722534, "rewards/rejected": -1.9635899066925049, "step": 17329 }, { "epoch": 0.918559351231018, "grad_norm": 35.25, "kl": 0.7960700988769531, "learning_rate": 5e-07, "logits/chosen": -30208266.0, "logits/rejected": -9432446.0, "logps/chosen": -266.9569091796875, "logps/rejected": -281.8626403808594, "loss": 0.245, "rewards/chosen": 0.24926653504371643, "rewards/margins": 4.178181320428848, "rewards/rejected": -3.928914785385132, "step": 17330 }, { "epoch": 0.91861235523282, "grad_norm": 38.5, "kl": 3.3765859603881836, "learning_rate": 5e-07, "logits/chosen": 10661003.0, "logits/rejected": -15242716.0, "logps/chosen": -23.436420440673828, "logps/rejected": -263.9345703125, "loss": 0.2623, "rewards/chosen": 0.7754889726638794, "rewards/margins": 3.9439464807510376, "rewards/rejected": -3.168457508087158, "step": 17331 }, { "epoch": 0.9186653592346222, "grad_norm": 63.0, "kl": 0.045032501220703125, "learning_rate": 5e-07, "logits/chosen": -4199066.4, "logits/rejected": -2492807.1666666665, "logps/chosen": -158.04090576171876, "logps/rejected": -332.2399495442708, "loss": 0.407, "rewards/chosen": 0.4784380912780762, "rewards/margins": 1.1995444615681967, "rewards/rejected": -0.7211063702901205, "step": 17332 }, { "epoch": 0.9187183632364243, "grad_norm": 49.5, "kl": 3.014828681945801, "learning_rate": 5e-07, "logits/chosen": -25871212.0, "logits/rejected": -21171876.0, "logps/chosen": -531.9628295898438, "logps/rejected": -248.2698974609375, "loss": 0.2786, "rewards/chosen": 0.9889594912528992, "rewards/margins": 2.924513518810272, "rewards/rejected": -1.935554027557373, "step": 17333 }, { "epoch": 0.9187713672382265, "grad_norm": 42.75, "kl": 1.3719635009765625, "learning_rate": 5e-07, "logits/chosen": -28357802.666666668, "logits/rejected": -46015156.0, "logps/chosen": -455.5474039713542, "logps/rejected": -563.8631591796875, "loss": 0.2506, "rewards/chosen": 1.105796178181966, "rewards/margins": 4.718648751576741, "rewards/rejected": -3.6128525733947754, "step": 17334 }, { "epoch": 0.9188243712400286, "grad_norm": 47.75, "kl": 0.5252351760864258, "learning_rate": 5e-07, "logits/chosen": -31452120.0, "logits/rejected": -31977544.0, "logps/chosen": -275.328125, "logps/rejected": -494.66583251953125, "loss": 0.4018, "rewards/chosen": -0.2533321678638458, "rewards/margins": 3.639900654554367, "rewards/rejected": -3.893232822418213, "step": 17335 }, { "epoch": 0.9188773752418308, "grad_norm": 42.25, "kl": 0.18604469299316406, "learning_rate": 5e-07, "logits/chosen": -35317555.2, "logits/rejected": -20731158.666666668, "logps/chosen": -234.604736328125, "logps/rejected": -131.53018188476562, "loss": 0.2662, "rewards/chosen": 0.5391485214233398, "rewards/margins": 3.072366587320963, "rewards/rejected": -2.5332180658976235, "step": 17336 }, { "epoch": 0.9189303792436329, "grad_norm": 57.75, "kl": 0.08359909057617188, "learning_rate": 5e-07, "logits/chosen": -92093848.0, "logits/rejected": -5727225.0, "logps/chosen": -251.75283813476562, "logps/rejected": -284.8224182128906, "loss": 0.342, "rewards/chosen": 0.2575848698616028, "rewards/margins": 1.4077337384223938, "rewards/rejected": -1.150148868560791, "step": 17337 }, { "epoch": 0.918983383245435, "grad_norm": 40.75, "kl": 2.405923843383789, "learning_rate": 5e-07, "logits/chosen": -22288948.0, "logits/rejected": 3933617.75, "logps/chosen": -380.77532958984375, "logps/rejected": -67.26054382324219, "loss": 0.2568, "rewards/chosen": 1.2096976041793823, "rewards/margins": 3.2792454957962036, "rewards/rejected": -2.0695478916168213, "step": 17338 }, { "epoch": 0.9190363872472371, "grad_norm": 47.25, "kl": 1.7671279907226562, "learning_rate": 5e-07, "logits/chosen": -21871304.0, "logits/rejected": 357247072.0, "logps/chosen": -501.7939758300781, "logps/rejected": -252.78396606445312, "loss": 0.1966, "rewards/chosen": 1.3495228290557861, "rewards/margins": 4.030742168426514, "rewards/rejected": -2.6812193393707275, "step": 17339 }, { "epoch": 0.9190893912490393, "grad_norm": 30.625, "kl": 1.7327680587768555, "learning_rate": 5e-07, "logits/chosen": -9090293.333333334, "logits/rejected": -21276252.8, "logps/chosen": -294.6767171223958, "logps/rejected": -396.4697021484375, "loss": 0.1671, "rewards/chosen": 1.4273182551066081, "rewards/margins": 4.0223951975504555, "rewards/rejected": -2.5950769424438476, "step": 17340 }, { "epoch": 0.9191423952508414, "grad_norm": 44.75, "kl": 2.925872802734375, "learning_rate": 5e-07, "logits/chosen": -25376394.666666668, "logits/rejected": -16820388.8, "logps/chosen": -308.9078776041667, "logps/rejected": -418.13857421875, "loss": 0.2323, "rewards/chosen": 1.0458343823750813, "rewards/margins": 3.4373989423116047, "rewards/rejected": -2.3915645599365236, "step": 17341 }, { "epoch": 0.9191953992526436, "grad_norm": 43.75, "kl": 0.5777454376220703, "learning_rate": 5e-07, "logits/chosen": -41649830.4, "logits/rejected": -81796202.66666667, "logps/chosen": -311.788818359375, "logps/rejected": -303.7434895833333, "loss": 0.2462, "rewards/chosen": 1.0353007316589355, "rewards/margins": 2.9028884569803877, "rewards/rejected": -1.867587725321452, "step": 17342 }, { "epoch": 0.9192484032544457, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37321564.0, "logits/rejected": -1223562.0, "logps/chosen": -258.3829345703125, "logps/rejected": -440.6866048177083, "loss": 0.2074, "rewards/chosen": -0.12076979130506516, "rewards/margins": 2.628489203751087, "rewards/rejected": -2.7492589950561523, "step": 17343 }, { "epoch": 0.9193014072562479, "grad_norm": 37.75, "kl": 1.1637887954711914, "learning_rate": 5e-07, "logits/chosen": -39577232.0, "logits/rejected": -22071564.0, "logps/chosen": -301.5952453613281, "logps/rejected": -319.87738037109375, "loss": 0.1633, "rewards/chosen": 1.2523789405822754, "rewards/margins": 4.069411277770996, "rewards/rejected": -2.8170323371887207, "step": 17344 }, { "epoch": 0.91935441125805, "grad_norm": 42.0, "kl": 0.8078060150146484, "learning_rate": 5e-07, "logits/chosen": -43982565.333333336, "logits/rejected": -11620733.6, "logps/chosen": -223.58087158203125, "logps/rejected": -306.4711181640625, "loss": 0.2637, "rewards/chosen": 0.8395231564839681, "rewards/margins": 2.6374273618062336, "rewards/rejected": -1.7979042053222656, "step": 17345 }, { "epoch": 0.9194074152598521, "grad_norm": 67.0, "kl": 0.001880645751953125, "learning_rate": 5e-07, "logits/chosen": -11737148.0, "logits/rejected": -47417440.0, "logps/chosen": -288.31787109375, "logps/rejected": -561.0713297526041, "loss": 0.3168, "rewards/chosen": 0.36862454414367674, "rewards/margins": 2.4595373630523683, "rewards/rejected": -2.0909128189086914, "step": 17346 }, { "epoch": 0.9194604192616542, "grad_norm": 38.5, "kl": 1.981308937072754, "learning_rate": 5e-07, "logits/chosen": -14988433.0, "logits/rejected": -16057896.0, "logps/chosen": -236.8333740234375, "logps/rejected": -153.09217834472656, "loss": 0.2074, "rewards/chosen": 1.372896432876587, "rewards/margins": 3.682114601135254, "rewards/rejected": -2.309218168258667, "step": 17347 }, { "epoch": 0.9195134232634564, "grad_norm": 38.5, "kl": 1.360264778137207, "learning_rate": 5e-07, "logits/chosen": 1503090.6666666667, "logits/rejected": -26777900.0, "logps/chosen": -94.95009358723958, "logps/rejected": -341.2343444824219, "loss": 0.355, "rewards/chosen": 0.37477342287699383, "rewards/margins": 3.3146416346232095, "rewards/rejected": -2.939868211746216, "step": 17348 }, { "epoch": 0.9195664272652585, "grad_norm": 50.75, "kl": 1.7916450500488281, "learning_rate": 5e-07, "logits/chosen": -35994282.666666664, "logits/rejected": -25837731.2, "logps/chosen": -524.8023274739584, "logps/rejected": -196.07958984375, "loss": 0.2354, "rewards/chosen": 1.4962937037150066, "rewards/margins": 3.2206888834635414, "rewards/rejected": -1.724395179748535, "step": 17349 }, { "epoch": 0.9196194312670607, "grad_norm": 44.25, "kl": 3.411998748779297, "learning_rate": 5e-07, "logits/chosen": -19954465.6, "logits/rejected": 26109504.0, "logps/chosen": -185.12711181640626, "logps/rejected": -425.6890462239583, "loss": 0.313, "rewards/chosen": 0.8935075759887695, "rewards/margins": 4.366203880310058, "rewards/rejected": -3.472696304321289, "step": 17350 }, { "epoch": 0.9196724352688628, "grad_norm": 35.0, "kl": 4.119529724121094, "learning_rate": 5e-07, "logits/chosen": -14753648.0, "logits/rejected": 37200128.0, "logps/chosen": -287.342919921875, "logps/rejected": -203.7745361328125, "loss": 0.2998, "rewards/chosen": 1.4605376243591308, "rewards/margins": 3.141128412882487, "rewards/rejected": -1.6805907885233562, "step": 17351 }, { "epoch": 0.919725439270665, "grad_norm": 47.25, "kl": 1.0039129257202148, "learning_rate": 5e-07, "logits/chosen": -37027072.0, "logits/rejected": -16038924.8, "logps/chosen": -232.96053059895834, "logps/rejected": -327.27900390625, "loss": 0.2275, "rewards/chosen": 0.31390947103500366, "rewards/margins": 3.477227342128754, "rewards/rejected": -3.16331787109375, "step": 17352 }, { "epoch": 0.919778443272467, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34047848.0, "logits/rejected": -109627208.0, "logps/chosen": -297.8573913574219, "logps/rejected": -329.89599609375, "loss": 0.3069, "rewards/chosen": -0.11803092062473297, "rewards/margins": 2.489987000823021, "rewards/rejected": -2.608017921447754, "step": 17353 }, { "epoch": 0.9198314472742692, "grad_norm": 43.75, "kl": 2.316422462463379, "learning_rate": 5e-07, "logits/chosen": -2061395.0, "logits/rejected": -12843461.0, "logps/chosen": -184.2223358154297, "logps/rejected": -229.32061767578125, "loss": 0.3002, "rewards/chosen": 0.8932583332061768, "rewards/margins": 2.7052435874938965, "rewards/rejected": -1.8119852542877197, "step": 17354 }, { "epoch": 0.9198844512760713, "grad_norm": 49.25, "kl": 8.850276947021484, "learning_rate": 5e-07, "logits/chosen": -6590737.5, "logits/rejected": -13790999.0, "logps/chosen": -282.92132568359375, "logps/rejected": -189.48451232910156, "loss": 0.3621, "rewards/chosen": 1.4215816259384155, "rewards/margins": 3.11452579498291, "rewards/rejected": -1.6929441690444946, "step": 17355 }, { "epoch": 0.9199374552778735, "grad_norm": 52.75, "kl": 1.6052284240722656, "learning_rate": 5e-07, "logits/chosen": -26433397.333333332, "logits/rejected": -13219334.4, "logps/chosen": -369.4413655598958, "logps/rejected": -370.865576171875, "loss": 0.1979, "rewards/chosen": 0.7293052673339844, "rewards/margins": 4.113637542724609, "rewards/rejected": -3.384332275390625, "step": 17356 }, { "epoch": 0.9199904592796756, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22850340.0, "logits/rejected": -26082427.42857143, "logps/chosen": -221.67376708984375, "logps/rejected": -192.48517717633928, "loss": 0.0896, "rewards/chosen": 0.2737381160259247, "rewards/margins": 4.1234297794955115, "rewards/rejected": -3.8496916634695872, "step": 17357 }, { "epoch": 0.9200434632814778, "grad_norm": 46.25, "kl": 2.1545820236206055, "learning_rate": 5e-07, "logits/chosen": -29023344.0, "logits/rejected": -34215424.0, "logps/chosen": -307.9557861328125, "logps/rejected": -427.1312662760417, "loss": 0.4084, "rewards/chosen": -0.1724919080734253, "rewards/margins": 3.329301635424296, "rewards/rejected": -3.501793543497721, "step": 17358 }, { "epoch": 0.9200964672832799, "grad_norm": 27.125, "kl": 2.2231626510620117, "learning_rate": 5e-07, "logits/chosen": -2999541.0, "logits/rejected": -53415712.0, "logps/chosen": -220.096923828125, "logps/rejected": -381.903076171875, "loss": 0.1668, "rewards/chosen": 1.3237617015838623, "rewards/margins": 3.9671638011932373, "rewards/rejected": -2.643402099609375, "step": 17359 }, { "epoch": 0.9201494712850821, "grad_norm": 38.0, "kl": 3.000912666320801, "learning_rate": 5e-07, "logits/chosen": -12904234.666666666, "logits/rejected": -2560116.8, "logps/chosen": -456.2544759114583, "logps/rejected": -177.6262939453125, "loss": 0.1814, "rewards/chosen": 1.858526388804118, "rewards/margins": 4.514498488108317, "rewards/rejected": -2.6559720993041993, "step": 17360 }, { "epoch": 0.9202024752868841, "grad_norm": 41.0, "kl": 6.910604476928711, "learning_rate": 5e-07, "logits/chosen": -3036486.4, "logits/rejected": -6645364.0, "logps/chosen": -178.1398193359375, "logps/rejected": -74.42922465006511, "loss": 0.4295, "rewards/chosen": 0.31409735679626466, "rewards/margins": 3.2337836742401125, "rewards/rejected": -2.9196863174438477, "step": 17361 }, { "epoch": 0.9202554792886863, "grad_norm": 39.25, "kl": 1.2146453857421875, "learning_rate": 5e-07, "logits/chosen": -2258471.3333333335, "logits/rejected": -47610598.4, "logps/chosen": -266.1136474609375, "logps/rejected": -344.3773193359375, "loss": 0.2156, "rewards/chosen": 0.18473186095555624, "rewards/margins": 3.497870441277822, "rewards/rejected": -3.3131385803222657, "step": 17362 }, { "epoch": 0.9203084832904884, "grad_norm": 55.75, "kl": 3.727254867553711, "learning_rate": 5e-07, "logits/chosen": -18468592.0, "logits/rejected": -32656676.0, "logps/chosen": -299.3421936035156, "logps/rejected": -366.901123046875, "loss": 0.3108, "rewards/chosen": 0.7249638438224792, "rewards/margins": 2.370546042919159, "rewards/rejected": -1.6455821990966797, "step": 17363 }, { "epoch": 0.9203614872922906, "grad_norm": 90.0, "kl": 2.9417495727539062, "learning_rate": 5e-07, "logits/chosen": -57471653.333333336, "logits/rejected": -26807866.0, "logps/chosen": -541.262939453125, "logps/rejected": -138.6162109375, "loss": 0.2442, "rewards/chosen": 1.1808797518412273, "rewards/margins": 5.709773699442546, "rewards/rejected": -4.528893947601318, "step": 17364 }, { "epoch": 0.9204144912940927, "grad_norm": 54.0, "kl": 3.6848983764648438, "learning_rate": 5e-07, "logits/chosen": -884266.0, "logits/rejected": -64871228.0, "logps/chosen": -203.78692626953125, "logps/rejected": -150.122802734375, "loss": 0.3705, "rewards/chosen": 0.789734681447347, "rewards/margins": 2.3277176221211753, "rewards/rejected": -1.5379829406738281, "step": 17365 }, { "epoch": 0.9204674952958949, "grad_norm": 69.0, "kl": 3.550609588623047, "learning_rate": 5e-07, "logits/chosen": -6958419.2, "logits/rejected": -22566402.666666668, "logps/chosen": -469.63603515625, "logps/rejected": -306.4739583333333, "loss": 0.2748, "rewards/chosen": 0.9580121040344238, "rewards/margins": 5.191229661305745, "rewards/rejected": -4.233217557271321, "step": 17366 }, { "epoch": 0.920520499297697, "grad_norm": 41.0, "kl": 5.054344177246094, "learning_rate": 5e-07, "logits/chosen": 4929297.333333333, "logits/rejected": 4532659.0, "logps/chosen": -230.54833984375, "logps/rejected": -87.38795471191406, "loss": 0.2765, "rewards/chosen": 1.3718746503194172, "rewards/margins": 4.488269408543904, "rewards/rejected": -3.1163947582244873, "step": 17367 }, { "epoch": 0.9205735032994992, "grad_norm": 54.5, "kl": 2.6249771118164062, "learning_rate": 5e-07, "logits/chosen": -33167322.666666668, "logits/rejected": 2995937.6, "logps/chosen": -427.7721761067708, "logps/rejected": -371.15146484375, "loss": 0.2128, "rewards/chosen": 1.3238118489583333, "rewards/margins": 4.178210194905599, "rewards/rejected": -2.8543983459472657, "step": 17368 }, { "epoch": 0.9206265073013012, "grad_norm": 50.75, "kl": 3.987852096557617, "learning_rate": 5e-07, "logits/chosen": -24144565.333333332, "logits/rejected": -11290798.0, "logps/chosen": -278.5857340494792, "logps/rejected": -185.6428985595703, "loss": 0.4751, "rewards/chosen": 0.008951246738433838, "rewards/margins": 2.704954206943512, "rewards/rejected": -2.696002960205078, "step": 17369 }, { "epoch": 0.9206795113031034, "grad_norm": 45.0, "kl": 4.482175827026367, "learning_rate": 5e-07, "logits/chosen": -45769156.0, "logits/rejected": -27233544.0, "logps/chosen": -364.56072998046875, "logps/rejected": -333.1258544921875, "loss": 0.2539, "rewards/chosen": 1.034571886062622, "rewards/margins": 4.526984453201294, "rewards/rejected": -3.492412567138672, "step": 17370 }, { "epoch": 0.9207325153049055, "grad_norm": 71.5, "kl": 4.282540321350098, "learning_rate": 5e-07, "logits/chosen": -19401682.0, "logits/rejected": -11748126.0, "logps/chosen": -314.19573974609375, "logps/rejected": -309.05670166015625, "loss": 0.3101, "rewards/chosen": 0.6120744347572327, "rewards/margins": 3.4902129769325256, "rewards/rejected": -2.878138542175293, "step": 17371 }, { "epoch": 0.9207855193067077, "grad_norm": 57.0, "kl": 6.046895980834961, "learning_rate": 5e-07, "logits/chosen": -46097289.14285714, "logits/rejected": -50177984.0, "logps/chosen": -363.2988978794643, "logps/rejected": -316.6600036621094, "loss": 0.4084, "rewards/chosen": 0.8575109754289899, "rewards/margins": 3.4993812356676375, "rewards/rejected": -2.6418702602386475, "step": 17372 }, { "epoch": 0.9208385233085098, "grad_norm": 68.0, "kl": 4.045055389404297, "learning_rate": 5e-07, "logits/chosen": -6545202.0, "logits/rejected": -38016205.333333336, "logps/chosen": -598.1058959960938, "logps/rejected": -517.4970703125, "loss": 0.1852, "rewards/chosen": 1.5937302112579346, "rewards/margins": 4.931069612503052, "rewards/rejected": -3.337339401245117, "step": 17373 }, { "epoch": 0.9208915273103119, "grad_norm": 44.75, "kl": 0.9618434906005859, "learning_rate": 5e-07, "logits/chosen": -42625364.0, "logits/rejected": -23987744.0, "logps/chosen": -827.8419189453125, "logps/rejected": -254.08233642578125, "loss": 0.1782, "rewards/chosen": 1.8203363418579102, "rewards/margins": 3.9930875301361084, "rewards/rejected": -2.1727511882781982, "step": 17374 }, { "epoch": 0.9209445313121141, "grad_norm": 63.0, "kl": 4.156154632568359, "learning_rate": 5e-07, "logits/chosen": -70990520.0, "logits/rejected": -25749064.0, "logps/chosen": -406.1363220214844, "logps/rejected": -254.02859497070312, "loss": 0.3209, "rewards/chosen": 1.133287787437439, "rewards/margins": 2.8279850482940674, "rewards/rejected": -1.6946972608566284, "step": 17375 }, { "epoch": 0.9209975353139161, "grad_norm": 36.25, "kl": 6.065916061401367, "learning_rate": 5e-07, "logits/chosen": -2404031.6, "logits/rejected": 10096414.666666666, "logps/chosen": -100.11617431640624, "logps/rejected": -506.2015787760417, "loss": 0.3538, "rewards/chosen": 1.005392837524414, "rewards/margins": 3.8198891957600916, "rewards/rejected": -2.8144963582356772, "step": 17376 }, { "epoch": 0.9210505393157183, "grad_norm": 36.25, "kl": 1.9276466369628906, "learning_rate": 5e-07, "logits/chosen": -3937562.0, "logits/rejected": -9977034.666666666, "logps/chosen": -102.73541259765625, "logps/rejected": -89.77536010742188, "loss": 0.2911, "rewards/chosen": 0.9300693511962891, "rewards/margins": 3.5769299507141112, "rewards/rejected": -2.6468605995178223, "step": 17377 }, { "epoch": 0.9211035433175204, "grad_norm": 114.0, "kl": 4.814334869384766, "learning_rate": 5e-07, "logits/chosen": -68767616.0, "logits/rejected": -75449904.0, "logps/chosen": -567.44775390625, "logps/rejected": -253.13435872395834, "loss": 0.2676, "rewards/chosen": 1.4801166534423829, "rewards/margins": 3.6530495961507166, "rewards/rejected": -2.1729329427083335, "step": 17378 }, { "epoch": 0.9211565473193226, "grad_norm": 60.25, "kl": 0.14359283447265625, "learning_rate": 5e-07, "logits/chosen": -20722816.0, "logits/rejected": -11742295.0, "logps/chosen": -280.6115417480469, "logps/rejected": -277.880615234375, "loss": 0.2938, "rewards/chosen": 0.25553908944129944, "rewards/margins": 2.3721303045749664, "rewards/rejected": -2.116591215133667, "step": 17379 }, { "epoch": 0.9212095513211247, "grad_norm": 33.25, "kl": 1.3753671646118164, "learning_rate": 5e-07, "logits/chosen": 1211322.75, "logits/rejected": -58426812.0, "logps/chosen": -72.77161407470703, "logps/rejected": -611.2991943359375, "loss": 0.245, "rewards/chosen": 0.5753637552261353, "rewards/margins": 4.004353880882263, "rewards/rejected": -3.428990125656128, "step": 17380 }, { "epoch": 0.9212625553229269, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -113506101.33333333, "logits/rejected": -4301926.8, "logps/chosen": -416.1591796875, "logps/rejected": -177.08125, "loss": 0.2128, "rewards/chosen": 0.8720550537109375, "rewards/margins": 3.119149398803711, "rewards/rejected": -2.2470943450927736, "step": 17381 }, { "epoch": 0.921315559324729, "grad_norm": 50.25, "kl": 0.05988502502441406, "learning_rate": 5e-07, "logits/chosen": -29209502.0, "logits/rejected": -20461160.0, "logps/chosen": -693.6715087890625, "logps/rejected": -134.95541381835938, "loss": 0.2873, "rewards/chosen": 0.5181827545166016, "rewards/margins": 2.455377459526062, "rewards/rejected": -1.9371947050094604, "step": 17382 }, { "epoch": 0.9213685633265312, "grad_norm": 49.5, "kl": 2.6536483764648438, "learning_rate": 5e-07, "logits/chosen": -49369824.0, "logits/rejected": -32417206.4, "logps/chosen": -262.9419352213542, "logps/rejected": -125.70223388671874, "loss": 0.3907, "rewards/chosen": 0.30549710988998413, "rewards/margins": 1.6803529143333436, "rewards/rejected": -1.3748558044433594, "step": 17383 }, { "epoch": 0.9214215673283332, "grad_norm": 44.25, "kl": 0.9862680435180664, "learning_rate": 5e-07, "logits/chosen": -15306998.666666666, "logits/rejected": -8529484.8, "logps/chosen": -194.14227294921875, "logps/rejected": -192.3144775390625, "loss": 0.3886, "rewards/chosen": -0.24150596062342325, "rewards/margins": 0.9275400598843894, "rewards/rejected": -1.1690460205078126, "step": 17384 }, { "epoch": 0.9214745713301354, "grad_norm": 38.5, "kl": 4.263912200927734, "learning_rate": 5e-07, "logits/chosen": -7412382.4, "logits/rejected": -10714518.666666666, "logps/chosen": -221.00458984375, "logps/rejected": -251.00946044921875, "loss": 0.4282, "rewards/chosen": -3.726482391357422e-05, "rewards/margins": 2.685033090909322, "rewards/rejected": -2.685070355733236, "step": 17385 }, { "epoch": 0.9215275753319375, "grad_norm": 57.75, "kl": 1.1329727172851562, "learning_rate": 5e-07, "logits/chosen": -17308080.0, "logits/rejected": 28282632.0, "logps/chosen": -198.30952962239584, "logps/rejected": -443.0176086425781, "loss": 0.384, "rewards/chosen": 0.1804406444231669, "rewards/margins": 2.6512677470842996, "rewards/rejected": -2.470827102661133, "step": 17386 }, { "epoch": 0.9215805793337397, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20484366.4, "logits/rejected": 2100956.0, "logps/chosen": -289.485498046875, "logps/rejected": -235.95477294921875, "loss": 0.2971, "rewards/chosen": 0.16237274408340455, "rewards/margins": 5.116412635644276, "rewards/rejected": -4.954039891560872, "step": 17387 }, { "epoch": 0.9216335833355418, "grad_norm": 43.0, "kl": 4.403100967407227, "learning_rate": 5e-07, "logits/chosen": -21423609.6, "logits/rejected": -21802650.666666668, "logps/chosen": -231.598828125, "logps/rejected": -279.0271809895833, "loss": 0.3428, "rewards/chosen": 0.9351846694946289, "rewards/margins": 4.045599174499512, "rewards/rejected": -3.110414505004883, "step": 17388 }, { "epoch": 0.921686587337344, "grad_norm": 52.25, "kl": 0.8892984390258789, "learning_rate": 5e-07, "logits/chosen": -4126869.0, "logits/rejected": 2421250.4, "logps/chosen": -101.68465169270833, "logps/rejected": -184.7475830078125, "loss": 0.2185, "rewards/chosen": 1.0742751757303874, "rewards/margins": 3.845232931772868, "rewards/rejected": -2.7709577560424803, "step": 17389 }, { "epoch": 0.9217395913391461, "grad_norm": 53.5, "kl": 0.026714324951171875, "learning_rate": 5e-07, "logits/chosen": -67889862.4, "logits/rejected": -4006896.6666666665, "logps/chosen": -263.09697265625, "logps/rejected": -97.64241536458333, "loss": 0.4438, "rewards/chosen": 0.019019386172294615, "rewards/margins": 0.5991376449664434, "rewards/rejected": -0.5801182587941488, "step": 17390 }, { "epoch": 0.9217925953409483, "grad_norm": 53.75, "kl": 0.767817497253418, "learning_rate": 5e-07, "logits/chosen": -42679865.6, "logits/rejected": -19476974.666666668, "logps/chosen": -362.39345703125, "logps/rejected": -533.5553792317709, "loss": 0.1998, "rewards/chosen": 1.1831794738769532, "rewards/margins": 5.2874850591023765, "rewards/rejected": -4.104305585225423, "step": 17391 }, { "epoch": 0.9218455993427503, "grad_norm": 32.25, "kl": 3.9096221923828125, "learning_rate": 5e-07, "logits/chosen": -10691041.6, "logits/rejected": -15684458.666666666, "logps/chosen": -184.94481201171874, "logps/rejected": -608.07666015625, "loss": 0.2639, "rewards/chosen": 1.1391989707946777, "rewards/margins": 4.6362378120422365, "rewards/rejected": -3.4970388412475586, "step": 17392 }, { "epoch": 0.9218986033445525, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40184160.0, "logits/rejected": -5214763.428571428, "logps/chosen": -523.8662719726562, "logps/rejected": -184.07329450334822, "loss": 0.1462, "rewards/chosen": 0.15363769233226776, "rewards/margins": 2.558956088764327, "rewards/rejected": -2.405318396432059, "step": 17393 }, { "epoch": 0.9219516073463546, "grad_norm": 54.5, "kl": 2.1740217208862305, "learning_rate": 5e-07, "logits/chosen": 8404258.0, "logits/rejected": -20937792.0, "logps/chosen": -192.41402180989584, "logps/rejected": -541.6018676757812, "loss": 0.3525, "rewards/chosen": 0.598690907160441, "rewards/margins": 2.6129182974497476, "rewards/rejected": -2.0142273902893066, "step": 17394 }, { "epoch": 0.9220046113481568, "grad_norm": 81.5, "kl": 2.852642059326172, "learning_rate": 5e-07, "logits/chosen": -5399565.142857143, "logits/rejected": -39305736.0, "logps/chosen": -491.90391322544644, "logps/rejected": -380.7323303222656, "loss": 0.4816, "rewards/chosen": 0.32726621627807617, "rewards/margins": 1.0645495653152466, "rewards/rejected": -0.7372833490371704, "step": 17395 }, { "epoch": 0.9220576153499589, "grad_norm": 41.5, "kl": 2.0670738220214844, "learning_rate": 5e-07, "logits/chosen": -51245904.0, "logits/rejected": 8449944.0, "logps/chosen": -353.9366455078125, "logps/rejected": -548.4833577473959, "loss": 0.1587, "rewards/chosen": 0.8068866729736328, "rewards/margins": 3.5755176544189453, "rewards/rejected": -2.7686309814453125, "step": 17396 }, { "epoch": 0.9221106193517611, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29967544.0, "logits/rejected": -18499190.4, "logps/chosen": -285.2681070963542, "logps/rejected": -331.3335205078125, "loss": 0.2051, "rewards/chosen": 0.39020029703776044, "rewards/margins": 3.066046396891276, "rewards/rejected": -2.6758460998535156, "step": 17397 }, { "epoch": 0.9221636233535632, "grad_norm": 52.5, "kl": 3.759777069091797, "learning_rate": 5e-07, "logits/chosen": -45799504.0, "logits/rejected": 13349745.333333334, "logps/chosen": -518.07255859375, "logps/rejected": -153.3311767578125, "loss": 0.2216, "rewards/chosen": 1.4606335639953614, "rewards/margins": 4.504001585642497, "rewards/rejected": -3.0433680216471353, "step": 17398 }, { "epoch": 0.9222166273553654, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12242091.2, "logits/rejected": -9110039.333333334, "logps/chosen": -419.743896484375, "logps/rejected": -157.49309285481772, "loss": 0.2265, "rewards/chosen": 1.0909469604492188, "rewards/margins": 3.2335657755533855, "rewards/rejected": -2.1426188151041665, "step": 17399 }, { "epoch": 0.9222696313571674, "grad_norm": 47.25, "kl": 2.269688606262207, "learning_rate": 5e-07, "logits/chosen": -96637760.0, "logits/rejected": 84674560.0, "logps/chosen": -134.7487335205078, "logps/rejected": -384.0770568847656, "loss": 0.3832, "rewards/chosen": -0.10959997773170471, "rewards/margins": 1.7363357245922089, "rewards/rejected": -1.8459357023239136, "step": 17400 }, { "epoch": 0.9223226353589696, "grad_norm": 55.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38347760.0, "logits/rejected": -60675824.0, "logps/chosen": -351.3146057128906, "logps/rejected": -371.8695882161458, "loss": 0.2498, "rewards/chosen": 0.36600494384765625, "rewards/margins": 2.125758647918701, "rewards/rejected": -1.759753704071045, "step": 17401 }, { "epoch": 0.9223756393607717, "grad_norm": 64.0, "kl": 5.540595054626465, "learning_rate": 5e-07, "logits/chosen": -17887029.714285713, "logits/rejected": 11587478.0, "logps/chosen": -278.33536202566967, "logps/rejected": -169.55758666992188, "loss": 0.4201, "rewards/chosen": 0.8543643951416016, "rewards/margins": 2.0589221715927124, "rewards/rejected": -1.2045577764511108, "step": 17402 }, { "epoch": 0.9224286433625739, "grad_norm": 35.25, "kl": 0.48531341552734375, "learning_rate": 5e-07, "logits/chosen": -37590472.0, "logits/rejected": -44348000.0, "logps/chosen": -248.94410705566406, "logps/rejected": -365.91531808035717, "loss": 0.1777, "rewards/chosen": 0.17076721787452698, "rewards/margins": 2.560064975704466, "rewards/rejected": -2.389297757829939, "step": 17403 }, { "epoch": 0.922481647364376, "grad_norm": 56.5, "kl": 2.031230926513672, "learning_rate": 5e-07, "logits/chosen": -24417190.0, "logits/rejected": -25596350.0, "logps/chosen": -359.31610107421875, "logps/rejected": -600.9395751953125, "loss": 0.2517, "rewards/chosen": 0.7928295135498047, "rewards/margins": 4.0444982051849365, "rewards/rejected": -3.251668691635132, "step": 17404 }, { "epoch": 0.9225346513661782, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56989316.0, "logits/rejected": -32045802.666666668, "logps/chosen": -240.87661743164062, "logps/rejected": -288.3586832682292, "loss": 0.2461, "rewards/chosen": 0.3262473940849304, "rewards/margins": 2.6815044283866882, "rewards/rejected": -2.355257034301758, "step": 17405 }, { "epoch": 0.9225876553679803, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45359178.666666664, "logits/rejected": -10345748.8, "logps/chosen": -427.3148600260417, "logps/rejected": -332.8356201171875, "loss": 0.175, "rewards/chosen": 0.9392669995625814, "rewards/margins": 4.532795747121175, "rewards/rejected": -3.5935287475585938, "step": 17406 }, { "epoch": 0.9226406593697825, "grad_norm": 43.75, "kl": 3.6959800720214844, "learning_rate": 5e-07, "logits/chosen": -34357276.0, "logits/rejected": -49738296.0, "logps/chosen": -381.29290771484375, "logps/rejected": -375.8900146484375, "loss": 0.195, "rewards/chosen": 1.6744184494018555, "rewards/margins": 4.567048072814941, "rewards/rejected": -2.892629623413086, "step": 17407 }, { "epoch": 0.9226936633715845, "grad_norm": 38.0, "kl": 0.7263832092285156, "learning_rate": 5e-07, "logits/chosen": -24072996.0, "logits/rejected": -89308768.0, "logps/chosen": -609.7778930664062, "logps/rejected": -289.557373046875, "loss": 0.2315, "rewards/chosen": 1.1899325847625732, "rewards/margins": 3.6391441822052, "rewards/rejected": -2.449211597442627, "step": 17408 }, { "epoch": 0.9227466673733867, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5118131.333333333, "logits/rejected": -33073363.2, "logps/chosen": -71.796630859375, "logps/rejected": -327.379541015625, "loss": 0.2423, "rewards/chosen": 0.05920227368672689, "rewards/margins": 2.572408755620321, "rewards/rejected": -2.5132064819335938, "step": 17409 }, { "epoch": 0.9227996713751888, "grad_norm": 111.0, "kl": 6.036224365234375, "learning_rate": 5e-07, "logits/chosen": -24838200.0, "logits/rejected": -110220072.0, "logps/chosen": -1060.0693359375, "logps/rejected": -223.37942504882812, "loss": 0.4741, "rewards/chosen": 0.43491264184316, "rewards/margins": 2.336912830670675, "rewards/rejected": -1.9020001888275146, "step": 17410 }, { "epoch": 0.922852675376991, "grad_norm": 83.0, "kl": 4.888689994812012, "learning_rate": 5e-07, "logits/chosen": -45988608.0, "logits/rejected": -698453.625, "logps/chosen": -326.90407307942706, "logps/rejected": -205.58973693847656, "loss": 0.4649, "rewards/chosen": 0.634121298789978, "rewards/margins": 1.4567485451698303, "rewards/rejected": -0.8226272463798523, "step": 17411 }, { "epoch": 0.9229056793787931, "grad_norm": 29.5, "kl": 0.6711635589599609, "learning_rate": 5e-07, "logits/chosen": -32426877.333333332, "logits/rejected": -37163084.8, "logps/chosen": -314.51719156901044, "logps/rejected": -454.948193359375, "loss": 0.1608, "rewards/chosen": 1.2722856998443604, "rewards/margins": 4.743238687515259, "rewards/rejected": -3.4709529876708984, "step": 17412 }, { "epoch": 0.9229586833805953, "grad_norm": 34.25, "kl": 2.5368337631225586, "learning_rate": 5e-07, "logits/chosen": -18511996.8, "logits/rejected": -428172.6666666667, "logps/chosen": -147.13023681640624, "logps/rejected": -199.3250935872396, "loss": 0.3323, "rewards/chosen": 0.5572540283203125, "rewards/margins": 2.8889101664225256, "rewards/rejected": -2.3316561381022134, "step": 17413 }, { "epoch": 0.9230116873823974, "grad_norm": 41.75, "kl": 5.880229949951172, "learning_rate": 5e-07, "logits/chosen": -2919761.6, "logits/rejected": -58024533.333333336, "logps/chosen": -92.23043823242188, "logps/rejected": -495.3428548177083, "loss": 0.3935, "rewards/chosen": 0.2850963115692139, "rewards/margins": 2.8083414872487387, "rewards/rejected": -2.523245175679525, "step": 17414 }, { "epoch": 0.9230646913841996, "grad_norm": 36.25, "kl": 0.36391735076904297, "learning_rate": 5e-07, "logits/chosen": -26155024.0, "logits/rejected": -21464025.6, "logps/chosen": -400.823486328125, "logps/rejected": -455.689404296875, "loss": 0.2463, "rewards/chosen": -0.19674142201741537, "rewards/margins": 3.265676180521647, "rewards/rejected": -3.4624176025390625, "step": 17415 }, { "epoch": 0.9231176953860016, "grad_norm": 49.25, "kl": 1.1656417846679688, "learning_rate": 5e-07, "logits/chosen": -20780048.0, "logits/rejected": -17117805.333333332, "logps/chosen": -512.0845947265625, "logps/rejected": -194.44232177734375, "loss": 0.1679, "rewards/chosen": 1.264373779296875, "rewards/margins": 3.6524740854899087, "rewards/rejected": -2.3881003061930337, "step": 17416 }, { "epoch": 0.9231706993878038, "grad_norm": 41.25, "kl": 0.24575042724609375, "learning_rate": 5e-07, "logits/chosen": -23664408.0, "logits/rejected": -15287737.6, "logps/chosen": -271.4788818359375, "logps/rejected": -315.6899658203125, "loss": 0.1591, "rewards/chosen": 0.6254928906758627, "rewards/margins": 4.188371308644612, "rewards/rejected": -3.56287841796875, "step": 17417 }, { "epoch": 0.9232237033896059, "grad_norm": 38.75, "kl": 2.156658172607422, "learning_rate": 5e-07, "logits/chosen": -25123806.4, "logits/rejected": -16297890.666666666, "logps/chosen": -628.80205078125, "logps/rejected": -153.7481892903646, "loss": 0.2085, "rewards/chosen": 2.2402610778808594, "rewards/margins": 4.097818374633789, "rewards/rejected": -1.8575572967529297, "step": 17418 }, { "epoch": 0.9232767073914081, "grad_norm": 48.75, "kl": 1.8124446868896484, "learning_rate": 5e-07, "logits/chosen": -6301838.666666667, "logits/rejected": -6895976.8, "logps/chosen": -116.12449137369792, "logps/rejected": -302.50244140625, "loss": 0.3141, "rewards/chosen": -0.3853045304616292, "rewards/margins": 2.134173313776652, "rewards/rejected": -2.5194778442382812, "step": 17419 }, { "epoch": 0.9233297113932102, "grad_norm": 37.5, "kl": 0.7571382522583008, "learning_rate": 5e-07, "logits/chosen": -15620936.0, "logits/rejected": -20177854.4, "logps/chosen": -158.72962443033853, "logps/rejected": -378.49013671875, "loss": 0.2658, "rewards/chosen": -0.13007760047912598, "rewards/margins": 2.6698522090911867, "rewards/rejected": -2.7999298095703127, "step": 17420 }, { "epoch": 0.9233827153950124, "grad_norm": 55.0, "kl": 1.198822021484375, "learning_rate": 5e-07, "logits/chosen": -36143613.333333336, "logits/rejected": -60957024.0, "logps/chosen": -364.6674397786458, "logps/rejected": -382.866455078125, "loss": 0.23, "rewards/chosen": 0.9743337631225586, "rewards/margins": 3.303221321105957, "rewards/rejected": -2.3288875579833985, "step": 17421 }, { "epoch": 0.9234357193968145, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57030056.0, "logits/rejected": 214035.125, "logps/chosen": -316.689697265625, "logps/rejected": -203.2220458984375, "loss": 0.2641, "rewards/chosen": 0.3643152117729187, "rewards/margins": 3.5427772402763367, "rewards/rejected": -3.178462028503418, "step": 17422 }, { "epoch": 0.9234887233986167, "grad_norm": 53.0, "kl": 0.4418630599975586, "learning_rate": 5e-07, "logits/chosen": -25067928.0, "logits/rejected": -42351496.0, "logps/chosen": -148.32473754882812, "logps/rejected": -252.79006958007812, "loss": 0.3232, "rewards/chosen": 0.020306773483753204, "rewards/margins": 2.260496325790882, "rewards/rejected": -2.240189552307129, "step": 17423 }, { "epoch": 0.9235417274004187, "grad_norm": 53.5, "kl": 0.3576469421386719, "learning_rate": 5e-07, "logits/chosen": -4586209.0, "logits/rejected": -5736887.333333333, "logps/chosen": -142.77064514160156, "logps/rejected": -225.46610514322916, "loss": 0.2437, "rewards/chosen": 0.6708477735519409, "rewards/margins": 2.392354369163513, "rewards/rejected": -1.7215065956115723, "step": 17424 }, { "epoch": 0.9235947314022208, "grad_norm": 48.0, "kl": 4.621323585510254, "learning_rate": 5e-07, "logits/chosen": -96164496.0, "logits/rejected": -7252335.5, "logps/chosen": -458.4937438964844, "logps/rejected": -154.75424194335938, "loss": 0.3228, "rewards/chosen": 1.0763746500015259, "rewards/margins": 3.286559224128723, "rewards/rejected": -2.2101845741271973, "step": 17425 }, { "epoch": 0.923647735404023, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30840153.6, "logits/rejected": -10381232.666666666, "logps/chosen": -311.8013671875, "logps/rejected": -119.51275634765625, "loss": 0.1823, "rewards/chosen": 1.21816987991333, "rewards/margins": 4.225855859120687, "rewards/rejected": -3.007685979207357, "step": 17426 }, { "epoch": 0.9237007394058251, "grad_norm": 28.875, "kl": 2.80734920501709, "learning_rate": 5e-07, "logits/chosen": -23911996.0, "logits/rejected": -34553093.333333336, "logps/chosen": -326.7004699707031, "logps/rejected": -266.17657470703125, "loss": 0.1992, "rewards/chosen": 0.2844073474407196, "rewards/margins": 3.8348389168580375, "rewards/rejected": -3.550431569417318, "step": 17427 }, { "epoch": 0.9237537434076273, "grad_norm": 64.5, "kl": 2.9093894958496094, "learning_rate": 5e-07, "logits/chosen": -58909320.0, "logits/rejected": -7801257.0, "logps/chosen": -645.2446899414062, "logps/rejected": -603.715087890625, "loss": 0.2578, "rewards/chosen": 0.6636962890625, "rewards/margins": 3.7339847087860107, "rewards/rejected": -3.0702884197235107, "step": 17428 }, { "epoch": 0.9238067474094294, "grad_norm": 53.25, "kl": 0.0290069580078125, "learning_rate": 5e-07, "logits/chosen": -27557744.0, "logits/rejected": -85876394.66666667, "logps/chosen": -726.08447265625, "logps/rejected": -226.46466064453125, "loss": 0.2388, "rewards/chosen": 1.2055561065673828, "rewards/margins": 3.3925872166951496, "rewards/rejected": -2.187031110127767, "step": 17429 }, { "epoch": 0.9238597514112316, "grad_norm": 26.25, "kl": 1.731064796447754, "learning_rate": 5e-07, "logits/chosen": 8562775.0, "logits/rejected": -53738252.0, "logps/chosen": -29.614967346191406, "logps/rejected": -381.08477783203125, "loss": 0.3111, "rewards/chosen": -0.02884015440940857, "rewards/margins": 3.7948617041110992, "rewards/rejected": -3.823701858520508, "step": 17430 }, { "epoch": 0.9239127554130336, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47803385.6, "logits/rejected": -44605944.0, "logps/chosen": -264.6546142578125, "logps/rejected": -507.24169921875, "loss": 0.3217, "rewards/chosen": 0.3414284706115723, "rewards/margins": 2.761970869700114, "rewards/rejected": -2.4205423990885415, "step": 17431 }, { "epoch": 0.9239657594148358, "grad_norm": 80.5, "kl": 4.039882659912109, "learning_rate": 5e-07, "logits/chosen": -14172702.0, "logits/rejected": -31403320.0, "logps/chosen": -310.50531005859375, "logps/rejected": -507.19122314453125, "loss": 0.3227, "rewards/chosen": 0.7785439491271973, "rewards/margins": 3.6548752784729004, "rewards/rejected": -2.876331329345703, "step": 17432 }, { "epoch": 0.9240187634166379, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12349228.0, "logits/rejected": -37498466.666666664, "logps/chosen": -334.314453125, "logps/rejected": -551.0436197916666, "loss": 0.2512, "rewards/chosen": 0.7347940444946289, "rewards/margins": 3.649866167704264, "rewards/rejected": -2.9150721232096353, "step": 17433 }, { "epoch": 0.9240717674184401, "grad_norm": 33.0, "kl": 2.2333507537841797, "learning_rate": 5e-07, "logits/chosen": -8141737.333333333, "logits/rejected": -35659552.0, "logps/chosen": -258.9142252604167, "logps/rejected": -285.416650390625, "loss": 0.162, "rewards/chosen": 1.453181266784668, "rewards/margins": 4.321999549865723, "rewards/rejected": -2.8688182830810547, "step": 17434 }, { "epoch": 0.9241247714202422, "grad_norm": 52.0, "kl": 1.9016952514648438, "learning_rate": 5e-07, "logits/chosen": -35731432.0, "logits/rejected": -1474451.75, "logps/chosen": -265.1270751953125, "logps/rejected": -69.44149780273438, "loss": 0.2427, "rewards/chosen": 0.755964994430542, "rewards/margins": 3.968763828277588, "rewards/rejected": -3.212798833847046, "step": 17435 }, { "epoch": 0.9241777754220444, "grad_norm": 38.0, "kl": 1.458786964416504, "learning_rate": 5e-07, "logits/chosen": -19999961.333333332, "logits/rejected": -18353926.4, "logps/chosen": -75.77233378092448, "logps/rejected": -249.7265869140625, "loss": 0.2405, "rewards/chosen": 0.0741179237763087, "rewards/margins": 3.2033844719330467, "rewards/rejected": -3.129266548156738, "step": 17436 }, { "epoch": 0.9242307794238465, "grad_norm": 33.0, "kl": 1.5632781982421875, "learning_rate": 5e-07, "logits/chosen": -53911236.0, "logits/rejected": -17654802.285714287, "logps/chosen": -264.52001953125, "logps/rejected": -231.91793387276786, "loss": 0.1281, "rewards/chosen": -0.25581055879592896, "rewards/margins": 2.6179796372141158, "rewards/rejected": -2.8737901960100447, "step": 17437 }, { "epoch": 0.9242837834256487, "grad_norm": 50.5, "kl": 2.0576553344726562, "learning_rate": 5e-07, "logits/chosen": -36148032.0, "logits/rejected": -22526813.333333332, "logps/chosen": -242.862890625, "logps/rejected": -347.54541015625, "loss": 0.3488, "rewards/chosen": 0.4723519325256348, "rewards/margins": 2.4154935518900555, "rewards/rejected": -1.9431416193644206, "step": 17438 }, { "epoch": 0.9243367874274507, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61389562.666666664, "logits/rejected": -21456113.6, "logps/chosen": -267.3116455078125, "logps/rejected": -264.06689453125, "loss": 0.2365, "rewards/chosen": 0.13662540912628174, "rewards/margins": 2.9229779481887816, "rewards/rejected": -2.7863525390625, "step": 17439 }, { "epoch": 0.9243897914292529, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2496263.0, "logits/rejected": 815848.8, "logps/chosen": -255.97467041015625, "logps/rejected": -140.28887939453125, "loss": 0.1557, "rewards/chosen": 0.7652527491251627, "rewards/margins": 4.891393343607585, "rewards/rejected": -4.126140594482422, "step": 17440 }, { "epoch": 0.924442795431055, "grad_norm": 66.5, "kl": 4.981963157653809, "learning_rate": 5e-07, "logits/chosen": -16430721.333333334, "logits/rejected": -8511476.0, "logps/chosen": -350.305419921875, "logps/rejected": -144.65077209472656, "loss": 0.4394, "rewards/chosen": 0.6189941565195719, "rewards/margins": 2.6690518061319985, "rewards/rejected": -2.0500576496124268, "step": 17441 }, { "epoch": 0.9244957994328572, "grad_norm": 60.25, "kl": 2.4345340728759766, "learning_rate": 5e-07, "logits/chosen": -30781459.2, "logits/rejected": -11378741.333333334, "logps/chosen": -365.2919921875, "logps/rejected": -121.13804117838542, "loss": 0.4289, "rewards/chosen": 0.18384660482406617, "rewards/margins": 1.6940288265546162, "rewards/rejected": -1.51018222173055, "step": 17442 }, { "epoch": 0.9245488034346593, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34013088.0, "logits/rejected": -74701542.4, "logps/chosen": -415.0118815104167, "logps/rejected": -412.877880859375, "loss": 0.1301, "rewards/chosen": 0.9988688627878824, "rewards/margins": 4.398954312006633, "rewards/rejected": -3.40008544921875, "step": 17443 }, { "epoch": 0.9246018074364615, "grad_norm": 48.0, "kl": 0.11905670166015625, "learning_rate": 5e-07, "logits/chosen": -14057897.6, "logits/rejected": -10801476.0, "logps/chosen": -152.8089111328125, "logps/rejected": -149.3663533528646, "loss": 0.4163, "rewards/chosen": 0.009394800662994385, "rewards/margins": 1.0396075208981832, "rewards/rejected": -1.0302127202351887, "step": 17444 }, { "epoch": 0.9246548114382636, "grad_norm": 42.0, "kl": 4.7215070724487305, "learning_rate": 5e-07, "logits/chosen": -10121940.0, "logits/rejected": -54146296.0, "logps/chosen": -174.7663370768229, "logps/rejected": -400.974853515625, "loss": 0.3668, "rewards/chosen": 0.8111907641092936, "rewards/margins": 3.3046156565348306, "rewards/rejected": -2.493424892425537, "step": 17445 }, { "epoch": 0.9247078154400658, "grad_norm": 45.5, "kl": 0.5657081604003906, "learning_rate": 5e-07, "logits/chosen": -6342659.5, "logits/rejected": -4538364.0, "logps/chosen": -282.3133544921875, "logps/rejected": -353.5079345703125, "loss": 0.2454, "rewards/chosen": 0.46611663699150085, "rewards/margins": 3.471405476331711, "rewards/rejected": -3.00528883934021, "step": 17446 }, { "epoch": 0.9247608194418678, "grad_norm": 58.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32054610.0, "logits/rejected": -10496041.0, "logps/chosen": -245.1802215576172, "logps/rejected": -222.87908935546875, "loss": 0.321, "rewards/chosen": 0.12083368003368378, "rewards/margins": 1.8142564743757248, "rewards/rejected": -1.693422794342041, "step": 17447 }, { "epoch": 0.92481382344367, "grad_norm": 55.0, "kl": 3.57293701171875, "learning_rate": 5e-07, "logits/chosen": -18946457.6, "logits/rejected": -2376319.8333333335, "logps/chosen": -538.17763671875, "logps/rejected": -247.4404093424479, "loss": 0.3584, "rewards/chosen": 0.44033203125, "rewards/margins": 5.830938593546549, "rewards/rejected": -5.39060656229655, "step": 17448 }, { "epoch": 0.9248668274454721, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77362968.0, "logits/rejected": -28768946.285714287, "logps/chosen": -273.39276123046875, "logps/rejected": -355.4926060267857, "loss": 0.1453, "rewards/chosen": -0.6361053586006165, "rewards/margins": 2.407359302043915, "rewards/rejected": -3.0434646606445312, "step": 17449 }, { "epoch": 0.9249198314472743, "grad_norm": 50.25, "kl": 3.916421890258789, "learning_rate": 5e-07, "logits/chosen": -10510397.6, "logits/rejected": -40058416.0, "logps/chosen": -521.898291015625, "logps/rejected": -293.5335286458333, "loss": 0.2696, "rewards/chosen": 1.5965810775756837, "rewards/margins": 3.347516473134359, "rewards/rejected": -1.750935395558675, "step": 17450 }, { "epoch": 0.9249728354490764, "grad_norm": 60.25, "kl": 1.8801860809326172, "learning_rate": 5e-07, "logits/chosen": -41766105.6, "logits/rejected": -16365433.333333334, "logps/chosen": -143.67960205078126, "logps/rejected": -246.05684407552084, "loss": 0.4111, "rewards/chosen": 0.2917940616607666, "rewards/margins": 1.1773501237233481, "rewards/rejected": -0.8855560620625814, "step": 17451 }, { "epoch": 0.9250258394508786, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68090576.0, "logits/rejected": 90054048.0, "logps/chosen": -830.6053466796875, "logps/rejected": -599.6423746744791, "loss": 0.0898, "rewards/chosen": 1.7404052019119263, "rewards/margins": 5.4596164623896275, "rewards/rejected": -3.7192112604777017, "step": 17452 }, { "epoch": 0.9250788434526807, "grad_norm": 63.0, "kl": 1.578592300415039, "learning_rate": 5e-07, "logits/chosen": 4223355.2, "logits/rejected": -20139628.0, "logps/chosen": -381.60625, "logps/rejected": -136.9600626627604, "loss": 0.3465, "rewards/chosen": 0.49277820587158205, "rewards/margins": 3.1474703788757323, "rewards/rejected": -2.6546921730041504, "step": 17453 }, { "epoch": 0.9251318474544828, "grad_norm": 39.0, "kl": 7.333980560302734, "learning_rate": 5e-07, "logits/chosen": -20865192.0, "logits/rejected": -444936.0, "logps/chosen": -384.4005533854167, "logps/rejected": -309.2481994628906, "loss": 0.2493, "rewards/chosen": 1.8348204294840496, "rewards/margins": 3.6995413700739546, "rewards/rejected": -1.8647209405899048, "step": 17454 }, { "epoch": 0.9251848514562849, "grad_norm": 53.25, "kl": 1.0669021606445312, "learning_rate": 5e-07, "logits/chosen": -58915372.0, "logits/rejected": -2381786.75, "logps/chosen": -438.88885498046875, "logps/rejected": -96.20502471923828, "loss": 0.301, "rewards/chosen": 0.8281677961349487, "rewards/margins": 2.226540684700012, "rewards/rejected": -1.3983728885650635, "step": 17455 }, { "epoch": 0.9252378554580871, "grad_norm": 54.25, "kl": 1.173933506011963, "learning_rate": 5e-07, "logits/chosen": -6749620.0, "logits/rejected": -41491804.8, "logps/chosen": -119.5202128092448, "logps/rejected": -309.07314453125, "loss": 0.204, "rewards/chosen": 1.1094048817952473, "rewards/margins": 3.161016400655111, "rewards/rejected": -2.0516115188598634, "step": 17456 }, { "epoch": 0.9252908594598892, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47514848.0, "logits/rejected": -3011743.25, "logps/chosen": -242.72987365722656, "logps/rejected": -107.50057220458984, "loss": 0.3624, "rewards/chosen": -0.004259765148162842, "rewards/margins": 1.4172547459602356, "rewards/rejected": -1.4215145111083984, "step": 17457 }, { "epoch": 0.9253438634616914, "grad_norm": 48.75, "kl": 0.5984678268432617, "learning_rate": 5e-07, "logits/chosen": -10063348.0, "logits/rejected": -57262425.6, "logps/chosen": -222.50433349609375, "logps/rejected": -154.54305419921874, "loss": 0.3135, "rewards/chosen": 0.3298601706822713, "rewards/margins": 1.6803573211034137, "rewards/rejected": -1.3504971504211425, "step": 17458 }, { "epoch": 0.9253968674634935, "grad_norm": 50.5, "kl": 5.044599533081055, "learning_rate": 5e-07, "logits/chosen": 19720350.0, "logits/rejected": 116323912.0, "logps/chosen": -151.8389129638672, "logps/rejected": -456.5113220214844, "loss": 0.2841, "rewards/chosen": 0.42610839009284973, "rewards/margins": 3.2114453613758087, "rewards/rejected": -2.785336971282959, "step": 17459 }, { "epoch": 0.9254498714652957, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38871673.6, "logits/rejected": -10267258.0, "logps/chosen": -251.3343994140625, "logps/rejected": -120.26947021484375, "loss": 0.3559, "rewards/chosen": 0.46845703125, "rewards/margins": 1.372932243347168, "rewards/rejected": -0.904475212097168, "step": 17460 }, { "epoch": 0.9255028754670978, "grad_norm": 38.0, "kl": 1.3466148376464844, "learning_rate": 5e-07, "logits/chosen": -69060922.66666667, "logits/rejected": -10743719.2, "logps/chosen": -186.23895263671875, "logps/rejected": -327.96611328125, "loss": 0.2385, "rewards/chosen": 0.5268979469935099, "rewards/margins": 3.53271955649058, "rewards/rejected": -3.0058216094970702, "step": 17461 }, { "epoch": 0.9255558794689, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43916192.0, "logits/rejected": -22998250.0, "logps/chosen": -372.1946105957031, "logps/rejected": -361.70257568359375, "loss": 0.293, "rewards/chosen": 0.13038292527198792, "rewards/margins": 2.4216178953647614, "rewards/rejected": -2.2912349700927734, "step": 17462 }, { "epoch": 0.925608883470702, "grad_norm": 50.75, "kl": 1.3027114868164062, "learning_rate": 5e-07, "logits/chosen": -46865386.666666664, "logits/rejected": -39119500.0, "logps/chosen": -358.33447265625, "logps/rejected": -433.6318054199219, "loss": 0.3329, "rewards/chosen": 0.4424184163411458, "rewards/margins": 4.465933640797933, "rewards/rejected": -4.023515224456787, "step": 17463 }, { "epoch": 0.9256618874725042, "grad_norm": 66.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33995532.0, "logits/rejected": -13442146.666666666, "logps/chosen": -489.7588195800781, "logps/rejected": -390.0184326171875, "loss": 0.2756, "rewards/chosen": -0.688305675983429, "rewards/margins": 1.7914525866508484, "rewards/rejected": -2.4797582626342773, "step": 17464 }, { "epoch": 0.9257148914743063, "grad_norm": 53.0, "kl": 3.915487289428711, "learning_rate": 5e-07, "logits/chosen": -42371320.0, "logits/rejected": -15074334.0, "logps/chosen": -684.0020141601562, "logps/rejected": -474.3267517089844, "loss": 0.216, "rewards/chosen": 1.5829601287841797, "rewards/margins": 3.5388882160186768, "rewards/rejected": -1.955928087234497, "step": 17465 }, { "epoch": 0.9257678954761085, "grad_norm": 38.5, "kl": 0.40160369873046875, "learning_rate": 5e-07, "logits/chosen": -53348304.0, "logits/rejected": -33206291.2, "logps/chosen": -741.3394368489584, "logps/rejected": -237.3299072265625, "loss": 0.1409, "rewards/chosen": 1.5470706621805828, "rewards/margins": 4.988425795237223, "rewards/rejected": -3.4413551330566405, "step": 17466 }, { "epoch": 0.9258208994779106, "grad_norm": 55.75, "kl": 1.8574066162109375, "learning_rate": 5e-07, "logits/chosen": -32841088.0, "logits/rejected": -13360618.0, "logps/chosen": -402.38836669921875, "logps/rejected": -265.3116149902344, "loss": 0.28, "rewards/chosen": 0.39781248569488525, "rewards/margins": 4.09639585018158, "rewards/rejected": -3.6985833644866943, "step": 17467 }, { "epoch": 0.9258739034797128, "grad_norm": 47.0, "kl": 4.048112869262695, "learning_rate": 5e-07, "logits/chosen": -17632658.666666668, "logits/rejected": -15661667.2, "logps/chosen": -123.44193522135417, "logps/rejected": -496.925, "loss": 0.2644, "rewards/chosen": 1.0572691758473713, "rewards/margins": 3.8092490037282305, "rewards/rejected": -2.7519798278808594, "step": 17468 }, { "epoch": 0.9259269074815148, "grad_norm": 48.0, "kl": 0.5614356994628906, "learning_rate": 5e-07, "logits/chosen": -59932652.0, "logits/rejected": -61265264.0, "logps/chosen": -666.515625, "logps/rejected": -986.6134033203125, "loss": 0.1753, "rewards/chosen": 1.6050090789794922, "rewards/margins": 5.156415939331055, "rewards/rejected": -3.5514068603515625, "step": 17469 }, { "epoch": 0.925979911483317, "grad_norm": 52.5, "kl": 0.9389991760253906, "learning_rate": 5e-07, "logits/chosen": -33261472.0, "logits/rejected": -16801.75, "logps/chosen": -326.79046630859375, "logps/rejected": -428.1795349121094, "loss": 0.346, "rewards/chosen": 0.45256491502126056, "rewards/margins": 2.129614313443502, "rewards/rejected": -1.6770493984222412, "step": 17470 }, { "epoch": 0.9260329154851191, "grad_norm": 68.0, "kl": 3.655338764190674, "learning_rate": 5e-07, "logits/chosen": -26748456.0, "logits/rejected": 3167390.6666666665, "logps/chosen": -382.55732421875, "logps/rejected": -107.73240152994792, "loss": 0.3989, "rewards/chosen": 0.5997087955474854, "rewards/margins": 2.598360776901245, "rewards/rejected": -1.9986519813537598, "step": 17471 }, { "epoch": 0.9260859194869213, "grad_norm": 46.5, "kl": 3.0826616287231445, "learning_rate": 5e-07, "logits/chosen": 5956575.0, "logits/rejected": -16344839.0, "logps/chosen": -287.94622802734375, "logps/rejected": -322.84259033203125, "loss": 0.32, "rewards/chosen": 0.18695738911628723, "rewards/margins": 2.921470195055008, "rewards/rejected": -2.7345128059387207, "step": 17472 }, { "epoch": 0.9261389234887234, "grad_norm": 50.0, "kl": 0.9472503662109375, "learning_rate": 5e-07, "logits/chosen": -18828472.0, "logits/rejected": -32730073.6, "logps/chosen": -875.3028971354166, "logps/rejected": -604.810888671875, "loss": 0.1781, "rewards/chosen": 1.7714163462320964, "rewards/margins": 4.619863001505534, "rewards/rejected": -2.8484466552734373, "step": 17473 }, { "epoch": 0.9261919274905255, "grad_norm": 66.0, "kl": 1.9930353164672852, "learning_rate": 5e-07, "logits/chosen": -12117821.333333334, "logits/rejected": -43809372.8, "logps/chosen": -413.0933430989583, "logps/rejected": -345.921630859375, "loss": 0.2356, "rewards/chosen": 1.46337095896403, "rewards/margins": 2.879092566172282, "rewards/rejected": -1.415721607208252, "step": 17474 }, { "epoch": 0.9262449314923277, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20106803.2, "logits/rejected": -12812068.0, "logps/chosen": -213.41240234375, "logps/rejected": -213.1929728190104, "loss": 0.2921, "rewards/chosen": 0.3540358066558838, "rewards/margins": 3.093319050470988, "rewards/rejected": -2.739283243815104, "step": 17475 }, { "epoch": 0.9262979354941298, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29819786.0, "logits/rejected": -38158736.0, "logps/chosen": -314.52801513671875, "logps/rejected": -516.6109619140625, "loss": 0.3025, "rewards/chosen": 0.18465536832809448, "rewards/margins": 2.9057931303977966, "rewards/rejected": -2.721137762069702, "step": 17476 }, { "epoch": 0.926350939495932, "grad_norm": 41.0, "kl": 2.5651683807373047, "learning_rate": 5e-07, "logits/chosen": -33609168.0, "logits/rejected": -26054116.0, "logps/chosen": -320.15234375, "logps/rejected": -243.75091552734375, "loss": 0.1936, "rewards/chosen": 1.4443349838256836, "rewards/margins": 4.949773073196411, "rewards/rejected": -3.5054380893707275, "step": 17477 }, { "epoch": 0.926403943497734, "grad_norm": 50.75, "kl": 0.26134395599365234, "learning_rate": 5e-07, "logits/chosen": 3119136.0, "logits/rejected": -31869524.57142857, "logps/chosen": -12.545949935913086, "logps/rejected": -427.150634765625, "loss": 0.1258, "rewards/chosen": 0.5460460782051086, "rewards/margins": 3.699024430343083, "rewards/rejected": -3.1529783521379744, "step": 17478 }, { "epoch": 0.9264569474995362, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54310480.0, "logits/rejected": -19798716.8, "logps/chosen": -348.7581380208333, "logps/rejected": -328.4989501953125, "loss": 0.2614, "rewards/chosen": -0.21592203776041666, "rewards/margins": 2.6924968083699548, "rewards/rejected": -2.9084188461303713, "step": 17479 }, { "epoch": 0.9265099515013383, "grad_norm": 33.5, "kl": 2.6433944702148438, "learning_rate": 5e-07, "logits/chosen": -34414310.4, "logits/rejected": -19011748.0, "logps/chosen": -183.82200927734374, "logps/rejected": -315.710205078125, "loss": 0.2734, "rewards/chosen": 0.5622054100036621, "rewards/margins": 4.914950784047444, "rewards/rejected": -4.352745374043782, "step": 17480 }, { "epoch": 0.9265629555031405, "grad_norm": 56.25, "kl": 0.8990745544433594, "learning_rate": 5e-07, "logits/chosen": -24437315.2, "logits/rejected": -20162376.0, "logps/chosen": -316.6079345703125, "logps/rejected": -252.86893717447916, "loss": 0.3224, "rewards/chosen": 0.5910680770874024, "rewards/margins": 3.6972749710083006, "rewards/rejected": -3.1062068939208984, "step": 17481 }, { "epoch": 0.9266159595049426, "grad_norm": 38.25, "kl": 0.268310546875, "learning_rate": 5e-07, "logits/chosen": 21590972.0, "logits/rejected": -36704261.333333336, "logps/chosen": -231.7071075439453, "logps/rejected": -456.6466878255208, "loss": 0.241, "rewards/chosen": 0.14518776535987854, "rewards/margins": 3.530584086974462, "rewards/rejected": -3.3853963216145835, "step": 17482 }, { "epoch": 0.9266689635067448, "grad_norm": 55.0, "kl": 3.5994529724121094, "learning_rate": 5e-07, "logits/chosen": 16935220.0, "logits/rejected": -7187631.0, "logps/chosen": -271.2478332519531, "logps/rejected": -134.55482482910156, "loss": 0.3237, "rewards/chosen": 0.6172434091567993, "rewards/margins": 2.1034809350967407, "rewards/rejected": -1.4862375259399414, "step": 17483 }, { "epoch": 0.9267219675085469, "grad_norm": 51.0, "kl": 0.9461355209350586, "learning_rate": 5e-07, "logits/chosen": -19068984.0, "logits/rejected": -8146368.0, "logps/chosen": -267.47930908203125, "logps/rejected": -333.58184814453125, "loss": 0.3516, "rewards/chosen": 0.657534678777059, "rewards/margins": 2.023683746655782, "rewards/rejected": -1.3661490678787231, "step": 17484 }, { "epoch": 0.926774971510349, "grad_norm": 32.25, "kl": 1.2567319869995117, "learning_rate": 5e-07, "logits/chosen": 7785116.0, "logits/rejected": -8911148.0, "logps/chosen": -69.11225128173828, "logps/rejected": -177.86328125, "loss": 0.2471, "rewards/chosen": 1.009939193725586, "rewards/margins": 2.513482689857483, "rewards/rejected": -1.503543496131897, "step": 17485 }, { "epoch": 0.9268279755121511, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 63253414.4, "logits/rejected": -21826441.333333332, "logps/chosen": -438.628466796875, "logps/rejected": -268.30865478515625, "loss": 0.3163, "rewards/chosen": 0.20518934726715088, "rewards/margins": 5.321090658505757, "rewards/rejected": -5.1159013112386065, "step": 17486 }, { "epoch": 0.9268809795139533, "grad_norm": 37.0, "kl": 2.0987548828125, "learning_rate": 5e-07, "logits/chosen": 5556203.2, "logits/rejected": -5963977.333333333, "logps/chosen": -69.927783203125, "logps/rejected": -353.9828694661458, "loss": 0.3185, "rewards/chosen": 0.5220284938812256, "rewards/margins": 3.4030811468760174, "rewards/rejected": -2.8810526529947915, "step": 17487 }, { "epoch": 0.9269339835157554, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74900144.0, "logits/rejected": -6432333.0, "logps/chosen": -374.67437744140625, "logps/rejected": -156.78997802734375, "loss": 0.3503, "rewards/chosen": 0.4268737733364105, "rewards/margins": 2.330265372991562, "rewards/rejected": -1.9033915996551514, "step": 17488 }, { "epoch": 0.9269869875175576, "grad_norm": 39.0, "kl": 2.5272598266601562, "learning_rate": 5e-07, "logits/chosen": -12764366.4, "logits/rejected": -36876845.333333336, "logps/chosen": -148.07620849609376, "logps/rejected": -236.54911295572916, "loss": 0.3194, "rewards/chosen": 0.6531991481781005, "rewards/margins": 4.469223928451538, "rewards/rejected": -3.8160247802734375, "step": 17489 }, { "epoch": 0.9270399915193597, "grad_norm": 42.25, "kl": 0.9385261535644531, "learning_rate": 5e-07, "logits/chosen": 4518789.0, "logits/rejected": -14073106.666666666, "logps/chosen": -53.17449951171875, "logps/rejected": -559.171142578125, "loss": 0.2348, "rewards/chosen": -0.07347947359085083, "rewards/margins": 2.340720991293589, "rewards/rejected": -2.41420046488444, "step": 17490 }, { "epoch": 0.9270929955211619, "grad_norm": 52.25, "kl": 0.22469329833984375, "learning_rate": 5e-07, "logits/chosen": -38065320.0, "logits/rejected": -53657532.0, "logps/chosen": -413.75146484375, "logps/rejected": -306.9996032714844, "loss": 0.3297, "rewards/chosen": 0.08115158975124359, "rewards/margins": 1.7991741746664047, "rewards/rejected": -1.7180225849151611, "step": 17491 }, { "epoch": 0.927145999522964, "grad_norm": 52.75, "kl": 0.6025829315185547, "learning_rate": 5e-07, "logits/chosen": -32640192.0, "logits/rejected": 9231614.0, "logps/chosen": -253.3367716471354, "logps/rejected": -393.4366149902344, "loss": 0.4103, "rewards/chosen": -0.046991984049479164, "rewards/margins": 2.9861138661702475, "rewards/rejected": -3.0331058502197266, "step": 17492 }, { "epoch": 0.9271990035247661, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -99003536.0, "logits/rejected": -26980160.0, "logps/chosen": -303.24566650390625, "logps/rejected": -206.92433166503906, "loss": 0.3289, "rewards/chosen": -0.2591667175292969, "rewards/margins": 2.652688980102539, "rewards/rejected": -2.911855697631836, "step": 17493 }, { "epoch": 0.9272520075265682, "grad_norm": 50.0, "kl": 0.8636283874511719, "learning_rate": 5e-07, "logits/chosen": 15320360.0, "logits/rejected": -15440110.0, "logps/chosen": -241.04843139648438, "logps/rejected": -233.52691650390625, "loss": 0.2692, "rewards/chosen": 0.18315577507019043, "rewards/margins": 3.2984507083892822, "rewards/rejected": -3.115294933319092, "step": 17494 }, { "epoch": 0.9273050115283704, "grad_norm": 63.5, "kl": 5.938562393188477, "learning_rate": 5e-07, "logits/chosen": -37229532.8, "logits/rejected": -37950482.666666664, "logps/chosen": -315.11435546875, "logps/rejected": -500.8370768229167, "loss": 0.3333, "rewards/chosen": 0.8657463073730469, "rewards/margins": 4.5595036824544275, "rewards/rejected": -3.6937573750813804, "step": 17495 }, { "epoch": 0.9273580155301725, "grad_norm": 37.5, "kl": 1.536128044128418, "learning_rate": 5e-07, "logits/chosen": 46992112.0, "logits/rejected": -17053012.8, "logps/chosen": -203.3545939127604, "logps/rejected": -356.5074951171875, "loss": 0.173, "rewards/chosen": 0.7075017293294271, "rewards/margins": 4.418506749471028, "rewards/rejected": -3.7110050201416014, "step": 17496 }, { "epoch": 0.9274110195319747, "grad_norm": 50.0, "kl": 2.2925281524658203, "learning_rate": 5e-07, "logits/chosen": -20551176.0, "logits/rejected": -25118381.333333332, "logps/chosen": -374.5505126953125, "logps/rejected": -114.108642578125, "loss": 0.2926, "rewards/chosen": 0.7854855537414551, "rewards/margins": 2.5511967023213704, "rewards/rejected": -1.7657111485799153, "step": 17497 }, { "epoch": 0.9274640235337768, "grad_norm": 75.5, "kl": 4.484744071960449, "learning_rate": 5e-07, "logits/chosen": -1575421.125, "logits/rejected": 97600.5, "logps/chosen": -784.8911743164062, "logps/rejected": -270.3730163574219, "loss": 0.2674, "rewards/chosen": 1.3704140186309814, "rewards/margins": 4.0248284339904785, "rewards/rejected": -2.654414415359497, "step": 17498 }, { "epoch": 0.927517027535579, "grad_norm": 47.25, "kl": 1.1710662841796875, "learning_rate": 5e-07, "logits/chosen": -29740024.0, "logits/rejected": -36280564.0, "logps/chosen": -357.72515869140625, "logps/rejected": -311.10772705078125, "loss": 0.3606, "rewards/chosen": -0.030015582218766212, "rewards/margins": 1.6576298531144857, "rewards/rejected": -1.687645435333252, "step": 17499 }, { "epoch": 0.927570031537381, "grad_norm": 58.75, "kl": 0.07143402099609375, "learning_rate": 5e-07, "logits/chosen": -4053808.8, "logits/rejected": -7322978.666666667, "logps/chosen": -150.7722412109375, "logps/rejected": -233.82145182291666, "loss": 0.4011, "rewards/chosen": -0.10083359479904175, "rewards/margins": 1.4275630116462708, "rewards/rejected": -1.5283966064453125, "step": 17500 }, { "epoch": 0.9276230355391832, "grad_norm": 74.0, "kl": 0.4603691101074219, "learning_rate": 5e-07, "logits/chosen": -20865116.8, "logits/rejected": -2844464.0, "logps/chosen": -289.731201171875, "logps/rejected": -268.35345458984375, "loss": 0.3061, "rewards/chosen": 0.29090514183044436, "rewards/margins": 3.3180403550465902, "rewards/rejected": -3.027135213216146, "step": 17501 }, { "epoch": 0.9276760395409853, "grad_norm": 47.5, "kl": 2.8252525329589844, "learning_rate": 5e-07, "logits/chosen": -14365763.2, "logits/rejected": -13508490.666666666, "logps/chosen": -317.4137939453125, "logps/rejected": -260.39251708984375, "loss": 0.2573, "rewards/chosen": 1.1676339149475097, "rewards/margins": 3.7520248730977377, "rewards/rejected": -2.584390958150228, "step": 17502 }, { "epoch": 0.9277290435427875, "grad_norm": 52.25, "kl": 3.0209083557128906, "learning_rate": 5e-07, "logits/chosen": -15754089.333333334, "logits/rejected": -25346670.0, "logps/chosen": -275.3498942057292, "logps/rejected": -281.9719543457031, "loss": 0.3788, "rewards/chosen": 0.5541456937789917, "rewards/margins": 2.2206732034683228, "rewards/rejected": -1.666527509689331, "step": 17503 }, { "epoch": 0.9277820475445896, "grad_norm": 45.5, "kl": 1.7283897399902344, "learning_rate": 5e-07, "logits/chosen": -53700160.0, "logits/rejected": -22793992.0, "logps/chosen": -254.67083740234375, "logps/rejected": -540.9069213867188, "loss": 0.2214, "rewards/chosen": 0.8350507616996765, "rewards/margins": 3.7486321330070496, "rewards/rejected": -2.913581371307373, "step": 17504 }, { "epoch": 0.9278350515463918, "grad_norm": 44.25, "kl": 1.4656600952148438, "learning_rate": 5e-07, "logits/chosen": -33252661.333333332, "logits/rejected": -5127750.8, "logps/chosen": -179.38492838541666, "logps/rejected": -119.741259765625, "loss": 0.2927, "rewards/chosen": 0.6372305949529012, "rewards/margins": 2.105896290143331, "rewards/rejected": -1.4686656951904298, "step": 17505 }, { "epoch": 0.9278880555481939, "grad_norm": 57.0, "kl": 1.445505142211914, "learning_rate": 5e-07, "logits/chosen": -2158415.5, "logits/rejected": -6817623.0, "logps/chosen": -373.39532470703125, "logps/rejected": -323.7731018066406, "loss": 0.303, "rewards/chosen": 0.1829393357038498, "rewards/margins": 2.9708371609449387, "rewards/rejected": -2.787897825241089, "step": 17506 }, { "epoch": 0.9279410595499961, "grad_norm": 50.5, "kl": 0.4333152770996094, "learning_rate": 5e-07, "logits/chosen": -29802526.0, "logits/rejected": -15643071.0, "logps/chosen": -401.76123046875, "logps/rejected": -269.6732177734375, "loss": 0.2593, "rewards/chosen": 0.33985406160354614, "rewards/margins": 3.166840612888336, "rewards/rejected": -2.82698655128479, "step": 17507 }, { "epoch": 0.9279940635517981, "grad_norm": 31.5, "kl": 2.121187210083008, "learning_rate": 5e-07, "logits/chosen": -17810419.2, "logits/rejected": -13965840.0, "logps/chosen": -196.38092041015625, "logps/rejected": -457.1244303385417, "loss": 0.214, "rewards/chosen": 1.3826115608215332, "rewards/margins": 4.80956137975057, "rewards/rejected": -3.4269498189290366, "step": 17508 }, { "epoch": 0.9280470675536003, "grad_norm": 40.25, "kl": 1.3930234909057617, "learning_rate": 5e-07, "logits/chosen": -16816648.0, "logits/rejected": -38579948.0, "logps/chosen": -186.6100056966146, "logps/rejected": -577.134033203125, "loss": 0.2986, "rewards/chosen": 0.8347771962483724, "rewards/margins": 3.2102495034535727, "rewards/rejected": -2.3754723072052, "step": 17509 }, { "epoch": 0.9281000715554024, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -96013704.0, "logits/rejected": -5414522.5, "logps/chosen": -448.9128723144531, "logps/rejected": -206.8190155029297, "loss": 0.3684, "rewards/chosen": 0.1518799066543579, "rewards/margins": 1.4656050205230713, "rewards/rejected": -1.3137251138687134, "step": 17510 }, { "epoch": 0.9281530755572046, "grad_norm": 64.5, "kl": 5.49254846572876, "learning_rate": 5e-07, "logits/chosen": -20500840.0, "logits/rejected": 9342890.0, "logps/chosen": -391.4551595052083, "logps/rejected": -312.001708984375, "loss": 0.3557, "rewards/chosen": 1.012125571568807, "rewards/margins": 3.014948447545369, "rewards/rejected": -2.0028228759765625, "step": 17511 }, { "epoch": 0.9282060795590067, "grad_norm": 55.75, "kl": 2.622429847717285, "learning_rate": 5e-07, "logits/chosen": -19494433.333333332, "logits/rejected": -41758752.0, "logps/chosen": -609.65625, "logps/rejected": -347.73974609375, "loss": 0.3719, "rewards/chosen": 0.38057605425516766, "rewards/margins": 3.728233496348063, "rewards/rejected": -3.3476574420928955, "step": 17512 }, { "epoch": 0.9282590835608089, "grad_norm": 50.0, "kl": 5.623167991638184, "learning_rate": 5e-07, "logits/chosen": -37518022.4, "logits/rejected": -2861397.0, "logps/chosen": -641.427978515625, "logps/rejected": -340.4753824869792, "loss": 0.1853, "rewards/chosen": 1.7834503173828125, "rewards/margins": 3.9013184229532873, "rewards/rejected": -2.117868105570475, "step": 17513 }, { "epoch": 0.928312087562611, "grad_norm": 57.25, "kl": 2.0277099609375, "learning_rate": 5e-07, "logits/chosen": -35315760.0, "logits/rejected": -10435563.0, "logps/chosen": -376.650634765625, "logps/rejected": -237.15077209472656, "loss": 0.301, "rewards/chosen": 0.7062376737594604, "rewards/margins": 2.264883875846863, "rewards/rejected": -1.5586462020874023, "step": 17514 }, { "epoch": 0.9283650915644132, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41611924.0, "logits/rejected": -11792466.666666666, "logps/chosen": -248.85939025878906, "logps/rejected": -555.8193359375, "loss": 0.1501, "rewards/chosen": 0.6085067987442017, "rewards/margins": 3.5289560556411743, "rewards/rejected": -2.9204492568969727, "step": 17515 }, { "epoch": 0.9284180955662152, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43713349.333333336, "logits/rejected": -26283768.0, "logps/chosen": -394.5337727864583, "logps/rejected": -578.32255859375, "loss": 0.1803, "rewards/chosen": 0.47054751714070636, "rewards/margins": 4.476034577687581, "rewards/rejected": -4.005487060546875, "step": 17516 }, { "epoch": 0.9284710995680174, "grad_norm": 58.5, "kl": 3.8983993530273438, "learning_rate": 5e-07, "logits/chosen": -14888288.0, "logits/rejected": 575408.0, "logps/chosen": -747.033447265625, "logps/rejected": -228.9185028076172, "loss": 0.3604, "rewards/chosen": 1.0594831307729085, "rewards/margins": 3.1380949815114336, "rewards/rejected": -2.0786118507385254, "step": 17517 }, { "epoch": 0.9285241035698195, "grad_norm": 40.0, "kl": 1.7472457885742188, "learning_rate": 5e-07, "logits/chosen": -51006005.333333336, "logits/rejected": -84759974.4, "logps/chosen": -667.902099609375, "logps/rejected": -338.491455078125, "loss": 0.2314, "rewards/chosen": 0.9332280953725179, "rewards/margins": 3.012145980199178, "rewards/rejected": -2.07891788482666, "step": 17518 }, { "epoch": 0.9285771075716217, "grad_norm": 49.75, "kl": 5.417924880981445, "learning_rate": 5e-07, "logits/chosen": -1781721.142857143, "logits/rejected": -10094542.0, "logps/chosen": -180.65248325892858, "logps/rejected": -210.0651397705078, "loss": 0.4294, "rewards/chosen": 0.7344308580671038, "rewards/margins": 4.037024804524013, "rewards/rejected": -3.302593946456909, "step": 17519 }, { "epoch": 0.9286301115734238, "grad_norm": 66.5, "kl": 0.5021476745605469, "learning_rate": 5e-07, "logits/chosen": -29730934.4, "logits/rejected": -45743776.0, "logps/chosen": -340.2666259765625, "logps/rejected": -328.5601806640625, "loss": 0.3772, "rewards/chosen": 0.06722267866134643, "rewards/margins": 2.246174577871958, "rewards/rejected": -2.178951899210612, "step": 17520 }, { "epoch": 0.928683115575226, "grad_norm": 52.0, "kl": 2.6571340560913086, "learning_rate": 5e-07, "logits/chosen": -25872512.0, "logits/rejected": -28079562.666666668, "logps/chosen": -251.6536865234375, "logps/rejected": -264.5211181640625, "loss": 0.3167, "rewards/chosen": 0.5499833583831787, "rewards/margins": 4.381232277552287, "rewards/rejected": -3.831248919169108, "step": 17521 }, { "epoch": 0.9287361195770281, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -93100880.0, "logits/rejected": -33650752.0, "logps/chosen": -393.9693908691406, "logps/rejected": -391.8369547526042, "loss": 0.2227, "rewards/chosen": 0.5137191414833069, "rewards/margins": 2.741819600264231, "rewards/rejected": -2.2281004587809243, "step": 17522 }, { "epoch": 0.9287891235788303, "grad_norm": 37.0, "kl": 0.312469482421875, "learning_rate": 5e-07, "logits/chosen": -3207542.5, "logits/rejected": -24497653.333333332, "logps/chosen": -204.9352264404297, "logps/rejected": -330.9976399739583, "loss": 0.1994, "rewards/chosen": 0.7663257718086243, "rewards/margins": 3.0314281582832336, "rewards/rejected": -2.2651023864746094, "step": 17523 }, { "epoch": 0.9288421275806323, "grad_norm": 41.75, "kl": 1.1077194213867188, "learning_rate": 5e-07, "logits/chosen": -9454345.333333334, "logits/rejected": -47091961.6, "logps/chosen": -214.3602294921875, "logps/rejected": -424.153564453125, "loss": 0.2543, "rewards/chosen": 0.6241930723190308, "rewards/margins": 2.775494647026062, "rewards/rejected": -2.1513015747070314, "step": 17524 }, { "epoch": 0.9288951315824344, "grad_norm": 52.25, "kl": 0.7641105651855469, "learning_rate": 5e-07, "logits/chosen": 725237.5, "logits/rejected": -14875781.0, "logps/chosen": -222.097900390625, "logps/rejected": -292.43353271484375, "loss": 0.2967, "rewards/chosen": 0.5291758179664612, "rewards/margins": 2.2498653531074524, "rewards/rejected": -1.7206895351409912, "step": 17525 }, { "epoch": 0.9289481355842366, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5059417.0, "logits/rejected": -186582.85416666666, "logps/chosen": -90.00239562988281, "logps/rejected": -187.1310017903646, "loss": 0.1282, "rewards/chosen": 1.3944518566131592, "rewards/margins": 4.0618280569712315, "rewards/rejected": -2.6673762003580728, "step": 17526 }, { "epoch": 0.9290011395860387, "grad_norm": 47.0, "kl": 2.371767997741699, "learning_rate": 5e-07, "logits/chosen": -38117776.0, "logits/rejected": -71286288.0, "logps/chosen": -383.8458557128906, "logps/rejected": -583.2089233398438, "loss": 0.2275, "rewards/chosen": 1.1258095502853394, "rewards/margins": 4.697947144508362, "rewards/rejected": -3.5721375942230225, "step": 17527 }, { "epoch": 0.9290541435878409, "grad_norm": 31.125, "kl": 2.944005012512207, "learning_rate": 5e-07, "logits/chosen": -16940725.333333332, "logits/rejected": -41661363.2, "logps/chosen": -124.98262532552083, "logps/rejected": -601.19814453125, "loss": 0.3183, "rewards/chosen": -0.16065216064453125, "rewards/margins": 3.1875640869140627, "rewards/rejected": -3.348216247558594, "step": 17528 }, { "epoch": 0.929107147589643, "grad_norm": 39.75, "kl": 1.1630144119262695, "learning_rate": 5e-07, "logits/chosen": -11418034.0, "logits/rejected": -37341456.0, "logps/chosen": -147.64378356933594, "logps/rejected": -317.33416748046875, "loss": 0.2244, "rewards/chosen": 0.9567364454269409, "rewards/margins": 3.1860049962997437, "rewards/rejected": -2.2292685508728027, "step": 17529 }, { "epoch": 0.9291601515914452, "grad_norm": 39.75, "kl": 0.04889488220214844, "learning_rate": 5e-07, "logits/chosen": -21369214.0, "logits/rejected": -50633290.666666664, "logps/chosen": -385.4783935546875, "logps/rejected": -430.6685791015625, "loss": 0.1602, "rewards/chosen": 0.6027023792266846, "rewards/margins": 3.136751890182495, "rewards/rejected": -2.5340495109558105, "step": 17530 }, { "epoch": 0.9292131555932472, "grad_norm": 37.5, "kl": 3.853976249694824, "learning_rate": 5e-07, "logits/chosen": 4446549.0, "logits/rejected": -23296496.0, "logps/chosen": -573.5986328125, "logps/rejected": -319.61126708984375, "loss": 0.2034, "rewards/chosen": 1.945557713508606, "rewards/margins": 3.4912766218185425, "rewards/rejected": -1.5457189083099365, "step": 17531 }, { "epoch": 0.9292661595950494, "grad_norm": 54.0, "kl": 2.4771251678466797, "learning_rate": 5e-07, "logits/chosen": -877722.0, "logits/rejected": -11488617.333333334, "logps/chosen": -329.35992431640625, "logps/rejected": -173.4529012044271, "loss": 0.2738, "rewards/chosen": 1.9062950611114502, "rewards/margins": 2.8748366832733154, "rewards/rejected": -0.9685416221618652, "step": 17532 }, { "epoch": 0.9293191635968515, "grad_norm": 50.0, "kl": 1.854736328125, "learning_rate": 5e-07, "logits/chosen": -36505882.666666664, "logits/rejected": -23213139.2, "logps/chosen": -499.1951497395833, "logps/rejected": -253.128173828125, "loss": 0.3316, "rewards/chosen": 0.3725922107696533, "rewards/margins": 2.00917592048645, "rewards/rejected": -1.6365837097167968, "step": 17533 }, { "epoch": 0.9293721675986537, "grad_norm": 51.0, "kl": 6.077868461608887, "learning_rate": 5e-07, "logits/chosen": -19525515.42857143, "logits/rejected": -53553972.0, "logps/chosen": -301.7955845424107, "logps/rejected": -596.321533203125, "loss": 0.4295, "rewards/chosen": 0.9496141161237445, "rewards/margins": 2.3330614055906023, "rewards/rejected": -1.383447289466858, "step": 17534 }, { "epoch": 0.9294251716004558, "grad_norm": 43.75, "kl": 0.8321170806884766, "learning_rate": 5e-07, "logits/chosen": -8616541.0, "logits/rejected": -11996900.0, "logps/chosen": -463.5296630859375, "logps/rejected": -269.21881103515625, "loss": 0.2435, "rewards/chosen": 0.9067613482475281, "rewards/margins": 3.0850512385368347, "rewards/rejected": -2.1782898902893066, "step": 17535 }, { "epoch": 0.929478175602258, "grad_norm": 39.0, "kl": 0.15443992614746094, "learning_rate": 5e-07, "logits/chosen": 1922985.25, "logits/rejected": -52864154.666666664, "logps/chosen": -81.31031799316406, "logps/rejected": -285.14801025390625, "loss": 0.2671, "rewards/chosen": 0.4717285633087158, "rewards/margins": 3.0191983381907144, "rewards/rejected": -2.5474697748819985, "step": 17536 }, { "epoch": 0.9295311796040601, "grad_norm": 41.75, "kl": 2.6137285232543945, "learning_rate": 5e-07, "logits/chosen": -19209562.0, "logits/rejected": -32148510.0, "logps/chosen": -209.9862823486328, "logps/rejected": -370.88507080078125, "loss": 0.2841, "rewards/chosen": 1.0146796703338623, "rewards/margins": 2.8336031436920166, "rewards/rejected": -1.8189234733581543, "step": 17537 }, { "epoch": 0.9295841836058623, "grad_norm": 34.75, "kl": 3.5543365478515625, "learning_rate": 5e-07, "logits/chosen": -1513880.0, "logits/rejected": -29850141.333333332, "logps/chosen": -198.9216552734375, "logps/rejected": -322.1100667317708, "loss": 0.3229, "rewards/chosen": 0.8204135894775391, "rewards/margins": 4.0942929585774746, "rewards/rejected": -3.273879369099935, "step": 17538 }, { "epoch": 0.9296371876076643, "grad_norm": 42.5, "kl": 5.553824424743652, "learning_rate": 5e-07, "logits/chosen": 2935921.0, "logits/rejected": -33949596.0, "logps/chosen": -130.20469665527344, "logps/rejected": -416.79876708984375, "loss": 0.2495, "rewards/chosen": 0.7015156149864197, "rewards/margins": 5.538370072841644, "rewards/rejected": -4.836854457855225, "step": 17539 }, { "epoch": 0.9296901916094665, "grad_norm": 46.25, "kl": 1.0421257019042969, "learning_rate": 5e-07, "logits/chosen": -57685816.0, "logits/rejected": -27065141.333333332, "logps/chosen": -381.06817626953125, "logps/rejected": -303.29368082682294, "loss": 0.2071, "rewards/chosen": 0.38198089599609375, "rewards/margins": 2.643479665120443, "rewards/rejected": -2.261498769124349, "step": 17540 }, { "epoch": 0.9297431956112686, "grad_norm": 40.5, "kl": 1.133784294128418, "learning_rate": 5e-07, "logits/chosen": 810291.0, "logits/rejected": -115567.25, "logps/chosen": -709.412353515625, "logps/rejected": -249.1946258544922, "loss": 0.1883, "rewards/chosen": 1.7992132902145386, "rewards/margins": 4.179797291755676, "rewards/rejected": -2.3805840015411377, "step": 17541 }, { "epoch": 0.9297961996130708, "grad_norm": 45.75, "kl": 4.232122421264648, "learning_rate": 5e-07, "logits/chosen": -18799172.8, "logits/rejected": -26802781.333333332, "logps/chosen": -176.2803955078125, "logps/rejected": -115.81636555989583, "loss": 0.3536, "rewards/chosen": 0.6808220863342285, "rewards/margins": 2.3727983156840007, "rewards/rejected": -1.6919762293497722, "step": 17542 }, { "epoch": 0.9298492036148729, "grad_norm": 47.25, "kl": 2.0749406814575195, "learning_rate": 5e-07, "logits/chosen": -22055264.0, "logits/rejected": -1673035.875, "logps/chosen": -248.43048095703125, "logps/rejected": -197.67984008789062, "loss": 0.2888, "rewards/chosen": 0.6808074712753296, "rewards/margins": 2.9484070539474487, "rewards/rejected": -2.267599582672119, "step": 17543 }, { "epoch": 0.9299022076166751, "grad_norm": 40.25, "kl": 1.9111089706420898, "learning_rate": 5e-07, "logits/chosen": -26553317.333333332, "logits/rejected": -25939756.8, "logps/chosen": -186.72074381510416, "logps/rejected": -479.577734375, "loss": 0.2835, "rewards/chosen": -0.23713086048762003, "rewards/margins": 2.6420389850934347, "rewards/rejected": -2.8791698455810546, "step": 17544 }, { "epoch": 0.9299552116184772, "grad_norm": 48.5, "kl": 2.8858642578125, "learning_rate": 5e-07, "logits/chosen": -33438115.2, "logits/rejected": -54735450.666666664, "logps/chosen": -277.922998046875, "logps/rejected": -641.7764485677084, "loss": 0.3482, "rewards/chosen": 0.31324167251586915, "rewards/margins": 4.1673962593078615, "rewards/rejected": -3.854154586791992, "step": 17545 }, { "epoch": 0.9300082156202794, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33104694.0, "logits/rejected": -27782400.0, "logps/chosen": -443.84417724609375, "logps/rejected": -356.3231201171875, "loss": 0.2403, "rewards/chosen": 0.5903862118721008, "rewards/margins": 2.90832382440567, "rewards/rejected": -2.3179376125335693, "step": 17546 }, { "epoch": 0.9300612196220814, "grad_norm": 31.25, "kl": 2.4895763397216797, "learning_rate": 5e-07, "logits/chosen": -7682718.0, "logits/rejected": -5050868.666666667, "logps/chosen": -298.18853759765625, "logps/rejected": -184.99015299479166, "loss": 0.1741, "rewards/chosen": 1.8474011421203613, "rewards/margins": 5.21250327428182, "rewards/rejected": -3.3651021321614585, "step": 17547 }, { "epoch": 0.9301142236238836, "grad_norm": 38.5, "kl": 3.2259044647216797, "learning_rate": 5e-07, "logits/chosen": -40841360.0, "logits/rejected": -44722536.0, "logps/chosen": -299.77435302734375, "logps/rejected": -471.592041015625, "loss": 0.172, "rewards/chosen": 1.5986683368682861, "rewards/margins": 5.006022055943808, "rewards/rejected": -3.407353719075521, "step": 17548 }, { "epoch": 0.9301672276256857, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41016245.333333336, "logits/rejected": -38506776.0, "logps/chosen": -286.47802734375, "logps/rejected": -423.19354248046875, "loss": 0.358, "rewards/chosen": 0.1914177139600118, "rewards/margins": 3.6752281387646994, "rewards/rejected": -3.4838104248046875, "step": 17549 }, { "epoch": 0.9302202316274879, "grad_norm": 88.5, "kl": 2.0358963012695312, "learning_rate": 5e-07, "logits/chosen": -66948554.666666664, "logits/rejected": 9756871.2, "logps/chosen": -948.7201334635416, "logps/rejected": -140.85035400390626, "loss": 0.238, "rewards/chosen": 1.4013586044311523, "rewards/margins": 3.510812759399414, "rewards/rejected": -2.1094541549682617, "step": 17550 }, { "epoch": 0.93027323562929, "grad_norm": 42.5, "kl": 1.4318666458129883, "learning_rate": 5e-07, "logits/chosen": -33370627.2, "logits/rejected": -5408643.0, "logps/chosen": -357.4242431640625, "logps/rejected": -299.2877197265625, "loss": 0.3479, "rewards/chosen": 0.5472201824188232, "rewards/margins": 2.895777336756388, "rewards/rejected": -2.348557154337565, "step": 17551 }, { "epoch": 0.9303262396310922, "grad_norm": 56.75, "kl": 3.318635940551758, "learning_rate": 5e-07, "logits/chosen": -34050605.71428572, "logits/rejected": -3188322.0, "logps/chosen": -310.79234095982144, "logps/rejected": -161.50286865234375, "loss": 0.3869, "rewards/chosen": 0.7132494109017509, "rewards/margins": 6.0702778952462335, "rewards/rejected": -5.357028484344482, "step": 17552 }, { "epoch": 0.9303792436328943, "grad_norm": 35.25, "kl": 0.9565749168395996, "learning_rate": 5e-07, "logits/chosen": -23113630.4, "logits/rejected": -36069376.0, "logps/chosen": -292.3799560546875, "logps/rejected": -413.6885986328125, "loss": 0.2651, "rewards/chosen": 0.8820226669311524, "rewards/margins": 4.074952888488769, "rewards/rejected": -3.192930221557617, "step": 17553 }, { "epoch": 0.9304322476346965, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68713544.0, "logits/rejected": -43517892.571428575, "logps/chosen": -720.7886962890625, "logps/rejected": -291.62587193080356, "loss": 0.1663, "rewards/chosen": 0.27827149629592896, "rewards/margins": 3.0669225539479936, "rewards/rejected": -2.7886510576520647, "step": 17554 }, { "epoch": 0.9304852516364985, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34470052.0, "logits/rejected": -2189201.8333333335, "logps/chosen": -486.177001953125, "logps/rejected": -163.59419759114584, "loss": 0.1945, "rewards/chosen": 0.008726537227630615, "rewards/margins": 2.8599155147870383, "rewards/rejected": -2.8511889775594077, "step": 17555 }, { "epoch": 0.9305382556383007, "grad_norm": 43.25, "kl": 3.897336006164551, "learning_rate": 5e-07, "logits/chosen": -17769124.0, "logits/rejected": -1065718.875, "logps/chosen": -180.06722005208334, "logps/rejected": -361.317138671875, "loss": 0.3337, "rewards/chosen": 0.7563635508219401, "rewards/margins": 3.427074591318766, "rewards/rejected": -2.670711040496826, "step": 17556 }, { "epoch": 0.9305912596401028, "grad_norm": 43.75, "kl": 5.094135284423828, "learning_rate": 5e-07, "logits/chosen": -14139041.6, "logits/rejected": -12906132.0, "logps/chosen": -266.8394287109375, "logps/rejected": -360.9181315104167, "loss": 0.352, "rewards/chosen": 0.7533498287200928, "rewards/margins": 3.1590223789215086, "rewards/rejected": -2.405672550201416, "step": 17557 }, { "epoch": 0.930644263641905, "grad_norm": 63.0, "kl": 2.979071617126465, "learning_rate": 5e-07, "logits/chosen": -10887756.0, "logits/rejected": -7981849.333333333, "logps/chosen": -303.455029296875, "logps/rejected": -117.38255818684895, "loss": 0.3591, "rewards/chosen": 0.3837733745574951, "rewards/margins": 3.7558099905649818, "rewards/rejected": -3.372036616007487, "step": 17558 }, { "epoch": 0.9306972676437071, "grad_norm": 62.0, "kl": 6.354560852050781, "learning_rate": 5e-07, "logits/chosen": -20609908.57142857, "logits/rejected": -30323124.0, "logps/chosen": -498.23960658482144, "logps/rejected": -139.0276641845703, "loss": 0.405, "rewards/chosen": 0.9984651293073382, "rewards/margins": 2.835093923977443, "rewards/rejected": -1.836628794670105, "step": 17559 }, { "epoch": 0.9307502716455093, "grad_norm": 57.25, "kl": 0.9904022216796875, "learning_rate": 5e-07, "logits/chosen": -17841716.0, "logits/rejected": -6310044.0, "logps/chosen": -301.8574625651042, "logps/rejected": -369.780322265625, "loss": 0.3099, "rewards/chosen": 0.30864131450653076, "rewards/margins": 2.216943621635437, "rewards/rejected": -1.9083023071289062, "step": 17560 }, { "epoch": 0.9308032756473114, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3383880.0, "logits/rejected": -9723678.0, "logps/chosen": -371.522216796875, "logps/rejected": -152.23275756835938, "loss": 0.2377, "rewards/chosen": 0.30433160066604614, "rewards/margins": 3.690796673297882, "rewards/rejected": -3.386465072631836, "step": 17561 }, { "epoch": 0.9308562796491135, "grad_norm": 42.75, "kl": 0.2659883499145508, "learning_rate": 5e-07, "logits/chosen": -53104640.0, "logits/rejected": -15749969.0, "logps/chosen": -230.3048095703125, "logps/rejected": -363.47552490234375, "loss": 0.2512, "rewards/chosen": 0.40104836225509644, "rewards/margins": 3.1020981669425964, "rewards/rejected": -2.7010498046875, "step": 17562 }, { "epoch": 0.9309092836509156, "grad_norm": 52.5, "kl": 3.391630172729492, "learning_rate": 5e-07, "logits/chosen": -30685580.8, "logits/rejected": -39894152.0, "logps/chosen": -256.4914306640625, "logps/rejected": -408.697998046875, "loss": 0.2737, "rewards/chosen": 1.286183738708496, "rewards/margins": 3.395508829752604, "rewards/rejected": -2.109325091044108, "step": 17563 }, { "epoch": 0.9309622876527178, "grad_norm": 46.0, "kl": 0.3187446594238281, "learning_rate": 5e-07, "logits/chosen": -36279626.666666664, "logits/rejected": -41981315.2, "logps/chosen": -402.0546875, "logps/rejected": -404.5649658203125, "loss": 0.2029, "rewards/chosen": 0.7826832930246989, "rewards/margins": 3.119000546137492, "rewards/rejected": -2.336317253112793, "step": 17564 }, { "epoch": 0.9310152916545199, "grad_norm": 96.5, "kl": 6.105863571166992, "learning_rate": 5e-07, "logits/chosen": -11888145.333333334, "logits/rejected": -6872589.5, "logps/chosen": -375.700439453125, "logps/rejected": -547.854248046875, "loss": 0.4181, "rewards/chosen": 0.5067785580952963, "rewards/margins": 5.410215695699056, "rewards/rejected": -4.90343713760376, "step": 17565 }, { "epoch": 0.9310682956563221, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -83023136.0, "logps/rejected": -366.0187072753906, "loss": 0.0899, "rewards/rejected": -2.5600104331970215, "step": 17566 }, { "epoch": 0.9311212996581242, "grad_norm": 51.25, "kl": 0.027828216552734375, "learning_rate": 5e-07, "logits/chosen": -55893488.0, "logits/rejected": -37946580.0, "logps/chosen": -273.1652526855469, "logps/rejected": -319.4918212890625, "loss": 0.2708, "rewards/chosen": 0.06838150322437286, "rewards/margins": 3.159457877278328, "rewards/rejected": -3.091076374053955, "step": 17567 }, { "epoch": 0.9311743036599264, "grad_norm": 49.0, "kl": 1.3760566711425781, "learning_rate": 5e-07, "logits/chosen": -36037288.0, "logits/rejected": 13005992.0, "logps/chosen": -308.2158203125, "logps/rejected": -496.3924560546875, "loss": 0.3584, "rewards/chosen": -0.29040223360061646, "rewards/margins": 2.4229370951652527, "rewards/rejected": -2.713339328765869, "step": 17568 }, { "epoch": 0.9312273076617285, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65551690.666666664, "logits/rejected": 2274100.5, "logps/chosen": -462.2169596354167, "logps/rejected": -100.5682144165039, "loss": 0.2911, "rewards/chosen": 0.6188550790150961, "rewards/margins": 3.3914061387379966, "rewards/rejected": -2.7725510597229004, "step": 17569 }, { "epoch": 0.9312803116635306, "grad_norm": 52.5, "kl": 2.0466861724853516, "learning_rate": 5e-07, "logits/chosen": -19185841.6, "logits/rejected": -39006560.0, "logps/chosen": -244.158251953125, "logps/rejected": -419.9694417317708, "loss": 0.3247, "rewards/chosen": 0.23241329193115234, "rewards/margins": 3.179535230000814, "rewards/rejected": -2.9471219380696616, "step": 17570 }, { "epoch": 0.9313333156653327, "grad_norm": 62.5, "kl": 3.8901901245117188, "learning_rate": 5e-07, "logits/chosen": -23244796.0, "logits/rejected": -42451488.0, "logps/chosen": -289.12420654296875, "logps/rejected": -660.9186401367188, "loss": 0.2718, "rewards/chosen": 0.7684932947158813, "rewards/margins": 4.392576813697815, "rewards/rejected": -3.6240835189819336, "step": 17571 }, { "epoch": 0.9313863196671349, "grad_norm": 41.5, "kl": 1.0678749084472656, "learning_rate": 5e-07, "logits/chosen": 10222860.0, "logits/rejected": -15468283.2, "logps/chosen": -358.866943359375, "logps/rejected": -212.5833984375, "loss": 0.1777, "rewards/chosen": 0.7816437085469564, "rewards/margins": 4.191190751393636, "rewards/rejected": -3.40954704284668, "step": 17572 }, { "epoch": 0.931439323668937, "grad_norm": 34.75, "kl": 1.3809428215026855, "learning_rate": 5e-07, "logits/chosen": -9706906.0, "logits/rejected": 3217794.0, "logps/chosen": -105.63192749023438, "logps/rejected": -131.6585693359375, "loss": 0.3409, "rewards/chosen": 0.46978187561035156, "rewards/margins": 2.006253480911255, "rewards/rejected": -1.5364716053009033, "step": 17573 }, { "epoch": 0.9314923276707392, "grad_norm": 46.5, "kl": 1.127176284790039, "learning_rate": 5e-07, "logits/chosen": -46871144.0, "logits/rejected": -11372617.333333334, "logps/chosen": -230.29830932617188, "logps/rejected": -257.3828531901042, "loss": 0.1881, "rewards/chosen": 1.078680396080017, "rewards/margins": 3.6977606217066445, "rewards/rejected": -2.6190802256266275, "step": 17574 }, { "epoch": 0.9315453316725413, "grad_norm": 56.0, "kl": 3.512054443359375, "learning_rate": 5e-07, "logits/chosen": -30597776.0, "logits/rejected": -7706882.0, "logps/chosen": -180.99136352539062, "logps/rejected": -292.8073425292969, "loss": 0.301, "rewards/chosen": 0.3434566557407379, "rewards/margins": 3.1450962126255035, "rewards/rejected": -2.8016395568847656, "step": 17575 }, { "epoch": 0.9315983356743434, "grad_norm": 45.25, "kl": 0.5389595031738281, "learning_rate": 5e-07, "logits/chosen": -25698373.333333332, "logits/rejected": -31823161.6, "logps/chosen": -436.1009928385417, "logps/rejected": -370.497607421875, "loss": 0.228, "rewards/chosen": 0.609296719233195, "rewards/margins": 2.6682376066843667, "rewards/rejected": -2.058940887451172, "step": 17576 }, { "epoch": 0.9316513396761456, "grad_norm": 49.0, "kl": 3.1320152282714844, "learning_rate": 5e-07, "logits/chosen": -56328464.0, "logits/rejected": 11019579.0, "logps/chosen": -501.2867838541667, "logps/rejected": -592.8049926757812, "loss": 0.3267, "rewards/chosen": 0.8690458933512369, "rewards/margins": 2.787638703982035, "rewards/rejected": -1.9185928106307983, "step": 17577 }, { "epoch": 0.9317043436779476, "grad_norm": 60.5, "kl": 4.2000226974487305, "learning_rate": 5e-07, "logits/chosen": 9601445.6, "logits/rejected": -27491778.666666668, "logps/chosen": -407.42109375, "logps/rejected": -106.61346435546875, "loss": 0.4111, "rewards/chosen": 0.5474908351898193, "rewards/margins": 1.6939693291982014, "rewards/rejected": -1.146478494008382, "step": 17578 }, { "epoch": 0.9317573476797498, "grad_norm": 27.75, "kl": 2.6568918228149414, "learning_rate": 5e-07, "logits/chosen": -36929740.0, "logits/rejected": -34394669.333333336, "logps/chosen": -1181.5328369140625, "logps/rejected": -231.35066731770834, "loss": 0.1799, "rewards/chosen": 2.2200722694396973, "rewards/margins": 4.464644114176432, "rewards/rejected": -2.244571844736735, "step": 17579 }, { "epoch": 0.9318103516815519, "grad_norm": 32.25, "kl": 2.5098190307617188, "learning_rate": 5e-07, "logits/chosen": 12662637.333333334, "logits/rejected": 1359321.5, "logps/chosen": -24.706649780273438, "logps/rejected": -182.790869140625, "loss": 0.1647, "rewards/chosen": 1.7095181147257488, "rewards/margins": 4.131658999125163, "rewards/rejected": -2.422140884399414, "step": 17580 }, { "epoch": 0.9318633556833541, "grad_norm": 32.25, "kl": 0.9606418609619141, "learning_rate": 5e-07, "logits/chosen": -4516465.0, "logits/rejected": -48438576.0, "logps/chosen": -217.98275756835938, "logps/rejected": -362.4580485026042, "loss": 0.1739, "rewards/chosen": 1.0979602336883545, "rewards/margins": 3.2375916639963784, "rewards/rejected": -2.139631430308024, "step": 17581 }, { "epoch": 0.9319163596851562, "grad_norm": 63.5, "kl": 4.159000396728516, "learning_rate": 5e-07, "logits/chosen": -25437476.8, "logits/rejected": -20838434.666666668, "logps/chosen": -269.3275146484375, "logps/rejected": -316.5289306640625, "loss": 0.2347, "rewards/chosen": 1.2400254249572753, "rewards/margins": 3.542085870107015, "rewards/rejected": -2.3020604451497397, "step": 17582 }, { "epoch": 0.9319693636869584, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2321462.0, "logits/rejected": -33339125.333333332, "logps/chosen": -402.59539794921875, "logps/rejected": -267.59336344401044, "loss": 0.1146, "rewards/chosen": 2.0332305431365967, "rewards/margins": 4.759164571762085, "rewards/rejected": -2.7259340286254883, "step": 17583 }, { "epoch": 0.9320223676887605, "grad_norm": 54.0, "kl": 3.811227798461914, "learning_rate": 5e-07, "logits/chosen": -20529801.6, "logits/rejected": -27801010.666666668, "logps/chosen": -385.0903076171875, "logps/rejected": -276.6331380208333, "loss": 0.3394, "rewards/chosen": 1.2882080078125, "rewards/margins": 2.3290456136067705, "rewards/rejected": -1.0408376057942708, "step": 17584 }, { "epoch": 0.9320753716905626, "grad_norm": 52.0, "kl": 1.1163959503173828, "learning_rate": 5e-07, "logits/chosen": -15651696.0, "logits/rejected": -90419248.0, "logps/chosen": -289.001220703125, "logps/rejected": -674.2138061523438, "loss": 0.3494, "rewards/chosen": 0.38987823327382404, "rewards/margins": 3.034942110379537, "rewards/rejected": -2.645063877105713, "step": 17585 }, { "epoch": 0.9321283756923647, "grad_norm": 55.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1694463.0, "logits/rejected": -11170102.0, "logps/chosen": -570.4349365234375, "logps/rejected": -183.21988932291666, "loss": 0.2878, "rewards/chosen": 0.08697815239429474, "rewards/margins": 1.7833487540483475, "rewards/rejected": -1.6963706016540527, "step": 17586 }, { "epoch": 0.9321813796941669, "grad_norm": 36.75, "kl": 2.4356555938720703, "learning_rate": 5e-07, "logits/chosen": -2801028.4, "logits/rejected": -28033738.666666668, "logps/chosen": -117.3109619140625, "logps/rejected": -353.0445556640625, "loss": 0.3994, "rewards/chosen": 0.28939132690429686, "rewards/margins": 2.025214958190918, "rewards/rejected": -1.735823631286621, "step": 17587 }, { "epoch": 0.932234383695969, "grad_norm": 36.25, "kl": 2.3649444580078125, "learning_rate": 5e-07, "logits/chosen": -5121859.5, "logits/rejected": -17116572.0, "logps/chosen": -136.67428588867188, "logps/rejected": -185.3443603515625, "loss": 0.3456, "rewards/chosen": 0.5145049095153809, "rewards/margins": 2.2901055812835693, "rewards/rejected": -1.7756006717681885, "step": 17588 }, { "epoch": 0.9322873876977712, "grad_norm": 49.75, "kl": 2.71286678314209, "learning_rate": 5e-07, "logits/chosen": -20266450.666666668, "logits/rejected": -25120956.8, "logps/chosen": -267.7244873046875, "logps/rejected": -230.7633544921875, "loss": 0.299, "rewards/chosen": 1.2385912736256917, "rewards/margins": 2.8196925004323323, "rewards/rejected": -1.5811012268066407, "step": 17589 }, { "epoch": 0.9323403916995733, "grad_norm": 43.75, "kl": 3.533296585083008, "learning_rate": 5e-07, "logits/chosen": 5361214.333333333, "logits/rejected": -7692846.4, "logps/chosen": -112.32041422526042, "logps/rejected": -345.533984375, "loss": 0.1882, "rewards/chosen": 1.3748960494995117, "rewards/margins": 4.875272178649903, "rewards/rejected": -3.5003761291503905, "step": 17590 }, { "epoch": 0.9323933957013755, "grad_norm": 52.5, "kl": 4.306221008300781, "learning_rate": 5e-07, "logits/chosen": -44468534.4, "logits/rejected": 728134.0833333334, "logps/chosen": -416.54072265625, "logps/rejected": -101.90437825520833, "loss": 0.3613, "rewards/chosen": 0.8703865051269531, "rewards/margins": 3.3295981725056967, "rewards/rejected": -2.4592116673787436, "step": 17591 }, { "epoch": 0.9324463997031776, "grad_norm": 39.25, "kl": 0.6143951416015625, "learning_rate": 5e-07, "logits/chosen": -2558947.0, "logits/rejected": -34177677.333333336, "logps/chosen": -387.564013671875, "logps/rejected": -426.3614908854167, "loss": 0.1282, "rewards/chosen": 1.8632425308227538, "rewards/margins": 4.611288452148438, "rewards/rejected": -2.7480459213256836, "step": 17592 }, { "epoch": 0.9324994037049797, "grad_norm": 47.25, "kl": 5.04905891418457, "learning_rate": 5e-07, "logits/chosen": -53810572.8, "logits/rejected": -10347562.0, "logps/chosen": -293.442529296875, "logps/rejected": -109.50492350260417, "loss": 0.4131, "rewards/chosen": 0.11225783824920654, "rewards/margins": 2.7250810066858926, "rewards/rejected": -2.612823168436686, "step": 17593 }, { "epoch": 0.9325524077067818, "grad_norm": 51.5, "kl": 1.748347282409668, "learning_rate": 5e-07, "logits/chosen": -44437650.666666664, "logits/rejected": -19649006.0, "logps/chosen": -236.7249552408854, "logps/rejected": -205.158203125, "loss": 0.3293, "rewards/chosen": 0.6693994204203287, "rewards/margins": 4.745298067728679, "rewards/rejected": -4.07589864730835, "step": 17594 }, { "epoch": 0.932605411708584, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 66099280.0, "logits/rejected": -1336826.857142857, "logps/chosen": -528.2791137695312, "logps/rejected": -351.08998325892856, "loss": 0.28, "rewards/chosen": -1.407684326171875, "rewards/margins": 0.2843235560825892, "rewards/rejected": -1.6920078822544642, "step": 17595 }, { "epoch": 0.9326584157103861, "grad_norm": 34.0, "kl": 0.8904209136962891, "learning_rate": 5e-07, "logits/chosen": 5847012.0, "logits/rejected": -12278114.0, "logps/chosen": -160.8782501220703, "logps/rejected": -273.33685302734375, "loss": 0.2056, "rewards/chosen": 0.6744714975357056, "rewards/margins": 4.309290051460266, "rewards/rejected": -3.6348185539245605, "step": 17596 }, { "epoch": 0.9327114197121883, "grad_norm": 35.75, "kl": 0.6637725830078125, "learning_rate": 5e-07, "logits/chosen": -27460469.333333332, "logits/rejected": -76335494.4, "logps/chosen": -209.13541666666666, "logps/rejected": -480.248291015625, "loss": 0.1848, "rewards/chosen": 0.4216390053431193, "rewards/margins": 4.221763364473979, "rewards/rejected": -3.8001243591308596, "step": 17597 }, { "epoch": 0.9327644237139904, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26359005.333333332, "logits/rejected": -44598476.8, "logps/chosen": -276.8424072265625, "logps/rejected": -438.619189453125, "loss": 0.2466, "rewards/chosen": 0.13132437070210776, "rewards/margins": 2.6990581353505454, "rewards/rejected": -2.5677337646484375, "step": 17598 }, { "epoch": 0.9328174277157926, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43424088.0, "logits/rejected": -17835908.0, "logps/chosen": -483.93023681640625, "logps/rejected": -303.18560791015625, "loss": 0.2497, "rewards/chosen": 0.39778223633766174, "rewards/margins": 3.26769295334816, "rewards/rejected": -2.869910717010498, "step": 17599 }, { "epoch": 0.9328704317175947, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37754613.333333336, "logits/rejected": -46096268.8, "logps/chosen": -395.04541015625, "logps/rejected": -273.1852294921875, "loss": 0.2596, "rewards/chosen": -0.10748012860616048, "rewards/margins": 3.2874425093332924, "rewards/rejected": -3.394922637939453, "step": 17600 }, { "epoch": 0.9329234357193968, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18514670.0, "logits/rejected": -21181118.0, "logps/chosen": -396.5779724121094, "logps/rejected": -225.90798950195312, "loss": 0.3141, "rewards/chosen": 0.1212085708975792, "rewards/margins": 2.095080517232418, "rewards/rejected": -1.9738719463348389, "step": 17601 }, { "epoch": 0.9329764397211989, "grad_norm": 48.75, "kl": 2.326366901397705, "learning_rate": 5e-07, "logits/chosen": -37306816.0, "logits/rejected": -20822952.0, "logps/chosen": -389.613818359375, "logps/rejected": -853.9254557291666, "loss": 0.249, "rewards/chosen": 0.8101226806640625, "rewards/margins": 6.328852844238281, "rewards/rejected": -5.518730163574219, "step": 17602 }, { "epoch": 0.9330294437230011, "grad_norm": 62.75, "kl": 4.135863304138184, "learning_rate": 5e-07, "logits/chosen": -1207103.75, "logits/rejected": -25452312.0, "logps/chosen": -78.04742431640625, "logps/rejected": -443.42901611328125, "loss": 0.2809, "rewards/chosen": 0.9007507562637329, "rewards/margins": 3.986944317817688, "rewards/rejected": -3.086193561553955, "step": 17603 }, { "epoch": 0.9330824477248032, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23349344.0, "logits/rejected": -16135252.0, "logps/chosen": -423.432861328125, "logps/rejected": -186.0366668701172, "loss": 0.2579, "rewards/chosen": 0.34178581833839417, "rewards/margins": 3.7470389902591705, "rewards/rejected": -3.4052531719207764, "step": 17604 }, { "epoch": 0.9331354517266054, "grad_norm": 38.0, "kl": 2.5883922576904297, "learning_rate": 5e-07, "logits/chosen": -2585691.3333333335, "logits/rejected": -229821.1875, "logps/chosen": -144.2096150716146, "logps/rejected": -68.45195007324219, "loss": 0.381, "rewards/chosen": 0.6599268913269043, "rewards/margins": 1.8292288780212402, "rewards/rejected": -1.169301986694336, "step": 17605 }, { "epoch": 0.9331884557284075, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57238928.0, "logits/rejected": -30826003.2, "logps/chosen": -388.859130859375, "logps/rejected": -230.80869140625, "loss": 0.2633, "rewards/chosen": 0.011091741422812143, "rewards/margins": 3.4523633326093353, "rewards/rejected": -3.4412715911865233, "step": 17606 }, { "epoch": 0.9332414597302097, "grad_norm": 58.0, "kl": 1.3401870727539062, "learning_rate": 5e-07, "logits/chosen": -34184620.8, "logits/rejected": -12431421.333333334, "logps/chosen": -468.35361328125, "logps/rejected": -133.83857218424478, "loss": 0.3686, "rewards/chosen": 0.6578694343566894, "rewards/margins": 1.7931161085764566, "rewards/rejected": -1.1352466742197673, "step": 17607 }, { "epoch": 0.9332944637320117, "grad_norm": 55.5, "kl": 3.765812873840332, "learning_rate": 5e-07, "logits/chosen": -826764.8, "logits/rejected": -20014642.666666668, "logps/chosen": -298.217822265625, "logps/rejected": -110.3221435546875, "loss": 0.3272, "rewards/chosen": 1.1955458641052246, "rewards/margins": 2.3038654645284016, "rewards/rejected": -1.108319600423177, "step": 17608 }, { "epoch": 0.9333474677338139, "grad_norm": 45.75, "kl": 2.822803497314453, "learning_rate": 5e-07, "logits/chosen": -41560492.0, "logits/rejected": -37856412.0, "logps/chosen": -259.82080078125, "logps/rejected": -370.4858703613281, "loss": 0.2454, "rewards/chosen": 0.8864391446113586, "rewards/margins": 3.7568934559822083, "rewards/rejected": -2.8704543113708496, "step": 17609 }, { "epoch": 0.933400471735616, "grad_norm": 34.5, "kl": 1.2719764709472656, "learning_rate": 5e-07, "logits/chosen": 4249869.5, "logits/rejected": -29272016.0, "logps/chosen": -103.49910736083984, "logps/rejected": -351.073974609375, "loss": 0.1867, "rewards/chosen": 0.9905925989151001, "rewards/margins": 3.102609912554423, "rewards/rejected": -2.1120173136393228, "step": 17610 }, { "epoch": 0.9334534757374182, "grad_norm": 41.25, "kl": 1.8658332824707031, "learning_rate": 5e-07, "logits/chosen": 1985944.75, "logits/rejected": -47434128.0, "logps/chosen": -201.72381591796875, "logps/rejected": -421.69256591796875, "loss": 0.2738, "rewards/chosen": 0.6671749949455261, "rewards/margins": 2.4194294810295105, "rewards/rejected": -1.7522544860839844, "step": 17611 }, { "epoch": 0.9335064797392203, "grad_norm": 52.0, "kl": 3.3978118896484375, "learning_rate": 5e-07, "logits/chosen": -57872761.6, "logits/rejected": -44656949.333333336, "logps/chosen": -245.169384765625, "logps/rejected": -603.4224446614584, "loss": 0.2854, "rewards/chosen": 1.1336810111999511, "rewards/margins": 3.0404986063639323, "rewards/rejected": -1.9068175951639812, "step": 17612 }, { "epoch": 0.9335594837410225, "grad_norm": 56.0, "kl": 4.987004280090332, "learning_rate": 5e-07, "logits/chosen": -8606747.333333334, "logits/rejected": -41484320.0, "logps/chosen": -205.351806640625, "logps/rejected": -373.4085998535156, "loss": 0.3865, "rewards/chosen": 0.7511960665384928, "rewards/margins": 3.3544599215189614, "rewards/rejected": -2.6032638549804688, "step": 17613 }, { "epoch": 0.9336124877428246, "grad_norm": 57.25, "kl": 5.58477783203125, "learning_rate": 5e-07, "logits/chosen": -36303206.85714286, "logits/rejected": -51351312.0, "logps/chosen": -642.2721121651786, "logps/rejected": -259.31201171875, "loss": 0.3473, "rewards/chosen": 1.237297603062221, "rewards/margins": 3.8272574629102434, "rewards/rejected": -2.5899598598480225, "step": 17614 }, { "epoch": 0.9336654917446268, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28635792.0, "logits/rejected": -25852243.2, "logps/chosen": -213.8526611328125, "logps/rejected": -302.4053466796875, "loss": 0.3181, "rewards/chosen": -0.03092995285987854, "rewards/margins": 1.6349813401699067, "rewards/rejected": -1.6659112930297852, "step": 17615 }, { "epoch": 0.9337184957464288, "grad_norm": 46.75, "kl": 1.070648193359375, "learning_rate": 5e-07, "logits/chosen": -20861312.0, "logits/rejected": -21906676.0, "logps/chosen": -473.3162536621094, "logps/rejected": -121.92578887939453, "loss": 0.2232, "rewards/chosen": 1.1370093822479248, "rewards/margins": 2.9577548503875732, "rewards/rejected": -1.8207454681396484, "step": 17616 }, { "epoch": 0.933771499748231, "grad_norm": 69.5, "kl": 2.390012741088867, "learning_rate": 5e-07, "logits/chosen": -19448126.0, "logits/rejected": -34071912.0, "logps/chosen": -230.6027374267578, "logps/rejected": -240.72613525390625, "loss": 0.221, "rewards/chosen": 1.444527268409729, "rewards/margins": 3.2980616092681885, "rewards/rejected": -1.8535343408584595, "step": 17617 }, { "epoch": 0.9338245037500331, "grad_norm": 53.25, "kl": 0.4276447296142578, "learning_rate": 5e-07, "logits/chosen": -19767988.0, "logits/rejected": -20043988.0, "logps/chosen": -293.1101481119792, "logps/rejected": -484.1230773925781, "loss": 0.3217, "rewards/chosen": 0.44069631894429523, "rewards/margins": 4.2274547417958575, "rewards/rejected": -3.7867584228515625, "step": 17618 }, { "epoch": 0.9338775077518353, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1606054.25, "logits/rejected": -37735448.0, "logps/chosen": -356.84161376953125, "logps/rejected": -290.57460530598956, "loss": 0.1461, "rewards/chosen": 1.6745872497558594, "rewards/margins": 4.076633930206299, "rewards/rejected": -2.4020466804504395, "step": 17619 }, { "epoch": 0.9339305117536374, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17014582.0, "logits/rejected": -44515364.0, "logps/chosen": -286.5452575683594, "logps/rejected": -399.70330810546875, "loss": 0.3001, "rewards/chosen": -0.10275040566921234, "rewards/margins": 2.496162548661232, "rewards/rejected": -2.5989129543304443, "step": 17620 }, { "epoch": 0.9339835157554396, "grad_norm": 31.375, "kl": 1.895599365234375, "learning_rate": 5e-07, "logits/chosen": -2912007.0, "logits/rejected": -3370454.0, "logps/chosen": -238.63095092773438, "logps/rejected": -187.06192016601562, "loss": 0.294, "rewards/chosen": 0.17159849405288696, "rewards/margins": 4.731866896152496, "rewards/rejected": -4.560268402099609, "step": 17621 }, { "epoch": 0.9340365197572417, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21736798.0, "logits/rejected": -14155921.142857144, "logps/chosen": -74.58755493164062, "logps/rejected": -157.22056361607142, "loss": 0.1912, "rewards/chosen": 1.9557923078536987, "rewards/margins": 3.5799141100474765, "rewards/rejected": -1.624121802193778, "step": 17622 }, { "epoch": 0.9340895237590439, "grad_norm": 46.5, "kl": 1.5949554443359375, "learning_rate": 5e-07, "logits/chosen": -78275546.66666667, "logits/rejected": -30262060.8, "logps/chosen": -410.6620279947917, "logps/rejected": -242.6989013671875, "loss": 0.2893, "rewards/chosen": 0.2952311436335246, "rewards/margins": 2.382490626970927, "rewards/rejected": -2.0872594833374025, "step": 17623 }, { "epoch": 0.9341425277608459, "grad_norm": 43.75, "kl": 2.826864242553711, "learning_rate": 5e-07, "logits/chosen": -18416336.0, "logits/rejected": -36727385.6, "logps/chosen": -247.4889933268229, "logps/rejected": -326.3530029296875, "loss": 0.2802, "rewards/chosen": 0.3673619826634725, "rewards/margins": 2.7574639876683555, "rewards/rejected": -2.390102005004883, "step": 17624 }, { "epoch": 0.9341955317626481, "grad_norm": 58.0, "kl": 1.06890869140625, "learning_rate": 5e-07, "logits/chosen": -39666713.6, "logits/rejected": -10721169.333333334, "logps/chosen": -295.074365234375, "logps/rejected": -257.3379720052083, "loss": 0.3994, "rewards/chosen": 0.08089652061462402, "rewards/margins": 1.363836669921875, "rewards/rejected": -1.282940149307251, "step": 17625 }, { "epoch": 0.9342485357644502, "grad_norm": 43.0, "kl": 0.493408203125, "learning_rate": 5e-07, "logits/chosen": -26730338.0, "logits/rejected": -40187272.0, "logps/chosen": -242.02508544921875, "logps/rejected": -350.9700520833333, "loss": 0.2109, "rewards/chosen": -0.1179656982421875, "rewards/margins": 2.164371967315674, "rewards/rejected": -2.2823376655578613, "step": 17626 }, { "epoch": 0.9343015397662523, "grad_norm": 42.25, "kl": 2.895432472229004, "learning_rate": 5e-07, "logits/chosen": 8080183.0, "logits/rejected": 31725834.0, "logps/chosen": -31.026857376098633, "logps/rejected": -317.07476806640625, "loss": 0.4156, "rewards/chosen": 0.10243363678455353, "rewards/margins": 1.517885759472847, "rewards/rejected": -1.4154521226882935, "step": 17627 }, { "epoch": 0.9343545437680545, "grad_norm": 25.625, "kl": 3.307138442993164, "learning_rate": 5e-07, "logits/chosen": -12891463.0, "logits/rejected": -53177376.0, "logps/chosen": -502.840576171875, "logps/rejected": -509.57904052734375, "loss": 0.2422, "rewards/chosen": 1.5105881690979004, "rewards/margins": 4.449779033660889, "rewards/rejected": -2.9391908645629883, "step": 17628 }, { "epoch": 0.9344075477698566, "grad_norm": 37.0, "kl": 1.6642465591430664, "learning_rate": 5e-07, "logits/chosen": -3770129.3333333335, "logits/rejected": -53012486.4, "logps/chosen": -172.58235677083334, "logps/rejected": -456.126318359375, "loss": 0.2512, "rewards/chosen": 0.5915970007578532, "rewards/margins": 2.8062023321787515, "rewards/rejected": -2.2146053314208984, "step": 17629 }, { "epoch": 0.9344605517716588, "grad_norm": 53.5, "kl": 0.13495635986328125, "learning_rate": 5e-07, "logits/chosen": -11933965.6, "logits/rejected": -10812456.666666666, "logps/chosen": -222.54775390625, "logps/rejected": -230.21378580729166, "loss": 0.3325, "rewards/chosen": 0.5382768154144287, "rewards/margins": 1.957415723800659, "rewards/rejected": -1.4191389083862305, "step": 17630 }, { "epoch": 0.9345135557734608, "grad_norm": 43.0, "kl": 2.100440502166748, "learning_rate": 5e-07, "logits/chosen": 2053726.875, "logits/rejected": -10082437.0, "logps/chosen": -176.8507080078125, "logps/rejected": -427.18341064453125, "loss": 0.3228, "rewards/chosen": 0.3299769163131714, "rewards/margins": 2.0425634384155273, "rewards/rejected": -1.712586522102356, "step": 17631 }, { "epoch": 0.934566559775263, "grad_norm": 52.25, "kl": 2.7085914611816406, "learning_rate": 5e-07, "logits/chosen": 7257900.0, "logits/rejected": -95029301.33333333, "logps/chosen": -406.1144287109375, "logps/rejected": -494.0756429036458, "loss": 0.3668, "rewards/chosen": 0.4816270351409912, "rewards/margins": 3.3913315614064534, "rewards/rejected": -2.9097045262654624, "step": 17632 }, { "epoch": 0.9346195637770651, "grad_norm": 57.75, "kl": 1.3232002258300781, "learning_rate": 5e-07, "logits/chosen": 8694979.333333334, "logits/rejected": -1152869.25, "logps/chosen": -292.816650390625, "logps/rejected": -87.32381439208984, "loss": 0.4207, "rewards/chosen": 0.1163829763730367, "rewards/margins": 2.1668881376584372, "rewards/rejected": -2.0505051612854004, "step": 17633 }, { "epoch": 0.9346725677788673, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34992292.0, "logits/rejected": -16384480.0, "logps/chosen": -350.5634765625, "logps/rejected": -435.1107584635417, "loss": 0.1567, "rewards/chosen": 0.567953884601593, "rewards/margins": 3.2733851869901023, "rewards/rejected": -2.7054313023885093, "step": 17634 }, { "epoch": 0.9347255717806694, "grad_norm": 38.25, "kl": 4.462155818939209, "learning_rate": 5e-07, "logits/chosen": 3794981.6, "logits/rejected": -25605480.0, "logps/chosen": -72.26041259765626, "logps/rejected": -619.3814697265625, "loss": 0.3676, "rewards/chosen": 0.28710575103759767, "rewards/margins": 4.901579411824544, "rewards/rejected": -4.614473660786946, "step": 17635 }, { "epoch": 0.9347785757824716, "grad_norm": 38.25, "kl": 1.3575763702392578, "learning_rate": 5e-07, "logits/chosen": -16969240.0, "logits/rejected": -13540623.0, "logps/chosen": -200.45245361328125, "logps/rejected": -193.5032958984375, "loss": 0.2799, "rewards/chosen": 0.3671087920665741, "rewards/margins": 2.9187221229076385, "rewards/rejected": -2.5516133308410645, "step": 17636 }, { "epoch": 0.9348315797842737, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 350283776.0, "logits/rejected": -9980519.0, "logps/chosen": -342.3521728515625, "logps/rejected": -292.3116760253906, "loss": 0.2113, "rewards/chosen": 0.6803959012031555, "rewards/margins": 3.7161481976509094, "rewards/rejected": -3.035752296447754, "step": 17637 }, { "epoch": 0.9348845837860759, "grad_norm": 33.25, "kl": 1.108363151550293, "learning_rate": 5e-07, "logits/chosen": 4985402.333333333, "logits/rejected": -24961804.8, "logps/chosen": -365.6038411458333, "logps/rejected": -484.49931640625, "loss": 0.1896, "rewards/chosen": 1.693440278371175, "rewards/margins": 4.988769563039144, "rewards/rejected": -3.295329284667969, "step": 17638 }, { "epoch": 0.9349375877878779, "grad_norm": 52.25, "kl": 0.01839447021484375, "learning_rate": 5e-07, "logits/chosen": -28117126.4, "logits/rejected": -16486029.333333334, "logps/chosen": -323.598828125, "logps/rejected": -174.67582194010416, "loss": 0.2926, "rewards/chosen": 0.3632157564163208, "rewards/margins": 3.2594667991002404, "rewards/rejected": -2.8962510426839194, "step": 17639 }, { "epoch": 0.9349905917896801, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -30298760.0, "logits/rejected": -11815509.6, "logps/chosen": -340.0301106770833, "logps/rejected": -216.03662109375, "loss": 0.1741, "rewards/chosen": 0.6786560217539469, "rewards/margins": 3.6363571325937905, "rewards/rejected": -2.9577011108398437, "step": 17640 }, { "epoch": 0.9350435957914822, "grad_norm": 62.75, "kl": 0.8123111724853516, "learning_rate": 5e-07, "logits/chosen": -43703068.0, "logits/rejected": -12851717.333333334, "logps/chosen": -919.4404296875, "logps/rejected": -369.2799886067708, "loss": 0.1268, "rewards/chosen": 1.1468536853790283, "rewards/margins": 4.096127112706503, "rewards/rejected": -2.949273427327474, "step": 17641 }, { "epoch": 0.9350965997932844, "grad_norm": 49.5, "kl": 1.2895889282226562, "learning_rate": 5e-07, "logits/chosen": -18417574.0, "logits/rejected": -40794540.0, "logps/chosen": -351.6806945800781, "logps/rejected": -253.0702362060547, "loss": 0.2814, "rewards/chosen": 0.2583334147930145, "rewards/margins": 3.482012242078781, "rewards/rejected": -3.2236788272857666, "step": 17642 }, { "epoch": 0.9351496037950865, "grad_norm": 44.0, "kl": 0.5727291107177734, "learning_rate": 5e-07, "logits/chosen": -42511088.0, "logits/rejected": -35544189.333333336, "logps/chosen": -324.3697204589844, "logps/rejected": -196.4114990234375, "loss": 0.2277, "rewards/chosen": 0.0917915403842926, "rewards/margins": 2.9620562295118966, "rewards/rejected": -2.870264689127604, "step": 17643 }, { "epoch": 0.9352026077968887, "grad_norm": 41.25, "kl": 0.7537899017333984, "learning_rate": 5e-07, "logits/chosen": -13596715.0, "logits/rejected": -1299511.25, "logps/chosen": -318.8610534667969, "logps/rejected": -164.62600708007812, "loss": 0.3602, "rewards/chosen": 0.012412257492542267, "rewards/margins": 1.589076228439808, "rewards/rejected": -1.5766639709472656, "step": 17644 }, { "epoch": 0.9352556117986908, "grad_norm": 44.5, "kl": 1.0331993103027344, "learning_rate": 5e-07, "logits/chosen": -25368259.2, "logits/rejected": 111475136.0, "logps/chosen": -250.5164794921875, "logps/rejected": -621.766845703125, "loss": 0.2801, "rewards/chosen": 0.41235198974609377, "rewards/margins": 4.110440635681153, "rewards/rejected": -3.6980886459350586, "step": 17645 }, { "epoch": 0.935308615800493, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63740192.0, "logits/rejected": -31190568.0, "logps/chosen": -804.1497802734375, "logps/rejected": -240.7237548828125, "loss": 0.1816, "rewards/chosen": 1.82781982421875, "rewards/margins": 3.649716218312581, "rewards/rejected": -1.8218963940938313, "step": 17646 }, { "epoch": 0.935361619802295, "grad_norm": 35.75, "kl": 6.5655059814453125, "learning_rate": 5e-07, "logits/chosen": 7508364.8, "logits/rejected": -26693632.0, "logps/chosen": -210.5515380859375, "logps/rejected": -511.4080810546875, "loss": 0.2189, "rewards/chosen": 1.5736636161804198, "rewards/margins": 6.400291029612223, "rewards/rejected": -4.826627413431804, "step": 17647 }, { "epoch": 0.9354146238040972, "grad_norm": 44.75, "kl": 0.6071157455444336, "learning_rate": 5e-07, "logits/chosen": -29978920.0, "logits/rejected": -26701896.0, "logps/chosen": -212.78518676757812, "logps/rejected": -292.0797119140625, "loss": 0.3223, "rewards/chosen": -0.07289247214794159, "rewards/margins": 1.9912108927965164, "rewards/rejected": -2.064103364944458, "step": 17648 }, { "epoch": 0.9354676278058993, "grad_norm": 48.75, "kl": 0.011568069458007812, "learning_rate": 5e-07, "logits/chosen": -59876776.0, "logits/rejected": -40203976.0, "logps/chosen": -237.4567413330078, "logps/rejected": -346.85223388671875, "loss": 0.3487, "rewards/chosen": -0.5044769048690796, "rewards/margins": 2.5535508394241333, "rewards/rejected": -3.058027744293213, "step": 17649 }, { "epoch": 0.9355206318077015, "grad_norm": 56.75, "kl": 3.8648204803466797, "learning_rate": 5e-07, "logits/chosen": 10684300.0, "logits/rejected": -39550876.0, "logps/chosen": -309.09698486328125, "logps/rejected": -354.79302978515625, "loss": 0.2534, "rewards/chosen": 0.9554298520088196, "rewards/margins": 3.280691921710968, "rewards/rejected": -2.3252620697021484, "step": 17650 }, { "epoch": 0.9355736358095036, "grad_norm": 45.75, "kl": 4.760221481323242, "learning_rate": 5e-07, "logits/chosen": 3074348.2, "logits/rejected": -30597634.666666668, "logps/chosen": -284.3414794921875, "logps/rejected": -227.1584269205729, "loss": 0.3185, "rewards/chosen": 0.760218620300293, "rewards/margins": 3.124911626180013, "rewards/rejected": -2.36469300587972, "step": 17651 }, { "epoch": 0.9356266398113058, "grad_norm": 32.5, "kl": 2.876758575439453, "learning_rate": 5e-07, "logits/chosen": -15018297.0, "logits/rejected": -27171630.0, "logps/chosen": -253.63125610351562, "logps/rejected": -296.3828125, "loss": 0.3012, "rewards/chosen": 0.35696521401405334, "rewards/margins": 3.84265199303627, "rewards/rejected": -3.485686779022217, "step": 17652 }, { "epoch": 0.9356796438131079, "grad_norm": 57.0, "kl": 1.538015365600586, "learning_rate": 5e-07, "logits/chosen": -44879782.4, "logits/rejected": -25447485.333333332, "logps/chosen": -281.022802734375, "logps/rejected": -297.8238118489583, "loss": 0.3402, "rewards/chosen": 0.3153942584991455, "rewards/margins": 2.159081792831421, "rewards/rejected": -1.8436875343322754, "step": 17653 }, { "epoch": 0.9357326478149101, "grad_norm": 38.25, "kl": 1.797800064086914, "learning_rate": 5e-07, "logits/chosen": 704709.25, "logits/rejected": -16101768.0, "logps/chosen": -158.59292602539062, "logps/rejected": -269.2471516927083, "loss": 0.2116, "rewards/chosen": 0.3685687184333801, "rewards/margins": 2.6991991798082986, "rewards/rejected": -2.3306304613749185, "step": 17654 }, { "epoch": 0.9357856518167121, "grad_norm": 53.0, "kl": 1.0581989288330078, "learning_rate": 5e-07, "logits/chosen": -29253800.0, "logits/rejected": -10314061.0, "logps/chosen": -437.63653564453125, "logps/rejected": -141.3346405029297, "loss": 0.2867, "rewards/chosen": 0.594519853591919, "rewards/margins": 3.269977331161499, "rewards/rejected": -2.67545747756958, "step": 17655 }, { "epoch": 0.9358386558185143, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94545128.0, "logits/rejected": -30171493.333333332, "logps/chosen": -414.82025146484375, "logps/rejected": -352.1343587239583, "loss": 0.1085, "rewards/chosen": 1.007344126701355, "rewards/margins": 4.086506406466166, "rewards/rejected": -3.079162279764811, "step": 17656 }, { "epoch": 0.9358916598203164, "grad_norm": 77.5, "kl": 0.30274200439453125, "learning_rate": 5e-07, "logits/chosen": -29014965.333333332, "logits/rejected": -17189265.6, "logps/chosen": -530.433349609375, "logps/rejected": -342.7040771484375, "loss": 0.1251, "rewards/chosen": 1.500396728515625, "rewards/margins": 4.774293899536133, "rewards/rejected": -3.273897171020508, "step": 17657 }, { "epoch": 0.9359446638221186, "grad_norm": 40.0, "kl": 0.16278839111328125, "learning_rate": 5e-07, "logits/chosen": -10670794.666666666, "logits/rejected": -63156736.0, "logps/chosen": -121.13966878255208, "logps/rejected": -496.386865234375, "loss": 0.2381, "rewards/chosen": 0.34758226076761883, "rewards/margins": 2.8967250982920327, "rewards/rejected": -2.549142837524414, "step": 17658 }, { "epoch": 0.9359976678239207, "grad_norm": 51.5, "kl": 3.9117283821105957, "learning_rate": 5e-07, "logits/chosen": -43912003.2, "logits/rejected": -65834437.333333336, "logps/chosen": -362.9913818359375, "logps/rejected": -509.270751953125, "loss": 0.3253, "rewards/chosen": 0.5116419792175293, "rewards/margins": 2.3153204917907715, "rewards/rejected": -1.8036785125732422, "step": 17659 }, { "epoch": 0.9360506718257229, "grad_norm": 41.5, "kl": 1.843606948852539, "learning_rate": 5e-07, "logits/chosen": -82826400.0, "logits/rejected": -16213498.0, "logps/chosen": -680.9171142578125, "logps/rejected": -267.13262939453125, "loss": 0.2426, "rewards/chosen": 1.6452804803848267, "rewards/margins": 3.12518572807312, "rewards/rejected": -1.4799052476882935, "step": 17660 }, { "epoch": 0.936103675827525, "grad_norm": 62.75, "kl": 0.04149436950683594, "learning_rate": 5e-07, "logits/chosen": 37253792.0, "logits/rejected": -4635784.0, "logps/chosen": -408.82550048828125, "logps/rejected": -234.969970703125, "loss": 0.2183, "rewards/chosen": 0.29794931411743164, "rewards/margins": 2.225826422373454, "rewards/rejected": -1.9278771082560222, "step": 17661 }, { "epoch": 0.9361566798293272, "grad_norm": 86.0, "kl": 3.060479164123535, "learning_rate": 5e-07, "logits/chosen": -14427987.42857143, "logits/rejected": -37036888.0, "logps/chosen": -314.2000209263393, "logps/rejected": -745.765625, "loss": 0.4226, "rewards/chosen": 0.4499242305755615, "rewards/margins": 5.6764256954193115, "rewards/rejected": -5.22650146484375, "step": 17662 }, { "epoch": 0.9362096838311292, "grad_norm": 26.75, "kl": 1.9849014282226562, "learning_rate": 5e-07, "logits/chosen": -19301267.2, "logits/rejected": -26440717.333333332, "logps/chosen": -88.71013793945312, "logps/rejected": -414.9676513671875, "loss": 0.2862, "rewards/chosen": 0.5848919868469238, "rewards/margins": 2.982668272654215, "rewards/rejected": -2.3977762858072915, "step": 17663 }, { "epoch": 0.9362626878329314, "grad_norm": 45.5, "kl": 0.11542892456054688, "learning_rate": 5e-07, "logits/chosen": -20682842.666666668, "logits/rejected": -30934886.4, "logps/chosen": -265.0306803385417, "logps/rejected": -190.83236083984374, "loss": 0.2456, "rewards/chosen": -0.24305649598439535, "rewards/margins": 3.6452262798945108, "rewards/rejected": -3.8882827758789062, "step": 17664 }, { "epoch": 0.9363156918347335, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25640488.0, "logits/rejected": 13803353.6, "logps/chosen": -95.01476033528645, "logps/rejected": -289.016748046875, "loss": 0.3303, "rewards/chosen": -0.128510852654775, "rewards/margins": 1.5833066980044048, "rewards/rejected": -1.7118175506591797, "step": 17665 }, { "epoch": 0.9363686958365357, "grad_norm": 40.75, "kl": 1.487335205078125, "learning_rate": 5e-07, "logits/chosen": -2408181.6666666665, "logits/rejected": -13542798.4, "logps/chosen": -223.53759765625, "logps/rejected": -234.1807861328125, "loss": 0.2233, "rewards/chosen": 0.9771560033162435, "rewards/margins": 3.189286549886068, "rewards/rejected": -2.212130546569824, "step": 17666 }, { "epoch": 0.9364216998383378, "grad_norm": 41.0, "kl": 0.5733375549316406, "learning_rate": 5e-07, "logits/chosen": -91867669.33333333, "logits/rejected": -17742371.2, "logps/chosen": -253.98736572265625, "logps/rejected": -205.6915771484375, "loss": 0.285, "rewards/chosen": 1.0365885893503826, "rewards/margins": 2.196403710047404, "rewards/rejected": -1.1598151206970215, "step": 17667 }, { "epoch": 0.93647470384014, "grad_norm": 59.75, "kl": 0.7281513214111328, "learning_rate": 5e-07, "logits/chosen": -56130394.666666664, "logits/rejected": -41735444.0, "logps/chosen": -456.8955078125, "logps/rejected": -725.5814208984375, "loss": 0.3161, "rewards/chosen": 0.5286572774251302, "rewards/margins": 5.581910451253255, "rewards/rejected": -5.053253173828125, "step": 17668 }, { "epoch": 0.9365277078419421, "grad_norm": 53.25, "kl": 0.012441158294677734, "learning_rate": 5e-07, "logits/chosen": -66922296.0, "logits/rejected": -25385766.85714286, "logps/chosen": -632.3798828125, "logps/rejected": -394.20361328125, "loss": 0.1824, "rewards/chosen": 0.7110534906387329, "rewards/margins": 2.917458176612854, "rewards/rejected": -2.206404685974121, "step": 17669 }, { "epoch": 0.9365807118437443, "grad_norm": 33.75, "kl": 2.169525146484375, "learning_rate": 5e-07, "logits/chosen": -11534751.0, "logits/rejected": 10001305.0, "logps/chosen": -165.60757446289062, "logps/rejected": -245.61215209960938, "loss": 0.3544, "rewards/chosen": -0.02163653075695038, "rewards/margins": 2.2760245352983475, "rewards/rejected": -2.297661066055298, "step": 17670 }, { "epoch": 0.9366337158455463, "grad_norm": 55.5, "kl": 4.739109039306641, "learning_rate": 5e-07, "logits/chosen": -14881640.0, "logits/rejected": -51071306.666666664, "logps/chosen": -557.417724609375, "logps/rejected": -379.6599527994792, "loss": 0.3053, "rewards/chosen": 1.014516544342041, "rewards/margins": 3.9914745012919104, "rewards/rejected": -2.9769579569498696, "step": 17671 }, { "epoch": 0.9366867198473485, "grad_norm": 36.5, "kl": 0.60302734375, "learning_rate": 5e-07, "logits/chosen": -35093837.333333336, "logits/rejected": -36370723.2, "logps/chosen": -225.4497273763021, "logps/rejected": -458.418701171875, "loss": 0.1262, "rewards/chosen": 1.187914212544759, "rewards/margins": 5.469053967793783, "rewards/rejected": -4.281139755249024, "step": 17672 }, { "epoch": 0.9367397238491506, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10452990.0, "logits/rejected": -53854217.14285714, "logps/chosen": -400.6737060546875, "logps/rejected": -364.3954380580357, "loss": 0.2233, "rewards/chosen": -0.0916290283203125, "rewards/margins": 1.9214067459106445, "rewards/rejected": -2.013035774230957, "step": 17673 }, { "epoch": 0.9367927278509528, "grad_norm": 45.5, "kl": 4.126285552978516, "learning_rate": 5e-07, "logits/chosen": -27185560.0, "logits/rejected": -24794548.0, "logps/chosen": -374.215087890625, "logps/rejected": -352.23431396484375, "loss": 0.3313, "rewards/chosen": 0.7977491617202759, "rewards/margins": 4.102954983711243, "rewards/rejected": -3.305205821990967, "step": 17674 }, { "epoch": 0.9368457318527549, "grad_norm": 52.0, "kl": 3.3475494384765625, "learning_rate": 5e-07, "logits/chosen": -67225113.6, "logits/rejected": -51978666.666666664, "logps/chosen": -533.529931640625, "logps/rejected": -307.7252604166667, "loss": 0.2605, "rewards/chosen": 1.0677643775939942, "rewards/margins": 3.3269806861877442, "rewards/rejected": -2.25921630859375, "step": 17675 }, { "epoch": 0.9368987358545571, "grad_norm": 46.0, "kl": 0.5320930480957031, "learning_rate": 5e-07, "logits/chosen": -49462969.6, "logits/rejected": -290098.2916666667, "logps/chosen": -215.999072265625, "logps/rejected": -189.73956298828125, "loss": 0.3162, "rewards/chosen": 0.548098087310791, "rewards/margins": 2.6308388710021973, "rewards/rejected": -2.0827407836914062, "step": 17676 }, { "epoch": 0.9369517398563592, "grad_norm": 36.5, "kl": 3.409688949584961, "learning_rate": 5e-07, "logits/chosen": -43330496.0, "logits/rejected": 36133.390625, "logps/chosen": -250.37447102864584, "logps/rejected": -435.8189453125, "loss": 0.2403, "rewards/chosen": 0.8249959945678711, "rewards/margins": 5.590035438537598, "rewards/rejected": -4.765039443969727, "step": 17677 }, { "epoch": 0.9370047438581612, "grad_norm": 44.0, "kl": 4.717241287231445, "learning_rate": 5e-07, "logits/chosen": -19382298.0, "logits/rejected": -21084470.0, "logps/chosen": -276.8390197753906, "logps/rejected": -325.6140441894531, "loss": 0.2375, "rewards/chosen": 1.495208978652954, "rewards/margins": 4.01001238822937, "rewards/rejected": -2.514803409576416, "step": 17678 }, { "epoch": 0.9370577478599634, "grad_norm": 61.25, "kl": 2.492889404296875, "learning_rate": 5e-07, "logits/chosen": -28048736.0, "logits/rejected": -10742949.0, "logps/chosen": -311.0867919921875, "logps/rejected": -198.39044189453125, "loss": 0.3695, "rewards/chosen": 0.5536906719207764, "rewards/margins": 5.2710254192352295, "rewards/rejected": -4.717334747314453, "step": 17679 }, { "epoch": 0.9371107518617655, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34213312.0, "logits/rejected": -18119068.0, "logps/chosen": -401.3165283203125, "logps/rejected": -571.2327880859375, "loss": 0.2331, "rewards/chosen": 0.9960111379623413, "rewards/margins": 3.458912491798401, "rewards/rejected": -2.4629013538360596, "step": 17680 }, { "epoch": 0.9371637558635677, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31262821.333333332, "logits/rejected": -8498003.2, "logps/chosen": -508.2843424479167, "logps/rejected": -153.646142578125, "loss": 0.358, "rewards/chosen": -0.04805704951286316, "rewards/margins": 1.1066863358020782, "rewards/rejected": -1.1547433853149414, "step": 17681 }, { "epoch": 0.9372167598653698, "grad_norm": 33.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15322821.333333334, "logits/rejected": -38335772.8, "logps/chosen": -195.41532389322916, "logps/rejected": -306.21396484375, "loss": 0.208, "rewards/chosen": 0.158450315395991, "rewards/margins": 3.0349365214506783, "rewards/rejected": -2.8764862060546874, "step": 17682 }, { "epoch": 0.937269763867172, "grad_norm": 50.0, "kl": 0.02074432373046875, "learning_rate": 5e-07, "logits/chosen": -26640115.2, "logits/rejected": -8214752.0, "logps/chosen": -237.014453125, "logps/rejected": -400.7545572916667, "loss": 0.3217, "rewards/chosen": 0.26269409656524656, "rewards/margins": 2.7924223502477012, "rewards/rejected": -2.5297282536824546, "step": 17683 }, { "epoch": 0.9373227678689741, "grad_norm": 73.0, "kl": 0.2696990966796875, "learning_rate": 5e-07, "logits/chosen": -32171720.0, "logits/rejected": -5442994.0, "logps/chosen": -419.8167724609375, "logps/rejected": -274.35321044921875, "loss": 0.3594, "rewards/chosen": 0.29570289452870685, "rewards/margins": 2.8590938647588096, "rewards/rejected": -2.5633909702301025, "step": 17684 }, { "epoch": 0.9373757718707763, "grad_norm": 45.25, "kl": 2.764616012573242, "learning_rate": 5e-07, "logits/chosen": 10031403.333333334, "logits/rejected": 220882048.0, "logps/chosen": -81.60376485188802, "logps/rejected": -394.87138671875, "loss": 0.3239, "rewards/chosen": 0.19624767700831094, "rewards/margins": 2.5575626413027446, "rewards/rejected": -2.3613149642944338, "step": 17685 }, { "epoch": 0.9374287758725783, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -157350016.0, "logits/rejected": -28537702.85714286, "logps/chosen": -281.11175537109375, "logps/rejected": -242.39456612723214, "loss": 0.2416, "rewards/chosen": -0.320791631937027, "rewards/margins": 1.4231801927089691, "rewards/rejected": -1.743971824645996, "step": 17686 }, { "epoch": 0.9374817798743805, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31750445.333333332, "logits/rejected": -26498163.2, "logps/chosen": -339.6708170572917, "logps/rejected": -370.4138671875, "loss": 0.2145, "rewards/chosen": 0.8384241263071696, "rewards/margins": 3.1293604056040443, "rewards/rejected": -2.290936279296875, "step": 17687 }, { "epoch": 0.9375347838761826, "grad_norm": 56.0, "kl": 2.735374927520752, "learning_rate": 5e-07, "logits/chosen": 30194416.0, "logits/rejected": -43716716.0, "logps/chosen": -282.59075927734375, "logps/rejected": -472.0410461425781, "loss": 0.3014, "rewards/chosen": 0.4929482340812683, "rewards/margins": 2.8871342539787292, "rewards/rejected": -2.394186019897461, "step": 17688 }, { "epoch": 0.9375877878779848, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -111586016.0, "logits/rejected": -22583688.0, "logps/chosen": -656.4931030273438, "logps/rejected": -404.5192057291667, "loss": 0.162, "rewards/chosen": 0.690808117389679, "rewards/margins": 3.163987934589386, "rewards/rejected": -2.473179817199707, "step": 17689 }, { "epoch": 0.9376407918797869, "grad_norm": 33.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5784168.0, "logits/rejected": -27824029.333333332, "logps/chosen": -200.83006286621094, "logps/rejected": -184.7829793294271, "loss": 0.1656, "rewards/chosen": 0.5732215642929077, "rewards/margins": 3.161631464958191, "rewards/rejected": -2.588409900665283, "step": 17690 }, { "epoch": 0.9376937958815891, "grad_norm": 65.0, "kl": 4.3080291748046875, "learning_rate": 5e-07, "logits/chosen": -36900763.428571425, "logits/rejected": 435957.78125, "logps/chosen": -402.84835379464283, "logps/rejected": -50.065216064453125, "loss": 0.4899, "rewards/chosen": 0.2836273568017142, "rewards/margins": 2.810332145009722, "rewards/rejected": -2.526704788208008, "step": 17691 }, { "epoch": 0.9377467998833912, "grad_norm": 54.5, "kl": 0.10296249389648438, "learning_rate": 5e-07, "logits/chosen": -43958160.0, "logits/rejected": -23176692.0, "logps/chosen": -437.2039489746094, "logps/rejected": -167.40628051757812, "loss": 0.1856, "rewards/chosen": 1.2946349382400513, "rewards/margins": 3.9485503435134888, "rewards/rejected": -2.6539154052734375, "step": 17692 }, { "epoch": 0.9377998038851934, "grad_norm": 43.25, "kl": 0.6531791687011719, "learning_rate": 5e-07, "logits/chosen": -72378764.8, "logits/rejected": -55961056.0, "logps/chosen": -225.4281005859375, "logps/rejected": -922.8561197916666, "loss": 0.34, "rewards/chosen": 0.1445775270462036, "rewards/margins": 5.101581438382467, "rewards/rejected": -4.957003911336263, "step": 17693 }, { "epoch": 0.9378528078869954, "grad_norm": 37.0, "kl": 0.5707206726074219, "learning_rate": 5e-07, "logits/chosen": -24561840.0, "logits/rejected": -13413786.666666666, "logps/chosen": -359.8418273925781, "logps/rejected": -227.18965657552084, "loss": 0.2274, "rewards/chosen": 0.077947236597538, "rewards/margins": 3.0614715591073036, "rewards/rejected": -2.9835243225097656, "step": 17694 }, { "epoch": 0.9379058118887976, "grad_norm": 47.25, "kl": 6.253196716308594, "learning_rate": 5e-07, "logits/chosen": 26632676.57142857, "logits/rejected": -17450976.0, "logps/chosen": -225.12928989955358, "logps/rejected": -484.7857666015625, "loss": 0.5078, "rewards/chosen": 0.3746922016143799, "rewards/margins": 3.311136245727539, "rewards/rejected": -2.936444044113159, "step": 17695 }, { "epoch": 0.9379588158905997, "grad_norm": 63.5, "kl": 3.1605777740478516, "learning_rate": 5e-07, "logits/chosen": -38879917.71428572, "logits/rejected": 27888974.0, "logps/chosen": -482.33809988839283, "logps/rejected": -158.8695068359375, "loss": 0.3827, "rewards/chosen": 0.7520315987723214, "rewards/margins": 1.3080374939101083, "rewards/rejected": -0.5560058951377869, "step": 17696 }, { "epoch": 0.9380118198924019, "grad_norm": 50.0, "kl": 6.3693647384643555, "learning_rate": 5e-07, "logits/chosen": -48788800.0, "logits/rejected": 416819.6666666667, "logps/chosen": -1065.51455078125, "logps/rejected": -129.86588541666666, "loss": 0.3121, "rewards/chosen": 2.166522216796875, "rewards/margins": 3.656106821695964, "rewards/rejected": -1.4895846048990886, "step": 17697 }, { "epoch": 0.938064823894204, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 113884748.8, "logits/rejected": -13059485.333333334, "logps/chosen": -421.225830078125, "logps/rejected": -117.94016520182292, "loss": 0.2993, "rewards/chosen": 0.7114913940429688, "rewards/margins": 2.7375054041544598, "rewards/rejected": -2.0260140101114907, "step": 17698 }, { "epoch": 0.9381178278960062, "grad_norm": 46.75, "kl": 1.4339179992675781, "learning_rate": 5e-07, "logits/chosen": -15594705.6, "logits/rejected": -39310760.0, "logps/chosen": -293.1900634765625, "logps/rejected": -404.6368408203125, "loss": 0.2776, "rewards/chosen": 0.6550240516662598, "rewards/margins": 3.0139854749043784, "rewards/rejected": -2.3589614232381186, "step": 17699 }, { "epoch": 0.9381708318978083, "grad_norm": 50.25, "kl": 4.235527038574219, "learning_rate": 5e-07, "logits/chosen": -36968915.2, "logits/rejected": -17540469.333333332, "logps/chosen": -303.178515625, "logps/rejected": -303.2652994791667, "loss": 0.3699, "rewards/chosen": 0.8665715217590332, "rewards/margins": 2.6139069557189942, "rewards/rejected": -1.747335433959961, "step": 17700 }, { "epoch": 0.9382238358996104, "grad_norm": 51.75, "kl": 1.0808591842651367, "learning_rate": 5e-07, "logits/chosen": -21081657.14285714, "logits/rejected": -428912.1875, "logps/chosen": -231.80543736049108, "logps/rejected": -155.70632934570312, "loss": 0.3649, "rewards/chosen": 0.6216423852103097, "rewards/margins": 1.945539150919233, "rewards/rejected": -1.3238967657089233, "step": 17701 }, { "epoch": 0.9382768399014125, "grad_norm": 38.0, "kl": 0.13463783264160156, "learning_rate": 5e-07, "logits/chosen": -55876469.333333336, "logits/rejected": -4597514.5, "logps/chosen": -161.31953938802084, "logps/rejected": -89.41738891601562, "loss": 0.3671, "rewards/chosen": 0.16506993770599365, "rewards/margins": 3.9653111696243286, "rewards/rejected": -3.800241231918335, "step": 17702 }, { "epoch": 0.9383298439032147, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5155847.5, "logits/rejected": 213408.33333333334, "logps/chosen": -295.4237060546875, "logps/rejected": -536.5481363932291, "loss": 0.1595, "rewards/chosen": -0.1376177817583084, "rewards/margins": 4.345360403259595, "rewards/rejected": -4.482978185017903, "step": 17703 }, { "epoch": 0.9383828479050168, "grad_norm": 44.75, "kl": 1.1072158813476562, "learning_rate": 5e-07, "logits/chosen": -17811188.57142857, "logits/rejected": -6520232.0, "logps/chosen": -175.21165248325892, "logps/rejected": -68.28099060058594, "loss": 0.3861, "rewards/chosen": 0.6300357409885952, "rewards/margins": 2.1397403308323453, "rewards/rejected": -1.50970458984375, "step": 17704 }, { "epoch": 0.938435851906819, "grad_norm": 34.25, "kl": 0.0718994140625, "learning_rate": 5e-07, "logits/chosen": -23949796.8, "logits/rejected": -45101232.0, "logps/chosen": -749.9744140625, "logps/rejected": -500.3125813802083, "loss": 0.1348, "rewards/chosen": 1.9491294860839843, "rewards/margins": 6.016289138793946, "rewards/rejected": -4.067159652709961, "step": 17705 }, { "epoch": 0.9384888559086211, "grad_norm": 72.0, "kl": 1.6397018432617188, "learning_rate": 5e-07, "logits/chosen": -64950784.0, "logits/rejected": 26239.0, "logps/chosen": -482.4742431640625, "logps/rejected": -405.03857421875, "loss": 0.3838, "rewards/chosen": 0.3746658166249593, "rewards/margins": 2.161425789197286, "rewards/rejected": -1.7867599725723267, "step": 17706 }, { "epoch": 0.9385418599104233, "grad_norm": 38.0, "kl": 0.1827678680419922, "learning_rate": 5e-07, "logits/chosen": -14349984.0, "logits/rejected": -52108704.0, "logps/chosen": -206.0128173828125, "logps/rejected": -335.23284912109375, "loss": 0.2642, "rewards/chosen": 0.12277813255786896, "rewards/margins": 3.336965039372444, "rewards/rejected": -3.214186906814575, "step": 17707 }, { "epoch": 0.9385948639122254, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25311572.0, "logits/rejected": -29607160.0, "logps/chosen": -663.9716796875, "logps/rejected": -351.0784912109375, "loss": 0.1554, "rewards/chosen": 0.4814239740371704, "rewards/margins": 3.4206844568252563, "rewards/rejected": -2.939260482788086, "step": 17708 }, { "epoch": 0.9386478679140275, "grad_norm": 48.25, "kl": 1.8656387329101562, "learning_rate": 5e-07, "logits/chosen": -22254796.0, "logits/rejected": -51083996.0, "logps/chosen": -246.846923828125, "logps/rejected": -246.70924377441406, "loss": 0.2561, "rewards/chosen": 0.8800406455993652, "rewards/margins": 2.8596757650375366, "rewards/rejected": -1.9796351194381714, "step": 17709 }, { "epoch": 0.9387008719158296, "grad_norm": 43.75, "kl": 3.5512046813964844, "learning_rate": 5e-07, "logits/chosen": -38162340.0, "logits/rejected": 5340888.0, "logps/chosen": -148.494873046875, "logps/rejected": -221.4247589111328, "loss": 0.3332, "rewards/chosen": 0.6527342200279236, "rewards/margins": 2.5716649889945984, "rewards/rejected": -1.9189307689666748, "step": 17710 }, { "epoch": 0.9387538759176318, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 12780493.0, "logits/rejected": -37912123.428571425, "logps/chosen": -60.78310775756836, "logps/rejected": -310.44266183035717, "loss": 0.1883, "rewards/chosen": -1.7219135761260986, "rewards/margins": 1.0319147450583324, "rewards/rejected": -2.753828321184431, "step": 17711 }, { "epoch": 0.9388068799194339, "grad_norm": 79.5, "kl": 1.2309064865112305, "learning_rate": 5e-07, "logits/chosen": -36585635.2, "logits/rejected": -1318778.4166666667, "logps/chosen": -232.3107421875, "logps/rejected": -99.095947265625, "loss": 0.3243, "rewards/chosen": 0.7555336952209473, "rewards/margins": 1.901206334431966, "rewards/rejected": -1.1456726392110188, "step": 17712 }, { "epoch": 0.9388598839212361, "grad_norm": 42.25, "kl": 2.4927291870117188, "learning_rate": 5e-07, "logits/chosen": -51949208.0, "logits/rejected": -29972276.0, "logps/chosen": -155.78114318847656, "logps/rejected": -228.15000915527344, "loss": 0.2982, "rewards/chosen": 0.4718913435935974, "rewards/margins": 2.191359221935272, "rewards/rejected": -1.7194678783416748, "step": 17713 }, { "epoch": 0.9389128879230382, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50959216.0, "logits/rejected": -43478628.0, "logps/chosen": -441.09234619140625, "logps/rejected": -415.36480712890625, "loss": 0.2531, "rewards/chosen": 0.3324684202671051, "rewards/margins": 3.7632435858249664, "rewards/rejected": -3.4307751655578613, "step": 17714 }, { "epoch": 0.9389658919248404, "grad_norm": 43.0, "kl": 4.859431266784668, "learning_rate": 5e-07, "logits/chosen": -33974580.0, "logits/rejected": -7732430.0, "logps/chosen": -457.0673522949219, "logps/rejected": -230.88912963867188, "loss": 0.2294, "rewards/chosen": 2.308990478515625, "rewards/margins": 4.84019923210144, "rewards/rejected": -2.5312087535858154, "step": 17715 }, { "epoch": 0.9390188959266424, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8506796.0, "logits/rejected": -16710422.4, "logps/chosen": -210.30301920572916, "logps/rejected": -176.9660400390625, "loss": 0.2218, "rewards/chosen": 1.2434667746225994, "rewards/margins": 3.261292854944865, "rewards/rejected": -2.0178260803222656, "step": 17716 }, { "epoch": 0.9390718999284446, "grad_norm": 44.25, "kl": 0.5145721435546875, "learning_rate": 5e-07, "logits/chosen": -10981576.0, "logits/rejected": -3035232.25, "logps/chosen": -73.01200866699219, "logps/rejected": -360.76593017578125, "loss": 0.3536, "rewards/chosen": -0.14409860968589783, "rewards/margins": 2.1025728285312653, "rewards/rejected": -2.246671438217163, "step": 17717 }, { "epoch": 0.9391249039302467, "grad_norm": 60.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18238200.0, "logits/rejected": -89995088.0, "logps/chosen": -313.09739176432294, "logps/rejected": -696.2970581054688, "loss": 0.2899, "rewards/chosen": 0.6996156374613444, "rewards/margins": 2.6158663431803384, "rewards/rejected": -1.9162507057189941, "step": 17718 }, { "epoch": 0.9391779079320489, "grad_norm": 49.5, "kl": 0.8516054153442383, "learning_rate": 5e-07, "logits/chosen": -28023037.333333332, "logits/rejected": -1971538.125, "logps/chosen": -206.35953776041666, "logps/rejected": -72.9615478515625, "loss": 0.2921, "rewards/chosen": 0.6148689985275269, "rewards/margins": 4.247300028800964, "rewards/rejected": -3.6324310302734375, "step": 17719 }, { "epoch": 0.939230911933851, "grad_norm": 35.5, "kl": 3.777583122253418, "learning_rate": 5e-07, "logits/chosen": 13939718.0, "logits/rejected": -60280309.333333336, "logps/chosen": -433.6407775878906, "logps/rejected": -444.5352783203125, "loss": 0.1951, "rewards/chosen": 1.6987131834030151, "rewards/margins": 4.337087194124857, "rewards/rejected": -2.6383740107218423, "step": 17720 }, { "epoch": 0.9392839159356532, "grad_norm": 57.25, "kl": 3.4866394996643066, "learning_rate": 5e-07, "logits/chosen": -21831781.333333332, "logits/rejected": 3246378.0, "logps/chosen": -368.9581705729167, "logps/rejected": -116.97248840332031, "loss": 0.3081, "rewards/chosen": 1.4202219645182292, "rewards/margins": 2.209776024023692, "rewards/rejected": -0.7895540595054626, "step": 17721 }, { "epoch": 0.9393369199374553, "grad_norm": 43.75, "kl": 10.30394172668457, "learning_rate": 5e-07, "logits/chosen": -6488574.5, "logps/chosen": -465.9007568359375, "loss": 0.4246, "rewards/chosen": 1.4634431600570679, "step": 17722 }, { "epoch": 0.9393899239392575, "grad_norm": 37.75, "kl": 1.4313087463378906, "learning_rate": 5e-07, "logits/chosen": -26425008.0, "logits/rejected": -4209732.5, "logps/chosen": -175.10557556152344, "logps/rejected": -123.85362243652344, "loss": 0.3331, "rewards/chosen": 0.10044687986373901, "rewards/margins": 3.031587779521942, "rewards/rejected": -2.931140899658203, "step": 17723 }, { "epoch": 0.9394429279410595, "grad_norm": 68.5, "kl": 0.17284202575683594, "learning_rate": 5e-07, "logits/chosen": -45051664.0, "logits/rejected": -16482049.0, "logps/chosen": -483.0588684082031, "logps/rejected": -570.30419921875, "loss": 0.3111, "rewards/chosen": 0.9706384539604187, "rewards/margins": 4.20889014005661, "rewards/rejected": -3.2382516860961914, "step": 17724 }, { "epoch": 0.9394959319428617, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19583510.666666668, "logits/rejected": -6564523.2, "logps/chosen": -321.5543212890625, "logps/rejected": -200.9733154296875, "loss": 0.3367, "rewards/chosen": -0.17400974035263062, "rewards/margins": 1.2745285630226135, "rewards/rejected": -1.4485383033752441, "step": 17725 }, { "epoch": 0.9395489359446638, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53860208.0, "logits/rejected": -11166732.57142857, "logps/chosen": -347.2476806640625, "logps/rejected": -250.61408342633928, "loss": 0.1415, "rewards/chosen": -0.21647034585475922, "rewards/margins": 2.3952509654419765, "rewards/rejected": -2.6117213112967357, "step": 17726 }, { "epoch": 0.9396019399464659, "grad_norm": 35.75, "kl": 3.9318389892578125, "learning_rate": 5e-07, "logits/chosen": -36032036.0, "logits/rejected": -57364336.0, "logps/chosen": -529.2042846679688, "logps/rejected": -520.3547973632812, "loss": 0.2238, "rewards/chosen": 1.0795326232910156, "rewards/margins": 3.8331377506256104, "rewards/rejected": -2.7536051273345947, "step": 17727 }, { "epoch": 0.9396549439482681, "grad_norm": 42.0, "kl": 0.9325752258300781, "learning_rate": 5e-07, "logits/chosen": 4021848.0, "logits/rejected": -16245496.0, "logps/chosen": -183.48994954427084, "logps/rejected": -245.210546875, "loss": 0.1867, "rewards/chosen": 1.6840356190999348, "rewards/margins": 3.4001675923665364, "rewards/rejected": -1.7161319732666016, "step": 17728 }, { "epoch": 0.9397079479500702, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39175856.0, "logits/rejected": -5986822.0, "logps/chosen": -367.6090087890625, "logps/rejected": -147.9682820638021, "loss": 0.184, "rewards/chosen": 1.1409683227539062, "rewards/margins": 3.516528924306234, "rewards/rejected": -2.3755606015523276, "step": 17729 }, { "epoch": 0.9397609519518724, "grad_norm": 47.0, "kl": 3.1215782165527344, "learning_rate": 5e-07, "logits/chosen": -47558484.0, "logits/rejected": -18882464.0, "logps/chosen": -254.97364807128906, "logps/rejected": -318.603271484375, "loss": 0.3773, "rewards/chosen": -0.08797626197338104, "rewards/margins": 1.4074886590242386, "rewards/rejected": -1.4954649209976196, "step": 17730 }, { "epoch": 0.9398139559536745, "grad_norm": 49.5, "kl": 4.279946327209473, "learning_rate": 5e-07, "logits/chosen": -11896296.0, "logits/rejected": -31949796.0, "logps/chosen": -283.62542724609375, "logps/rejected": -325.9790344238281, "loss": 0.3068, "rewards/chosen": 0.9910966555277506, "rewards/margins": 4.537763516108195, "rewards/rejected": -3.5466668605804443, "step": 17731 }, { "epoch": 0.9398669599554766, "grad_norm": 41.75, "kl": 2.9803848266601562, "learning_rate": 5e-07, "logits/chosen": -25494294.4, "logits/rejected": -73869669.33333333, "logps/chosen": -226.6286376953125, "logps/rejected": -251.027587890625, "loss": 0.359, "rewards/chosen": 0.5399073123931885, "rewards/margins": 1.713312800725301, "rewards/rejected": -1.1734054883321126, "step": 17732 }, { "epoch": 0.9399199639572787, "grad_norm": 40.25, "kl": 2.051952362060547, "learning_rate": 5e-07, "logits/chosen": -10186070.666666666, "logits/rejected": -20713056.0, "logps/chosen": -115.0669453938802, "logps/rejected": -372.57021484375, "loss": 0.2611, "rewards/chosen": -0.09265708923339844, "rewards/margins": 3.780706787109375, "rewards/rejected": -3.8733638763427733, "step": 17733 }, { "epoch": 0.9399729679590809, "grad_norm": 63.25, "kl": 2.325655460357666, "learning_rate": 5e-07, "logits/chosen": -6293842.666666667, "logits/rejected": -26035362.0, "logps/chosen": -176.83894856770834, "logps/rejected": -320.5176086425781, "loss": 0.443, "rewards/chosen": 0.2967093785603841, "rewards/margins": 2.1872382958730063, "rewards/rejected": -1.890528917312622, "step": 17734 }, { "epoch": 0.940025971960883, "grad_norm": 58.0, "kl": 6.0523529052734375, "learning_rate": 5e-07, "logits/chosen": -8757701.714285715, "logits/rejected": -48479440.0, "logps/chosen": -690.7207728794643, "logps/rejected": -180.37094116210938, "loss": 0.4984, "rewards/chosen": 0.4171596254621233, "rewards/margins": 2.1165157045636858, "rewards/rejected": -1.6993560791015625, "step": 17735 }, { "epoch": 0.9400789759626852, "grad_norm": 34.5, "kl": 0.2469940185546875, "learning_rate": 5e-07, "logits/chosen": -25032728.0, "logits/rejected": -38779474.666666664, "logps/chosen": -822.754150390625, "logps/rejected": -315.21722412109375, "loss": 0.087, "rewards/chosen": 2.3880553245544434, "rewards/margins": 5.19612455368042, "rewards/rejected": -2.8080692291259766, "step": 17736 }, { "epoch": 0.9401319799644873, "grad_norm": 53.5, "kl": 1.78228759765625, "learning_rate": 5e-07, "logits/chosen": -11349458.0, "logits/rejected": -32062504.0, "logps/chosen": -306.6881103515625, "logps/rejected": -439.9124450683594, "loss": 0.2721, "rewards/chosen": 0.7123487591743469, "rewards/margins": 2.5345779061317444, "rewards/rejected": -1.8222291469573975, "step": 17737 }, { "epoch": 0.9401849839662895, "grad_norm": 31.125, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7742581.6, "logits/rejected": -87946912.0, "logps/chosen": -284.21748046875, "logps/rejected": -594.9676513671875, "loss": 0.1533, "rewards/chosen": 1.6917255401611329, "rewards/margins": 5.167007255554199, "rewards/rejected": -3.4752817153930664, "step": 17738 }, { "epoch": 0.9402379879680915, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47266720.0, "logits/rejected": -22462464.0, "logps/chosen": -307.3371276855469, "logps/rejected": -254.86196899414062, "loss": 0.2266, "rewards/chosen": 0.9168672561645508, "rewards/margins": 2.8290982246398926, "rewards/rejected": -1.9122309684753418, "step": 17739 }, { "epoch": 0.9402909919698937, "grad_norm": 49.0, "kl": 0.4338836669921875, "learning_rate": 5e-07, "logits/chosen": -24173248.0, "logits/rejected": -43751460.0, "logps/chosen": -298.1263427734375, "logps/rejected": -604.3648681640625, "loss": 0.2751, "rewards/chosen": -0.011727333068847656, "rewards/margins": 4.485235691070557, "rewards/rejected": -4.496963024139404, "step": 17740 }, { "epoch": 0.9403439959716958, "grad_norm": 48.5, "kl": 4.886731147766113, "learning_rate": 5e-07, "logits/chosen": -47698592.0, "logits/rejected": -25776120.0, "logps/chosen": -395.9535725911458, "logps/rejected": -186.53271484375, "loss": 0.4751, "rewards/chosen": 0.6903627713521322, "rewards/margins": 1.5749389727910361, "rewards/rejected": -0.8845762014389038, "step": 17741 }, { "epoch": 0.940396999973498, "grad_norm": 42.5, "kl": 0.5705699920654297, "learning_rate": 5e-07, "logits/chosen": -28948704.0, "logits/rejected": 25047782.4, "logps/chosen": -399.540283203125, "logps/rejected": -148.2951416015625, "loss": 0.174, "rewards/chosen": 1.0566160678863525, "rewards/margins": 4.029156351089478, "rewards/rejected": -2.972540283203125, "step": 17742 }, { "epoch": 0.9404500039753001, "grad_norm": 48.0, "kl": 2.91766357421875, "learning_rate": 5e-07, "logits/chosen": -74139354.66666667, "logits/rejected": -24251464.0, "logps/chosen": -444.6819254557292, "logps/rejected": -200.49881591796876, "loss": 0.1909, "rewards/chosen": 0.8580307960510254, "rewards/margins": 3.6606820106506346, "rewards/rejected": -2.802651214599609, "step": 17743 }, { "epoch": 0.9405030079771023, "grad_norm": 64.0, "kl": 5.812746047973633, "learning_rate": 5e-07, "logits/chosen": -22097014.85714286, "logits/rejected": -16901294.0, "logps/chosen": -239.52122279575892, "logps/rejected": -188.86138916015625, "loss": 0.4513, "rewards/chosen": 0.8227437564304897, "rewards/margins": 1.5432179995945523, "rewards/rejected": -0.7204742431640625, "step": 17744 }, { "epoch": 0.9405560119789044, "grad_norm": 54.5, "kl": 3.348325729370117, "learning_rate": 5e-07, "logits/chosen": -9225975.333333334, "logits/rejected": -20399790.0, "logps/chosen": -158.62994384765625, "logps/rejected": -183.6371612548828, "loss": 0.4046, "rewards/chosen": 0.40669703483581543, "rewards/margins": 3.8345108032226562, "rewards/rejected": -3.427813768386841, "step": 17745 }, { "epoch": 0.9406090159807066, "grad_norm": 48.0, "kl": 0.05675697326660156, "learning_rate": 5e-07, "logits/chosen": -19024577.333333332, "logits/rejected": -16532089.6, "logps/chosen": -251.161376953125, "logps/rejected": -398.517626953125, "loss": 0.2905, "rewards/chosen": -0.057613362868626915, "rewards/margins": 2.32520943681399, "rewards/rejected": -2.382822799682617, "step": 17746 }, { "epoch": 0.9406620199825086, "grad_norm": 57.0, "kl": 3.025848388671875, "learning_rate": 5e-07, "logits/chosen": 16914712.0, "logits/rejected": -40203228.0, "logps/chosen": -272.9686279296875, "logps/rejected": -356.18426513671875, "loss": 0.3028, "rewards/chosen": 0.4379499554634094, "rewards/margins": 2.194428265094757, "rewards/rejected": -1.7564783096313477, "step": 17747 }, { "epoch": 0.9407150239843108, "grad_norm": 42.25, "kl": 1.5704975128173828, "learning_rate": 5e-07, "logits/chosen": -12675878.0, "logits/rejected": -31480500.0, "logps/chosen": -193.09674072265625, "logps/rejected": -271.86431884765625, "loss": 0.3385, "rewards/chosen": -0.08268279582262039, "rewards/margins": 2.0365191027522087, "rewards/rejected": -2.119201898574829, "step": 17748 }, { "epoch": 0.9407680279861129, "grad_norm": 46.75, "kl": 0.4080085754394531, "learning_rate": 5e-07, "logits/chosen": -46846453.333333336, "logits/rejected": -61084441.6, "logps/chosen": -262.1625569661458, "logps/rejected": -325.1781982421875, "loss": 0.2388, "rewards/chosen": 0.5385235150655111, "rewards/margins": 2.7084476788838705, "rewards/rejected": -2.1699241638183593, "step": 17749 }, { "epoch": 0.9408210319879151, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73162288.0, "logits/rejected": -45003914.666666664, "logps/chosen": -212.37477111816406, "logps/rejected": -403.5036214192708, "loss": 0.2312, "rewards/chosen": -0.6343734860420227, "rewards/margins": 2.06155127286911, "rewards/rejected": -2.695924758911133, "step": 17750 }, { "epoch": 0.9408740359897172, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46123541.333333336, "logits/rejected": -25245043.2, "logps/chosen": -518.0081380208334, "logps/rejected": -246.4216552734375, "loss": 0.1507, "rewards/chosen": 1.2079569498697917, "rewards/margins": 3.817232958475749, "rewards/rejected": -2.609276008605957, "step": 17751 }, { "epoch": 0.9409270399915194, "grad_norm": 37.0, "kl": 2.7785873413085938, "learning_rate": 5e-07, "logits/chosen": 22491776.0, "logits/rejected": -13813752.0, "logps/chosen": -357.582861328125, "logps/rejected": -188.87150065104166, "loss": 0.3742, "rewards/chosen": 0.5623855590820312, "rewards/margins": 2.198805809020996, "rewards/rejected": -1.6364202499389648, "step": 17752 }, { "epoch": 0.9409800439933215, "grad_norm": 38.5, "kl": 4.175022125244141, "learning_rate": 5e-07, "logits/chosen": -18890550.85714286, "logits/rejected": -48280360.0, "logps/chosen": -280.7800990513393, "logps/rejected": -581.0128173828125, "loss": 0.3406, "rewards/chosen": 1.0601382936750139, "rewards/margins": 4.167218514851161, "rewards/rejected": -3.1070802211761475, "step": 17753 }, { "epoch": 0.9410330479951237, "grad_norm": 55.5, "kl": 0.7692489624023438, "learning_rate": 5e-07, "logits/chosen": -29817122.666666668, "logits/rejected": -60894899.2, "logps/chosen": -435.7965494791667, "logps/rejected": -291.436279296875, "loss": 0.177, "rewards/chosen": 0.7437348365783691, "rewards/margins": 4.251243686676025, "rewards/rejected": -3.5075088500976563, "step": 17754 }, { "epoch": 0.9410860519969257, "grad_norm": 51.75, "kl": 1.5011405944824219, "learning_rate": 5e-07, "logits/chosen": -39658908.8, "logits/rejected": -9306418.0, "logps/chosen": -362.94072265625, "logps/rejected": -195.91451009114584, "loss": 0.3038, "rewards/chosen": 0.6113522052764893, "rewards/margins": 3.8063849608103433, "rewards/rejected": -3.195032755533854, "step": 17755 }, { "epoch": 0.9411390559987279, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20912880.0, "logits/rejected": -3497772.0, "logps/chosen": -288.4239095052083, "logps/rejected": -409.3669921875, "loss": 0.2773, "rewards/chosen": 0.4116770426432292, "rewards/margins": 2.914779154459635, "rewards/rejected": -2.503102111816406, "step": 17756 }, { "epoch": 0.94119206000053, "grad_norm": 56.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34976217.6, "logits/rejected": -10207861.333333334, "logps/chosen": -301.517041015625, "logps/rejected": -376.295166015625, "loss": 0.3889, "rewards/chosen": -0.2523963212966919, "rewards/margins": 2.5364582777023315, "rewards/rejected": -2.7888545989990234, "step": 17757 }, { "epoch": 0.9412450640023322, "grad_norm": 51.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17517446.0, "logits/rejected": -20107360.0, "logps/chosen": -521.763671875, "logps/rejected": -238.53096516927084, "loss": 0.2295, "rewards/chosen": 0.3978897035121918, "rewards/margins": 2.8839841783046722, "rewards/rejected": -2.4860944747924805, "step": 17758 }, { "epoch": 0.9412980680041343, "grad_norm": 44.75, "kl": 8.006881713867188, "learning_rate": 5e-07, "logits/chosen": -11863432.0, "logits/rejected": -54545224.0, "logps/chosen": -169.84789021809897, "logps/rejected": -144.81082153320312, "loss": 0.4166, "rewards/chosen": 0.6758685111999512, "rewards/margins": 3.2916011810302734, "rewards/rejected": -2.6157326698303223, "step": 17759 }, { "epoch": 0.9413510720059365, "grad_norm": 27.5, "kl": 4.732826232910156, "learning_rate": 5e-07, "logits/chosen": 4640402.4, "logits/rejected": -42789746.666666664, "logps/chosen": -102.8527099609375, "logps/rejected": -432.7124430338542, "loss": 0.354, "rewards/chosen": 0.4351907253265381, "rewards/margins": 3.8247816244761146, "rewards/rejected": -3.3895908991495767, "step": 17760 }, { "epoch": 0.9414040760077386, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -88054074.66666667, "logits/rejected": -37818252.8, "logps/chosen": -400.6142171223958, "logps/rejected": -353.306396484375, "loss": 0.2283, "rewards/chosen": 0.5439738432566324, "rewards/margins": 3.129257313410441, "rewards/rejected": -2.5852834701538088, "step": 17761 }, { "epoch": 0.9414570800095408, "grad_norm": 45.75, "kl": 0.3998069763183594, "learning_rate": 5e-07, "logits/chosen": 14488884.8, "logits/rejected": -500184.6666666667, "logps/chosen": -176.45166015625, "logps/rejected": -116.3243408203125, "loss": 0.3402, "rewards/chosen": 0.11404967308044434, "rewards/margins": 3.14155904452006, "rewards/rejected": -3.0275093714396157, "step": 17762 }, { "epoch": 0.9415100840113428, "grad_norm": 43.5, "kl": 0.880889892578125, "learning_rate": 5e-07, "logits/chosen": -72877568.0, "logits/rejected": -54632504.0, "logps/chosen": -814.06591796875, "logps/rejected": -479.04052734375, "loss": 0.1367, "rewards/chosen": 1.9696227312088013, "rewards/margins": 4.52271044254303, "rewards/rejected": -2.5530877113342285, "step": 17763 }, { "epoch": 0.941563088013145, "grad_norm": 31.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29669670.0, "logits/rejected": -49212874.666666664, "logps/chosen": -206.84288024902344, "logps/rejected": -295.7281087239583, "loss": 0.1925, "rewards/chosen": 0.5135689973831177, "rewards/margins": 3.1471250454584756, "rewards/rejected": -2.633556048075358, "step": 17764 }, { "epoch": 0.9416160920149471, "grad_norm": 33.75, "kl": 1.7916679382324219, "learning_rate": 5e-07, "logits/chosen": -24295488.0, "logits/rejected": -21922448.0, "logps/chosen": -192.1798095703125, "logps/rejected": -375.943115234375, "loss": 0.2637, "rewards/chosen": 0.9084643363952637, "rewards/margins": 3.6788182894388832, "rewards/rejected": -2.7703539530436196, "step": 17765 }, { "epoch": 0.9416690960167493, "grad_norm": 50.25, "kl": 0.8795013427734375, "learning_rate": 5e-07, "logits/chosen": -52024844.8, "logits/rejected": -11829685.333333334, "logps/chosen": -359.956787109375, "logps/rejected": -230.0618896484375, "loss": 0.3339, "rewards/chosen": 0.18183668851852416, "rewards/margins": 3.4676439801851906, "rewards/rejected": -3.2858072916666665, "step": 17766 }, { "epoch": 0.9417221000185514, "grad_norm": 54.75, "kl": 0.02484893798828125, "learning_rate": 5e-07, "logits/chosen": -30348684.0, "logits/rejected": -30543786.0, "logps/chosen": -254.7972869873047, "logps/rejected": -318.557373046875, "loss": 0.2614, "rewards/chosen": 0.7971199154853821, "rewards/margins": 2.3232045769691467, "rewards/rejected": -1.5260846614837646, "step": 17767 }, { "epoch": 0.9417751040203536, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34754176.0, "logits/rejected": -47206148.0, "logps/chosen": -279.25482177734375, "logps/rejected": -470.8816223144531, "loss": 0.227, "rewards/chosen": 0.6323827505111694, "rewards/margins": 2.9862173795700073, "rewards/rejected": -2.353834629058838, "step": 17768 }, { "epoch": 0.9418281080221557, "grad_norm": 34.5, "kl": 4.921684265136719, "learning_rate": 5e-07, "logits/chosen": 5695718.0, "logits/rejected": 6857720.5, "logps/chosen": -177.22000122070312, "logps/rejected": -194.43212890625, "loss": 0.2861, "rewards/chosen": 1.212929368019104, "rewards/margins": 3.9635123014450073, "rewards/rejected": -2.7505829334259033, "step": 17769 }, { "epoch": 0.9418811120239579, "grad_norm": 41.75, "kl": 0.5965633392333984, "learning_rate": 5e-07, "logits/chosen": -41775865.6, "logits/rejected": -21929689.333333332, "logps/chosen": -531.753125, "logps/rejected": -262.6951904296875, "loss": 0.216, "rewards/chosen": 1.1294519424438476, "rewards/margins": 4.661383819580078, "rewards/rejected": -3.5319318771362305, "step": 17770 }, { "epoch": 0.9419341160257599, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 840112.8333333334, "logits/rejected": -35947235.2, "logps/chosen": -126.63711547851562, "logps/rejected": -351.665625, "loss": 0.228, "rewards/chosen": -0.13357404867808023, "rewards/margins": 3.1301321585973105, "rewards/rejected": -3.2637062072753906, "step": 17771 }, { "epoch": 0.9419871200275621, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53467573.333333336, "logits/rejected": -10767168.0, "logps/chosen": -481.4289143880208, "logps/rejected": -431.085009765625, "loss": 0.1939, "rewards/chosen": 1.6855449676513672, "rewards/margins": 3.6412282943725587, "rewards/rejected": -1.9556833267211915, "step": 17772 }, { "epoch": 0.9420401240293642, "grad_norm": 67.5, "kl": 0.35508251190185547, "learning_rate": 5e-07, "logits/chosen": -19562998.85714286, "logits/rejected": -3931194.75, "logps/chosen": -353.3456333705357, "logps/rejected": -97.71926879882812, "loss": 0.3638, "rewards/chosen": 0.4420075075966971, "rewards/margins": 4.16045161655971, "rewards/rejected": -3.7184441089630127, "step": 17773 }, { "epoch": 0.9420931280311664, "grad_norm": 34.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8102780.5, "logits/rejected": -21746843.42857143, "logps/chosen": -350.7717590332031, "logps/rejected": -267.4412144252232, "loss": 0.1067, "rewards/chosen": 1.9452790021896362, "rewards/margins": 4.4652018036161145, "rewards/rejected": -2.5199228014264787, "step": 17774 }, { "epoch": 0.9421461320329685, "grad_norm": 44.25, "kl": 3.060068130493164, "learning_rate": 5e-07, "logits/chosen": -33118198.4, "logits/rejected": -56564293.333333336, "logps/chosen": -478.296337890625, "logps/rejected": -394.4654947916667, "loss": 0.2328, "rewards/chosen": 1.434791088104248, "rewards/margins": 4.310859203338623, "rewards/rejected": -2.876068115234375, "step": 17775 }, { "epoch": 0.9421991360347707, "grad_norm": 42.75, "kl": 0.9124593734741211, "learning_rate": 5e-07, "logits/chosen": -11235260.0, "logits/rejected": -17359288.0, "logps/chosen": -255.6338653564453, "logps/rejected": -245.93685913085938, "loss": 0.2835, "rewards/chosen": 0.06689798831939697, "rewards/margins": 2.9103113412857056, "rewards/rejected": -2.8434133529663086, "step": 17776 }, { "epoch": 0.9422521400365728, "grad_norm": 60.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47651200.0, "logits/rejected": -31921716.0, "logps/chosen": -382.49951171875, "logps/rejected": -341.384765625, "loss": 0.3268, "rewards/chosen": 0.25128939747810364, "rewards/margins": 2.418549805879593, "rewards/rejected": -2.1672604084014893, "step": 17777 }, { "epoch": 0.9423051440383748, "grad_norm": 46.75, "kl": 0.07634735107421875, "learning_rate": 5e-07, "logits/chosen": -18789008.0, "logits/rejected": -16182508.0, "logps/chosen": -290.7619140625, "logps/rejected": -165.34647623697916, "loss": 0.347, "rewards/chosen": 0.48074841499328613, "rewards/margins": 1.699452002843221, "rewards/rejected": -1.2187035878499348, "step": 17778 }, { "epoch": 0.942358148040177, "grad_norm": 29.375, "kl": 2.055530548095703, "learning_rate": 5e-07, "logits/chosen": -16668305.333333334, "logits/rejected": -23604945.6, "logps/chosen": -183.91743977864584, "logps/rejected": -260.743994140625, "loss": 0.1639, "rewards/chosen": 1.0981703599294026, "rewards/margins": 4.746981700261434, "rewards/rejected": -3.6488113403320312, "step": 17779 }, { "epoch": 0.9424111520419791, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -1799807.25, "logps/rejected": -300.99853515625, "loss": 0.1618, "rewards/rejected": -2.6532511711120605, "step": 17780 }, { "epoch": 0.9424641560437813, "grad_norm": 57.75, "kl": 1.6934394836425781, "learning_rate": 5e-07, "logits/chosen": -1931074.857142857, "logits/rejected": -3836295.0, "logps/chosen": -276.05235072544644, "logps/rejected": -106.13992309570312, "loss": 0.4573, "rewards/chosen": 0.10200631618499756, "rewards/margins": 3.1755582094192505, "rewards/rejected": -3.073551893234253, "step": 17781 }, { "epoch": 0.9425171600455834, "grad_norm": 30.625, "kl": 2.864898681640625, "learning_rate": 5e-07, "logits/chosen": -21480058.666666668, "logits/rejected": -14264513.6, "logps/chosen": -786.56201171875, "logps/rejected": -188.7706298828125, "loss": 0.0674, "rewards/chosen": 2.445554574330648, "rewards/margins": 5.3288561185201, "rewards/rejected": -2.883301544189453, "step": 17782 }, { "epoch": 0.9425701640473856, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58830154.666666664, "logits/rejected": -24218816.0, "logps/chosen": -393.443359375, "logps/rejected": -293.0920654296875, "loss": 0.3363, "rewards/chosen": 0.0234563151995341, "rewards/margins": 1.4957137386004131, "rewards/rejected": -1.472257423400879, "step": 17783 }, { "epoch": 0.9426231680491877, "grad_norm": 69.0, "kl": 6.341236114501953, "learning_rate": 5e-07, "logits/chosen": -9546846.4, "logits/rejected": -22418768.0, "logps/chosen": -303.4578857421875, "logps/rejected": -163.38702392578125, "loss": 0.3193, "rewards/chosen": 1.347140121459961, "rewards/margins": 4.5237017949422205, "rewards/rejected": -3.1765616734822593, "step": 17784 }, { "epoch": 0.9426761720509899, "grad_norm": 29.25, "kl": 0.3630180358886719, "learning_rate": 5e-07, "logits/chosen": -39287160.0, "logits/rejected": -33207378.666666668, "logps/chosen": -564.6378784179688, "logps/rejected": -251.62687174479166, "loss": 0.0946, "rewards/chosen": 2.1033215522766113, "rewards/margins": 4.9714131355285645, "rewards/rejected": -2.868091583251953, "step": 17785 }, { "epoch": 0.9427291760527919, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47135593.6, "logits/rejected": -56588160.0, "logps/chosen": -415.883984375, "logps/rejected": -575.1007486979166, "loss": 0.3263, "rewards/chosen": 0.03179198801517487, "rewards/margins": 3.5392160246769584, "rewards/rejected": -3.5074240366617837, "step": 17786 }, { "epoch": 0.9427821800545941, "grad_norm": 38.0, "kl": 0.6210594177246094, "learning_rate": 5e-07, "logits/chosen": -19606733.333333332, "logits/rejected": -14336691.2, "logps/chosen": -199.1512654622396, "logps/rejected": -290.7953125, "loss": 0.1729, "rewards/chosen": 1.199536959330241, "rewards/margins": 4.058315722147624, "rewards/rejected": -2.8587787628173826, "step": 17787 }, { "epoch": 0.9428351840563962, "grad_norm": 46.25, "kl": 0.1917552947998047, "learning_rate": 5e-07, "logits/chosen": -40742464.0, "logits/rejected": -20692784.0, "logps/chosen": -535.41748046875, "logps/rejected": -189.29188537597656, "loss": 0.2261, "rewards/chosen": 1.2822121381759644, "rewards/margins": 3.081246852874756, "rewards/rejected": -1.7990347146987915, "step": 17788 }, { "epoch": 0.9428881880581984, "grad_norm": 36.0, "kl": 5.513302803039551, "learning_rate": 5e-07, "logits/chosen": -30257008.0, "logits/rejected": -78405240.0, "logps/chosen": -368.8075256347656, "logps/rejected": -386.1846923828125, "loss": 0.3587, "rewards/chosen": 0.3443182408809662, "rewards/margins": 2.039392441511154, "rewards/rejected": -1.695074200630188, "step": 17789 }, { "epoch": 0.9429411920600005, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53604214.4, "logits/rejected": -31216058.666666668, "logps/chosen": -288.0715576171875, "logps/rejected": -568.5042317708334, "loss": 0.3068, "rewards/chosen": 0.751525592803955, "rewards/margins": 3.285416475931803, "rewards/rejected": -2.533890883127848, "step": 17790 }, { "epoch": 0.9429941960618027, "grad_norm": 50.0, "kl": 1.3940048217773438, "learning_rate": 5e-07, "logits/chosen": -32671984.0, "logits/rejected": -44799493.333333336, "logps/chosen": -255.618505859375, "logps/rejected": -385.8511149088542, "loss": 0.2458, "rewards/chosen": 0.8074530601501465, "rewards/margins": 3.5539928118387856, "rewards/rejected": -2.746539751688639, "step": 17791 }, { "epoch": 0.9430472000636048, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55594896.0, "logits/rejected": -2796911.5, "logps/chosen": -205.7034708658854, "logps/rejected": -107.12789916992188, "loss": 0.3476, "rewards/chosen": 0.6133290529251099, "rewards/margins": 1.6357619762420654, "rewards/rejected": -1.0224329233169556, "step": 17792 }, { "epoch": 0.943100204065407, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -59824234.666666664, "logits/rejected": -24009433.6, "logps/chosen": -444.8308919270833, "logps/rejected": -239.395947265625, "loss": 0.2952, "rewards/chosen": 0.0003387381633122762, "rewards/margins": 2.1815137793620427, "rewards/rejected": -2.1811750411987303, "step": 17793 }, { "epoch": 0.943153208067209, "grad_norm": 83.0, "kl": 1.8431148529052734, "learning_rate": 5e-07, "logits/chosen": 10471920.0, "logits/rejected": 1487769.3333333333, "logps/chosen": -649.680126953125, "logps/rejected": -388.8702799479167, "loss": 0.2802, "rewards/chosen": 0.7997549057006836, "rewards/margins": 4.085541216532389, "rewards/rejected": -3.2857863108317056, "step": 17794 }, { "epoch": 0.9432062120690112, "grad_norm": 30.0, "kl": 0.38753318786621094, "learning_rate": 5e-07, "logits/rejected": -14445108.0, "logps/rejected": -172.27346801757812, "loss": 0.1246, "rewards/rejected": -2.585498809814453, "step": 17795 }, { "epoch": 0.9432592160708133, "grad_norm": 33.0, "kl": 1.0234184265136719, "learning_rate": 5e-07, "logits/chosen": -8117800.0, "logits/rejected": -942973.7, "logps/chosen": -274.4406331380208, "logps/rejected": -250.913671875, "loss": 0.1712, "rewards/chosen": 1.0993294715881348, "rewards/margins": 4.3486401557922365, "rewards/rejected": -3.2493106842041017, "step": 17796 }, { "epoch": 0.9433122200726155, "grad_norm": 53.75, "kl": 2.8128137588500977, "learning_rate": 5e-07, "logits/chosen": -14970465.333333334, "logits/rejected": -55637904.0, "logps/chosen": -245.70503743489584, "logps/rejected": -387.91192626953125, "loss": 0.4105, "rewards/chosen": 0.26223371426264447, "rewards/margins": 3.043445567289988, "rewards/rejected": -2.7812118530273438, "step": 17797 }, { "epoch": 0.9433652240744176, "grad_norm": 49.75, "kl": 0.2890338897705078, "learning_rate": 5e-07, "logits/chosen": -8595566.4, "logits/rejected": -37788610.666666664, "logps/chosen": -184.597802734375, "logps/rejected": -414.6405436197917, "loss": 0.3066, "rewards/chosen": 0.3875087261199951, "rewards/margins": 2.514397954940796, "rewards/rejected": -2.126889228820801, "step": 17798 }, { "epoch": 0.9434182280762198, "grad_norm": 69.5, "kl": 2.8133583068847656, "learning_rate": 5e-07, "logits/chosen": 38966096.0, "logits/rejected": -16100092.0, "logps/chosen": -423.92974853515625, "logps/rejected": -432.1200256347656, "loss": 0.2192, "rewards/chosen": 0.7476650476455688, "rewards/margins": 3.501322388648987, "rewards/rejected": -2.753657341003418, "step": 17799 }, { "epoch": 0.9434712320780219, "grad_norm": 49.0, "kl": 1.3150482177734375, "learning_rate": 5e-07, "logits/chosen": -19843393.333333332, "logits/rejected": -2900483.25, "logps/chosen": -180.69732666015625, "logps/rejected": -103.32327270507812, "loss": 0.3229, "rewards/chosen": 0.6312663157780966, "rewards/margins": 2.9066068728764853, "rewards/rejected": -2.2753405570983887, "step": 17800 }, { "epoch": 0.943524236079824, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5423952.5, "logits/rejected": -47602533.333333336, "logps/chosen": -288.81597900390625, "logps/rejected": -202.7781778971354, "loss": 0.1493, "rewards/chosen": 1.9588127136230469, "rewards/margins": 4.373927911122641, "rewards/rejected": -2.4151151974995932, "step": 17801 }, { "epoch": 0.9435772400816261, "grad_norm": 47.75, "kl": 1.995584487915039, "learning_rate": 5e-07, "logits/chosen": -17371158.666666668, "logits/rejected": -11179113.6, "logps/chosen": -1315.30322265625, "logps/rejected": -194.71961669921876, "loss": 0.2573, "rewards/chosen": 1.5254878997802734, "rewards/margins": 2.7346750259399415, "rewards/rejected": -1.209187126159668, "step": 17802 }, { "epoch": 0.9436302440834283, "grad_norm": 48.5, "kl": 1.9923934936523438, "learning_rate": 5e-07, "logits/chosen": -4226405.333333333, "logits/rejected": -10489952.8, "logps/chosen": -432.4449869791667, "logps/rejected": -299.61201171875, "loss": 0.2419, "rewards/chosen": 0.21074014902114868, "rewards/margins": 3.116013205051422, "rewards/rejected": -2.9052730560302735, "step": 17803 }, { "epoch": 0.9436832480852304, "grad_norm": 39.25, "kl": 1.1960945129394531, "learning_rate": 5e-07, "logits/chosen": -15317912.0, "logits/rejected": -34417261.333333336, "logps/chosen": -316.1713623046875, "logps/rejected": -412.282470703125, "loss": 0.227, "rewards/chosen": 1.1407854080200195, "rewards/margins": 3.2883073806762697, "rewards/rejected": -2.14752197265625, "step": 17804 }, { "epoch": 0.9437362520870326, "grad_norm": 41.75, "kl": 0.13968276977539062, "learning_rate": 5e-07, "logits/chosen": -63295804.0, "logits/rejected": -27144670.0, "logps/chosen": -325.0731506347656, "logps/rejected": -450.95758056640625, "loss": 0.2679, "rewards/chosen": 0.136616513133049, "rewards/margins": 3.9372551888227463, "rewards/rejected": -3.8006386756896973, "step": 17805 }, { "epoch": 0.9437892560888347, "grad_norm": 46.0, "kl": 1.55096435546875, "learning_rate": 5e-07, "logits/chosen": -11477706.4, "logits/rejected": -6781126.666666667, "logps/chosen": -162.61346435546875, "logps/rejected": -255.28365071614584, "loss": 0.386, "rewards/chosen": 0.361352276802063, "rewards/margins": 2.076538077990214, "rewards/rejected": -1.7151858011881511, "step": 17806 }, { "epoch": 0.9438422600906369, "grad_norm": 54.75, "kl": 2.242908477783203, "learning_rate": 5e-07, "logits/chosen": -2407706.0, "logits/rejected": -36446208.0, "logps/chosen": -385.41131591796875, "logps/rejected": -231.7013397216797, "loss": 0.2252, "rewards/chosen": 0.8148418664932251, "rewards/margins": 2.248803734779358, "rewards/rejected": -1.4339618682861328, "step": 17807 }, { "epoch": 0.943895264092439, "grad_norm": 40.25, "kl": 0.06613922119140625, "learning_rate": 5e-07, "logits/chosen": -22464994.666666668, "logits/rejected": -27515296.0, "logps/chosen": -720.4921875, "logps/rejected": -318.8018310546875, "loss": 0.1529, "rewards/chosen": 1.3758249282836914, "rewards/margins": 4.48392276763916, "rewards/rejected": -3.1080978393554686, "step": 17808 }, { "epoch": 0.9439482680942411, "grad_norm": 47.0, "kl": 1.0414619445800781, "learning_rate": 5e-07, "logits/chosen": -418008.25, "logits/rejected": 1098488.75, "logps/chosen": -142.12147521972656, "logps/rejected": -203.3469696044922, "loss": 0.3239, "rewards/chosen": 0.26115331053733826, "rewards/margins": 1.6909185349941254, "rewards/rejected": -1.429765224456787, "step": 17809 }, { "epoch": 0.9440012720960432, "grad_norm": 39.5, "kl": 0.7138500213623047, "learning_rate": 5e-07, "logits/chosen": -27201502.0, "logits/rejected": -32456186.666666668, "logps/chosen": -435.9682312011719, "logps/rejected": -372.27783203125, "loss": 0.1085, "rewards/chosen": 1.682399034500122, "rewards/margins": 4.790155331293741, "rewards/rejected": -3.1077562967936196, "step": 17810 }, { "epoch": 0.9440542760978454, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56693048.0, "logits/rejected": -48571744.0, "logps/chosen": -349.525146484375, "logps/rejected": -473.4773763020833, "loss": 0.1655, "rewards/chosen": 0.062652587890625, "rewards/margins": 3.4072930018107095, "rewards/rejected": -3.3446404139200845, "step": 17811 }, { "epoch": 0.9441072800996475, "grad_norm": 39.25, "kl": 0.4894428253173828, "learning_rate": 5e-07, "logits/chosen": -20727624.0, "logits/rejected": -29428454.4, "logps/chosen": -373.1433919270833, "logps/rejected": -324.6515625, "loss": 0.1932, "rewards/chosen": 1.50164794921875, "rewards/margins": 4.017191696166992, "rewards/rejected": -2.515543746948242, "step": 17812 }, { "epoch": 0.9441602841014497, "grad_norm": 57.0, "kl": 0.2871246337890625, "learning_rate": 5e-07, "logits/chosen": -42018672.0, "logits/rejected": -26009820.8, "logps/chosen": -443.9213053385417, "logps/rejected": -375.49140625, "loss": 0.2716, "rewards/chosen": 0.12283529837926228, "rewards/margins": 2.2123616655667626, "rewards/rejected": -2.0895263671875, "step": 17813 }, { "epoch": 0.9442132881032518, "grad_norm": 49.25, "kl": 0.7074432373046875, "learning_rate": 5e-07, "logits/chosen": -16529556.57142857, "logits/rejected": -50911080.0, "logps/chosen": -165.41993931361608, "logps/rejected": -389.9234619140625, "loss": 0.3867, "rewards/chosen": 0.44146738733564106, "rewards/margins": 3.5864289828709195, "rewards/rejected": -3.1449615955352783, "step": 17814 }, { "epoch": 0.944266292105054, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42124121.6, "logits/rejected": -24792944.0, "logps/chosen": -391.8239990234375, "logps/rejected": -247.40140787760416, "loss": 0.2797, "rewards/chosen": 0.6758895874023437, "rewards/margins": 2.4869080543518067, "rewards/rejected": -1.811018466949463, "step": 17815 }, { "epoch": 0.944319296106856, "grad_norm": 41.75, "kl": 0.7083702087402344, "learning_rate": 5e-07, "logits/chosen": -19247528.0, "logits/rejected": -33459240.0, "logps/chosen": -166.92740885416666, "logps/rejected": -347.978515625, "loss": 0.3449, "rewards/chosen": 0.35379024346669513, "rewards/margins": 3.662907083829244, "rewards/rejected": -3.309116840362549, "step": 17816 }, { "epoch": 0.9443723001086582, "grad_norm": 109.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14075062.0, "logits/rejected": 6910770.0, "logps/chosen": -386.3006896972656, "logps/rejected": -510.6097412109375, "loss": 0.2863, "rewards/chosen": 0.6571739315986633, "rewards/margins": 2.4049927592277527, "rewards/rejected": -1.7478188276290894, "step": 17817 }, { "epoch": 0.9444253041104603, "grad_norm": 37.25, "kl": 2.8436365127563477, "learning_rate": 5e-07, "logits/chosen": -4956979.2, "logits/rejected": -28348584.0, "logps/chosen": -987.46845703125, "logps/rejected": -303.94028727213544, "loss": 0.2325, "rewards/chosen": 1.7812652587890625, "rewards/margins": 3.307141621907552, "rewards/rejected": -1.5258763631184895, "step": 17818 }, { "epoch": 0.9444783081122625, "grad_norm": 53.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48863976.0, "logits/rejected": -2721500.25, "logps/chosen": -190.96583557128906, "logps/rejected": -316.80889892578125, "loss": 0.3638, "rewards/chosen": -0.34058088064193726, "rewards/margins": 1.5807294249534607, "rewards/rejected": -1.921310305595398, "step": 17819 }, { "epoch": 0.9445313121140646, "grad_norm": 34.75, "kl": 3.620210647583008, "learning_rate": 5e-07, "logits/chosen": 4198535.0, "logits/rejected": -33114576.0, "logps/chosen": -178.54225158691406, "logps/rejected": -352.1881103515625, "loss": 0.2269, "rewards/chosen": 1.4613406658172607, "rewards/margins": 3.562748908996582, "rewards/rejected": -2.1014082431793213, "step": 17820 }, { "epoch": 0.9445843161158668, "grad_norm": 48.5, "kl": 0.5714168548583984, "learning_rate": 5e-07, "logits/chosen": -45491264.0, "logits/rejected": -76750965.33333333, "logps/chosen": -299.99345703125, "logps/rejected": -496.2268880208333, "loss": 0.2888, "rewards/chosen": 0.4349409580230713, "rewards/margins": 3.3071228504180907, "rewards/rejected": -2.8721818923950195, "step": 17821 }, { "epoch": 0.9446373201176689, "grad_norm": 44.0, "kl": 3.1572036743164062, "learning_rate": 5e-07, "logits/chosen": -43710688.0, "logits/rejected": -5514144.5, "logps/chosen": -764.660400390625, "logps/rejected": -119.15872192382812, "loss": 0.1991, "rewards/chosen": 1.8728577295939128, "rewards/margins": 7.978989283243815, "rewards/rejected": -6.106131553649902, "step": 17822 }, { "epoch": 0.9446903241194711, "grad_norm": 46.0, "kl": 1.0120792388916016, "learning_rate": 5e-07, "logits/chosen": -36417395.2, "logits/rejected": -22628053.333333332, "logps/chosen": -277.840283203125, "logps/rejected": -146.5450439453125, "loss": 0.3543, "rewards/chosen": 0.6943605422973633, "rewards/margins": 3.65824457804362, "rewards/rejected": -2.9638840357462564, "step": 17823 }, { "epoch": 0.9447433281212732, "grad_norm": 31.75, "kl": 1.2898616790771484, "learning_rate": 5e-07, "logits/chosen": -39149661.333333336, "logits/rejected": -7068679.2, "logps/chosen": -248.3719482421875, "logps/rejected": -203.28590087890626, "loss": 0.2047, "rewards/chosen": 1.0475366115570068, "rewards/margins": 3.311498594284058, "rewards/rejected": -2.263961982727051, "step": 17824 }, { "epoch": 0.9447963321230753, "grad_norm": 65.5, "kl": 0.40389537811279297, "learning_rate": 5e-07, "logits/chosen": -2772415.2, "logits/rejected": -26778880.0, "logps/chosen": -191.54659423828124, "logps/rejected": -445.6499430338542, "loss": 0.2449, "rewards/chosen": 0.5693912506103516, "rewards/margins": 4.9481000900268555, "rewards/rejected": -4.378708839416504, "step": 17825 }, { "epoch": 0.9448493361248774, "grad_norm": 67.0, "kl": 1.1698951721191406, "learning_rate": 5e-07, "logits/chosen": -41396105.14285714, "logits/rejected": 8677074.0, "logps/chosen": -360.18990652901783, "logps/rejected": -82.40335083007812, "loss": 0.4523, "rewards/chosen": 0.24325200489589147, "rewards/margins": 1.1564432638032096, "rewards/rejected": -0.9131912589073181, "step": 17826 }, { "epoch": 0.9449023401266796, "grad_norm": 59.75, "kl": 0.5977096557617188, "learning_rate": 5e-07, "logits/chosen": -38818252.8, "logits/rejected": -45169658.666666664, "logps/chosen": -345.662451171875, "logps/rejected": -165.10260009765625, "loss": 0.3681, "rewards/chosen": 0.3680549144744873, "rewards/margins": 1.6553535143534344, "rewards/rejected": -1.287298599878947, "step": 17827 }, { "epoch": 0.9449553441284817, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11089885.6, "logits/rejected": -15859730.666666666, "logps/chosen": -224.1712646484375, "logps/rejected": -128.16278076171875, "loss": 0.2588, "rewards/chosen": 0.4490049839019775, "rewards/margins": 4.75233826637268, "rewards/rejected": -4.303333282470703, "step": 17828 }, { "epoch": 0.9450083481302838, "grad_norm": 46.5, "kl": 1.1358699798583984, "learning_rate": 5e-07, "logits/chosen": -20032032.0, "logits/rejected": -25524886.0, "logps/chosen": -428.5086975097656, "logps/rejected": -359.63140869140625, "loss": 0.2955, "rewards/chosen": 0.3144151568412781, "rewards/margins": 2.896114766597748, "rewards/rejected": -2.5816996097564697, "step": 17829 }, { "epoch": 0.945061352132086, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 16607394.0, "logits/rejected": -41584036.571428575, "logps/chosen": -81.2420654296875, "logps/rejected": -382.9374302455357, "loss": 0.2592, "rewards/chosen": 0.3031112849712372, "rewards/margins": 2.201201919998441, "rewards/rejected": -1.8980906350272042, "step": 17830 }, { "epoch": 0.9451143561338881, "grad_norm": 58.5, "kl": 0.4050941467285156, "learning_rate": 5e-07, "logits/chosen": -27409892.57142857, "logits/rejected": -49648080.0, "logps/chosen": -253.95947265625, "logps/rejected": -380.8048400878906, "loss": 0.3864, "rewards/chosen": 0.36523679324558805, "rewards/margins": 2.6492181164877757, "rewards/rejected": -2.2839813232421875, "step": 17831 }, { "epoch": 0.9451673601356902, "grad_norm": 37.0, "kl": 2.285858154296875, "learning_rate": 5e-07, "logits/chosen": 1516855.25, "logits/rejected": -13560057.333333334, "logps/chosen": -242.19183349609375, "logps/rejected": -246.22993977864584, "loss": 0.0992, "rewards/chosen": 1.595678687095642, "rewards/margins": 5.3409452835718785, "rewards/rejected": -3.745266596476237, "step": 17832 }, { "epoch": 0.9452203641374923, "grad_norm": 53.0, "kl": 0.11095809936523438, "learning_rate": 5e-07, "logits/chosen": -38518692.0, "logits/rejected": -60606976.0, "logps/chosen": -250.05422973632812, "logps/rejected": -359.3858642578125, "loss": 0.2376, "rewards/chosen": 0.6241485476493835, "rewards/margins": 3.3698524832725525, "rewards/rejected": -2.745703935623169, "step": 17833 }, { "epoch": 0.9452733681392945, "grad_norm": 31.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5784102.5, "logits/rejected": -42227416.0, "logps/chosen": -351.08087158203125, "logps/rejected": -505.1493733723958, "loss": 0.1211, "rewards/chosen": 1.610345482826233, "rewards/margins": 5.192653934160868, "rewards/rejected": -3.5823084513346353, "step": 17834 }, { "epoch": 0.9453263721410966, "grad_norm": 67.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58456460.8, "logits/rejected": 17671604.0, "logps/chosen": -434.020068359375, "logps/rejected": -181.38496907552084, "loss": 0.308, "rewards/chosen": 0.4043585777282715, "rewards/margins": 2.957692654927572, "rewards/rejected": -2.5533340771993003, "step": 17835 }, { "epoch": 0.9453793761428988, "grad_norm": 52.75, "kl": 1.5857963562011719, "learning_rate": 5e-07, "logits/chosen": -33454760.0, "logits/rejected": -31240096.0, "logps/chosen": -192.05582682291666, "logps/rejected": -356.2467529296875, "loss": 0.2459, "rewards/chosen": 0.4280006488164266, "rewards/margins": 2.956480606396993, "rewards/rejected": -2.5284799575805663, "step": 17836 }, { "epoch": 0.9454323801447009, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28836979.2, "logits/rejected": -20404548.0, "logps/chosen": -389.17099609375, "logps/rejected": -217.62113444010416, "loss": 0.2398, "rewards/chosen": 0.9170049667358399, "rewards/margins": 4.10339552561442, "rewards/rejected": -3.1863905588785806, "step": 17837 }, { "epoch": 0.9454853841465031, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28752350.0, "logits/rejected": -2625488.0, "logps/chosen": -335.21343994140625, "logps/rejected": -333.2705383300781, "loss": 0.3177, "rewards/chosen": 0.334385484457016, "rewards/margins": 2.0068739354610443, "rewards/rejected": -1.6724884510040283, "step": 17838 }, { "epoch": 0.9455383881483052, "grad_norm": 50.0, "kl": 0.3682975769042969, "learning_rate": 5e-07, "logits/chosen": -12850916.0, "logits/rejected": -13557250.666666666, "logps/chosen": -385.60565185546875, "logps/rejected": -256.2274576822917, "loss": 0.1866, "rewards/chosen": 0.8667333722114563, "rewards/margins": 3.439049859841665, "rewards/rejected": -2.5723164876302085, "step": 17839 }, { "epoch": 0.9455913921501073, "grad_norm": 61.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27735762.666666668, "logits/rejected": -10633880.0, "logps/chosen": -367.3658854166667, "logps/rejected": -377.6786376953125, "loss": 0.2959, "rewards/chosen": 0.40152589480082196, "rewards/margins": 2.922129456202189, "rewards/rejected": -2.520603561401367, "step": 17840 }, { "epoch": 0.9456443961519094, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24252634.0, "logits/rejected": -47459850.666666664, "logps/chosen": -479.3079528808594, "logps/rejected": -278.5896402994792, "loss": 0.1115, "rewards/chosen": 2.5343689918518066, "rewards/margins": 5.3603196144104, "rewards/rejected": -2.8259506225585938, "step": 17841 }, { "epoch": 0.9456974001537116, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48438664.0, "logits/rejected": 86900474.66666667, "logps/chosen": -104.58446502685547, "logps/rejected": -153.9166463216146, "loss": 0.1944, "rewards/chosen": 0.14738665521144867, "rewards/margins": 2.7817480812470117, "rewards/rejected": -2.634361426035563, "step": 17842 }, { "epoch": 0.9457504041555137, "grad_norm": 43.5, "kl": 1.513031005859375, "learning_rate": 5e-07, "logits/chosen": -19949988.8, "logits/rejected": -13530076.0, "logps/chosen": -200.39736328125, "logps/rejected": -788.1676432291666, "loss": 0.3048, "rewards/chosen": 0.25384066104888914, "rewards/margins": 4.184516803423564, "rewards/rejected": -3.9306761423746743, "step": 17843 }, { "epoch": 0.9458034081573159, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9735844.0, "logits/rejected": -9668173.0, "logps/chosen": -217.65548706054688, "logps/rejected": -403.1030578613281, "loss": 0.2475, "rewards/chosen": 0.3681908845901489, "rewards/margins": 3.6973220109939575, "rewards/rejected": -3.3291311264038086, "step": 17844 }, { "epoch": 0.945856412159118, "grad_norm": 87.0, "kl": 3.531719207763672, "learning_rate": 5e-07, "logits/chosen": -18685249.6, "logits/rejected": -179531.33333333334, "logps/chosen": -191.0650634765625, "logps/rejected": -70.7864481608073, "loss": 0.3148, "rewards/chosen": 1.1189226150512694, "rewards/margins": 3.5530933380126952, "rewards/rejected": -2.434170722961426, "step": 17845 }, { "epoch": 0.9459094161609202, "grad_norm": 163.0, "kl": 3.0128135681152344, "learning_rate": 5e-07, "logits/chosen": -61461568.0, "logits/rejected": -14510400.0, "logps/chosen": -366.0066731770833, "logps/rejected": -253.0620361328125, "loss": 0.2525, "rewards/chosen": 1.5500335693359375, "rewards/margins": 3.272395706176758, "rewards/rejected": -1.7223621368408204, "step": 17846 }, { "epoch": 0.9459624201627223, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9437017.6, "logits/rejected": -22635525.333333332, "logps/chosen": -95.31681518554687, "logps/rejected": -268.4458414713542, "loss": 0.4328, "rewards/chosen": -0.2862383842468262, "rewards/margins": 2.081551202138265, "rewards/rejected": -2.3677895863850913, "step": 17847 }, { "epoch": 0.9460154241645244, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68233779.2, "logits/rejected": -18777104.0, "logps/chosen": -443.3099609375, "logps/rejected": -530.4564615885416, "loss": 0.297, "rewards/chosen": 0.33153715133666994, "rewards/margins": 2.8693768819173178, "rewards/rejected": -2.537839730580648, "step": 17848 }, { "epoch": 0.9460684281663265, "grad_norm": 69.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -786693.0, "logits/rejected": 2038846.4, "logps/chosen": -228.93070475260416, "logps/rejected": -355.4046142578125, "loss": 0.2778, "rewards/chosen": -0.025262196858723957, "rewards/margins": 2.0433220545450843, "rewards/rejected": -2.0685842514038084, "step": 17849 }, { "epoch": 0.9461214321681287, "grad_norm": 30.875, "kl": 1.3929424285888672, "learning_rate": 5e-07, "logits/chosen": -13467385.6, "logits/rejected": -33136413.333333332, "logps/chosen": -293.6037353515625, "logps/rejected": -428.808837890625, "loss": 0.2927, "rewards/chosen": 0.7163353443145752, "rewards/margins": 4.043901491165161, "rewards/rejected": -3.327566146850586, "step": 17850 }, { "epoch": 0.9461744361699308, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77303898.66666667, "logits/rejected": -29307136.0, "logps/chosen": -497.2410074869792, "logps/rejected": -127.090234375, "loss": 0.3129, "rewards/chosen": 0.36033324400583905, "rewards/margins": 2.08932208220164, "rewards/rejected": -1.7289888381958007, "step": 17851 }, { "epoch": 0.946227440171733, "grad_norm": 69.5, "kl": 4.897499084472656, "learning_rate": 5e-07, "logits/chosen": -8212059.333333333, "logits/rejected": -39771908.0, "logps/chosen": -426.4280192057292, "logps/rejected": -203.05906677246094, "loss": 0.3371, "rewards/chosen": 0.7349789142608643, "rewards/margins": 5.0560362339019775, "rewards/rejected": -4.321057319641113, "step": 17852 }, { "epoch": 0.9462804441735351, "grad_norm": 49.25, "kl": 2.0445632934570312, "learning_rate": 5e-07, "logits/chosen": -57495530.666666664, "logits/rejected": -43082790.4, "logps/chosen": -501.1173502604167, "logps/rejected": -406.433544921875, "loss": 0.2389, "rewards/chosen": 0.22633771101633707, "rewards/margins": 2.474307910601298, "rewards/rejected": -2.247970199584961, "step": 17853 }, { "epoch": 0.9463334481753373, "grad_norm": 59.25, "kl": 2.165281295776367, "learning_rate": 5e-07, "logits/chosen": -43690121.14285714, "logits/rejected": 9297158.0, "logps/chosen": -398.82481166294644, "logps/rejected": -64.09494018554688, "loss": 0.4892, "rewards/chosen": 0.3826836517878941, "rewards/margins": 0.39901589628841194, "rewards/rejected": -0.016332244500517845, "step": 17854 }, { "epoch": 0.9463864521771393, "grad_norm": 48.75, "kl": 0.11417388916015625, "learning_rate": 5e-07, "logits/chosen": -93165600.0, "logits/rejected": -26194968.0, "logps/chosen": -475.5603515625, "logps/rejected": -217.5633748372396, "loss": 0.291, "rewards/chosen": 0.6648725986480712, "rewards/margins": 3.7026981830596926, "rewards/rejected": -3.037825584411621, "step": 17855 }, { "epoch": 0.9464394561789415, "grad_norm": 46.0, "kl": 0.597773551940918, "learning_rate": 5e-07, "logits/chosen": -24807733.333333332, "logits/rejected": -98603280.0, "logps/chosen": -316.22776285807294, "logps/rejected": -284.4400329589844, "loss": 0.3486, "rewards/chosen": 0.2769918441772461, "rewards/margins": 4.725597858428955, "rewards/rejected": -4.448606014251709, "step": 17856 }, { "epoch": 0.9464924601807436, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68118392.0, "logits/rejected": -26978406.0, "logps/chosen": -264.6639099121094, "logps/rejected": -245.135009765625, "loss": 0.3042, "rewards/chosen": -0.09341393411159515, "rewards/margins": 2.9574388414621353, "rewards/rejected": -3.0508527755737305, "step": 17857 }, { "epoch": 0.9465454641825458, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35208906.666666664, "logits/rejected": -37248332.8, "logps/chosen": -819.3694661458334, "logps/rejected": -245.632470703125, "loss": 0.184, "rewards/chosen": 1.4977224667867024, "rewards/margins": 4.007440408070882, "rewards/rejected": -2.5097179412841797, "step": 17858 }, { "epoch": 0.9465984681843479, "grad_norm": 57.5, "kl": 1.0846672058105469, "learning_rate": 5e-07, "logits/chosen": -27887884.8, "logits/rejected": -38757194.666666664, "logps/chosen": -325.3048583984375, "logps/rejected": -324.2248942057292, "loss": 0.3482, "rewards/chosen": 0.07638276815414428, "rewards/margins": 3.363915574550629, "rewards/rejected": -3.2875328063964844, "step": 17859 }, { "epoch": 0.9466514721861501, "grad_norm": 44.75, "kl": 1.1504936218261719, "learning_rate": 5e-07, "logits/chosen": -28905840.0, "logits/rejected": -21416925.333333332, "logps/chosen": -286.4988708496094, "logps/rejected": -271.54364013671875, "loss": 0.1712, "rewards/chosen": 0.9958438873291016, "rewards/margins": 3.788540840148926, "rewards/rejected": -2.792696952819824, "step": 17860 }, { "epoch": 0.9467044761879522, "grad_norm": 55.5, "kl": 1.6406841278076172, "learning_rate": 5e-07, "logits/chosen": -2377293.25, "logits/rejected": 5195454.0, "logps/chosen": -292.8138427734375, "logps/rejected": -204.5782012939453, "loss": 0.3176, "rewards/chosen": 0.40190619230270386, "rewards/margins": 2.6189090609550476, "rewards/rejected": -2.2170028686523438, "step": 17861 }, { "epoch": 0.9467574801897544, "grad_norm": 55.5, "kl": 1.0594253540039062, "learning_rate": 5e-07, "logits/chosen": -14707852.0, "logits/rejected": 846724.25, "logps/chosen": -496.867919921875, "logps/rejected": -352.720947265625, "loss": 0.2473, "rewards/chosen": 0.7458093762397766, "rewards/margins": 3.958371937274933, "rewards/rejected": -3.2125625610351562, "step": 17862 }, { "epoch": 0.9468104841915564, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34593264.0, "logits/rejected": 874240.8125, "logps/chosen": -403.4169108072917, "logps/rejected": -276.1226806640625, "loss": 0.3145, "rewards/chosen": 0.6132936875025431, "rewards/margins": 4.338403741518657, "rewards/rejected": -3.7251100540161133, "step": 17863 }, { "epoch": 0.9468634881933586, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37296464.0, "logits/rejected": -43640792.0, "logps/chosen": -205.71771240234375, "logps/rejected": -274.06886800130206, "loss": 0.2193, "rewards/chosen": -0.2282697707414627, "rewards/margins": 2.4280552516380944, "rewards/rejected": -2.656325022379557, "step": 17864 }, { "epoch": 0.9469164921951607, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28896800.0, "logits/rejected": -22273281.333333332, "logps/chosen": -303.4324462890625, "logps/rejected": -344.7584635416667, "loss": 0.2996, "rewards/chosen": 0.2274242162704468, "rewards/margins": 3.293520522117615, "rewards/rejected": -3.066096305847168, "step": 17865 }, { "epoch": 0.9469694961969629, "grad_norm": 30.375, "kl": 3.0053634643554688, "learning_rate": 5e-07, "logits/chosen": -8134420.0, "logits/rejected": -15235513.6, "logps/chosen": -231.507568359375, "logps/rejected": -189.5202880859375, "loss": 0.2101, "rewards/chosen": 0.6829857031504313, "rewards/margins": 3.421877876917521, "rewards/rejected": -2.73889217376709, "step": 17866 }, { "epoch": 0.947022500198765, "grad_norm": 55.75, "kl": 6.764773368835449, "learning_rate": 5e-07, "logits/chosen": -51764112.0, "logits/rejected": -50137802.666666664, "logps/chosen": -332.78515625, "logps/rejected": -619.7764485677084, "loss": 0.3001, "rewards/chosen": 0.8825127601623535, "rewards/margins": 4.47885897954305, "rewards/rejected": -3.5963462193806968, "step": 17867 }, { "epoch": 0.9470755042005672, "grad_norm": 63.75, "kl": 1.8435325622558594, "learning_rate": 5e-07, "logits/chosen": -32206354.285714287, "logits/rejected": -1513098.5, "logps/chosen": -349.79373604910717, "logps/rejected": -85.60002136230469, "loss": 0.4016, "rewards/chosen": 0.49276930945260183, "rewards/margins": 3.7611935819898332, "rewards/rejected": -3.2684242725372314, "step": 17868 }, { "epoch": 0.9471285082023693, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84939776.0, "logits/rejected": -21596131.2, "logps/chosen": -216.05452473958334, "logps/rejected": -259.733349609375, "loss": 0.1954, "rewards/chosen": 1.2947503725687664, "rewards/margins": 3.1432843844095864, "rewards/rejected": -1.8485340118408202, "step": 17869 }, { "epoch": 0.9471815122041715, "grad_norm": 37.0, "kl": 0.7417154312133789, "learning_rate": 5e-07, "logits/chosen": -12291801.333333334, "logits/rejected": -51727008.0, "logps/chosen": -66.65046691894531, "logps/rejected": -263.228466796875, "loss": 0.2374, "rewards/chosen": 0.6781987349192301, "rewards/margins": 2.7661885420481362, "rewards/rejected": -2.0879898071289062, "step": 17870 }, { "epoch": 0.9472345162059735, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33549011.2, "logits/rejected": -34926544.0, "logps/chosen": -345.430712890625, "logps/rejected": -472.406494140625, "loss": 0.3101, "rewards/chosen": 0.24354615211486816, "rewards/margins": 3.1985854943593344, "rewards/rejected": -2.9550393422444663, "step": 17871 }, { "epoch": 0.9472875202077757, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36930288.0, "logits/rejected": -24204216.0, "logps/chosen": -339.0567626953125, "logps/rejected": -312.8921203613281, "loss": 0.3229, "rewards/chosen": 0.23809850215911865, "rewards/margins": 1.6929489374160767, "rewards/rejected": -1.454850435256958, "step": 17872 }, { "epoch": 0.9473405242095778, "grad_norm": 40.25, "kl": 0.371429443359375, "learning_rate": 5e-07, "logits/chosen": 30024061.333333332, "logits/rejected": -8520473.6, "logps/chosen": -370.3658040364583, "logps/rejected": -583.4939453125, "loss": 0.1657, "rewards/chosen": 0.753613551457723, "rewards/margins": 4.473288234074911, "rewards/rejected": -3.7196746826171876, "step": 17873 }, { "epoch": 0.94739352821138, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78795936.0, "logits/rejected": -10010898.285714285, "logps/chosen": -278.6082763671875, "logps/rejected": -275.35367257254467, "loss": 0.2442, "rewards/chosen": 1.0337707996368408, "rewards/margins": 2.5297622340066095, "rewards/rejected": -1.4959914343697684, "step": 17874 }, { "epoch": 0.9474465322131821, "grad_norm": 29.625, "kl": 0.596348762512207, "learning_rate": 5e-07, "logits/chosen": 9448107.333333334, "logits/rejected": -31561203.2, "logps/chosen": -148.7339070638021, "logps/rejected": -290.0662109375, "loss": 0.1984, "rewards/chosen": 1.1380284627278645, "rewards/margins": 3.703345044453939, "rewards/rejected": -2.565316581726074, "step": 17875 }, { "epoch": 0.9474995362149843, "grad_norm": 37.5, "kl": 1.734386920928955, "learning_rate": 5e-07, "logits/chosen": -12759805.333333334, "logits/rejected": -5677304.5, "logps/chosen": -307.0729573567708, "logps/rejected": -132.0645294189453, "loss": 0.2638, "rewards/chosen": 0.9812410672505697, "rewards/margins": 7.112253983815511, "rewards/rejected": -6.131012916564941, "step": 17876 }, { "epoch": 0.9475525402167864, "grad_norm": 63.5, "kl": 0.8903293609619141, "learning_rate": 5e-07, "logits/chosen": -67953049.6, "logits/rejected": -40988528.0, "logps/chosen": -236.3906982421875, "logps/rejected": -287.2123209635417, "loss": 0.3553, "rewards/chosen": 0.13572471141815184, "rewards/margins": 2.0934640645980833, "rewards/rejected": -1.9577393531799316, "step": 17877 }, { "epoch": 0.9476055442185886, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37584392.0, "logits/rejected": -26560938.0, "logps/chosen": -196.08709716796875, "logps/rejected": -242.2062225341797, "loss": 0.2865, "rewards/chosen": 0.3816007673740387, "rewards/margins": 2.5167151987552643, "rewards/rejected": -2.1351144313812256, "step": 17878 }, { "epoch": 0.9476585482203906, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -91369568.0, "logits/rejected": -1702418.5, "logps/chosen": -246.47021484375, "logps/rejected": -532.801513671875, "loss": 0.2315, "rewards/chosen": 0.4156084358692169, "rewards/margins": 4.424892455339432, "rewards/rejected": -4.009284019470215, "step": 17879 }, { "epoch": 0.9477115522221927, "grad_norm": 44.25, "kl": 0.8735942840576172, "learning_rate": 5e-07, "logits/chosen": -53445184.0, "logits/rejected": -12893048.0, "logps/chosen": -253.27630615234375, "logps/rejected": -251.61807250976562, "loss": 0.261, "rewards/chosen": 0.4604601263999939, "rewards/margins": 2.7432445883750916, "rewards/rejected": -2.2827844619750977, "step": 17880 }, { "epoch": 0.9477645562239949, "grad_norm": 57.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 10187528.0, "logits/rejected": 8228859.2, "logps/chosen": -319.4732666015625, "logps/rejected": -412.03427734375, "loss": 0.2388, "rewards/chosen": 0.4621616204579671, "rewards/margins": 2.5332950433095296, "rewards/rejected": -2.0711334228515623, "step": 17881 }, { "epoch": 0.947817560225797, "grad_norm": 33.75, "kl": 0.7923622131347656, "learning_rate": 5e-07, "logits/chosen": -4158465.2, "logits/rejected": -30730893.333333332, "logps/chosen": -69.878466796875, "logps/rejected": -444.4876302083333, "loss": 0.3111, "rewards/chosen": 0.18436591625213622, "rewards/margins": 3.6780433734258016, "rewards/rejected": -3.4936774571736655, "step": 17882 }, { "epoch": 0.9478705642275992, "grad_norm": 54.5, "kl": 0.29801177978515625, "learning_rate": 5e-07, "logits/chosen": -35509360.0, "logits/rejected": 28273496.0, "logps/chosen": -339.2850646972656, "logps/rejected": -321.9109802246094, "loss": 0.2782, "rewards/chosen": 0.727587103843689, "rewards/margins": 2.6175488233566284, "rewards/rejected": -1.8899617195129395, "step": 17883 }, { "epoch": 0.9479235682294013, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29978276.0, "logits/rejected": -41373620.0, "logps/chosen": -315.3020935058594, "logps/rejected": -479.0060119628906, "loss": 0.1783, "rewards/chosen": 0.7901430130004883, "rewards/margins": 4.103857517242432, "rewards/rejected": -3.3137145042419434, "step": 17884 }, { "epoch": 0.9479765722312035, "grad_norm": 60.5, "kl": 0.6243801116943359, "learning_rate": 5e-07, "logits/chosen": -23308043.2, "logits/rejected": -916264.0, "logps/chosen": -431.48251953125, "logps/rejected": -285.4491780598958, "loss": 0.2362, "rewards/chosen": 0.7430028438568115, "rewards/margins": 4.480999263127645, "rewards/rejected": -3.7379964192708335, "step": 17885 }, { "epoch": 0.9480295762330055, "grad_norm": 47.5, "kl": 1.5728683471679688, "learning_rate": 5e-07, "logits/chosen": -32348300.0, "logits/rejected": 14577045.0, "logps/chosen": -296.65106201171875, "logps/rejected": -293.3184814453125, "loss": 0.2858, "rewards/chosen": 0.47617101669311523, "rewards/margins": 2.2905551195144653, "rewards/rejected": -1.81438410282135, "step": 17886 }, { "epoch": 0.9480825802348077, "grad_norm": 29.25, "kl": 7.475858688354492, "learning_rate": 5e-07, "logits/chosen": -28669612.8, "logits/rejected": -37351621.333333336, "logps/chosen": -310.2046875, "logps/rejected": -231.41666666666666, "loss": 0.2664, "rewards/chosen": 1.4407628059387207, "rewards/margins": 4.500212955474853, "rewards/rejected": -3.059450149536133, "step": 17887 }, { "epoch": 0.9481355842366098, "grad_norm": 34.75, "kl": 7.472810745239258, "learning_rate": 5e-07, "logits/chosen": -5879051.2, "logits/rejected": -20682170.666666668, "logps/chosen": -217.3912353515625, "logps/rejected": -372.6058756510417, "loss": 0.3058, "rewards/chosen": 1.2624323844909668, "rewards/margins": 3.571706740061442, "rewards/rejected": -2.309274355570475, "step": 17888 }, { "epoch": 0.948188588238412, "grad_norm": 50.25, "kl": 0.4243659973144531, "learning_rate": 5e-07, "logits/chosen": -53527888.0, "logits/rejected": -7161039.2, "logps/chosen": -372.4576009114583, "logps/rejected": -182.8809326171875, "loss": 0.3006, "rewards/chosen": -0.15053507685661316, "rewards/margins": 2.148805171251297, "rewards/rejected": -2.29934024810791, "step": 17889 }, { "epoch": 0.9482415922402141, "grad_norm": 67.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11811014.4, "logits/rejected": -17688736.0, "logps/chosen": -348.3520263671875, "logps/rejected": -288.946044921875, "loss": 0.3722, "rewards/chosen": 0.08117890357971191, "rewards/margins": 2.163820664087931, "rewards/rejected": -2.0826417605082193, "step": 17890 }, { "epoch": 0.9482945962420163, "grad_norm": 43.75, "kl": 0.0017242431640625, "learning_rate": 5e-07, "logits/chosen": 9430009.0, "logits/rejected": -11545132.0, "logps/chosen": -191.69796752929688, "logps/rejected": -188.97298177083334, "loss": 0.2487, "rewards/chosen": 1.6635797023773193, "rewards/margins": 3.1146358648935957, "rewards/rejected": -1.4510561625162761, "step": 17891 }, { "epoch": 0.9483476002438184, "grad_norm": 35.5, "kl": 0.6537857055664062, "learning_rate": 5e-07, "logits/chosen": -16646635.0, "logits/rejected": -22524508.0, "logps/chosen": -171.8401336669922, "logps/rejected": -265.4893493652344, "loss": 0.2434, "rewards/chosen": 0.4826844334602356, "rewards/margins": 3.2532618641853333, "rewards/rejected": -2.7705774307250977, "step": 17892 }, { "epoch": 0.9484006042456206, "grad_norm": 61.25, "kl": 2.6656742095947266, "learning_rate": 5e-07, "logits/chosen": -29224154.666666668, "logits/rejected": -41895392.0, "logps/chosen": -510.746337890625, "logps/rejected": -646.9881591796875, "loss": 0.2448, "rewards/chosen": 1.2754456202189128, "rewards/margins": 5.570489565531413, "rewards/rejected": -4.2950439453125, "step": 17893 }, { "epoch": 0.9484536082474226, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28173050.0, "logits/rejected": -2962945.5, "logps/chosen": -153.34939575195312, "logps/rejected": -353.869384765625, "loss": 0.2222, "rewards/chosen": 0.7942250967025757, "rewards/margins": 3.359838128089905, "rewards/rejected": -2.565613031387329, "step": 17894 }, { "epoch": 0.9485066122492248, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43806573.333333336, "logits/rejected": -6509184.0, "logps/chosen": -310.8033447265625, "logps/rejected": -108.59847259521484, "loss": 0.4202, "rewards/chosen": 0.2619415521621704, "rewards/margins": 0.9749605655670166, "rewards/rejected": -0.7130190134048462, "step": 17895 }, { "epoch": 0.9485596162510269, "grad_norm": 36.25, "kl": 3.2101545333862305, "learning_rate": 5e-07, "logits/chosen": -10123961.0, "logits/rejected": -44169260.0, "logps/chosen": -146.9984130859375, "logps/rejected": -659.7161865234375, "loss": 0.2142, "rewards/chosen": 0.964502215385437, "rewards/margins": 4.680592656135559, "rewards/rejected": -3.716090440750122, "step": 17896 }, { "epoch": 0.9486126202528291, "grad_norm": 53.25, "kl": 0.1987152099609375, "learning_rate": 5e-07, "logits/chosen": -36884608.0, "logits/rejected": -60961898.666666664, "logps/chosen": -455.46435546875, "logps/rejected": -93.49251302083333, "loss": 0.3138, "rewards/chosen": 0.7629894733428955, "rewards/margins": 2.956396532058716, "rewards/rejected": -2.1934070587158203, "step": 17897 }, { "epoch": 0.9486656242546312, "grad_norm": 52.75, "kl": 1.9103240966796875, "learning_rate": 5e-07, "logits/chosen": -24756446.4, "logits/rejected": 5413694.0, "logps/chosen": -386.506298828125, "logps/rejected": -97.71273803710938, "loss": 0.3561, "rewards/chosen": 0.3670715093612671, "rewards/margins": 3.7580748001734414, "rewards/rejected": -3.3910032908121743, "step": 17898 }, { "epoch": 0.9487186282564334, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1881122.75, "logits/rejected": -3983468.5, "logps/chosen": -259.41375732421875, "logps/rejected": -213.27835083007812, "loss": 0.3037, "rewards/chosen": -0.025360479950904846, "rewards/margins": 2.223306283354759, "rewards/rejected": -2.248666763305664, "step": 17899 }, { "epoch": 0.9487716322582355, "grad_norm": 30.875, "kl": 1.506117820739746, "learning_rate": 5e-07, "logits/chosen": 7395281.333333333, "logits/rejected": -31453305.6, "logps/chosen": -538.2152913411459, "logps/rejected": -312.72177734375, "loss": 0.1624, "rewards/chosen": 1.4694698651631672, "rewards/margins": 4.560077699025472, "rewards/rejected": -3.090607833862305, "step": 17900 }, { "epoch": 0.9488246362600377, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20840346.666666668, "logits/rejected": -57458816.0, "logps/chosen": -503.6380615234375, "logps/rejected": -171.5697479248047, "loss": 0.2962, "rewards/chosen": 1.14779798189799, "rewards/margins": 2.546687165896098, "rewards/rejected": -1.398889183998108, "step": 17901 }, { "epoch": 0.9488776402618397, "grad_norm": 48.75, "kl": 0.23611068725585938, "learning_rate": 5e-07, "logits/chosen": -24062220.0, "logits/rejected": -12914525.333333334, "logps/chosen": -287.1836853027344, "logps/rejected": -192.7029012044271, "loss": 0.2159, "rewards/chosen": -0.07330780476331711, "rewards/margins": 2.524637726445993, "rewards/rejected": -2.59794553120931, "step": 17902 }, { "epoch": 0.9489306442636419, "grad_norm": 53.5, "kl": 0.039147377014160156, "learning_rate": 5e-07, "logits/chosen": -52943219.2, "logits/rejected": -33391688.0, "logps/chosen": -384.8654052734375, "logps/rejected": -301.2481689453125, "loss": 0.3482, "rewards/chosen": 0.20023925304412843, "rewards/margins": 2.2406353076299035, "rewards/rejected": -2.040396054585775, "step": 17903 }, { "epoch": 0.948983648265444, "grad_norm": 50.25, "kl": 2.6993408203125, "learning_rate": 5e-07, "logits/chosen": -35373906.666666664, "logits/rejected": -36624784.0, "logps/chosen": -591.253173828125, "logps/rejected": -393.94805908203125, "loss": 0.289, "rewards/chosen": 1.206570307413737, "rewards/margins": 3.580991903940837, "rewards/rejected": -2.3744215965270996, "step": 17904 }, { "epoch": 0.9490366522672462, "grad_norm": 40.75, "kl": 1.5501985549926758, "learning_rate": 5e-07, "logits/chosen": -17714920.0, "logits/rejected": -25827522.0, "logps/chosen": -346.0931396484375, "logps/rejected": -250.06341552734375, "loss": 0.2282, "rewards/chosen": 0.7404378056526184, "rewards/margins": 3.310429871082306, "rewards/rejected": -2.5699920654296875, "step": 17905 }, { "epoch": 0.9490896562690483, "grad_norm": 29.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11439189.0, "logits/rejected": -37195666.666666664, "logps/chosen": -64.79827880859375, "logps/rejected": -286.14422607421875, "loss": 0.1373, "rewards/chosen": 0.8598430752754211, "rewards/margins": 3.8285444378852844, "rewards/rejected": -2.9687013626098633, "step": 17906 }, { "epoch": 0.9491426602708505, "grad_norm": 49.5, "kl": 0.8564071655273438, "learning_rate": 5e-07, "logits/chosen": -8268995.0, "logits/rejected": -1436320.0, "logps/chosen": -165.5742950439453, "logps/rejected": -590.8661499023438, "loss": 0.2238, "rewards/chosen": 0.6712250709533691, "rewards/margins": 3.7945361137390137, "rewards/rejected": -3.1233110427856445, "step": 17907 }, { "epoch": 0.9491956642726526, "grad_norm": 39.5, "kl": 2.0747337341308594, "learning_rate": 5e-07, "logits/chosen": -37142149.333333336, "logits/rejected": -39665968.0, "logps/chosen": -258.5638427734375, "logps/rejected": -216.8757080078125, "loss": 0.2104, "rewards/chosen": 0.53556227684021, "rewards/margins": 3.1311368465423586, "rewards/rejected": -2.5955745697021486, "step": 17908 }, { "epoch": 0.9492486682744548, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4404060.5, "logits/rejected": -17659849.333333332, "logps/chosen": -37.630775451660156, "logps/rejected": -287.47735595703125, "loss": 0.2289, "rewards/chosen": -0.5230875015258789, "rewards/margins": 2.272022247314453, "rewards/rejected": -2.795109748840332, "step": 17909 }, { "epoch": 0.9493016722762568, "grad_norm": 53.25, "kl": 1.2131500244140625, "learning_rate": 5e-07, "logits/chosen": -51769530.666666664, "logits/rejected": -16275950.0, "logps/chosen": -596.317626953125, "logps/rejected": -122.828125, "loss": 0.3001, "rewards/chosen": 0.7663315931955973, "rewards/margins": 6.861955563227336, "rewards/rejected": -6.095623970031738, "step": 17910 }, { "epoch": 0.949354676278059, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55720645.333333336, "logits/rejected": -21982000.0, "logps/chosen": -451.8563639322917, "logps/rejected": -207.99594116210938, "loss": 0.2506, "rewards/chosen": 1.1326027711232503, "rewards/margins": 4.166437705357869, "rewards/rejected": -3.033834934234619, "step": 17911 }, { "epoch": 0.9494076802798611, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11977605.0, "logits/rejected": -11517228.57142857, "logps/chosen": -296.09381103515625, "logps/rejected": -290.51773507254467, "loss": 0.0777, "rewards/chosen": 2.426556348800659, "rewards/margins": 5.208440950938634, "rewards/rejected": -2.7818846021379744, "step": 17912 }, { "epoch": 0.9494606842816633, "grad_norm": 48.75, "kl": 1.3458147048950195, "learning_rate": 5e-07, "logits/chosen": -48250986.666666664, "logits/rejected": -2897500.5, "logps/chosen": -398.3212483723958, "logps/rejected": -281.7577209472656, "loss": 0.3118, "rewards/chosen": 1.0075594584147136, "rewards/margins": 2.3265093962351484, "rewards/rejected": -1.3189499378204346, "step": 17913 }, { "epoch": 0.9495136882834654, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1738778.1666666667, "logits/rejected": -56183308.8, "logps/chosen": -115.6041971842448, "logps/rejected": -316.7659912109375, "loss": 0.2197, "rewards/chosen": 0.17948812246322632, "rewards/margins": 3.8399823546409606, "rewards/rejected": -3.6604942321777343, "step": 17914 }, { "epoch": 0.9495666922852676, "grad_norm": 58.5, "kl": 7.748329162597656, "learning_rate": 5e-07, "logits/chosen": -47193714.28571428, "logits/rejected": -407645.875, "logps/chosen": -560.2463727678571, "logps/rejected": -75.2108383178711, "loss": 0.4139, "rewards/chosen": 1.3059413092476981, "rewards/margins": 1.8740611331803458, "rewards/rejected": -0.5681198239326477, "step": 17915 }, { "epoch": 0.9496196962870697, "grad_norm": 58.25, "kl": 1.4918813705444336, "learning_rate": 5e-07, "logits/chosen": -41239506.666666664, "logits/rejected": -32881452.0, "logps/chosen": -275.944580078125, "logps/rejected": -242.0147705078125, "loss": 0.3189, "rewards/chosen": 0.6137281258900961, "rewards/margins": 4.526909192403157, "rewards/rejected": -3.9131810665130615, "step": 17916 }, { "epoch": 0.9496727002888719, "grad_norm": 41.0, "kl": 1.8755512237548828, "learning_rate": 5e-07, "logits/chosen": -18047102.4, "logits/rejected": -30673194.666666668, "logps/chosen": -453.51630859375, "logps/rejected": -579.4932454427084, "loss": 0.2224, "rewards/chosen": 1.2413573265075684, "rewards/margins": 4.96201753616333, "rewards/rejected": -3.7206602096557617, "step": 17917 }, { "epoch": 0.9497257042906739, "grad_norm": 52.75, "kl": 1.1583366394042969, "learning_rate": 5e-07, "logits/chosen": -6035364.0, "logits/rejected": -1597804.0, "logps/chosen": -210.08658854166666, "logps/rejected": -264.2951354980469, "loss": 0.3202, "rewards/chosen": 0.736967404683431, "rewards/margins": 3.330106337865194, "rewards/rejected": -2.5931389331817627, "step": 17918 }, { "epoch": 0.9497787082924761, "grad_norm": 63.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20393734.666666668, "logits/rejected": -1953872.625, "logps/chosen": -400.2825520833333, "logps/rejected": -111.06288146972656, "loss": 0.4202, "rewards/chosen": -0.1221300760904948, "rewards/margins": 2.809909184773763, "rewards/rejected": -2.932039260864258, "step": 17919 }, { "epoch": 0.9498317122942782, "grad_norm": 41.5, "kl": 0.8483619689941406, "learning_rate": 5e-07, "logits/chosen": -30187157.333333332, "logits/rejected": 34888960.0, "logps/chosen": -354.7970377604167, "logps/rejected": -847.9085083007812, "loss": 0.2785, "rewards/chosen": 1.111751874287923, "rewards/margins": 4.570009311040242, "rewards/rejected": -3.4582574367523193, "step": 17920 }, { "epoch": 0.9498847162960804, "grad_norm": 47.25, "kl": 2.409579277038574, "learning_rate": 5e-07, "logits/chosen": -43726996.0, "logits/rejected": -2220313.3333333335, "logps/chosen": -362.8994140625, "logps/rejected": -340.3443196614583, "loss": 0.2534, "rewards/chosen": 0.4063577651977539, "rewards/margins": 2.5148466428120932, "rewards/rejected": -2.1084888776143393, "step": 17921 }, { "epoch": 0.9499377202978825, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12853514.666666666, "logits/rejected": -2168598.0, "logps/chosen": -195.6145222981771, "logps/rejected": -525.583642578125, "loss": 0.2091, "rewards/chosen": 0.5213072697321574, "rewards/margins": 3.6717217365900674, "rewards/rejected": -3.15041446685791, "step": 17922 }, { "epoch": 0.9499907242996847, "grad_norm": 46.75, "kl": 0.8722057342529297, "learning_rate": 5e-07, "logits/chosen": -33364840.0, "logits/rejected": -2743478.4, "logps/chosen": -250.08866373697916, "logps/rejected": -577.16064453125, "loss": 0.2001, "rewards/chosen": 1.1306682427724202, "rewards/margins": 5.141565306981405, "rewards/rejected": -4.010897064208985, "step": 17923 }, { "epoch": 0.9500437283014868, "grad_norm": 46.5, "kl": 0.2881278991699219, "learning_rate": 5e-07, "logits/chosen": -23779685.333333332, "logits/rejected": -37015416.0, "logps/chosen": -104.38991292317708, "logps/rejected": -439.4603271484375, "loss": 0.4093, "rewards/chosen": -0.003611286481221517, "rewards/margins": 5.366001884142558, "rewards/rejected": -5.369613170623779, "step": 17924 }, { "epoch": 0.950096732303289, "grad_norm": 50.25, "kl": 0.11975669860839844, "learning_rate": 5e-07, "logits/chosen": -72664357.33333333, "logits/rejected": -26366102.4, "logps/chosen": -285.579345703125, "logps/rejected": -271.876953125, "loss": 0.2086, "rewards/chosen": 0.3673234780629476, "rewards/margins": 3.4172084649403893, "rewards/rejected": -3.0498849868774416, "step": 17925 }, { "epoch": 0.950149736305091, "grad_norm": 45.25, "kl": 5.2989501953125, "learning_rate": 5e-07, "logits/chosen": 5114793.6, "logits/rejected": -34207637.333333336, "logps/chosen": -99.55662231445312, "logps/rejected": -148.90869140625, "loss": 0.3829, "rewards/chosen": 0.8425745010375977, "rewards/margins": 3.3645134607950844, "rewards/rejected": -2.521938959757487, "step": 17926 }, { "epoch": 0.9502027403068932, "grad_norm": 36.75, "kl": 2.713918685913086, "learning_rate": 5e-07, "logits/chosen": -13570379.0, "logits/rejected": -43369666.666666664, "logps/chosen": -264.9918518066406, "logps/rejected": -390.5147298177083, "loss": 0.1758, "rewards/chosen": 1.3272922039031982, "rewards/margins": 3.215533971786499, "rewards/rejected": -1.8882417678833008, "step": 17927 }, { "epoch": 0.9502557443086953, "grad_norm": 28.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17879358.666666668, "logits/rejected": -66437856.0, "logps/chosen": -101.86789957682292, "logps/rejected": -436.011181640625, "loss": 0.1941, "rewards/chosen": 0.6332414150238037, "rewards/margins": 3.2384807109832763, "rewards/rejected": -2.6052392959594726, "step": 17928 }, { "epoch": 0.9503087483104975, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37719600.0, "logits/rejected": -26916864.0, "logps/chosen": -298.1847737630208, "logps/rejected": -265.9899169921875, "loss": 0.3475, "rewards/chosen": -0.051815807819366455, "rewards/margins": 1.1707359194755553, "rewards/rejected": -1.2225517272949218, "step": 17929 }, { "epoch": 0.9503617523122996, "grad_norm": 39.5, "kl": 2.5063209533691406, "learning_rate": 5e-07, "logits/chosen": 1530330.0, "logits/rejected": -5936151.333333333, "logps/chosen": -184.44163818359374, "logps/rejected": -129.40461222330728, "loss": 0.3745, "rewards/chosen": 0.3990943193435669, "rewards/margins": 1.823621964454651, "rewards/rejected": -1.424527645111084, "step": 17930 }, { "epoch": 0.9504147563141017, "grad_norm": 42.0, "kl": 0.5209274291992188, "learning_rate": 5e-07, "logits/chosen": -64164888.0, "logits/rejected": -34160429.333333336, "logps/chosen": -450.2361755371094, "logps/rejected": -441.1311848958333, "loss": 0.1687, "rewards/chosen": 0.37306976318359375, "rewards/margins": 3.8153206507364907, "rewards/rejected": -3.442250887552897, "step": 17931 }, { "epoch": 0.9504677603159039, "grad_norm": 36.75, "kl": 1.157212257385254, "learning_rate": 5e-07, "logits/chosen": 5724980.0, "logits/rejected": -24244042.666666668, "logps/chosen": -27.027772903442383, "logps/rejected": -472.845947265625, "loss": 0.1438, "rewards/chosen": 1.1216431856155396, "rewards/margins": 3.798244595527649, "rewards/rejected": -2.6766014099121094, "step": 17932 }, { "epoch": 0.9505207643177059, "grad_norm": 40.75, "kl": 0.3018684387207031, "learning_rate": 5e-07, "logits/chosen": -33106790.0, "logits/rejected": -37970664.0, "logps/chosen": -314.5047912597656, "logps/rejected": -264.44403076171875, "loss": 0.2101, "rewards/chosen": 0.7848966121673584, "rewards/margins": 4.7076287269592285, "rewards/rejected": -3.92273211479187, "step": 17933 }, { "epoch": 0.9505737683195081, "grad_norm": 36.25, "kl": 5.541871070861816, "learning_rate": 5e-07, "logits/chosen": -5848230.0, "logits/rejected": -67837416.0, "logps/chosen": -123.48638916015625, "logps/rejected": -583.083251953125, "loss": 0.3342, "rewards/chosen": -0.1166846752166748, "rewards/margins": 3.5141804218292236, "rewards/rejected": -3.6308650970458984, "step": 17934 }, { "epoch": 0.9506267723213102, "grad_norm": 52.75, "kl": 3.2096385955810547, "learning_rate": 5e-07, "logits/chosen": -40059939.2, "logits/rejected": -114642784.0, "logps/chosen": -787.389990234375, "logps/rejected": -345.5686848958333, "loss": 0.3909, "rewards/chosen": 0.8797432899475097, "rewards/margins": 2.141957950592041, "rewards/rejected": -1.2622146606445312, "step": 17935 }, { "epoch": 0.9506797763231124, "grad_norm": 59.25, "kl": 4.603351593017578, "learning_rate": 5e-07, "logits/chosen": -10601831.2, "logits/rejected": -40097538.666666664, "logps/chosen": -252.991357421875, "logps/rejected": -514.6247151692709, "loss": 0.3303, "rewards/chosen": 0.5682206153869629, "rewards/margins": 3.8010525703430176, "rewards/rejected": -3.2328319549560547, "step": 17936 }, { "epoch": 0.9507327803249145, "grad_norm": 40.5, "kl": 3.0357131958007812, "learning_rate": 5e-07, "logits/chosen": -28499462.4, "logits/rejected": 57783061.333333336, "logps/chosen": -469.54912109375, "logps/rejected": -564.8697509765625, "loss": 0.2515, "rewards/chosen": 1.3610013008117676, "rewards/margins": 4.202737140655517, "rewards/rejected": -2.84173583984375, "step": 17937 }, { "epoch": 0.9507857843267167, "grad_norm": 89.5, "kl": 1.6122493743896484, "learning_rate": 5e-07, "logits/chosen": 1985002.625, "logits/rejected": -119687800.0, "logps/chosen": -226.8531494140625, "logps/rejected": -474.88421630859375, "loss": 0.2827, "rewards/chosen": 0.9705349802970886, "rewards/margins": 2.8028157353401184, "rewards/rejected": -1.8322807550430298, "step": 17938 }, { "epoch": 0.9508387883285188, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20341816.0, "logits/rejected": -12554324.0, "logps/chosen": -227.61807250976562, "logps/rejected": -297.9979248046875, "loss": 0.2434, "rewards/chosen": 0.7977062463760376, "rewards/margins": 2.5583975315093994, "rewards/rejected": -1.7606912851333618, "step": 17939 }, { "epoch": 0.950891792330321, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10033020.0, "logits/rejected": -45908720.0, "logps/chosen": -207.10585021972656, "logps/rejected": -163.9388427734375, "loss": 0.2382, "rewards/chosen": 0.500630259513855, "rewards/margins": 3.3977426290512085, "rewards/rejected": -2.8971123695373535, "step": 17940 }, { "epoch": 0.950944796332123, "grad_norm": 42.0, "kl": 1.866668701171875, "learning_rate": 5e-07, "logits/chosen": 19399220.0, "logits/rejected": -54940665.6, "logps/chosen": -349.3143717447917, "logps/rejected": -363.213818359375, "loss": 0.2221, "rewards/chosen": 0.4237929979960124, "rewards/margins": 4.122332731882731, "rewards/rejected": -3.6985397338867188, "step": 17941 }, { "epoch": 0.9509978003339252, "grad_norm": 56.5, "kl": 3.464993476867676, "learning_rate": 5e-07, "logits/chosen": 2865627.714285714, "logits/rejected": -3674291.25, "logps/chosen": -487.65897042410717, "logps/rejected": -112.55431365966797, "loss": 0.2918, "rewards/chosen": 1.364187513078962, "rewards/margins": 5.742498193468366, "rewards/rejected": -4.378310680389404, "step": 17942 }, { "epoch": 0.9510508043357273, "grad_norm": 53.25, "kl": 5.1857757568359375, "learning_rate": 5e-07, "logits/chosen": -20630088.0, "logits/rejected": -10113346.0, "logps/chosen": -219.1147664388021, "logps/rejected": -200.34803771972656, "loss": 0.4294, "rewards/chosen": 0.5466769536336263, "rewards/margins": 1.730231006940206, "rewards/rejected": -1.1835540533065796, "step": 17943 }, { "epoch": 0.9511038083375295, "grad_norm": 53.0, "kl": 1.598358154296875, "learning_rate": 5e-07, "logits/chosen": -50830896.0, "logits/rejected": -19958954.0, "logps/chosen": -463.0836588541667, "logps/rejected": -151.11073303222656, "loss": 0.3337, "rewards/chosen": 0.9980215231577555, "rewards/margins": 2.385542551676432, "rewards/rejected": -1.3875210285186768, "step": 17944 }, { "epoch": 0.9511568123393316, "grad_norm": 42.0, "kl": 1.3277511596679688, "learning_rate": 5e-07, "logits/chosen": -130547840.0, "logits/rejected": -44177026.666666664, "logps/chosen": -224.6182373046875, "logps/rejected": -377.5088704427083, "loss": 0.2812, "rewards/chosen": 0.6411325931549072, "rewards/margins": 2.9102067152659097, "rewards/rejected": -2.2690741221110025, "step": 17945 }, { "epoch": 0.9512098163411338, "grad_norm": 57.25, "kl": 1.168696403503418, "learning_rate": 5e-07, "logits/chosen": -37313392.0, "logits/rejected": 3587878.25, "logps/chosen": -168.90631103515625, "logps/rejected": -87.83084869384766, "loss": 0.4173, "rewards/chosen": -0.19249525666236877, "rewards/margins": 1.4186863601207733, "rewards/rejected": -1.611181616783142, "step": 17946 }, { "epoch": 0.9512628203429359, "grad_norm": 33.0, "kl": 0.030445098876953125, "learning_rate": 5e-07, "logits/chosen": -30980698.666666668, "logits/rejected": -23419017.6, "logps/chosen": -131.61539713541666, "logps/rejected": -264.30869140625, "loss": 0.241, "rewards/chosen": 0.37479480107625324, "rewards/margins": 2.5230858167012533, "rewards/rejected": -2.148291015625, "step": 17947 }, { "epoch": 0.951315824344738, "grad_norm": 51.25, "kl": 0.234832763671875, "learning_rate": 5e-07, "logits/chosen": 43512115.2, "logits/rejected": -30339648.0, "logps/chosen": -258.0826416015625, "logps/rejected": -604.0283610026041, "loss": 0.3191, "rewards/chosen": 0.2378373384475708, "rewards/margins": 3.353809857368469, "rewards/rejected": -3.1159725189208984, "step": 17948 }, { "epoch": 0.9513688283465401, "grad_norm": 47.0, "kl": 5.393152236938477, "learning_rate": 5e-07, "logits/chosen": -35333385.6, "logits/rejected": -32235976.0, "logps/chosen": -688.310791015625, "logps/rejected": -340.2626953125, "loss": 0.2525, "rewards/chosen": 1.7224456787109375, "rewards/margins": 3.457042407989502, "rewards/rejected": -1.7345967292785645, "step": 17949 }, { "epoch": 0.9514218323483423, "grad_norm": 51.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27220926.0, "logits/rejected": -25040502.0, "logps/chosen": -399.4977111816406, "logps/rejected": -390.4341735839844, "loss": 0.2816, "rewards/chosen": 0.08831968158483505, "rewards/margins": 3.0828510746359825, "rewards/rejected": -2.9945313930511475, "step": 17950 }, { "epoch": 0.9514748363501444, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41496720.0, "logits/rejected": -6063626.5, "logps/chosen": -451.1624755859375, "logps/rejected": -193.577392578125, "loss": 0.2647, "rewards/chosen": 0.5042206048965454, "rewards/margins": 2.9297815561294556, "rewards/rejected": -2.42556095123291, "step": 17951 }, { "epoch": 0.9515278403519466, "grad_norm": 39.5, "kl": 0.5264091491699219, "learning_rate": 5e-07, "logits/chosen": 3064770.0, "logits/rejected": -12482632.0, "logps/chosen": -763.9613444010416, "logps/rejected": -217.9220703125, "loss": 0.1161, "rewards/chosen": 1.9499562581380208, "rewards/margins": 5.1876572926839195, "rewards/rejected": -3.2377010345458985, "step": 17952 }, { "epoch": 0.9515808443537487, "grad_norm": 81.0, "kl": 1.4163475036621094, "learning_rate": 5e-07, "logits/chosen": -54589180.0, "logps/chosen": -371.26165771484375, "loss": 0.4092, "rewards/chosen": 0.5639142990112305, "step": 17953 }, { "epoch": 0.9516338483555509, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22677350.0, "logits/rejected": -51502528.0, "logps/chosen": -500.8643798828125, "logps/rejected": -293.4674072265625, "loss": 0.1605, "rewards/chosen": 1.0160584449768066, "rewards/margins": 4.5199501514434814, "rewards/rejected": -3.503891706466675, "step": 17954 }, { "epoch": 0.951686852357353, "grad_norm": 63.75, "kl": 1.1460151672363281, "learning_rate": 5e-07, "logits/chosen": -25523778.666666668, "logits/rejected": -12272079.2, "logps/chosen": -340.96291097005206, "logps/rejected": -269.2348388671875, "loss": 0.2484, "rewards/chosen": 1.1660070419311523, "rewards/margins": 2.397732734680176, "rewards/rejected": -1.2317256927490234, "step": 17955 }, { "epoch": 0.9517398563591551, "grad_norm": 40.5, "kl": 1.6952543258666992, "learning_rate": 5e-07, "logits/chosen": 30400906.0, "logits/rejected": -33030592.0, "logps/chosen": -459.4076232910156, "logps/rejected": -319.5283203125, "loss": 0.1901, "rewards/chosen": 1.0858135223388672, "rewards/margins": 4.078437566757202, "rewards/rejected": -2.992624044418335, "step": 17956 }, { "epoch": 0.9517928603609572, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39996576.0, "logits/rejected": -32778480.0, "logps/chosen": -578.741796875, "logps/rejected": -447.9711100260417, "loss": 0.2933, "rewards/chosen": 0.7600549697875977, "rewards/margins": 3.6651809056599935, "rewards/rejected": -2.905125935872396, "step": 17957 }, { "epoch": 0.9518458643627594, "grad_norm": 36.5, "kl": 0.95440673828125, "learning_rate": 5e-07, "logits/chosen": 5321359.0, "logits/rejected": -15319710.666666666, "logps/chosen": -39.4897575378418, "logps/rejected": -146.59464518229166, "loss": 0.2305, "rewards/chosen": 0.13414619863033295, "rewards/margins": 2.36216368774573, "rewards/rejected": -2.228017489115397, "step": 17958 }, { "epoch": 0.9518988683645615, "grad_norm": 45.0, "kl": 0.029117584228515625, "learning_rate": 5e-07, "logits/chosen": -14405326.0, "logits/rejected": -14439253.0, "logps/chosen": -226.85511779785156, "logps/rejected": -229.30166625976562, "loss": 0.2293, "rewards/chosen": 0.8623722195625305, "rewards/margins": 3.014632761478424, "rewards/rejected": -2.1522605419158936, "step": 17959 }, { "epoch": 0.9519518723663637, "grad_norm": 46.25, "kl": 0.4191017150878906, "learning_rate": 5e-07, "logits/chosen": -43396360.0, "logits/rejected": -32627496.0, "logps/chosen": -503.6710205078125, "logps/rejected": -342.17669677734375, "loss": 0.2525, "rewards/chosen": 0.8819805383682251, "rewards/margins": 2.5416263341903687, "rewards/rejected": -1.6596457958221436, "step": 17960 }, { "epoch": 0.9520048763681658, "grad_norm": 39.75, "kl": 0.5630569458007812, "learning_rate": 5e-07, "logits/chosen": -45588992.0, "logits/rejected": -33251472.0, "logps/chosen": -356.4089050292969, "logps/rejected": -485.08184814453125, "loss": 0.1589, "rewards/chosen": 1.099243402481079, "rewards/margins": 5.322134733200073, "rewards/rejected": -4.222891330718994, "step": 17961 }, { "epoch": 0.952057880369968, "grad_norm": 63.25, "kl": 2.901569366455078, "learning_rate": 5e-07, "logits/chosen": -12692893.6, "logits/rejected": -14765161.333333334, "logps/chosen": -327.039892578125, "logps/rejected": -315.7803548177083, "loss": 0.4174, "rewards/chosen": 0.24339356422424316, "rewards/margins": 1.6788051764170329, "rewards/rejected": -1.4354116121927898, "step": 17962 }, { "epoch": 0.95211088437177, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29850020.0, "logits/rejected": -17144362.0, "logps/chosen": -173.08761596679688, "logps/rejected": -330.9550476074219, "loss": 0.2599, "rewards/chosen": 0.40755075216293335, "rewards/margins": 3.2326950430870056, "rewards/rejected": -2.8251442909240723, "step": 17963 }, { "epoch": 0.9521638883735722, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -99396533.33333333, "logits/rejected": -25219977.6, "logps/chosen": -667.3943277994791, "logps/rejected": -245.248095703125, "loss": 0.2144, "rewards/chosen": 0.2400736610094706, "rewards/margins": 3.214213446776072, "rewards/rejected": -2.9741397857666017, "step": 17964 }, { "epoch": 0.9522168923753743, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33055356.0, "logits/rejected": -38366109.333333336, "logps/chosen": -218.58811950683594, "logps/rejected": -258.8782145182292, "loss": 0.1955, "rewards/chosen": 0.25206565856933594, "rewards/margins": 3.2377192179361978, "rewards/rejected": -2.985653559366862, "step": 17965 }, { "epoch": 0.9522698963771765, "grad_norm": 50.75, "kl": 3.3007068634033203, "learning_rate": 5e-07, "logits/chosen": -18909450.0, "logits/rejected": -19466266.0, "logps/chosen": -126.2254409790039, "logps/rejected": -525.9744262695312, "loss": 0.3084, "rewards/chosen": 0.38419339060783386, "rewards/margins": 2.8783423602581024, "rewards/rejected": -2.4941489696502686, "step": 17966 }, { "epoch": 0.9523229003789786, "grad_norm": 57.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5361343.0, "logits/rejected": -25791048.0, "logps/chosen": -282.6587829589844, "logps/rejected": -474.9764404296875, "loss": 0.2849, "rewards/chosen": 0.23695486783981323, "rewards/margins": 2.7007667422294617, "rewards/rejected": -2.4638118743896484, "step": 17967 }, { "epoch": 0.9523759043807808, "grad_norm": 53.25, "kl": 0.017368316650390625, "learning_rate": 5e-07, "logits/chosen": -16054459.0, "logits/rejected": -56754792.0, "logps/chosen": -555.6376953125, "logps/rejected": -354.391357421875, "loss": 0.2915, "rewards/chosen": 0.7208744287490845, "rewards/margins": 2.7238792181015015, "rewards/rejected": -2.003004789352417, "step": 17968 }, { "epoch": 0.9524289083825829, "grad_norm": 54.75, "kl": 1.2446842193603516, "learning_rate": 5e-07, "logits/chosen": -3320893.3333333335, "logits/rejected": -1503764.75, "logps/chosen": -309.81801350911456, "logps/rejected": -155.76156616210938, "loss": 0.3824, "rewards/chosen": 0.7461268107096354, "rewards/margins": 1.0482699076334634, "rewards/rejected": -0.3021430969238281, "step": 17969 }, { "epoch": 0.9524819123843851, "grad_norm": 46.5, "kl": 2.2276763916015625, "learning_rate": 5e-07, "logits/chosen": -16737946.0, "logits/rejected": 1209751.875, "logps/chosen": -344.1486511230469, "logps/rejected": -82.46878051757812, "loss": 0.2156, "rewards/chosen": 1.369346261024475, "rewards/margins": 3.218531608581543, "rewards/rejected": -1.8491853475570679, "step": 17970 }, { "epoch": 0.9525349163861871, "grad_norm": 51.25, "kl": 0.3966693878173828, "learning_rate": 5e-07, "logits/chosen": -63022752.0, "logits/rejected": -3503039.6, "logps/chosen": -317.68162027994794, "logps/rejected": -133.0052978515625, "loss": 0.3415, "rewards/chosen": 0.17699639002482095, "rewards/margins": 1.6023843447367352, "rewards/rejected": -1.4253879547119142, "step": 17971 }, { "epoch": 0.9525879203879893, "grad_norm": 60.25, "kl": 7.060464859008789, "learning_rate": 5e-07, "logits/chosen": -11028124.0, "logits/rejected": -74069576.0, "logps/chosen": -439.2689615885417, "logps/rejected": -422.378173828125, "loss": 0.2503, "rewards/chosen": 1.7484130859375, "rewards/margins": 4.150843858718872, "rewards/rejected": -2.402430772781372, "step": 17972 }, { "epoch": 0.9526409243897914, "grad_norm": 58.5, "kl": 0.4427652359008789, "learning_rate": 5e-07, "logits/chosen": -603218.25, "logits/rejected": -21358848.0, "logps/chosen": -236.42593383789062, "logps/rejected": -356.0203857421875, "loss": 0.1954, "rewards/chosen": 0.8974958658218384, "rewards/margins": 3.6463769674301147, "rewards/rejected": -2.7488811016082764, "step": 17973 }, { "epoch": 0.9526939283915936, "grad_norm": 45.5, "kl": 0.8857784271240234, "learning_rate": 5e-07, "logits/chosen": -15278571.0, "logits/rejected": -24867202.0, "logps/chosen": -211.25643920898438, "logps/rejected": -312.81744384765625, "loss": 0.2002, "rewards/chosen": 1.2042262554168701, "rewards/margins": 4.014230489730835, "rewards/rejected": -2.810004234313965, "step": 17974 }, { "epoch": 0.9527469323933957, "grad_norm": 58.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -90763864.0, "logits/rejected": -38938688.0, "logps/chosen": -287.89501953125, "logps/rejected": -360.16412353515625, "loss": 0.3355, "rewards/chosen": -0.2520591616630554, "rewards/margins": 2.5245352387428284, "rewards/rejected": -2.776594400405884, "step": 17975 }, { "epoch": 0.9527999363951979, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -78106880.0, "logits/rejected": -33214978.0, "logps/chosen": -616.6162109375, "logps/rejected": -302.4049072265625, "loss": 0.2191, "rewards/chosen": 1.1984206438064575, "rewards/margins": 3.2665501832962036, "rewards/rejected": -2.068129539489746, "step": 17976 }, { "epoch": 0.952852940397, "grad_norm": 58.75, "kl": 0.7088394165039062, "learning_rate": 5e-07, "logits/chosen": 4978407.5, "logits/rejected": -40764060.0, "logps/chosen": -481.7454833984375, "logps/rejected": -624.6839599609375, "loss": 0.2193, "rewards/chosen": 1.1630746126174927, "rewards/margins": 4.943627953529358, "rewards/rejected": -3.7805533409118652, "step": 17977 }, { "epoch": 0.9529059443988022, "grad_norm": 41.75, "kl": 1.1181449890136719, "learning_rate": 5e-07, "logits/chosen": -15645460.0, "logits/rejected": -15099176.0, "logps/chosen": -185.87075805664062, "logps/rejected": -267.57049560546875, "loss": 0.2812, "rewards/chosen": 0.4996040463447571, "rewards/margins": 2.976875603199005, "rewards/rejected": -2.477271556854248, "step": 17978 }, { "epoch": 0.9529589484006042, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 25947396.0, "logits/rejected": -15681685.714285715, "logps/chosen": -133.52545166015625, "logps/rejected": -172.78962053571428, "loss": 0.1368, "rewards/chosen": 1.1034576892852783, "rewards/margins": 3.717315776007516, "rewards/rejected": -2.613858086722238, "step": 17979 }, { "epoch": 0.9530119524024064, "grad_norm": 37.75, "kl": 1.9638404846191406, "learning_rate": 5e-07, "logits/chosen": -25718128.0, "logits/rejected": -30477981.333333332, "logps/chosen": -246.154296875, "logps/rejected": -326.3641357421875, "loss": 0.2697, "rewards/chosen": 0.7476590156555176, "rewards/margins": 2.9045149167378748, "rewards/rejected": -2.156855901082357, "step": 17980 }, { "epoch": 0.9530649564042085, "grad_norm": 54.0, "kl": 4.667964935302734, "learning_rate": 5e-07, "logits/chosen": -21248974.666666668, "logits/rejected": -1909734.125, "logps/chosen": -515.9451090494791, "logps/rejected": -118.46271514892578, "loss": 0.3777, "rewards/chosen": 0.9793458779652914, "rewards/margins": 2.857627352078756, "rewards/rejected": -1.8782814741134644, "step": 17981 }, { "epoch": 0.9531179604060106, "grad_norm": 44.75, "kl": 0.46547698974609375, "learning_rate": 5e-07, "logits/chosen": 4313597.0, "logits/rejected": 16838475.2, "logps/chosen": -129.52592976888022, "logps/rejected": -248.8579833984375, "loss": 0.2615, "rewards/chosen": 0.7641275723775228, "rewards/margins": 2.3571189244588218, "rewards/rejected": -1.5929913520812988, "step": 17982 }, { "epoch": 0.9531709644078128, "grad_norm": 42.0, "kl": 0.6725664138793945, "learning_rate": 5e-07, "logits/chosen": -58549504.0, "logits/rejected": -17102154.0, "logps/chosen": -318.4059753417969, "logps/rejected": -371.4252014160156, "loss": 0.1941, "rewards/chosen": 1.0450842380523682, "rewards/margins": 4.688815355300903, "rewards/rejected": -3.643731117248535, "step": 17983 }, { "epoch": 0.9532239684096149, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 152628736.0, "logits/rejected": -14757542.4, "logps/chosen": -513.1211344401041, "logps/rejected": -284.85322265625, "loss": 0.2326, "rewards/chosen": 0.4339701334635417, "rewards/margins": 2.9262079874674476, "rewards/rejected": -2.492237854003906, "step": 17984 }, { "epoch": 0.9532769724114171, "grad_norm": 69.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60284379.428571425, "logits/rejected": 58414312.0, "logps/chosen": -461.73563058035717, "logps/rejected": -624.8231201171875, "loss": 0.4326, "rewards/chosen": 0.11195811203547887, "rewards/margins": 1.9778761523110526, "rewards/rejected": -1.8659180402755737, "step": 17985 }, { "epoch": 0.9533299764132191, "grad_norm": 44.25, "kl": 2.8779287338256836, "learning_rate": 5e-07, "logits/chosen": -19554521.333333332, "logits/rejected": 21726468.0, "logps/chosen": -325.6855875651042, "logps/rejected": -271.4046325683594, "loss": 0.3326, "rewards/chosen": 1.0531464417775471, "rewards/margins": 2.5109613736470537, "rewards/rejected": -1.4578149318695068, "step": 17986 }, { "epoch": 0.9533829804150213, "grad_norm": 45.75, "kl": 1.0376014709472656, "learning_rate": 5e-07, "logits/chosen": 3742742.0, "logits/rejected": -58337452.0, "logps/chosen": -260.95880126953125, "logps/rejected": -228.52407836914062, "loss": 0.4136, "rewards/chosen": -0.2226712703704834, "rewards/margins": 1.6053779125213623, "rewards/rejected": -1.8280491828918457, "step": 17987 }, { "epoch": 0.9534359844168234, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26389416.0, "logits/rejected": -26559068.0, "logps/chosen": -225.05386352539062, "logps/rejected": -229.77467346191406, "loss": 0.1938, "rewards/chosen": 0.9121932983398438, "rewards/margins": 4.447824001312256, "rewards/rejected": -3.535630702972412, "step": 17988 }, { "epoch": 0.9534889884186256, "grad_norm": 34.25, "kl": 0.2883338928222656, "learning_rate": 5e-07, "logits/chosen": -38884376.0, "logits/rejected": 22229.75, "logps/chosen": -618.1724243164062, "logps/rejected": -88.37130737304688, "loss": 0.2914, "rewards/chosen": 1.2288455963134766, "rewards/margins": 2.6468905210494995, "rewards/rejected": -1.418044924736023, "step": 17989 }, { "epoch": 0.9535419924204277, "grad_norm": 35.0, "kl": 2.7616958618164062, "learning_rate": 5e-07, "logits/chosen": -12088091.2, "logits/rejected": 68948122.66666667, "logps/chosen": -75.23776245117188, "logps/rejected": -358.4383138020833, "loss": 0.3128, "rewards/chosen": 0.4775986194610596, "rewards/margins": 4.265293550491333, "rewards/rejected": -3.7876949310302734, "step": 17990 }, { "epoch": 0.9535949964222299, "grad_norm": 48.25, "kl": 0.6458911895751953, "learning_rate": 5e-07, "logits/chosen": -22614683.2, "logits/rejected": -23808992.0, "logps/chosen": -372.133203125, "logps/rejected": -231.2657674153646, "loss": 0.2958, "rewards/chosen": 0.6782527446746827, "rewards/margins": 2.883706776301066, "rewards/rejected": -2.2054540316263833, "step": 17991 }, { "epoch": 0.953648000424032, "grad_norm": 71.5, "kl": 4.071895599365234, "learning_rate": 5e-07, "logits/chosen": -32658626.666666668, "logits/rejected": -21588704.0, "logps/chosen": -267.55666097005206, "logps/rejected": -122.04046630859375, "loss": 0.3671, "rewards/chosen": 0.575977603594462, "rewards/margins": 3.1747546593348184, "rewards/rejected": -2.5987770557403564, "step": 17992 }, { "epoch": 0.9537010044258342, "grad_norm": 45.75, "kl": 0.7171096801757812, "learning_rate": 5e-07, "logits/chosen": -27748326.0, "logits/rejected": -24472558.0, "logps/chosen": -168.5611572265625, "logps/rejected": -298.8652038574219, "loss": 0.3988, "rewards/chosen": 0.19028306007385254, "rewards/margins": 1.1968598365783691, "rewards/rejected": -1.0065767765045166, "step": 17993 }, { "epoch": 0.9537540084276362, "grad_norm": 57.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62710617.6, "logits/rejected": -46775520.0, "logps/chosen": -408.7298583984375, "logps/rejected": -389.2537434895833, "loss": 0.2084, "rewards/chosen": 1.124135684967041, "rewards/margins": 4.350314172108968, "rewards/rejected": -3.2261784871419272, "step": 17994 }, { "epoch": 0.9538070124294384, "grad_norm": 50.0, "kl": 4.743695259094238, "learning_rate": 5e-07, "logits/chosen": -46200469.333333336, "logits/rejected": -21616172.0, "logps/chosen": -516.7784830729166, "logps/rejected": -315.25152587890625, "loss": 0.2184, "rewards/chosen": 1.810999075571696, "rewards/margins": 4.685434738794963, "rewards/rejected": -2.8744356632232666, "step": 17995 }, { "epoch": 0.9538600164312405, "grad_norm": 44.0, "kl": 1.6979351043701172, "learning_rate": 5e-07, "logits/chosen": -23641361.6, "logits/rejected": -44592664.0, "logps/chosen": -278.805712890625, "logps/rejected": -427.4919840494792, "loss": 0.2809, "rewards/chosen": 0.4926603317260742, "rewards/margins": 3.3952278772989906, "rewards/rejected": -2.9025675455729165, "step": 17996 }, { "epoch": 0.9539130204330427, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18403997.333333332, "logits/rejected": -26281520.0, "logps/chosen": -164.37977091471353, "logps/rejected": -310.021875, "loss": 0.2151, "rewards/chosen": 0.14854649702707926, "rewards/margins": 4.697775928179423, "rewards/rejected": -4.549229431152344, "step": 17997 }, { "epoch": 0.9539660244348448, "grad_norm": 55.5, "kl": 0.15909194946289062, "learning_rate": 5e-07, "logits/chosen": -41040041.6, "logits/rejected": -10346141.333333334, "logps/chosen": -335.8233642578125, "logps/rejected": -166.60545857747397, "loss": 0.3568, "rewards/chosen": 0.20439116954803466, "rewards/margins": 1.720943299929301, "rewards/rejected": -1.5165521303812664, "step": 17998 }, { "epoch": 0.954019028436647, "grad_norm": 50.25, "kl": 3.5397377014160156, "learning_rate": 5e-07, "logits/chosen": -88237834.66666667, "logits/rejected": -12449610.0, "logps/chosen": -364.2852376302083, "logps/rejected": -609.6442260742188, "loss": 0.3099, "rewards/chosen": 0.915261427561442, "rewards/margins": 3.7101518313090005, "rewards/rejected": -2.7948904037475586, "step": 17999 }, { "epoch": 0.9540720324384491, "grad_norm": 46.0, "kl": 0.6297779083251953, "learning_rate": 5e-07, "logits/chosen": -78615408.0, "logits/rejected": -26371212.0, "logps/chosen": -303.7243347167969, "logps/rejected": -436.8623046875, "loss": 0.3612, "rewards/chosen": -0.10156285762786865, "rewards/margins": 2.2462114095687866, "rewards/rejected": -2.3477742671966553, "step": 18000 }, { "epoch": 0.9541250364402513, "grad_norm": 36.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19869552.0, "logits/rejected": -30662197.333333332, "logps/chosen": -285.8734436035156, "logps/rejected": -156.70342000325522, "loss": 0.1625, "rewards/chosen": 0.5635620355606079, "rewards/margins": 3.554752310117086, "rewards/rejected": -2.991190274556478, "step": 18001 }, { "epoch": 0.9541780404420533, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1694708.0, "logits/rejected": -69159424.0, "logps/chosen": -336.5546468098958, "logps/rejected": -442.311669921875, "loss": 0.2927, "rewards/chosen": 0.20546770095825195, "rewards/margins": 2.173704242706299, "rewards/rejected": -1.968236541748047, "step": 18002 }, { "epoch": 0.9542310444438555, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63420384.0, "logits/rejected": -5016771.666666667, "logps/chosen": -282.16229248046875, "logps/rejected": -285.94875081380206, "loss": 0.1886, "rewards/chosen": -0.31516265869140625, "rewards/margins": 3.6387818654378257, "rewards/rejected": -3.953944524129232, "step": 18003 }, { "epoch": 0.9542840484456576, "grad_norm": 36.25, "kl": 2.030406951904297, "learning_rate": 5e-07, "logits/chosen": -20562316.8, "logits/rejected": -81643296.0, "logps/chosen": -227.70244140625, "logps/rejected": -400.4082438151042, "loss": 0.2413, "rewards/chosen": 1.2838358879089355, "rewards/margins": 3.4872641563415527, "rewards/rejected": -2.203428268432617, "step": 18004 }, { "epoch": 0.9543370524474598, "grad_norm": 52.0, "kl": 0.9762306213378906, "learning_rate": 5e-07, "logits/chosen": -19551062.4, "logits/rejected": -662022.3333333334, "logps/chosen": -294.75146484375, "logps/rejected": -136.38855997721353, "loss": 0.4034, "rewards/chosen": 0.11491866111755371, "rewards/margins": 1.4271151542663574, "rewards/rejected": -1.3121964931488037, "step": 18005 }, { "epoch": 0.9543900564492619, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42653733.333333336, "logits/rejected": -17892204.0, "logps/chosen": -531.3006998697916, "logps/rejected": -137.85848999023438, "loss": 0.3125, "rewards/chosen": 0.7733144760131836, "rewards/margins": 2.9860851764678955, "rewards/rejected": -2.212770700454712, "step": 18006 }, { "epoch": 0.9544430604510641, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18308112.0, "logits/rejected": -18955632.0, "logps/chosen": -159.9769490559896, "logps/rejected": -167.15399169921875, "loss": 0.2276, "rewards/chosen": 0.4199419816335042, "rewards/margins": 3.274561389287313, "rewards/rejected": -2.8546194076538085, "step": 18007 }, { "epoch": 0.9544960644528662, "grad_norm": 40.25, "kl": 3.1332597732543945, "learning_rate": 5e-07, "logits/chosen": -27476582.4, "logits/rejected": 1220993.8333333333, "logps/chosen": -624.1357421875, "logps/rejected": -102.37180582682292, "loss": 0.24, "rewards/chosen": 1.233751392364502, "rewards/margins": 4.657678063710531, "rewards/rejected": -3.423926671346029, "step": 18008 }, { "epoch": 0.9545490684546684, "grad_norm": 59.5, "kl": 1.2061710357666016, "learning_rate": 5e-07, "logits/chosen": -12946588.0, "logits/rejected": -40572068.0, "logps/chosen": -283.1741536458333, "logps/rejected": -348.7508239746094, "loss": 0.2744, "rewards/chosen": 0.7331271966298422, "rewards/margins": 4.840291102727254, "rewards/rejected": -4.107163906097412, "step": 18009 }, { "epoch": 0.9546020724564704, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61488112.0, "logits/rejected": -17870896.0, "logps/chosen": -246.66099548339844, "logps/rejected": -266.72286551339283, "loss": 0.2139, "rewards/chosen": -0.21107636392116547, "rewards/margins": 2.672573189650263, "rewards/rejected": -2.8836495535714284, "step": 18010 }, { "epoch": 0.9546550764582726, "grad_norm": 39.0, "kl": 0.8861331939697266, "learning_rate": 5e-07, "logits/chosen": -8281894.0, "logits/rejected": -16830430.666666668, "logps/chosen": -187.72561645507812, "logps/rejected": -262.2154134114583, "loss": 0.1794, "rewards/chosen": 0.5842777490615845, "rewards/margins": 3.622353196144104, "rewards/rejected": -3.0380754470825195, "step": 18011 }, { "epoch": 0.9547080804600747, "grad_norm": 45.75, "kl": 2.8473968505859375, "learning_rate": 5e-07, "logits/chosen": -34734592.0, "logits/rejected": -61486704.0, "logps/chosen": -346.0379943847656, "logps/rejected": -336.384033203125, "loss": 0.171, "rewards/chosen": 1.2314224243164062, "rewards/margins": 3.8687870502471924, "rewards/rejected": -2.637364625930786, "step": 18012 }, { "epoch": 0.9547610844618769, "grad_norm": 42.5, "kl": 5.28289794921875, "learning_rate": 5e-07, "logits/chosen": -25561896.0, "logits/rejected": 6027813.333333333, "logps/chosen": -956.6517578125, "logps/rejected": -171.9205525716146, "loss": 0.314, "rewards/chosen": 1.7293487548828126, "rewards/margins": 4.53855889638265, "rewards/rejected": -2.8092101414998374, "step": 18013 }, { "epoch": 0.954814088463679, "grad_norm": 47.0, "kl": 0.4582252502441406, "learning_rate": 5e-07, "logits/chosen": -28606394.666666668, "logits/rejected": -40938832.0, "logps/chosen": -277.6096598307292, "logps/rejected": -728.28125, "loss": 0.2542, "rewards/chosen": 0.8920262654622396, "rewards/margins": 4.087729295094808, "rewards/rejected": -3.1957030296325684, "step": 18014 }, { "epoch": 0.9548670924654812, "grad_norm": 41.0, "kl": 2.5174779891967773, "learning_rate": 5e-07, "logits/chosen": -16893132.0, "logits/rejected": -6958922.4, "logps/chosen": -264.1153157552083, "logps/rejected": -269.9830078125, "loss": 0.2181, "rewards/chosen": 1.0843368371327717, "rewards/margins": 3.2222434838612877, "rewards/rejected": -2.1379066467285157, "step": 18015 }, { "epoch": 0.9549200964672833, "grad_norm": 57.5, "kl": 2.204117774963379, "learning_rate": 5e-07, "logits/chosen": -39422246.4, "logits/rejected": 13010040.0, "logps/chosen": -423.93154296875, "logps/rejected": -218.3828125, "loss": 0.3258, "rewards/chosen": 0.715119457244873, "rewards/margins": 1.743917640050252, "rewards/rejected": -1.0287981828053792, "step": 18016 }, { "epoch": 0.9549731004690855, "grad_norm": 32.0, "kl": 0.5602588653564453, "learning_rate": 5e-07, "logits/chosen": -1874451.5, "logits/rejected": -58979658.666666664, "logps/chosen": -149.091796875, "logps/rejected": -420.1923421223958, "loss": 0.1318, "rewards/chosen": 1.0286736488342285, "rewards/margins": 4.26332426071167, "rewards/rejected": -3.2346506118774414, "step": 18017 }, { "epoch": 0.9550261044708875, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -47162548.0, "logits/rejected": -6600795.0, "logps/chosen": -383.86285400390625, "logps/rejected": -376.45806884765625, "loss": 0.2551, "rewards/chosen": 0.26698800921440125, "rewards/margins": 3.7010614573955536, "rewards/rejected": -3.4340734481811523, "step": 18018 }, { "epoch": 0.9550791084726897, "grad_norm": 51.25, "kl": 0.1502819061279297, "learning_rate": 5e-07, "logits/chosen": -47911075.2, "logits/rejected": -23492586.666666668, "logps/chosen": -391.0490234375, "logps/rejected": -378.8246256510417, "loss": 0.3735, "rewards/chosen": -0.1929202437400818, "rewards/margins": 2.268488371372223, "rewards/rejected": -2.4614086151123047, "step": 18019 }, { "epoch": 0.9551321124744918, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7066374.0, "logits/rejected": -14284169.6, "logps/chosen": -234.0724894205729, "logps/rejected": -122.094775390625, "loss": 0.1748, "rewards/chosen": 1.286244551340739, "rewards/margins": 3.770279852549235, "rewards/rejected": -2.484035301208496, "step": 18020 }, { "epoch": 0.955185116476294, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11262598.0, "logits/rejected": -51825856.0, "logps/chosen": -283.88934326171875, "logps/rejected": -454.4924011230469, "loss": 0.1644, "rewards/chosen": 1.0544971227645874, "rewards/margins": 4.594188094139099, "rewards/rejected": -3.5396909713745117, "step": 18021 }, { "epoch": 0.9552381204780961, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39393800.0, "logits/rejected": -17420398.4, "logps/chosen": -293.7632242838542, "logps/rejected": -146.503662109375, "loss": 0.224, "rewards/chosen": 0.8012741406758627, "rewards/margins": 2.996032937367757, "rewards/rejected": -2.1947587966918944, "step": 18022 }, { "epoch": 0.9552911244798983, "grad_norm": 57.25, "kl": 2.34619140625, "learning_rate": 5e-07, "logits/chosen": -90248140.8, "logits/rejected": -78531728.0, "logps/chosen": -464.38701171875, "logps/rejected": -435.3824869791667, "loss": 0.2718, "rewards/chosen": 0.7546258449554444, "rewards/margins": 3.2041416645050047, "rewards/rejected": -2.4495158195495605, "step": 18023 }, { "epoch": 0.9553441284817004, "grad_norm": 32.25, "kl": 0.6796588897705078, "learning_rate": 5e-07, "logits/chosen": -6431906.0, "logits/rejected": -54975992.0, "logps/chosen": -96.601318359375, "logps/rejected": -489.89361572265625, "loss": 0.2388, "rewards/chosen": 0.693450927734375, "rewards/margins": 3.2780473232269287, "rewards/rejected": -2.5845963954925537, "step": 18024 }, { "epoch": 0.9553971324835026, "grad_norm": 64.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15310859.2, "logits/rejected": -68655893.33333333, "logps/chosen": -243.6783203125, "logps/rejected": -426.642578125, "loss": 0.3271, "rewards/chosen": 0.538153600692749, "rewards/margins": 2.2648032029469807, "rewards/rejected": -1.7266496022542317, "step": 18025 }, { "epoch": 0.9554501364853046, "grad_norm": 45.25, "kl": 1.11187744140625, "learning_rate": 5e-07, "logits/chosen": -42086756.0, "logits/rejected": -45162196.0, "logps/chosen": -390.20709228515625, "logps/rejected": -566.87158203125, "loss": 0.2527, "rewards/chosen": 0.5389091968536377, "rewards/margins": 3.595820665359497, "rewards/rejected": -3.0569114685058594, "step": 18026 }, { "epoch": 0.9555031404871068, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14075001.333333334, "logits/rejected": -7062736.0, "logps/chosen": -721.87158203125, "logps/rejected": -196.2857177734375, "loss": 0.1457, "rewards/chosen": 1.3639963467915852, "rewards/margins": 4.55011084874471, "rewards/rejected": -3.186114501953125, "step": 18027 }, { "epoch": 0.9555561444889089, "grad_norm": 64.5, "kl": 1.6122398376464844, "learning_rate": 5e-07, "logits/chosen": -7024960.0, "logits/rejected": -21576748.0, "logps/chosen": -170.5310262044271, "logps/rejected": -192.822021484375, "loss": 0.3405, "rewards/chosen": 0.46181829770406085, "rewards/margins": 3.8917301495869956, "rewards/rejected": -3.4299118518829346, "step": 18028 }, { "epoch": 0.9556091484907111, "grad_norm": 47.75, "kl": 1.8806743621826172, "learning_rate": 5e-07, "logits/chosen": -5562630.666666667, "logits/rejected": -9649976.0, "logps/chosen": -226.17828369140625, "logps/rejected": -302.5651550292969, "loss": 0.3908, "rewards/chosen": 0.279646098613739, "rewards/margins": 3.25980681180954, "rewards/rejected": -2.980160713195801, "step": 18029 }, { "epoch": 0.9556621524925132, "grad_norm": 43.0, "kl": 0.9867935180664062, "learning_rate": 5e-07, "logits/chosen": 14865353.6, "logits/rejected": -13282700.0, "logps/chosen": -157.89742431640624, "logps/rejected": -249.49810791015625, "loss": 0.2994, "rewards/chosen": 0.3905695199966431, "rewards/margins": 3.6118906259536745, "rewards/rejected": -3.2213211059570312, "step": 18030 }, { "epoch": 0.9557151564943153, "grad_norm": 45.5, "kl": 1.1712255477905273, "learning_rate": 5e-07, "logits/chosen": -18518850.0, "logits/rejected": -74319840.0, "logps/chosen": -565.7449340820312, "logps/rejected": -413.38470458984375, "loss": 0.2167, "rewards/chosen": 1.4278392791748047, "rewards/margins": 3.984947919845581, "rewards/rejected": -2.5571086406707764, "step": 18031 }, { "epoch": 0.9557681604961175, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7553177.5, "logits/rejected": -9695581.0, "logps/chosen": -121.49166870117188, "logps/rejected": -197.48960876464844, "loss": 0.2221, "rewards/chosen": 0.44498977065086365, "rewards/margins": 4.097844213247299, "rewards/rejected": -3.6528544425964355, "step": 18032 }, { "epoch": 0.9558211644979195, "grad_norm": 42.5, "kl": 3.4337005615234375, "learning_rate": 5e-07, "logits/chosen": -51541077.333333336, "logits/rejected": -16535091.2, "logps/chosen": -930.9043782552084, "logps/rejected": -244.29921875, "loss": 0.2158, "rewards/chosen": 1.4476563135782878, "rewards/margins": 3.6460741678873703, "rewards/rejected": -2.198417854309082, "step": 18033 }, { "epoch": 0.9558741684997217, "grad_norm": 61.75, "kl": 0.3129444122314453, "learning_rate": 5e-07, "logits/chosen": -34944772.0, "logits/rejected": -2146078.0, "logps/chosen": -152.15965270996094, "logps/rejected": -186.97933959960938, "loss": 0.3493, "rewards/chosen": 0.5609337091445923, "rewards/margins": 1.4768491983413696, "rewards/rejected": -0.9159154891967773, "step": 18034 }, { "epoch": 0.9559271725015238, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6498458.0, "logits/rejected": -3518639.0, "logps/chosen": -265.3668212890625, "logps/rejected": -91.63404083251953, "loss": 0.2666, "rewards/chosen": 0.6748085021972656, "rewards/margins": 2.7313785552978516, "rewards/rejected": -2.056570053100586, "step": 18035 }, { "epoch": 0.955980176503326, "grad_norm": 46.25, "kl": 0.6150360107421875, "learning_rate": 5e-07, "logits/chosen": -18800708.0, "logits/rejected": -33420212.0, "logps/chosen": -431.0235595703125, "logps/rejected": -242.43011474609375, "loss": 0.2702, "rewards/chosen": 0.7331463098526001, "rewards/margins": 4.059742093086243, "rewards/rejected": -3.3265957832336426, "step": 18036 }, { "epoch": 0.9560331805051281, "grad_norm": 58.25, "kl": 0.5767908096313477, "learning_rate": 5e-07, "logits/chosen": -15842088.0, "logits/rejected": -40352360.0, "logps/chosen": -146.96591796875, "logps/rejected": -279.1835123697917, "loss": 0.3766, "rewards/chosen": 0.02879974842071533, "rewards/margins": 1.9109643856684368, "rewards/rejected": -1.8821646372477214, "step": 18037 }, { "epoch": 0.9560861845069303, "grad_norm": 55.5, "kl": 0.3409881591796875, "learning_rate": 5e-07, "logits/chosen": -31122406.4, "logits/rejected": -24530410.666666668, "logps/chosen": -243.2930419921875, "logps/rejected": -467.3588460286458, "loss": 0.2769, "rewards/chosen": 0.828091049194336, "rewards/margins": 2.628478749593099, "rewards/rejected": -1.800387700398763, "step": 18038 }, { "epoch": 0.9561391885087324, "grad_norm": 41.0, "kl": 1.7257156372070312, "learning_rate": 5e-07, "logits/chosen": -35718336.0, "logits/rejected": -32472570.0, "logps/chosen": -375.1854248046875, "logps/rejected": -415.551513671875, "loss": 0.2346, "rewards/chosen": 1.264406681060791, "rewards/margins": 3.333920478820801, "rewards/rejected": -2.0695137977600098, "step": 18039 }, { "epoch": 0.9561921925105346, "grad_norm": 41.5, "kl": 1.7575998306274414, "learning_rate": 5e-07, "logits/chosen": -12159118.666666666, "logits/rejected": -19040884.0, "logps/chosen": -187.5419921875, "logps/rejected": -544.464111328125, "loss": 0.3124, "rewards/chosen": 0.90433136622111, "rewards/margins": 3.6707986990610757, "rewards/rejected": -2.766467332839966, "step": 18040 }, { "epoch": 0.9562451965123366, "grad_norm": 41.5, "kl": 2.5055484771728516, "learning_rate": 5e-07, "logits/chosen": -44996536.0, "logits/rejected": -29933676.0, "logps/chosen": -619.8956909179688, "logps/rejected": -406.02862548828125, "loss": 0.2351, "rewards/chosen": 1.0748625993728638, "rewards/margins": 3.9953068494796753, "rewards/rejected": -2.9204442501068115, "step": 18041 }, { "epoch": 0.9562982005141388, "grad_norm": 35.75, "kl": 1.5174579620361328, "learning_rate": 5e-07, "logits/chosen": -70497784.0, "logits/rejected": -13809636.0, "logps/chosen": -513.0027465820312, "logps/rejected": -174.4545440673828, "loss": 0.1632, "rewards/chosen": 1.9350296258926392, "rewards/margins": 5.81318199634552, "rewards/rejected": -3.878152370452881, "step": 18042 }, { "epoch": 0.9563512045159409, "grad_norm": 37.75, "kl": 5.202337265014648, "learning_rate": 5e-07, "logits/chosen": -14365334.0, "logits/rejected": -38047752.0, "logps/chosen": -192.59458923339844, "logps/rejected": -353.4601745605469, "loss": 0.1993, "rewards/chosen": 1.3746449947357178, "rewards/margins": 4.195772409439087, "rewards/rejected": -2.821127414703369, "step": 18043 }, { "epoch": 0.9564042085177431, "grad_norm": 34.5, "kl": 3.9305801391601562, "learning_rate": 5e-07, "logits/chosen": -21160387.2, "logits/rejected": -64559904.0, "logps/chosen": -464.53486328125, "logps/rejected": -608.9466145833334, "loss": 0.1967, "rewards/chosen": 1.6049488067626954, "rewards/margins": 4.459769694010417, "rewards/rejected": -2.854820887247721, "step": 18044 }, { "epoch": 0.9564572125195452, "grad_norm": 56.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 80988236.8, "logits/rejected": -11650670.666666666, "logps/chosen": -347.2180419921875, "logps/rejected": -249.44441731770834, "loss": 0.2281, "rewards/chosen": 0.8663226127624511, "rewards/margins": 4.03205181757609, "rewards/rejected": -3.165729204813639, "step": 18045 }, { "epoch": 0.9565102165213474, "grad_norm": 35.25, "kl": 1.4813995361328125, "learning_rate": 5e-07, "logits/chosen": -6107355.2, "logits/rejected": -19986902.666666668, "logps/chosen": -187.695166015625, "logps/rejected": -264.6438802083333, "loss": 0.2336, "rewards/chosen": 0.8915846824645997, "rewards/margins": 4.194664605458578, "rewards/rejected": -3.303079922993978, "step": 18046 }, { "epoch": 0.9565632205231495, "grad_norm": 36.5, "kl": 1.2502403259277344, "learning_rate": 5e-07, "logits/chosen": -23439888.0, "logits/rejected": -48719893.333333336, "logps/chosen": -301.8330078125, "logps/rejected": -849.9999186197916, "loss": 0.2045, "rewards/chosen": 0.936739730834961, "rewards/margins": 6.358984756469726, "rewards/rejected": -5.422245025634766, "step": 18047 }, { "epoch": 0.9566162245249517, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51994912.0, "logits/rejected": -17794808.0, "logps/chosen": -457.3800354003906, "logps/rejected": -223.437255859375, "loss": 0.2516, "rewards/chosen": -0.15238705277442932, "rewards/margins": 2.2774504919846854, "rewards/rejected": -2.4298375447591147, "step": 18048 }, { "epoch": 0.9566692285267537, "grad_norm": 39.5, "kl": 1.1228618621826172, "learning_rate": 5e-07, "logits/chosen": -11227285.333333334, "logits/rejected": -10315759.2, "logps/chosen": -298.9969889322917, "logps/rejected": -211.744482421875, "loss": 0.2489, "rewards/chosen": 0.8931914965311686, "rewards/margins": 3.237223974863688, "rewards/rejected": -2.3440324783325197, "step": 18049 }, { "epoch": 0.9567222325285559, "grad_norm": 47.75, "kl": 0.44205474853515625, "learning_rate": 5e-07, "logits/chosen": -23741389.333333332, "logits/rejected": -32083430.4, "logps/chosen": -384.8599446614583, "logps/rejected": -191.07369384765624, "loss": 0.3251, "rewards/chosen": 0.3753092686335246, "rewards/margins": 1.5964226643244426, "rewards/rejected": -1.221113395690918, "step": 18050 }, { "epoch": 0.956775236530358, "grad_norm": 41.75, "kl": 4.720331192016602, "learning_rate": 5e-07, "logits/chosen": 4529082.8, "logits/rejected": -25713890.666666668, "logps/chosen": -350.352197265625, "logps/rejected": -240.36649576822916, "loss": 0.3378, "rewards/chosen": 1.3848074913024901, "rewards/margins": 2.85455265045166, "rewards/rejected": -1.46974515914917, "step": 18051 }, { "epoch": 0.9568282405321602, "grad_norm": 46.0, "kl": 0.06553077697753906, "learning_rate": 5e-07, "logits/chosen": 1592755.8333333333, "logits/rejected": -19686291.2, "logps/chosen": -51.31466166178385, "logps/rejected": -287.2554443359375, "loss": 0.2348, "rewards/chosen": 0.045215025544166565, "rewards/margins": 2.9602064043283463, "rewards/rejected": -2.9149913787841797, "step": 18052 }, { "epoch": 0.9568812445339623, "grad_norm": 72.0, "kl": 3.2852210998535156, "learning_rate": 5e-07, "logits/chosen": -13016488.0, "logits/rejected": 4544795.5, "logps/chosen": -247.8409423828125, "logps/rejected": -88.46781158447266, "loss": 0.438, "rewards/chosen": 0.3620288372039795, "rewards/margins": 1.4920146465301514, "rewards/rejected": -1.1299858093261719, "step": 18053 }, { "epoch": 0.9569342485357645, "grad_norm": 38.75, "kl": 2.098356246948242, "learning_rate": 5e-07, "logits/chosen": -16573935.0, "logits/rejected": -13484403.0, "logps/chosen": -217.99855041503906, "logps/rejected": -278.40692138671875, "loss": 0.2364, "rewards/chosen": 0.8948495388031006, "rewards/margins": 3.2276830673217773, "rewards/rejected": -2.3328335285186768, "step": 18054 }, { "epoch": 0.9569872525375666, "grad_norm": 43.0, "kl": 4.180539131164551, "learning_rate": 5e-07, "logits/chosen": -26919660.8, "logits/rejected": -38491424.0, "logps/chosen": -201.66005859375, "logps/rejected": -396.52294921875, "loss": 0.2909, "rewards/chosen": 0.7292337894439698, "rewards/margins": 2.836158800125122, "rewards/rejected": -2.1069250106811523, "step": 18055 }, { "epoch": 0.9570402565393687, "grad_norm": 38.25, "kl": 1.4488868713378906, "learning_rate": 5e-07, "logits/chosen": -29111027.2, "logits/rejected": -14844014.666666666, "logps/chosen": -214.533349609375, "logps/rejected": -464.9107259114583, "loss": 0.2824, "rewards/chosen": 0.48014240264892577, "rewards/margins": 3.9021572748819984, "rewards/rejected": -3.4220148722330728, "step": 18056 }, { "epoch": 0.9570932605411708, "grad_norm": 42.25, "kl": 3.0497493743896484, "learning_rate": 5e-07, "logits/chosen": -65922612.0, "logits/rejected": -42986536.0, "logps/chosen": -258.6455078125, "logps/rejected": -262.3128356933594, "loss": 0.2037, "rewards/chosen": 1.4189587831497192, "rewards/margins": 2.7533499002456665, "rewards/rejected": -1.3343911170959473, "step": 18057 }, { "epoch": 0.957146264542973, "grad_norm": 73.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18017160.0, "logits/rejected": 4655280.333333333, "logps/chosen": -108.59854125976562, "logps/rejected": -204.4157918294271, "loss": 0.3548, "rewards/chosen": -0.037260428071022034, "rewards/margins": 0.9255697180827459, "rewards/rejected": -0.9628301461537679, "step": 18058 }, { "epoch": 0.9571992685447751, "grad_norm": 47.25, "kl": 2.5096664428710938, "learning_rate": 5e-07, "logits/chosen": -21169224.0, "logits/rejected": -25060738.0, "logps/chosen": -593.1149088541666, "logps/rejected": -310.5063171386719, "loss": 0.2684, "rewards/chosen": 1.1534047921498616, "rewards/margins": 2.9657293160756426, "rewards/rejected": -1.8123245239257812, "step": 18059 }, { "epoch": 0.9572522725465773, "grad_norm": 52.5, "kl": 4.138332366943359, "learning_rate": 5e-07, "logits/chosen": -17911019.2, "logits/rejected": -78343322.66666667, "logps/chosen": -263.1139404296875, "logps/rejected": -316.38498942057294, "loss": 0.4046, "rewards/chosen": 0.4320675849914551, "rewards/margins": 2.2147989590962727, "rewards/rejected": -1.7827313741048176, "step": 18060 }, { "epoch": 0.9573052765483794, "grad_norm": 52.75, "kl": 1.0813183784484863, "learning_rate": 5e-07, "logits/chosen": 30720044.8, "logits/rejected": -27681085.333333332, "logps/chosen": -229.40439453125, "logps/rejected": -201.11478678385416, "loss": 0.248, "rewards/chosen": 0.9982992172241211, "rewards/margins": 4.171008682250976, "rewards/rejected": -3.1727094650268555, "step": 18061 }, { "epoch": 0.9573582805501816, "grad_norm": 31.75, "kl": 2.688304901123047, "learning_rate": 5e-07, "logits/chosen": -15863314.666666666, "logits/rejected": -16538582.4, "logps/chosen": -86.31229654947917, "logps/rejected": -439.632421875, "loss": 0.2692, "rewards/chosen": -0.013961538672447205, "rewards/margins": 3.672734704613686, "rewards/rejected": -3.686696243286133, "step": 18062 }, { "epoch": 0.9574112845519837, "grad_norm": 52.25, "kl": 0.20145416259765625, "learning_rate": 5e-07, "logits/chosen": -17660552.0, "logits/rejected": -20696569.6, "logps/chosen": -151.10320027669272, "logps/rejected": -246.8326416015625, "loss": 0.2456, "rewards/chosen": 0.49441730976104736, "rewards/margins": 2.6872231721878053, "rewards/rejected": -2.192805862426758, "step": 18063 }, { "epoch": 0.9574642885537858, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12256675.0, "logits/rejected": -2029423.6666666667, "logps/chosen": -199.78602600097656, "logps/rejected": -242.79302978515625, "loss": 0.1861, "rewards/chosen": 0.5029830932617188, "rewards/margins": 4.264469146728516, "rewards/rejected": -3.761486053466797, "step": 18064 }, { "epoch": 0.9575172925555879, "grad_norm": 50.5, "kl": 1.0711517333984375, "learning_rate": 5e-07, "logits/chosen": -50960549.333333336, "logits/rejected": -22193832.0, "logps/chosen": -320.6138509114583, "logps/rejected": -233.49209594726562, "loss": 0.37, "rewards/chosen": 0.26548131306966144, "rewards/margins": 3.9740117390950522, "rewards/rejected": -3.7085304260253906, "step": 18065 }, { "epoch": 0.9575702965573901, "grad_norm": 30.25, "kl": 1.2325363159179688, "learning_rate": 5e-07, "logits/chosen": -17119380.0, "logits/rejected": -56080396.0, "logps/chosen": -158.69744873046875, "logps/rejected": -216.93576049804688, "loss": 0.2458, "rewards/chosen": 0.5169058442115784, "rewards/margins": 3.982962191104889, "rewards/rejected": -3.4660563468933105, "step": 18066 }, { "epoch": 0.9576233005591922, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51587032.0, "logits/rejected": -15875743.0, "logps/chosen": -384.65625, "logps/rejected": -304.0788269042969, "loss": 0.1967, "rewards/chosen": 0.7532062530517578, "rewards/margins": 4.143042325973511, "rewards/rejected": -3.389836072921753, "step": 18067 }, { "epoch": 0.9576763045609944, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -77977808.0, "logits/rejected": -40486482.666666664, "logps/chosen": -431.8362731933594, "logps/rejected": -361.1780598958333, "loss": 0.1153, "rewards/chosen": 0.9271957874298096, "rewards/margins": 3.9628047148386636, "rewards/rejected": -3.035608927408854, "step": 18068 }, { "epoch": 0.9577293085627965, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8493356.666666666, "logits/rejected": -42656880.0, "logps/chosen": -176.3267618815104, "logps/rejected": -309.3010009765625, "loss": 0.3046, "rewards/chosen": -0.08694686492284139, "rewards/margins": 2.1312191049257914, "rewards/rejected": -2.218165969848633, "step": 18069 }, { "epoch": 0.9577823125645987, "grad_norm": 50.0, "kl": 0.9138679504394531, "learning_rate": 5e-07, "logits/chosen": -16579236.8, "logits/rejected": -25012850.666666668, "logps/chosen": -532.392626953125, "logps/rejected": -377.2030436197917, "loss": 0.2609, "rewards/chosen": 1.1673120498657226, "rewards/margins": 3.5731837590535482, "rewards/rejected": -2.4058717091878257, "step": 18070 }, { "epoch": 0.9578353165664008, "grad_norm": 30.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14011120.0, "logits/rejected": -9904691.2, "logps/chosen": -103.88789876302083, "logps/rejected": -279.275, "loss": 0.2579, "rewards/chosen": 0.5113688707351685, "rewards/margins": 2.463759160041809, "rewards/rejected": -1.9523902893066407, "step": 18071 }, { "epoch": 0.9578883205682029, "grad_norm": 79.0, "kl": 7.016483306884766, "learning_rate": 5e-07, "logits/chosen": -102359040.0, "logits/rejected": -6562723.0, "logps/chosen": -441.3475748697917, "logps/rejected": -142.77899169921875, "loss": 0.2755, "rewards/chosen": 1.0825510025024414, "rewards/margins": 4.29016637802124, "rewards/rejected": -3.207615375518799, "step": 18072 }, { "epoch": 0.957941324570005, "grad_norm": 43.0, "kl": 0.8780460357666016, "learning_rate": 5e-07, "logits/chosen": -25745989.333333332, "logits/rejected": -4012066.8, "logps/chosen": -240.6146443684896, "logps/rejected": -348.65498046875, "loss": 0.2466, "rewards/chosen": 0.7406972249348959, "rewards/margins": 3.8227464040120442, "rewards/rejected": -3.0820491790771483, "step": 18073 }, { "epoch": 0.9579943285718072, "grad_norm": 50.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54353216.0, "logits/rejected": -9129642.4, "logps/chosen": -485.6896158854167, "logps/rejected": -261.462939453125, "loss": 0.244, "rewards/chosen": 0.19213563203811646, "rewards/margins": 2.450059711933136, "rewards/rejected": -2.2579240798950195, "step": 18074 }, { "epoch": 0.9580473325736093, "grad_norm": 56.75, "kl": 4.2341766357421875, "learning_rate": 5e-07, "logits/chosen": -10135057.6, "logits/rejected": -3293371.6666666665, "logps/chosen": -459.62578125, "logps/rejected": -154.58716837565103, "loss": 0.354, "rewards/chosen": 0.7582749366760254, "rewards/margins": 3.8232824643452963, "rewards/rejected": -3.065007527669271, "step": 18075 }, { "epoch": 0.9581003365754115, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1278657.3333333333, "logits/rejected": -37181216.0, "logps/chosen": -196.4773966471354, "logps/rejected": -224.3257568359375, "loss": 0.2767, "rewards/chosen": 0.05352922280629476, "rewards/margins": 3.072308214505514, "rewards/rejected": -3.018778991699219, "step": 18076 }, { "epoch": 0.9581533405772136, "grad_norm": 41.5, "kl": 5.434478759765625, "learning_rate": 5e-07, "logits/chosen": -18942830.4, "logits/rejected": -16120173.333333334, "logps/chosen": -329.197265625, "logps/rejected": -126.81832885742188, "loss": 0.2859, "rewards/chosen": 1.4446528434753418, "rewards/margins": 1.9452409426371258, "rewards/rejected": -0.5005880991617838, "step": 18077 }, { "epoch": 0.9582063445790158, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31687204.0, "logits/rejected": -22184104.0, "logps/chosen": -289.63116455078125, "logps/rejected": -324.1953531901042, "loss": 0.2335, "rewards/chosen": -0.2742229700088501, "rewards/margins": 1.9545231262842813, "rewards/rejected": -2.2287460962931314, "step": 18078 }, { "epoch": 0.9582593485808178, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -58385130.666666664, "logits/rejected": -11928248.0, "logps/chosen": -374.9090983072917, "logps/rejected": -422.46435546875, "loss": 0.2637, "rewards/chosen": 0.8121561209360758, "rewards/margins": 2.6368292013804115, "rewards/rejected": -1.8246730804443358, "step": 18079 }, { "epoch": 0.95831235258262, "grad_norm": 40.5, "kl": 0.1613922119140625, "learning_rate": 5e-07, "logits/chosen": -47022906.666666664, "logits/rejected": -16203627.2, "logps/chosen": -595.7965087890625, "logps/rejected": -144.80618896484376, "loss": 0.2206, "rewards/chosen": 1.097682237625122, "rewards/margins": 3.1937159061431886, "rewards/rejected": -2.0960336685180665, "step": 18080 }, { "epoch": 0.9583653565844221, "grad_norm": 80.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32476688.0, "logits/rejected": -10706162.0, "logps/chosen": -543.4176025390625, "logps/rejected": -175.00665283203125, "loss": 0.2547, "rewards/chosen": 0.46004486083984375, "rewards/margins": 2.988994598388672, "rewards/rejected": -2.528949737548828, "step": 18081 }, { "epoch": 0.9584183605862242, "grad_norm": 105.5, "kl": 7.299129486083984, "learning_rate": 5e-07, "logits/chosen": -108542412.8, "logits/rejected": -37408192.0, "logps/chosen": -463.76533203125, "logps/rejected": -440.7196451822917, "loss": 0.3632, "rewards/chosen": 1.0412182807922363, "rewards/margins": 3.700160026550293, "rewards/rejected": -2.6589417457580566, "step": 18082 }, { "epoch": 0.9584713645880264, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12938050.0, "logits/rejected": -19635468.0, "logps/chosen": -390.08770751953125, "logps/rejected": -181.8050537109375, "loss": 0.3238, "rewards/chosen": 0.021705836057662964, "rewards/margins": 1.9492721259593964, "rewards/rejected": -1.9275662899017334, "step": 18083 }, { "epoch": 0.9585243685898285, "grad_norm": 46.5, "kl": 2.9425697326660156, "learning_rate": 5e-07, "logits/chosen": 13020264.0, "logits/rejected": -32961236.0, "logps/chosen": -125.83991241455078, "logps/rejected": -200.6190185546875, "loss": 0.3587, "rewards/chosen": 0.3680729866027832, "rewards/margins": 2.8397104740142822, "rewards/rejected": -2.471637487411499, "step": 18084 }, { "epoch": 0.9585773725916307, "grad_norm": 59.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -85948490.66666667, "logits/rejected": -20837977.6, "logps/chosen": -385.5634358723958, "logps/rejected": -411.2482421875, "loss": 0.1398, "rewards/chosen": 1.0735829671223958, "rewards/margins": 4.455957539876302, "rewards/rejected": -3.382374572753906, "step": 18085 }, { "epoch": 0.9586303765934328, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66873088.0, "logits/rejected": -1870405.8333333333, "logps/chosen": -572.8777465820312, "logps/rejected": -173.28336588541666, "loss": 0.2551, "rewards/chosen": -0.02264251746237278, "rewards/margins": 1.9408457434425752, "rewards/rejected": -1.963488260904948, "step": 18086 }, { "epoch": 0.958683380595235, "grad_norm": 64.0, "kl": 4.479194641113281, "learning_rate": 5e-07, "logits/chosen": -36456288.0, "logits/rejected": -22633637.333333332, "logps/chosen": -273.667919921875, "logps/rejected": -170.87650553385416, "loss": 0.3314, "rewards/chosen": 0.8149335861206055, "rewards/margins": 2.7770418802897137, "rewards/rejected": -1.9621082941691081, "step": 18087 }, { "epoch": 0.958736384597037, "grad_norm": 45.75, "kl": 1.7424440383911133, "learning_rate": 5e-07, "logits/chosen": -14873570.666666666, "logits/rejected": -97259400.0, "logps/chosen": -208.177978515625, "logps/rejected": -318.96478271484375, "loss": 0.3034, "rewards/chosen": 0.8742674191792806, "rewards/margins": 2.9661527474721274, "rewards/rejected": -2.0918853282928467, "step": 18088 }, { "epoch": 0.9587893885988392, "grad_norm": 52.25, "kl": 2.8388633728027344, "learning_rate": 5e-07, "logits/chosen": -10989250.0, "logits/rejected": -6734842.0, "logps/chosen": -198.6632843017578, "logps/rejected": -163.06564331054688, "loss": 0.3583, "rewards/chosen": 0.5077983736991882, "rewards/margins": 2.266874611377716, "rewards/rejected": -1.7590762376785278, "step": 18089 }, { "epoch": 0.9588423926006413, "grad_norm": 50.0, "kl": 5.203306198120117, "learning_rate": 5e-07, "logits/chosen": -26906754.0, "logits/rejected": 5475531.5, "logps/chosen": -711.3959350585938, "logps/rejected": -276.26214599609375, "loss": 0.2707, "rewards/chosen": 1.6075613498687744, "rewards/margins": 3.6363275051116943, "rewards/rejected": -2.02876615524292, "step": 18090 }, { "epoch": 0.9588953966024435, "grad_norm": 52.0, "kl": 0.23186492919921875, "learning_rate": 5e-07, "logits/chosen": -51152666.666666664, "logits/rejected": -53341300.0, "logps/chosen": -174.3543904622396, "logps/rejected": -527.9481811523438, "loss": 0.4038, "rewards/chosen": -0.08637689550717671, "rewards/margins": 3.2591981987158456, "rewards/rejected": -3.3455750942230225, "step": 18091 }, { "epoch": 0.9589484006042456, "grad_norm": 50.75, "kl": 2.3389625549316406, "learning_rate": 5e-07, "logits/chosen": -37045760.0, "logits/rejected": -24311933.333333332, "logps/chosen": -273.99873046875, "logps/rejected": -235.77107747395834, "loss": 0.3267, "rewards/chosen": 0.3461117744445801, "rewards/margins": 3.691793918609619, "rewards/rejected": -3.345682144165039, "step": 18092 }, { "epoch": 0.9590014046060478, "grad_norm": 81.5, "kl": 2.562591552734375, "learning_rate": 5e-07, "logits/chosen": -12309680.0, "logits/rejected": -23529404.0, "logps/chosen": -252.12167358398438, "logps/rejected": -320.2817077636719, "loss": 0.3305, "rewards/chosen": 0.5362542867660522, "rewards/margins": 2.1208690404891968, "rewards/rejected": -1.5846147537231445, "step": 18093 }, { "epoch": 0.9590544086078499, "grad_norm": 35.5, "kl": 1.5063762664794922, "learning_rate": 5e-07, "logits/chosen": -18760606.0, "logits/rejected": -10809767.333333334, "logps/chosen": -421.6590576171875, "logps/rejected": -211.93302408854166, "loss": 0.1581, "rewards/chosen": 1.558380126953125, "rewards/margins": 3.786296844482422, "rewards/rejected": -2.227916717529297, "step": 18094 }, { "epoch": 0.959107412609652, "grad_norm": 49.75, "kl": 1.1541547775268555, "learning_rate": 5e-07, "logits/chosen": -15079120.0, "logits/rejected": -18351534.4, "logps/chosen": -238.56498209635416, "logps/rejected": -527.1384765625, "loss": 0.2599, "rewards/chosen": 0.7010586261749268, "rewards/margins": 2.5867828845977785, "rewards/rejected": -1.8857242584228515, "step": 18095 }, { "epoch": 0.9591604166114541, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15020492.0, "logits/rejected": -35964404.0, "logps/chosen": -243.07650756835938, "logps/rejected": -248.83055114746094, "loss": 0.3068, "rewards/chosen": 0.35196056962013245, "rewards/margins": 2.061196595430374, "rewards/rejected": -1.7092360258102417, "step": 18096 }, { "epoch": 0.9592134206132563, "grad_norm": 41.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37666482.28571428, "logits/rejected": -1267805.0, "logps/chosen": -246.215087890625, "logps/rejected": -95.58563232421875, "loss": 0.3015, "rewards/chosen": 0.9251006671360561, "rewards/margins": 4.447288615362985, "rewards/rejected": -3.5221879482269287, "step": 18097 }, { "epoch": 0.9592664246150584, "grad_norm": 50.25, "kl": 1.1361312866210938, "learning_rate": 5e-07, "logits/chosen": 10570482.0, "logits/rejected": -65128460.0, "logps/chosen": -351.4425048828125, "logps/rejected": -429.2494201660156, "loss": 0.2433, "rewards/chosen": 0.5386713743209839, "rewards/margins": 3.5929559469223022, "rewards/rejected": -3.0542845726013184, "step": 18098 }, { "epoch": 0.9593194286168606, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49443875.2, "logits/rejected": -59212608.0, "logps/chosen": -436.5986328125, "logps/rejected": -342.39404296875, "loss": 0.2386, "rewards/chosen": 0.6124017715454102, "rewards/margins": 4.473179308573405, "rewards/rejected": -3.8607775370279946, "step": 18099 }, { "epoch": 0.9593724326186627, "grad_norm": 67.5, "kl": 3.8937454223632812, "learning_rate": 5e-07, "logits/chosen": 4672925.333333333, "logits/rejected": -38543328.0, "logps/chosen": -403.8924560546875, "logps/rejected": -392.694482421875, "loss": 0.3043, "rewards/chosen": -0.15516104300816855, "rewards/margins": 1.3133781949679058, "rewards/rejected": -1.4685392379760742, "step": 18100 }, { "epoch": 0.9594254366204649, "grad_norm": 49.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5049782.0, "logits/rejected": -11322014.857142856, "logps/chosen": -367.33349609375, "logps/rejected": -404.59730747767856, "loss": 0.1475, "rewards/chosen": 1.3827027082443237, "rewards/margins": 4.201438920838492, "rewards/rejected": -2.8187362125941684, "step": 18101 }, { "epoch": 0.959478440622267, "grad_norm": 50.75, "kl": 1.1976661682128906, "learning_rate": 5e-07, "logits/chosen": -43222776.0, "logits/rejected": -15953123.0, "logps/chosen": -601.5256754557291, "logps/rejected": -240.56727600097656, "loss": 0.3035, "rewards/chosen": 1.075408935546875, "rewards/margins": 2.8154854774475098, "rewards/rejected": -1.7400765419006348, "step": 18102 }, { "epoch": 0.9595314446240691, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -478317.0625, "logits/rejected": -25805838.0, "logps/chosen": -214.29693603515625, "logps/rejected": -445.8400573730469, "loss": 0.2769, "rewards/chosen": 0.4211798310279846, "rewards/margins": 3.0296936631202698, "rewards/rejected": -2.608513832092285, "step": 18103 }, { "epoch": 0.9595844486258712, "grad_norm": 35.75, "kl": 4.717320442199707, "learning_rate": 5e-07, "logits/chosen": 7999547.0, "logits/rejected": -19561253.333333332, "logps/chosen": -1371.6961669921875, "logps/rejected": -301.3894856770833, "loss": 0.1438, "rewards/chosen": 4.074014186859131, "rewards/margins": 6.272075176239014, "rewards/rejected": -2.198060989379883, "step": 18104 }, { "epoch": 0.9596374526276734, "grad_norm": 65.0, "kl": 6.34110689163208, "learning_rate": 5e-07, "logits/chosen": -37971344.0, "logits/rejected": -27041170.0, "logps/chosen": -567.6923828125, "logps/rejected": -230.73614501953125, "loss": 0.2972, "rewards/chosen": 0.8379325866699219, "rewards/margins": 2.5851986408233643, "rewards/rejected": -1.7472660541534424, "step": 18105 }, { "epoch": 0.9596904566294755, "grad_norm": 53.5, "kl": 0.48177433013916016, "learning_rate": 5e-07, "logits/chosen": -57170643.2, "logits/rejected": -7208533.333333333, "logps/chosen": -401.010595703125, "logps/rejected": -566.5686848958334, "loss": 0.3102, "rewards/chosen": 0.48889665603637694, "rewards/margins": 3.4469019254048665, "rewards/rejected": -2.9580052693684897, "step": 18106 }, { "epoch": 0.9597434606312777, "grad_norm": 62.5, "kl": 0.16827774047851562, "learning_rate": 5e-07, "logits/chosen": -27375932.0, "logits/rejected": -33634876.0, "logps/chosen": -225.28854370117188, "logps/rejected": -134.25796508789062, "loss": 0.3293, "rewards/chosen": 0.19347821176052094, "rewards/margins": 2.0174096673727036, "rewards/rejected": -1.8239314556121826, "step": 18107 }, { "epoch": 0.9597964646330798, "grad_norm": 44.75, "kl": 1.4645023345947266, "learning_rate": 5e-07, "logits/chosen": -8379884.5, "logits/rejected": -19794930.0, "logps/chosen": -450.2447509765625, "logps/rejected": -208.14013671875, "loss": 0.2927, "rewards/chosen": 1.100764274597168, "rewards/margins": 3.0315616130828857, "rewards/rejected": -1.9307973384857178, "step": 18108 }, { "epoch": 0.959849468634882, "grad_norm": 71.5, "kl": 8.356103897094727, "learning_rate": 5e-07, "logits/chosen": -20757262.666666668, "logits/rejected": -28207784.0, "logps/chosen": -317.4889322916667, "logps/rejected": -180.05592346191406, "loss": 0.4169, "rewards/chosen": 1.127883752187093, "rewards/margins": 2.2579389413197832, "rewards/rejected": -1.1300551891326904, "step": 18109 }, { "epoch": 0.959902472636684, "grad_norm": 44.75, "kl": 0.7073631286621094, "learning_rate": 5e-07, "logits/chosen": -42218544.0, "logits/rejected": -25992564.0, "logps/chosen": -356.96405029296875, "logps/rejected": -367.4658203125, "loss": 0.28, "rewards/chosen": 0.027465075254440308, "rewards/margins": 4.0829183757305145, "rewards/rejected": -4.055453300476074, "step": 18110 }, { "epoch": 0.9599554766384862, "grad_norm": 51.0, "kl": 0.5179367065429688, "learning_rate": 5e-07, "logits/chosen": -23954110.0, "logits/rejected": -5623749.5, "logps/chosen": -323.7049255371094, "logps/rejected": -365.10626220703125, "loss": 0.3714, "rewards/chosen": -0.0809730663895607, "rewards/margins": 1.4090073928236961, "rewards/rejected": -1.4899804592132568, "step": 18111 }, { "epoch": 0.9600084806402883, "grad_norm": 45.5, "kl": 2.228668212890625, "learning_rate": 5e-07, "logits/chosen": -20177276.0, "logits/rejected": -11515780.0, "logps/chosen": -396.7467956542969, "logps/rejected": -135.05409240722656, "loss": 0.2329, "rewards/chosen": 1.005807876586914, "rewards/margins": 2.9517741203308105, "rewards/rejected": -1.9459662437438965, "step": 18112 }, { "epoch": 0.9600614846420905, "grad_norm": 38.0, "kl": 0.6592960357666016, "learning_rate": 5e-07, "logits/chosen": -10531765.333333334, "logits/rejected": -24581113.6, "logps/chosen": -418.7416585286458, "logps/rejected": -260.617138671875, "loss": 0.1615, "rewards/chosen": 0.6404144366582235, "rewards/margins": 4.095603569348653, "rewards/rejected": -3.4551891326904296, "step": 18113 }, { "epoch": 0.9601144886438926, "grad_norm": 34.25, "kl": 5.289334297180176, "learning_rate": 5e-07, "logits/chosen": -35291174.4, "logits/rejected": -25459024.0, "logps/chosen": -496.251171875, "logps/rejected": -238.29427083333334, "loss": 0.2971, "rewards/chosen": 1.3642618179321289, "rewards/margins": 3.665190251668294, "rewards/rejected": -2.3009284337361655, "step": 18114 }, { "epoch": 0.9601674926456948, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 17756664.0, "logits/rejected": -42325248.0, "logps/chosen": -226.006591796875, "logps/rejected": -601.3177734375, "loss": 0.1582, "rewards/chosen": 0.9327998161315918, "rewards/margins": 4.191940212249756, "rewards/rejected": -3.259140396118164, "step": 18115 }, { "epoch": 0.9602204966474969, "grad_norm": 62.0, "kl": 0.011651992797851562, "learning_rate": 5e-07, "logits/chosen": -18300824.0, "logits/rejected": -34490968.0, "logps/chosen": -295.84613037109375, "logps/rejected": -286.27081298828125, "loss": 0.3439, "rewards/chosen": 0.018164262175559998, "rewards/margins": 2.000908836722374, "rewards/rejected": -1.982744574546814, "step": 18116 }, { "epoch": 0.9602735006492991, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22433598.0, "logits/rejected": -39386064.0, "logps/chosen": -805.3665771484375, "logps/rejected": -358.525146484375, "loss": 0.1221, "rewards/chosen": 2.0009002685546875, "rewards/margins": 4.312596480051676, "rewards/rejected": -2.311696211496989, "step": 18117 }, { "epoch": 0.9603265046511011, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 14699755.0, "logits/rejected": -28505273.14285714, "logps/chosen": -265.8331298828125, "logps/rejected": -297.06881277901783, "loss": 0.1388, "rewards/chosen": 0.03050537221133709, "rewards/margins": 2.818715832329222, "rewards/rejected": -2.788210460117885, "step": 18118 }, { "epoch": 0.9603795086529033, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31624900.0, "logits/rejected": -7842072.0, "logps/chosen": -297.24932861328125, "logps/rejected": -215.75721958705358, "loss": 0.173, "rewards/chosen": 1.04571533203125, "rewards/margins": 3.1884987694876537, "rewards/rejected": -2.1427834374564037, "step": 18119 }, { "epoch": 0.9604325126547054, "grad_norm": 60.75, "kl": 1.2874393463134766, "learning_rate": 5e-07, "logits/chosen": 3301524.0, "logits/rejected": -22729904.0, "logps/chosen": -417.42523193359375, "logps/rejected": -479.0128173828125, "loss": 0.2638, "rewards/chosen": 0.3950868844985962, "rewards/margins": 4.040488362312317, "rewards/rejected": -3.6454014778137207, "step": 18120 }, { "epoch": 0.9604855166565076, "grad_norm": 99.5, "kl": 2.949678897857666, "learning_rate": 5e-07, "logits/chosen": -45361292.8, "logits/rejected": -20674144.0, "logps/chosen": -399.1197265625, "logps/rejected": -554.079345703125, "loss": 0.3399, "rewards/chosen": 0.8632772445678711, "rewards/margins": 2.3149431228637694, "rewards/rejected": -1.4516658782958984, "step": 18121 }, { "epoch": 0.9605385206583097, "grad_norm": 43.75, "kl": 0.2219867706298828, "learning_rate": 5e-07, "logits/chosen": 90810984.0, "logits/rejected": -30139789.333333332, "logps/chosen": -414.16070556640625, "logps/rejected": -254.18082682291666, "loss": 0.1801, "rewards/chosen": 1.1044418811798096, "rewards/margins": 3.6029605070749917, "rewards/rejected": -2.498518625895182, "step": 18122 }, { "epoch": 0.9605915246601119, "grad_norm": 51.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63785808.0, "logits/rejected": -60483526.4, "logps/chosen": -369.3170979817708, "logps/rejected": -440.949755859375, "loss": 0.2637, "rewards/chosen": -0.0019612709681193032, "rewards/margins": 2.61770826180776, "rewards/rejected": -2.619669532775879, "step": 18123 }, { "epoch": 0.960644528661914, "grad_norm": 34.0, "kl": 0.467742919921875, "learning_rate": 5e-07, "logits/chosen": -19239446.666666668, "logits/rejected": -24121640.0, "logps/chosen": -636.7810465494791, "logps/rejected": -433.47666015625, "loss": 0.1581, "rewards/chosen": 1.1418418884277344, "rewards/margins": 4.521743392944336, "rewards/rejected": -3.3799015045166017, "step": 18124 }, { "epoch": 0.9606975326637162, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -45510716.0, "logps/rejected": -360.8516540527344, "loss": 0.0941, "rewards/rejected": -2.5868306159973145, "step": 18125 }, { "epoch": 0.9607505366655182, "grad_norm": 34.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18409566.0, "logits/rejected": -41703016.0, "logps/chosen": -205.37144470214844, "logps/rejected": -537.4246826171875, "loss": 0.2402, "rewards/chosen": 0.40267762541770935, "rewards/margins": 3.4147320687770844, "rewards/rejected": -3.012054443359375, "step": 18126 }, { "epoch": 0.9608035406673204, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19481081.6, "logits/rejected": -56818426.666666664, "logps/chosen": -241.874169921875, "logps/rejected": -539.5144856770834, "loss": 0.2468, "rewards/chosen": 0.5370899200439453, "rewards/margins": 6.531151707967122, "rewards/rejected": -5.994061787923177, "step": 18127 }, { "epoch": 0.9608565446691225, "grad_norm": 59.0, "kl": 2.8596973419189453, "learning_rate": 5e-07, "logits/chosen": -29357234.666666668, "logits/rejected": -46483900.0, "logps/chosen": -294.9693196614583, "logps/rejected": -347.7880859375, "loss": 0.4154, "rewards/chosen": 0.5075679222742716, "rewards/margins": 1.4091723958651223, "rewards/rejected": -0.9016044735908508, "step": 18128 }, { "epoch": 0.9609095486709247, "grad_norm": 41.0, "kl": 1.4286251068115234, "learning_rate": 5e-07, "logits/chosen": -36347301.333333336, "logits/rejected": -23222248.0, "logps/chosen": -152.8028361002604, "logps/rejected": -457.83642578125, "loss": 0.2632, "rewards/chosen": 0.22496004899342856, "rewards/margins": 3.3025368769963586, "rewards/rejected": -3.07757682800293, "step": 18129 }, { "epoch": 0.9609625526727268, "grad_norm": 57.25, "kl": 3.4690933227539062, "learning_rate": 5e-07, "logits/chosen": -41149144.0, "logits/rejected": -4704107.0, "logps/chosen": -303.7093505859375, "logps/rejected": -287.59619140625, "loss": 0.212, "rewards/chosen": 1.0393069982528687, "rewards/margins": 3.6627358198165894, "rewards/rejected": -2.6234288215637207, "step": 18130 }, { "epoch": 0.961015556674529, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18913304.0, "logits/rejected": -26737552.0, "logps/chosen": -581.5872802734375, "logps/rejected": -352.3224283854167, "loss": 0.1224, "rewards/chosen": 1.0827758312225342, "rewards/margins": 5.143658399581909, "rewards/rejected": -4.060882568359375, "step": 18131 }, { "epoch": 0.9610685606763311, "grad_norm": 58.25, "kl": 0.5358848571777344, "learning_rate": 5e-07, "logits/chosen": -11293472.0, "logits/rejected": 2237365.0, "logps/chosen": -277.52859933035717, "logps/rejected": -381.6860656738281, "loss": 0.3475, "rewards/chosen": 0.6172270093645368, "rewards/margins": 1.7412107501711165, "rewards/rejected": -1.1239837408065796, "step": 18132 }, { "epoch": 0.9611215646781331, "grad_norm": 43.5, "kl": 1.5269145965576172, "learning_rate": 5e-07, "logits/chosen": -27886342.4, "logits/rejected": -7766901.333333333, "logps/chosen": -112.5489013671875, "logps/rejected": -335.2747395833333, "loss": 0.2447, "rewards/chosen": 0.8146725654602051, "rewards/margins": 3.3117622693379722, "rewards/rejected": -2.497089703877767, "step": 18133 }, { "epoch": 0.9611745686799353, "grad_norm": 39.5, "kl": 2.4994277954101562, "learning_rate": 5e-07, "logits/chosen": 1807528.25, "logits/rejected": -7281179.5, "logps/chosen": -276.9061279296875, "logps/rejected": -318.9095764160156, "loss": 0.205, "rewards/chosen": 1.6597249507904053, "rewards/margins": 4.275649785995483, "rewards/rejected": -2.615924835205078, "step": 18134 }, { "epoch": 0.9612275726817374, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13906724.0, "logits/rejected": -51883848.0, "logps/chosen": -569.4161376953125, "logps/rejected": -449.55120849609375, "loss": 0.2525, "rewards/chosen": 0.45245361328125, "rewards/margins": 3.6335487365722656, "rewards/rejected": -3.1810951232910156, "step": 18135 }, { "epoch": 0.9612805766835396, "grad_norm": 48.0, "kl": 7.114170074462891, "learning_rate": 5e-07, "logits/chosen": -25187341.333333332, "logits/rejected": -33563568.0, "logps/chosen": -228.5139363606771, "logps/rejected": -462.007568359375, "loss": 0.4273, "rewards/chosen": 0.6920960744222006, "rewards/margins": 3.0956972440083823, "rewards/rejected": -2.4036011695861816, "step": 18136 }, { "epoch": 0.9613335806853417, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 31226500.0, "logits/rejected": -41606589.333333336, "logps/chosen": -221.27781677246094, "logps/rejected": -401.7261149088542, "loss": 0.2087, "rewards/chosen": -0.18042269349098206, "rewards/margins": 2.7588676710923514, "rewards/rejected": -2.9392903645833335, "step": 18137 }, { "epoch": 0.9613865846871439, "grad_norm": 48.75, "kl": 4.049245834350586, "learning_rate": 5e-07, "logits/chosen": -31008537.6, "logits/rejected": -26121717.333333332, "logps/chosen": -377.4051513671875, "logps/rejected": -289.7652180989583, "loss": 0.3162, "rewards/chosen": 1.2344744682312012, "rewards/margins": 3.0209054629007976, "rewards/rejected": -1.7864309946695964, "step": 18138 }, { "epoch": 0.961439588688946, "grad_norm": 50.75, "kl": 5.622737884521484, "learning_rate": 5e-07, "logits/chosen": -46891680.0, "logits/rejected": -93631792.0, "logps/chosen": -294.48228236607144, "logps/rejected": -492.61212158203125, "loss": 0.3674, "rewards/chosen": 0.9612338202340263, "rewards/margins": 3.573593582425799, "rewards/rejected": -2.6123597621917725, "step": 18139 }, { "epoch": 0.9614925926907482, "grad_norm": 58.5, "kl": 2.2419347763061523, "learning_rate": 5e-07, "logits/chosen": -37432499.2, "logits/rejected": 1457323.3333333333, "logps/chosen": -307.9660888671875, "logps/rejected": -315.52939860026044, "loss": 0.352, "rewards/chosen": 0.8446375846862793, "rewards/margins": 2.1315948486328127, "rewards/rejected": -1.2869572639465332, "step": 18140 }, { "epoch": 0.9615455966925502, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4763587.0, "logits/rejected": -31860461.714285713, "logps/chosen": -21.03327751159668, "logps/rejected": -309.6224888392857, "loss": 0.114, "rewards/chosen": 1.1319894790649414, "rewards/margins": 3.6562367847987582, "rewards/rejected": -2.524247305733817, "step": 18141 }, { "epoch": 0.9615986006943524, "grad_norm": 51.0, "kl": 0.01290130615234375, "learning_rate": 5e-07, "logits/chosen": -19091690.666666668, "logits/rejected": -9973010.4, "logps/chosen": -377.2777506510417, "logps/rejected": -180.1164794921875, "loss": 0.184, "rewards/chosen": 1.2958351771036785, "rewards/margins": 3.242832056681315, "rewards/rejected": -1.9469968795776367, "step": 18142 }, { "epoch": 0.9616516046961545, "grad_norm": 45.75, "kl": 0.9786376953125, "learning_rate": 5e-07, "logits/chosen": -71298421.33333333, "logits/rejected": -25419051.2, "logps/chosen": -295.6351318359375, "logps/rejected": -189.8443359375, "loss": 0.2798, "rewards/chosen": 0.5316975911458334, "rewards/margins": 2.2935107549031577, "rewards/rejected": -1.7618131637573242, "step": 18143 }, { "epoch": 0.9617046086979567, "grad_norm": 60.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 223829408.0, "logits/rejected": -15563224.0, "logps/chosen": -606.314208984375, "logps/rejected": -299.72979736328125, "loss": 0.2022, "rewards/chosen": 0.17117615044116974, "rewards/margins": 2.648459787170092, "rewards/rejected": -2.4772836367289224, "step": 18144 }, { "epoch": 0.9617576126997588, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -142018508.8, "logits/rejected": -22706434.666666668, "logps/chosen": -533.479248046875, "logps/rejected": -285.51112874348956, "loss": 0.2958, "rewards/chosen": 0.3925930500030518, "rewards/margins": 2.6498613198598227, "rewards/rejected": -2.257268269856771, "step": 18145 }, { "epoch": 0.961810616701561, "grad_norm": 41.25, "kl": 3.3194808959960938, "learning_rate": 5e-07, "logits/chosen": -10336057.6, "logits/rejected": -21618596.0, "logps/chosen": -262.983447265625, "logps/rejected": -159.00509643554688, "loss": 0.2342, "rewards/chosen": 1.0569965362548828, "rewards/margins": 3.525336138407389, "rewards/rejected": -2.4683396021525064, "step": 18146 }, { "epoch": 0.9618636207033631, "grad_norm": 25.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68601640.0, "logits/rejected": -47316085.333333336, "logps/chosen": -180.49386596679688, "logps/rejected": -441.2814534505208, "loss": 0.1641, "rewards/chosen": 0.6835685968399048, "rewards/margins": 3.2313584884007773, "rewards/rejected": -2.5477898915608725, "step": 18147 }, { "epoch": 0.9619166247051653, "grad_norm": 71.5, "kl": 0.6660385131835938, "learning_rate": 5e-07, "logits/chosen": -4588746.0, "logits/rejected": -1782126.0, "logps/chosen": -83.94454956054688, "logps/rejected": -129.63502197265626, "loss": 0.3136, "rewards/chosen": 0.6697421073913574, "rewards/margins": 1.9733511924743652, "rewards/rejected": -1.3036090850830078, "step": 18148 }, { "epoch": 0.9619696287069673, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14072879.0, "logits/rejected": -47949504.0, "logps/chosen": -205.9044952392578, "logps/rejected": -613.46337890625, "loss": 0.2839, "rewards/chosen": -0.12827959656715393, "rewards/margins": 4.413731664419174, "rewards/rejected": -4.542011260986328, "step": 18149 }, { "epoch": 0.9620226327087695, "grad_norm": 43.5, "kl": 0.7324886322021484, "learning_rate": 5e-07, "logits/chosen": -17316669.333333332, "logits/rejected": -22752070.4, "logps/chosen": -696.2345377604166, "logps/rejected": -324.1345703125, "loss": 0.1972, "rewards/chosen": 1.1681836446126301, "rewards/margins": 3.3153413136800127, "rewards/rejected": -2.147157669067383, "step": 18150 }, { "epoch": 0.9620756367105716, "grad_norm": 57.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37705085.333333336, "logits/rejected": -23529459.2, "logps/chosen": -424.6092936197917, "logps/rejected": -306.472216796875, "loss": 0.2003, "rewards/chosen": 0.6732207934061686, "rewards/margins": 3.7195357958475745, "rewards/rejected": -3.046315002441406, "step": 18151 }, { "epoch": 0.9621286407123738, "grad_norm": 73.5, "kl": 0.5096912384033203, "learning_rate": 5e-07, "logits/chosen": -51994521.6, "logits/rejected": 5288699.666666667, "logps/chosen": -330.0364990234375, "logps/rejected": -217.4655558268229, "loss": 0.3397, "rewards/chosen": 0.3120110988616943, "rewards/margins": 2.6491507053375245, "rewards/rejected": -2.33713960647583, "step": 18152 }, { "epoch": 0.9621816447141759, "grad_norm": 58.25, "kl": 1.4540901184082031, "learning_rate": 5e-07, "logits/chosen": -19963106.0, "logits/rejected": -12058830.0, "logps/chosen": -328.19097900390625, "logps/rejected": -195.48861694335938, "loss": 0.169, "rewards/chosen": 1.3906431198120117, "rewards/margins": 5.311168193817139, "rewards/rejected": -3.920525074005127, "step": 18153 }, { "epoch": 0.9622346487159781, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16242818.0, "logits/rejected": -14884568.0, "logps/chosen": -162.73443603515625, "logps/rejected": -289.3797607421875, "loss": 0.2511, "rewards/chosen": 0.023432349786162376, "rewards/margins": 2.3678414976845183, "rewards/rejected": -2.344409147898356, "step": 18154 }, { "epoch": 0.9622876527177802, "grad_norm": 52.0, "kl": 0.09025192260742188, "learning_rate": 5e-07, "logits/chosen": -46062492.0, "logits/rejected": -15524830.666666666, "logps/chosen": -272.34906005859375, "logps/rejected": -220.09749348958334, "loss": 0.2095, "rewards/chosen": 0.4775795042514801, "rewards/margins": 2.3109271426995592, "rewards/rejected": -1.8333476384480794, "step": 18155 }, { "epoch": 0.9623406567195824, "grad_norm": 33.75, "kl": 2.3914718627929688, "learning_rate": 5e-07, "logits/chosen": -23807162.666666668, "logits/rejected": -27608944.0, "logps/chosen": -159.93771362304688, "logps/rejected": -362.631103515625, "loss": 0.2192, "rewards/chosen": 0.927448590596517, "rewards/margins": 3.7303405125935876, "rewards/rejected": -2.8028919219970705, "step": 18156 }, { "epoch": 0.9623936607213844, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8840420.0, "logits/rejected": -15101680.0, "logps/chosen": -320.908935546875, "logps/rejected": -264.91015625, "loss": 0.213, "rewards/chosen": 0.3783794343471527, "rewards/margins": 2.2739471693833666, "rewards/rejected": -1.8955677350362141, "step": 18157 }, { "epoch": 0.9624466647231866, "grad_norm": 59.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40697648.0, "logits/rejected": -39092140.0, "logps/chosen": -404.213134765625, "logps/rejected": -197.11746215820312, "loss": 0.3188, "rewards/chosen": -0.0052429139614105225, "rewards/margins": 2.70308056473732, "rewards/rejected": -2.7083234786987305, "step": 18158 }, { "epoch": 0.9624996687249887, "grad_norm": 51.0, "kl": 0.13141345977783203, "learning_rate": 5e-07, "logits/chosen": -33839864.0, "logits/rejected": -13788114.0, "logps/chosen": -349.20379638671875, "logps/rejected": -151.71897888183594, "loss": 0.2284, "rewards/chosen": 0.5669434070587158, "rewards/margins": 3.817882776260376, "rewards/rejected": -3.25093936920166, "step": 18159 }, { "epoch": 0.9625526727267909, "grad_norm": 47.0, "kl": 4.942037582397461, "learning_rate": 5e-07, "logits/chosen": -83679957.33333333, "logits/rejected": -14049905.6, "logps/chosen": -372.3803304036458, "logps/rejected": -330.06455078125, "loss": 0.233, "rewards/chosen": 0.9827964305877686, "rewards/margins": 2.852867269515991, "rewards/rejected": -1.8700708389282226, "step": 18160 }, { "epoch": 0.962605676728593, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36873253.333333336, "logits/rejected": -41224312.0, "logps/chosen": -561.35888671875, "logps/rejected": -272.6748352050781, "loss": 0.2149, "rewards/chosen": 1.3657209078470867, "rewards/margins": 4.644173304239909, "rewards/rejected": -3.2784523963928223, "step": 18161 }, { "epoch": 0.9626586807303952, "grad_norm": 35.0, "kl": 2.202972412109375, "learning_rate": 5e-07, "logits/chosen": -95745904.0, "logits/rejected": -15634798.666666666, "logps/chosen": -460.69598388671875, "logps/rejected": -446.3442789713542, "loss": 0.1615, "rewards/chosen": 0.6188796758651733, "rewards/margins": 3.2623210350672402, "rewards/rejected": -2.643441359202067, "step": 18162 }, { "epoch": 0.9627116847321973, "grad_norm": 47.0, "kl": 0.9928140640258789, "learning_rate": 5e-07, "logits/chosen": -42534676.0, "logits/rejected": -30056640.0, "logps/chosen": -379.6821594238281, "logps/rejected": -325.7876281738281, "loss": 0.219, "rewards/chosen": 0.8544187545776367, "rewards/margins": 4.508031368255615, "rewards/rejected": -3.6536126136779785, "step": 18163 }, { "epoch": 0.9627646887339995, "grad_norm": 42.0, "kl": 2.689908981323242, "learning_rate": 5e-07, "logits/chosen": -53826553.6, "logits/rejected": -25238906.666666668, "logps/chosen": -436.7005859375, "logps/rejected": -497.7093505859375, "loss": 0.2461, "rewards/chosen": 0.960849380493164, "rewards/margins": 4.33678461710612, "rewards/rejected": -3.3759352366129556, "step": 18164 }, { "epoch": 0.9628176927358015, "grad_norm": 49.75, "kl": 1.6115074157714844, "learning_rate": 5e-07, "logits/chosen": -25921168.0, "logits/rejected": -16687766.0, "logps/chosen": -207.56712341308594, "logps/rejected": -334.8175354003906, "loss": 0.3022, "rewards/chosen": 0.48089009523391724, "rewards/margins": 2.5584433674812317, "rewards/rejected": -2.0775532722473145, "step": 18165 }, { "epoch": 0.9628706967376037, "grad_norm": 40.25, "kl": 0.46378326416015625, "learning_rate": 5e-07, "logits/chosen": -59543348.0, "logits/rejected": -8922328.0, "logps/chosen": -166.0461883544922, "logps/rejected": -414.3666585286458, "loss": 0.1782, "rewards/chosen": 0.014287188649177551, "rewards/margins": 3.1882857034603753, "rewards/rejected": -3.1739985148111978, "step": 18166 }, { "epoch": 0.9629237007394058, "grad_norm": 46.5, "kl": 0.5634078979492188, "learning_rate": 5e-07, "logits/chosen": -21406661.333333332, "logits/rejected": -23915640.0, "logps/chosen": -146.2890625, "logps/rejected": -248.49990234375, "loss": 0.2895, "rewards/chosen": 0.15784581502278647, "rewards/margins": 1.897976811726888, "rewards/rejected": -1.7401309967041017, "step": 18167 }, { "epoch": 0.962976704741208, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 8808461.333333334, "logits/rejected": -44514025.6, "logps/chosen": -39.00592549641927, "logps/rejected": -372.3552490234375, "loss": 0.249, "rewards/chosen": 0.34512964884440106, "rewards/margins": 2.2339367548624676, "rewards/rejected": -1.8888071060180665, "step": 18168 }, { "epoch": 0.9630297087430101, "grad_norm": 27.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -504567.84375, "logits/rejected": -14476300.57142857, "logps/chosen": -96.6217041015625, "logps/rejected": -289.7804652622768, "loss": 0.1453, "rewards/chosen": -0.6267043948173523, "rewards/margins": 2.3540270243372237, "rewards/rejected": -2.980731419154576, "step": 18169 }, { "epoch": 0.9630827127448123, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15648646.4, "logits/rejected": 1392842.6666666667, "logps/chosen": -152.4919677734375, "logps/rejected": -155.82172648111978, "loss": 0.3373, "rewards/chosen": 0.37952275276184083, "rewards/margins": 3.222082567214966, "rewards/rejected": -2.842559814453125, "step": 18170 }, { "epoch": 0.9631357167466144, "grad_norm": 62.0, "kl": 0.15744781494140625, "learning_rate": 5e-07, "logits/chosen": -21916288.0, "logits/rejected": 5590562.0, "logps/chosen": -207.0982462565104, "logps/rejected": -248.32135009765625, "loss": 0.3759, "rewards/chosen": 0.29299060503641766, "rewards/margins": 2.6410252253214517, "rewards/rejected": -2.348034620285034, "step": 18171 }, { "epoch": 0.9631887207484165, "grad_norm": 59.0, "kl": 0.04078483581542969, "learning_rate": 5e-07, "logits/chosen": -68164474.66666667, "logits/rejected": -2094029.2, "logps/chosen": -574.4922688802084, "logps/rejected": -181.7925048828125, "loss": 0.1581, "rewards/chosen": 0.8847147623697916, "rewards/margins": 3.771181360880534, "rewards/rejected": -2.8864665985107423, "step": 18172 }, { "epoch": 0.9632417247502186, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 15697160.0, "logits/rejected": -4532394.8, "logps/chosen": -325.6724039713542, "logps/rejected": -94.64979858398438, "loss": 0.243, "rewards/chosen": 0.22432289520899454, "rewards/margins": 2.638335422674815, "rewards/rejected": -2.4140125274658204, "step": 18173 }, { "epoch": 0.9632947287520208, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -60457306.666666664, "logits/rejected": -51472857.6, "logps/chosen": -347.79736328125, "logps/rejected": -308.3282958984375, "loss": 0.1793, "rewards/chosen": 0.4135500987370809, "rewards/margins": 4.143506153424581, "rewards/rejected": -3.7299560546875, "step": 18174 }, { "epoch": 0.9633477327538229, "grad_norm": 65.0, "kl": 0.7857666015625, "learning_rate": 5e-07, "logits/chosen": 14868972.8, "logits/rejected": -33314581.333333332, "logps/chosen": -181.56036376953125, "logps/rejected": -566.255859375, "loss": 0.2367, "rewards/chosen": 0.7517014503479004, "rewards/margins": 4.570342477162679, "rewards/rejected": -3.818641026814779, "step": 18175 }, { "epoch": 0.9634007367556251, "grad_norm": 32.25, "kl": 1.6681365966796875, "learning_rate": 5e-07, "logits/chosen": -5761313.333333333, "logits/rejected": -3775237.2, "logps/chosen": -170.103271484375, "logps/rejected": -318.907275390625, "loss": 0.2037, "rewards/chosen": 0.9148695468902588, "rewards/margins": 3.4760874271392823, "rewards/rejected": -2.5612178802490235, "step": 18176 }, { "epoch": 0.9634537407574272, "grad_norm": 43.0, "kl": 0.1540851593017578, "learning_rate": 5e-07, "logits/chosen": -26014476.0, "logits/rejected": -9918145.0, "logps/chosen": -259.89337158203125, "logps/rejected": -134.3279266357422, "loss": 0.2276, "rewards/chosen": 0.7552914023399353, "rewards/margins": 4.133247792720795, "rewards/rejected": -3.3779563903808594, "step": 18177 }, { "epoch": 0.9635067447592294, "grad_norm": 52.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15340246.0, "logits/rejected": -49593368.0, "logps/chosen": -458.9136047363281, "logps/rejected": -380.76727294921875, "loss": 0.2149, "rewards/chosen": 1.2265013456344604, "rewards/margins": 3.062557578086853, "rewards/rejected": -1.8360562324523926, "step": 18178 }, { "epoch": 0.9635597487610315, "grad_norm": 57.5, "kl": 0.739959716796875, "learning_rate": 5e-07, "logits/chosen": -69366378.66666667, "logits/rejected": -7702401.6, "logps/chosen": -300.7143961588542, "logps/rejected": -161.48526611328126, "loss": 0.2585, "rewards/chosen": 0.6488820314407349, "rewards/margins": 2.757827877998352, "rewards/rejected": -2.108945846557617, "step": 18179 }, { "epoch": 0.9636127527628336, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -795117.3333333334, "logits/rejected": -48498854.4, "logps/chosen": -217.93697102864584, "logps/rejected": -295.3046875, "loss": 0.2048, "rewards/chosen": 0.2588932116826375, "rewards/margins": 3.1424387057622276, "rewards/rejected": -2.88354549407959, "step": 18180 }, { "epoch": 0.9636657567646357, "grad_norm": 57.75, "kl": 4.307180404663086, "learning_rate": 5e-07, "logits/chosen": -48326997.333333336, "logits/rejected": -44563936.0, "logps/chosen": -363.8624674479167, "logps/rejected": -147.766064453125, "loss": 0.266, "rewards/chosen": 1.6334538459777832, "rewards/margins": 3.490480899810791, "rewards/rejected": -1.8570270538330078, "step": 18181 }, { "epoch": 0.9637187607664379, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73397760.0, "logits/rejected": -5071064.0, "logps/chosen": -278.659423828125, "logps/rejected": -299.4701232910156, "loss": 0.1944, "rewards/chosen": 0.7528533935546875, "rewards/margins": 5.104010105133057, "rewards/rejected": -4.351156711578369, "step": 18182 }, { "epoch": 0.96377176476824, "grad_norm": 43.25, "kl": 0.3892936706542969, "learning_rate": 5e-07, "logits/chosen": -13424997.333333334, "logits/rejected": -11236276.0, "logps/chosen": -128.89956665039062, "logps/rejected": -230.46640625, "loss": 0.2021, "rewards/chosen": 1.0109866460164387, "rewards/margins": 3.1622151692708336, "rewards/rejected": -2.1512285232543946, "step": 18183 }, { "epoch": 0.9638247687700421, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -94085680.0, "logits/rejected": -30937145.14285714, "logps/chosen": -340.3387756347656, "logps/rejected": -233.34837123325892, "loss": 0.1627, "rewards/chosen": -0.412271112203598, "rewards/margins": 2.407365096466882, "rewards/rejected": -2.81963620867048, "step": 18184 }, { "epoch": 0.9638777727718443, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28774260.0, "logits/rejected": -45009216.0, "logps/chosen": -149.47482299804688, "logps/rejected": -424.0089634486607, "loss": 0.1301, "rewards/chosen": 0.22280578315258026, "rewards/margins": 2.721934124827385, "rewards/rejected": -2.4991283416748047, "step": 18185 }, { "epoch": 0.9639307767736464, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15345061.0, "logits/rejected": -35511896.0, "logps/chosen": -248.33828735351562, "logps/rejected": -478.4192810058594, "loss": 0.2166, "rewards/chosen": 0.7983840107917786, "rewards/margins": 3.661762058734894, "rewards/rejected": -2.8633780479431152, "step": 18186 }, { "epoch": 0.9639837807754486, "grad_norm": 31.5, "kl": 2.718170166015625, "learning_rate": 5e-07, "logits/chosen": -10043369.0, "logits/rejected": -46892288.0, "logps/chosen": -160.86187744140625, "logps/rejected": -446.0360412597656, "loss": 0.2375, "rewards/chosen": 0.6223698854446411, "rewards/margins": 3.38249671459198, "rewards/rejected": -2.760126829147339, "step": 18187 }, { "epoch": 0.9640367847772506, "grad_norm": 84.5, "kl": 0.17082595825195312, "learning_rate": 5e-07, "logits/chosen": -45716853.333333336, "logits/rejected": 94464531.2, "logps/chosen": -315.21811930338544, "logps/rejected": -595.344580078125, "loss": 0.2012, "rewards/chosen": 1.0273951689402263, "rewards/margins": 4.0380105177561445, "rewards/rejected": -3.010615348815918, "step": 18188 }, { "epoch": 0.9640897887790528, "grad_norm": 32.5, "kl": 0.3508491516113281, "learning_rate": 5e-07, "logits/chosen": -34074176.0, "logits/rejected": -25798472.0, "logps/chosen": -2190.9912109375, "logps/rejected": -333.27305094401044, "loss": 0.1688, "rewards/chosen": 2.7460360527038574, "rewards/margins": 5.075060049692789, "rewards/rejected": -2.329023996988932, "step": 18189 }, { "epoch": 0.9641427927808549, "grad_norm": 24.75, "kl": 2.280959129333496, "learning_rate": 5e-07, "logits/chosen": -613295.75, "logits/rejected": -27859161.6, "logps/chosen": -34.880411783854164, "logps/rejected": -464.381640625, "loss": 0.2373, "rewards/chosen": -0.025609274705251057, "rewards/margins": 3.5885955135027565, "rewards/rejected": -3.6142047882080077, "step": 18190 }, { "epoch": 0.9641957967826571, "grad_norm": 46.0, "kl": 1.4770736694335938, "learning_rate": 5e-07, "logits/chosen": -26824848.0, "logits/rejected": 2643451.8, "logps/chosen": -418.8412679036458, "logps/rejected": -111.15205078125, "loss": 0.2304, "rewards/chosen": 1.0812093416849773, "rewards/margins": 2.837927786509196, "rewards/rejected": -1.7567184448242188, "step": 18191 }, { "epoch": 0.9642488007844592, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56922880.0, "logits/rejected": -15161149.333333334, "logps/chosen": -62.54869079589844, "logps/rejected": -370.0173746744792, "loss": 0.2707, "rewards/chosen": -0.4075496792793274, "rewards/margins": 2.1142672101656594, "rewards/rejected": -2.521816889444987, "step": 18192 }, { "epoch": 0.9643018047862614, "grad_norm": 47.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6717945.333333333, "logits/rejected": -72350476.8, "logps/chosen": -438.818359375, "logps/rejected": -715.02705078125, "loss": 0.1506, "rewards/chosen": 1.4827917416890461, "rewards/margins": 6.489238198598225, "rewards/rejected": -5.006446456909179, "step": 18193 }, { "epoch": 0.9643548087880635, "grad_norm": 83.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 152914265.6, "logits/rejected": -3295569.3333333335, "logps/chosen": -559.4921875, "logps/rejected": -78.5049336751302, "loss": 0.2911, "rewards/chosen": 0.35651273727416993, "rewards/margins": 3.585244210561117, "rewards/rejected": -3.2287314732869468, "step": 18194 }, { "epoch": 0.9644078127898656, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 54091272.0, "logits/rejected": -38938410.666666664, "logps/chosen": -525.577392578125, "logps/rejected": -417.8474934895833, "loss": 0.0923, "rewards/chosen": 1.6639831066131592, "rewards/margins": 5.231686035792032, "rewards/rejected": -3.5677029291788735, "step": 18195 }, { "epoch": 0.9644608167916677, "grad_norm": 42.25, "kl": 1.4844818115234375, "learning_rate": 5e-07, "logits/chosen": -40460464.0, "logits/rejected": -31907344.0, "logps/chosen": -378.93951416015625, "logps/rejected": -195.07627868652344, "loss": 0.2495, "rewards/chosen": 0.49394509196281433, "rewards/margins": 3.093083828687668, "rewards/rejected": -2.5991387367248535, "step": 18196 }, { "epoch": 0.9645138207934699, "grad_norm": 40.25, "kl": 1.7329864501953125, "learning_rate": 5e-07, "logits/chosen": -2052269.0, "logits/rejected": -13495914.0, "logps/chosen": -297.41290283203125, "logps/rejected": -338.0319519042969, "loss": 0.2842, "rewards/chosen": 0.16209453344345093, "rewards/margins": 3.4353067278862, "rewards/rejected": -3.273212194442749, "step": 18197 }, { "epoch": 0.964566824795272, "grad_norm": 42.5, "kl": 3.697813034057617, "learning_rate": 5e-07, "logits/chosen": -33899024.0, "logits/rejected": -27118954.666666668, "logps/chosen": -377.3521484375, "logps/rejected": -437.6756998697917, "loss": 0.35, "rewards/chosen": 0.7712474822998047, "rewards/margins": 3.170178508758545, "rewards/rejected": -2.3989310264587402, "step": 18198 }, { "epoch": 0.9646198287970742, "grad_norm": 28.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -1144561.25, "logits/rejected": -50529146.666666664, "logps/chosen": -131.1112518310547, "logps/rejected": -455.3679606119792, "loss": 0.1287, "rewards/chosen": 1.1589670181274414, "rewards/margins": 4.290574709574381, "rewards/rejected": -3.13160769144694, "step": 18199 }, { "epoch": 0.9646728327988763, "grad_norm": 57.75, "kl": 3.1197261810302734, "learning_rate": 5e-07, "logits/chosen": 62868240.0, "logits/rejected": -65653992.0, "logps/chosen": -493.0503336588542, "logps/rejected": -419.72918701171875, "loss": 0.3337, "rewards/chosen": 0.7926557064056396, "rewards/margins": 3.3439085483551025, "rewards/rejected": -2.551252841949463, "step": 18200 }, { "epoch": 0.9647258368006785, "grad_norm": 41.5, "kl": 0.22924423217773438, "learning_rate": 5e-07, "logits/chosen": -23385382.0, "logits/rejected": -19324950.0, "logps/chosen": -312.91070556640625, "logps/rejected": -252.97850036621094, "loss": 0.1685, "rewards/chosen": 0.9578900933265686, "rewards/margins": 4.743594706058502, "rewards/rejected": -3.7857046127319336, "step": 18201 }, { "epoch": 0.9647788408024806, "grad_norm": 62.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12196724.0, "logits/rejected": -18363024.0, "logps/chosen": -290.1777750651042, "logps/rejected": -614.059326171875, "loss": 0.3131, "rewards/chosen": 0.08310241500536601, "rewards/margins": 2.873628614346186, "rewards/rejected": -2.79052619934082, "step": 18202 }, { "epoch": 0.9648318448042827, "grad_norm": 39.5, "kl": 1.1939468383789062, "learning_rate": 5e-07, "logits/chosen": -52165075.2, "logits/rejected": -22755789.333333332, "logps/chosen": -297.657080078125, "logps/rejected": -441.1220703125, "loss": 0.2851, "rewards/chosen": 0.45616731643676756, "rewards/margins": 5.97231175104777, "rewards/rejected": -5.516144434611003, "step": 18203 }, { "epoch": 0.9648848488060848, "grad_norm": 29.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63555941.333333336, "logits/rejected": -33295433.6, "logps/chosen": -543.2670491536459, "logps/rejected": -197.1919921875, "loss": 0.1889, "rewards/chosen": 1.091671387354533, "rewards/margins": 3.4925979773203535, "rewards/rejected": -2.4009265899658203, "step": 18204 }, { "epoch": 0.964937852807887, "grad_norm": 44.5, "kl": 0.3375396728515625, "learning_rate": 5e-07, "logits/chosen": -10414150.4, "logits/rejected": -4050410.6666666665, "logps/chosen": -147.8670654296875, "logps/rejected": -395.9767252604167, "loss": 0.2482, "rewards/chosen": 0.8197127342224121, "rewards/margins": 3.8014676729838053, "rewards/rejected": -2.981754938761393, "step": 18205 }, { "epoch": 0.9649908568096891, "grad_norm": 53.25, "kl": 1.2774658203125, "learning_rate": 5e-07, "logits/chosen": -10105774.0, "logits/rejected": -25775940.0, "logps/chosen": -272.3161926269531, "logps/rejected": -347.94873046875, "loss": 0.3097, "rewards/chosen": 0.11805162578821182, "rewards/margins": 2.3821500316262245, "rewards/rejected": -2.2640984058380127, "step": 18206 }, { "epoch": 0.9650438608114913, "grad_norm": 39.25, "kl": 2.9573326110839844, "learning_rate": 5e-07, "logits/chosen": -67196997.33333333, "logits/rejected": -66107270.4, "logps/chosen": -190.4286905924479, "logps/rejected": -142.38927001953124, "loss": 0.3131, "rewards/chosen": 0.7347363630930582, "rewards/margins": 2.329666248957316, "rewards/rejected": -1.5949298858642578, "step": 18207 }, { "epoch": 0.9650968648132934, "grad_norm": 52.5, "kl": 3.847888946533203, "learning_rate": 5e-07, "logits/chosen": -3268692.0, "logits/rejected": -37882128.0, "logps/chosen": -300.0047200520833, "logps/rejected": -259.4694091796875, "loss": 0.4061, "rewards/chosen": 0.30866458018620807, "rewards/margins": 1.5415091236432392, "rewards/rejected": -1.2328445434570312, "step": 18208 }, { "epoch": 0.9651498688150956, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10519653.333333334, "logits/rejected": -13679897.6, "logps/chosen": -217.39591471354166, "logps/rejected": -278.3689208984375, "loss": 0.2269, "rewards/chosen": 0.8459862073262533, "rewards/margins": 3.4980942090352376, "rewards/rejected": -2.652108001708984, "step": 18209 }, { "epoch": 0.9652028728168977, "grad_norm": 82.0, "kl": 1.7216720581054688, "learning_rate": 5e-07, "logits/chosen": 6201934.8, "logits/rejected": -57436784.0, "logps/chosen": -77.97517700195313, "logps/rejected": -477.1104736328125, "loss": 0.2569, "rewards/chosen": 0.7340104103088378, "rewards/margins": 4.689632765452067, "rewards/rejected": -3.955622355143229, "step": 18210 }, { "epoch": 0.9652558768186998, "grad_norm": 44.25, "kl": 4.931995391845703, "learning_rate": 5e-07, "logits/chosen": -19507672.0, "logits/rejected": -36170952.0, "logps/chosen": -290.0135091145833, "logps/rejected": -522.0731201171875, "loss": 0.2347, "rewards/chosen": 1.6143248875935872, "rewards/margins": 3.4503681262334185, "rewards/rejected": -1.8360432386398315, "step": 18211 }, { "epoch": 0.9653088808205019, "grad_norm": 40.0, "kl": 1.7591018676757812, "learning_rate": 5e-07, "logits/chosen": -44445580.0, "logits/rejected": -41945936.0, "logps/chosen": -382.66558837890625, "logps/rejected": -245.3144734700521, "loss": 0.1315, "rewards/chosen": 1.7840492725372314, "rewards/margins": 3.6560604572296143, "rewards/rejected": -1.8720111846923828, "step": 18212 }, { "epoch": 0.9653618848223041, "grad_norm": 29.5, "kl": 0.5040016174316406, "learning_rate": 5e-07, "logits/chosen": -104416181.33333333, "logits/rejected": -33312198.4, "logps/chosen": -175.78605143229166, "logps/rejected": -375.978955078125, "loss": 0.1536, "rewards/chosen": 0.721741517384847, "rewards/margins": 4.759867127736409, "rewards/rejected": -4.038125610351562, "step": 18213 }, { "epoch": 0.9654148888241062, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 781958.0625, "logits/rejected": -13505542.857142856, "logps/chosen": -512.5760498046875, "logps/rejected": -443.26876395089283, "loss": 0.0934, "rewards/chosen": 1.599542260169983, "rewards/margins": 4.572357092584882, "rewards/rejected": -2.9728148324148997, "step": 18214 }, { "epoch": 0.9654678928259084, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6733876.8, "logits/rejected": 206520853.33333334, "logps/chosen": -245.018994140625, "logps/rejected": -176.28951009114584, "loss": 0.2296, "rewards/chosen": 0.7674023628234863, "rewards/margins": 4.322864882151285, "rewards/rejected": -3.5554625193277993, "step": 18215 }, { "epoch": 0.9655208968277105, "grad_norm": 344.0, "kl": 3.7576522827148438, "learning_rate": 5e-07, "logits/chosen": -58877204.0, "logits/rejected": -9965353.0, "logps/chosen": -439.431640625, "logps/rejected": -283.4285888671875, "loss": 0.2999, "rewards/chosen": 0.6032810807228088, "rewards/margins": 2.7008283734321594, "rewards/rejected": -2.0975472927093506, "step": 18216 }, { "epoch": 0.9655739008295127, "grad_norm": 46.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23806904.0, "logits/rejected": -55161984.0, "logps/chosen": -359.332763671875, "logps/rejected": -554.3035888671875, "loss": 0.2451, "rewards/chosen": -0.7426376342773438, "rewards/margins": 1.756333033243815, "rewards/rejected": -2.4989706675211587, "step": 18217 }, { "epoch": 0.9656269048313147, "grad_norm": 40.0, "kl": 2.4877548217773438, "learning_rate": 5e-07, "logits/chosen": -14736330.0, "logits/rejected": -45688944.0, "logps/chosen": -255.0546417236328, "logps/rejected": -474.70684814453125, "loss": 0.2964, "rewards/chosen": 0.3737149238586426, "rewards/margins": 3.3698346614837646, "rewards/rejected": -2.996119737625122, "step": 18218 }, { "epoch": 0.9656799088331169, "grad_norm": 33.75, "kl": 0.46262359619140625, "learning_rate": 5e-07, "logits/chosen": -10299084.0, "logits/rejected": -49942793.14285714, "logps/chosen": -434.35186767578125, "logps/rejected": -273.55125209263394, "loss": 0.1068, "rewards/chosen": 1.1552002429962158, "rewards/margins": 3.734101806368147, "rewards/rejected": -2.578901563371931, "step": 18219 }, { "epoch": 0.965732912834919, "grad_norm": 43.0, "kl": 0.5062885284423828, "learning_rate": 5e-07, "logits/chosen": 5722784.0, "logits/rejected": -18151480.0, "logps/chosen": -122.22601318359375, "logps/rejected": -318.6004638671875, "loss": 0.2111, "rewards/chosen": 0.5759425163269043, "rewards/margins": 3.082430362701416, "rewards/rejected": -2.5064878463745117, "step": 18220 }, { "epoch": 0.9657859168367212, "grad_norm": 47.25, "kl": 0.6364650726318359, "learning_rate": 5e-07, "logits/chosen": -27309386.0, "logits/rejected": -33841892.0, "logps/chosen": -215.62091064453125, "logps/rejected": -187.95458984375, "loss": 0.3432, "rewards/chosen": 0.11823802441358566, "rewards/margins": 2.397812895476818, "rewards/rejected": -2.2795748710632324, "step": 18221 }, { "epoch": 0.9658389208385233, "grad_norm": 26.875, "kl": 5.5944061279296875, "learning_rate": 5e-07, "logits/chosen": -17793357.333333332, "logits/rejected": -20398678.4, "logps/chosen": -514.6737467447916, "logps/rejected": -201.308740234375, "loss": 0.2164, "rewards/chosen": 1.20337971051534, "rewards/margins": 3.5803040345509842, "rewards/rejected": -2.3769243240356444, "step": 18222 }, { "epoch": 0.9658919248403255, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29908906.666666668, "logits/rejected": -33791977.6, "logps/chosen": -257.438232421875, "logps/rejected": -466.931787109375, "loss": 0.2846, "rewards/chosen": -0.27811012665430707, "rewards/margins": 2.053414913018545, "rewards/rejected": -2.3315250396728517, "step": 18223 }, { "epoch": 0.9659449288421276, "grad_norm": 55.75, "kl": 3.104747772216797, "learning_rate": 5e-07, "logits/chosen": -12456284.0, "logits/rejected": -6306461.333333333, "logps/chosen": -339.699072265625, "logps/rejected": -142.63361612955728, "loss": 0.2347, "rewards/chosen": 1.3808507919311523, "rewards/margins": 5.2169300715128575, "rewards/rejected": -3.8360792795817056, "step": 18224 }, { "epoch": 0.9659979328439298, "grad_norm": 43.25, "kl": 0.05922222137451172, "learning_rate": 5e-07, "logits/chosen": -14620048.0, "logits/rejected": -31166637.714285713, "logps/chosen": -1328.27490234375, "logps/rejected": -189.9521484375, "loss": 0.2084, "rewards/chosen": 2.989331007003784, "rewards/margins": 5.020735229764666, "rewards/rejected": -2.0314042227608815, "step": 18225 }, { "epoch": 0.9660509368457318, "grad_norm": 44.5, "kl": 4.859661102294922, "learning_rate": 5e-07, "logits/chosen": 18183832.0, "logits/rejected": -35968280.0, "logps/chosen": -177.4909912109375, "logps/rejected": -424.4722493489583, "loss": 0.2718, "rewards/chosen": 1.052457332611084, "rewards/margins": 4.105840841929117, "rewards/rejected": -3.0533835093180337, "step": 18226 }, { "epoch": 0.966103940847534, "grad_norm": 54.0, "kl": 1.0219993591308594, "learning_rate": 5e-07, "logits/chosen": -4038777.0, "logits/rejected": -16394302.0, "logps/chosen": -313.826904296875, "logps/rejected": -384.114990234375, "loss": 0.2208, "rewards/chosen": 0.7088302373886108, "rewards/margins": 4.129393935203552, "rewards/rejected": -3.4205636978149414, "step": 18227 }, { "epoch": 0.9661569448493361, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44879328.0, "logits/rejected": -52518634.666666664, "logps/chosen": -233.49758911132812, "logps/rejected": -231.8064168294271, "loss": 0.2023, "rewards/chosen": 0.3412818908691406, "rewards/margins": 2.931728998819987, "rewards/rejected": -2.590447107950846, "step": 18228 }, { "epoch": 0.9662099488511383, "grad_norm": 43.0, "kl": 1.499018669128418, "learning_rate": 5e-07, "logits/chosen": -13771984.0, "logits/rejected": -12334374.666666666, "logps/chosen": -151.9374755859375, "logps/rejected": -145.9124959309896, "loss": 0.3063, "rewards/chosen": 0.3141519546508789, "rewards/margins": 3.527377955118815, "rewards/rejected": -3.213226000467936, "step": 18229 }, { "epoch": 0.9662629528529404, "grad_norm": 57.0, "kl": 2.1666688919067383, "learning_rate": 5e-07, "logits/chosen": -47422480.0, "logits/rejected": -7363873.0, "logps/chosen": -147.35458374023438, "logps/rejected": -151.61917114257812, "loss": 0.2959, "rewards/chosen": 0.8655455112457275, "rewards/margins": 1.9927490949630737, "rewards/rejected": -1.1272035837173462, "step": 18230 }, { "epoch": 0.9663159568547426, "grad_norm": 59.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3987682.8, "logits/rejected": -82057664.0, "logps/chosen": -311.54990234375, "logps/rejected": -506.184814453125, "loss": 0.252, "rewards/chosen": 0.6216064453125, "rewards/margins": 4.639630889892578, "rewards/rejected": -4.018024444580078, "step": 18231 }, { "epoch": 0.9663689608565447, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41540725.333333336, "logits/rejected": -30631622.4, "logps/chosen": -292.1191813151042, "logps/rejected": -281.051806640625, "loss": 0.2102, "rewards/chosen": 0.7454610665639242, "rewards/margins": 3.5351548989613852, "rewards/rejected": -2.789693832397461, "step": 18232 }, { "epoch": 0.9664219648583469, "grad_norm": 45.0, "kl": 0.9246978759765625, "learning_rate": 5e-07, "logits/chosen": -19327404.0, "logits/rejected": -46954698.666666664, "logps/chosen": -100.57034301757812, "logps/rejected": -419.70849609375, "loss": 0.2347, "rewards/chosen": -0.021999835968017578, "rewards/margins": 2.696444352467855, "rewards/rejected": -2.7184441884358725, "step": 18233 }, { "epoch": 0.9664749688601489, "grad_norm": 63.5, "kl": 0.2466869354248047, "learning_rate": 5e-07, "logits/chosen": -33377476.0, "logits/rejected": -59441704.0, "logps/chosen": -214.17637634277344, "logps/rejected": -387.5050354003906, "loss": 0.2565, "rewards/chosen": 0.6483930349349976, "rewards/margins": 2.7849448919296265, "rewards/rejected": -2.136551856994629, "step": 18234 }, { "epoch": 0.966527972861951, "grad_norm": 24.25, "kl": 3.989471435546875, "learning_rate": 5e-07, "logits/chosen": 248265.4375, "logits/rejected": -19289460.0, "logps/chosen": -119.14004516601562, "logps/rejected": -389.712646484375, "loss": 0.1432, "rewards/chosen": 1.4207086563110352, "rewards/margins": 4.332428296407064, "rewards/rejected": -2.911719640096029, "step": 18235 }, { "epoch": 0.9665809768637532, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16988817.333333332, "logits/rejected": -37806147.2, "logps/chosen": -328.8883056640625, "logps/rejected": -286.518798828125, "loss": 0.293, "rewards/chosen": -0.03362592061360677, "rewards/margins": 1.8553003946940105, "rewards/rejected": -1.8889263153076172, "step": 18236 }, { "epoch": 0.9666339808655553, "grad_norm": 28.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2182946.0, "logits/rejected": -39963482.666666664, "logps/chosen": -69.32850646972656, "logps/rejected": -300.9390055338542, "loss": 0.1829, "rewards/chosen": 0.21269941329956055, "rewards/margins": 2.642449378967285, "rewards/rejected": -2.4297499656677246, "step": 18237 }, { "epoch": 0.9666869848673575, "grad_norm": 49.0, "kl": 0.5777854919433594, "learning_rate": 5e-07, "logits/chosen": -3091218.75, "logits/rejected": -8586805.0, "logps/chosen": -362.7041015625, "logps/rejected": -136.201171875, "loss": 0.2287, "rewards/chosen": 0.6073943972587585, "rewards/margins": 3.3996207118034363, "rewards/rejected": -2.7922263145446777, "step": 18238 }, { "epoch": 0.9667399888691596, "grad_norm": 51.25, "kl": 1.3985652923583984, "learning_rate": 5e-07, "logits/chosen": -14267741.0, "logits/rejected": -7466739.0, "logps/chosen": -264.40972900390625, "logps/rejected": -90.314453125, "loss": 0.4428, "rewards/chosen": -0.3777063488960266, "rewards/margins": 1.0491400361061096, "rewards/rejected": -1.4268463850021362, "step": 18239 }, { "epoch": 0.9667929928709618, "grad_norm": 48.75, "kl": 1.631561279296875, "learning_rate": 5e-07, "logits/chosen": -5620162.666666667, "logits/rejected": -14442475.2, "logps/chosen": -1013.2132975260416, "logps/rejected": -242.437353515625, "loss": 0.2184, "rewards/chosen": 1.3469533920288086, "rewards/margins": 3.9737796783447266, "rewards/rejected": -2.626826286315918, "step": 18240 }, { "epoch": 0.9668459968727638, "grad_norm": 57.75, "kl": 0.24564552307128906, "learning_rate": 5e-07, "logits/chosen": -10215528.0, "logits/rejected": 69480794.66666667, "logps/chosen": -192.28145751953124, "logps/rejected": -652.3343912760416, "loss": 0.2851, "rewards/chosen": 0.5468443870544434, "rewards/margins": 2.9480682055155434, "rewards/rejected": -2.4012238184611, "step": 18241 }, { "epoch": 0.966899000874566, "grad_norm": 53.0, "kl": 2.750112533569336, "learning_rate": 5e-07, "logits/chosen": -11688080.0, "logits/rejected": -2843180.8, "logps/chosen": -369.0708821614583, "logps/rejected": -256.27978515625, "loss": 0.2619, "rewards/chosen": 1.2190468311309814, "rewards/margins": 3.2958475589752196, "rewards/rejected": -2.076800727844238, "step": 18242 }, { "epoch": 0.9669520048763681, "grad_norm": 58.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 6741310.666666667, "logits/rejected": -16420460.8, "logps/chosen": -661.204833984375, "logps/rejected": -266.987548828125, "loss": 0.1728, "rewards/chosen": 1.5344492594401042, "rewards/margins": 3.7651276270548504, "rewards/rejected": -2.230678367614746, "step": 18243 }, { "epoch": 0.9670050088781703, "grad_norm": 23.375, "kl": 0.660919189453125, "learning_rate": 5e-07, "logits/chosen": -56359460.0, "logits/rejected": -33868618.666666664, "logps/chosen": -204.7092742919922, "logps/rejected": -353.7143961588542, "loss": 0.0674, "rewards/chosen": 1.8008344173431396, "rewards/margins": 5.178279956181845, "rewards/rejected": -3.3774455388387046, "step": 18244 }, { "epoch": 0.9670580128799724, "grad_norm": 40.25, "kl": 0.6925497055053711, "learning_rate": 5e-07, "logits/chosen": -22232268.0, "logits/rejected": 8386155.0, "logps/chosen": -136.4094034830729, "logps/rejected": -93.80640411376953, "loss": 0.4019, "rewards/chosen": 0.02065548300743103, "rewards/margins": 3.102808803319931, "rewards/rejected": -3.0821533203125, "step": 18245 }, { "epoch": 0.9671110168817746, "grad_norm": 53.25, "kl": 1.4380359649658203, "learning_rate": 5e-07, "logits/chosen": -48180133.333333336, "logits/rejected": -20913652.8, "logps/chosen": -371.9198404947917, "logps/rejected": -344.77568359375, "loss": 0.2648, "rewards/chosen": -0.09871445099512736, "rewards/margins": 3.0613162954648336, "rewards/rejected": -3.160030746459961, "step": 18246 }, { "epoch": 0.9671640208835767, "grad_norm": 43.25, "kl": 0.6479740142822266, "learning_rate": 5e-07, "logits/chosen": -42033340.0, "logits/rejected": -70111192.0, "logps/chosen": -473.18524169921875, "logps/rejected": -266.5098571777344, "loss": 0.2041, "rewards/chosen": 1.0947487354278564, "rewards/margins": 3.7034144401550293, "rewards/rejected": -2.608665704727173, "step": 18247 }, { "epoch": 0.9672170248853789, "grad_norm": 48.75, "kl": 6.004408836364746, "learning_rate": 5e-07, "logits/chosen": -20554100.0, "logits/rejected": -39261616.0, "logps/chosen": -104.46344757080078, "logps/rejected": -448.53045654296875, "loss": 0.3002, "rewards/chosen": 0.9272030591964722, "rewards/margins": 4.933714270591736, "rewards/rejected": -4.006511211395264, "step": 18248 }, { "epoch": 0.9672700288871809, "grad_norm": 39.75, "kl": 2.3103771209716797, "learning_rate": 5e-07, "logits/chosen": -1119667.25, "logits/rejected": -63284376.0, "logps/chosen": -154.26334635416666, "logps/rejected": -638.3281860351562, "loss": 0.31, "rewards/chosen": 0.8277774651845297, "rewards/margins": 4.59535272916158, "rewards/rejected": -3.767575263977051, "step": 18249 }, { "epoch": 0.9673230328889831, "grad_norm": 30.5, "kl": 1.0054397583007812, "learning_rate": 5e-07, "logits/chosen": 5033396.333333333, "logits/rejected": -19873304.0, "logps/chosen": -79.68821716308594, "logps/rejected": -111.93118896484376, "loss": 0.2736, "rewards/chosen": 0.8580602804819742, "rewards/margins": 2.528153435389201, "rewards/rejected": -1.6700931549072267, "step": 18250 }, { "epoch": 0.9673760368907852, "grad_norm": 54.25, "kl": 3.082386016845703, "learning_rate": 5e-07, "logits/chosen": -14998230.666666666, "logits/rejected": -1668111.375, "logps/chosen": -280.404541015625, "logps/rejected": -132.81631469726562, "loss": 0.4472, "rewards/chosen": 0.3933591842651367, "rewards/margins": 1.428873896598816, "rewards/rejected": -1.0355147123336792, "step": 18251 }, { "epoch": 0.9674290408925874, "grad_norm": 41.5, "kl": 0.3567180633544922, "learning_rate": 5e-07, "logits/chosen": -1332341.0, "logits/rejected": -16576694.4, "logps/chosen": -330.60439046223956, "logps/rejected": -218.5342529296875, "loss": 0.1752, "rewards/chosen": 1.1623207728068035, "rewards/margins": 4.044254557291667, "rewards/rejected": -2.8819337844848634, "step": 18252 }, { "epoch": 0.9674820448943895, "grad_norm": 39.25, "kl": 3.2002296447753906, "learning_rate": 5e-07, "logits/chosen": -6479607.5, "logits/rejected": -36042600.0, "logps/chosen": -221.7173614501953, "logps/rejected": -357.1275634765625, "loss": 0.3087, "rewards/chosen": 0.8525578379631042, "rewards/margins": 2.6511053442955017, "rewards/rejected": -1.7985475063323975, "step": 18253 }, { "epoch": 0.9675350488961917, "grad_norm": 67.0, "kl": 3.4642629623413086, "learning_rate": 5e-07, "logits/chosen": -10470031.2, "logits/rejected": -24927376.0, "logps/chosen": -584.47841796875, "logps/rejected": -314.2622884114583, "loss": 0.3464, "rewards/chosen": 0.7932812690734863, "rewards/margins": 2.423894913991292, "rewards/rejected": -1.630613644917806, "step": 18254 }, { "epoch": 0.9675880528979938, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 3666109.3333333335, "logits/rejected": -18670889.6, "logps/chosen": -232.9538370768229, "logps/rejected": -382.672119140625, "loss": 0.121, "rewards/chosen": 1.658447265625, "rewards/margins": 5.555757904052735, "rewards/rejected": -3.8973106384277343, "step": 18255 }, { "epoch": 0.967641056899796, "grad_norm": 30.5, "kl": 0.8918571472167969, "learning_rate": 5e-07, "logits/chosen": -8514476.0, "logits/rejected": -28310470.4, "logps/chosen": -330.57704671223956, "logps/rejected": -408.2493896484375, "loss": 0.1482, "rewards/chosen": 1.3886953989664714, "rewards/margins": 4.295853487650554, "rewards/rejected": -2.907158088684082, "step": 18256 }, { "epoch": 0.967694060901598, "grad_norm": 57.5, "kl": 0.15597915649414062, "learning_rate": 5e-07, "logits/chosen": -24740780.0, "logits/rejected": -22329406.0, "logps/chosen": -592.5476684570312, "logps/rejected": -263.4937744140625, "loss": 0.1408, "rewards/chosen": 1.3164746761322021, "rewards/margins": 4.879209280014038, "rewards/rejected": -3.562734603881836, "step": 18257 }, { "epoch": 0.9677470649034002, "grad_norm": 78.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35550220.0, "logits/rejected": -118729720.0, "logps/chosen": -462.10498046875, "logps/rejected": -209.42091369628906, "loss": 0.2517, "rewards/chosen": 0.3268982172012329, "rewards/margins": 3.0541244745254517, "rewards/rejected": -2.7272262573242188, "step": 18258 }, { "epoch": 0.9678000689052023, "grad_norm": 54.0, "kl": 5.674081802368164, "learning_rate": 5e-07, "logits/chosen": -13081762.666666666, "logits/rejected": -30558144.0, "logps/chosen": -368.1641845703125, "logps/rejected": -241.8972625732422, "loss": 0.2922, "rewards/chosen": 1.246432622273763, "rewards/margins": 3.966507275899251, "rewards/rejected": -2.7200746536254883, "step": 18259 }, { "epoch": 0.9678530729070045, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20200268.0, "logits/rejected": -20584062.0, "logps/chosen": -278.1958312988281, "logps/rejected": -285.22698974609375, "loss": 0.2865, "rewards/chosen": 0.12484227120876312, "rewards/margins": 2.623752936720848, "rewards/rejected": -2.498910665512085, "step": 18260 }, { "epoch": 0.9679060769088066, "grad_norm": 45.5, "kl": 2.287017822265625, "learning_rate": 5e-07, "logits/chosen": 23721864.0, "logits/rejected": -40214716.8, "logps/chosen": -519.7421875, "logps/rejected": -556.061279296875, "loss": 0.2002, "rewards/chosen": 0.8832132816314697, "rewards/margins": 3.5661760807037353, "rewards/rejected": -2.6829627990722655, "step": 18261 }, { "epoch": 0.9679590809106088, "grad_norm": 49.75, "kl": 1.2557578086853027, "learning_rate": 5e-07, "logits/chosen": -10603386.0, "logits/rejected": -36712420.0, "logps/chosen": -160.87529500325522, "logps/rejected": -422.8465881347656, "loss": 0.3793, "rewards/chosen": 0.33598875999450684, "rewards/margins": 2.7887003421783447, "rewards/rejected": -2.452711582183838, "step": 18262 }, { "epoch": 0.9680120849124109, "grad_norm": 50.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -74003590.4, "logits/rejected": -76752752.0, "logps/chosen": -201.6644287109375, "logps/rejected": -454.2489827473958, "loss": 0.3246, "rewards/chosen": -0.0037813574075698853, "rewards/margins": 4.168719570835431, "rewards/rejected": -4.172500928243001, "step": 18263 }, { "epoch": 0.9680650889142131, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9803292.0, "logits/rejected": -53024137.14285714, "logps/chosen": -5.1157379150390625, "logps/rejected": -352.5147181919643, "loss": 0.1273, "rewards/chosen": 0.002503013703972101, "rewards/margins": 2.7996908188797534, "rewards/rejected": -2.7971878051757812, "step": 18264 }, { "epoch": 0.9681180929160151, "grad_norm": 39.5, "kl": 3.309328079223633, "learning_rate": 5e-07, "logits/chosen": 6044682.0, "logits/rejected": -21647904.0, "logps/chosen": -18.478810628255207, "logps/rejected": -373.388037109375, "loss": 0.2277, "rewards/chosen": 1.8035909334818523, "rewards/margins": 4.246084181467692, "rewards/rejected": -2.4424932479858397, "step": 18265 }, { "epoch": 0.9681710969178173, "grad_norm": 47.5, "kl": 1.7313957214355469, "learning_rate": 5e-07, "logits/chosen": -20414294.85714286, "logits/rejected": 102912248.0, "logps/chosen": -425.95633370535717, "logps/rejected": -143.6498260498047, "loss": 0.4036, "rewards/chosen": 0.5991642815726144, "rewards/margins": 2.5942601306097846, "rewards/rejected": -1.9950958490371704, "step": 18266 }, { "epoch": 0.9682241009196194, "grad_norm": 42.75, "kl": 1.7339973449707031, "learning_rate": 5e-07, "logits/chosen": 1885117.3333333333, "logits/rejected": 159161356.8, "logps/chosen": -324.4947509765625, "logps/rejected": -353.1845947265625, "loss": 0.2248, "rewards/chosen": 0.6502904097239176, "rewards/margins": 3.5530574003855384, "rewards/rejected": -2.902766990661621, "step": 18267 }, { "epoch": 0.9682771049214216, "grad_norm": 59.75, "kl": 3.5176610946655273, "learning_rate": 5e-07, "logits/chosen": -23059102.0, "logits/rejected": -25517954.0, "logps/chosen": -256.8604736328125, "logps/rejected": -453.65936279296875, "loss": 0.2876, "rewards/chosen": 0.7227070331573486, "rewards/margins": 3.738710403442383, "rewards/rejected": -3.016003370285034, "step": 18268 }, { "epoch": 0.9683301089232237, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -23925604.8, "logits/rejected": -33295040.0, "logps/chosen": -454.53544921875, "logps/rejected": -369.0035400390625, "loss": 0.2453, "rewards/chosen": 1.0855059623718262, "rewards/margins": 3.5037922859191895, "rewards/rejected": -2.4182863235473633, "step": 18269 }, { "epoch": 0.9683831129250259, "grad_norm": 50.0, "kl": 0.21622848510742188, "learning_rate": 5e-07, "logits/chosen": -94373446.4, "logits/rejected": -22759888.0, "logps/chosen": -295.7728271484375, "logps/rejected": -461.714111328125, "loss": 0.2969, "rewards/chosen": 0.2580728054046631, "rewards/margins": 3.5431998093922936, "rewards/rejected": -3.2851270039876304, "step": 18270 }, { "epoch": 0.968436116926828, "grad_norm": 50.75, "kl": 1.9487571716308594, "learning_rate": 5e-07, "logits/chosen": -34190312.0, "logits/rejected": -55627144.0, "logps/chosen": -373.1940002441406, "logps/rejected": -398.2544250488281, "loss": 0.226, "rewards/chosen": 0.6758044958114624, "rewards/margins": 3.8029919862747192, "rewards/rejected": -3.127187490463257, "step": 18271 }, { "epoch": 0.9684891209286302, "grad_norm": 38.25, "kl": 2.6677780151367188, "learning_rate": 5e-07, "logits/chosen": -50182896.0, "logits/rejected": -9754956.8, "logps/chosen": -503.2413736979167, "logps/rejected": -356.913232421875, "loss": 0.1348, "rewards/chosen": 1.6803741455078125, "rewards/margins": 4.896662902832031, "rewards/rejected": -3.216288757324219, "step": 18272 }, { "epoch": 0.9685421249304322, "grad_norm": 48.0, "kl": 4.160306930541992, "learning_rate": 5e-07, "logits/chosen": -32232896.0, "logits/rejected": -17892676.0, "logps/chosen": -250.028173828125, "logps/rejected": -323.7875162760417, "loss": 0.3876, "rewards/chosen": 0.3128990411758423, "rewards/margins": 2.487668442726135, "rewards/rejected": -2.174769401550293, "step": 18273 }, { "epoch": 0.9685951289322344, "grad_norm": 40.0, "kl": 0.6091785430908203, "learning_rate": 5e-07, "logits/chosen": 7315705.5, "logits/rejected": -12819248.0, "logps/chosen": -264.7376403808594, "logps/rejected": -243.6902872721354, "loss": 0.2162, "rewards/chosen": -0.4156654477119446, "rewards/margins": 2.6732862989107766, "rewards/rejected": -3.088951746622721, "step": 18274 }, { "epoch": 0.9686481329340365, "grad_norm": 58.75, "kl": 11.019617080688477, "learning_rate": 5e-07, "logits/chosen": -8798090.857142856, "logits/rejected": -37927788.0, "logps/chosen": -725.4730747767857, "logps/rejected": -131.18731689453125, "loss": 0.2515, "rewards/chosen": 2.6790063040597096, "rewards/margins": 6.312558855329241, "rewards/rejected": -3.6335525512695312, "step": 18275 }, { "epoch": 0.9687011369358387, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5545186.666666667, "logits/rejected": 125563033.6, "logps/chosen": -237.53743489583334, "logps/rejected": -381.8794677734375, "loss": 0.3299, "rewards/chosen": -0.26264293988545734, "rewards/margins": 1.5196982542673747, "rewards/rejected": -1.782341194152832, "step": 18276 }, { "epoch": 0.9687541409376408, "grad_norm": 57.0, "kl": 1.7854080200195312, "learning_rate": 5e-07, "logits/chosen": -84432006.4, "logits/rejected": -41134410.666666664, "logps/chosen": -568.1265625, "logps/rejected": -429.0487060546875, "loss": 0.283, "rewards/chosen": 0.5472314357757568, "rewards/margins": 5.147870461146037, "rewards/rejected": -4.60063902537028, "step": 18277 }, { "epoch": 0.968807144939443, "grad_norm": 49.0, "kl": 0.4343681335449219, "learning_rate": 5e-07, "logits/chosen": -39490757.333333336, "logits/rejected": -56743824.0, "logps/chosen": -260.9208577473958, "logps/rejected": -392.96783447265625, "loss": 0.3716, "rewards/chosen": 0.16917153199513754, "rewards/margins": 3.8476092418034873, "rewards/rejected": -3.6784377098083496, "step": 18278 }, { "epoch": 0.9688601489412451, "grad_norm": 48.5, "kl": 1.5574836730957031, "learning_rate": 5e-07, "logits/chosen": -27342217.6, "logits/rejected": -39072549.333333336, "logps/chosen": -288.4623291015625, "logps/rejected": -263.65358479817706, "loss": 0.3106, "rewards/chosen": 0.38613650798797605, "rewards/margins": 3.1701086123784386, "rewards/rejected": -2.7839721043904624, "step": 18279 }, { "epoch": 0.9689131529430473, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39879164.0, "logits/rejected": 10943524.0, "logps/chosen": -275.38446044921875, "logps/rejected": -426.4659423828125, "loss": 0.2799, "rewards/chosen": 0.28069570660591125, "rewards/margins": 2.6250684559345245, "rewards/rejected": -2.3443727493286133, "step": 18280 }, { "epoch": 0.9689661569448493, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -71301674.66666667, "logits/rejected": -4697449.0, "logps/chosen": -284.4974772135417, "logps/rejected": -298.019287109375, "loss": 0.3679, "rewards/chosen": 0.6128344933191935, "rewards/margins": 1.4005648295084634, "rewards/rejected": -0.78773033618927, "step": 18281 }, { "epoch": 0.9690191609466515, "grad_norm": 34.5, "kl": 1.7550830841064453, "learning_rate": 5e-07, "logits/chosen": -11301549.6, "logits/rejected": -8068650.666666667, "logps/chosen": -125.3322509765625, "logps/rejected": -524.8277587890625, "loss": 0.2116, "rewards/chosen": 1.006693458557129, "rewards/margins": 6.460163052876791, "rewards/rejected": -5.453469594319661, "step": 18282 }, { "epoch": 0.9690721649484536, "grad_norm": 45.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -35283484.0, "logits/rejected": -15753633.0, "logps/chosen": -385.59930419921875, "logps/rejected": -287.02105712890625, "loss": 0.2244, "rewards/chosen": 0.6751431822776794, "rewards/margins": 3.9687644839286804, "rewards/rejected": -3.293621301651001, "step": 18283 }, { "epoch": 0.9691251689502558, "grad_norm": 47.5, "kl": 1.062912940979004, "learning_rate": 5e-07, "logits/chosen": -25661792.0, "logits/rejected": -18328604.8, "logps/chosen": -216.7216593424479, "logps/rejected": -291.9591796875, "loss": 0.2288, "rewards/chosen": 0.9452246030171713, "rewards/margins": 2.968183549245199, "rewards/rejected": -2.0229589462280275, "step": 18284 }, { "epoch": 0.9691781729520579, "grad_norm": 47.5, "kl": 0.2361602783203125, "learning_rate": 5e-07, "logits/chosen": -39642612.0, "logits/rejected": -13422063.0, "logps/chosen": -518.125732421875, "logps/rejected": -259.2415771484375, "loss": 0.2209, "rewards/chosen": 0.5783302783966064, "rewards/margins": 3.9492228031158447, "rewards/rejected": -3.3708925247192383, "step": 18285 }, { "epoch": 0.96923117695386, "grad_norm": 68.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18410174.666666668, "logits/rejected": -27300521.6, "logps/chosen": -720.8338216145834, "logps/rejected": -595.0212890625, "loss": 0.2601, "rewards/chosen": 0.3407577673594157, "rewards/margins": 2.7621368567148843, "rewards/rejected": -2.4213790893554688, "step": 18286 }, { "epoch": 0.9692841809556622, "grad_norm": 42.5, "kl": 3.0963993072509766, "learning_rate": 5e-07, "logits/chosen": 118694553.6, "logits/rejected": -686881.1666666666, "logps/chosen": -322.240576171875, "logps/rejected": -139.9019571940104, "loss": 0.2969, "rewards/chosen": 0.9928411483764649, "rewards/margins": 2.634407107035319, "rewards/rejected": -1.6415659586588542, "step": 18287 }, { "epoch": 0.9693371849574642, "grad_norm": 50.75, "kl": 1.2071456909179688, "learning_rate": 5e-07, "logits/chosen": -17692160.0, "logits/rejected": -28254892.0, "logps/chosen": -373.875, "logps/rejected": -299.8330078125, "loss": 0.3167, "rewards/chosen": 0.9449218908945719, "rewards/margins": 2.8037293354670205, "rewards/rejected": -1.8588074445724487, "step": 18288 }, { "epoch": 0.9693901889592664, "grad_norm": 45.75, "kl": 0.6237621307373047, "learning_rate": 5e-07, "logits/chosen": 5001995.0, "logits/rejected": -4502147.2, "logps/chosen": -185.8153076171875, "logps/rejected": -176.65333251953126, "loss": 0.2423, "rewards/chosen": 0.15834554036458334, "rewards/margins": 2.6564811070760093, "rewards/rejected": -2.498135566711426, "step": 18289 }, { "epoch": 0.9694431929610685, "grad_norm": 46.25, "kl": 1.1158065795898438, "learning_rate": 5e-07, "logits/chosen": -11630410.666666666, "logits/rejected": -18143500.8, "logps/chosen": -258.0620930989583, "logps/rejected": -279.2052734375, "loss": 0.3318, "rewards/chosen": 0.5491497913996378, "rewards/margins": 1.4636273304621379, "rewards/rejected": -0.9144775390625, "step": 18290 }, { "epoch": 0.9694961969628707, "grad_norm": 38.0, "kl": 0.5934991836547852, "learning_rate": 5e-07, "logits/chosen": -9280042.0, "logits/rejected": -34396684.8, "logps/chosen": -255.96976725260416, "logps/rejected": -532.01748046875, "loss": 0.2252, "rewards/chosen": 0.3538668950398763, "rewards/margins": 3.536090024312337, "rewards/rejected": -3.1822231292724608, "step": 18291 }, { "epoch": 0.9695492009646728, "grad_norm": 51.25, "kl": 4.388101577758789, "learning_rate": 5e-07, "logits/chosen": 6239363.2, "logits/rejected": -19637549.333333332, "logps/chosen": -204.13287353515625, "logps/rejected": -924.7794596354166, "loss": 0.3623, "rewards/chosen": 0.3263246059417725, "rewards/margins": 3.788613780339559, "rewards/rejected": -3.4622891743977866, "step": 18292 }, { "epoch": 0.969602204966475, "grad_norm": 51.5, "kl": 4.116486549377441, "learning_rate": 5e-07, "logits/chosen": -8235325.333333333, "logits/rejected": -217165536.0, "logps/chosen": -270.93365478515625, "logps/rejected": -585.3372802734375, "loss": 0.3271, "rewards/chosen": 0.765998125076294, "rewards/margins": 4.2063562870025635, "rewards/rejected": -3.4403581619262695, "step": 18293 }, { "epoch": 0.9696552089682771, "grad_norm": 40.0, "kl": 0.5236701965332031, "learning_rate": 5e-07, "logits/chosen": -14505165.0, "logits/rejected": -33994240.0, "logps/chosen": -453.156494140625, "logps/rejected": -523.0013427734375, "loss": 0.2294, "rewards/chosen": 0.9185478091239929, "rewards/margins": 3.877128541469574, "rewards/rejected": -2.958580732345581, "step": 18294 }, { "epoch": 0.9697082129700793, "grad_norm": 55.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27550262.4, "logits/rejected": -26206917.333333332, "logps/chosen": -249.9383544921875, "logps/rejected": -466.362548828125, "loss": 0.3711, "rewards/chosen": -0.0987526535987854, "rewards/margins": 2.214310332139333, "rewards/rejected": -2.3130629857381186, "step": 18295 }, { "epoch": 0.9697612169718813, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48938245.333333336, "logits/rejected": -38927472.0, "logps/chosen": -448.1254475911458, "logps/rejected": -372.22216796875, "loss": 0.1322, "rewards/chosen": 1.523384730021159, "rewards/margins": 4.576680819193522, "rewards/rejected": -3.0532960891723633, "step": 18296 }, { "epoch": 0.9698142209736835, "grad_norm": 51.5, "kl": 2.531951904296875, "learning_rate": 5e-07, "logits/chosen": -32778348.8, "logits/rejected": -22268989.333333332, "logps/chosen": -304.9594482421875, "logps/rejected": -235.91141764322916, "loss": 0.3244, "rewards/chosen": 0.41573357582092285, "rewards/margins": 4.300993839899698, "rewards/rejected": -3.885260264078776, "step": 18297 }, { "epoch": 0.9698672249754856, "grad_norm": 54.5, "kl": 0.31177520751953125, "learning_rate": 5e-07, "logits/chosen": 5485523.0, "logits/rejected": -10337751.0, "logps/chosen": -208.95875549316406, "logps/rejected": -177.91329956054688, "loss": 0.3057, "rewards/chosen": 0.19489018619060516, "rewards/margins": 2.042482778429985, "rewards/rejected": -1.8475925922393799, "step": 18298 }, { "epoch": 0.9699202289772878, "grad_norm": 63.25, "kl": 0.21097946166992188, "learning_rate": 5e-07, "logits/chosen": -1695258.0, "logits/rejected": -37701686.4, "logps/chosen": -259.2340494791667, "logps/rejected": -231.7184326171875, "loss": 0.1975, "rewards/chosen": 0.8410295645395914, "rewards/margins": 3.115284172693888, "rewards/rejected": -2.2742546081542967, "step": 18299 }, { "epoch": 0.9699732329790899, "grad_norm": 34.5, "kl": 3.000347137451172, "learning_rate": 5e-07, "logits/chosen": -11920059.0, "logits/rejected": -48120092.0, "logps/chosen": -228.107666015625, "logps/rejected": -230.9696807861328, "loss": 0.2164, "rewards/chosen": 1.097063660621643, "rewards/margins": 4.176218628883362, "rewards/rejected": -3.0791549682617188, "step": 18300 }, { "epoch": 0.9700262369808921, "grad_norm": 58.75, "kl": 2.151348114013672, "learning_rate": 5e-07, "logits/chosen": -48792972.0, "logits/rejected": 300019.0, "logps/chosen": -505.1697082519531, "logps/rejected": -126.82820129394531, "loss": 0.2331, "rewards/chosen": 0.8289859294891357, "rewards/margins": 4.2583441734313965, "rewards/rejected": -3.4293582439422607, "step": 18301 }, { "epoch": 0.9700792409826942, "grad_norm": 56.75, "kl": 0.23671436309814453, "learning_rate": 5e-07, "logits/chosen": -39290840.0, "logits/rejected": -26829012.0, "logps/chosen": -371.7604573567708, "logps/rejected": -689.1956787109375, "loss": 0.3761, "rewards/chosen": 0.19622508684794107, "rewards/margins": 2.438671271006266, "rewards/rejected": -2.242446184158325, "step": 18302 }, { "epoch": 0.9701322449844964, "grad_norm": 26.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 39005120.0, "logits/rejected": -15196836.8, "logps/chosen": -135.5121053059896, "logps/rejected": -318.7708984375, "loss": 0.1429, "rewards/chosen": 0.6173764864603678, "rewards/margins": 5.173172918955485, "rewards/rejected": -4.555796432495117, "step": 18303 }, { "epoch": 0.9701852489862984, "grad_norm": 43.25, "kl": 4.587497711181641, "learning_rate": 5e-07, "logits/chosen": -38981696.0, "logits/rejected": -17704413.333333332, "logps/chosen": -297.094384765625, "logps/rejected": -311.20200602213544, "loss": 0.2813, "rewards/chosen": 1.0584997177124023, "rewards/margins": 4.352574094136556, "rewards/rejected": -3.294074376424154, "step": 18304 }, { "epoch": 0.9702382529881006, "grad_norm": 76.5, "kl": 0.4992713928222656, "learning_rate": 5e-07, "logits/chosen": -46640008.0, "logits/rejected": -50172748.0, "logps/chosen": -287.7801513671875, "logps/rejected": -484.6493835449219, "loss": 0.2648, "rewards/chosen": 0.5936865210533142, "rewards/margins": 3.535503089427948, "rewards/rejected": -2.941816568374634, "step": 18305 }, { "epoch": 0.9702912569899027, "grad_norm": 36.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33653202.666666664, "logits/rejected": -19469723.2, "logps/chosen": -193.55875651041666, "logps/rejected": -383.5431396484375, "loss": 0.2549, "rewards/chosen": -0.24221624930699667, "rewards/margins": 2.8084499875704445, "rewards/rejected": -3.0506662368774413, "step": 18306 }, { "epoch": 0.9703442609917049, "grad_norm": 39.75, "kl": 2.0876235961914062, "learning_rate": 5e-07, "logits/chosen": -15110876.0, "logits/rejected": -49547872.0, "logps/chosen": -248.8838907877604, "logps/rejected": -418.90859375, "loss": 0.2656, "rewards/chosen": -0.06260426839192708, "rewards/margins": 3.3820788065592446, "rewards/rejected": -3.444683074951172, "step": 18307 }, { "epoch": 0.970397264993507, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51386901.333333336, "logits/rejected": -22948972.8, "logps/chosen": -397.5711263020833, "logps/rejected": -297.980810546875, "loss": 0.2527, "rewards/chosen": -0.30107422669728595, "rewards/margins": 2.7649722973505653, "rewards/rejected": -3.0660465240478514, "step": 18308 }, { "epoch": 0.9704502689953092, "grad_norm": 36.75, "kl": 2.3941354751586914, "learning_rate": 5e-07, "logits/chosen": -2744047.0, "logits/rejected": 154126816.0, "logps/chosen": -310.38348388671875, "logps/rejected": -432.5500793457031, "loss": 0.2644, "rewards/chosen": 0.7381290793418884, "rewards/margins": 4.383431375026703, "rewards/rejected": -3.6453022956848145, "step": 18309 }, { "epoch": 0.9705032729971113, "grad_norm": 31.75, "kl": 4.846551895141602, "learning_rate": 5e-07, "logits/chosen": -9624485.6, "logits/rejected": -12681593.333333334, "logps/chosen": -107.3584716796875, "logps/rejected": -352.9270833333333, "loss": 0.3546, "rewards/chosen": 0.21966326236724854, "rewards/margins": 3.9419676065444946, "rewards/rejected": -3.722304344177246, "step": 18310 }, { "epoch": 0.9705562769989134, "grad_norm": 49.25, "kl": 6.308835029602051, "learning_rate": 5e-07, "logits/chosen": -6675381.0, "logits/rejected": -11661188.0, "logps/chosen": -449.25238037109375, "logps/rejected": -241.6387176513672, "loss": 0.2857, "rewards/chosen": 1.7276504039764404, "rewards/margins": 2.8515485525131226, "rewards/rejected": -1.1238981485366821, "step": 18311 }, { "epoch": 0.9706092810007155, "grad_norm": 53.0, "kl": 2.222707748413086, "learning_rate": 5e-07, "logits/chosen": -28758784.0, "logits/rejected": 42109360.0, "logps/chosen": -383.393896484375, "logps/rejected": -230.1429239908854, "loss": 0.3468, "rewards/chosen": 0.6537383556365967, "rewards/margins": 2.5397175312042237, "rewards/rejected": -1.885979175567627, "step": 18312 }, { "epoch": 0.9706622850025177, "grad_norm": 37.5, "kl": 1.7560348510742188, "learning_rate": 5e-07, "logits/chosen": -22042044.0, "logits/rejected": -3136159.0, "logps/chosen": -398.3631286621094, "logps/rejected": -205.86322021484375, "loss": 0.1668, "rewards/chosen": 1.704991340637207, "rewards/margins": 3.972059488296509, "rewards/rejected": -2.2670681476593018, "step": 18313 }, { "epoch": 0.9707152890043198, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16260641.333333334, "logits/rejected": -30762057.6, "logps/chosen": -159.9616495768229, "logps/rejected": -334.2696044921875, "loss": 0.1744, "rewards/chosen": 0.5188415845235189, "rewards/margins": 3.550116570790609, "rewards/rejected": -3.03127498626709, "step": 18314 }, { "epoch": 0.970768293006122, "grad_norm": 58.0, "kl": 3.8156070709228516, "learning_rate": 5e-07, "logits/chosen": 5784632.0, "logits/rejected": -51004012.0, "logps/chosen": -355.3410237630208, "logps/rejected": -342.82476806640625, "loss": 0.4065, "rewards/chosen": 0.4544084866841634, "rewards/margins": 2.2804643710454306, "rewards/rejected": -1.826055884361267, "step": 18315 }, { "epoch": 0.9708212970079241, "grad_norm": 36.5, "kl": 2.2681732177734375, "learning_rate": 5e-07, "logits/chosen": -33839196.8, "logits/rejected": -40619370.666666664, "logps/chosen": -266.75615234375, "logps/rejected": -573.8887939453125, "loss": 0.276, "rewards/chosen": 0.6566629886627198, "rewards/margins": 3.916909901301066, "rewards/rejected": -3.260246912638346, "step": 18316 }, { "epoch": 0.9708743010097263, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33903432.0, "logits/rejected": -1639589.7142857143, "logps/chosen": -484.4613037109375, "logps/rejected": -353.04764229910717, "loss": 0.0821, "rewards/chosen": 2.231646776199341, "rewards/margins": 5.02380463055202, "rewards/rejected": -2.7921578543526784, "step": 18317 }, { "epoch": 0.9709273050115284, "grad_norm": 58.5, "kl": 0.07548141479492188, "learning_rate": 5e-07, "logits/chosen": -45619413.333333336, "logits/rejected": -14005179.0, "logps/chosen": -312.60972086588544, "logps/rejected": -204.04615783691406, "loss": 0.4264, "rewards/chosen": -0.04515520731608073, "rewards/margins": 1.3101670344670613, "rewards/rejected": -1.355322241783142, "step": 18318 }, { "epoch": 0.9709803090133305, "grad_norm": 49.0, "kl": 1.4743614196777344, "learning_rate": 5e-07, "logits/chosen": -13789157.0, "logits/rejected": 72614496.0, "logps/chosen": -117.32441711425781, "logps/rejected": -223.3159637451172, "loss": 0.3753, "rewards/chosen": -0.0207059383392334, "rewards/margins": 1.3822100162506104, "rewards/rejected": -1.4029159545898438, "step": 18319 }, { "epoch": 0.9710333130151326, "grad_norm": 49.0, "kl": 0.6609916687011719, "learning_rate": 5e-07, "logits/chosen": -38978652.0, "logits/rejected": 3019122.5, "logps/chosen": -325.28570556640625, "logps/rejected": -87.83075714111328, "loss": 0.2307, "rewards/chosen": 0.7734172940254211, "rewards/margins": 3.373591959476471, "rewards/rejected": -2.60017466545105, "step": 18320 }, { "epoch": 0.9710863170169348, "grad_norm": 65.0, "kl": 0.3088960647583008, "learning_rate": 5e-07, "logits/chosen": -23122912.0, "logits/rejected": -24098988.0, "logps/chosen": -278.0520935058594, "logps/rejected": -244.9869384765625, "loss": 0.3245, "rewards/chosen": 0.2587989866733551, "rewards/margins": 1.817873865365982, "rewards/rejected": -1.559074878692627, "step": 18321 }, { "epoch": 0.9711393210187369, "grad_norm": 24.125, "kl": 2.4488868713378906, "learning_rate": 5e-07, "logits/chosen": -152933.5, "logits/rejected": -26404563.2, "logps/chosen": -42.24805196126302, "logps/rejected": -400.8255859375, "loss": 0.2122, "rewards/chosen": 0.4388923645019531, "rewards/margins": 3.8742691040039063, "rewards/rejected": -3.435376739501953, "step": 18322 }, { "epoch": 0.9711923250205391, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21653221.333333332, "logits/rejected": -60843379.2, "logps/chosen": -227.26033528645834, "logps/rejected": -239.5470703125, "loss": 0.2897, "rewards/chosen": 0.17200060685475668, "rewards/margins": 2.051547916730245, "rewards/rejected": -1.8795473098754882, "step": 18323 }, { "epoch": 0.9712453290223412, "grad_norm": 58.0, "kl": 6.484596252441406, "learning_rate": 5e-07, "logits/chosen": -11914581.333333334, "logits/rejected": -21341720.0, "logps/chosen": -317.7410074869792, "logps/rejected": -496.2344665527344, "loss": 0.3772, "rewards/chosen": 1.2514019807179768, "rewards/margins": 2.909803469975789, "rewards/rejected": -1.6584014892578125, "step": 18324 }, { "epoch": 0.9712983330241434, "grad_norm": 36.5, "kl": 2.1582841873168945, "learning_rate": 5e-07, "logits/chosen": -4156035.0, "logits/rejected": -144737.25, "logps/chosen": -64.99820454915364, "logps/rejected": -226.29110717773438, "loss": 0.4083, "rewards/chosen": 0.20847002665201822, "rewards/margins": 2.84746781984965, "rewards/rejected": -2.638997793197632, "step": 18325 }, { "epoch": 0.9713513370259454, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -37958712.0, "logps/rejected": -288.622314453125, "loss": 0.1473, "rewards/rejected": -2.1778759956359863, "step": 18326 }, { "epoch": 0.9714043410277476, "grad_norm": 55.5, "kl": 0.6519546508789062, "learning_rate": 5e-07, "logits/chosen": -19925001.6, "logits/rejected": -28304522.666666668, "logps/chosen": -137.5052490234375, "logps/rejected": -391.5787353515625, "loss": 0.2827, "rewards/chosen": 0.3851089239120483, "rewards/margins": 3.812535579999288, "rewards/rejected": -3.4274266560872397, "step": 18327 }, { "epoch": 0.9714573450295497, "grad_norm": 39.5, "kl": 0.7422866821289062, "learning_rate": 5e-07, "logits/chosen": -22331996.0, "logits/rejected": -11255888.0, "logps/chosen": -171.56488037109375, "logps/rejected": -354.1122741699219, "loss": 0.2924, "rewards/chosen": 0.46573659777641296, "rewards/margins": 2.910316437482834, "rewards/rejected": -2.444579839706421, "step": 18328 }, { "epoch": 0.9715103490313519, "grad_norm": 44.25, "kl": 0.8097209930419922, "learning_rate": 5e-07, "logits/chosen": -28937428.0, "logits/rejected": -77974248.0, "logps/chosen": -388.4178466796875, "logps/rejected": -401.9470520019531, "loss": 0.2474, "rewards/chosen": 0.8854705691337585, "rewards/margins": 3.433543384075165, "rewards/rejected": -2.5480728149414062, "step": 18329 }, { "epoch": 0.971563353033154, "grad_norm": 57.25, "kl": 3.424612045288086, "learning_rate": 5e-07, "logits/chosen": -12529164.0, "logits/rejected": -20393654.666666668, "logps/chosen": -296.6483154296875, "logps/rejected": -307.6895345052083, "loss": 0.359, "rewards/chosen": 0.7718710899353027, "rewards/margins": 2.654969374338786, "rewards/rejected": -1.8830982844034831, "step": 18330 }, { "epoch": 0.9716163570349562, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11041381.333333334, "logits/rejected": -9707718.4, "logps/chosen": -349.23828125, "logps/rejected": -407.101611328125, "loss": 0.1451, "rewards/chosen": 0.9551967779795328, "rewards/margins": 4.623268906275431, "rewards/rejected": -3.6680721282958983, "step": 18331 }, { "epoch": 0.9716693610367583, "grad_norm": 32.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2189074.0, "logits/rejected": -14136834.666666666, "logps/chosen": -122.05931854248047, "logps/rejected": -290.67112223307294, "loss": 0.1408, "rewards/chosen": 1.3203644752502441, "rewards/margins": 4.019886175791422, "rewards/rejected": -2.6995217005411782, "step": 18332 }, { "epoch": 0.9717223650385605, "grad_norm": 43.0, "kl": 1.7616462707519531, "learning_rate": 5e-07, "logits/chosen": -39516044.0, "logits/rejected": -34353136.0, "logps/chosen": -421.6781005859375, "logps/rejected": -401.6700439453125, "loss": 0.1911, "rewards/chosen": 1.3575265407562256, "rewards/margins": 5.277753114700317, "rewards/rejected": -3.920226573944092, "step": 18333 }, { "epoch": 0.9717753690403625, "grad_norm": 56.0, "kl": 1.4209403991699219, "learning_rate": 5e-07, "logits/chosen": -18640192.0, "logits/rejected": 10693747.0, "logps/chosen": -615.1096801757812, "logps/rejected": -261.944091796875, "loss": 0.2761, "rewards/chosen": 1.0747016668319702, "rewards/margins": 3.2440415620803833, "rewards/rejected": -2.169339895248413, "step": 18334 }, { "epoch": 0.9718283730421646, "grad_norm": 64.0, "kl": 0.19687461853027344, "learning_rate": 5e-07, "logits/chosen": -40461513.6, "logits/rejected": 4093680.0, "logps/chosen": -434.999365234375, "logps/rejected": -98.60076904296875, "loss": 0.2835, "rewards/chosen": 0.44143404960632326, "rewards/margins": 4.333385610580445, "rewards/rejected": -3.891951560974121, "step": 18335 }, { "epoch": 0.9718813770439668, "grad_norm": 40.25, "kl": 0.0037250518798828125, "learning_rate": 5e-07, "logits/chosen": -37295432.0, "logits/rejected": -31126838.85714286, "logps/chosen": -363.66375732421875, "logps/rejected": -383.34375, "loss": 0.1385, "rewards/chosen": 1.8312195539474487, "rewards/margins": 4.735656755311148, "rewards/rejected": -2.9044372013636996, "step": 18336 }, { "epoch": 0.9719343810457689, "grad_norm": 50.0, "kl": 2.0927200317382812, "learning_rate": 5e-07, "logits/chosen": 20808326.4, "logits/rejected": -8908043.333333334, "logps/chosen": -187.358056640625, "logps/rejected": -454.6029459635417, "loss": 0.3385, "rewards/chosen": 0.3882452487945557, "rewards/margins": 2.981198771794637, "rewards/rejected": -2.5929535230000815, "step": 18337 }, { "epoch": 0.9719873850475711, "grad_norm": 70.5, "kl": 2.7011260986328125, "learning_rate": 5e-07, "logits/chosen": -35889115.428571425, "logits/rejected": -141635648.0, "logps/chosen": -252.2735595703125, "logps/rejected": -196.50411987304688, "loss": 0.4407, "rewards/chosen": 0.3518989767347063, "rewards/margins": 2.318511196545192, "rewards/rejected": -1.9666122198104858, "step": 18338 }, { "epoch": 0.9720403890493732, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12263525.333333334, "logits/rejected": -41441843.2, "logps/chosen": -179.59676106770834, "logps/rejected": -310.220556640625, "loss": 0.1192, "rewards/chosen": 1.7436126073201497, "rewards/margins": 4.566290791829427, "rewards/rejected": -2.8226781845092774, "step": 18339 }, { "epoch": 0.9720933930511754, "grad_norm": 76.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -67654048.0, "logits/rejected": -40780773.333333336, "logps/chosen": -282.31011962890625, "logps/rejected": -278.0884195963542, "loss": 0.2629, "rewards/chosen": -0.07130298763513565, "rewards/margins": 1.8644285822908084, "rewards/rejected": -1.935731569925944, "step": 18340 }, { "epoch": 0.9721463970529775, "grad_norm": 57.75, "kl": 2.960142135620117, "learning_rate": 5e-07, "logits/chosen": 15519654.4, "logits/rejected": -4616100.666666667, "logps/chosen": -355.285498046875, "logps/rejected": -504.468017578125, "loss": 0.1975, "rewards/chosen": 1.3865528106689453, "rewards/margins": 4.3712304433186855, "rewards/rejected": -2.9846776326497397, "step": 18341 }, { "epoch": 0.9721994010547796, "grad_norm": 47.5, "kl": 1.4084014892578125, "learning_rate": 5e-07, "logits/chosen": -31014003.2, "logits/rejected": -28371840.0, "logps/chosen": -367.61318359375, "logps/rejected": -639.8643798828125, "loss": 0.2057, "rewards/chosen": 1.469434928894043, "rewards/margins": 5.010868899027506, "rewards/rejected": -3.5414339701334634, "step": 18342 }, { "epoch": 0.9722524050565817, "grad_norm": 37.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4761927.6, "logits/rejected": -17995904.0, "logps/chosen": -130.640087890625, "logps/rejected": -74.75655619303386, "loss": 0.354, "rewards/chosen": 0.0015972137451171875, "rewards/margins": 3.0712326049804686, "rewards/rejected": -3.0696353912353516, "step": 18343 }, { "epoch": 0.9723054090583839, "grad_norm": 43.5, "kl": 1.3567686080932617, "learning_rate": 5e-07, "logits/chosen": -1561104.4, "logits/rejected": 272986581.3333333, "logps/chosen": -158.86749267578125, "logps/rejected": -548.0679931640625, "loss": 0.2444, "rewards/chosen": 0.979531192779541, "rewards/margins": 4.383861764272054, "rewards/rejected": -3.404330571492513, "step": 18344 }, { "epoch": 0.972358413060186, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -33730872.0, "logits/rejected": -8305210.0, "logps/chosen": -548.774658203125, "logps/rejected": -277.0757751464844, "loss": 0.3256, "rewards/chosen": -0.25855693221092224, "rewards/margins": 3.0974086821079254, "rewards/rejected": -3.3559656143188477, "step": 18345 }, { "epoch": 0.9724114170619882, "grad_norm": 40.5, "kl": 0.15404319763183594, "learning_rate": 5e-07, "logits/chosen": -29008067.2, "logits/rejected": -3521908.6666666665, "logps/chosen": -317.7591552734375, "logps/rejected": -49.14466857910156, "loss": 0.3081, "rewards/chosen": 0.6610699653625488, "rewards/margins": 2.2089247067769366, "rewards/rejected": -1.547854741414388, "step": 18346 }, { "epoch": 0.9724644210637903, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -76432048.0, "logits/rejected": -44800720.0, "logps/chosen": -196.0806427001953, "logps/rejected": -379.1637369791667, "loss": 0.2005, "rewards/chosen": -0.20487405359745026, "rewards/margins": 2.9293718189001083, "rewards/rejected": -3.1342458724975586, "step": 18347 }, { "epoch": 0.9725174250655925, "grad_norm": 46.5, "kl": 0.36702728271484375, "learning_rate": 5e-07, "logits/chosen": -36391312.0, "logits/rejected": -21133201.333333332, "logps/chosen": -308.6230224609375, "logps/rejected": -637.1980387369791, "loss": 0.3031, "rewards/chosen": 0.30024399757385256, "rewards/margins": 3.1753914992014565, "rewards/rejected": -2.875147501627604, "step": 18348 }, { "epoch": 0.9725704290673945, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31121766.0, "logits/rejected": -5683029.0, "logps/chosen": -407.3670959472656, "logps/rejected": -438.00482177734375, "loss": 0.1911, "rewards/chosen": 1.015052080154419, "rewards/margins": 3.6574342250823975, "rewards/rejected": -2.6423821449279785, "step": 18349 }, { "epoch": 0.9726234330691967, "grad_norm": 54.0, "kl": 0.275115966796875, "learning_rate": 5e-07, "logits/chosen": -104140544.0, "logits/rejected": -66779059.2, "logps/chosen": -277.8768717447917, "logps/rejected": -316.944287109375, "loss": 0.1921, "rewards/chosen": 1.0544377168019612, "rewards/margins": 3.1924919923146566, "rewards/rejected": -2.1380542755126952, "step": 18350 }, { "epoch": 0.9726764370709988, "grad_norm": 39.75, "kl": 2.5246143341064453, "learning_rate": 5e-07, "logits/chosen": -14434800.0, "logits/rejected": 2159743.75, "logps/chosen": -191.11800130208334, "logps/rejected": -173.77584838867188, "loss": 0.4067, "rewards/chosen": 0.5869079033533732, "rewards/margins": 2.699007789293925, "rewards/rejected": -2.1120998859405518, "step": 18351 }, { "epoch": 0.972729441072801, "grad_norm": 45.75, "kl": 0.4156675338745117, "learning_rate": 5e-07, "logits/chosen": -5262791.2, "logits/rejected": 186300.5, "logps/chosen": -287.435009765625, "logps/rejected": -71.25308736165364, "loss": 0.2091, "rewards/chosen": 1.4147254943847656, "rewards/margins": 3.9033610343933107, "rewards/rejected": -2.488635540008545, "step": 18352 }, { "epoch": 0.9727824450746031, "grad_norm": 44.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20568328.0, "logits/rejected": -21201202.0, "logps/chosen": -322.0964050292969, "logps/rejected": -274.7400207519531, "loss": 0.1829, "rewards/chosen": 0.7424030303955078, "rewards/margins": 4.458068609237671, "rewards/rejected": -3.715665578842163, "step": 18353 }, { "epoch": 0.9728354490764053, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21280173.333333332, "logits/rejected": 14967600.0, "logps/chosen": -246.31148274739584, "logps/rejected": -308.803076171875, "loss": 0.224, "rewards/chosen": 0.497040589650472, "rewards/margins": 2.709883530934652, "rewards/rejected": -2.2128429412841797, "step": 18354 }, { "epoch": 0.9728884530782074, "grad_norm": 45.25, "kl": 0.32617950439453125, "learning_rate": 5e-07, "logits/chosen": -46204185.6, "logits/rejected": -32570378.666666668, "logps/chosen": -368.7201171875, "logps/rejected": -324.6888834635417, "loss": 0.2531, "rewards/chosen": 0.933776569366455, "rewards/margins": 3.2711309115091955, "rewards/rejected": -2.3373543421427407, "step": 18355 }, { "epoch": 0.9729414570800096, "grad_norm": 58.0, "kl": 3.4611501693725586, "learning_rate": 5e-07, "logits/chosen": -1847926.8333333333, "logits/rejected": -28551716.0, "logps/chosen": -117.21718343098958, "logps/rejected": -233.53196716308594, "loss": 0.2329, "rewards/chosen": 1.4746187527974446, "rewards/margins": 4.381626923878987, "rewards/rejected": -2.907008171081543, "step": 18356 }, { "epoch": 0.9729944610818116, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27514493.333333332, "logits/rejected": -10307024.0, "logps/chosen": -403.210693359375, "logps/rejected": -167.2456298828125, "loss": 0.1508, "rewards/chosen": 0.6827209790547689, "rewards/margins": 5.151708634694417, "rewards/rejected": -4.468987655639649, "step": 18357 }, { "epoch": 0.9730474650836138, "grad_norm": 50.0, "kl": 2.425424575805664, "learning_rate": 5e-07, "logits/chosen": -20648172.0, "logits/rejected": -12577471.2, "logps/chosen": -293.012939453125, "logps/rejected": -274.709130859375, "loss": 0.2642, "rewards/chosen": 0.8670370578765869, "rewards/margins": 2.390050935745239, "rewards/rejected": -1.5230138778686524, "step": 18358 }, { "epoch": 0.9731004690854159, "grad_norm": 49.5, "kl": 1.3224506378173828, "learning_rate": 5e-07, "logits/chosen": -18479638.0, "logits/rejected": -15818122.0, "logps/chosen": -193.13397216796875, "logps/rejected": -300.46099853515625, "loss": 0.3634, "rewards/chosen": -0.039261627942323685, "rewards/margins": 2.0372230522334576, "rewards/rejected": -2.0764846801757812, "step": 18359 }, { "epoch": 0.9731534730872181, "grad_norm": 50.5, "kl": 2.3224964141845703, "learning_rate": 5e-07, "logits/chosen": -10854803.2, "logits/rejected": -23861045.333333332, "logps/chosen": -180.21875, "logps/rejected": -458.776611328125, "loss": 0.3493, "rewards/chosen": 0.21077051162719726, "rewards/margins": 3.3370710055033364, "rewards/rejected": -3.126300493876139, "step": 18360 }, { "epoch": 0.9732064770890202, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51389724.0, "logits/rejected": 41700868.0, "logps/chosen": -382.7115173339844, "logps/rejected": -456.4764404296875, "loss": 0.2939, "rewards/chosen": 0.05390244722366333, "rewards/margins": 3.0636804699897766, "rewards/rejected": -3.0097780227661133, "step": 18361 }, { "epoch": 0.9732594810908224, "grad_norm": 54.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31165450.666666668, "logits/rejected": -7165949.0, "logps/chosen": -213.772216796875, "logps/rejected": -147.30735778808594, "loss": 0.3244, "rewards/chosen": 0.31522587935129803, "rewards/margins": 6.596860686937968, "rewards/rejected": -6.28163480758667, "step": 18362 }, { "epoch": 0.9733124850926245, "grad_norm": 45.0, "kl": 2.1373062133789062, "learning_rate": 5e-07, "logits/chosen": -21911622.0, "logits/rejected": -14170999.0, "logps/chosen": -401.1391906738281, "logps/rejected": -178.27914428710938, "loss": 0.1952, "rewards/chosen": 1.7327141761779785, "rewards/margins": 4.166069507598877, "rewards/rejected": -2.4333553314208984, "step": 18363 }, { "epoch": 0.9733654890944267, "grad_norm": 62.25, "kl": 4.795764923095703, "learning_rate": 5e-07, "logits/chosen": -27181209.14285714, "logits/rejected": 8829023.0, "logps/chosen": -333.2350376674107, "logps/rejected": -4.033849716186523, "loss": 0.5216, "rewards/chosen": 0.47202702930995394, "rewards/margins": 0.4971058028084891, "rewards/rejected": -0.025078773498535156, "step": 18364 }, { "epoch": 0.9734184930962287, "grad_norm": 44.25, "kl": 1.858062744140625, "learning_rate": 5e-07, "logits/chosen": -35518848.0, "logits/rejected": -70145861.33333333, "logps/chosen": -210.886572265625, "logps/rejected": -377.5073649088542, "loss": 0.3194, "rewards/chosen": 0.21650993824005127, "rewards/margins": 3.7062054872512817, "rewards/rejected": -3.4896955490112305, "step": 18365 }, { "epoch": 0.9734714970980309, "grad_norm": 41.0, "kl": 0.10988235473632812, "learning_rate": 5e-07, "logits/chosen": -24710602.666666668, "logits/rejected": -70311225.6, "logps/chosen": -194.04191080729166, "logps/rejected": -568.5310546875, "loss": 0.2034, "rewards/chosen": 0.17255884408950806, "rewards/margins": 3.3517094254493713, "rewards/rejected": -3.1791505813598633, "step": 18366 }, { "epoch": 0.973524501099833, "grad_norm": 42.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21818502.0, "logits/rejected": -44496040.0, "logps/chosen": -263.61883544921875, "logps/rejected": -417.91680908203125, "loss": 0.2784, "rewards/chosen": 0.1744847297668457, "rewards/margins": 2.516021728515625, "rewards/rejected": -2.3415369987487793, "step": 18367 }, { "epoch": 0.9735775051016352, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8418797.333333334, "logits/rejected": -19267744.0, "logps/chosen": -291.51059977213544, "logps/rejected": -481.3705078125, "loss": 0.2147, "rewards/chosen": 0.6058899561564127, "rewards/margins": 3.3596317927042643, "rewards/rejected": -2.7537418365478517, "step": 18368 }, { "epoch": 0.9736305091034373, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40557440.0, "logits/rejected": -25763918.0, "logps/chosen": -131.0548553466797, "logps/rejected": -259.0965576171875, "loss": 0.3424, "rewards/chosen": -0.3537940979003906, "rewards/margins": 2.5299956798553467, "rewards/rejected": -2.8837897777557373, "step": 18369 }, { "epoch": 0.9736835131052395, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29255852.0, "logits/rejected": -13268702.666666666, "logps/chosen": -383.8319396972656, "logps/rejected": -206.68241373697916, "loss": 0.1941, "rewards/chosen": -0.14243698120117188, "rewards/margins": 2.7197465896606445, "rewards/rejected": -2.8621835708618164, "step": 18370 }, { "epoch": 0.9737365171070416, "grad_norm": 57.75, "kl": 3.95123291015625, "learning_rate": 5e-07, "logits/chosen": 11590311.2, "logits/rejected": -50482389.333333336, "logps/chosen": -204.0895263671875, "logps/rejected": -258.9220784505208, "loss": 0.3071, "rewards/chosen": 0.7130950927734375, "rewards/margins": 2.139360014597575, "rewards/rejected": -1.4262649218241374, "step": 18371 }, { "epoch": 0.9737895211088438, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17108028.0, "logits/rejected": -15865831.0, "logps/chosen": -255.14231872558594, "logps/rejected": -205.85372924804688, "loss": 0.2232, "rewards/chosen": 0.6116583347320557, "rewards/margins": 3.132004499435425, "rewards/rejected": -2.520346164703369, "step": 18372 }, { "epoch": 0.9738425251106458, "grad_norm": 72.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51947139.2, "logits/rejected": 2820513.3333333335, "logps/chosen": -612.0412109375, "logps/rejected": -410.9424641927083, "loss": 0.3677, "rewards/chosen": -0.07763185501098632, "rewards/margins": 2.217560863494873, "rewards/rejected": -2.2951927185058594, "step": 18373 }, { "epoch": 0.973895529112448, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -36060154.666666664, "logits/rejected": -27896643.2, "logps/chosen": -202.01239013671875, "logps/rejected": -432.906591796875, "loss": 0.2791, "rewards/chosen": -0.1416913370291392, "rewards/margins": 2.4149729390939076, "rewards/rejected": -2.5566642761230467, "step": 18374 }, { "epoch": 0.9739485331142501, "grad_norm": 41.5, "kl": 0.761962890625, "learning_rate": 5e-07, "logits/chosen": -30989088.0, "logits/rejected": -6467118.0, "logps/chosen": -228.73077392578125, "logps/rejected": -249.83329264322916, "loss": 0.2465, "rewards/chosen": -0.21558724343776703, "rewards/margins": 2.363527829448382, "rewards/rejected": -2.579115072886149, "step": 18375 }, { "epoch": 0.9740015371160523, "grad_norm": 48.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -84966976.0, "logits/rejected": -29286904.0, "logps/chosen": -488.1893005371094, "logps/rejected": -300.88958740234375, "loss": 0.2687, "rewards/chosen": 0.461325079202652, "rewards/margins": 2.633012682199478, "rewards/rejected": -2.171687602996826, "step": 18376 }, { "epoch": 0.9740545411178544, "grad_norm": 46.25, "kl": 0.34955596923828125, "learning_rate": 5e-07, "logits/chosen": -35582758.4, "logits/rejected": -62264624.0, "logps/chosen": -326.4413818359375, "logps/rejected": -516.3755696614584, "loss": 0.3347, "rewards/chosen": 0.07242577075958252, "rewards/margins": 2.644462513923645, "rewards/rejected": -2.5720367431640625, "step": 18377 }, { "epoch": 0.9741075451196566, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49040448.0, "logits/rejected": -12711216.0, "logps/chosen": -594.992919921875, "logps/rejected": -118.5672607421875, "loss": 0.1601, "rewards/chosen": 1.066389004389445, "rewards/margins": 4.206711880366008, "rewards/rejected": -3.1403228759765627, "step": 18378 }, { "epoch": 0.9741605491214587, "grad_norm": 44.5, "kl": 0.5670623779296875, "learning_rate": 5e-07, "logits/chosen": 7129594.0, "logits/rejected": -34029324.0, "logps/chosen": -298.91680908203125, "logps/rejected": -316.80450439453125, "loss": 0.2428, "rewards/chosen": 0.5691260099411011, "rewards/margins": 3.2499693632125854, "rewards/rejected": -2.6808433532714844, "step": 18379 }, { "epoch": 0.9742135531232609, "grad_norm": 39.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19103636.0, "logits/rejected": -27220202.0, "logps/chosen": -539.3404541015625, "logps/rejected": -262.084228515625, "loss": 0.2217, "rewards/chosen": 1.308064579963684, "rewards/margins": 3.344446063041687, "rewards/rejected": -2.036381483078003, "step": 18380 }, { "epoch": 0.9742665571250629, "grad_norm": 48.5, "kl": 1.0354232788085938, "learning_rate": 5e-07, "logits/chosen": -31229270.0, "logits/rejected": -7215775.0, "logps/chosen": -196.37884521484375, "logps/rejected": -209.8301239013672, "loss": 0.3693, "rewards/chosen": 0.0016087479889392853, "rewards/margins": 1.2927092500030994, "rewards/rejected": -1.2911005020141602, "step": 18381 }, { "epoch": 0.9743195611268651, "grad_norm": 47.75, "kl": 1.0622215270996094, "learning_rate": 5e-07, "logits/chosen": -32760460.8, "logits/rejected": -21410058.666666668, "logps/chosen": -277.155224609375, "logps/rejected": -173.40059407552084, "loss": 0.2746, "rewards/chosen": 0.7050937175750732, "rewards/margins": 2.959414339065552, "rewards/rejected": -2.2543206214904785, "step": 18382 }, { "epoch": 0.9743725651286672, "grad_norm": 37.75, "kl": 0.8453559875488281, "learning_rate": 5e-07, "logits/chosen": -37697177.6, "logits/rejected": 2386606.6666666665, "logps/chosen": -575.56787109375, "logps/rejected": -349.2112223307292, "loss": 0.2741, "rewards/chosen": 1.0245893478393555, "rewards/margins": 3.520832824707031, "rewards/rejected": -2.496243476867676, "step": 18383 }, { "epoch": 0.9744255691304694, "grad_norm": 45.5, "kl": 0.825531005859375, "learning_rate": 5e-07, "logits/chosen": -8249720.666666667, "logits/rejected": -35578268.8, "logps/chosen": -309.18017578125, "logps/rejected": -268.521728515625, "loss": 0.1446, "rewards/chosen": 1.2756271362304688, "rewards/margins": 3.941168785095215, "rewards/rejected": -2.665541648864746, "step": 18384 }, { "epoch": 0.9744785731322715, "grad_norm": 41.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 5617506.666666667, "logits/rejected": -36997785.6, "logps/chosen": -144.41236368815103, "logps/rejected": -258.726513671875, "loss": 0.2457, "rewards/chosen": 0.6313557624816895, "rewards/margins": 2.779679203033447, "rewards/rejected": -2.1483234405517577, "step": 18385 }, { "epoch": 0.9745315771340736, "grad_norm": 49.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45579984.0, "logits/rejected": -44795868.0, "logps/chosen": -631.509765625, "logps/rejected": -240.68450927734375, "loss": 0.1775, "rewards/chosen": 0.7426117658615112, "rewards/margins": 4.368066668510437, "rewards/rejected": -3.625454902648926, "step": 18386 }, { "epoch": 0.9745845811358758, "grad_norm": 36.25, "kl": 3.1051816940307617, "learning_rate": 5e-07, "logits/chosen": -7806289.333333333, "logits/rejected": -86034032.0, "logps/chosen": -443.6982828776042, "logps/rejected": -247.40887451171875, "loss": 0.3344, "rewards/chosen": 0.7295104662577311, "rewards/margins": 3.5763889948527017, "rewards/rejected": -2.8468785285949707, "step": 18387 }, { "epoch": 0.9746375851376778, "grad_norm": 52.25, "kl": 1.120253562927246, "learning_rate": 5e-07, "logits/chosen": -53807232.0, "logits/rejected": -12170540.0, "logps/chosen": -453.30767822265625, "logps/rejected": -129.06056213378906, "loss": 0.3015, "rewards/chosen": 0.2947143614292145, "rewards/margins": 2.4652511179447174, "rewards/rejected": -2.170536756515503, "step": 18388 }, { "epoch": 0.97469058913948, "grad_norm": 29.0, "kl": 1.2065467834472656, "learning_rate": 5e-07, "logits/chosen": 12980881.6, "logits/rejected": -34914944.0, "logps/chosen": -141.84613037109375, "logps/rejected": -175.03765869140625, "loss": 0.2111, "rewards/chosen": 1.2224557876586915, "rewards/margins": 5.033118693033854, "rewards/rejected": -3.8106629053751626, "step": 18389 }, { "epoch": 0.9747435931412821, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101992896.0, "logits/rejected": -19374688.0, "logps/chosen": -413.9307861328125, "logps/rejected": -321.1360270182292, "loss": 0.1294, "rewards/chosen": 0.843231201171875, "rewards/margins": 4.031595230102539, "rewards/rejected": -3.188364028930664, "step": 18390 }, { "epoch": 0.9747965971430843, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48667050.666666664, "logits/rejected": -51368099.2, "logps/chosen": -596.2611083984375, "logps/rejected": -286.67568359375, "loss": 0.1724, "rewards/chosen": 1.5717225074768066, "rewards/margins": 4.4812699317932125, "rewards/rejected": -2.9095474243164063, "step": 18391 }, { "epoch": 0.9748496011448864, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9701208.0, "logits/rejected": -532009.3333333334, "logps/chosen": -119.52991485595703, "logps/rejected": -361.53662109375, "loss": 0.2638, "rewards/chosen": -0.23191414773464203, "rewards/margins": 1.8061136156320572, "rewards/rejected": -2.038027763366699, "step": 18392 }, { "epoch": 0.9749026051466886, "grad_norm": 42.5, "kl": 5.4358673095703125, "learning_rate": 5e-07, "logits/chosen": 9071862.666666666, "logits/rejected": -16709136.0, "logps/chosen": -261.99733479817706, "logps/rejected": -368.69085693359375, "loss": 0.343, "rewards/chosen": 0.8589843908945719, "rewards/margins": 6.301451603571574, "rewards/rejected": -5.442467212677002, "step": 18393 }, { "epoch": 0.9749556091484907, "grad_norm": 45.5, "kl": 1.1066093444824219, "learning_rate": 5e-07, "logits/chosen": -32936405.333333332, "logits/rejected": -78327590.4, "logps/chosen": -464.3026529947917, "logps/rejected": -531.012060546875, "loss": 0.1323, "rewards/chosen": 1.9480988184611003, "rewards/margins": 4.41731325785319, "rewards/rejected": -2.46921443939209, "step": 18394 }, { "epoch": 0.9750086131502929, "grad_norm": 41.0, "kl": 1.0371522903442383, "learning_rate": 5e-07, "logits/chosen": -41623000.0, "logits/rejected": -6497450.0, "logps/chosen": -444.5484924316406, "logps/rejected": -322.1611633300781, "loss": 0.215, "rewards/chosen": 1.0327036380767822, "rewards/margins": 3.347191095352173, "rewards/rejected": -2.3144874572753906, "step": 18395 }, { "epoch": 0.9750616171520949, "grad_norm": 27.875, "kl": 4.328791618347168, "learning_rate": 5e-07, "logits/chosen": 22347530.0, "logits/rejected": 675910.0, "logps/chosen": -918.2145385742188, "logps/rejected": -300.01690673828125, "loss": 0.2409, "rewards/chosen": 1.2242337465286255, "rewards/margins": 3.6070507764816284, "rewards/rejected": -2.382817029953003, "step": 18396 }, { "epoch": 0.9751146211538971, "grad_norm": 46.5, "kl": 3.7602157592773438, "learning_rate": 5e-07, "logits/chosen": -17471872.0, "logits/rejected": -35252564.0, "logps/chosen": -255.84464518229166, "logps/rejected": -340.93914794921875, "loss": 0.3998, "rewards/chosen": 0.637312094370524, "rewards/margins": 2.9038044611612954, "rewards/rejected": -2.2664923667907715, "step": 18397 }, { "epoch": 0.9751676251556992, "grad_norm": 39.25, "kl": 1.1772918701171875, "learning_rate": 5e-07, "logits/chosen": -26145816.0, "logits/rejected": -13108515.0, "logps/chosen": -271.27813720703125, "logps/rejected": -192.96588134765625, "loss": 0.2647, "rewards/chosen": 0.2893105745315552, "rewards/margins": 3.6580554246902466, "rewards/rejected": -3.3687448501586914, "step": 18398 }, { "epoch": 0.9752206291575014, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -41614435.2, "logits/rejected": -61081381.333333336, "logps/chosen": -264.185595703125, "logps/rejected": -236.88435872395834, "loss": 0.3615, "rewards/chosen": 0.16461868286132814, "rewards/margins": 2.017460060119629, "rewards/rejected": -1.8528413772583008, "step": 18399 }, { "epoch": 0.9752736331593035, "grad_norm": 43.75, "kl": 1.7541546821594238, "learning_rate": 5e-07, "logits/chosen": -18972388.8, "logits/rejected": -26811488.0, "logps/chosen": -287.5260009765625, "logps/rejected": -650.1471354166666, "loss": 0.2727, "rewards/chosen": 0.6410123348236084, "rewards/margins": 3.979224348068237, "rewards/rejected": -3.338212013244629, "step": 18400 }, { "epoch": 0.9753266371611057, "grad_norm": 42.0, "kl": 0.0, "learning_rate": 5e-07, "logits/rejected": -15950350.0, "logps/rejected": -393.79046630859375, "loss": 0.1127, "rewards/rejected": -2.8971333503723145, "step": 18401 }, { "epoch": 0.9753796411629078, "grad_norm": 35.5, "kl": 3.4277162551879883, "learning_rate": 5e-07, "logits/chosen": -4266936.0, "logits/rejected": -2436150.25, "logps/chosen": -243.6631622314453, "logps/rejected": -124.38459777832031, "loss": 0.2363, "rewards/chosen": 0.9956339001655579, "rewards/margins": 3.6530914902687073, "rewards/rejected": -2.6574575901031494, "step": 18402 }, { "epoch": 0.97543264516471, "grad_norm": 48.25, "kl": 4.671712875366211, "learning_rate": 5e-07, "logits/chosen": -22296182.4, "logits/rejected": -9403382.666666666, "logps/chosen": -249.54873046875, "logps/rejected": -108.9058837890625, "loss": 0.4168, "rewards/chosen": 0.17616195678710939, "rewards/margins": 4.156015268961588, "rewards/rejected": -3.979853312174479, "step": 18403 }, { "epoch": 0.975485649166512, "grad_norm": 51.75, "kl": 0.32375335693359375, "learning_rate": 5e-07, "logits/chosen": -32863666.0, "logits/rejected": -33035700.0, "logps/chosen": -377.9239501953125, "logps/rejected": -244.54043579101562, "loss": 0.2749, "rewards/chosen": 0.4612196087837219, "rewards/margins": 2.305483877658844, "rewards/rejected": -1.844264268875122, "step": 18404 }, { "epoch": 0.9755386531683142, "grad_norm": 58.0, "kl": 1.2463817596435547, "learning_rate": 5e-07, "logits/chosen": -22517888.0, "logits/rejected": -7804876.5, "logps/chosen": -202.49002075195312, "logps/rejected": -134.55776977539062, "loss": 0.2735, "rewards/chosen": 0.7794389128684998, "rewards/margins": 2.3680288195610046, "rewards/rejected": -1.5885899066925049, "step": 18405 }, { "epoch": 0.9755916571701163, "grad_norm": 32.25, "kl": 0.7442798614501953, "learning_rate": 5e-07, "logits/chosen": -56735.0, "logits/rejected": -45777696.0, "logps/chosen": -95.25687408447266, "logps/rejected": -363.2801513671875, "loss": 0.1969, "rewards/chosen": 0.21142369508743286, "rewards/margins": 2.876251995563507, "rewards/rejected": -2.664828300476074, "step": 18406 }, { "epoch": 0.9756446611719185, "grad_norm": 32.25, "kl": 2.391633987426758, "learning_rate": 5e-07, "logits/chosen": -26159920.0, "logits/rejected": -40175452.0, "logps/chosen": -246.2367960611979, "logps/rejected": -545.7379760742188, "loss": 0.2767, "rewards/chosen": 1.10850191116333, "rewards/margins": 4.681028366088867, "rewards/rejected": -3.572526454925537, "step": 18407 }, { "epoch": 0.9756976651737206, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -56014460.0, "logits/rejected": 9945329.0, "logps/chosen": -446.4598083496094, "logps/rejected": -442.7774353027344, "loss": 0.2847, "rewards/chosen": 0.48658305406570435, "rewards/margins": 3.1556647419929504, "rewards/rejected": -2.669081687927246, "step": 18408 }, { "epoch": 0.9757506691755228, "grad_norm": 42.25, "kl": 4.447259902954102, "learning_rate": 5e-07, "logits/chosen": -43049900.8, "logits/rejected": -2475891.0, "logps/chosen": -283.8983154296875, "logps/rejected": -498.5638020833333, "loss": 0.3196, "rewards/chosen": 0.6902907848358154, "rewards/margins": 4.785937325159709, "rewards/rejected": -4.0956465403238935, "step": 18409 }, { "epoch": 0.9758036731773249, "grad_norm": 51.5, "kl": 0.3229560852050781, "learning_rate": 5e-07, "logits/chosen": -18763209.6, "logits/rejected": -36597392.0, "logps/chosen": -472.02998046875, "logps/rejected": -384.8011067708333, "loss": 0.3273, "rewards/chosen": 0.7074199676513672, "rewards/margins": 2.4223950068155924, "rewards/rejected": -1.7149750391642253, "step": 18410 }, { "epoch": 0.975856677179127, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -12964862.0, "logits/rejected": -16548016.0, "logps/chosen": -347.7041015625, "logps/rejected": -384.1507975260417, "loss": 0.1858, "rewards/chosen": 0.06413116306066513, "rewards/margins": 3.344991428156694, "rewards/rejected": -3.280860265096029, "step": 18411 }, { "epoch": 0.9759096811809291, "grad_norm": 37.25, "kl": 0.21916580200195312, "learning_rate": 5e-07, "logits/chosen": -106714552.0, "logits/rejected": -32803730.285714287, "logps/chosen": -775.828369140625, "logps/rejected": -261.6433803013393, "loss": 0.1057, "rewards/chosen": 0.5985168814659119, "rewards/margins": 3.607916840485164, "rewards/rejected": -3.009399959019252, "step": 18412 }, { "epoch": 0.9759626851827313, "grad_norm": 42.5, "kl": 1.3184127807617188, "learning_rate": 5e-07, "logits/chosen": 517675.3333333333, "logits/rejected": -18531044.8, "logps/chosen": -237.119140625, "logps/rejected": -376.8979248046875, "loss": 0.1853, "rewards/chosen": 1.5812578201293945, "rewards/margins": 3.183892250061035, "rewards/rejected": -1.6026344299316406, "step": 18413 }, { "epoch": 0.9760156891845334, "grad_norm": 67.0, "kl": 2.0180206298828125, "learning_rate": 5e-07, "logits/chosen": -64000876.0, "logits/rejected": -37115760.0, "logps/chosen": -402.6651611328125, "logps/rejected": -253.52740478515625, "loss": 0.3322, "rewards/chosen": 0.42740437388420105, "rewards/margins": 2.004083961248398, "rewards/rejected": -1.5766795873641968, "step": 18414 }, { "epoch": 0.9760686931863356, "grad_norm": 51.25, "kl": 1.5311813354492188, "learning_rate": 5e-07, "logits/chosen": -17183764.8, "logits/rejected": -14692369.333333334, "logps/chosen": -253.206591796875, "logps/rejected": -331.320068359375, "loss": 0.3098, "rewards/chosen": 0.7831467628479004, "rewards/margins": 2.99425417582194, "rewards/rejected": -2.2111074129740396, "step": 18415 }, { "epoch": 0.9761216971881377, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28889388.0, "logits/rejected": -10071038.0, "logps/chosen": -184.28334045410156, "logps/rejected": -247.17843627929688, "loss": 0.2905, "rewards/chosen": 0.6194460988044739, "rewards/margins": 2.000796377658844, "rewards/rejected": -1.3813502788543701, "step": 18416 }, { "epoch": 0.9761747011899399, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26401320.0, "logits/rejected": -50997436.0, "logps/chosen": -201.35800170898438, "logps/rejected": -308.50775146484375, "loss": 0.2604, "rewards/chosen": 0.7413490414619446, "rewards/margins": 2.5004077553749084, "rewards/rejected": -1.7590587139129639, "step": 18417 }, { "epoch": 0.976227705191742, "grad_norm": 50.75, "kl": 2.429027557373047, "learning_rate": 5e-07, "logits/chosen": -67478229.33333333, "logits/rejected": -45160280.0, "logps/chosen": -475.8460693359375, "logps/rejected": -487.6583557128906, "loss": 0.2296, "rewards/chosen": 1.1310149828592937, "rewards/margins": 3.784992376963298, "rewards/rejected": -2.653977394104004, "step": 18418 }, { "epoch": 0.9762807091935441, "grad_norm": 38.5, "kl": 1.121840476989746, "learning_rate": 5e-07, "logits/chosen": -5613764.0, "logits/rejected": -24633278.0, "logps/chosen": -143.70071411132812, "logps/rejected": -250.30474853515625, "loss": 0.2931, "rewards/chosen": 0.7061266899108887, "rewards/margins": 2.575250744819641, "rewards/rejected": -1.8691240549087524, "step": 18419 }, { "epoch": 0.9763337131953462, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5639365.0, "logits/rejected": -20206448.0, "logps/chosen": -105.87889099121094, "logps/rejected": -448.90557861328125, "loss": 0.2582, "rewards/chosen": 0.3139269948005676, "rewards/margins": 3.804159462451935, "rewards/rejected": -3.490232467651367, "step": 18420 }, { "epoch": 0.9763867171971484, "grad_norm": 43.0, "kl": 7.209662437438965, "learning_rate": 5e-07, "logits/chosen": -18974602.285714287, "logits/rejected": -22020754.0, "logps/chosen": -284.32777622767856, "logps/rejected": -229.47242736816406, "loss": 0.434, "rewards/chosen": 0.9042588642665318, "rewards/margins": 2.6551041773387363, "rewards/rejected": -1.7508453130722046, "step": 18421 }, { "epoch": 0.9764397211989505, "grad_norm": 45.0, "kl": 2.6907501220703125, "learning_rate": 5e-07, "logits/chosen": -10621545.333333334, "logits/rejected": -5270987.0, "logps/chosen": -167.6668701171875, "logps/rejected": -185.05035400390625, "loss": 0.323, "rewards/chosen": 0.7505497137705485, "rewards/margins": 2.13390843073527, "rewards/rejected": -1.3833587169647217, "step": 18422 }, { "epoch": 0.9764927252007527, "grad_norm": 65.0, "kl": 2.741140365600586, "learning_rate": 5e-07, "logits/chosen": -35740790.4, "logits/rejected": -24605272.0, "logps/chosen": -302.821728515625, "logps/rejected": -286.21533203125, "loss": 0.2543, "rewards/chosen": 1.1909190177917481, "rewards/margins": 2.7789167722066246, "rewards/rejected": -1.5879977544148762, "step": 18423 }, { "epoch": 0.9765457292025548, "grad_norm": 51.75, "kl": 0.9593963623046875, "learning_rate": 5e-07, "logits/chosen": -19575246.4, "logits/rejected": -29539573.333333332, "logps/chosen": -238.3166259765625, "logps/rejected": -419.4090576171875, "loss": 0.2691, "rewards/chosen": 0.4120288372039795, "rewards/margins": 4.87752776145935, "rewards/rejected": -4.465498924255371, "step": 18424 }, { "epoch": 0.976598733204357, "grad_norm": 53.0, "kl": 0.4087228775024414, "learning_rate": 5e-07, "logits/chosen": -25308781.333333332, "logits/rejected": 16126000.0, "logps/chosen": -269.9813232421875, "logps/rejected": -309.9466857910156, "loss": 0.3633, "rewards/chosen": 0.5013719399770101, "rewards/margins": 1.488359053929647, "rewards/rejected": -0.9869871139526367, "step": 18425 }, { "epoch": 0.976651737206159, "grad_norm": 58.0, "kl": 1.782114028930664, "learning_rate": 5e-07, "logits/chosen": -9179058.666666666, "logits/rejected": -41786195.2, "logps/chosen": -196.1756388346354, "logps/rejected": -348.7828369140625, "loss": 0.3244, "rewards/chosen": 0.09458641211191814, "rewards/margins": 1.7329695145289103, "rewards/rejected": -1.6383831024169921, "step": 18426 }, { "epoch": 0.9767047412079612, "grad_norm": 53.0, "kl": 2.039055824279785, "learning_rate": 5e-07, "logits/chosen": -9699323.2, "logits/rejected": -14108077.333333334, "logps/chosen": -200.95008544921876, "logps/rejected": -518.2274983723959, "loss": 0.4183, "rewards/chosen": -0.0920221447944641, "rewards/margins": 1.5620398084322613, "rewards/rejected": -1.6540619532267253, "step": 18427 }, { "epoch": 0.9767577452097633, "grad_norm": 18.75, "kl": 4.235589981079102, "learning_rate": 5e-07, "logits/chosen": 7204976.0, "logits/rejected": -42360746.666666664, "logps/chosen": -685.298828125, "logps/rejected": -321.4510498046875, "loss": 0.2315, "rewards/chosen": 1.7335458755493165, "rewards/margins": 3.859545644124349, "rewards/rejected": -2.1259997685750327, "step": 18428 }, { "epoch": 0.9768107492115655, "grad_norm": 76.5, "kl": 3.393918991088867, "learning_rate": 5e-07, "logits/chosen": -46978858.666666664, "logits/rejected": -18328376.0, "logps/chosen": -458.722900390625, "logps/rejected": -179.03619384765625, "loss": 0.3461, "rewards/chosen": 1.308619499206543, "rewards/margins": 1.787706583738327, "rewards/rejected": -0.47908708453178406, "step": 18429 }, { "epoch": 0.9768637532133676, "grad_norm": 42.5, "kl": 3.0900516510009766, "learning_rate": 5e-07, "logits/chosen": -51337920.0, "logits/rejected": -82325696.0, "logps/chosen": -142.5544677734375, "logps/rejected": -395.125, "loss": 0.3436, "rewards/chosen": 0.4955312252044678, "rewards/margins": 4.018356466293335, "rewards/rejected": -3.522825241088867, "step": 18430 }, { "epoch": 0.9769167572151698, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43868362.666666664, "logits/rejected": -25967704.0, "logps/chosen": -435.8478190104167, "logps/rejected": -202.3306640625, "loss": 0.242, "rewards/chosen": 0.6298985481262207, "rewards/margins": 2.5269761085510254, "rewards/rejected": -1.8970775604248047, "step": 18431 }, { "epoch": 0.9769697612169719, "grad_norm": 54.75, "kl": 0.670379638671875, "learning_rate": 5e-07, "logits/chosen": -87350328.0, "logits/rejected": -19364268.0, "logps/chosen": -439.6650390625, "logps/rejected": -219.55877685546875, "loss": 0.1975, "rewards/chosen": 0.7165282964706421, "rewards/margins": 2.96514626344045, "rewards/rejected": -2.248617966969808, "step": 18432 }, { "epoch": 0.9770227652187741, "grad_norm": 48.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40825256.0, "logits/rejected": -35306016.0, "logps/chosen": -350.0561930338542, "logps/rejected": -230.8438720703125, "loss": 0.2313, "rewards/chosen": 1.1861724853515625, "rewards/margins": 3.277279853820801, "rewards/rejected": -2.0911073684692383, "step": 18433 }, { "epoch": 0.9770757692205762, "grad_norm": 47.75, "kl": 1.2992897033691406, "learning_rate": 5e-07, "logits/chosen": -52040252.8, "logits/rejected": -36069781.333333336, "logps/chosen": -364.7302734375, "logps/rejected": -326.5500081380208, "loss": 0.2835, "rewards/chosen": 0.6793584823608398, "rewards/margins": 3.664747873942057, "rewards/rejected": -2.9853893915812173, "step": 18434 }, { "epoch": 0.9771287732223783, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61876172.0, "logits/rejected": 88702314.66666667, "logps/chosen": -204.4747314453125, "logps/rejected": -327.2668863932292, "loss": 0.1964, "rewards/chosen": 0.3870651125907898, "rewards/margins": 3.0332719683647156, "rewards/rejected": -2.646206855773926, "step": 18435 }, { "epoch": 0.9771817772241804, "grad_norm": 58.75, "kl": 2.3603286743164062, "learning_rate": 5e-07, "logits/chosen": -5032176.0, "logits/rejected": -41247898.666666664, "logps/chosen": -708.283642578125, "logps/rejected": -462.5935872395833, "loss": 0.2891, "rewards/chosen": 0.9274552345275879, "rewards/margins": 2.9616679827372234, "rewards/rejected": -2.0342127482096353, "step": 18436 }, { "epoch": 0.9772347812259825, "grad_norm": 68.5, "kl": 2.3401870727539062, "learning_rate": 5e-07, "logits/chosen": -87791744.0, "logits/rejected": -38148784.0, "logps/chosen": -524.22509765625, "logps/rejected": -288.0246887207031, "loss": 0.2668, "rewards/chosen": 1.1172572374343872, "rewards/margins": 3.2317882776260376, "rewards/rejected": -2.1145310401916504, "step": 18437 }, { "epoch": 0.9772877852277847, "grad_norm": 45.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6063777.333333333, "logits/rejected": -36956272.0, "logps/chosen": -278.0865478515625, "logps/rejected": -305.759326171875, "loss": 0.2161, "rewards/chosen": -0.10125100612640381, "rewards/margins": 3.039569306373596, "rewards/rejected": -3.1408203125, "step": 18438 }, { "epoch": 0.9773407892295868, "grad_norm": 73.5, "kl": 1.0759730339050293, "learning_rate": 5e-07, "logits/chosen": -12410100.0, "logits/rejected": -5424499.5, "logps/chosen": -412.6728515625, "logps/rejected": -97.95404815673828, "loss": 0.2546, "rewards/chosen": 0.9861224492390951, "rewards/margins": 5.698983510335286, "rewards/rejected": -4.712861061096191, "step": 18439 }, { "epoch": 0.977393793231389, "grad_norm": 55.5, "kl": 0.6992034912109375, "learning_rate": 5e-07, "logits/chosen": -48031420.8, "logits/rejected": -29211226.666666668, "logps/chosen": -323.4870361328125, "logps/rejected": -191.3806355794271, "loss": 0.3719, "rewards/chosen": 0.24696052074432373, "rewards/margins": 1.7370047966639202, "rewards/rejected": -1.4900442759195964, "step": 18440 }, { "epoch": 0.9774467972331911, "grad_norm": 37.0, "kl": 2.909912109375, "learning_rate": 5e-07, "logits/chosen": 4450094.0, "logits/rejected": -35959328.0, "logps/chosen": -93.29448699951172, "logps/rejected": -315.5899658203125, "loss": 0.2809, "rewards/chosen": 0.765183687210083, "rewards/margins": 2.9855217933654785, "rewards/rejected": -2.2203381061553955, "step": 18441 }, { "epoch": 0.9774998012349932, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4352848.666666667, "logits/rejected": -2739781.6, "logps/chosen": -192.19091796875, "logps/rejected": -251.4908447265625, "loss": 0.2286, "rewards/chosen": -0.07123407224814098, "rewards/margins": 3.4945362677176797, "rewards/rejected": -3.5657703399658205, "step": 18442 }, { "epoch": 0.9775528052367953, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26540016.0, "logits/rejected": -20113329.6, "logps/chosen": -390.4417724609375, "logps/rejected": -165.2724365234375, "loss": 0.1765, "rewards/chosen": 0.7695190111796061, "rewards/margins": 4.059353605906169, "rewards/rejected": -3.2898345947265626, "step": 18443 }, { "epoch": 0.9776058092385975, "grad_norm": 40.75, "kl": 3.8073272705078125, "learning_rate": 5e-07, "logits/chosen": -12406397.0, "logits/rejected": -7638239.5, "logps/chosen": -75.86433410644531, "logps/rejected": -437.90716552734375, "loss": 0.2854, "rewards/chosen": 0.7750944495201111, "rewards/margins": 2.705370843410492, "rewards/rejected": -1.9302763938903809, "step": 18444 }, { "epoch": 0.9776588132403996, "grad_norm": 42.5, "kl": 5.266164779663086, "learning_rate": 5e-07, "logits/chosen": -13779226.666666666, "logits/rejected": -30251584.0, "logps/chosen": -140.6747029622396, "logps/rejected": -354.9936767578125, "loss": 0.3009, "rewards/chosen": 0.2519421974817912, "rewards/margins": 2.5059892098108927, "rewards/rejected": -2.2540470123291017, "step": 18445 }, { "epoch": 0.9777118172422018, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28064933.333333332, "logits/rejected": -35475219.2, "logps/chosen": -303.94419352213544, "logps/rejected": -258.452294921875, "loss": 0.1619, "rewards/chosen": 1.1248952547709148, "rewards/margins": 3.518841520945231, "rewards/rejected": -2.3939462661743165, "step": 18446 }, { "epoch": 0.9777648212440039, "grad_norm": 52.75, "kl": 2.024139404296875, "learning_rate": 5e-07, "logits/chosen": -26069802.0, "logits/rejected": -30145606.0, "logps/chosen": -410.3035888671875, "logps/rejected": -296.4458312988281, "loss": 0.1914, "rewards/chosen": 1.1033620834350586, "rewards/margins": 4.596099376678467, "rewards/rejected": -3.492737293243408, "step": 18447 }, { "epoch": 0.9778178252458061, "grad_norm": 38.25, "kl": 1.8463630676269531, "learning_rate": 5e-07, "logits/chosen": 3892754.5, "logits/rejected": -25761864.0, "logps/chosen": -268.7552490234375, "logps/rejected": -531.6771240234375, "loss": 0.2453, "rewards/chosen": 1.1021562814712524, "rewards/margins": 3.7848910093307495, "rewards/rejected": -2.682734727859497, "step": 18448 }, { "epoch": 0.9778708292476082, "grad_norm": 44.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39309013.333333336, "logits/rejected": -27350576.0, "logps/chosen": -264.5328369140625, "logps/rejected": -383.824658203125, "loss": 0.2686, "rewards/chosen": -0.3743728796641032, "rewards/margins": 3.127268966039022, "rewards/rejected": -3.501641845703125, "step": 18449 }, { "epoch": 0.9779238332494103, "grad_norm": 35.75, "kl": 0.6912164688110352, "learning_rate": 5e-07, "logits/chosen": -12422805.333333334, "logits/rejected": -41054588.8, "logps/chosen": -218.229736328125, "logps/rejected": -200.15152587890626, "loss": 0.1761, "rewards/chosen": 1.1903370221455891, "rewards/margins": 3.417611726125081, "rewards/rejected": -2.227274703979492, "step": 18450 }, { "epoch": 0.9779768372512124, "grad_norm": 35.25, "kl": 1.2225570678710938, "learning_rate": 5e-07, "logits/chosen": -18921276.0, "logits/rejected": -10324914.0, "logps/chosen": -328.9439697265625, "logps/rejected": -249.22596740722656, "loss": 0.1308, "rewards/chosen": 1.8951611518859863, "rewards/margins": 4.480366945266724, "rewards/rejected": -2.5852057933807373, "step": 18451 }, { "epoch": 0.9780298412530146, "grad_norm": 67.5, "kl": 5.938096046447754, "learning_rate": 5e-07, "logits/chosen": -21574666.0, "logps/chosen": -235.26046752929688, "loss": 0.5091, "rewards/chosen": 0.5476461052894592, "step": 18452 }, { "epoch": 0.9780828452548167, "grad_norm": 34.0, "kl": 0.6804571151733398, "learning_rate": 5e-07, "logits/chosen": -10657663.2, "logits/rejected": -50196346.666666664, "logps/chosen": -66.90729370117188, "logps/rejected": -511.664306640625, "loss": 0.3222, "rewards/chosen": 0.22099602222442627, "rewards/margins": 3.1963216066360474, "rewards/rejected": -2.975325584411621, "step": 18453 }, { "epoch": 0.9781358492566189, "grad_norm": 41.0, "kl": 0.9587535858154297, "learning_rate": 5e-07, "logits/chosen": -21063488.0, "logits/rejected": -22218586.0, "logps/chosen": -224.01412963867188, "logps/rejected": -185.27655029296875, "loss": 0.2395, "rewards/chosen": 1.0981078147888184, "rewards/margins": 3.641735792160034, "rewards/rejected": -2.543627977371216, "step": 18454 }, { "epoch": 0.978188853258421, "grad_norm": 53.25, "kl": 5.103791236877441, "learning_rate": 5e-07, "logits/chosen": 24014216.0, "logits/rejected": -28255109.333333332, "logps/chosen": -270.47470703125, "logps/rejected": -412.0048014322917, "loss": 0.2554, "rewards/chosen": 1.149628257751465, "rewards/margins": 3.505869801839193, "rewards/rejected": -2.356241544087728, "step": 18455 }, { "epoch": 0.9782418572602232, "grad_norm": 75.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -25772616.0, "logits/rejected": -32196280.0, "logps/chosen": -357.5706481933594, "logps/rejected": -379.4464416503906, "loss": 0.3299, "rewards/chosen": -0.6126400232315063, "rewards/margins": 3.292031168937683, "rewards/rejected": -3.9046711921691895, "step": 18456 }, { "epoch": 0.9782948612620253, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39163440.0, "logits/rejected": -19742995.2, "logps/chosen": -313.1193440755208, "logps/rejected": -236.7783447265625, "loss": 0.1754, "rewards/chosen": 0.6587870518366495, "rewards/margins": 3.8409369389216104, "rewards/rejected": -3.182149887084961, "step": 18457 }, { "epoch": 0.9783478652638274, "grad_norm": 43.5, "kl": 2.2459659576416016, "learning_rate": 5e-07, "logits/chosen": -18284596.0, "logits/rejected": -2050158.0, "logps/chosen": -187.4573974609375, "logps/rejected": -295.61370849609375, "loss": 0.3347, "rewards/chosen": 0.7213771343231201, "rewards/margins": 3.0172901153564453, "rewards/rejected": -2.295912981033325, "step": 18458 }, { "epoch": 0.9784008692656295, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13336153.333333334, "logits/rejected": -20933910.4, "logps/chosen": -152.70181274414062, "logps/rejected": -223.6042724609375, "loss": 0.1982, "rewards/chosen": 0.5871954758961996, "rewards/margins": 3.394867022832235, "rewards/rejected": -2.807671546936035, "step": 18459 }, { "epoch": 0.9784538732674317, "grad_norm": 38.5, "kl": 2.2562408447265625, "learning_rate": 5e-07, "logits/chosen": -41478288.0, "logits/rejected": -6941420.0, "logps/chosen": -214.07181803385416, "logps/rejected": -450.99716796875, "loss": 0.2231, "rewards/chosen": -0.12464902798334758, "rewards/margins": 3.4337139328320823, "rewards/rejected": -3.5583629608154297, "step": 18460 }, { "epoch": 0.9785068772692338, "grad_norm": 40.25, "kl": 1.1877861022949219, "learning_rate": 5e-07, "logits/chosen": -26697740.0, "logits/rejected": -19967794.0, "logps/chosen": -249.6096649169922, "logps/rejected": -392.7405700683594, "loss": 0.2726, "rewards/chosen": 0.8753485083580017, "rewards/margins": 3.1051339507102966, "rewards/rejected": -2.229785442352295, "step": 18461 }, { "epoch": 0.978559881271036, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17078056.0, "logits/rejected": -31945936.0, "logps/chosen": -332.00830078125, "logps/rejected": -228.61356608072916, "loss": 0.3761, "rewards/chosen": 0.23253698348999025, "rewards/margins": 1.3678942044576008, "rewards/rejected": -1.1353572209676106, "step": 18462 }, { "epoch": 0.9786128852728381, "grad_norm": 60.25, "kl": 0.5254974365234375, "learning_rate": 5e-07, "logits/chosen": -27865238.0, "logits/rejected": 3199985.5, "logps/chosen": -248.97421264648438, "logps/rejected": -249.23971557617188, "loss": 0.39, "rewards/chosen": 0.0031945258378982544, "rewards/margins": 1.6774812489748, "rewards/rejected": -1.6742867231369019, "step": 18463 }, { "epoch": 0.9786658892746403, "grad_norm": 60.5, "kl": 0.6513757705688477, "learning_rate": 5e-07, "logits/chosen": 1854968.6, "logits/rejected": -29995240.0, "logps/chosen": -217.4292724609375, "logps/rejected": -373.882568359375, "loss": 0.3532, "rewards/chosen": 0.21695547103881835, "rewards/margins": 2.483660825093587, "rewards/rejected": -2.266705354054769, "step": 18464 }, { "epoch": 0.9787188932764423, "grad_norm": 43.0, "kl": 3.3660335540771484, "learning_rate": 5e-07, "logits/chosen": 20564500.0, "logits/rejected": -12551916.0, "logps/chosen": -204.5447235107422, "logps/rejected": -158.16561889648438, "loss": 0.2636, "rewards/chosen": 1.2632851600646973, "rewards/margins": 2.500124931335449, "rewards/rejected": -1.236839771270752, "step": 18465 }, { "epoch": 0.9787718972782445, "grad_norm": 53.25, "kl": 1.3090286254882812, "learning_rate": 5e-07, "logits/chosen": -29817224.0, "logits/rejected": -4010557.0, "logps/chosen": -332.23077392578125, "logps/rejected": -84.25852966308594, "loss": 0.3279, "rewards/chosen": 0.03698673099279404, "rewards/margins": 2.681269071996212, "rewards/rejected": -2.644282341003418, "step": 18466 }, { "epoch": 0.9788249012800466, "grad_norm": 30.0, "kl": 0.7702522277832031, "learning_rate": 5e-07, "logits/chosen": -22670906.0, "logits/rejected": -9208962.0, "logps/chosen": -215.4949493408203, "logps/rejected": -205.5733642578125, "loss": 0.2274, "rewards/chosen": 0.9546303153038025, "rewards/margins": 3.903371751308441, "rewards/rejected": -2.9487414360046387, "step": 18467 }, { "epoch": 0.9788779052818488, "grad_norm": 32.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -5729980.0, "logits/rejected": -106931392.0, "logps/chosen": -293.3409729003906, "logps/rejected": -302.7588297526042, "loss": 0.1539, "rewards/chosen": 0.875622570514679, "rewards/margins": 4.056699891885122, "rewards/rejected": -3.181077321370443, "step": 18468 }, { "epoch": 0.9789309092836509, "grad_norm": 63.25, "kl": 0.626922607421875, "learning_rate": 5e-07, "logits/chosen": -19082358.85714286, "logits/rejected": -61357152.0, "logps/chosen": -342.9232700892857, "logps/rejected": -529.3799438476562, "loss": 0.437, "rewards/chosen": 0.13015041181019374, "rewards/margins": 2.0351614696638927, "rewards/rejected": -1.9050110578536987, "step": 18469 }, { "epoch": 0.9789839132854531, "grad_norm": 65.0, "kl": 2.6201229095458984, "learning_rate": 5e-07, "logits/chosen": -86439.2, "logits/rejected": -1123144.25, "logps/chosen": -338.2349365234375, "logps/rejected": -157.30241902669272, "loss": 0.312, "rewards/chosen": 1.0387929916381835, "rewards/margins": 3.1204548517862953, "rewards/rejected": -2.081661860148112, "step": 18470 }, { "epoch": 0.9790369172872552, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -50682329.6, "logits/rejected": -14430570.666666666, "logps/chosen": -384.8508056640625, "logps/rejected": -163.03179931640625, "loss": 0.3258, "rewards/chosen": 0.14851610660552977, "rewards/margins": 3.6308836221694945, "rewards/rejected": -3.482367515563965, "step": 18471 }, { "epoch": 0.9790899212890574, "grad_norm": 48.5, "kl": 3.1073856353759766, "learning_rate": 5e-07, "logits/chosen": -44748570.666666664, "logits/rejected": -56819528.0, "logps/chosen": -406.8541259765625, "logps/rejected": -297.57354736328125, "loss": 0.3187, "rewards/chosen": 0.9920679728190104, "rewards/margins": 3.0054592291514077, "rewards/rejected": -2.0133912563323975, "step": 18472 }, { "epoch": 0.9791429252908594, "grad_norm": 35.0, "kl": 2.4981813430786133, "learning_rate": 5e-07, "logits/chosen": -10875772.0, "logits/rejected": -13623280.0, "logps/chosen": -207.1727498372396, "logps/rejected": -215.165771484375, "loss": 0.1989, "rewards/chosen": 0.48332564036051434, "rewards/margins": 3.4732182184855143, "rewards/rejected": -2.989892578125, "step": 18473 }, { "epoch": 0.9791959292926616, "grad_norm": 70.0, "kl": 3.812641143798828, "learning_rate": 5e-07, "logits/chosen": -22449450.666666668, "logits/rejected": -15997918.0, "logps/chosen": -269.0709635416667, "logps/rejected": -230.89083862304688, "loss": 0.3595, "rewards/chosen": 0.6434251467386881, "rewards/margins": 2.5708425442377725, "rewards/rejected": -1.9274173974990845, "step": 18474 }, { "epoch": 0.9792489332944637, "grad_norm": 31.75, "kl": 0.4176826477050781, "learning_rate": 5e-07, "logits/chosen": -9452018.0, "logits/rejected": -26750619.2, "logps/chosen": -493.9004313151042, "logps/rejected": -431.215625, "loss": 0.1454, "rewards/chosen": 1.1017165978749592, "rewards/margins": 5.053189833958943, "rewards/rejected": -3.9514732360839844, "step": 18475 }, { "epoch": 0.9793019372962659, "grad_norm": 768.0, "kl": 5.802689552307129, "learning_rate": 5e-07, "logits/chosen": -12495577.6, "logits/rejected": -35000381.333333336, "logps/chosen": -448.526806640625, "logps/rejected": -211.97159830729166, "loss": 0.2204, "rewards/chosen": 1.8269403457641602, "rewards/margins": 3.660923131306966, "rewards/rejected": -1.833982785542806, "step": 18476 }, { "epoch": 0.979354941298068, "grad_norm": 65.5, "kl": 1.2912063598632812, "learning_rate": 5e-07, "logits/chosen": -39339336.0, "logits/rejected": -13921764.0, "logps/chosen": -424.6819254557292, "logps/rejected": -257.6756896972656, "loss": 0.3613, "rewards/chosen": 0.4755716323852539, "rewards/margins": 2.4084341526031494, "rewards/rejected": -1.9328625202178955, "step": 18477 }, { "epoch": 0.9794079452998702, "grad_norm": 56.0, "kl": 1.2614269256591797, "learning_rate": 5e-07, "logits/chosen": -25809281.6, "logits/rejected": -1970163.8333333333, "logps/chosen": -168.431884765625, "logps/rejected": -101.6840108235677, "loss": 0.4559, "rewards/chosen": -0.06261551380157471, "rewards/margins": 0.5624874035517374, "rewards/rejected": -0.6251029173533121, "step": 18478 }, { "epoch": 0.9794609493016723, "grad_norm": 46.25, "kl": 3.097970962524414, "learning_rate": 5e-07, "logits/chosen": -15198213.333333334, "logits/rejected": -16378341.0, "logps/chosen": -417.2461751302083, "logps/rejected": -122.44328308105469, "loss": 0.289, "rewards/chosen": 1.1521569887797039, "rewards/margins": 2.8180549542109175, "rewards/rejected": -1.6658979654312134, "step": 18479 }, { "epoch": 0.9795139533034745, "grad_norm": 55.0, "kl": 0.4212226867675781, "learning_rate": 5e-07, "logits/chosen": -50043146.666666664, "logits/rejected": -14256177.0, "logps/chosen": -309.3932698567708, "logps/rejected": -98.29701232910156, "loss": 0.3484, "rewards/chosen": 0.47266896565755206, "rewards/margins": 3.7271951039632163, "rewards/rejected": -3.254526138305664, "step": 18480 }, { "epoch": 0.9795669573052765, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15141792.0, "logits/rejected": -9685074.0, "logps/chosen": -300.2592468261719, "logps/rejected": -158.27865600585938, "loss": 0.262, "rewards/chosen": 0.4303373396396637, "rewards/margins": 3.21620312333107, "rewards/rejected": -2.7858657836914062, "step": 18481 }, { "epoch": 0.9796199613070787, "grad_norm": 50.75, "kl": 1.0960502624511719, "learning_rate": 5e-07, "logits/chosen": -4611347.0, "logits/rejected": -30057840.0, "logps/chosen": -694.9219970703125, "logps/rejected": -398.6084289550781, "loss": 0.2741, "rewards/chosen": 0.8417720794677734, "rewards/margins": 3.534687042236328, "rewards/rejected": -2.6929149627685547, "step": 18482 }, { "epoch": 0.9796729653088808, "grad_norm": 40.75, "kl": 2.7123069763183594, "learning_rate": 5e-07, "logits/chosen": -23098664.0, "logits/rejected": -15131668.8, "logps/chosen": -300.607177734375, "logps/rejected": -127.0086669921875, "loss": 0.2426, "rewards/chosen": 1.6117959022521973, "rewards/margins": 2.6267004013061523, "rewards/rejected": -1.014904499053955, "step": 18483 }, { "epoch": 0.979725969310683, "grad_norm": 43.25, "kl": 0.8157873153686523, "learning_rate": 5e-07, "logits/chosen": 2645044.0, "logits/rejected": -12684971.42857143, "logps/chosen": -38.810550689697266, "logps/rejected": -194.34258161272322, "loss": 0.1169, "rewards/chosen": 1.385996699333191, "rewards/margins": 4.125762139047895, "rewards/rejected": -2.7397654397147044, "step": 18484 }, { "epoch": 0.9797789733124851, "grad_norm": 32.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9021693.333333334, "logits/rejected": -34538323.2, "logps/chosen": -222.9211629231771, "logps/rejected": -346.6834228515625, "loss": 0.2689, "rewards/chosen": 0.8540801207224528, "rewards/margins": 2.4089274565378824, "rewards/rejected": -1.5548473358154298, "step": 18485 }, { "epoch": 0.9798319773142873, "grad_norm": 43.5, "kl": 2.3053150177001953, "learning_rate": 5e-07, "logits/chosen": -2187394.6666666665, "logits/rejected": 87424345.6, "logps/chosen": -178.5489298502604, "logps/rejected": -545.35927734375, "loss": 0.2258, "rewards/chosen": 0.4873925844828288, "rewards/margins": 2.8066253344217933, "rewards/rejected": -2.3192327499389647, "step": 18486 }, { "epoch": 0.9798849813160894, "grad_norm": 43.0, "kl": 1.2982559204101562, "learning_rate": 5e-07, "logits/chosen": -42484117.333333336, "logits/rejected": -20227468.8, "logps/chosen": -355.5478515625, "logps/rejected": -330.260205078125, "loss": 0.1778, "rewards/chosen": 1.6486795743306477, "rewards/margins": 3.1598154385884603, "rewards/rejected": -1.5111358642578125, "step": 18487 }, { "epoch": 0.9799379853178914, "grad_norm": 53.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39372512.0, "logits/rejected": -51010521.6, "logps/chosen": -369.5424397786458, "logps/rejected": -400.723876953125, "loss": 0.2722, "rewards/chosen": -0.015289311607678732, "rewards/margins": 1.9522582958141963, "rewards/rejected": -1.967547607421875, "step": 18488 }, { "epoch": 0.9799909893196936, "grad_norm": 60.0, "kl": 0.6612319946289062, "learning_rate": 5e-07, "logits/chosen": -89937000.0, "logits/rejected": -19893968.0, "logps/chosen": -621.6371459960938, "logps/rejected": -232.36386108398438, "loss": 0.285, "rewards/chosen": 0.8305625915527344, "rewards/margins": 2.6138330698013306, "rewards/rejected": -1.7832704782485962, "step": 18489 }, { "epoch": 0.9800439933214957, "grad_norm": 51.25, "kl": 0.7500209808349609, "learning_rate": 5e-07, "logits/chosen": -9018386.0, "logits/rejected": -31889360.0, "logps/chosen": -145.92062377929688, "logps/rejected": -259.73028564453125, "loss": 0.3089, "rewards/chosen": -0.4013305604457855, "rewards/margins": 1.4958957731723785, "rewards/rejected": -1.897226333618164, "step": 18490 }, { "epoch": 0.9800969973232979, "grad_norm": 51.75, "kl": 3.393491744995117, "learning_rate": 5e-07, "logits/chosen": -20336966.4, "logits/rejected": -18568894.666666668, "logps/chosen": -351.844775390625, "logps/rejected": -246.93607584635416, "loss": 0.3222, "rewards/chosen": 0.38641908168792727, "rewards/margins": 3.093646788597107, "rewards/rejected": -2.7072277069091797, "step": 18491 }, { "epoch": 0.9801500013251, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -15869950.0, "logits/rejected": -30982955.42857143, "logps/chosen": -386.6904296875, "logps/rejected": -304.34423828125, "loss": 0.1097, "rewards/chosen": 1.3248413801193237, "rewards/margins": 4.151378376143319, "rewards/rejected": -2.8265369960239957, "step": 18492 }, { "epoch": 0.9802030053269022, "grad_norm": 39.5, "kl": 2.487942695617676, "learning_rate": 5e-07, "logits/chosen": -47132228.0, "logits/rejected": -19548754.0, "logps/chosen": -414.83782958984375, "logps/rejected": -464.2611083984375, "loss": 0.2326, "rewards/chosen": 0.767494261264801, "rewards/margins": 4.061331331729889, "rewards/rejected": -3.293837070465088, "step": 18493 }, { "epoch": 0.9802560093287043, "grad_norm": 55.0, "kl": 3.3057870864868164, "learning_rate": 5e-07, "logits/chosen": -21390916.0, "logits/rejected": 2013998.375, "logps/chosen": -122.94376373291016, "logps/rejected": -163.82150268554688, "loss": 0.3196, "rewards/chosen": 0.6129011511802673, "rewards/margins": 2.692471921443939, "rewards/rejected": -2.079570770263672, "step": 18494 }, { "epoch": 0.9803090133305065, "grad_norm": 30.0, "kl": 5.356540679931641, "learning_rate": 5e-07, "logits/chosen": -23813140.8, "logits/rejected": -29182586.666666668, "logps/chosen": -317.993359375, "logps/rejected": -298.2024739583333, "loss": 0.2973, "rewards/chosen": 1.0664539337158203, "rewards/margins": 3.4686609903971353, "rewards/rejected": -2.402207056681315, "step": 18495 }, { "epoch": 0.9803620173323085, "grad_norm": 40.0, "kl": 0.7073602676391602, "learning_rate": 5e-07, "logits/chosen": -10162078.0, "logits/rejected": -49583720.0, "logps/chosen": -384.7738444010417, "logps/rejected": -241.38803100585938, "loss": 0.2635, "rewards/chosen": 1.2934670448303223, "rewards/margins": 2.7382309436798096, "rewards/rejected": -1.4447638988494873, "step": 18496 }, { "epoch": 0.9804150213341107, "grad_norm": 38.25, "kl": 1.7116985321044922, "learning_rate": 5e-07, "logits/chosen": -684247.3333333334, "logits/rejected": -30816000.0, "logps/chosen": -194.49617513020834, "logps/rejected": -520.350927734375, "loss": 0.3153, "rewards/chosen": -0.36367011070251465, "rewards/margins": 2.866767740249634, "rewards/rejected": -3.2304378509521485, "step": 18497 }, { "epoch": 0.9804680253359128, "grad_norm": 43.75, "kl": 4.436824798583984, "learning_rate": 5e-07, "logits/chosen": -25910084.8, "logits/rejected": -10985512.0, "logps/chosen": -483.26806640625, "logps/rejected": -183.66691080729166, "loss": 0.1901, "rewards/chosen": 1.886101531982422, "rewards/margins": 5.403839238484701, "rewards/rejected": -3.517737706502279, "step": 18498 }, { "epoch": 0.980521029337715, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 19927949.333333332, "logits/rejected": -41089222.4, "logps/chosen": -139.56161499023438, "logps/rejected": -447.795068359375, "loss": 0.2901, "rewards/chosen": 0.21323484182357788, "rewards/margins": 2.2629285216331483, "rewards/rejected": -2.0496936798095704, "step": 18499 }, { "epoch": 0.9805740333395171, "grad_norm": 41.25, "kl": 4.228339195251465, "learning_rate": 5e-07, "logits/chosen": 4165747.0, "logits/rejected": -45568260.0, "logps/chosen": -145.81533813476562, "logps/rejected": -434.8013000488281, "loss": 0.2925, "rewards/chosen": 0.5993788838386536, "rewards/margins": 3.423249065876007, "rewards/rejected": -2.8238701820373535, "step": 18500 }, { "epoch": 0.9806270373413193, "grad_norm": 72.5, "kl": 4.00439453125, "learning_rate": 5e-07, "logits/chosen": -59345741.71428572, "logits/rejected": -20685296.0, "logps/chosen": -461.5068359375, "logps/rejected": -437.75823974609375, "loss": 0.3108, "rewards/chosen": 1.1559841973440987, "rewards/margins": 6.682360444750104, "rewards/rejected": -5.526376247406006, "step": 18501 }, { "epoch": 0.9806800413431214, "grad_norm": 60.75, "kl": 4.699169158935547, "learning_rate": 5e-07, "logits/chosen": -32123117.333333332, "logits/rejected": 15767313.0, "logps/chosen": -663.3952229817709, "logps/rejected": -464.754638671875, "loss": 0.3595, "rewards/chosen": 1.1071682771046956, "rewards/margins": 3.081332166989644, "rewards/rejected": -1.9741638898849487, "step": 18502 }, { "epoch": 0.9807330453449236, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1340328.0, "logits/rejected": -29582291.2, "logps/chosen": -263.48960367838544, "logps/rejected": -301.584033203125, "loss": 0.3142, "rewards/chosen": -0.5151316324869791, "rewards/margins": 1.7646455128987633, "rewards/rejected": -2.2797771453857423, "step": 18503 }, { "epoch": 0.9807860493467256, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -39284380.0, "logits/rejected": -18521472.0, "logps/chosen": -264.4866027832031, "logps/rejected": -292.03326416015625, "loss": 0.2639, "rewards/chosen": 0.26059266924858093, "rewards/margins": 3.852661818265915, "rewards/rejected": -3.592069149017334, "step": 18504 }, { "epoch": 0.9808390533485278, "grad_norm": 53.25, "kl": 0.4760456085205078, "learning_rate": 5e-07, "logits/chosen": 200479104.0, "logits/rejected": -3376326.0, "logps/chosen": -474.0091145833333, "logps/rejected": -205.409033203125, "loss": 0.19, "rewards/chosen": 0.7743534247080485, "rewards/margins": 3.8629615942637123, "rewards/rejected": -3.088608169555664, "step": 18505 }, { "epoch": 0.9808920573503299, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24949682.0, "logits/rejected": 11234523.42857143, "logps/chosen": -509.8680114746094, "logps/rejected": -442.35714285714283, "loss": 0.2278, "rewards/chosen": 0.7131378054618835, "rewards/margins": 2.488090148993901, "rewards/rejected": -1.7749523435320174, "step": 18506 }, { "epoch": 0.9809450613521321, "grad_norm": 44.75, "kl": 3.0723438262939453, "learning_rate": 5e-07, "logits/chosen": -10566808.0, "logits/rejected": 26264208.0, "logps/chosen": -302.4876302083333, "logps/rejected": -142.61257934570312, "loss": 0.4556, "rewards/chosen": 0.5883956750233968, "rewards/margins": 1.2726959188779197, "rewards/rejected": -0.6843002438545227, "step": 18507 }, { "epoch": 0.9809980653539342, "grad_norm": 46.0, "kl": 7.517498016357422, "learning_rate": 5e-07, "logits/chosen": -18967216.0, "logits/rejected": -26441906.666666668, "logps/chosen": -194.17379150390624, "logps/rejected": -676.5531819661459, "loss": 0.378, "rewards/chosen": 0.9622050285339355, "rewards/margins": 4.370654201507568, "rewards/rejected": -3.408449172973633, "step": 18508 }, { "epoch": 0.9810510693557364, "grad_norm": 39.0, "kl": 0.12939071655273438, "learning_rate": 5e-07, "logits/chosen": -5717216.5, "logits/rejected": -9025448.0, "logps/chosen": -80.53848266601562, "logps/rejected": -305.5973714192708, "loss": 0.2329, "rewards/chosen": 0.007681652903556824, "rewards/margins": 2.49786739051342, "rewards/rejected": -2.4901857376098633, "step": 18509 }, { "epoch": 0.9811040733575385, "grad_norm": 38.0, "kl": 0.4489898681640625, "learning_rate": 5e-07, "logits/chosen": -23729888.0, "logits/rejected": 123315381.33333333, "logps/chosen": -424.655810546875, "logps/rejected": -252.2363077799479, "loss": 0.246, "rewards/chosen": 0.9120019912719727, "rewards/margins": 3.2343782424926757, "rewards/rejected": -2.322376251220703, "step": 18510 }, { "epoch": 0.9811570773593407, "grad_norm": 36.75, "kl": 0.42180633544921875, "learning_rate": 5e-07, "logits/chosen": -27622718.0, "logits/rejected": -18547814.0, "logps/chosen": -471.817138671875, "logps/rejected": -224.16685485839844, "loss": 0.1352, "rewards/chosen": 1.8742412328720093, "rewards/margins": 5.544894099235535, "rewards/rejected": -3.6706528663635254, "step": 18511 }, { "epoch": 0.9812100813611427, "grad_norm": 40.75, "kl": 1.6504592895507812, "learning_rate": 5e-07, "logits/chosen": 219962.25, "logits/rejected": -30868736.0, "logps/chosen": -95.36287689208984, "logps/rejected": -298.0434163411458, "loss": 0.2829, "rewards/chosen": 1.1956559419631958, "rewards/margins": 2.4275734821955366, "rewards/rejected": -1.2319175402323406, "step": 18512 }, { "epoch": 0.9812630853629449, "grad_norm": 47.75, "kl": 4.713960647583008, "learning_rate": 5e-07, "logits/chosen": -1902472.0, "logits/rejected": -21867068.0, "logps/chosen": -106.88775634765625, "logps/rejected": -367.8465270996094, "loss": 0.4024, "rewards/chosen": 0.7227120399475098, "rewards/margins": 2.497843623161316, "rewards/rejected": -1.7751315832138062, "step": 18513 }, { "epoch": 0.981316089364747, "grad_norm": 59.0, "kl": 0.3954582214355469, "learning_rate": 5e-07, "logits/chosen": -77995765.33333333, "logits/rejected": 5318570.0, "logps/chosen": -493.1886393229167, "logps/rejected": -358.01552734375, "loss": 0.2834, "rewards/chosen": 0.24254353841145834, "rewards/margins": 1.94037659962972, "rewards/rejected": -1.6978330612182617, "step": 18514 }, { "epoch": 0.9813690933665492, "grad_norm": 39.0, "kl": 2.8137598037719727, "learning_rate": 5e-07, "logits/chosen": -8766926.0, "logits/rejected": -26755724.0, "logps/chosen": -140.77951049804688, "logps/rejected": -264.62091064453125, "loss": 0.2304, "rewards/chosen": 0.775260865688324, "rewards/margins": 4.049473702907562, "rewards/rejected": -3.2742128372192383, "step": 18515 }, { "epoch": 0.9814220973683513, "grad_norm": 29.125, "kl": 2.1683425903320312, "learning_rate": 5e-07, "logits/chosen": 5242608.0, "logits/rejected": -43912376.0, "logps/chosen": -17.329666137695312, "logps/rejected": -405.1862386067708, "loss": 0.1658, "rewards/chosen": 0.44093331694602966, "rewards/margins": 3.3300670882066092, "rewards/rejected": -2.8891337712605796, "step": 18516 }, { "epoch": 0.9814751013701535, "grad_norm": 63.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -73680072.0, "logits/rejected": -31070390.0, "logps/chosen": -210.71031188964844, "logps/rejected": -509.53887939453125, "loss": 0.2145, "rewards/chosen": 0.7957793474197388, "rewards/margins": 3.6451834440231323, "rewards/rejected": -2.8494040966033936, "step": 18517 }, { "epoch": 0.9815281053719556, "grad_norm": 42.75, "kl": 0.022660255432128906, "learning_rate": 5e-07, "logits/chosen": -17872372.0, "logits/rejected": -21381996.0, "logps/chosen": -158.40750122070312, "logps/rejected": -193.58726501464844, "loss": 0.3704, "rewards/chosen": -0.24188122153282166, "rewards/margins": 1.6568421423435211, "rewards/rejected": -1.8987233638763428, "step": 18518 }, { "epoch": 0.9815811093737578, "grad_norm": 47.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2650176.75, "logits/rejected": -17886578.0, "logps/chosen": -127.04475402832031, "logps/rejected": -285.19891357421875, "loss": 0.3678, "rewards/chosen": -0.22903335094451904, "rewards/margins": 1.4617934226989746, "rewards/rejected": -1.6908267736434937, "step": 18519 }, { "epoch": 0.9816341133755598, "grad_norm": 67.5, "kl": 1.0066242218017578, "learning_rate": 5e-07, "logits/chosen": -75008072.0, "logits/rejected": -24250728.0, "logps/chosen": -555.8045043945312, "logps/rejected": -178.56698608398438, "loss": 0.3224, "rewards/chosen": 0.6375350952148438, "rewards/margins": 1.722406029701233, "rewards/rejected": -1.0848709344863892, "step": 18520 }, { "epoch": 0.981687117377362, "grad_norm": 27.625, "kl": 1.6171340942382812, "learning_rate": 5e-07, "logits/chosen": 2941367.0, "logits/rejected": -29439906.666666668, "logps/chosen": -42.568267822265625, "logps/rejected": -291.9716796875, "loss": 0.1543, "rewards/chosen": 0.9456191062927246, "rewards/margins": 3.524703025817871, "rewards/rejected": -2.5790839195251465, "step": 18521 }, { "epoch": 0.9817401213791641, "grad_norm": 29.375, "kl": 3.6002674102783203, "learning_rate": 5e-07, "logits/chosen": -15141496.0, "logits/rejected": -17211508.0, "logps/chosen": -178.30645751953125, "logps/rejected": -222.3558349609375, "loss": 0.2486, "rewards/chosen": 1.0447676181793213, "rewards/margins": 3.9448277950286865, "rewards/rejected": -2.9000601768493652, "step": 18522 }, { "epoch": 0.9817931253809663, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40881717.333333336, "logits/rejected": -966294.3, "logps/chosen": -241.5684814453125, "logps/rejected": -471.96845703125, "loss": 0.2338, "rewards/chosen": -0.06468504667282104, "rewards/margins": 3.137449657917023, "rewards/rejected": -3.202134704589844, "step": 18523 }, { "epoch": 0.9818461293827684, "grad_norm": 44.5, "kl": 3.1356124877929688, "learning_rate": 5e-07, "logits/chosen": -13356128.0, "logits/rejected": -28552290.0, "logps/chosen": -654.615966796875, "logps/rejected": -269.0035400390625, "loss": 0.2669, "rewards/chosen": 1.6256588697433472, "rewards/margins": 3.3765958547592163, "rewards/rejected": -1.7509369850158691, "step": 18524 }, { "epoch": 0.9818991333845706, "grad_norm": 39.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 21828384.0, "logits/rejected": -42876649.6, "logps/chosen": -256.15675862630206, "logps/rejected": -429.268505859375, "loss": 0.2475, "rewards/chosen": -0.06444078187147777, "rewards/margins": 3.367224638660749, "rewards/rejected": -3.4316654205322266, "step": 18525 }, { "epoch": 0.9819521373863727, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52988144.0, "logits/rejected": 644608.6666666666, "logps/chosen": -448.8916931152344, "logps/rejected": -185.3623046875, "loss": 0.2262, "rewards/chosen": 0.19760818779468536, "rewards/margins": 2.186285848418872, "rewards/rejected": -1.9886776606241863, "step": 18526 }, { "epoch": 0.9820051413881749, "grad_norm": 30.375, "kl": 1.0930299758911133, "learning_rate": 5e-07, "logits/chosen": 1590598.0, "logits/rejected": -2527846.25, "logps/chosen": -149.27345275878906, "logps/rejected": -207.01016235351562, "loss": 0.2686, "rewards/chosen": 0.13803155720233917, "rewards/margins": 3.8124757558107376, "rewards/rejected": -3.6744441986083984, "step": 18527 }, { "epoch": 0.9820581453899769, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27533021.333333332, "logits/rejected": -17482142.4, "logps/chosen": -550.483154296875, "logps/rejected": -192.6947021484375, "loss": 0.2532, "rewards/chosen": 0.9698913892110189, "rewards/margins": 2.6650659879048666, "rewards/rejected": -1.6951745986938476, "step": 18528 }, { "epoch": 0.9821111493917791, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1345907.8333333333, "logits/rejected": -2536211.6, "logps/chosen": -178.68477376302084, "logps/rejected": -179.625048828125, "loss": 0.1688, "rewards/chosen": 0.6975049177805582, "rewards/margins": 4.350490681330363, "rewards/rejected": -3.652985763549805, "step": 18529 }, { "epoch": 0.9821641533935812, "grad_norm": 54.75, "kl": 0.4850921630859375, "learning_rate": 5e-07, "logits/chosen": -9198582.0, "logits/rejected": -22547294.0, "logps/chosen": -272.8006998697917, "logps/rejected": -297.26849365234375, "loss": 0.332, "rewards/chosen": 0.5644537607828776, "rewards/margins": 2.2233962217966714, "rewards/rejected": -1.658942461013794, "step": 18530 }, { "epoch": 0.9822171573953834, "grad_norm": 56.75, "kl": 1.0120391845703125, "learning_rate": 5e-07, "logits/chosen": 16920698.666666668, "logits/rejected": -51687612.0, "logps/chosen": -335.723388671875, "logps/rejected": -544.5901489257812, "loss": 0.2923, "rewards/chosen": 0.6303595701853434, "rewards/margins": 4.507555405298869, "rewards/rejected": -3.8771958351135254, "step": 18531 }, { "epoch": 0.9822701613971855, "grad_norm": 22.375, "kl": 5.829750061035156, "learning_rate": 5e-07, "logits/chosen": -10991813.0, "logits/rejected": -44198284.0, "logps/chosen": -286.55316162109375, "logps/rejected": -346.7392578125, "loss": 0.2664, "rewards/chosen": 1.7028546333312988, "rewards/margins": 5.117761135101318, "rewards/rejected": -3.4149065017700195, "step": 18532 }, { "epoch": 0.9823231653989877, "grad_norm": 53.75, "kl": 0.7150712013244629, "learning_rate": 5e-07, "logits/chosen": -20009144.0, "logits/rejected": -34951720.0, "logps/chosen": -281.20001220703125, "logps/rejected": -161.67538452148438, "loss": 0.3437, "rewards/chosen": 0.17904606461524963, "rewards/margins": 1.571921020746231, "rewards/rejected": -1.3928749561309814, "step": 18533 }, { "epoch": 0.9823761694007898, "grad_norm": 39.25, "kl": 3.9302902221679688, "learning_rate": 5e-07, "logits/chosen": -45912042.666666664, "logits/rejected": -56396940.0, "logps/chosen": -301.73695882161456, "logps/rejected": -560.3682861328125, "loss": 0.3123, "rewards/chosen": 0.9371429284413656, "rewards/margins": 4.290077288945516, "rewards/rejected": -3.3529343605041504, "step": 18534 }, { "epoch": 0.982429173402592, "grad_norm": 48.5, "kl": 1.118124008178711, "learning_rate": 5e-07, "logits/chosen": -8451413.333333334, "logits/rejected": -31812448.0, "logps/chosen": -114.9793192545573, "logps/rejected": -270.086181640625, "loss": 0.2371, "rewards/chosen": 0.26675885915756226, "rewards/margins": 3.283982980251312, "rewards/rejected": -3.01722412109375, "step": 18535 }, { "epoch": 0.982482177404394, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9438177.0, "logits/rejected": -26878814.0, "logps/chosen": -397.4818115234375, "logps/rejected": -396.8358459472656, "loss": 0.1998, "rewards/chosen": 0.9584160447120667, "rewards/margins": 4.540332615375519, "rewards/rejected": -3.581916570663452, "step": 18536 }, { "epoch": 0.9825351814061962, "grad_norm": 35.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43440196.0, "logits/rejected": -58720426.666666664, "logps/chosen": -307.9442138671875, "logps/rejected": -257.1663004557292, "loss": 0.1771, "rewards/chosen": 0.9251381158828735, "rewards/margins": 3.1564988692601523, "rewards/rejected": -2.231360753377279, "step": 18537 }, { "epoch": 0.9825881854079983, "grad_norm": 63.5, "kl": 4.610897064208984, "learning_rate": 5e-07, "logits/chosen": -24273050.666666668, "logits/rejected": -30627426.0, "logps/chosen": -274.6097005208333, "logps/rejected": -157.82664489746094, "loss": 0.4594, "rewards/chosen": 0.42293794949849445, "rewards/margins": 1.6969003280003865, "rewards/rejected": -1.273962378501892, "step": 18538 }, { "epoch": 0.9826411894098004, "grad_norm": 60.0, "kl": 0.4724884033203125, "learning_rate": 5e-07, "logits/chosen": -31952920.0, "logits/rejected": -16776906.0, "logps/chosen": -560.9547119140625, "logps/rejected": -132.00242614746094, "loss": 0.2399, "rewards/chosen": 0.7311363220214844, "rewards/margins": 3.1674082279205322, "rewards/rejected": -2.436271905899048, "step": 18539 }, { "epoch": 0.9826941934116026, "grad_norm": 42.0, "kl": 1.9568920135498047, "learning_rate": 5e-07, "logits/chosen": -16301724.8, "logits/rejected": -50864074.666666664, "logps/chosen": -182.038232421875, "logps/rejected": -342.7289225260417, "loss": 0.3185, "rewards/chosen": 0.7640045642852783, "rewards/margins": 2.4238534132639566, "rewards/rejected": -1.6598488489786785, "step": 18540 }, { "epoch": 0.9827471974134047, "grad_norm": 57.25, "kl": 1.6926231384277344, "learning_rate": 5e-07, "logits/chosen": -32964754.666666668, "logits/rejected": -40913692.0, "logps/chosen": -615.6680501302084, "logps/rejected": -414.43853759765625, "loss": 0.2719, "rewards/chosen": 0.8611470858256022, "rewards/margins": 4.254482905069987, "rewards/rejected": -3.3933358192443848, "step": 18541 }, { "epoch": 0.9828002014152069, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 50689080.0, "logits/rejected": -43598372.0, "logps/chosen": -305.30169677734375, "logps/rejected": -503.6527099609375, "loss": 0.2465, "rewards/chosen": 0.4373626708984375, "rewards/margins": 3.8269402980804443, "rewards/rejected": -3.389577627182007, "step": 18542 }, { "epoch": 0.9828532054170089, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45376896.0, "logits/rejected": -51733968.0, "logps/chosen": -471.13372802734375, "logps/rejected": -598.8810424804688, "loss": 0.1749, "rewards/chosen": 0.8800935745239258, "rewards/margins": 4.403368711471558, "rewards/rejected": -3.523275136947632, "step": 18543 }, { "epoch": 0.9829062094188111, "grad_norm": 28.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -26844844.0, "logits/rejected": -10650811.0, "logps/chosen": -309.71453857421875, "logps/rejected": -344.7900085449219, "loss": 0.168, "rewards/chosen": 1.7187600135803223, "rewards/margins": 4.875794887542725, "rewards/rejected": -3.1570348739624023, "step": 18544 }, { "epoch": 0.9829592134206132, "grad_norm": 46.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -48047913.6, "logits/rejected": -90694186.66666667, "logps/chosen": -259.6163330078125, "logps/rejected": -651.2036539713541, "loss": 0.3258, "rewards/chosen": -0.0006871700286865234, "rewards/margins": 3.5607793649037682, "rewards/rejected": -3.5614665349324546, "step": 18545 }, { "epoch": 0.9830122174224154, "grad_norm": 49.0, "kl": 1.4908332824707031, "learning_rate": 5e-07, "logits/chosen": -9618418.666666666, "logits/rejected": -15693193.6, "logps/chosen": -201.43526204427084, "logps/rejected": -222.574072265625, "loss": 0.3364, "rewards/chosen": 0.121942271788915, "rewards/margins": 2.0814294238885243, "rewards/rejected": -1.9594871520996093, "step": 18546 }, { "epoch": 0.9830652214242175, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44701148.8, "logits/rejected": -20086716.0, "logps/chosen": -310.9978515625, "logps/rejected": -564.47802734375, "loss": 0.2962, "rewards/chosen": 0.3774728298187256, "rewards/margins": 3.9109242280324303, "rewards/rejected": -3.5334513982137046, "step": 18547 }, { "epoch": 0.9831182254260197, "grad_norm": 56.5, "kl": 1.4055633544921875, "learning_rate": 5e-07, "logits/chosen": -9068180.0, "logits/rejected": -19477450.666666668, "logps/chosen": -148.8242431640625, "logps/rejected": -328.927490234375, "loss": 0.3927, "rewards/chosen": 0.01750510334968567, "rewards/margins": 3.262944314877192, "rewards/rejected": -3.2454392115275064, "step": 18548 }, { "epoch": 0.9831712294278218, "grad_norm": 84.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -13417952.0, "logits/rejected": -741491.25, "logps/chosen": -139.71957397460938, "logps/rejected": -158.0958251953125, "loss": 0.4784, "rewards/chosen": -0.5279636383056641, "rewards/margins": 0.22460949420928955, "rewards/rejected": -0.7525731325149536, "step": 18549 }, { "epoch": 0.983224233429624, "grad_norm": 36.75, "kl": 3.446915626525879, "learning_rate": 5e-07, "logits/chosen": -3944642.6666666665, "logits/rejected": -9024368.0, "logps/chosen": -336.981689453125, "logps/rejected": -198.37188720703125, "loss": 0.2152, "rewards/chosen": 0.5224417050679525, "rewards/margins": 3.296841843922933, "rewards/rejected": -2.7744001388549804, "step": 18550 }, { "epoch": 0.983277237431426, "grad_norm": 34.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45867264.0, "logits/rejected": -10302876.8, "logps/chosen": -250.98697916666666, "logps/rejected": -375.3355224609375, "loss": 0.1843, "rewards/chosen": 1.1521581013997395, "rewards/margins": 4.040526707967122, "rewards/rejected": -2.888368606567383, "step": 18551 }, { "epoch": 0.9833302414332282, "grad_norm": 65.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 156485.0, "logits/rejected": -28960354.666666668, "logps/chosen": -551.9024658203125, "logps/rejected": -316.4658203125, "loss": 0.2413, "rewards/chosen": 0.7484191656112671, "rewards/margins": 2.7624834775924683, "rewards/rejected": -2.014064311981201, "step": 18552 }, { "epoch": 0.9833832454350303, "grad_norm": 53.75, "kl": 4.536214828491211, "learning_rate": 5e-07, "logits/chosen": -18086641.333333332, "logits/rejected": -7026847.0, "logps/chosen": -198.4454345703125, "logps/rejected": -48.385292053222656, "loss": 0.4504, "rewards/chosen": 0.31356088320414227, "rewards/margins": 2.0067543188730874, "rewards/rejected": -1.6931934356689453, "step": 18553 }, { "epoch": 0.9834362494368325, "grad_norm": 40.75, "kl": 2.6137208938598633, "learning_rate": 5e-07, "logits/chosen": -11955532.8, "logits/rejected": -1695349.3333333333, "logps/chosen": -242.3864501953125, "logps/rejected": -191.68241373697916, "loss": 0.2988, "rewards/chosen": 0.8970730781555176, "rewards/margins": 3.5856373786926268, "rewards/rejected": -2.6885643005371094, "step": 18554 }, { "epoch": 0.9834892534386346, "grad_norm": 53.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -52768824.0, "logits/rejected": 865271.8125, "logps/chosen": -863.7180786132812, "logps/rejected": -146.54226684570312, "loss": 0.2313, "rewards/chosen": 0.9261535406112671, "rewards/margins": 3.8000954389572144, "rewards/rejected": -2.8739418983459473, "step": 18555 }, { "epoch": 0.9835422574404368, "grad_norm": 59.0, "kl": 2.3267669677734375, "learning_rate": 5e-07, "logits/chosen": -19717344.0, "logits/rejected": -19908197.333333332, "logps/chosen": -239.756396484375, "logps/rejected": -139.8138224283854, "loss": 0.3511, "rewards/chosen": 0.46742262840271, "rewards/margins": 2.691320244471232, "rewards/rejected": -2.223897616068522, "step": 18556 }, { "epoch": 0.9835952614422389, "grad_norm": 43.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -69156912.0, "logits/rejected": 36229597.333333336, "logps/chosen": -203.92076110839844, "logps/rejected": -418.115966796875, "loss": 0.2307, "rewards/chosen": -0.3834232687950134, "rewards/margins": 2.2940364480018616, "rewards/rejected": -2.677459716796875, "step": 18557 }, { "epoch": 0.983648265444041, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27037373.333333332, "logits/rejected": -25414754.0, "logps/chosen": -333.2816975911458, "logps/rejected": -332.87908935546875, "loss": 0.2269, "rewards/chosen": 1.0213130315144856, "rewards/margins": 4.999581654866536, "rewards/rejected": -3.978268623352051, "step": 18558 }, { "epoch": 0.9837012694458431, "grad_norm": 31.625, "kl": 2.8996753692626953, "learning_rate": 5e-07, "logits/chosen": 11108306.666666666, "logits/rejected": -27751446.4, "logps/chosen": -37.372965494791664, "logps/rejected": -396.266015625, "loss": 0.2395, "rewards/chosen": 0.6653329531351725, "rewards/margins": 3.3698516527811684, "rewards/rejected": -2.704518699645996, "step": 18559 }, { "epoch": 0.9837542734476453, "grad_norm": 45.5, "kl": 3.168844223022461, "learning_rate": 5e-07, "logits/chosen": -23472049.6, "logits/rejected": -24813682.666666668, "logps/chosen": -824.778515625, "logps/rejected": -282.6481526692708, "loss": 0.2523, "rewards/chosen": 1.2942235946655274, "rewards/margins": 3.842901039123535, "rewards/rejected": -2.548677444458008, "step": 18560 }, { "epoch": 0.9838072774494474, "grad_norm": 43.75, "kl": 2.8967056274414062, "learning_rate": 5e-07, "logits/chosen": -62551136.0, "logits/rejected": -27895136.0, "logps/chosen": -295.32879638671875, "logps/rejected": -371.5854797363281, "loss": 0.2732, "rewards/chosen": 0.9406558275222778, "rewards/margins": 4.074496865272522, "rewards/rejected": -3.133841037750244, "step": 18561 }, { "epoch": 0.9838602814512496, "grad_norm": 41.5, "kl": 1.9308786392211914, "learning_rate": 5e-07, "logits/chosen": 2189750.3333333335, "logits/rejected": -11613936.0, "logps/chosen": -97.8746337890625, "logps/rejected": -170.36795654296876, "loss": 0.3519, "rewards/chosen": -0.2520776192347209, "rewards/margins": 1.2511254866917927, "rewards/rejected": -1.5032031059265136, "step": 18562 }, { "epoch": 0.9839132854530517, "grad_norm": 39.75, "kl": 0.6335763931274414, "learning_rate": 5e-07, "logits/chosen": -26850856.0, "logits/rejected": -25172476.8, "logps/chosen": -200.2484130859375, "logps/rejected": -156.35330810546876, "loss": 0.2647, "rewards/chosen": 0.2749338348706563, "rewards/margins": 2.5463482101758323, "rewards/rejected": -2.271414375305176, "step": 18563 }, { "epoch": 0.9839662894548539, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 1825461.3333333333, "logits/rejected": -48818608.0, "logps/chosen": -318.8773600260417, "logps/rejected": -216.4275146484375, "loss": 0.2647, "rewards/chosen": 0.17078202962875366, "rewards/margins": 2.3987983107566833, "rewards/rejected": -2.2280162811279296, "step": 18564 }, { "epoch": 0.984019293456656, "grad_norm": 62.5, "kl": 0.6547813415527344, "learning_rate": 5e-07, "logits/chosen": -90905484.8, "logits/rejected": -20967309.333333332, "logps/chosen": -444.12724609375, "logps/rejected": -290.3655598958333, "loss": 0.3225, "rewards/chosen": 0.519537353515625, "rewards/margins": 2.1013163566589355, "rewards/rejected": -1.5817790031433105, "step": 18565 }, { "epoch": 0.9840722974584581, "grad_norm": 100.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -66550586.666666664, "logits/rejected": 219821.2, "logps/chosen": -360.1531575520833, "logps/rejected": -183.58909912109374, "loss": 0.2647, "rewards/chosen": 0.3669576247533162, "rewards/margins": 2.3441232283910116, "rewards/rejected": -1.9771656036376952, "step": 18566 }, { "epoch": 0.9841253014602602, "grad_norm": 41.0, "kl": 0.6377944946289062, "learning_rate": 5e-07, "logits/chosen": -36627076.0, "logits/rejected": -34425764.0, "logps/chosen": -277.79461669921875, "logps/rejected": -341.17913818359375, "loss": 0.3408, "rewards/chosen": 0.14862704277038574, "rewards/margins": 1.8131375312805176, "rewards/rejected": -1.6645104885101318, "step": 18567 }, { "epoch": 0.9841783054620624, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -62061008.0, "logits/rejected": -20828780.0, "logps/chosen": -386.70892333984375, "logps/rejected": -369.55963134765625, "loss": 0.1584, "rewards/chosen": 1.447209358215332, "rewards/margins": 5.445485591888428, "rewards/rejected": -3.9982762336730957, "step": 18568 }, { "epoch": 0.9842313094638645, "grad_norm": 60.75, "kl": 3.9727935791015625, "learning_rate": 5e-07, "logits/chosen": -25881932.8, "logits/rejected": -29038402.666666668, "logps/chosen": -288.666552734375, "logps/rejected": -304.8468017578125, "loss": 0.3293, "rewards/chosen": 0.6276380538940429, "rewards/margins": 3.8503833134969074, "rewards/rejected": -3.2227452596028647, "step": 18569 }, { "epoch": 0.9842843134656667, "grad_norm": 54.5, "kl": 1.0639839172363281, "learning_rate": 5e-07, "logits/chosen": -62018521.6, "logits/rejected": -5876294.666666667, "logps/chosen": -385.1983154296875, "logps/rejected": -568.9863688151041, "loss": 0.2472, "rewards/chosen": 1.096246337890625, "rewards/margins": 3.599095598856608, "rewards/rejected": -2.502849260965983, "step": 18570 }, { "epoch": 0.9843373174674688, "grad_norm": 44.25, "kl": 2.4502334594726562, "learning_rate": 5e-07, "logits/chosen": -60135430.4, "logits/rejected": -39665152.0, "logps/chosen": -649.878662109375, "logps/rejected": -343.3899739583333, "loss": 0.2244, "rewards/chosen": 1.575070285797119, "rewards/margins": 4.035563691457113, "rewards/rejected": -2.4604934056599936, "step": 18571 }, { "epoch": 0.984390321469271, "grad_norm": 45.0, "kl": 3.7229080200195312, "learning_rate": 5e-07, "logits/chosen": -43163060.0, "logits/rejected": -14688509.0, "logps/chosen": -415.06982421875, "logps/rejected": -387.80072021484375, "loss": 0.2123, "rewards/chosen": 0.8905324935913086, "rewards/margins": 4.452286720275879, "rewards/rejected": -3.5617542266845703, "step": 18572 }, { "epoch": 0.984443325471073, "grad_norm": 42.75, "kl": 2.0363101959228516, "learning_rate": 5e-07, "logits/chosen": -725139.5, "logits/rejected": -18290888.0, "logps/chosen": -222.32255859375, "logps/rejected": -378.7506917317708, "loss": 0.2553, "rewards/chosen": 0.7058629989624023, "rewards/margins": 3.833674748738607, "rewards/rejected": -3.1278117497762046, "step": 18573 }, { "epoch": 0.9844963294728752, "grad_norm": 37.0, "kl": 2.7052669525146484, "learning_rate": 5e-07, "logits/chosen": -9791000.666666666, "logits/rejected": -15760336.0, "logps/chosen": -219.9169921875, "logps/rejected": -207.1361572265625, "loss": 0.1827, "rewards/chosen": 1.4019163449605305, "rewards/margins": 3.2237634976704914, "rewards/rejected": -1.821847152709961, "step": 18574 }, { "epoch": 0.9845493334746773, "grad_norm": 48.5, "kl": 2.2126541137695312, "learning_rate": 5e-07, "logits/chosen": -10549100.0, "logits/rejected": 21094316.8, "logps/chosen": -322.8577880859375, "logps/rejected": -556.8486328125, "loss": 0.1786, "rewards/chosen": 1.3705531756083171, "rewards/margins": 4.199584166208903, "rewards/rejected": -2.829030990600586, "step": 18575 }, { "epoch": 0.9846023374764795, "grad_norm": 37.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -8483057.0, "logits/rejected": -6105186.0, "logps/chosen": -341.87152099609375, "logps/rejected": -151.18558756510416, "loss": 0.2129, "rewards/chosen": 2.021104335784912, "rewards/margins": 3.811708927154541, "rewards/rejected": -1.790604591369629, "step": 18576 }, { "epoch": 0.9846553414782816, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27940586.666666668, "logits/rejected": -21136228.8, "logps/chosen": -176.3390909830729, "logps/rejected": -338.91962890625, "loss": 0.2056, "rewards/chosen": 0.31393688917160034, "rewards/margins": 3.4011528849601746, "rewards/rejected": -3.0872159957885743, "step": 18577 }, { "epoch": 0.9847083454800838, "grad_norm": 65.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20891211.2, "logits/rejected": -7332441.333333333, "logps/chosen": -404.7975341796875, "logps/rejected": -222.28263346354166, "loss": 0.2614, "rewards/chosen": 0.7039407730102539, "rewards/margins": 2.8109792709350585, "rewards/rejected": -2.1070384979248047, "step": 18578 }, { "epoch": 0.9847613494818859, "grad_norm": 51.0, "kl": 1.0486669540405273, "learning_rate": 5e-07, "logits/chosen": -39156544.0, "logits/rejected": -35369002.666666664, "logps/chosen": -247.1841552734375, "logps/rejected": -283.8789876302083, "loss": 0.4169, "rewards/chosen": -0.2423025608062744, "rewards/margins": 1.7221665541330975, "rewards/rejected": -1.9644691149393718, "step": 18579 }, { "epoch": 0.9848143534836881, "grad_norm": 43.5, "kl": 2.4487152099609375, "learning_rate": 5e-07, "logits/chosen": -15721539.2, "logits/rejected": -29463005.333333332, "logps/chosen": -213.541943359375, "logps/rejected": -245.55192057291666, "loss": 0.3186, "rewards/chosen": 0.33170504570007325, "rewards/margins": 3.760709079106649, "rewards/rejected": -3.4290040334065757, "step": 18580 }, { "epoch": 0.9848673574854901, "grad_norm": 43.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18461194.0, "logits/rejected": -8296006.666666667, "logps/chosen": -361.7055358886719, "logps/rejected": -211.6021728515625, "loss": 0.2211, "rewards/chosen": 0.7633354067802429, "rewards/margins": 3.0106908679008484, "rewards/rejected": -2.2473554611206055, "step": 18581 }, { "epoch": 0.9849203614872923, "grad_norm": 54.5, "kl": 3.1414146423339844, "learning_rate": 5e-07, "logits/chosen": -10492776.0, "logits/rejected": -20704884.0, "logps/chosen": -456.0472412109375, "logps/rejected": -215.04470825195312, "loss": 0.2798, "rewards/chosen": 0.7340036630630493, "rewards/margins": 2.6433075666427612, "rewards/rejected": -1.909303903579712, "step": 18582 }, { "epoch": 0.9849733654890944, "grad_norm": 61.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -2822484.6666666665, "logits/rejected": -381491.35, "logps/chosen": -299.89731852213544, "logps/rejected": -135.0712890625, "loss": 0.2535, "rewards/chosen": 0.9138970375061035, "rewards/margins": 3.498955249786377, "rewards/rejected": -2.5850582122802734, "step": 18583 }, { "epoch": 0.9850263694908966, "grad_norm": 46.5, "kl": 2.29632568359375, "learning_rate": 5e-07, "logits/chosen": -54817077.333333336, "logits/rejected": -11569663.0, "logps/chosen": -370.2918294270833, "logps/rejected": -445.8514709472656, "loss": 0.4124, "rewards/chosen": 0.17332867781321207, "rewards/margins": 2.684279958407084, "rewards/rejected": -2.510951280593872, "step": 18584 }, { "epoch": 0.9850793734926987, "grad_norm": 54.5, "kl": 1.46435546875, "learning_rate": 5e-07, "logits/chosen": 7290464.0, "logits/rejected": -9503197.333333334, "logps/chosen": -551.843115234375, "logps/rejected": -127.09780883789062, "loss": 0.2279, "rewards/chosen": 1.2759475708007812, "rewards/margins": 3.8197043736775718, "rewards/rejected": -2.5437568028767905, "step": 18585 }, { "epoch": 0.9851323774945009, "grad_norm": 48.5, "kl": 4.252384185791016, "learning_rate": 5e-07, "logits/chosen": 14708240.0, "logits/rejected": -1499226.0, "logps/chosen": -255.77365112304688, "logps/rejected": -600.3885498046875, "loss": 0.201, "rewards/chosen": 1.3127565383911133, "rewards/margins": 4.882916450500488, "rewards/rejected": -3.570159912109375, "step": 18586 }, { "epoch": 0.985185381496303, "grad_norm": 37.25, "kl": 1.713348388671875, "learning_rate": 5e-07, "logits/chosen": -28469444.0, "logits/rejected": -16230167.0, "logps/chosen": -194.9872589111328, "logps/rejected": -247.6853485107422, "loss": 0.3038, "rewards/chosen": -0.02412986382842064, "rewards/margins": 3.4924418963491917, "rewards/rejected": -3.5165717601776123, "step": 18587 }, { "epoch": 0.985238385498105, "grad_norm": 47.75, "kl": 4.214221954345703, "learning_rate": 5e-07, "logits/chosen": -24440292.0, "logits/rejected": -12679839.0, "logps/chosen": -291.5823059082031, "logps/rejected": -334.73974609375, "loss": 0.2248, "rewards/chosen": 1.5493439435958862, "rewards/margins": 4.258952021598816, "rewards/rejected": -2.7096080780029297, "step": 18588 }, { "epoch": 0.9852913894999072, "grad_norm": 48.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -31008229.333333332, "logits/rejected": -31626403.2, "logps/chosen": -150.23583984375, "logps/rejected": -329.1170654296875, "loss": 0.3452, "rewards/chosen": 0.05201448996861776, "rewards/margins": 1.2400367220242818, "rewards/rejected": -1.1880222320556642, "step": 18589 }, { "epoch": 0.9853443935017093, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -27269184.0, "logits/rejected": -40592601.6, "logps/chosen": -137.32514444986978, "logps/rejected": -421.660595703125, "loss": 0.2217, "rewards/chosen": 0.3062123457590739, "rewards/margins": 2.8184022108713784, "rewards/rejected": -2.5121898651123047, "step": 18590 }, { "epoch": 0.9853973975035115, "grad_norm": 54.75, "kl": 1.0093803405761719, "learning_rate": 5e-07, "logits/chosen": -60751756.8, "logits/rejected": -49881322.666666664, "logps/chosen": -478.95126953125, "logps/rejected": -424.9808349609375, "loss": 0.3646, "rewards/chosen": -0.013991081714630127, "rewards/margins": 2.5497261246045433, "rewards/rejected": -2.5637172063191733, "step": 18591 }, { "epoch": 0.9854504015053136, "grad_norm": 45.25, "kl": 0.94586181640625, "learning_rate": 5e-07, "logits/chosen": -33286917.333333332, "logits/rejected": -26521894.4, "logps/chosen": -556.2169596354166, "logps/rejected": -186.936181640625, "loss": 0.1881, "rewards/chosen": 1.0646107991536458, "rewards/margins": 4.5735532124837235, "rewards/rejected": -3.508942413330078, "step": 18592 }, { "epoch": 0.9855034055071158, "grad_norm": 29.0, "kl": 1.9659337997436523, "learning_rate": 5e-07, "logits/chosen": -10689370.0, "logits/rejected": -19526370.0, "logps/chosen": -303.0225830078125, "logps/rejected": -377.3546447753906, "loss": 0.1155, "rewards/chosen": 1.8454526662826538, "rewards/margins": 7.251604914665222, "rewards/rejected": -5.406152248382568, "step": 18593 }, { "epoch": 0.9855564095089179, "grad_norm": 79.5, "kl": 1.1712417602539062, "learning_rate": 5e-07, "logits/chosen": -17933884.0, "logits/rejected": -34374716.0, "logps/chosen": -501.7072448730469, "logps/rejected": -395.3896484375, "loss": 0.1971, "rewards/chosen": 0.9480020999908447, "rewards/margins": 3.9334051609039307, "rewards/rejected": -2.985403060913086, "step": 18594 }, { "epoch": 0.9856094135107201, "grad_norm": 64.5, "kl": 0.9366531372070312, "learning_rate": 5e-07, "logits/chosen": -17898676.0, "logits/rejected": -81750128.0, "logps/chosen": -390.4689636230469, "logps/rejected": -247.02670288085938, "loss": 0.3236, "rewards/chosen": 0.5839259028434753, "rewards/margins": 1.6625933051109314, "rewards/rejected": -1.078667402267456, "step": 18595 }, { "epoch": 0.9856624175125221, "grad_norm": 59.5, "kl": 2.662734031677246, "learning_rate": 5e-07, "logits/chosen": -30533478.0, "logits/rejected": -334470.0, "logps/chosen": -342.773681640625, "logps/rejected": -253.9876251220703, "loss": 0.3292, "rewards/chosen": 0.695827305316925, "rewards/margins": 2.1188283562660217, "rewards/rejected": -1.4230010509490967, "step": 18596 }, { "epoch": 0.9857154215143243, "grad_norm": 45.25, "kl": 1.0374116897583008, "learning_rate": 5e-07, "logits/chosen": -15293232.0, "logits/rejected": 82065.4, "logps/chosen": -327.5181070963542, "logps/rejected": -486.017041015625, "loss": 0.2022, "rewards/chosen": 0.5067559083302816, "rewards/margins": 3.3790703614552817, "rewards/rejected": -2.872314453125, "step": 18597 }, { "epoch": 0.9857684255161264, "grad_norm": 46.0, "kl": 0.943730354309082, "learning_rate": 5e-07, "logits/chosen": 5199106.4, "logits/rejected": -57255146.666666664, "logps/chosen": -44.43968200683594, "logps/rejected": -508.3646647135417, "loss": 0.2938, "rewards/chosen": 0.3039604663848877, "rewards/margins": 3.4918155511220297, "rewards/rejected": -3.187855084737142, "step": 18598 }, { "epoch": 0.9858214295179286, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40975096.0, "logits/rejected": -2432839.6, "logps/chosen": -156.455810546875, "logps/rejected": -215.255810546875, "loss": 0.2833, "rewards/chosen": 0.23023784160614014, "rewards/margins": 2.0819477796554566, "rewards/rejected": -1.8517099380493165, "step": 18599 }, { "epoch": 0.9858744335197307, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -846064.5, "logits/rejected": -70679545.6, "logps/chosen": -139.29486083984375, "logps/rejected": -351.1083251953125, "loss": 0.2398, "rewards/chosen": 0.231022318204244, "rewards/margins": 3.0360592444737753, "rewards/rejected": -2.805036926269531, "step": 18600 }, { "epoch": 0.9859274375215329, "grad_norm": 44.75, "kl": 2.8435192108154297, "learning_rate": 5e-07, "logits/chosen": -46385152.0, "logits/rejected": -60591856.0, "logps/chosen": -189.2583251953125, "logps/rejected": -279.8209228515625, "loss": 0.3034, "rewards/chosen": 0.4606298923492432, "rewards/margins": 3.2161033471425378, "rewards/rejected": -2.7554734547932944, "step": 18601 }, { "epoch": 0.985980441523335, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32030976.0, "logits/rejected": -51188245.333333336, "logps/chosen": -430.82802734375, "logps/rejected": -176.2838338216146, "loss": 0.2557, "rewards/chosen": 0.9675695419311523, "rewards/margins": 2.7816494623819987, "rewards/rejected": -1.8140799204508464, "step": 18602 }, { "epoch": 0.9860334455251372, "grad_norm": 52.25, "kl": 4.74908447265625, "learning_rate": 5e-07, "logits/chosen": 31815356.8, "logits/rejected": -35547360.0, "logps/chosen": -672.3845703125, "logps/rejected": -124.95654296875, "loss": 0.2365, "rewards/chosen": 1.5149598121643066, "rewards/margins": 3.786537011464437, "rewards/rejected": -2.2715771993001304, "step": 18603 }, { "epoch": 0.9860864495269392, "grad_norm": 52.75, "kl": 0.8277111053466797, "learning_rate": 5e-07, "logits/chosen": -14226962.666666666, "logits/rejected": 32751072.0, "logps/chosen": -213.7404988606771, "logps/rejected": -465.6902160644531, "loss": 0.3109, "rewards/chosen": 0.41041143735249835, "rewards/margins": 7.1685958703358965, "rewards/rejected": -6.758184432983398, "step": 18604 }, { "epoch": 0.9861394535287414, "grad_norm": 43.25, "kl": 1.630497932434082, "learning_rate": 5e-07, "logits/chosen": -16289258.285714285, "logits/rejected": -52572520.0, "logps/chosen": -224.54769461495536, "logps/rejected": -983.236083984375, "loss": 0.2851, "rewards/chosen": 0.9323469570704869, "rewards/margins": 4.354887383324759, "rewards/rejected": -3.4225404262542725, "step": 18605 }, { "epoch": 0.9861924575305435, "grad_norm": 44.75, "kl": 0.09946918487548828, "learning_rate": 5e-07, "logits/chosen": -34753589.333333336, "logits/rejected": -30054755.2, "logps/chosen": -428.4427897135417, "logps/rejected": -361.83798828125, "loss": 0.1514, "rewards/chosen": 1.169966459274292, "rewards/margins": 4.124995374679566, "rewards/rejected": -2.9550289154052733, "step": 18606 }, { "epoch": 0.9862454615323457, "grad_norm": 40.0, "kl": 0.9141998291015625, "learning_rate": 5e-07, "logits/chosen": -16112020.8, "logits/rejected": -45325274.666666664, "logps/chosen": -190.7848876953125, "logps/rejected": -310.5950520833333, "loss": 0.3106, "rewards/chosen": 0.32587902545928954, "rewards/margins": 2.594142206509908, "rewards/rejected": -2.2682631810506186, "step": 18607 }, { "epoch": 0.9862984655341478, "grad_norm": 55.25, "kl": 1.1395721435546875, "learning_rate": 5e-07, "logits/chosen": -20766486.4, "logits/rejected": 75583530.66666667, "logps/chosen": -299.449609375, "logps/rejected": -403.4143473307292, "loss": 0.3047, "rewards/chosen": 0.4739584445953369, "rewards/margins": 3.613449843724569, "rewards/rejected": -3.139491399129232, "step": 18608 }, { "epoch": 0.98635146953595, "grad_norm": 57.75, "kl": 3.3329849243164062, "learning_rate": 5e-07, "logits/chosen": -47564176.0, "logits/rejected": -23874702.0, "logps/chosen": -574.0462036132812, "logps/rejected": -453.46807861328125, "loss": 0.2738, "rewards/chosen": 1.2778232097625732, "rewards/margins": 2.656721830368042, "rewards/rejected": -1.3788986206054688, "step": 18609 }, { "epoch": 0.9864044735377521, "grad_norm": 39.0, "kl": 3.522197723388672, "learning_rate": 5e-07, "logits/chosen": -28611955.2, "logits/rejected": -36676421.333333336, "logps/chosen": -243.5445556640625, "logps/rejected": -505.1263020833333, "loss": 0.2831, "rewards/chosen": 1.0065387725830077, "rewards/margins": 5.213027699788411, "rewards/rejected": -4.206488927205403, "step": 18610 }, { "epoch": 0.9864574775395543, "grad_norm": 49.25, "kl": 6.009464263916016, "learning_rate": 5e-07, "logits/chosen": -22584499.2, "logits/rejected": -67312373.33333333, "logps/chosen": -248.8257568359375, "logps/rejected": -417.091796875, "loss": 0.3397, "rewards/chosen": 0.8125335693359375, "rewards/margins": 4.14599723815918, "rewards/rejected": -3.333463668823242, "step": 18611 }, { "epoch": 0.9865104815413563, "grad_norm": 33.5, "kl": 4.7399749755859375, "learning_rate": 5e-07, "logits/chosen": -25910905.6, "logits/rejected": -162716117.33333334, "logps/chosen": -428.0482421875, "logps/rejected": -214.29789225260416, "loss": 0.2316, "rewards/chosen": 1.4210901260375977, "rewards/margins": 3.8408886591593423, "rewards/rejected": -2.4197985331217446, "step": 18612 }, { "epoch": 0.9865634855431585, "grad_norm": 46.25, "kl": 0.3308534622192383, "learning_rate": 5e-07, "logits/chosen": -41571956.0, "logits/rejected": -41405061.333333336, "logps/chosen": -222.36734008789062, "logps/rejected": -501.930908203125, "loss": 0.1869, "rewards/chosen": 0.19867554306983948, "rewards/margins": 2.8337943454583487, "rewards/rejected": -2.6351188023885093, "step": 18613 }, { "epoch": 0.9866164895449606, "grad_norm": 47.0, "kl": 4.951663970947266, "learning_rate": 5e-07, "logits/chosen": 3413338.8, "logits/rejected": -35430418.666666664, "logps/chosen": -236.8667724609375, "logps/rejected": -395.9083251953125, "loss": 0.2993, "rewards/chosen": 0.6842061519622803, "rewards/margins": 4.057857020696004, "rewards/rejected": -3.373650868733724, "step": 18614 }, { "epoch": 0.9866694935467628, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4391894.5, "logits/rejected": -20104990.666666668, "logps/chosen": -285.3587646484375, "logps/rejected": -336.8269449869792, "loss": 0.2636, "rewards/chosen": 0.8286224007606506, "rewards/margins": 2.632136881351471, "rewards/rejected": -1.8035144805908203, "step": 18615 }, { "epoch": 0.9867224975485649, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7202541.5, "logits/rejected": -12236997.333333334, "logps/chosen": -50.680389404296875, "logps/rejected": -422.4518229166667, "loss": 0.1779, "rewards/chosen": 0.44620752334594727, "rewards/margins": 3.5414700508117676, "rewards/rejected": -3.0952625274658203, "step": 18616 }, { "epoch": 0.9867755015503671, "grad_norm": 26.125, "kl": 1.128194808959961, "learning_rate": 5e-07, "logits/chosen": -11323931.0, "logits/rejected": -8424818.857142856, "logps/chosen": -126.14140319824219, "logps/rejected": -190.59340122767858, "loss": 0.141, "rewards/chosen": 0.46648406982421875, "rewards/margins": 3.006353105817522, "rewards/rejected": -2.5398690359933034, "step": 18617 }, { "epoch": 0.9868285055521692, "grad_norm": 41.75, "kl": 0.08284854888916016, "learning_rate": 5e-07, "logits/chosen": -20499894.4, "logits/rejected": -55031466.666666664, "logps/chosen": -165.216357421875, "logps/rejected": -480.9900716145833, "loss": 0.2352, "rewards/chosen": 0.5904962539672851, "rewards/margins": 4.543992551167806, "rewards/rejected": -3.953496297200521, "step": 18618 }, { "epoch": 0.9868815095539714, "grad_norm": 34.5, "kl": 0.593994140625, "learning_rate": 5e-07, "logits/chosen": -27590490.666666668, "logits/rejected": -77199955.2, "logps/chosen": -769.31494140625, "logps/rejected": -398.00634765625, "loss": 0.1613, "rewards/chosen": 1.5571516354878743, "rewards/margins": 4.469619782765706, "rewards/rejected": -2.912468147277832, "step": 18619 }, { "epoch": 0.9869345135557734, "grad_norm": 38.75, "kl": 2.189668655395508, "learning_rate": 5e-07, "logits/chosen": -17610488.0, "logits/rejected": -21803904.0, "logps/chosen": -214.278564453125, "logps/rejected": -251.9359588623047, "loss": 0.2497, "rewards/chosen": 0.878699004650116, "rewards/margins": 3.9345826506614685, "rewards/rejected": -3.0558836460113525, "step": 18620 }, { "epoch": 0.9869875175575756, "grad_norm": 33.25, "kl": 2.3721561431884766, "learning_rate": 5e-07, "logits/chosen": -21379392.0, "logits/rejected": -11757520.0, "logps/chosen": -187.41360473632812, "logps/rejected": -346.1468505859375, "loss": 0.3144, "rewards/chosen": 0.12693580985069275, "rewards/margins": 5.074092715978622, "rewards/rejected": -4.94715690612793, "step": 18621 }, { "epoch": 0.9870405215593777, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22133684.0, "logits/rejected": -20933960.0, "logps/chosen": -281.43096923828125, "logps/rejected": -478.4154052734375, "loss": 0.1055, "rewards/chosen": 1.3868179321289062, "rewards/margins": 4.0118039449055996, "rewards/rejected": -2.624986012776693, "step": 18622 }, { "epoch": 0.9870935255611799, "grad_norm": 47.0, "kl": 4.027270317077637, "learning_rate": 5e-07, "logits/chosen": -2710874.3333333335, "logits/rejected": -26910560.0, "logps/chosen": -115.6045430501302, "logps/rejected": -275.161474609375, "loss": 0.3377, "rewards/chosen": 0.1575820247332255, "rewards/margins": 2.0927733699480693, "rewards/rejected": -1.9351913452148437, "step": 18623 }, { "epoch": 0.987146529562982, "grad_norm": 41.25, "kl": 2.7775163650512695, "learning_rate": 5e-07, "logits/chosen": -22349673.6, "logits/rejected": -23364920.0, "logps/chosen": -194.84248046875, "logps/rejected": -387.0001220703125, "loss": 0.3039, "rewards/chosen": 0.7964380264282227, "rewards/margins": 4.343276151021322, "rewards/rejected": -3.546838124593099, "step": 18624 }, { "epoch": 0.9871995335647842, "grad_norm": 46.25, "kl": 0.04268074035644531, "learning_rate": 5e-07, "logits/chosen": -39031734.4, "logits/rejected": -22899450.666666668, "logps/chosen": -170.9364501953125, "logps/rejected": -442.8544514973958, "loss": 0.2639, "rewards/chosen": 0.7476210594177246, "rewards/margins": 3.0911291440327964, "rewards/rejected": -2.3435080846150718, "step": 18625 }, { "epoch": 0.9872525375665863, "grad_norm": 60.25, "kl": 2.088010787963867, "learning_rate": 5e-07, "logits/chosen": -38378484.0, "logits/rejected": -25459816.0, "logps/chosen": -395.3118591308594, "logps/rejected": -332.1090087890625, "loss": 0.1789, "rewards/chosen": 1.329237937927246, "rewards/margins": 3.418635845184326, "rewards/rejected": -2.08939790725708, "step": 18626 }, { "epoch": 0.9873055415683885, "grad_norm": 47.75, "kl": 2.0517139434814453, "learning_rate": 5e-07, "logits/chosen": -27172552.0, "logits/rejected": -29123236.0, "logps/chosen": -310.3808288574219, "logps/rejected": -174.62709045410156, "loss": 0.2687, "rewards/chosen": 0.5100404620170593, "rewards/margins": 3.242362678050995, "rewards/rejected": -2.7323222160339355, "step": 18627 }, { "epoch": 0.9873585455701905, "grad_norm": 55.5, "kl": 1.2414264678955078, "learning_rate": 5e-07, "logits/chosen": -26569836.8, "logits/rejected": -39934933.333333336, "logps/chosen": -190.7384765625, "logps/rejected": -365.1896158854167, "loss": 0.3435, "rewards/chosen": 0.5094837188720703, "rewards/margins": 2.3629225730895995, "rewards/rejected": -1.8534388542175293, "step": 18628 }, { "epoch": 0.9874115495719927, "grad_norm": 44.5, "kl": 1.2431707382202148, "learning_rate": 5e-07, "logits/chosen": -23004120.0, "logits/rejected": -24191668.0, "logps/chosen": -333.1407165527344, "logps/rejected": -204.38983154296875, "loss": 0.2674, "rewards/chosen": 0.8987467288970947, "rewards/margins": 3.3219821453094482, "rewards/rejected": -2.4232354164123535, "step": 18629 }, { "epoch": 0.9874645535737948, "grad_norm": 40.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24311868.0, "logits/rejected": -28079004.0, "logps/chosen": -253.0498809814453, "logps/rejected": -296.334228515625, "loss": 0.2114, "rewards/chosen": 0.5555973052978516, "rewards/margins": 5.090588569641113, "rewards/rejected": -4.534991264343262, "step": 18630 }, { "epoch": 0.987517557575597, "grad_norm": 50.5, "kl": 1.529083251953125, "learning_rate": 5e-07, "logits/chosen": -13356436.0, "logits/rejected": -16077155.0, "logps/chosen": -325.1061096191406, "logps/rejected": -349.81939697265625, "loss": 0.2991, "rewards/chosen": 0.49599403142929077, "rewards/margins": 2.6695961356163025, "rewards/rejected": -2.1736021041870117, "step": 18631 }, { "epoch": 0.9875705615773991, "grad_norm": 56.75, "kl": 3.0016117095947266, "learning_rate": 5e-07, "logits/chosen": -127958568.0, "logits/rejected": -21261356.0, "logps/chosen": -452.7430419921875, "logps/rejected": -142.39097595214844, "loss": 0.2177, "rewards/chosen": 1.4391661882400513, "rewards/margins": 3.9160655736923218, "rewards/rejected": -2.4768993854522705, "step": 18632 }, { "epoch": 0.9876235655792013, "grad_norm": 63.25, "kl": 3.4392471313476562, "learning_rate": 5e-07, "logits/chosen": -33542666.666666668, "logits/rejected": 5666497.0, "logps/chosen": -328.03004964192706, "logps/rejected": -275.4392395019531, "loss": 0.3544, "rewards/chosen": 0.9211997985839844, "rewards/margins": 4.178657293319702, "rewards/rejected": -3.2574574947357178, "step": 18633 }, { "epoch": 0.9876765695810034, "grad_norm": 46.25, "kl": 1.2659902572631836, "learning_rate": 5e-07, "logits/chosen": -34649307.428571425, "logits/rejected": 45877216.0, "logps/chosen": -222.90035574776786, "logps/rejected": -739.1138916015625, "loss": 0.3848, "rewards/chosen": 0.512531076158796, "rewards/margins": 2.8818304879324774, "rewards/rejected": -2.3692994117736816, "step": 18634 }, { "epoch": 0.9877295735828056, "grad_norm": 61.75, "kl": 1.5845565795898438, "learning_rate": 5e-07, "logits/chosen": -45024261.333333336, "logits/rejected": -24436601.6, "logps/chosen": -913.6332194010416, "logps/rejected": -332.6559814453125, "loss": 0.2379, "rewards/chosen": 0.6652363936106364, "rewards/margins": 3.1826363722483317, "rewards/rejected": -2.5173999786376955, "step": 18635 }, { "epoch": 0.9877825775846076, "grad_norm": 45.0, "kl": 1.6228084564208984, "learning_rate": 5e-07, "logits/chosen": -52347994.666666664, "logits/rejected": -4499716.5, "logps/chosen": -394.4453125, "logps/rejected": -118.26985168457031, "loss": 0.3323, "rewards/chosen": 0.6776603062947592, "rewards/margins": 5.370764096577962, "rewards/rejected": -4.693103790283203, "step": 18636 }, { "epoch": 0.9878355815864098, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37516448.0, "logits/rejected": -22125754.666666668, "logps/chosen": -539.9447021484375, "logps/rejected": -196.94986979166666, "loss": 0.1545, "rewards/chosen": 2.010693311691284, "rewards/margins": 4.579881747563681, "rewards/rejected": -2.569188435872396, "step": 18637 }, { "epoch": 0.9878885855882119, "grad_norm": 43.25, "kl": 2.099090576171875, "learning_rate": 5e-07, "logits/chosen": -59353241.6, "logits/rejected": -6910436.0, "logps/chosen": -198.8116455078125, "logps/rejected": -208.81681315104166, "loss": 0.2654, "rewards/chosen": 0.9084726333618164, "rewards/margins": 3.1935274759928385, "rewards/rejected": -2.285054842631022, "step": 18638 }, { "epoch": 0.987941589590014, "grad_norm": 39.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -11548769.333333334, "logits/rejected": -26075670.4, "logps/chosen": -322.31640625, "logps/rejected": -269.8970947265625, "loss": 0.1833, "rewards/chosen": 1.2588953971862793, "rewards/margins": 3.6593344688415526, "rewards/rejected": -2.4004390716552733, "step": 18639 }, { "epoch": 0.9879945935918162, "grad_norm": 44.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -7521811.0, "logits/rejected": -25927536.0, "logps/chosen": -160.04832458496094, "logps/rejected": -470.92791748046875, "loss": 0.23, "rewards/chosen": 0.36061590909957886, "rewards/margins": 3.954573333263397, "rewards/rejected": -3.5939574241638184, "step": 18640 }, { "epoch": 0.9880475975936183, "grad_norm": 49.5, "kl": 2.79522705078125, "learning_rate": 5e-07, "logits/chosen": -33999944.0, "logits/rejected": -4953906.0, "logps/chosen": -273.9495544433594, "logps/rejected": -198.416015625, "loss": 0.1637, "rewards/chosen": 1.3324675559997559, "rewards/margins": 4.053751786549887, "rewards/rejected": -2.7212842305501304, "step": 18641 }, { "epoch": 0.9881006015954205, "grad_norm": 60.5, "kl": 0.6578950881958008, "learning_rate": 5e-07, "logits/chosen": -72124848.0, "logits/rejected": -5821341.6, "logps/chosen": -407.411376953125, "logps/rejected": -180.76826171875, "loss": 0.3512, "rewards/chosen": 0.07116599877675374, "rewards/margins": 1.8063357909520465, "rewards/rejected": -1.7351697921752929, "step": 18642 }, { "epoch": 0.9881536055972225, "grad_norm": 43.0, "kl": 0.3546905517578125, "learning_rate": 5e-07, "logits/chosen": -16568825.6, "logits/rejected": -62862032.0, "logps/chosen": -306.55830078125, "logps/rejected": -240.1072998046875, "loss": 0.2439, "rewards/chosen": 1.0365282058715821, "rewards/margins": 3.144233798980713, "rewards/rejected": -2.107705593109131, "step": 18643 }, { "epoch": 0.9882066095990247, "grad_norm": 30.75, "kl": 1.2923941612243652, "learning_rate": 5e-07, "logits/chosen": -22702213.333333332, "logits/rejected": -25308540.8, "logps/chosen": -242.24576822916666, "logps/rejected": -565.8849609375, "loss": 0.1932, "rewards/chosen": 0.2219062844912211, "rewards/margins": 5.429488758246104, "rewards/rejected": -5.207582473754883, "step": 18644 }, { "epoch": 0.9882596136008268, "grad_norm": 39.25, "kl": 4.111213684082031, "learning_rate": 5e-07, "logits/chosen": -15438002.0, "logits/rejected": 26459298.0, "logps/chosen": -510.0955505371094, "logps/rejected": -399.3316345214844, "loss": 0.1911, "rewards/chosen": 1.5178102254867554, "rewards/margins": 4.957009434700012, "rewards/rejected": -3.439199209213257, "step": 18645 }, { "epoch": 0.988312617602629, "grad_norm": 61.5, "kl": 1.0426626205444336, "learning_rate": 5e-07, "logits/chosen": -58689324.8, "logits/rejected": -12749906.666666666, "logps/chosen": -203.6979248046875, "logps/rejected": -523.4874267578125, "loss": 0.3916, "rewards/chosen": -0.014705848693847657, "rewards/margins": 2.072062619527181, "rewards/rejected": -2.086768468221029, "step": 18646 }, { "epoch": 0.9883656216044311, "grad_norm": 49.5, "kl": 2.917104721069336, "learning_rate": 5e-07, "logits/chosen": -48839708.0, "logits/rejected": -13633718.0, "logps/chosen": -195.5595703125, "logps/rejected": -516.23193359375, "loss": 0.338, "rewards/chosen": 0.02757592685520649, "rewards/margins": 2.93247695453465, "rewards/rejected": -2.9049010276794434, "step": 18647 }, { "epoch": 0.9884186256062333, "grad_norm": 64.5, "kl": 0.43524932861328125, "learning_rate": 5e-07, "logits/chosen": -36593693.333333336, "logits/rejected": 10164510.4, "logps/chosen": -531.9821370442709, "logps/rejected": -396.7501953125, "loss": 0.2233, "rewards/chosen": 0.4529683589935303, "rewards/margins": 2.961378240585327, "rewards/rejected": -2.508409881591797, "step": 18648 }, { "epoch": 0.9884716296080354, "grad_norm": 38.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18369856.0, "logits/rejected": -30695790.0, "logps/chosen": -232.56271362304688, "logps/rejected": -329.0751953125, "loss": 0.2433, "rewards/chosen": 0.5570930242538452, "rewards/margins": 2.9864243268966675, "rewards/rejected": -2.4293313026428223, "step": 18649 }, { "epoch": 0.9885246336098376, "grad_norm": 42.75, "kl": 5.970420837402344, "learning_rate": 5e-07, "logits/chosen": -17848016.0, "logits/rejected": -74889989.33333333, "logps/chosen": -256.883203125, "logps/rejected": -632.6475423177084, "loss": 0.3668, "rewards/chosen": 0.8252876281738282, "rewards/margins": 3.9251230239868162, "rewards/rejected": -3.0998353958129883, "step": 18650 }, { "epoch": 0.9885776376116396, "grad_norm": 88.0, "kl": 2.5267715454101562, "learning_rate": 5e-07, "logits/chosen": -40693734.4, "logits/rejected": -94515765.33333333, "logps/chosen": -429.83232421875, "logps/rejected": -758.939697265625, "loss": 0.3005, "rewards/chosen": 0.682833194732666, "rewards/margins": 3.457782586415609, "rewards/rejected": -2.774949391682943, "step": 18651 }, { "epoch": 0.9886306416134418, "grad_norm": 56.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20265856.0, "logits/rejected": -12745032.0, "logps/chosen": -325.500927734375, "logps/rejected": -553.7945963541666, "loss": 0.3201, "rewards/chosen": -0.03584571480751038, "rewards/margins": 5.227179569005966, "rewards/rejected": -5.263025283813477, "step": 18652 }, { "epoch": 0.9886836456152439, "grad_norm": 49.5, "kl": 0.0648040771484375, "learning_rate": 5e-07, "logits/chosen": -65939600.0, "logits/rejected": -54289984.0, "logps/chosen": -463.5775146484375, "logps/rejected": -185.8542236328125, "loss": 0.3126, "rewards/chosen": 0.3825032711029053, "rewards/margins": 2.240944242477417, "rewards/rejected": -1.8584409713745118, "step": 18653 }, { "epoch": 0.9887366496170461, "grad_norm": 37.25, "kl": 1.0120258331298828, "learning_rate": 5e-07, "logits/chosen": -17626602.0, "logits/rejected": -27591029.333333332, "logps/chosen": -142.18533325195312, "logps/rejected": -387.5839436848958, "loss": 0.2432, "rewards/chosen": 0.7326769828796387, "rewards/margins": 2.6271107991536455, "rewards/rejected": -1.894433816274007, "step": 18654 }, { "epoch": 0.9887896536188482, "grad_norm": 59.25, "kl": 1.8158302307128906, "learning_rate": 5e-07, "logits/chosen": -9305999.333333334, "logits/rejected": -5689364.8, "logps/chosen": -212.99894205729166, "logps/rejected": -256.65546875, "loss": 0.2533, "rewards/chosen": -0.011126461128393808, "rewards/margins": 3.2297197913130127, "rewards/rejected": -3.2408462524414063, "step": 18655 }, { "epoch": 0.9888426576206504, "grad_norm": 48.0, "kl": 4.3145647048950195, "learning_rate": 5e-07, "logits/chosen": 5896114.0, "logits/rejected": -38433764.0, "logps/chosen": -43.53178024291992, "logps/rejected": -288.474609375, "loss": 0.2785, "rewards/chosen": 1.0062247514724731, "rewards/margins": 3.4706467390060425, "rewards/rejected": -2.4644219875335693, "step": 18656 }, { "epoch": 0.9888956616224525, "grad_norm": 47.75, "kl": 0.2171621322631836, "learning_rate": 5e-07, "logits/chosen": -25872288.0, "logits/rejected": 7632715.2, "logps/chosen": -277.24281819661456, "logps/rejected": -186.4416015625, "loss": 0.2857, "rewards/chosen": 0.3595591386159261, "rewards/margins": 2.6234862168629967, "rewards/rejected": -2.2639270782470704, "step": 18657 }, { "epoch": 0.9889486656242547, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17892964.0, "logits/rejected": -24092517.333333332, "logps/chosen": -547.9622802734375, "logps/rejected": -443.3275960286458, "loss": 0.1062, "rewards/chosen": 1.2953553199768066, "rewards/margins": 4.642949263254801, "rewards/rejected": -3.3475939432779946, "step": 18658 }, { "epoch": 0.9890016696260567, "grad_norm": 48.0, "kl": 0.38361263275146484, "learning_rate": 5e-07, "logits/chosen": -54425896.0, "logits/rejected": -18968404.0, "logps/chosen": -285.1401062011719, "logps/rejected": -405.19879150390625, "loss": 0.2549, "rewards/chosen": 0.6410480737686157, "rewards/margins": 2.8548587560653687, "rewards/rejected": -2.213810682296753, "step": 18659 }, { "epoch": 0.9890546736278589, "grad_norm": 43.5, "kl": 0.3712491989135742, "learning_rate": 5e-07, "logits/chosen": -41891184.0, "logits/rejected": -14317586.666666666, "logps/chosen": -272.09246826171875, "logps/rejected": -298.57484944661456, "loss": 0.2456, "rewards/chosen": 0.18027058243751526, "rewards/margins": 2.1117101411024732, "rewards/rejected": -1.9314395586649578, "step": 18660 }, { "epoch": 0.989107677629661, "grad_norm": 54.0, "kl": 6.79656982421875, "learning_rate": 5e-07, "logits/chosen": -10667252.57142857, "logits/rejected": -1538013.0, "logps/chosen": -206.29434640066964, "logps/rejected": -67.93818664550781, "loss": 0.4842, "rewards/chosen": 0.8094032151358468, "rewards/margins": 1.626268778528486, "rewards/rejected": -0.8168655633926392, "step": 18661 }, { "epoch": 0.9891606816314632, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -57648256.0, "logits/rejected": -14766438.4, "logps/chosen": -547.4016520182291, "logps/rejected": -418.190283203125, "loss": 0.1892, "rewards/chosen": 0.40470075607299805, "rewards/margins": 4.241778469085693, "rewards/rejected": -3.8370777130126954, "step": 18662 }, { "epoch": 0.9892136856332653, "grad_norm": 60.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 34965108.0, "logits/rejected": -19116698.285714287, "logps/chosen": -340.4105529785156, "logps/rejected": -491.4644252232143, "loss": 0.179, "rewards/chosen": 0.05170593410730362, "rewards/margins": 2.5193731050406183, "rewards/rejected": -2.4676671709333147, "step": 18663 }, { "epoch": 0.9892666896350675, "grad_norm": 34.75, "kl": 0.9587631225585938, "learning_rate": 5e-07, "logits/chosen": -11250529.333333334, "logits/rejected": -35111283.2, "logps/chosen": -391.0621337890625, "logps/rejected": -383.25546875, "loss": 0.178, "rewards/chosen": 0.676152229309082, "rewards/margins": 4.113656806945801, "rewards/rejected": -3.4375045776367186, "step": 18664 }, { "epoch": 0.9893196936368696, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -43921284.0, "logits/rejected": -14468373.714285715, "logps/chosen": -1793.5390625, "logps/rejected": -301.6123046875, "loss": 0.2125, "rewards/chosen": 4.238745212554932, "rewards/margins": 6.123488903045654, "rewards/rejected": -1.8847436904907227, "step": 18665 }, { "epoch": 0.9893726976386717, "grad_norm": 23.875, "kl": 1.3914871215820312, "learning_rate": 5e-07, "logits/chosen": 8375471.5, "logits/rejected": -21055434.666666668, "logps/chosen": -724.321044921875, "logps/rejected": -198.66670735677084, "loss": 0.0712, "rewards/chosen": 2.744929552078247, "rewards/margins": 5.982203722000122, "rewards/rejected": -3.237274169921875, "step": 18666 }, { "epoch": 0.9894257016404738, "grad_norm": 52.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -28400885.333333332, "logits/rejected": -5454670.0, "logps/chosen": -196.29829915364584, "logps/rejected": -82.18150329589844, "loss": 0.3556, "rewards/chosen": 0.3323381741841634, "rewards/margins": 2.786463816960653, "rewards/rejected": -2.4541256427764893, "step": 18667 }, { "epoch": 0.989478705642276, "grad_norm": 66.5, "kl": 0.7908554077148438, "learning_rate": 5e-07, "logits/chosen": -82410764.8, "logits/rejected": -37938058.666666664, "logps/chosen": -512.57802734375, "logps/rejected": -451.33740234375, "loss": 0.3028, "rewards/chosen": 0.14186493158340455, "rewards/margins": 6.158737337589264, "rewards/rejected": -6.016872406005859, "step": 18668 }, { "epoch": 0.9895317096440781, "grad_norm": 41.75, "kl": 4.983658790588379, "learning_rate": 5e-07, "logits/chosen": -14265368.0, "logits/rejected": -12827513.333333334, "logps/chosen": -392.2835205078125, "logps/rejected": -265.5651448567708, "loss": 0.215, "rewards/chosen": 1.7867513656616212, "rewards/margins": 6.191448148091634, "rewards/rejected": -4.404696782430013, "step": 18669 }, { "epoch": 0.9895847136458803, "grad_norm": 59.5, "kl": 1.3200798034667969, "learning_rate": 5e-07, "logits/chosen": -16852860.8, "logits/rejected": 35780125.333333336, "logps/chosen": -274.6087890625, "logps/rejected": -434.1597086588542, "loss": 0.2207, "rewards/chosen": 1.2039714813232423, "rewards/margins": 3.2482423146565758, "rewards/rejected": -2.0442708333333335, "step": 18670 }, { "epoch": 0.9896377176476824, "grad_norm": 35.75, "kl": 0.5599441528320312, "learning_rate": 5e-07, "logits/chosen": -14536006.666666666, "logits/rejected": -8591957.6, "logps/chosen": -58.84663391113281, "logps/rejected": -331.52685546875, "loss": 0.295, "rewards/chosen": 0.05036774277687073, "rewards/margins": 1.8920436918735504, "rewards/rejected": -1.8416759490966796, "step": 18671 }, { "epoch": 0.9896907216494846, "grad_norm": 53.0, "kl": 1.6204252243041992, "learning_rate": 5e-07, "logits/chosen": -7686886.666666667, "logits/rejected": 2314467.6, "logps/chosen": -679.2032470703125, "logps/rejected": -290.7353515625, "loss": 0.2076, "rewards/chosen": 2.0198262532552085, "rewards/margins": 3.966359837849935, "rewards/rejected": -1.9465335845947265, "step": 18672 }, { "epoch": 0.9897437256512867, "grad_norm": 46.75, "kl": 1.975947380065918, "learning_rate": 5e-07, "logits/chosen": -31676227.2, "logits/rejected": -12968765.333333334, "logps/chosen": -414.790185546875, "logps/rejected": -297.9287109375, "loss": 0.2978, "rewards/chosen": 1.0885793685913085, "rewards/margins": 2.9660357157389323, "rewards/rejected": -1.8774563471476238, "step": 18673 }, { "epoch": 0.9897967296530888, "grad_norm": 26.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -14341160.0, "logits/rejected": -20040032.0, "logps/chosen": -134.8815460205078, "logps/rejected": -255.7619425455729, "loss": 0.1318, "rewards/chosen": 0.8956035375595093, "rewards/margins": 3.6717017889022827, "rewards/rejected": -2.7760982513427734, "step": 18674 }, { "epoch": 0.9898497336548909, "grad_norm": 56.5, "kl": 1.094573974609375, "learning_rate": 5e-07, "logits/chosen": -11533781.6, "logits/rejected": -34029218.666666664, "logps/chosen": -332.69599609375, "logps/rejected": -441.7150472005208, "loss": 0.3662, "rewards/chosen": 0.1707627773284912, "rewards/margins": 1.7059195041656494, "rewards/rejected": -1.5351567268371582, "step": 18675 }, { "epoch": 0.9899027376566931, "grad_norm": 36.5, "kl": 0.30915069580078125, "learning_rate": 5e-07, "logits/chosen": -12636231.2, "logits/rejected": -17955833.333333332, "logps/chosen": -218.1541015625, "logps/rejected": -139.45613606770834, "loss": 0.2273, "rewards/chosen": 0.9316641807556152, "rewards/margins": 4.752909056345621, "rewards/rejected": -3.8212448755900064, "step": 18676 }, { "epoch": 0.9899557416584952, "grad_norm": 61.0, "kl": 0.22844886779785156, "learning_rate": 5e-07, "logits/chosen": -20435848.0, "logits/rejected": -2246359.2, "logps/chosen": -240.8428955078125, "logps/rejected": -213.7881591796875, "loss": 0.2133, "rewards/chosen": 1.349557876586914, "rewards/margins": 2.8314342498779297, "rewards/rejected": -1.4818763732910156, "step": 18677 }, { "epoch": 0.9900087456602974, "grad_norm": 17.5, "kl": 2.2053213119506836, "learning_rate": 5e-07, "logits/chosen": -6184734.0, "logits/rejected": -51198752.0, "logps/chosen": -290.72662353515625, "logps/rejected": -379.3353271484375, "loss": 0.0907, "rewards/chosen": 1.3567885160446167, "rewards/margins": 5.251275897026062, "rewards/rejected": -3.8944873809814453, "step": 18678 }, { "epoch": 0.9900617496620995, "grad_norm": 37.25, "kl": 1.3920764923095703, "learning_rate": 5e-07, "logits/chosen": -663664.4285714285, "logits/rejected": -45235656.0, "logps/chosen": -130.86344691685267, "logps/rejected": -280.62518310546875, "loss": 0.3941, "rewards/chosen": 0.5946993146623883, "rewards/margins": 2.5670137916292464, "rewards/rejected": -1.972314476966858, "step": 18679 }, { "epoch": 0.9901147536639017, "grad_norm": 69.5, "kl": 1.4210929870605469, "learning_rate": 5e-07, "logits/chosen": -60358992.0, "logits/rejected": -30413795.2, "logps/chosen": -259.92950439453125, "logps/rejected": -173.5469970703125, "loss": 0.276, "rewards/chosen": 0.11453859011332194, "rewards/margins": 2.264956680933634, "rewards/rejected": -2.1504180908203123, "step": 18680 }, { "epoch": 0.9901677576657038, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19486770.0, "logits/rejected": -10328187.0, "logps/chosen": -241.7361602783203, "logps/rejected": -276.79705810546875, "loss": 0.3945, "rewards/chosen": -0.1324707716703415, "rewards/margins": 2.36414797604084, "rewards/rejected": -2.4966187477111816, "step": 18681 }, { "epoch": 0.9902207616675059, "grad_norm": 52.5, "kl": 4.1475067138671875, "learning_rate": 5e-07, "logits/chosen": -21580646.4, "logits/rejected": -27960714.666666668, "logps/chosen": -452.815869140625, "logps/rejected": -318.4876302083333, "loss": 0.3644, "rewards/chosen": 0.7137855052947998, "rewards/margins": 2.78631911277771, "rewards/rejected": -2.07253360748291, "step": 18682 }, { "epoch": 0.990273765669308, "grad_norm": 45.25, "kl": 5.1439361572265625, "learning_rate": 5e-07, "logits/chosen": -16734037.333333334, "logits/rejected": -34976396.0, "logps/chosen": -271.79294840494794, "logps/rejected": -320.9015808105469, "loss": 0.2658, "rewards/chosen": 1.3707561492919922, "rewards/margins": 5.844464302062988, "rewards/rejected": -4.473708152770996, "step": 18683 }, { "epoch": 0.9903267696711102, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -21406120.0, "logits/rejected": -33474086.4, "logps/chosen": -332.74709065755206, "logps/rejected": -340.251171875, "loss": 0.2012, "rewards/chosen": 0.6228668689727783, "rewards/margins": 3.1869974613189695, "rewards/rejected": -2.5641305923461912, "step": 18684 }, { "epoch": 0.9903797736729123, "grad_norm": 41.0, "kl": 1.6138858795166016, "learning_rate": 5e-07, "logits/chosen": -6459100.0, "logits/rejected": -28095933.333333332, "logps/chosen": -158.08223876953124, "logps/rejected": -233.9756062825521, "loss": 0.3144, "rewards/chosen": 0.6968954086303711, "rewards/margins": 2.3577433904012044, "rewards/rejected": -1.6608479817708333, "step": 18685 }, { "epoch": 0.9904327776747145, "grad_norm": 62.25, "kl": 1.7372627258300781, "learning_rate": 5e-07, "logits/chosen": -15242416.0, "logits/rejected": -16170538.0, "logps/chosen": -169.07223510742188, "logps/rejected": -442.747314453125, "loss": 0.2861, "rewards/chosen": 0.8257888555526733, "rewards/margins": 4.253518462181091, "rewards/rejected": -3.427729606628418, "step": 18686 }, { "epoch": 0.9904857816765166, "grad_norm": 50.5, "kl": 0.6034889221191406, "learning_rate": 5e-07, "logits/chosen": 20591517.333333332, "logits/rejected": -52294195.2, "logps/chosen": -169.81163533528647, "logps/rejected": -498.2490234375, "loss": 0.2036, "rewards/chosen": 0.7007110913594564, "rewards/margins": 3.1206144650777183, "rewards/rejected": -2.419903373718262, "step": 18687 }, { "epoch": 0.9905387856783188, "grad_norm": 25.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16433.25, "logits/rejected": -4817218.0, "logps/chosen": -177.43154907226562, "logps/rejected": -201.6033172607422, "loss": 0.1539, "rewards/chosen": 1.212788462638855, "rewards/margins": 4.610706210136414, "rewards/rejected": -3.3979177474975586, "step": 18688 }, { "epoch": 0.9905917896801208, "grad_norm": 48.0, "kl": 0.965667724609375, "learning_rate": 5e-07, "logits/chosen": -38301656.0, "logits/rejected": -51237644.0, "logps/chosen": -366.7682189941406, "logps/rejected": -557.8074951171875, "loss": 0.2759, "rewards/chosen": 0.9372831583023071, "rewards/margins": 3.78286612033844, "rewards/rejected": -2.845582962036133, "step": 18689 }, { "epoch": 0.9906447936819229, "grad_norm": 39.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -6081655.5, "logits/rejected": -35306985.14285714, "logps/chosen": -116.18421936035156, "logps/rejected": -271.6328125, "loss": 0.2225, "rewards/chosen": -0.25089722871780396, "rewards/margins": 2.0235481687954495, "rewards/rejected": -2.2744453975132535, "step": 18690 }, { "epoch": 0.9906977976837251, "grad_norm": 43.0, "kl": 2.4622926712036133, "learning_rate": 5e-07, "logits/chosen": -15741004.8, "logits/rejected": -26523701.333333332, "logps/chosen": -379.9727783203125, "logps/rejected": -266.57489013671875, "loss": 0.2294, "rewards/chosen": 1.1506674766540528, "rewards/margins": 3.5805542945861815, "rewards/rejected": -2.429886817932129, "step": 18691 }, { "epoch": 0.9907508016855272, "grad_norm": 48.0, "kl": 2.9430999755859375, "learning_rate": 5e-07, "logits/chosen": 3257490.6666666665, "logits/rejected": -25299908.8, "logps/chosen": -316.1606852213542, "logps/rejected": -429.385498046875, "loss": 0.1907, "rewards/chosen": 1.0223957697550456, "rewards/margins": 4.054848543802898, "rewards/rejected": -3.0324527740478517, "step": 18692 }, { "epoch": 0.9908038056873294, "grad_norm": 25.75, "kl": 0.5247440338134766, "learning_rate": 5e-07, "logits/chosen": 10053283.333333334, "logits/rejected": -30338233.6, "logps/chosen": -84.97968037923177, "logps/rejected": -315.958740234375, "loss": 0.2261, "rewards/chosen": 0.23775633176167807, "rewards/margins": 2.8968557198842366, "rewards/rejected": -2.6590993881225584, "step": 18693 }, { "epoch": 0.9908568096891315, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -16295587.2, "logits/rejected": -32647930.666666668, "logps/chosen": -409.4863525390625, "logps/rejected": -467.598388671875, "loss": 0.3043, "rewards/chosen": 0.21248974800109863, "rewards/margins": 3.544771178563436, "rewards/rejected": -3.3322814305623374, "step": 18694 }, { "epoch": 0.9909098136909337, "grad_norm": 55.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -121432752.0, "logits/rejected": -21650080.0, "logps/chosen": -380.38592529296875, "logps/rejected": -416.906982421875, "loss": 0.238, "rewards/chosen": 0.09349821507930756, "rewards/margins": 2.088998143871625, "rewards/rejected": -1.9954999287923176, "step": 18695 }, { "epoch": 0.9909628176927358, "grad_norm": 41.25, "kl": 2.6383514404296875, "learning_rate": 5e-07, "logits/chosen": -26626846.0, "logits/rejected": -46353272.0, "logps/chosen": -557.05126953125, "logps/rejected": -314.0074157714844, "loss": 0.268, "rewards/chosen": 1.4778599739074707, "rewards/margins": 3.043303966522217, "rewards/rejected": -1.565443992614746, "step": 18696 }, { "epoch": 0.991015821694538, "grad_norm": 41.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63977445.333333336, "logits/rejected": -25001726.4, "logps/chosen": -340.521240234375, "logps/rejected": -449.94013671875, "loss": 0.3153, "rewards/chosen": -0.3941437005996704, "rewards/margins": 1.595627760887146, "rewards/rejected": -1.9897714614868165, "step": 18697 }, { "epoch": 0.99106882569634, "grad_norm": 38.25, "kl": 4.267892837524414, "learning_rate": 5e-07, "logits/chosen": -50238489.6, "logits/rejected": -15083385.333333334, "logps/chosen": -306.69248046875, "logps/rejected": -171.025634765625, "loss": 0.2745, "rewards/chosen": 1.158755111694336, "rewards/margins": 4.199305661519369, "rewards/rejected": -3.0405505498250327, "step": 18698 }, { "epoch": 0.9911218296981422, "grad_norm": 54.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46629285.333333336, "logits/rejected": -18594434.0, "logps/chosen": -412.4027099609375, "logps/rejected": -188.95693969726562, "loss": 0.2526, "rewards/chosen": 0.9425455729166666, "rewards/margins": 4.3976554075876875, "rewards/rejected": -3.4551098346710205, "step": 18699 }, { "epoch": 0.9911748336999443, "grad_norm": 47.5, "kl": 1.8486328125, "learning_rate": 5e-07, "logits/chosen": -37927509.333333336, "logits/rejected": -33235788.8, "logps/chosen": -389.6541341145833, "logps/rejected": -334.77138671875, "loss": 0.2247, "rewards/chosen": 0.9152588049570719, "rewards/margins": 2.7598381201426188, "rewards/rejected": -1.844579315185547, "step": 18700 }, { "epoch": 0.9912278377017465, "grad_norm": 48.0, "kl": 0.9833498001098633, "learning_rate": 5e-07, "logits/chosen": 2371086.5, "logits/rejected": -59630380.8, "logps/chosen": -319.39076741536456, "logps/rejected": -403.7704345703125, "loss": 0.1917, "rewards/chosen": 0.9484502474466959, "rewards/margins": 3.6837457338968913, "rewards/rejected": -2.7352954864501955, "step": 18701 }, { "epoch": 0.9912808417035486, "grad_norm": 41.75, "kl": 0.7336158752441406, "learning_rate": 5e-07, "logits/chosen": -13147548.0, "logits/rejected": -12528822.4, "logps/chosen": -184.7042236328125, "logps/rejected": -262.9365966796875, "loss": 0.3226, "rewards/chosen": -0.531534751256307, "rewards/margins": 1.7777959982554115, "rewards/rejected": -2.3093307495117186, "step": 18702 }, { "epoch": 0.9913338457053508, "grad_norm": 29.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 18260088.0, "logits/rejected": -53554133.333333336, "logps/chosen": -283.8441162109375, "logps/rejected": -306.75311279296875, "loss": 0.1615, "rewards/chosen": 1.2073936462402344, "rewards/margins": 4.1679792404174805, "rewards/rejected": -2.960585594177246, "step": 18703 }, { "epoch": 0.9913868497071529, "grad_norm": 51.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46397792.0, "logits/rejected": -25805102.4, "logps/chosen": -299.6344807942708, "logps/rejected": -535.192724609375, "loss": 0.2451, "rewards/chosen": -0.1281336545944214, "rewards/margins": 2.9055415391921997, "rewards/rejected": -3.033675193786621, "step": 18704 }, { "epoch": 0.991439853708955, "grad_norm": 51.0, "kl": 3.3995094299316406, "learning_rate": 5e-07, "logits/chosen": 3771612.8, "logits/rejected": 9269234.666666666, "logps/chosen": -156.1907958984375, "logps/rejected": -161.92357381184897, "loss": 0.3436, "rewards/chosen": 0.8257000923156739, "rewards/margins": 1.5945109208424886, "rewards/rejected": -0.7688108285268148, "step": 18705 }, { "epoch": 0.9914928577107571, "grad_norm": 54.0, "kl": 1.4041976928710938, "learning_rate": 5e-07, "logits/chosen": 1179411.5, "logits/rejected": -4385328.0, "logps/chosen": -332.9398498535156, "logps/rejected": -384.27301025390625, "loss": 0.2642, "rewards/chosen": 0.49665069580078125, "rewards/margins": 2.8786258697509766, "rewards/rejected": -2.3819751739501953, "step": 18706 }, { "epoch": 0.9915458617125593, "grad_norm": 36.0, "kl": 2.1903343200683594, "learning_rate": 5e-07, "logits/chosen": 1265251.375, "logits/rejected": -20792002.0, "logps/chosen": -186.391845703125, "logps/rejected": -481.5235595703125, "loss": 0.2501, "rewards/chosen": 0.4910409450531006, "rewards/margins": 3.5285747051239014, "rewards/rejected": -3.037533760070801, "step": 18707 }, { "epoch": 0.9915988657143614, "grad_norm": 47.75, "kl": 0.9959678649902344, "learning_rate": 5e-07, "logits/chosen": -34398960.0, "logits/rejected": 5197398.333333333, "logps/chosen": -303.8633056640625, "logps/rejected": -52.309722900390625, "loss": 0.3381, "rewards/chosen": 0.7297597408294678, "rewards/margins": 2.1309152126312254, "rewards/rejected": -1.4011554718017578, "step": 18708 }, { "epoch": 0.9916518697161636, "grad_norm": 40.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -18191642.0, "logits/rejected": -40990474.666666664, "logps/chosen": -468.65557861328125, "logps/rejected": -507.3177897135417, "loss": 0.1526, "rewards/chosen": 0.5527088046073914, "rewards/margins": 3.3945549527804055, "rewards/rejected": -2.841846148173014, "step": 18709 }, { "epoch": 0.9917048737179657, "grad_norm": 70.0, "kl": 5.86566162109375, "learning_rate": 5e-07, "logits/chosen": -36509965.71428572, "logits/rejected": -20522938.0, "logps/chosen": -377.93014090401783, "logps/rejected": -334.7067565917969, "loss": 0.3553, "rewards/chosen": 1.2129868098667689, "rewards/margins": 4.024452311652047, "rewards/rejected": -2.8114655017852783, "step": 18710 }, { "epoch": 0.9917578777197679, "grad_norm": 34.75, "kl": 2.144184112548828, "learning_rate": 5e-07, "logits/chosen": -3478968.75, "logits/rejected": -14633232.0, "logps/chosen": -471.0714416503906, "logps/rejected": -161.06326293945312, "loss": 0.1884, "rewards/chosen": 1.5185997486114502, "rewards/margins": 4.408889055252075, "rewards/rejected": -2.890289306640625, "step": 18711 }, { "epoch": 0.99181088172157, "grad_norm": 35.0, "kl": 0.00472259521484375, "learning_rate": 5e-07, "logits/chosen": -19385608.0, "logits/rejected": -27638672.0, "logps/chosen": -78.69298553466797, "logps/rejected": -349.1460367838542, "loss": 0.2359, "rewards/chosen": -0.07993603497743607, "rewards/margins": 2.2045826837420464, "rewards/rejected": -2.2845187187194824, "step": 18712 }, { "epoch": 0.9918638857233721, "grad_norm": 48.75, "kl": 0.44942378997802734, "learning_rate": 5e-07, "logits/chosen": -64958186.666666664, "logits/rejected": -18676859.2, "logps/chosen": -702.6311848958334, "logps/rejected": -186.0963134765625, "loss": 0.3054, "rewards/chosen": 0.9123636881510416, "rewards/margins": 2.041752592722575, "rewards/rejected": -1.1293889045715333, "step": 18713 }, { "epoch": 0.9919168897251742, "grad_norm": 35.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9680028.0, "logits/rejected": -19094224.0, "logps/chosen": -149.53872680664062, "logps/rejected": -176.24169921875, "loss": 0.1977, "rewards/chosen": -0.02542114444077015, "rewards/margins": 2.563362119719386, "rewards/rejected": -2.5887832641601562, "step": 18714 }, { "epoch": 0.9919698937269764, "grad_norm": 41.25, "kl": 0.2969675064086914, "learning_rate": 5e-07, "logits/chosen": -41098948.0, "logits/rejected": -17814166.0, "logps/chosen": -158.8095245361328, "logps/rejected": -333.6180419921875, "loss": 0.2437, "rewards/chosen": 0.5273928642272949, "rewards/margins": 3.0100224018096924, "rewards/rejected": -2.4826295375823975, "step": 18715 }, { "epoch": 0.9920228977287785, "grad_norm": 41.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -38612645.333333336, "logits/rejected": -41169011.2, "logps/chosen": -223.5257568359375, "logps/rejected": -341.867724609375, "loss": 0.276, "rewards/chosen": -0.49017810821533203, "rewards/margins": 2.335443687438965, "rewards/rejected": -2.825621795654297, "step": 18716 }, { "epoch": 0.9920759017305807, "grad_norm": 47.0, "kl": 1.601271629333496, "learning_rate": 5e-07, "logits/chosen": -22939202.0, "logits/rejected": -8750134.0, "logps/chosen": -211.2601318359375, "logps/rejected": -367.160888671875, "loss": 0.2777, "rewards/chosen": 0.8291336894035339, "rewards/margins": 2.611780822277069, "rewards/rejected": -1.7826471328735352, "step": 18717 }, { "epoch": 0.9921289057323828, "grad_norm": 35.25, "kl": 1.2373991012573242, "learning_rate": 5e-07, "logits/chosen": -56519130.666666664, "logits/rejected": -34110342.4, "logps/chosen": -289.8235677083333, "logps/rejected": -383.562109375, "loss": 0.1559, "rewards/chosen": 1.2492001851399739, "rewards/margins": 3.5809266408284506, "rewards/rejected": -2.3317264556884765, "step": 18718 }, { "epoch": 0.992181909734185, "grad_norm": 45.75, "kl": 0.3090496063232422, "learning_rate": 5e-07, "logits/chosen": -75913288.0, "logits/rejected": -6544796.0, "logps/chosen": -301.8183898925781, "logps/rejected": -277.29258219401044, "loss": 0.1897, "rewards/chosen": 0.7023844718933105, "rewards/margins": 3.122082074483236, "rewards/rejected": -2.4196976025899253, "step": 18719 }, { "epoch": 0.992234913735987, "grad_norm": 76.0, "kl": 0.15749740600585938, "learning_rate": 5e-07, "logits/chosen": -59051264.0, "logits/rejected": -22884518.0, "logps/chosen": -224.90963745117188, "logps/rejected": -284.4664611816406, "loss": 0.3295, "rewards/chosen": -0.024750325828790665, "rewards/margins": 2.398276474326849, "rewards/rejected": -2.4230268001556396, "step": 18720 }, { "epoch": 0.9922879177377892, "grad_norm": 45.75, "kl": 1.4202194213867188, "learning_rate": 5e-07, "logits/chosen": -41304064.0, "logits/rejected": -53238568.0, "logps/chosen": -446.9038899739583, "logps/rejected": -545.5633544921875, "loss": 0.2397, "rewards/chosen": 1.352319876352946, "rewards/margins": 4.095612208048503, "rewards/rejected": -2.7432923316955566, "step": 18721 }, { "epoch": 0.9923409217395913, "grad_norm": 50.25, "kl": 0.43128013610839844, "learning_rate": 5e-07, "logits/chosen": -40048208.0, "logits/rejected": -9509723.0, "logps/chosen": -212.02597045898438, "logps/rejected": -172.44912719726562, "loss": 0.2875, "rewards/chosen": 0.274821937084198, "rewards/margins": 2.576948344707489, "rewards/rejected": -2.302126407623291, "step": 18722 }, { "epoch": 0.9923939257413935, "grad_norm": 56.0, "kl": 0.8808116912841797, "learning_rate": 5e-07, "logits/chosen": -40927504.0, "logits/rejected": -2415103.1666666665, "logps/chosen": -250.893896484375, "logps/rejected": -181.925048828125, "loss": 0.4533, "rewards/chosen": -0.29520387649536134, "rewards/margins": 1.7020117123921712, "rewards/rejected": -1.9972155888875325, "step": 18723 }, { "epoch": 0.9924469297431956, "grad_norm": 46.25, "kl": 1.107208251953125, "learning_rate": 5e-07, "logits/chosen": -30061292.0, "logits/rejected": -37338276.0, "logps/chosen": -623.224853515625, "logps/rejected": -241.95941162109375, "loss": 0.215, "rewards/chosen": 1.495351791381836, "rewards/margins": 3.4906784296035767, "rewards/rejected": -1.9953266382217407, "step": 18724 }, { "epoch": 0.9924999337449978, "grad_norm": 38.0, "kl": 0.8583488464355469, "learning_rate": 5e-07, "logits/chosen": -37733973.333333336, "logits/rejected": -30538451.2, "logps/chosen": -308.33905029296875, "logps/rejected": -158.2628173828125, "loss": 0.1966, "rewards/chosen": 1.2158290545145671, "rewards/margins": 4.082503668467204, "rewards/rejected": -2.866674613952637, "step": 18725 }, { "epoch": 0.9925529377467999, "grad_norm": 46.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -55151507.2, "logits/rejected": -36798944.0, "logps/chosen": -305.903173828125, "logps/rejected": -291.1562093098958, "loss": 0.2776, "rewards/chosen": 0.48752799034118655, "rewards/margins": 3.468228483200073, "rewards/rejected": -2.9807004928588867, "step": 18726 }, { "epoch": 0.9926059417486021, "grad_norm": 48.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 9524308.0, "logits/rejected": -29063648.0, "logps/chosen": -44.24979782104492, "logps/rejected": -351.14453125, "loss": 0.2291, "rewards/chosen": 0.21348878741264343, "rewards/margins": 2.1191483040650683, "rewards/rejected": -1.905659516652425, "step": 18727 }, { "epoch": 0.9926589457504041, "grad_norm": 47.5, "kl": 0.465850830078125, "learning_rate": 5e-07, "logits/chosen": -38211526.4, "logits/rejected": -42605440.0, "logps/chosen": -178.48564453125, "logps/rejected": -432.7226969401042, "loss": 0.327, "rewards/chosen": 0.04930614829063416, "rewards/margins": 3.863573306798935, "rewards/rejected": -3.814267158508301, "step": 18728 }, { "epoch": 0.9927119497522063, "grad_norm": 77.0, "kl": 0.7902994155883789, "learning_rate": 5e-07, "logits/chosen": -25898597.333333332, "logits/rejected": -16637350.4, "logps/chosen": -264.4185791015625, "logps/rejected": -248.0401123046875, "loss": 0.2024, "rewards/chosen": 0.5459014972050985, "rewards/margins": 2.811471756299337, "rewards/rejected": -2.2655702590942384, "step": 18729 }, { "epoch": 0.9927649537540084, "grad_norm": 28.625, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29274325.333333332, "logits/rejected": -41495132.8, "logps/chosen": -214.8028361002604, "logps/rejected": -312.754541015625, "loss": 0.1782, "rewards/chosen": 0.8311944802602133, "rewards/margins": 3.7379871209462485, "rewards/rejected": -2.906792640686035, "step": 18730 }, { "epoch": 0.9928179577558106, "grad_norm": 53.0, "kl": 1.099334716796875, "learning_rate": 5e-07, "logits/chosen": -32651768.0, "logits/rejected": -24149876.0, "logps/chosen": -431.0465393066406, "logps/rejected": -252.96099853515625, "loss": 0.2303, "rewards/chosen": 0.4390876889228821, "rewards/margins": 3.8770259022712708, "rewards/rejected": -3.4379382133483887, "step": 18731 }, { "epoch": 0.9928709617576127, "grad_norm": 53.0, "kl": 3.082538604736328, "learning_rate": 5e-07, "logits/chosen": -9446162.4, "logits/rejected": -48163333.333333336, "logps/chosen": -207.721484375, "logps/rejected": -547.2235921223959, "loss": 0.2853, "rewards/chosen": 0.6805026054382324, "rewards/margins": 3.4456555366516115, "rewards/rejected": -2.765152931213379, "step": 18732 }, { "epoch": 0.9929239657594149, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19306381.333333332, "logits/rejected": -39933651.2, "logps/chosen": -339.18825276692706, "logps/rejected": -250.8217529296875, "loss": 0.2105, "rewards/chosen": 0.8712797164916992, "rewards/margins": 3.1414297103881834, "rewards/rejected": -2.270149993896484, "step": 18733 }, { "epoch": 0.992976969761217, "grad_norm": 45.0, "kl": 4.306236267089844, "learning_rate": 5e-07, "logits/chosen": -16445036.0, "logits/rejected": -63073596.0, "logps/chosen": -348.7379150390625, "logps/rejected": -230.1895294189453, "loss": 0.2013, "rewards/chosen": 1.6710162162780762, "rewards/margins": 3.312583565711975, "rewards/rejected": -1.641567349433899, "step": 18734 }, { "epoch": 0.9930299737630192, "grad_norm": 50.25, "kl": 3.0623435974121094, "learning_rate": 5e-07, "logits/chosen": -26163368.0, "logits/rejected": -25403397.333333332, "logps/chosen": -160.87659912109376, "logps/rejected": -261.28741455078125, "loss": 0.338, "rewards/chosen": 0.5632154941558838, "rewards/margins": 2.727888345718384, "rewards/rejected": -2.1646728515625, "step": 18735 }, { "epoch": 0.9930829777648212, "grad_norm": 35.5, "kl": 0.027980804443359375, "learning_rate": 5e-07, "logits/chosen": -18044680.0, "logits/rejected": -20069430.0, "logps/chosen": -291.76873779296875, "logps/rejected": -323.7985534667969, "loss": 0.1603, "rewards/chosen": 1.3645992279052734, "rewards/margins": 4.1880364418029785, "rewards/rejected": -2.823437213897705, "step": 18736 }, { "epoch": 0.9931359817666234, "grad_norm": 33.25, "kl": 3.098698616027832, "learning_rate": 5e-07, "logits/chosen": -7255225.333333333, "logits/rejected": 305365683.2, "logps/chosen": -129.48041788736978, "logps/rejected": -337.5507568359375, "loss": 0.2468, "rewards/chosen": 0.20581279198328653, "rewards/margins": 2.8821197072664897, "rewards/rejected": -2.6763069152832033, "step": 18737 }, { "epoch": 0.9931889857684255, "grad_norm": 46.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -112406584.0, "logits/rejected": -58879508.0, "logps/chosen": -209.03155517578125, "logps/rejected": -366.7843322753906, "loss": 0.2726, "rewards/chosen": 0.40362483263015747, "rewards/margins": 3.690247356891632, "rewards/rejected": -3.2866225242614746, "step": 18738 }, { "epoch": 0.9932419897702277, "grad_norm": 52.25, "kl": 4.383936882019043, "learning_rate": 5e-07, "logits/chosen": -44613440.0, "logits/rejected": -40293024.0, "logps/chosen": -129.0428466796875, "logps/rejected": -663.26806640625, "loss": 0.4542, "rewards/chosen": 0.07468536496162415, "rewards/margins": 4.703542977571487, "rewards/rejected": -4.628857612609863, "step": 18739 }, { "epoch": 0.9932949937720298, "grad_norm": 46.5, "kl": 0.298583984375, "learning_rate": 5e-07, "logits/chosen": -31867558.0, "logits/rejected": -39033964.0, "logps/chosen": -314.97113037109375, "logps/rejected": -558.3829345703125, "loss": 0.2239, "rewards/chosen": 0.4821878671646118, "rewards/margins": 6.0455509424209595, "rewards/rejected": -5.563363075256348, "step": 18740 }, { "epoch": 0.9933479977738319, "grad_norm": 40.0, "kl": 0.6663284301757812, "learning_rate": 5e-07, "logits/chosen": 35830912.0, "logits/rejected": 822113.5, "logps/chosen": -274.22637939453125, "logps/rejected": -117.67546081542969, "loss": 0.2431, "rewards/chosen": 0.6036974191665649, "rewards/margins": 5.154452919960022, "rewards/rejected": -4.550755500793457, "step": 18741 }, { "epoch": 0.9934010017756341, "grad_norm": 59.25, "kl": 0.3599815368652344, "learning_rate": 5e-07, "logits/chosen": -32757682.285714287, "logits/rejected": -41862304.0, "logps/chosen": -371.0150669642857, "logps/rejected": -185.90887451171875, "loss": 0.3789, "rewards/chosen": 0.4585509981427874, "rewards/margins": 1.924906917980739, "rewards/rejected": -1.4663559198379517, "step": 18742 }, { "epoch": 0.9934540057774361, "grad_norm": 63.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -72324800.0, "logits/rejected": 1995742.6666666667, "logps/chosen": -625.6083984375, "logps/rejected": -337.38018798828125, "loss": 0.2738, "rewards/chosen": 0.08126220852136612, "rewards/margins": 1.850867527226607, "rewards/rejected": -1.769605318705241, "step": 18743 }, { "epoch": 0.9935070097792383, "grad_norm": 38.25, "kl": 0.24128341674804688, "learning_rate": 5e-07, "logits/chosen": -33074482.666666668, "logits/rejected": -15946752.0, "logps/chosen": -180.69708251953125, "logps/rejected": -327.592431640625, "loss": 0.2319, "rewards/chosen": 0.44971632957458496, "rewards/margins": 2.4691850185394286, "rewards/rejected": -2.0194686889648437, "step": 18744 }, { "epoch": 0.9935600137810404, "grad_norm": 57.5, "kl": 1.0900936126708984, "learning_rate": 5e-07, "logits/chosen": -1367464.875, "logits/rejected": -6068834.0, "logps/chosen": -123.6205062866211, "logps/rejected": -301.41241455078125, "loss": 0.2764, "rewards/chosen": 0.7764800786972046, "rewards/margins": 2.5969865322113037, "rewards/rejected": -1.8205064535140991, "step": 18745 }, { "epoch": 0.9936130177828426, "grad_norm": 36.0, "kl": 0.22411727905273438, "learning_rate": 5e-07, "logits/chosen": -28532458.666666668, "logits/rejected": -15024606.4, "logps/chosen": -360.0991617838542, "logps/rejected": -209.3648193359375, "loss": 0.1979, "rewards/chosen": 1.3635142644246419, "rewards/margins": 3.8170533498128254, "rewards/rejected": -2.4535390853881838, "step": 18746 }, { "epoch": 0.9936660217846447, "grad_norm": 37.25, "kl": 1.123800277709961, "learning_rate": 5e-07, "logits/chosen": -22711924.0, "logits/rejected": -20600029.714285713, "logps/chosen": -244.60595703125, "logps/rejected": -266.8643101283482, "loss": 0.1538, "rewards/chosen": 0.732025146484375, "rewards/margins": 3.8452017647879466, "rewards/rejected": -3.1131766183035716, "step": 18747 }, { "epoch": 0.9937190257864469, "grad_norm": 49.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24609354.666666668, "logits/rejected": -17496281.6, "logps/chosen": -417.9724934895833, "logps/rejected": -244.9138671875, "loss": 0.2395, "rewards/chosen": 0.23457640409469604, "rewards/margins": 3.1137510180473327, "rewards/rejected": -2.8791746139526366, "step": 18748 }, { "epoch": 0.993772029788249, "grad_norm": 55.25, "kl": 0.4280414581298828, "learning_rate": 5e-07, "logits/chosen": -11488901.333333334, "logits/rejected": -29240870.4, "logps/chosen": -126.24734497070312, "logps/rejected": -435.06201171875, "loss": 0.2303, "rewards/chosen": 0.5935638348261515, "rewards/margins": 3.2541156689325965, "rewards/rejected": -2.660551834106445, "step": 18749 }, { "epoch": 0.9938250337900512, "grad_norm": 32.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 4963632.0, "logits/rejected": -32529194.0, "logps/chosen": -298.0136413574219, "logps/rejected": -138.1060791015625, "loss": 0.2197, "rewards/chosen": 0.9531837701797485, "rewards/margins": 4.419668793678284, "rewards/rejected": -3.466485023498535, "step": 18750 }, { "epoch": 0.9938780377918532, "grad_norm": 47.5, "kl": 0.2598419189453125, "learning_rate": 5e-07, "logits/chosen": 11462104.0, "logits/rejected": -49912352.0, "logps/chosen": -63.05611673990885, "logps/rejected": -240.5469970703125, "loss": 0.2824, "rewards/chosen": 0.15088069438934326, "rewards/margins": 2.1981768369674684, "rewards/rejected": -2.047296142578125, "step": 18751 }, { "epoch": 0.9939310417936554, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -54995552.0, "logits/rejected": -817440.6666666666, "logps/chosen": -408.986181640625, "logps/rejected": -419.151611328125, "loss": 0.2585, "rewards/chosen": 0.454477596282959, "rewards/margins": 5.49833771387736, "rewards/rejected": -5.043860117594401, "step": 18752 }, { "epoch": 0.9939840457954575, "grad_norm": 73.5, "kl": 2.531290054321289, "learning_rate": 5e-07, "logits/chosen": -24043696.0, "logits/rejected": -5041920.4, "logps/chosen": -162.62035115559897, "logps/rejected": -223.20205078125, "loss": 0.2957, "rewards/chosen": 0.5545517603556315, "rewards/margins": 3.084011141459147, "rewards/rejected": -2.5294593811035155, "step": 18753 }, { "epoch": 0.9940370497972597, "grad_norm": 37.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -42515036.0, "logits/rejected": -4168568.0, "logps/chosen": -353.0343017578125, "logps/rejected": -149.85922241210938, "loss": 0.1507, "rewards/chosen": 0.17448577284812927, "rewards/margins": 3.4390712678432465, "rewards/rejected": -3.264585494995117, "step": 18754 }, { "epoch": 0.9940900537990618, "grad_norm": 52.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46514867.2, "logits/rejected": 14256844.0, "logps/chosen": -412.83017578125, "logps/rejected": -339.1266276041667, "loss": 0.2958, "rewards/chosen": 0.6459582805633545, "rewards/margins": 2.811855967839559, "rewards/rejected": -2.1658976872762046, "step": 18755 }, { "epoch": 0.994143057800864, "grad_norm": 56.25, "kl": 3.0998592376708984, "learning_rate": 5e-07, "logits/chosen": 33870422.4, "logits/rejected": -25185098.666666668, "logps/chosen": -498.48037109375, "logps/rejected": -517.3419596354166, "loss": 0.2961, "rewards/chosen": 1.2517988204956054, "rewards/margins": 3.7813579559326174, "rewards/rejected": -2.5295591354370117, "step": 18756 }, { "epoch": 0.9941960618026661, "grad_norm": 40.5, "kl": 3.5294628143310547, "learning_rate": 5e-07, "logits/chosen": -55379270.4, "logits/rejected": -19577790.666666668, "logps/chosen": -217.6502197265625, "logps/rejected": -307.9877522786458, "loss": 0.3885, "rewards/chosen": 0.2880614519119263, "rewards/margins": 3.376036318143209, "rewards/rejected": -3.0879748662312827, "step": 18757 }, { "epoch": 0.9942490658044683, "grad_norm": 54.0, "kl": 1.8014678955078125, "learning_rate": 5e-07, "logits/chosen": -68189429.33333333, "logits/rejected": -1146093.2, "logps/chosen": -886.469482421875, "logps/rejected": -432.844287109375, "loss": 0.2527, "rewards/chosen": 1.5876291592915852, "rewards/margins": 3.3460794766743978, "rewards/rejected": -1.7584503173828125, "step": 18758 }, { "epoch": 0.9943020698062703, "grad_norm": 51.5, "kl": 0.09101676940917969, "learning_rate": 5e-07, "logits/chosen": -8189525.5, "logits/rejected": -2370062.5, "logps/chosen": -90.92919158935547, "logps/rejected": -307.31463623046875, "loss": 0.3138, "rewards/chosen": 0.33037978410720825, "rewards/margins": 2.2262718081474304, "rewards/rejected": -1.8958920240402222, "step": 18759 }, { "epoch": 0.9943550738080725, "grad_norm": 46.0, "kl": 0.3908576965332031, "learning_rate": 5e-07, "logits/chosen": -26601073.6, "logits/rejected": -27001466.666666668, "logps/chosen": -238.9157470703125, "logps/rejected": -275.48044840494794, "loss": 0.3194, "rewards/chosen": 0.6782773971557617, "rewards/margins": 2.0610581080118817, "rewards/rejected": -1.3827807108561199, "step": 18760 }, { "epoch": 0.9944080778098746, "grad_norm": 42.5, "kl": 0.6500434875488281, "learning_rate": 5e-07, "logits/chosen": -26001584.0, "logits/rejected": -22221784.0, "logps/chosen": -318.0491638183594, "logps/rejected": -220.68310546875, "loss": 0.2369, "rewards/chosen": 0.15815353393554688, "rewards/margins": 3.2825037638346353, "rewards/rejected": -3.1243502298990884, "step": 18761 }, { "epoch": 0.9944610818116768, "grad_norm": 38.25, "kl": 0.10136604309082031, "learning_rate": 5e-07, "logits/chosen": -35619760.0, "logits/rejected": -43392268.8, "logps/chosen": -304.2266845703125, "logps/rejected": -524.495458984375, "loss": 0.1889, "rewards/chosen": 0.6577468713124593, "rewards/margins": 3.2296550591786706, "rewards/rejected": -2.571908187866211, "step": 18762 }, { "epoch": 0.9945140858134789, "grad_norm": 57.0, "kl": 0.6215724945068359, "learning_rate": 5e-07, "logits/chosen": -26475100.8, "logits/rejected": -28972688.0, "logps/chosen": -290.98564453125, "logps/rejected": -336.3549397786458, "loss": 0.2959, "rewards/chosen": 0.5475624084472657, "rewards/margins": 2.6353780110677087, "rewards/rejected": -2.087815602620443, "step": 18763 }, { "epoch": 0.9945670898152811, "grad_norm": 47.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -46259829.333333336, "logits/rejected": 56677305.6, "logps/chosen": -539.7912190755209, "logps/rejected": -336.977099609375, "loss": 0.2371, "rewards/chosen": 0.6614761352539062, "rewards/margins": 2.6704032897949217, "rewards/rejected": -2.0089271545410154, "step": 18764 }, { "epoch": 0.9946200938170832, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -37698069.333333336, "logits/rejected": 4721088.0, "logps/chosen": -323.42946370442706, "logps/rejected": -165.2694091796875, "loss": 0.3685, "rewards/chosen": 0.257457971572876, "rewards/margins": 3.483283519744873, "rewards/rejected": -3.225825548171997, "step": 18765 }, { "epoch": 0.9946730978188854, "grad_norm": 57.0, "kl": 0.6279773712158203, "learning_rate": 5e-07, "logits/chosen": -53904000.0, "logits/rejected": -37398656.0, "logps/chosen": -333.6317952473958, "logps/rejected": -731.8819580078125, "loss": 0.3378, "rewards/chosen": 0.44116441408793133, "rewards/margins": 5.151177326838176, "rewards/rejected": -4.710012912750244, "step": 18766 }, { "epoch": 0.9947261018206874, "grad_norm": 39.5, "kl": 0.4222755432128906, "learning_rate": 5e-07, "logits/chosen": -167314944.0, "logits/rejected": -30511146.666666668, "logps/chosen": -201.0272216796875, "logps/rejected": -454.5586751302083, "loss": 0.1286, "rewards/chosen": 0.9349594116210938, "rewards/margins": 3.7005577087402344, "rewards/rejected": -2.7655982971191406, "step": 18767 }, { "epoch": 0.9947791058224896, "grad_norm": 68.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 2910688.0, "logits/rejected": -1618505.0, "logps/chosen": -399.161865234375, "logps/rejected": -256.5943908691406, "loss": 0.1701, "rewards/chosen": 1.0077571868896484, "rewards/margins": 4.715416669845581, "rewards/rejected": -3.7076594829559326, "step": 18768 }, { "epoch": 0.9948321098242917, "grad_norm": 43.5, "kl": 0.3507251739501953, "learning_rate": 5e-07, "logits/chosen": -27207998.0, "logits/rejected": -55843392.0, "logps/chosen": -260.46514892578125, "logps/rejected": -286.4760437011719, "loss": 0.1981, "rewards/chosen": 0.8505799770355225, "rewards/margins": 3.9735825061798096, "rewards/rejected": -3.123002529144287, "step": 18769 }, { "epoch": 0.9948851138260939, "grad_norm": 53.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -3950789.3333333335, "logits/rejected": -59419032.0, "logps/chosen": -318.3663330078125, "logps/rejected": -451.8326416015625, "loss": 0.3696, "rewards/chosen": 0.11978318293889363, "rewards/margins": 3.6733858784039817, "rewards/rejected": -3.553602695465088, "step": 18770 }, { "epoch": 0.994938117827896, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34074149.333333336, "logits/rejected": -32530473.6, "logps/chosen": -214.31107584635416, "logps/rejected": -307.429931640625, "loss": 0.2037, "rewards/chosen": 0.8563440640767416, "rewards/margins": 2.952680619557699, "rewards/rejected": -2.096336555480957, "step": 18771 }, { "epoch": 0.9949911218296982, "grad_norm": 54.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -10971100.666666666, "logits/rejected": -34541308.0, "logps/chosen": -249.94852701822916, "logps/rejected": -156.12306213378906, "loss": 0.3754, "rewards/chosen": 0.21523398160934448, "rewards/margins": 2.479026734828949, "rewards/rejected": -2.2637927532196045, "step": 18772 }, { "epoch": 0.9950441258315003, "grad_norm": 36.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -44511160.0, "logits/rejected": -10751520.0, "logps/chosen": -364.0293884277344, "logps/rejected": -144.64334106445312, "loss": 0.1676, "rewards/chosen": 1.1172088384628296, "rewards/margins": 3.999161124229431, "rewards/rejected": -2.8819522857666016, "step": 18773 }, { "epoch": 0.9950971298333025, "grad_norm": 44.75, "kl": 1.6415557861328125, "learning_rate": 5e-07, "logits/chosen": -17244003.2, "logits/rejected": -37154314.666666664, "logps/chosen": -210.0880859375, "logps/rejected": -336.14453125, "loss": 0.2813, "rewards/chosen": 0.6013091564178467, "rewards/margins": 4.484129285812378, "rewards/rejected": -3.8828201293945312, "step": 18774 }, { "epoch": 0.9951501338351045, "grad_norm": 54.25, "kl": 0.37171268463134766, "learning_rate": 5e-07, "logits/chosen": -24782256.0, "logits/rejected": -15514418.0, "logps/chosen": -397.5850423177083, "logps/rejected": -311.19512939453125, "loss": 0.4159, "rewards/chosen": -0.1078239381313324, "rewards/margins": 2.6389602720737457, "rewards/rejected": -2.746784210205078, "step": 18775 }, { "epoch": 0.9952031378369067, "grad_norm": 36.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -20036262.0, "logits/rejected": -20692571.42857143, "logps/chosen": -428.4052734375, "logps/rejected": -213.45828683035714, "loss": 0.1187, "rewards/chosen": 0.7844910025596619, "rewards/margins": 3.8949864336422513, "rewards/rejected": -3.1104954310825894, "step": 18776 }, { "epoch": 0.9952561418387088, "grad_norm": 42.25, "kl": 1.3230819702148438, "learning_rate": 5e-07, "logits/chosen": -16006995.2, "logits/rejected": -11174670.666666666, "logps/chosen": -257.7261962890625, "logps/rejected": -243.3680623372396, "loss": 0.3396, "rewards/chosen": 0.6295287609100342, "rewards/margins": 3.3636739253997803, "rewards/rejected": -2.734145164489746, "step": 18777 }, { "epoch": 0.995309145840511, "grad_norm": 48.0, "kl": 1.0156173706054688, "learning_rate": 5e-07, "logits/chosen": -20342841.333333332, "logits/rejected": -61547224.0, "logps/chosen": -498.8055826822917, "logps/rejected": -671.3148193359375, "loss": 0.2899, "rewards/chosen": 0.8256903489430746, "rewards/margins": 4.839020808537801, "rewards/rejected": -4.013330459594727, "step": 18778 }, { "epoch": 0.9953621498423131, "grad_norm": 62.75, "kl": 3.7851943969726562, "learning_rate": 5e-07, "logits/chosen": -32502178.0, "logps/chosen": -357.5384521484375, "loss": 0.4308, "rewards/chosen": 0.7501029372215271, "step": 18779 }, { "epoch": 0.9954151538441153, "grad_norm": 79.0, "kl": 4.626926422119141, "learning_rate": 5e-07, "logits/chosen": -36494308.571428575, "logits/rejected": 3399729.5, "logps/chosen": -451.03236607142856, "logps/rejected": -69.20048522949219, "loss": 0.4501, "rewards/chosen": 0.4838049752371652, "rewards/margins": 4.305606876100812, "rewards/rejected": -3.8218019008636475, "step": 18780 }, { "epoch": 0.9954681578459174, "grad_norm": 47.5, "kl": 1.2874526977539062, "learning_rate": 5e-07, "logits/chosen": -31141216.0, "logits/rejected": -18361640.0, "logps/chosen": -304.94671630859375, "logps/rejected": -197.07928466796875, "loss": 0.2216, "rewards/chosen": 1.5618360042572021, "rewards/margins": 3.477996349334717, "rewards/rejected": -1.9161603450775146, "step": 18781 }, { "epoch": 0.9955211618477195, "grad_norm": 46.5, "kl": 1.9344253540039062, "learning_rate": 5e-07, "logits/chosen": -33586168.0, "logits/rejected": -15181118.0, "logps/chosen": -284.7503967285156, "logps/rejected": -333.0193176269531, "loss": 0.2651, "rewards/chosen": 0.49687159061431885, "rewards/margins": 3.575548768043518, "rewards/rejected": -3.078677177429199, "step": 18782 }, { "epoch": 0.9955741658495216, "grad_norm": 40.75, "kl": 0.9586524963378906, "learning_rate": 5e-07, "logits/chosen": -11211956.0, "logits/rejected": -10331097.0, "logps/chosen": -380.1429443359375, "logps/rejected": -195.1188507080078, "loss": 0.2226, "rewards/chosen": 0.869706392288208, "rewards/margins": 3.040498971939087, "rewards/rejected": -2.170792579650879, "step": 18783 }, { "epoch": 0.9956271698513238, "grad_norm": 50.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -45161674.666666664, "logits/rejected": 4710326.8, "logps/chosen": -191.94026692708334, "logps/rejected": -549.0119140625, "loss": 0.2116, "rewards/chosen": 0.8307482401529948, "rewards/margins": 3.7753500620524085, "rewards/rejected": -2.944601821899414, "step": 18784 }, { "epoch": 0.9956801738531259, "grad_norm": 44.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -17347740.0, "logits/rejected": -26171706.0, "logps/chosen": -157.20364379882812, "logps/rejected": -449.38909912109375, "loss": 0.2508, "rewards/chosen": 0.44876956939697266, "rewards/margins": 2.917628765106201, "rewards/rejected": -2.4688591957092285, "step": 18785 }, { "epoch": 0.9957331778549281, "grad_norm": 48.75, "kl": 3.2770156860351562, "learning_rate": 5e-07, "logits/chosen": -45082412.0, "logits/rejected": -23065356.0, "logps/chosen": -357.0407409667969, "logps/rejected": -314.6214904785156, "loss": 0.2936, "rewards/chosen": 0.3968111276626587, "rewards/margins": 2.8663562536239624, "rewards/rejected": -2.4695451259613037, "step": 18786 }, { "epoch": 0.9957861818567302, "grad_norm": 45.5, "kl": 1.4469413757324219, "learning_rate": 5e-07, "logits/chosen": -52579372.8, "logits/rejected": -42268464.0, "logps/chosen": -322.313525390625, "logps/rejected": -438.8466796875, "loss": 0.2939, "rewards/chosen": 0.6008013248443603, "rewards/margins": 3.23471786181132, "rewards/rejected": -2.6339165369669595, "step": 18787 }, { "epoch": 0.9958391858585324, "grad_norm": 45.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24895820.0, "logits/rejected": -34584941.333333336, "logps/chosen": -379.9327392578125, "logps/rejected": -352.9152425130208, "loss": 0.1228, "rewards/chosen": 1.882382869720459, "rewards/margins": 4.0033278465271, "rewards/rejected": -2.1209449768066406, "step": 18788 }, { "epoch": 0.9958921898603345, "grad_norm": 68.5, "kl": 1.7939987182617188, "learning_rate": 5e-07, "logits/chosen": -51526022.4, "logits/rejected": -24805066.666666668, "logps/chosen": -265.6580078125, "logps/rejected": -238.92403157552084, "loss": 0.3119, "rewards/chosen": 0.6314638137817383, "rewards/margins": 2.5978525161743162, "rewards/rejected": -1.9663887023925781, "step": 18789 }, { "epoch": 0.9959451938621366, "grad_norm": 61.75, "kl": 0.5567741394042969, "learning_rate": 5e-07, "logits/chosen": -20061982.0, "logits/rejected": -28019248.0, "logps/chosen": -428.20538330078125, "logps/rejected": -444.64056396484375, "loss": 0.1723, "rewards/chosen": 1.3177237510681152, "rewards/margins": 4.195337295532227, "rewards/rejected": -2.8776135444641113, "step": 18790 }, { "epoch": 0.9959981978639387, "grad_norm": 44.75, "kl": 0.8473072052001953, "learning_rate": 5e-07, "logits/chosen": 4216654.5, "logits/rejected": -32283566.0, "logps/chosen": -132.93875122070312, "logps/rejected": -293.610595703125, "loss": 0.247, "rewards/chosen": 0.5061622858047485, "rewards/margins": 3.3989838361740112, "rewards/rejected": -2.8928215503692627, "step": 18791 }, { "epoch": 0.9960512018657408, "grad_norm": 33.0, "kl": 0.40285682678222656, "learning_rate": 5e-07, "logits/chosen": -32116394.0, "logits/rejected": -31297928.0, "logps/chosen": -345.5126953125, "logps/rejected": -264.6727701822917, "loss": 0.1588, "rewards/chosen": 0.7670661807060242, "rewards/margins": 3.2427470088005066, "rewards/rejected": -2.4756808280944824, "step": 18792 }, { "epoch": 0.996104205867543, "grad_norm": 38.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -24837292.0, "logits/rejected": -18171702.0, "logps/chosen": -275.451171875, "logps/rejected": -324.6109924316406, "loss": 0.263, "rewards/chosen": 0.8275737762451172, "rewards/margins": 2.6973334550857544, "rewards/rejected": -1.8697596788406372, "step": 18793 }, { "epoch": 0.9961572098693451, "grad_norm": 37.5, "kl": 4.056872367858887, "learning_rate": 5e-07, "logits/chosen": 692275.2, "logits/rejected": -24130589.333333332, "logps/chosen": -347.1915283203125, "logps/rejected": -451.65625, "loss": 0.366, "rewards/chosen": 0.6031478404998779, "rewards/margins": 3.4056205908457438, "rewards/rejected": -2.8024727503458657, "step": 18794 }, { "epoch": 0.9962102138711473, "grad_norm": 33.25, "kl": 0.9827423095703125, "learning_rate": 5e-07, "logits/chosen": -25316746.666666668, "logits/rejected": -4756862.8, "logps/chosen": -275.4959309895833, "logps/rejected": -240.1483642578125, "loss": 0.2031, "rewards/chosen": 1.0738224983215332, "rewards/margins": 3.5569918632507322, "rewards/rejected": -2.483169364929199, "step": 18795 }, { "epoch": 0.9962632178729494, "grad_norm": 42.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61101866.666666664, "logits/rejected": -25909942.4, "logps/chosen": -366.9241536458333, "logps/rejected": -495.62958984375, "loss": 0.1981, "rewards/chosen": 0.5834564765294393, "rewards/margins": 3.7939735968907677, "rewards/rejected": -3.2105171203613283, "step": 18796 }, { "epoch": 0.9963162218747516, "grad_norm": 57.0, "kl": 0.20965576171875, "learning_rate": 5e-07, "logits/chosen": -16176832.0, "logits/rejected": -23750836.0, "logps/chosen": -245.68533325195312, "logps/rejected": -255.0320587158203, "loss": 0.3264, "rewards/chosen": -0.009583093225955963, "rewards/margins": 1.7438589558005333, "rewards/rejected": -1.7534420490264893, "step": 18797 }, { "epoch": 0.9963692258765536, "grad_norm": 42.5, "kl": 1.6143112182617188, "learning_rate": 5e-07, "logits/chosen": -21395596.0, "logits/rejected": -5179361.5, "logps/chosen": -304.69525146484375, "logps/rejected": -324.8240966796875, "loss": 0.1519, "rewards/chosen": 1.6257225275039673, "rewards/margins": 4.299558520317078, "rewards/rejected": -2.6738359928131104, "step": 18798 }, { "epoch": 0.9964222298783558, "grad_norm": 65.5, "kl": 0.8546295166015625, "learning_rate": 5e-07, "logits/chosen": -8356253.5, "logits/rejected": -100809728.0, "logps/chosen": -574.9338989257812, "logps/rejected": -364.2554931640625, "loss": 0.2336, "rewards/chosen": 0.5356196761131287, "rewards/margins": 3.279972493648529, "rewards/rejected": -2.7443528175354004, "step": 18799 }, { "epoch": 0.9964752338801579, "grad_norm": 34.25, "kl": 2.3740577697753906, "learning_rate": 5e-07, "logits/chosen": -8551500.8, "logits/rejected": -7482954.666666667, "logps/chosen": -229.400146484375, "logps/rejected": -163.79849243164062, "loss": 0.2867, "rewards/chosen": 0.7147167205810547, "rewards/margins": 5.205157089233398, "rewards/rejected": -4.490440368652344, "step": 18800 }, { "epoch": 0.9965282378819601, "grad_norm": 43.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7706706.0, "logits/rejected": -23678672.0, "logps/chosen": -29.28108787536621, "logps/rejected": -364.25425502232144, "loss": 0.1605, "rewards/chosen": 0.028136445209383965, "rewards/margins": 2.3909681058887924, "rewards/rejected": -2.3628316606794084, "step": 18801 }, { "epoch": 0.9965812418837622, "grad_norm": 30.875, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 11618130.0, "logits/rejected": -18064040.0, "logps/chosen": -101.79074096679688, "logps/rejected": -272.067626953125, "loss": 0.1806, "rewards/chosen": -0.19681674242019653, "rewards/margins": 3.201369305451711, "rewards/rejected": -3.3981860478719077, "step": 18802 }, { "epoch": 0.9966342458855644, "grad_norm": 51.25, "kl": 2.9752464294433594, "learning_rate": 5e-07, "logits/chosen": 7833739.333333333, "logits/rejected": -17520940.8, "logps/chosen": -419.8494466145833, "logps/rejected": -460.906396484375, "loss": 0.2286, "rewards/chosen": 1.2986807028452556, "rewards/margins": 5.7870767752329515, "rewards/rejected": -4.488396072387696, "step": 18803 }, { "epoch": 0.9966872498873665, "grad_norm": 59.25, "kl": 4.693129539489746, "learning_rate": 5e-07, "logits/chosen": -4092240.25, "logits/rejected": -26350494.0, "logps/chosen": -185.293212890625, "logps/rejected": -182.5238037109375, "loss": 0.2759, "rewards/chosen": 1.1875810623168945, "rewards/margins": 2.485202670097351, "rewards/rejected": -1.2976216077804565, "step": 18804 }, { "epoch": 0.9967402538891686, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -34391988.0, "logits/rejected": -39111136.0, "logps/chosen": -221.31707763671875, "logps/rejected": -325.484130859375, "loss": 0.2229, "rewards/chosen": 0.6398662328720093, "rewards/margins": 5.156536936759949, "rewards/rejected": -4.5166707038879395, "step": 18805 }, { "epoch": 0.9967932578909707, "grad_norm": 63.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -9355804.0, "logits/rejected": -36612624.0, "logps/chosen": -274.01824951171875, "logps/rejected": -260.7430419921875, "loss": 0.2808, "rewards/chosen": -0.052510082721710205, "rewards/margins": 1.6975379188855488, "rewards/rejected": -1.750048001607259, "step": 18806 }, { "epoch": 0.9968462618927729, "grad_norm": 64.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 7664561.0, "logits/rejected": -14386541.0, "logps/chosen": -348.5055236816406, "logps/rejected": -389.16302490234375, "loss": 0.274, "rewards/chosen": 0.6378939747810364, "rewards/margins": 2.8790624737739563, "rewards/rejected": -2.24116849899292, "step": 18807 }, { "epoch": 0.996899265894575, "grad_norm": 45.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -4865068.0, "logits/rejected": -10660982.0, "logps/chosen": -182.40419921875, "logps/rejected": -587.1223958333334, "loss": 0.302, "rewards/chosen": 0.24895942211151123, "rewards/margins": 4.545915484428406, "rewards/rejected": -4.2969560623168945, "step": 18808 }, { "epoch": 0.9969522698963772, "grad_norm": 35.75, "kl": 0.9240951538085938, "learning_rate": 5e-07, "logits/chosen": -38906680.0, "logits/rejected": -40990216.0, "logps/chosen": -314.0687561035156, "logps/rejected": -467.7496337890625, "loss": 0.2106, "rewards/chosen": 1.136555790901184, "rewards/margins": 4.766468644142151, "rewards/rejected": -3.629912853240967, "step": 18809 }, { "epoch": 0.9970052738981793, "grad_norm": 42.0, "kl": 2.1288528442382812, "learning_rate": 5e-07, "logits/chosen": -32996064.0, "logits/rejected": -17949202.666666668, "logps/chosen": -242.5138427734375, "logps/rejected": -187.2426554361979, "loss": 0.3281, "rewards/chosen": 0.5253930568695069, "rewards/margins": 2.4774049599965413, "rewards/rejected": -1.9520119031270344, "step": 18810 }, { "epoch": 0.9970582778999815, "grad_norm": 37.25, "kl": 1.441579818725586, "learning_rate": 5e-07, "logits/chosen": -31767184.0, "logits/rejected": -7932044.0, "logps/chosen": -312.57451171875, "logps/rejected": -340.12091064453125, "loss": 0.251, "rewards/chosen": 0.9497919082641602, "rewards/margins": 3.4421472549438477, "rewards/rejected": -2.4923553466796875, "step": 18811 }, { "epoch": 0.9971112819017836, "grad_norm": 30.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -19407337.333333332, "logits/rejected": -39145366.4, "logps/chosen": -294.7517903645833, "logps/rejected": -576.646435546875, "loss": 0.1072, "rewards/chosen": 1.2428905169169109, "rewards/margins": 5.638204924265544, "rewards/rejected": -4.395314407348633, "step": 18812 }, { "epoch": 0.9971642859035857, "grad_norm": 66.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40888326.4, "logits/rejected": -21961158.666666668, "logps/chosen": -431.9568359375, "logps/rejected": -208.67268880208334, "loss": 0.3419, "rewards/chosen": 0.23569674491882325, "rewards/margins": 1.973221731185913, "rewards/rejected": -1.7375249862670898, "step": 18813 }, { "epoch": 0.9972172899053878, "grad_norm": 34.0, "kl": 2.2026515007019043, "learning_rate": 5e-07, "logits/chosen": -6159682.0, "logits/rejected": -7926514.4, "logps/chosen": -54.98076375325521, "logps/rejected": -374.8312744140625, "loss": 0.2476, "rewards/chosen": 0.05500564972559611, "rewards/margins": 2.0796054879824317, "rewards/rejected": -2.0245998382568358, "step": 18814 }, { "epoch": 0.99727029390719, "grad_norm": 47.5, "kl": 0.0343170166015625, "learning_rate": 5e-07, "logits/chosen": -3885302.4, "logits/rejected": -140492.33333333334, "logps/chosen": -96.77720947265625, "logps/rejected": -182.73185221354166, "loss": 0.3223, "rewards/chosen": 0.11136434078216553, "rewards/margins": 3.8825117508570353, "rewards/rejected": -3.7711474100748696, "step": 18815 }, { "epoch": 0.9973232979089921, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -22476004.0, "logits/rejected": -2047315.75, "logps/chosen": -364.84124755859375, "logps/rejected": -54.938270568847656, "loss": 0.2801, "rewards/chosen": 0.8369529247283936, "rewards/margins": 2.6755168437957764, "rewards/rejected": -1.8385639190673828, "step": 18816 }, { "epoch": 0.9973763019107943, "grad_norm": 52.25, "kl": 4.75640869140625, "learning_rate": 5e-07, "logits/chosen": -18728326.4, "logits/rejected": -10180258.666666666, "logps/chosen": -264.0183349609375, "logps/rejected": -216.36767578125, "loss": 0.3786, "rewards/chosen": 0.7708552360534668, "rewards/margins": 3.9272393862406414, "rewards/rejected": -3.1563841501871743, "step": 18817 }, { "epoch": 0.9974293059125964, "grad_norm": 51.75, "kl": 3.026712417602539, "learning_rate": 5e-07, "logits/chosen": 1802966.875, "logits/rejected": -47738292.0, "logps/chosen": -109.33138275146484, "logps/rejected": -324.35784912109375, "loss": 0.351, "rewards/chosen": 0.19959479570388794, "rewards/margins": 2.2665475010871887, "rewards/rejected": -2.066952705383301, "step": 18818 }, { "epoch": 0.9974823099143986, "grad_norm": 35.0, "kl": 1.2686958312988281, "learning_rate": 5e-07, "logits/chosen": -49633877.333333336, "logits/rejected": -18912163.2, "logps/chosen": -285.36171468098956, "logps/rejected": -631.27216796875, "loss": 0.1572, "rewards/chosen": 0.6183441082636515, "rewards/margins": 4.651268378893534, "rewards/rejected": -4.0329242706298825, "step": 18819 }, { "epoch": 0.9975353139162006, "grad_norm": 50.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -80980488.0, "logits/rejected": -49675176.0, "logps/chosen": -422.6887512207031, "logps/rejected": -339.1846618652344, "loss": 0.293, "rewards/chosen": 0.3215934634208679, "rewards/margins": 2.415962874889374, "rewards/rejected": -2.094369411468506, "step": 18820 }, { "epoch": 0.9975883179180028, "grad_norm": 37.0, "kl": 1.3383660316467285, "learning_rate": 5e-07, "logits/chosen": -7835792.8, "logits/rejected": -16857640.0, "logps/chosen": -192.1002685546875, "logps/rejected": -239.72306315104166, "loss": 0.2034, "rewards/chosen": 1.223748779296875, "rewards/margins": 3.9166319529215494, "rewards/rejected": -2.6928831736246743, "step": 18821 }, { "epoch": 0.9976413219198049, "grad_norm": 39.25, "kl": 3.5394935607910156, "learning_rate": 5e-07, "logits/chosen": -4461320.0, "logits/rejected": -23563624.0, "logps/chosen": -212.388232421875, "logps/rejected": -374.0850016276042, "loss": 0.2006, "rewards/chosen": 1.3584877014160157, "rewards/margins": 5.103665606180827, "rewards/rejected": -3.745177904764811, "step": 18822 }, { "epoch": 0.9976943259216071, "grad_norm": 45.0, "kl": 0.2583274841308594, "learning_rate": 5e-07, "logits/chosen": 17329312.0, "logits/rejected": -37218074.666666664, "logps/chosen": -45.15190124511719, "logps/rejected": -300.68239339192706, "loss": 0.1368, "rewards/chosen": 1.481595754623413, "rewards/margins": 4.289802312850952, "rewards/rejected": -2.808206558227539, "step": 18823 }, { "epoch": 0.9977473299234092, "grad_norm": 37.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -101150552.0, "logits/rejected": -26586569.14285714, "logps/chosen": -377.18487548828125, "logps/rejected": -289.2745884486607, "loss": 0.1566, "rewards/chosen": 1.416015625, "rewards/margins": 3.501275198800223, "rewards/rejected": -2.085259573800223, "step": 18824 }, { "epoch": 0.9978003339252114, "grad_norm": 39.75, "kl": 2.6154422760009766, "learning_rate": 5e-07, "logits/chosen": -122755.33333333333, "logits/rejected": -7830328.0, "logps/chosen": -43.91865030924479, "logps/rejected": -266.3256103515625, "loss": 0.27, "rewards/chosen": 0.5647978782653809, "rewards/margins": 2.9162829399108885, "rewards/rejected": -2.3514850616455076, "step": 18825 }, { "epoch": 0.9978533379270135, "grad_norm": 42.5, "kl": 0.9001045227050781, "learning_rate": 5e-07, "logits/chosen": -22508392.0, "logits/rejected": 163552544.0, "logps/chosen": -296.1368713378906, "logps/rejected": -513.2525024414062, "loss": 0.1855, "rewards/chosen": 1.0613888502120972, "rewards/margins": 4.113053202629089, "rewards/rejected": -3.051664352416992, "step": 18826 }, { "epoch": 0.9979063419288157, "grad_norm": 43.5, "kl": 0.7161846160888672, "learning_rate": 5e-07, "logits/chosen": -74020748.8, "logits/rejected": 111350133.33333333, "logps/chosen": -326.907470703125, "logps/rejected": -515.757080078125, "loss": 0.312, "rewards/chosen": 0.18381623029708863, "rewards/margins": 3.3845627665519715, "rewards/rejected": -3.200746536254883, "step": 18827 }, { "epoch": 0.9979593459306177, "grad_norm": 50.0, "kl": 3.9157981872558594, "learning_rate": 5e-07, "logits/chosen": -27189624.0, "logps/chosen": -377.9462890625, "loss": 0.3914, "rewards/chosen": 1.015968680381775, "step": 18828 }, { "epoch": 0.9980123499324199, "grad_norm": 26.625, "kl": 2.1714439392089844, "learning_rate": 5e-07, "logits/chosen": 4454435.5, "logits/rejected": -28645740.0, "logps/chosen": -466.48675537109375, "logps/rejected": -289.6086730957031, "loss": 0.2052, "rewards/chosen": 1.265304446220398, "rewards/margins": 4.9239853620529175, "rewards/rejected": -3.6586809158325195, "step": 18829 }, { "epoch": 0.998065353934222, "grad_norm": 46.5, "kl": 0.3423042297363281, "learning_rate": 5e-07, "logits/chosen": -18188145.333333332, "logits/rejected": -17724422.4, "logps/chosen": -367.3495686848958, "logps/rejected": -400.809912109375, "loss": 0.1477, "rewards/chosen": 1.0016059875488281, "rewards/margins": 4.89402961730957, "rewards/rejected": -3.892423629760742, "step": 18830 }, { "epoch": 0.9981183579360242, "grad_norm": 34.5, "kl": 1.5443687438964844, "learning_rate": 5e-07, "logits/chosen": -45781088.0, "logits/rejected": -27069305.6, "logps/chosen": -232.6817423502604, "logps/rejected": -280.5783935546875, "loss": 0.2497, "rewards/chosen": 0.7550449371337891, "rewards/margins": 3.085955810546875, "rewards/rejected": -2.330910873413086, "step": 18831 }, { "epoch": 0.9981713619378263, "grad_norm": 40.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -32174021.333333332, "logits/rejected": -15946540.8, "logps/chosen": -296.0008544921875, "logps/rejected": -233.0449462890625, "loss": 0.2613, "rewards/chosen": 0.40525996685028076, "rewards/margins": 3.004132342338562, "rewards/rejected": -2.5988723754882814, "step": 18832 }, { "epoch": 0.9982243659396285, "grad_norm": 32.0, "kl": 1.050628662109375, "learning_rate": 5e-07, "logits/chosen": -39822352.0, "logits/rejected": -30981352.0, "logps/chosen": -385.29266357421875, "logps/rejected": -387.703125, "loss": 0.1444, "rewards/chosen": 1.7643197774887085, "rewards/margins": 5.315862774848938, "rewards/rejected": -3.5515429973602295, "step": 18833 }, { "epoch": 0.9982773699414306, "grad_norm": 47.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -29053324.0, "logits/rejected": -26999562.0, "logps/chosen": -250.47145080566406, "logps/rejected": -230.24502563476562, "loss": 0.2964, "rewards/chosen": -0.2281862199306488, "rewards/margins": 3.5286015570163727, "rewards/rejected": -3.7567877769470215, "step": 18834 }, { "epoch": 0.9983303739432328, "grad_norm": 43.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -68350885.33333333, "logits/rejected": -5666646.4, "logps/chosen": -142.41278076171875, "logps/rejected": -206.014111328125, "loss": 0.203, "rewards/chosen": 1.2681574821472168, "rewards/margins": 3.294851207733154, "rewards/rejected": -2.0266937255859374, "step": 18835 }, { "epoch": 0.9983833779450348, "grad_norm": 35.0, "kl": 2.025777816772461, "learning_rate": 5e-07, "logits/chosen": 1515821.3333333333, "logits/rejected": -28684857.6, "logps/chosen": -122.19584147135417, "logps/rejected": -473.030859375, "loss": 0.2992, "rewards/chosen": -0.5151949723561605, "rewards/margins": 2.8750733534495034, "rewards/rejected": -3.390268325805664, "step": 18836 }, { "epoch": 0.998436381946837, "grad_norm": 49.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -65359178.666666664, "logits/rejected": -27956777.6, "logps/chosen": -614.7300618489584, "logps/rejected": -319.99287109375, "loss": 0.1832, "rewards/chosen": 0.5225402514139811, "rewards/margins": 3.8088546435038246, "rewards/rejected": -3.2863143920898437, "step": 18837 }, { "epoch": 0.9984893859486391, "grad_norm": 45.0, "kl": 10.529460906982422, "learning_rate": 5e-07, "logits/chosen": -21028411.2, "logits/rejected": -14488224.0, "logps/chosen": -393.2625, "logps/rejected": -182.8486328125, "loss": 0.2576, "rewards/chosen": 1.5579795837402344, "rewards/margins": 3.9271764755249023, "rewards/rejected": -2.369196891784668, "step": 18838 }, { "epoch": 0.9985423899504413, "grad_norm": 51.25, "kl": 5.632544994354248, "learning_rate": 5e-07, "logits/chosen": -25175804.8, "logits/rejected": 1909286.5, "logps/chosen": -263.108154296875, "logps/rejected": -282.75091552734375, "loss": 0.4537, "rewards/chosen": 0.22276685237884522, "rewards/margins": 3.0755144516626993, "rewards/rejected": -2.852747599283854, "step": 18839 }, { "epoch": 0.9985953939522434, "grad_norm": 42.0, "kl": 6.850982666015625, "learning_rate": 5e-07, "logits/chosen": -18635046.4, "logits/rejected": -6082372.666666667, "logps/chosen": -302.902685546875, "logps/rejected": -310.51171875, "loss": 0.287, "rewards/chosen": 1.6580341339111329, "rewards/margins": 4.796774355570475, "rewards/rejected": -3.1387402216593423, "step": 18840 }, { "epoch": 0.9986483979540456, "grad_norm": 55.75, "kl": 2.3162670135498047, "learning_rate": 5e-07, "logits/chosen": 7831018.666666667, "logits/rejected": -7470747.0, "logps/chosen": -347.6049397786458, "logps/rejected": -96.74855041503906, "loss": 0.2545, "rewards/chosen": 1.2181503772735596, "rewards/margins": 5.169892072677612, "rewards/rejected": -3.9517416954040527, "step": 18841 }, { "epoch": 0.9987014019558477, "grad_norm": 43.75, "kl": 1.8762550354003906, "learning_rate": 5e-07, "logits/chosen": -44598092.0, "logits/rejected": 3596349.5, "logps/chosen": -314.7879638671875, "logps/rejected": -360.059326171875, "loss": 0.2409, "rewards/chosen": 0.5230217576026917, "rewards/margins": 4.466170370578766, "rewards/rejected": -3.943148612976074, "step": 18842 }, { "epoch": 0.9987544059576497, "grad_norm": 54.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -51016164.0, "logits/rejected": -23163924.0, "logps/chosen": -324.5580139160156, "logps/rejected": -259.611083984375, "loss": 0.3421, "rewards/chosen": -0.16348084807395935, "rewards/margins": 1.9273106157779694, "rewards/rejected": -2.0907914638519287, "step": 18843 }, { "epoch": 0.9988074099594519, "grad_norm": 42.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -49338652.0, "logits/rejected": -30979078.0, "logps/chosen": -300.042236328125, "logps/rejected": -365.483642578125, "loss": 0.2745, "rewards/chosen": 0.23819006979465485, "rewards/margins": 3.127153769135475, "rewards/rejected": -2.8889636993408203, "step": 18844 }, { "epoch": 0.998860413961254, "grad_norm": 53.5, "kl": 1.6178550720214844, "learning_rate": 5e-07, "logits/chosen": -21640243.2, "logits/rejected": -14021846.666666666, "logps/chosen": -648.65126953125, "logps/rejected": -224.70357259114584, "loss": 0.2549, "rewards/chosen": 0.7981231689453125, "rewards/margins": 5.236976750691731, "rewards/rejected": -4.438853581746419, "step": 18845 }, { "epoch": 0.9989134179630562, "grad_norm": 43.75, "kl": 4.587453842163086, "learning_rate": 5e-07, "logits/chosen": -8221794.0, "logits/rejected": 14826698.0, "logps/chosen": -180.08072916666666, "logps/rejected": -414.6297607421875, "loss": 0.4716, "rewards/chosen": 0.0781150758266449, "rewards/margins": 2.1860452592372894, "rewards/rejected": -2.1079301834106445, "step": 18846 }, { "epoch": 0.9989664219648583, "grad_norm": 44.75, "kl": 3.2889022827148438, "learning_rate": 5e-07, "logits/chosen": -80544640.0, "logits/rejected": -43675690.666666664, "logps/chosen": -315.18291015625, "logps/rejected": -419.419921875, "loss": 0.376, "rewards/chosen": 0.041630092263221743, "rewards/margins": 2.366192165017128, "rewards/rejected": -2.3245620727539062, "step": 18847 }, { "epoch": 0.9990194259666605, "grad_norm": 42.25, "kl": 1.7916812896728516, "learning_rate": 5e-07, "logits/chosen": -41506393.6, "logits/rejected": -48035562.666666664, "logps/chosen": -760.15810546875, "logps/rejected": -509.815673828125, "loss": 0.1933, "rewards/chosen": 1.7277429580688477, "rewards/margins": 5.420527903238932, "rewards/rejected": -3.6927849451700845, "step": 18848 }, { "epoch": 0.9990724299684626, "grad_norm": 47.0, "kl": 4.0577545166015625, "learning_rate": 5e-07, "logits/chosen": -1967720.0, "logits/rejected": 652703.75, "logps/chosen": -116.58009847005208, "logps/rejected": -135.61837768554688, "loss": 0.4445, "rewards/chosen": 0.35784482955932617, "rewards/margins": 2.8346798419952393, "rewards/rejected": -2.476835012435913, "step": 18849 }, { "epoch": 0.9991254339702648, "grad_norm": 58.25, "kl": 3.1887130737304688, "learning_rate": 5e-07, "logits/chosen": -68303888.0, "logits/rejected": -17698946.0, "logps/chosen": -476.62725830078125, "logps/rejected": -225.39930725097656, "loss": 0.3464, "rewards/chosen": 0.31108859181404114, "rewards/margins": 1.5419327318668365, "rewards/rejected": -1.2308441400527954, "step": 18850 }, { "epoch": 0.9991784379720668, "grad_norm": 55.0, "kl": 3.80914306640625, "learning_rate": 5e-07, "logits/chosen": -37340233.6, "logits/rejected": -52200789.333333336, "logps/chosen": -392.7615966796875, "logps/rejected": -338.2024739583333, "loss": 0.3677, "rewards/chosen": 0.11629902124404908, "rewards/margins": 2.289503443241119, "rewards/rejected": -2.1732044219970703, "step": 18851 }, { "epoch": 0.999231441973869, "grad_norm": 62.75, "kl": 3.5475330352783203, "learning_rate": 5e-07, "logits/chosen": -42561184.0, "logits/rejected": -23211354.0, "logps/chosen": -418.41162109375, "logps/rejected": -147.31893920898438, "loss": 0.3969, "rewards/chosen": 0.35899380842844647, "rewards/margins": 4.477189342180888, "rewards/rejected": -4.118195533752441, "step": 18852 }, { "epoch": 0.9992844459756711, "grad_norm": 59.75, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -40998664.0, "logits/rejected": -4296182.0, "logps/chosen": -264.5574035644531, "logps/rejected": -208.21636962890625, "loss": 0.267, "rewards/chosen": 0.00724029541015625, "rewards/margins": 1.9449377059936523, "rewards/rejected": -1.937697410583496, "step": 18853 }, { "epoch": 0.9993374499774733, "grad_norm": 64.5, "kl": 0.9254322052001953, "learning_rate": 5e-07, "logits/chosen": -30313820.0, "logits/rejected": -7149895.5, "logps/chosen": -121.02891540527344, "logps/rejected": -115.71485137939453, "loss": 0.3861, "rewards/chosen": -0.10887466371059418, "rewards/margins": 1.2206503301858902, "rewards/rejected": -1.3295249938964844, "step": 18854 }, { "epoch": 0.9993904539792754, "grad_norm": 36.25, "kl": 0.029987335205078125, "learning_rate": 5e-07, "logits/chosen": -17550684.0, "logits/rejected": -24819398.0, "logps/chosen": -125.77578735351562, "logps/rejected": -301.414306640625, "loss": 0.3497, "rewards/chosen": 0.2525822917620341, "rewards/margins": 3.749204377333323, "rewards/rejected": -3.496622085571289, "step": 18855 }, { "epoch": 0.9994434579810776, "grad_norm": 62.75, "kl": 0.9138450622558594, "learning_rate": 5e-07, "logits/chosen": -36305684.0, "logits/rejected": -15294021.333333334, "logps/chosen": -493.1812438964844, "logps/rejected": -297.24365234375, "loss": 0.1622, "rewards/chosen": 0.839324951171875, "rewards/margins": 3.773258845011393, "rewards/rejected": -2.933933893839518, "step": 18856 }, { "epoch": 0.9994964619828797, "grad_norm": 38.25, "kl": 3.1429443359375, "learning_rate": 5e-07, "logits/chosen": -3586211.0, "logits/rejected": -41549068.0, "logps/chosen": -393.52972412109375, "logps/rejected": -484.3359680175781, "loss": 0.2009, "rewards/chosen": 1.1575233936309814, "rewards/margins": 3.9451301097869873, "rewards/rejected": -2.787606716156006, "step": 18857 }, { "epoch": 0.9995494659846819, "grad_norm": 48.5, "kl": 3.472562789916992, "learning_rate": 5e-07, "logits/chosen": -68663152.0, "logits/rejected": -24443539.2, "logps/chosen": -278.17901611328125, "logps/rejected": -211.853369140625, "loss": 0.2661, "rewards/chosen": 0.7196798324584961, "rewards/margins": 3.414389801025391, "rewards/rejected": -2.6947099685668947, "step": 18858 }, { "epoch": 0.9996024699864839, "grad_norm": 47.75, "kl": 0.22740936279296875, "learning_rate": 5e-07, "logits/chosen": -13682745.0, "logits/rejected": -21746618.0, "logps/chosen": -329.49072265625, "logps/rejected": -148.47508239746094, "loss": 0.2495, "rewards/chosen": 0.4365377426147461, "rewards/margins": 2.9345169067382812, "rewards/rejected": -2.497979164123535, "step": 18859 }, { "epoch": 0.9996554739882861, "grad_norm": 38.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -61255392.0, "logits/rejected": -78831302.4, "logps/chosen": -457.2833658854167, "logps/rejected": -263.459521484375, "loss": 0.1471, "rewards/chosen": 1.0795501867930095, "rewards/margins": 3.9004432837168377, "rewards/rejected": -2.820893096923828, "step": 18860 }, { "epoch": 0.9997084779900882, "grad_norm": 72.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -63324992.0, "logits/rejected": -10885596.8, "logps/chosen": -378.388427734375, "logps/rejected": -476.72802734375, "loss": 0.2972, "rewards/chosen": 0.46337080001831055, "rewards/margins": 2.46561803817749, "rewards/rejected": -2.0022472381591796, "step": 18861 }, { "epoch": 0.9997614819918904, "grad_norm": 40.5, "kl": 0.9050140380859375, "learning_rate": 5e-07, "logits/chosen": -63719716.0, "logits/rejected": -27709136.0, "logps/chosen": -196.8599395751953, "logps/rejected": -388.43707275390625, "loss": 0.3169, "rewards/chosen": 0.2493787705898285, "rewards/margins": 2.136397808790207, "rewards/rejected": -1.8870190382003784, "step": 18862 }, { "epoch": 0.9998144859936925, "grad_norm": 29.125, "kl": 2.797443389892578, "learning_rate": 5e-07, "logits/chosen": 6671271.5, "logits/rejected": -12481520.0, "logps/chosen": -10.931475639343262, "logps/rejected": -176.43310546875, "loss": 0.2621, "rewards/chosen": -0.07620535045862198, "rewards/margins": 2.4339048688610396, "rewards/rejected": -2.5101102193196616, "step": 18863 }, { "epoch": 0.9998674899954947, "grad_norm": 38.5, "kl": 2.0345306396484375, "learning_rate": 5e-07, "logits/chosen": -63743992.0, "logits/rejected": -20142662.0, "logps/chosen": -661.7557983398438, "logps/rejected": -297.8194580078125, "loss": 0.1918, "rewards/chosen": 2.126572847366333, "rewards/margins": 4.458284854888916, "rewards/rejected": -2.331712007522583, "step": 18864 }, { "epoch": 0.9999204939972968, "grad_norm": 61.5, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": 13064617.0, "logits/rejected": -24658180.0, "logps/chosen": -266.12603759765625, "logps/rejected": -574.2349853515625, "loss": 0.3911, "rewards/chosen": -0.19895124435424805, "rewards/margins": 1.3501454591751099, "rewards/rejected": -1.549096703529358, "step": 18865 }, { "epoch": 0.999973497999099, "grad_norm": 54.0, "kl": 2.488053321838379, "learning_rate": 5e-07, "logits/chosen": -28863666.0, "logits/rejected": -1963090.0, "logps/chosen": -345.57049560546875, "logps/rejected": -237.34454345703125, "loss": 0.3204, "rewards/chosen": 0.9786570072174072, "rewards/margins": 2.691565990447998, "rewards/rejected": -1.7129089832305908, "step": 18866 }, { "epoch": 1.0, "grad_norm": 41.25, "kl": 0.3965892791748047, "learning_rate": 5e-07, "logits/chosen": -13086869.333333334, "logits/rejected": 3484599.0, "logps/chosen": -176.5341593424479, "logps/rejected": -456.48712158203125, "loss": 0.1769, "rewards/chosen": 0.2930729587872823, "rewards/margins": 2.6646154125531516, "rewards/rejected": -2.371542453765869, "step": 18867 } ], "logging_steps": 1, "max_steps": 18867, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9434, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }